diff --git a/changelog.md b/changelog.md index 86a4476..0c68a36 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,15 @@ # Changelog +## 2025-11-04 - 1.8.3 - fix(context) +Prevent enormous git diffs and OOM during context building by adding exclusion patterns, truncation, and diagnostic logging + +- Add comprehensive git diff exclusion globs (locks, build artifacts, maps, bundles, IDE folders, logs, caches) when collecting uncommitted diffs to avoid noisy/huge diffs +- Pass glob patterns directly to smartgit.getUncommittedDiff for efficient server-side matching +- Emit diagnostic statistics for diffs (files changed, total characters, estimated tokens, number of exclusion patterns) and warn on unusually large diffs +- Introduce pre-tokenization safety checks in iterative context builder: truncate raw diff text if it exceeds MAX_DIFF_CHARS and throw a clear error if token count still exceeds MAX_DIFF_TOKENS +- Format and log token counts using locale-aware formatting for clarity +- Improve robustness of commit context generation to reduce risk of OOM / model-limit overruns + ## 2025-11-03 - 1.8.0 - feat(context) Wire OpenAI provider through task context factory and add git-diff support to iterative context builder diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 04ac0e8..8339621 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -3799,8 +3799,8 @@ packages: resolution: {integrity: sha512-I9jwMn07Sy/IwOj3zVkVik2JTvgpaykDZEigL6Rx6N9LbMywwUSMtxET+7lVoDLLd3O3IXwJwvuuns8UB/HeAg==} engines: {node: '>=4'} - minimatch@10.0.3: - resolution: {integrity: sha512-IPZ167aShDZZUMdRk66cyQAW3qr0WzbHkPdMYa8bzZhlHhO3jALbKdxcaak7W9FfT2rZNpQuUu4Od7ILEpXSaw==} + minimatch@10.1.1: + resolution: {integrity: sha512-enIvLvRAFZYXJzkCYG5RKmPfrFArdLv+R+lbQ53BmIMLIry74bjKzX6iHAm8WYamJkhSSEabrWN5D97XnKObjQ==} engines: {node: 20 || >=22} minimatch@3.1.2: @@ -9797,7 +9797,7 @@ snapshots: dependencies: foreground-child: 3.3.1 jackspeak: 4.1.1 - minimatch: 10.0.3 + minimatch: 10.1.1 minipass: 7.1.2 package-json-from-dist: 1.0.1 path-scurry: 2.0.0 @@ -10680,7 +10680,7 @@ snapshots: min-indent@1.0.1: {} - minimatch@10.0.3: + minimatch@10.1.1: dependencies: '@isaacs/brace-expansion': 5.0.0 diff --git a/ts/00_commitinfo_data.ts b/ts/00_commitinfo_data.ts index 380d5a5..fa3951a 100644 --- a/ts/00_commitinfo_data.ts +++ b/ts/00_commitinfo_data.ts @@ -3,6 +3,6 @@ */ export const commitinfo = { name: '@git.zone/tsdoc', - version: '1.8.0', + version: '1.8.3', description: 'A comprehensive TypeScript documentation tool that leverages AI to generate and enhance project documentation, including dynamic README creation, API docs via TypeDoc, and smart commit message generation.' } diff --git a/ts/aidocs_classes/commit.ts b/ts/aidocs_classes/commit.ts index b1c3804..45649f3 100644 --- a/ts/aidocs_classes/commit.ts +++ b/ts/aidocs_classes/commit.ts @@ -27,18 +27,71 @@ export class Commit { smartgitInstance, this.projectDir ); - const diffStringArray = await gitRepo.getUncommittedDiff([ + + // Define comprehensive exclusion patterns + // smartgit@3.3.0+ supports glob patterns natively + const excludePatterns = [ + // Lock files 'pnpm-lock.yaml', 'package-lock.json', 'npm-shrinkwrap.json', 'yarn.lock', 'deno.lock', 'bun.lockb', - '.claude/*', - '.cursor/*', - '.vscode/*', - '.idea/*', - ]); + + // Build artifacts (main culprit for large diffs!) + 'dist/**', + 'dist_*/**', // dist_ts, dist_web, etc. + 'build/**', + '.next/**', + 'out/**', + 'public/dist/**', + + // Compiled/bundled files + '**/*.js.map', + '**/*.d.ts.map', + '**/*.min.js', + '**/*.bundle.js', + '**/*.chunk.js', + + // IDE/Editor directories + '.claude/**', + '.cursor/**', + '.vscode/**', + '.idea/**', + '**/*.swp', + '**/*.swo', + + // Logs and caches + '.nogit/**', + '**/*.log', + '.cache/**', + '.rpt2_cache/**', + 'coverage/**', + '.nyc_output/**', + ]; + + // Pass glob patterns directly to smartgit - it handles matching internally + const diffStringArray = await gitRepo.getUncommittedDiff(excludePatterns); + + // Diagnostic logging for diff statistics + if (diffStringArray.length > 0) { + const totalChars = diffStringArray.join('\n\n').length; + const estimatedTokens = Math.ceil(totalChars / 4); + + console.log(`📊 Git diff statistics:`); + console.log(` Files changed: ${diffStringArray.length}`); + console.log(` Total characters: ${totalChars.toLocaleString()}`); + console.log(` Estimated tokens: ${estimatedTokens.toLocaleString()}`); + console.log(` Exclusion patterns: ${excludePatterns.length}`); + + if (estimatedTokens > 50000) { + console.warn(`⚠️ WARNING: Unusually large diff (${estimatedTokens.toLocaleString()} tokens)`); + console.warn(` This may indicate build artifacts or large files in the diff.`); + console.warn(` Consider reviewing uncommitted changes or improving exclusion patterns.`); + } + } + // Use the new TaskContextFactory for optimized context const taskContextFactory = new (await import('../context/index.js')).TaskContextFactory( this.projectDir, diff --git a/ts/context/iterative-context-builder.ts b/ts/context/iterative-context-builder.ts index 06874ec..27ddb8f 100644 --- a/ts/context/iterative-context-builder.ts +++ b/ts/context/iterative-context-builder.ts @@ -115,6 +115,22 @@ export class IterativeContextBuilder { // If additional context (e.g., git diff) is provided, prepend it if (additionalContext) { + // CRITICAL SAFETY: Check raw string size BEFORE tokenization to prevent OOM + const MAX_DIFF_CHARS = 500000; // ~125k tokens max (conservative 4 chars/token ratio) + const MAX_DIFF_TOKENS = 150000; // Hard token limit for safety + + // First check: raw character count + if (additionalContext.length > MAX_DIFF_CHARS) { + const originalSize = additionalContext.length; + logger.log('warn', `⚠️ Git diff too large (${originalSize.toLocaleString()} chars > ${MAX_DIFF_CHARS.toLocaleString()} limit)`); + logger.log('warn', ` This likely includes build artifacts (dist/, *.js.map, bundles, etc.)`); + logger.log('warn', ` Truncating to first ${MAX_DIFF_CHARS.toLocaleString()} characters.`); + logger.log('warn', ` Consider: git stash build files, improve .gitignore, or review uncommitted changes.`); + + additionalContext = additionalContext.substring(0, MAX_DIFF_CHARS) + + '\n\n[... DIFF TRUNCATED - exceeded size limit of ' + MAX_DIFF_CHARS.toLocaleString() + ' chars ...]'; + } + const diffSection = ` ====== GIT DIFF ====== @@ -122,10 +138,22 @@ ${additionalContext} ====== END OF GIT DIFF ====== `; - loadedContent = diffSection; + + // Second check: actual token count after truncation const diffTokens = this.countTokens(diffSection); + + if (diffTokens > MAX_DIFF_TOKENS) { + logger.log('error', `❌ Git diff still too large after truncation (${diffTokens.toLocaleString()} tokens > ${MAX_DIFF_TOKENS.toLocaleString()} limit)`); + throw new Error( + `Git diff size (${diffTokens.toLocaleString()} tokens) exceeds maximum (${MAX_DIFF_TOKENS.toLocaleString()} tokens). ` + + `This indicates massive uncommitted changes, likely build artifacts. ` + + `Please commit or stash dist/, build/, or other generated files.` + ); + } + + loadedContent = diffSection; totalTokensUsed += diffTokens; - logger.log('info', `📝 Added git diff to context (${diffTokens} tokens)`); + logger.log('info', `📝 Added git diff to context (${diffTokens.toLocaleString()} tokens)`); } // Phase 3: Iterative file selection and loading