/**
 * Intelligent git diff processor that handles large diffs by sampling and prioritization
 * instead of blind truncation.
 */

/** Parsed facts about the diff of a single file. */
export interface IDiffFileInfo {
  /** Path of the changed file with the `a/` / `b/` diff prefix removed. */
  filepath: string;
  status: 'added' | 'modified' | 'deleted';
  linesAdded: number;
  linesRemoved: number;
  /** Sum of `linesAdded` and `linesRemoved`. */
  totalLines: number;
  /** Rough estimate: one token per four characters of `diffContent`. */
  estimatedTokens: number;
  /** The unmodified diff text this info was parsed from. */
  diffContent: string;
}

export interface IProcessedDiff {
  summary: string; // Human-readable overview
  fullDiffs: string[]; // Small files included fully
  summarizedDiffs: string[]; // Medium files with head/tail
  metadataOnly: string[]; // Large files, just stats
  totalFiles: number;
  totalTokens: number;
}

export interface IDiffProcessorOptions {
  maxDiffTokens?: number; // Maximum tokens for entire diff section (default: 100000)
  smallFileLines?: number; // Files <= this are included fully (default: 50)
  mediumFileLines?: number; // Files <= this are summarized (default: 200)
  sampleHeadLines?: number; // Lines to show at start of medium files (default: 20)
  sampleTailLines?: number; // Lines to show at end of medium files (default: 20)
}

export class DiffProcessor {
  /** Matches `diff --git a/<old> b/<new>` and captures `<new>`. */
  private static readonly GIT_HEADER_PATTERN = /^diff --git a\/.+ b\/(.+)$/;
  private static readonly BUILD_ARTIFACT_PATTERN = /^(dist|build|out|\.next|public\/dist)\//;
  private static readonly SOURCE_PATTERN = /^(src|lib|app|components|pages|api)\//;
  private static readonly TEST_PATTERN = /\.(test|spec)\.(ts|js|tsx|jsx)$/;
  private static readonly CONFIG_PATTERN = /\.(json|yaml|yml|toml|config\.(ts|js))$/;
  private static readonly DOC_PATTERN = /\.(md|txt|rst)$/;

  private readonly options: Required<IDiffProcessorOptions>;

  /**
   * @param options Size thresholds and token budget; every omitted field falls back
   *   to the default documented on `IDiffProcessorOptions`.
   */
  constructor(options: IDiffProcessorOptions = {}) {
    this.options = {
      maxDiffTokens: options.maxDiffTokens ?? 100000,
      smallFileLines: options.smallFileLines ?? 50,
      mediumFileLines: options.mediumFileLines ?? 200,
      sampleHeadLines: options.sampleHeadLines ?? 20,
      sampleTailLines: options.sampleTailLines ?? 20,
    };
  }

  /**
   * Process an array of git diffs into a structured, token-efficient format.
   *
   * Empty or whitespace-only entries are dropped. The remaining files are ordered by
   * importance and then, while token budget remains, placed into one of three buckets
   * according to their number of changed lines: full diff, head/tail sample, or
   * metadata only. Files that do not fit the remaining budget fall back to metadata.
   *
   * @param diffStringArray One diff string per changed file.
   * @returns The bucketed diffs plus a summary and the tokens consumed by the
   *   full and summarized buckets (metadata lines are not counted).
   */
  public processDiffs(diffStringArray: string[]): IProcessedDiff {
    // Parse all diffs into file info objects
    const fileInfos = diffStringArray
      .map(diffString => this.parseDiffFile(diffString))
      .filter((info): info is IDiffFileInfo => info !== null);

    // Prioritize files (source files first, build artifacts last)
    const prioritized = this.prioritizeFiles(fileInfos);

    const result: IProcessedDiff = {
      summary: '',
      fullDiffs: [],
      summarizedDiffs: [],
      metadataOnly: [],
      totalFiles: prioritized.length,
      totalTokens: 0,
    };

    let tokensUsed = 0;
    const tokenBudget = this.options.maxDiffTokens;

    // Categorize and include files based on size and token budget
    for (const fileInfo of prioritized) {
      const remainingBudget = tokenBudget - tokensUsed;

      if (remainingBudget <= 0) {
        // Budget exhausted - rest are metadata only
        result.metadataOnly.push(this.formatMetadataOnly(fileInfo));
        continue;
      }

      if (fileInfo.totalLines <= this.options.smallFileLines) {
        // Small file - include fully if budget allows
        if (fileInfo.estimatedTokens <= remainingBudget) {
          result.fullDiffs.push(`${this.getFileStatusPrefix(fileInfo)}${fileInfo.diffContent}`);
          tokensUsed += fileInfo.estimatedTokens;
        } else {
          result.metadataOnly.push(this.formatMetadataOnly(fileInfo));
        }
      } else if (fileInfo.totalLines <= this.options.mediumFileLines) {
        // Medium file - try to include summary with head/tail
        const summary = this.extractDiffSample(
          fileInfo,
          this.options.sampleHeadLines,
          this.options.sampleTailLines
        );
        const summaryTokens = this.estimateTokens(summary);

        if (summaryTokens <= remainingBudget) {
          result.summarizedDiffs.push(summary);
          tokensUsed += summaryTokens;
        } else {
          result.metadataOnly.push(this.formatMetadataOnly(fileInfo));
        }
      } else {
        // Large file - metadata only
        result.metadataOnly.push(this.formatMetadataOnly(fileInfo));
      }
    }

    result.totalTokens = tokensUsed;
    result.summary = this.generateSummary(result);

    return result;
  }

  /**
   * Format the processed diff for inclusion in context.
   *
   * Emits the summary followed by one titled section per non-empty bucket and a
   * closing marker, joined with newlines.
   */
  public formatForContext(processed: IProcessedDiff): string {
    const sections: string[] = [];

    // Summary section
    sections.push('====== GIT DIFF SUMMARY ======');
    sections.push(processed.summary);
    sections.push('');

    // Full diffs section
    if (processed.fullDiffs.length > 0) {
      sections.push(`====== FULL DIFFS (${processed.fullDiffs.length} files) ======`);
      sections.push(processed.fullDiffs.join('\n\n'));
      sections.push('');
    }

    // Summarized diffs section
    if (processed.summarizedDiffs.length > 0) {
      sections.push(`====== SUMMARIZED DIFFS (${processed.summarizedDiffs.length} files) ======`);
      sections.push(processed.summarizedDiffs.join('\n\n'));
      sections.push('');
    }

    // Metadata only section
    if (processed.metadataOnly.length > 0) {
      sections.push(`====== METADATA ONLY (${processed.metadataOnly.length} files) ======`);
      sections.push(processed.metadataOnly.join('\n'));
      sections.push('');
    }

    sections.push('====== END OF GIT DIFF ======');

    return sections.join('\n');
  }

  /**
   * Parse a single git diff string into file information.
   *
   * Header lines (everything before the first `@@`) supply the path and status;
   * `/dev/null` on the old side means the file was added, on the new side that it
   * was deleted. Inside a hunk every line starting with `+` or `-` is a change, even
   * when the content itself starts with `++` or `--`.
   *
   * @returns `null` for empty or whitespace-only input, otherwise the parsed info.
   */
  private parseDiffFile(diffString: string): IDiffFileInfo | null {
    if (!diffString || diffString.trim().length === 0) {
      return null;
    }

    let oldPath = '';
    let newPath = '';
    let oldIsNull = false;
    let newIsNull = false;
    let linesAdded = 0;
    let linesRemoved = 0;
    let inHunk = false;

    for (const rawLine of diffString.split('\n')) {
      // Tolerate CRLF input so paths do not end in a stray carriage return.
      const line = rawLine.endsWith('\r') ? rawLine.slice(0, -1) : rawLine;

      if (line.startsWith('diff --git ')) {
        // Hunk content always starts with ' ', '+', '-' or '\', so this is a new
        // file header: go back to header parsing and forget the previous header.
        inHunk = false;
        oldPath = '';
        newPath = '';
        oldIsNull = false;
        newIsNull = false;
        continue;
      }

      if (line.startsWith('@@')) {
        inHunk = true;
        continue;
      }

      if (inHunk) {
        if (line.startsWith('+')) {
          linesAdded++;
        } else if (line.startsWith('-')) {
          linesRemoved++;
        }
        continue;
      }

      if (line.startsWith('--- ')) {
        const parsed = this.parseHeaderPath(line.substring(4), 'a/');
        if (parsed === null) {
          oldIsNull = true;
        } else {
          oldPath = parsed;
        }
      } else if (line.startsWith('+++ ')) {
        const parsed = this.parseHeaderPath(line.substring(4), 'b/');
        if (parsed === null) {
          newIsNull = true;
        } else {
          newPath = parsed;
        }
      } else if (line.startsWith('new file mode')) {
        oldIsNull = true;
      } else if (line.startsWith('deleted file mode')) {
        newIsNull = true;
      } else if (line.startsWith('+') && !line.startsWith('+++')) {
        // Diff without hunk headers: fall back to the prefix heuristic.
        linesAdded++;
      } else if (line.startsWith('-') && !line.startsWith('---')) {
        linesRemoved++;
      }
    }

    let status: IDiffFileInfo['status'] = 'modified';
    if (oldIsNull && !newIsNull) {
      status = 'added';
    } else if (newIsNull && !oldIsNull) {
      status = 'deleted';
    }

    // A deleted file only has a meaningful path on the old side.
    const headerPath = status === 'deleted' ? (oldPath || newPath) : (newPath || oldPath);
    const filepath = headerPath || this.extractFilepathFromDiff(diffString);

    return {
      filepath,
      status,
      linesAdded,
      linesRemoved,
      totalLines: linesAdded + linesRemoved,
      estimatedTokens: this.estimateTokens(diffString),
      diffContent: diffString,
    };
  }

  /**
   * Normalize the path part of a `---` / `+++` header line.
   *
   * @param rawTarget Text after the `--- ` / `+++ ` marker.
   * @param prefix Side prefix to strip when present (`a/` or `b/`).
   * @returns The cleaned path, or `null` when the target is `/dev/null`.
   */
  private parseHeaderPath(rawTarget: string, prefix: string): string | null {
    // Anything after a tab (timestamp or git's trailing tab) is not part of the path.
    const tabIndex = rawTarget.indexOf('\t');
    const target = tabIndex === -1 ? rawTarget : rawTarget.slice(0, tabIndex);

    if (target === '/dev/null') {
      return null;
    }
    return target.startsWith(prefix) ? target.slice(prefix.length) : target;
  }

  /** Rough token estimate: one token per four characters, rounded up. */
  private estimateTokens(text: string): number {
    return Math.ceil(text.length / 4);
  }

  /**
   * Prioritize files by importance (source files before build artifacts).
   *
   * Returns a new array; the input is left untouched. Files with equal scores keep
   * their relative input order.
   */
  private prioritizeFiles(files: IDiffFileInfo[]): IDiffFileInfo[] {
    return [...files].sort(
      (a, b) => this.getFileImportanceScore(b.filepath) - this.getFileImportanceScore(a.filepath)
    );
  }

  /**
   * Calculate importance score for a file path (higher means more important).
   *
   * The build-artifact check runs first so that generated files are ranked low
   * even when their extension looks like a config, doc or test file.
   */
  private getFileImportanceScore(filepath: string): number {
    // Build artifacts - low priority
    if (DiffProcessor.BUILD_ARTIFACT_PATTERN.test(filepath)) {
      return 10;
    }

    // Source files - highest priority
    if (DiffProcessor.SOURCE_PATTERN.test(filepath)) {
      return 100;
    }

    // Test files - high priority
    if (DiffProcessor.TEST_PATTERN.test(filepath) || filepath.startsWith('test/')) {
      return 80;
    }

    // Configuration files - medium-high priority
    if (DiffProcessor.CONFIG_PATTERN.test(filepath)) {
      return 60;
    }

    // Documentation - medium priority
    if (DiffProcessor.DOC_PATTERN.test(filepath)) {
      return 40;
    }

    // Everything else - default priority
    return 50;
  }

  /**
   * Extract head and tail lines from a diff, omitting the middle.
   *
   * The header (everything up to and including the first `@@` line) is always kept.
   * When the remaining body has no more lines than `headLines + tailLines` there is
   * nothing to omit, so the whole diff is returned with its status prefix.
   *
   * @param headLines Body lines to keep from the start; negative values count as 0.
   * @param tailLines Body lines to keep from the end; negative values count as 0.
   */
  private extractDiffSample(fileInfo: IDiffFileInfo, headLines: number, tailLines: number): string {
    const headCount = Math.max(0, Math.floor(headLines));
    const tailCount = Math.max(0, Math.floor(tailLines));

    const lines = fileInfo.diffContent.split('\n');
    const firstHunkIndex = lines.findIndex(line => line.startsWith('@@'));
    const bodyStartIndex = firstHunkIndex === -1 ? 0 : firstHunkIndex + 1;
    const headerLines = lines.slice(0, bodyStartIndex);
    const bodyLines = lines.slice(bodyStartIndex);

    if (bodyLines.length <= headCount + tailCount) {
      // Head and tail would meet or overlap - include the file fully
      return `${this.getFileStatusPrefix(fileInfo)}${fileInfo.diffContent}`;
    }

    const head = bodyLines.slice(0, headCount);
    // slice(-0) is slice(0) and would return the entire body, so guard zero.
    const tail = tailCount > 0 ? bodyLines.slice(-tailCount) : [];
    const omittedLines = bodyLines.length - headCount - tailCount;

    const parts: string[] = [];
    parts.push(`${this.getStatusEmoji(fileInfo)} FILE: ${fileInfo.filepath}`);
    parts.push(`CHANGES: +${fileInfo.linesAdded} lines, -${fileInfo.linesRemoved} lines (${fileInfo.totalLines} total)`);
    parts.push('');
    parts.push(...headerLines);
    parts.push(...head);
    parts.push('');
    parts.push(`[... ${omittedLines} lines omitted - use Read tool to see full file ...]`);
    parts.push('');
    parts.push(...tail);

    return parts.join('\n');
  }

  /** Map a file's status to the emoji used in all formatted output. */
  private getStatusEmoji(fileInfo: IDiffFileInfo): string {
    if (fileInfo.status === 'added') {
      return '➕';
    }
    return fileInfo.status === 'deleted' ? '➖' : '📝';
  }

  /**
   * Get file status prefix with emoji
   */
  private getFileStatusPrefix(fileInfo: IDiffFileInfo): string {
    return `${this.getStatusEmoji(fileInfo)} `;
  }

  /**
   * Extract filepath from diff content.
   *
   * Used as a fallback when the diff has no usable `---` / `+++` lines (for example
   * binary or mode-only changes). Checks each line for a `+++ b/` path or a
   * `diff --git a/... b/...` header and returns the first match.
   *
   * @returns The path found, or `'unknown'` when the diff names no file.
   */
  private extractFilepathFromDiff(diffContent: string): string {
    for (const rawLine of diffContent.split('\n')) {
      const line = rawLine.endsWith('\r') ? rawLine.slice(0, -1) : rawLine;

      if (line.startsWith('+++ b/')) {
        return line.substring(6);
      }

      const gitHeader = DiffProcessor.GIT_HEADER_PATTERN.exec(line);
      if (gitHeader?.[1]) {
        return gitHeader[1];
      }
    }
    return 'unknown';
  }

  /**
   * Format file info as metadata only: emoji, path and added/removed line counts.
   */
  private formatMetadataOnly(fileInfo: IDiffFileInfo): string {
    return `${this.getStatusEmoji(fileInfo)} ${fileInfo.filepath} (+${fileInfo.linesAdded}, -${fileInfo.linesRemoved})`;
  }

  /**
   * Generate human-readable summary of processed diff: per-bucket file counts, the
   * token estimate, and a hint when some files were reduced to metadata.
   */
  private generateSummary(result: IProcessedDiff): string {
    const parts: string[] = [];

    parts.push(`Files changed: ${result.totalFiles} total`);
    parts.push(`- ${result.fullDiffs.length} included in full`);
    parts.push(`- ${result.summarizedDiffs.length} summarized (head/tail shown)`);
    parts.push(`- ${result.metadataOnly.length} metadata only`);
    parts.push(`Estimated tokens: ~${result.totalTokens.toLocaleString()}`);

    if (result.metadataOnly.length > 0) {
      parts.push('');
      parts.push('NOTE: Some files excluded to stay within token budget.');
      parts.push('Use Read tool with specific file paths to see full content.');
    }

    return parts.join('\n');
  }
}