feat(context): Add intelligent DiffProcessor to summarize and prioritize git diffs and integrate it into the commit context pipeline

This commit is contained in:
2025-11-04 02:19:57 +00:00
parent f84a65217d
commit 4bf0c02618
7 changed files with 698 additions and 13 deletions

View File

@@ -0,0 +1,341 @@
/**
* Intelligent git diff processor that handles large diffs by sampling and prioritization
* instead of blind truncation.
*/
/**
 * Parsed description of one file's diff, as produced by the diff processor
 * from a single diff string.
 */
export interface IDiffFileInfo {
// Path of the changed file, taken from the diff's header lines.
filepath: string;
// Kind of change recorded for the file.
status: 'added' | 'modified' | 'deleted';
// Number of added ('+') lines counted in the diff.
linesAdded: number;
// Number of removed ('-') lines counted in the diff.
linesRemoved: number;
// linesAdded + linesRemoved (changed lines, not the length of the diff text).
totalLines: number;
// Rough token estimate: diff length in characters divided by 4, rounded up.
estimatedTokens: number;
// The diff text this info was parsed from, unmodified.
diffContent: string;
}
/**
 * Result of processing a set of diffs: every parsed file ends up in exactly
 * one of fullDiffs, summarizedDiffs or metadataOnly.
 */
export interface IProcessedDiff {
summary: string; // Human-readable overview
fullDiffs: string[]; // Small files included fully
summarizedDiffs: string[]; // Medium files with head/tail
metadataOnly: string[]; // Large files, just stats
totalFiles: number; // Number of diffs that were parsed successfully
totalTokens: number; // Estimated tokens used by the full and summarized diffs
}
/**
 * Tuning knobs for the diff processor. Every field is optional; a missing
 * (undefined or null) value falls back to the default noted on its line.
 * File sizes are measured in changed lines (added + removed).
 */
export interface IDiffProcessorOptions {
maxDiffTokens?: number; // Maximum tokens for entire diff section (default: 100000)
smallFileLines?: number; // Files <= this are included fully (default: 50)
mediumFileLines?: number; // Files <= this are summarized (default: 200)
sampleHeadLines?: number; // Lines to show at start of medium files (default: 20)
sampleTailLines?: number; // Lines to show at end of medium files (default: 20)
}
/**
 * Turns a list of per-file git diffs into a token-budgeted digest.
 *
 * Each diff is parsed, ranked by how important its path looks, and then placed
 * into one of three buckets: included in full, sampled (head/tail), or listed
 * as metadata only. Token counts are rough estimates (characters / 4).
 */
export class DiffProcessor {
  /** Characters assumed per token when estimating token usage. */
  private static readonly CHARS_PER_TOKEN = 4;

  private readonly options: Required<IDiffProcessorOptions>;

  /**
   * @param options Optional overrides; any missing field uses its default.
   */
  constructor(options: IDiffProcessorOptions = {}) {
    this.options = {
      maxDiffTokens: options.maxDiffTokens ?? 100000,
      smallFileLines: options.smallFileLines ?? 50,
      mediumFileLines: options.mediumFileLines ?? 200,
      sampleHeadLines: options.sampleHeadLines ?? 20,
      sampleTailLines: options.sampleTailLines ?? 20,
    };
  }

  /**
   * Process an array of git diffs into a structured, token-efficient format.
   *
   * @param diffStringArray One diff string per file; empty/blank entries are skipped.
   * @returns The bucketed diffs plus a human-readable summary.
   */
  public processDiffs(diffStringArray: string[]): IProcessedDiff {
    // Parse all diffs into file info objects, dropping unparseable (blank) ones.
    const fileInfos = diffStringArray
      .map((diffString) => this.parseDiffFile(diffString))
      .filter((info): info is IDiffFileInfo => info !== null);

    // Prioritize files (source files first, build artifacts last) so the
    // most relevant files get first claim on the token budget.
    const prioritized = this.prioritizeFiles(fileInfos);

    const result: IProcessedDiff = {
      summary: '',
      fullDiffs: [],
      summarizedDiffs: [],
      metadataOnly: [],
      totalFiles: prioritized.length,
      totalTokens: 0,
    };

    let tokensUsed = 0;
    const tokenBudget = this.options.maxDiffTokens;

    for (const fileInfo of prioritized) {
      const remainingBudget = tokenBudget - tokensUsed;

      if (remainingBudget <= 0) {
        // Budget exhausted - rest are metadata only
        result.metadataOnly.push(this.formatMetadataOnly(fileInfo));
        continue;
      }

      if (fileInfo.totalLines <= this.options.smallFileLines) {
        // Small file - include fully if budget allows
        if (fileInfo.estimatedTokens <= remainingBudget) {
          const statusPrefix = this.getFileStatusPrefix(fileInfo);
          result.fullDiffs.push(`${statusPrefix}${fileInfo.diffContent}`);
          tokensUsed += fileInfo.estimatedTokens;
        } else {
          result.metadataOnly.push(this.formatMetadataOnly(fileInfo));
        }
      } else if (fileInfo.totalLines <= this.options.mediumFileLines) {
        // Medium file - try to include summary with head/tail
        const summary = this.extractDiffSample(
          fileInfo,
          this.options.sampleHeadLines,
          this.options.sampleTailLines
        );
        const summaryTokens = Math.ceil(summary.length / DiffProcessor.CHARS_PER_TOKEN); // Rough estimate
        if (summaryTokens <= remainingBudget) {
          result.summarizedDiffs.push(summary);
          tokensUsed += summaryTokens;
        } else {
          result.metadataOnly.push(this.formatMetadataOnly(fileInfo));
        }
      } else {
        // Large file - metadata only
        result.metadataOnly.push(this.formatMetadataOnly(fileInfo));
      }
    }

    result.totalTokens = tokensUsed;
    result.summary = this.generateSummary(result);
    return result;
  }

  /**
   * Format the processed diff for inclusion in context.
   *
   * @param processed Result of {@link processDiffs}.
   * @returns A single string with delimited sections; empty buckets are omitted.
   */
  public formatForContext(processed: IProcessedDiff): string {
    const sections: string[] = [];

    // Summary section
    sections.push('====== GIT DIFF SUMMARY ======');
    sections.push(processed.summary);
    sections.push('');

    // Full diffs section
    if (processed.fullDiffs.length > 0) {
      sections.push(`====== FULL DIFFS (${processed.fullDiffs.length} files) ======`);
      sections.push(processed.fullDiffs.join('\n\n'));
      sections.push('');
    }

    // Summarized diffs section
    if (processed.summarizedDiffs.length > 0) {
      sections.push(`====== SUMMARIZED DIFFS (${processed.summarizedDiffs.length} files) ======`);
      sections.push(processed.summarizedDiffs.join('\n\n'));
      sections.push('');
    }

    // Metadata only section
    if (processed.metadataOnly.length > 0) {
      sections.push(`====== METADATA ONLY (${processed.metadataOnly.length} files) ======`);
      sections.push(processed.metadataOnly.join('\n'));
      sections.push('');
    }

    sections.push('====== END OF GIT DIFF ======');
    return sections.join('\n');
  }

  /**
   * Parse a single git diff string into file information.
   *
   * Header lines (everything before the first `@@` hunk marker) provide the
   * path and status; lines inside hunks are counted as added/removed purely by
   * their first character, so content such as `+++counter;` or `--- comment`
   * is counted correctly instead of being mistaken for a header.
   *
   * @param diffString Diff text for one file.
   * @returns The parsed info, or null when the input is empty or whitespace only.
   */
  private parseDiffFile(diffString: string): IDiffFileInfo | null {
    if (!diffString || diffString.trim().length === 0) {
      return null;
    }

    let filepath = '';
    let fallbackPath = ''; // from the `diff --git` line, used when no ---/+++ header exists
    let status: 'added' | 'modified' | 'deleted' = 'modified';
    let oldSideIsNull = false;
    let linesAdded = 0;
    let linesRemoved = 0;
    let inHunk = false;

    for (const rawLine of diffString.split('\n')) {
      // Tolerate CRLF input so paths do not end in '\r'.
      const line = rawLine.endsWith('\r') ? rawLine.slice(0, -1) : rawLine;

      if (line.startsWith('diff --git ')) {
        // A new file header always ends the previous hunk.
        inHunk = false;
        if (!fallbackPath) {
          const match = /^diff --git a\/(.+?) b\/(.+)$/.exec(line);
          fallbackPath = match?.[2] ?? '';
        }
        continue;
      }

      if (line.startsWith('@@')) {
        inHunk = true;
        continue;
      }

      if (inHunk) {
        // Inside a hunk the first character alone decides the line kind.
        if (line.startsWith('+')) {
          linesAdded++;
        } else if (line.startsWith('-')) {
          linesRemoved++;
        }
        continue;
      }

      // Header section.
      if (line.startsWith('new file mode')) {
        status = 'added';
      } else if (line.startsWith('deleted file mode')) {
        status = 'deleted';
      } else if (line.startsWith('--- /dev/null')) {
        oldSideIsNull = true;
      } else if (line.startsWith('+++ /dev/null')) {
        status = 'deleted'; // keep the path taken from the `--- a/` line
      } else if (line.startsWith('--- a/')) {
        filepath = line.substring(6);
      } else if (line.startsWith('+++ b/')) {
        filepath = line.substring(6);
        if (oldSideIsNull) {
          status = 'added';
        }
      } else if (line.startsWith('+') && !line.startsWith('+++')) {
        // Hunk-less input: keep counting bare +/- lines.
        linesAdded++;
      } else if (line.startsWith('-') && !line.startsWith('---')) {
        linesRemoved++;
      }
    }

    return {
      filepath: filepath || fallbackPath,
      status,
      linesAdded,
      linesRemoved,
      totalLines: linesAdded + linesRemoved,
      estimatedTokens: Math.ceil(diffString.length / DiffProcessor.CHARS_PER_TOKEN),
      diffContent: diffString,
    };
  }

  /**
   * Prioritize files by importance (source files before build artifacts).
   *
   * @param files Files to rank; the array itself is left untouched.
   * @returns A new array ordered from highest to lowest importance score.
   */
  private prioritizeFiles(files: IDiffFileInfo[]): IDiffFileInfo[] {
    // Copy first: Array.prototype.sort reorders in place.
    return [...files].sort(
      (a, b) => this.getFileImportanceScore(b.filepath) - this.getFileImportanceScore(a.filepath)
    );
  }

  /**
   * Calculate importance score for a file path (higher means more important).
   *
   * @param filepath Repository-relative path.
   * @returns 10 for build output, 100 for source directories, 80 for tests,
   *          60 for configuration, 40 for documentation, otherwise 50.
   */
  private getFileImportanceScore(filepath: string): number {
    // Build artifacts - low priority. Checked first so generated files such as
    // dist/manifest.json are not promoted by the extension-based rules below.
    if (/^(dist|build|out|\.next|public\/dist)\//.test(filepath)) {
      return 10;
    }
    // Source files - highest priority
    if (/^(src|lib|app|components|pages|api)\//.test(filepath)) {
      return 100;
    }
    // Test files - high priority
    if (/\.(test|spec)\.(ts|js|tsx|jsx)$/.test(filepath) || filepath.startsWith('test/')) {
      return 80;
    }
    // Configuration files - medium-high priority
    if (/\.(json|yaml|yml|toml|config\.(ts|js))$/.test(filepath)) {
      return 60;
    }
    // Documentation - medium priority
    if (/\.(md|txt|rst)$/.test(filepath)) {
      return 40;
    }
    // Everything else - default priority
    return 50;
  }

  /**
   * Extract head and tail lines from a diff, omitting the middle.
   *
   * The header (everything up to and including the first `@@` line) is always
   * kept. When the remaining body has no more lines than head + tail there is
   * nothing to omit, so the diff is returned unchanged.
   *
   * @param fileInfo Parsed file whose diffContent is sampled.
   * @param headLines Body lines to keep from the start (negative is treated as 0).
   * @param tailLines Body lines to keep from the end (negative is treated as 0).
   * @returns The sampled diff with a banner and an omission marker, or the
   *          original diff text when it is short enough.
   */
  private extractDiffSample(fileInfo: IDiffFileInfo, headLines: number, tailLines: number): string {
    const headCount = Math.max(0, headLines);
    const tailCount = Math.max(0, tailLines);
    const lines = fileInfo.diffContent.split('\n');

    // Split header from body at the first hunk marker (no marker: all body).
    const firstHunkIndex = lines.findIndex((line) => line.startsWith('@@'));
    const bodyStartIndex = firstHunkIndex === -1 ? 0 : firstHunkIndex + 1;
    const headerLines = lines.slice(0, bodyStartIndex);
    const bodyLines = lines.slice(bodyStartIndex);

    if (bodyLines.length <= headCount + tailCount) {
      // Head and tail would touch or overlap - include the diff fully.
      return fileInfo.diffContent;
    }

    const head = bodyLines.slice(0, headCount);
    // Explicit start index: slice(-0) would return the whole body.
    const tail = bodyLines.slice(bodyLines.length - tailCount);
    const omittedLines = bodyLines.length - headCount - tailCount;
    const statusEmoji = this.getStatusMarker(fileInfo.status);

    return [
      `${statusEmoji} FILE: ${fileInfo.filepath}`,
      `CHANGES: +${fileInfo.linesAdded} lines, -${fileInfo.linesRemoved} lines (${fileInfo.totalLines} total)`,
      '',
      ...headerLines,
      ...head,
      '',
      `[... ${omittedLines} lines omitted - use Read tool to see full file ...]`,
      '',
      ...tail,
    ].join('\n');
  }

  /**
   * Marker shown in front of a file entry for the given status.
   *
   * NOTE(review): the markers for 'added' and 'deleted' are empty strings in
   * this source, possibly glyphs lost in transit - confirm the intended values.
   */
  private getStatusMarker(status: 'added' | 'modified' | 'deleted'): string {
    return status === 'added' ? '' : status === 'deleted' ? '' : '📝';
  }

  /**
   * Get file status prefix with emoji (marker followed by one space).
   */
  private getFileStatusPrefix(fileInfo: IDiffFileInfo): string {
    return `${this.getStatusMarker(fileInfo.status)} `;
  }

  /**
   * Extract filepath from diff content.
   *
   * @returns The path on the first `+++ b/` line, or 'unknown' if there is none.
   */
  private extractFilepathFromDiff(diffContent: string): string {
    const lines = diffContent.split('\n');
    for (const line of lines) {
      if (line.startsWith('+++ b/')) {
        return line.substring(6);
      }
    }
    return 'unknown';
  }

  /**
   * Format file info as metadata only: marker, path and +/- line counts.
   */
  private formatMetadataOnly(fileInfo: IDiffFileInfo): string {
    const statusEmoji = this.getStatusMarker(fileInfo.status);
    return `${statusEmoji} ${fileInfo.filepath} (+${fileInfo.linesAdded}, -${fileInfo.linesRemoved})`;
  }

  /**
   * Generate human-readable summary of processed diff.
   *
   * @param result Bucketed result whose counts and token total are reported.
   * @returns Multi-line summary; a note is appended when any file is metadata only.
   */
  private generateSummary(result: IProcessedDiff): string {
    const parts: string[] = [];
    parts.push(`Files changed: ${result.totalFiles} total`);
    parts.push(`- ${result.fullDiffs.length} included in full`);
    parts.push(`- ${result.summarizedDiffs.length} summarized (head/tail shown)`);
    parts.push(`- ${result.metadataOnly.length} metadata only`);
    parts.push(`Estimated tokens: ~${result.totalTokens.toLocaleString()}`);
    if (result.metadataOnly.length > 0) {
      parts.push('');
      parts.push('NOTE: Some files excluded to stay within token budget.');
      parts.push('Use Read tool with specific file paths to see full content.');
    }
    return parts.join('\n');
  }
}