Files
tsdoc/ts/context/diff-processor.ts

342 lines
11 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Intelligent git diff processor that handles large diffs by sampling and prioritization
* instead of blind truncation.
*/
/**
 * Parsed description of a single file's diff.
 */
export interface IDiffFileInfo {
/** Path of the changed file as read from the diff header. */
filepath: string;
/** Kind of change the diff represents for this file. */
status: 'added' | 'modified' | 'deleted';
/** Number of added ('+') lines counted in the diff. */
linesAdded: number;
/** Number of removed ('-') lines counted in the diff. */
linesRemoved: number;
/** Changed-line total: linesAdded + linesRemoved (context lines are not included). */
totalLines: number;
/** Rough token estimate: diff text length in characters divided by 4, rounded up. */
estimatedTokens: number;
/** The diff text this info was parsed from, unmodified. */
diffContent: string;
}
/**
 * Result of processing a set of diffs: every parsed file ends up in exactly one of
 * fullDiffs, summarizedDiffs or metadataOnly.
 */
export interface IProcessedDiff {
summary: string; // Human-readable overview (file counts per bucket, estimated tokens)
fullDiffs: string[]; // Small files included fully
summarizedDiffs: string[]; // Medium files with head/tail
metadataOnly: string[]; // Just path and +/- counts: large files and files that did not fit the token budget
totalFiles: number; // Number of diffs that were successfully parsed
totalTokens: number; // Estimated tokens used by fullDiffs + summarizedDiffs (metadata entries are not counted)
}
/**
 * Tuning knobs for DiffProcessor. Every field is optional; omitted fields fall back
 * to the defaults noted on each line. File sizes are measured in changed lines
 * (added + removed).
 */
export interface IDiffProcessorOptions {
maxDiffTokens?: number; // Maximum tokens for entire diff section (default: 100000)
smallFileLines?: number; // Files <= this are included fully (default: 50)
mediumFileLines?: number; // Files <= this are summarized (default: 200)
sampleHeadLines?: number; // Lines to show at start of medium files (default: 20)
sampleTailLines?: number; // Lines to show at end of medium files (default: 20)
}
/**
 * Converts a list of per-file git diffs into a token-budgeted digest.
 *
 * Files are ranked by path-based importance and then placed into one of three buckets:
 * included in full, sampled (head/tail of the diff body), or listed as metadata only.
 */
export class DiffProcessor {
  /**
   * Ordered path rules used by getFileImportanceScore; the first matching rule wins.
   * Build output is checked first so that e.g. `dist/bundle.json` is not promoted by
   * the generic config-file extension rule.
   */
  private static readonly importanceRules: ReadonlyArray<{ pattern: RegExp; score: number }> = [
    // Build artifacts - low priority
    { pattern: /^(dist|build|out|\.next|public\/dist)\//, score: 10 },
    // Source files - highest priority
    { pattern: /^(src|lib|app|components|pages|api)\//, score: 100 },
    // Test files - high priority
    { pattern: /\.(test|spec)\.(ts|js|tsx|jsx)$/, score: 80 },
    { pattern: /^test\//, score: 80 },
    // Configuration files - medium-high priority
    { pattern: /\.(json|yaml|yml|toml|config\.(ts|js))$/, score: 60 },
    // Documentation - medium priority
    { pattern: /\.(md|txt|rst)$/, score: 40 },
  ];

  private readonly options: Required<IDiffProcessorOptions>;

  /**
   * @param options Optional overrides; any omitted field falls back to its default.
   */
  constructor(options: IDiffProcessorOptions = {}) {
    this.options = {
      maxDiffTokens: options.maxDiffTokens ?? 100000,
      smallFileLines: options.smallFileLines ?? 50,
      mediumFileLines: options.mediumFileLines ?? 200,
      sampleHeadLines: options.sampleHeadLines ?? 20,
      sampleTailLines: options.sampleTailLines ?? 20,
    };
  }

  /**
   * Process an array of git diffs into a structured, token-efficient format.
   *
   * @param diffStringArray One diff string per file; empty or whitespace-only entries are skipped.
   * @returns The bucketed diffs plus a human-readable summary and the estimated token usage.
   */
  public processDiffs(diffStringArray: string[]): IProcessedDiff {
    // Parse all diffs into file info objects, dropping entries that could not be parsed
    const fileInfos = diffStringArray
      .map((diffString) => this.parseDiffFile(diffString))
      .filter((info): info is IDiffFileInfo => info !== null);

    // Prioritize files (source files first, build artifacts last)
    const prioritized = this.prioritizeFiles(fileInfos);

    const result: IProcessedDiff = {
      summary: '',
      fullDiffs: [],
      summarizedDiffs: [],
      metadataOnly: [],
      totalFiles: prioritized.length,
      totalTokens: 0,
    };

    let tokensUsed = 0;
    const tokenBudget = this.options.maxDiffTokens;

    // Categorize and include files based on size and token budget
    for (const fileInfo of prioritized) {
      const remainingBudget = tokenBudget - tokensUsed;

      if (remainingBudget <= 0) {
        // Budget exhausted - rest are metadata only
        result.metadataOnly.push(this.formatMetadataOnly(fileInfo));
        continue;
      }

      if (fileInfo.totalLines <= this.options.smallFileLines) {
        // Small file - include fully if budget allows
        if (fileInfo.estimatedTokens <= remainingBudget) {
          result.fullDiffs.push(`${this.getFileStatusPrefix(fileInfo)}${fileInfo.diffContent}`);
          tokensUsed += fileInfo.estimatedTokens;
        } else {
          result.metadataOnly.push(this.formatMetadataOnly(fileInfo));
        }
      } else if (fileInfo.totalLines <= this.options.mediumFileLines) {
        // Medium file - try to include a head/tail sample
        const sample = this.extractDiffSample(
          fileInfo,
          this.options.sampleHeadLines,
          this.options.sampleTailLines
        );
        const sampleTokens = this.estimateTokens(sample);
        if (sampleTokens <= remainingBudget) {
          result.summarizedDiffs.push(sample);
          tokensUsed += sampleTokens;
        } else {
          result.metadataOnly.push(this.formatMetadataOnly(fileInfo));
        }
      } else {
        // Large file - metadata only
        result.metadataOnly.push(this.formatMetadataOnly(fileInfo));
      }
    }

    result.totalTokens = tokensUsed;
    result.summary = this.generateSummary(result);
    return result;
  }

  /**
   * Format the processed diff for inclusion in context.
   *
   * @param processed Result of processDiffs.
   * @returns A single string with a summary section followed by one section per non-empty bucket.
   */
  public formatForContext(processed: IProcessedDiff): string {
    const sections: string[] = [];

    // Summary section
    sections.push('====== GIT DIFF SUMMARY ======');
    sections.push(processed.summary);
    sections.push('');

    // Full diffs section
    if (processed.fullDiffs.length > 0) {
      sections.push(`====== FULL DIFFS (${processed.fullDiffs.length} files) ======`);
      sections.push(processed.fullDiffs.join('\n\n'));
      sections.push('');
    }

    // Summarized diffs section
    if (processed.summarizedDiffs.length > 0) {
      sections.push(`====== SUMMARIZED DIFFS (${processed.summarizedDiffs.length} files) ======`);
      sections.push(processed.summarizedDiffs.join('\n\n'));
      sections.push('');
    }

    // Metadata only section
    if (processed.metadataOnly.length > 0) {
      sections.push(`====== METADATA ONLY (${processed.metadataOnly.length} files) ======`);
      sections.push(processed.metadataOnly.join('\n'));
      sections.push('');
    }

    sections.push('====== END OF GIT DIFF ======');
    return sections.join('\n');
  }

  /**
   * Parse a single git diff string into file information.
   *
   * Header lines (everything before the first `@@` hunk marker) supply the path and
   * status; lines inside hunks are only counted as additions/removals, so a changed
   * line whose content happens to start with `--` or `++` is neither skipped nor
   * mistaken for a file header.
   *
   * @param diffString Diff text for one file.
   * @returns The parsed info, or null when the input is empty or whitespace-only.
   */
  private parseDiffFile(diffString: string): IDiffFileInfo | null {
    if (!diffString || diffString.trim().length === 0) {
      return null;
    }

    let oldPath = '';
    let newPath = '';
    let gitHeaderPath = '';
    let sawNewFileMode = false;
    let sawDeletedFileMode = false;
    let linesAdded = 0;
    let linesRemoved = 0;
    let inHunk = false;

    for (const rawLine of diffString.split('\n')) {
      // Tolerate CRLF input so comparisons such as '/dev/null' still match
      const line = rawLine.endsWith('\r') ? rawLine.slice(0, -1) : rawLine;

      if (line.startsWith('diff --git ')) {
        // Start of a file header; remember the path as a fallback for diffs
        // that carry no ---/+++ lines (binary or mode-only changes)
        inHunk = false;
        const match = /^diff --git a\/(.*) b\/(.*)$/.exec(line);
        if (match) {
          gitHeaderPath = match[2] ?? '';
        }
        continue;
      }

      if (line.startsWith('@@')) {
        inHunk = true;
        continue;
      }

      if (inHunk) {
        if (line.startsWith('+')) {
          linesAdded++;
        } else if (line.startsWith('-')) {
          linesRemoved++;
        }
        continue;
      }

      // Header section
      if (line.startsWith('--- ')) {
        oldPath = this.cleanHeaderPath(line.substring(4), 'a/');
      } else if (line.startsWith('+++ ')) {
        newPath = this.cleanHeaderPath(line.substring(4), 'b/');
      } else if (line.startsWith('new file mode')) {
        sawNewFileMode = true;
      } else if (line.startsWith('deleted file mode')) {
        sawDeletedFileMode = true;
      } else if (line.startsWith('+') && !line.startsWith('+++')) {
        // Input without hunk markers: still count changed lines
        linesAdded++;
      } else if (line.startsWith('-') && !line.startsWith('---')) {
        linesRemoved++;
      }
    }

    let status: IDiffFileInfo['status'] = 'modified';
    if (sawNewFileMode || oldPath === '/dev/null') {
      status = 'added';
    } else if (sawDeletedFileMode || newPath === '/dev/null') {
      status = 'deleted';
    }

    // A deleted file only has a meaningful old path; otherwise prefer the new path
    const candidates =
      status === 'deleted' ? [oldPath, gitHeaderPath, newPath] : [newPath, gitHeaderPath, oldPath];
    const filepath = candidates.find((path) => path !== '' && path !== '/dev/null') ?? '';

    return {
      filepath,
      status,
      linesAdded,
      linesRemoved,
      totalLines: linesAdded + linesRemoved,
      estimatedTokens: this.estimateTokens(diffString),
      diffContent: diffString,
    };
  }

  /**
   * Normalize the path part of a `---` / `+++` header line: drop anything after a
   * tab and remove the given side prefix when present. `/dev/null` is returned as is.
   */
  private cleanHeaderPath(rawPath: string, sidePrefix: 'a/' | 'b/'): string {
    const path = rawPath.split('\t')[0] ?? '';
    if (path === '/dev/null') {
      return path;
    }
    return path.startsWith(sidePrefix) ? path.substring(sidePrefix.length) : path;
  }

  /**
   * Rough token estimate for a piece of text (one token per four characters, rounded up).
   */
  private estimateTokens(text: string): number {
    return Math.ceil(text.length / 4);
  }

  /**
   * Prioritize files by importance (source files before build artifacts).
   *
   * @returns A new array ordered by descending importance score; the input is not modified.
   */
  private prioritizeFiles(files: IDiffFileInfo[]): IDiffFileInfo[] {
    return [...files].sort(
      (a, b) => this.getFileImportanceScore(b.filepath) - this.getFileImportanceScore(a.filepath)
    );
  }

  /**
   * Calculate importance score for a file path.
   *
   * @returns The score of the first matching rule, or 50 when no rule matches.
   */
  private getFileImportanceScore(filepath: string): number {
    for (const rule of DiffProcessor.importanceRules) {
      if (rule.pattern.test(filepath)) {
        return rule.score;
      }
    }
    // Everything else - default priority
    return 50;
  }

  /**
   * Extract head and tail lines from a diff, omitting the middle.
   *
   * The diff is returned unchanged when there is nothing to omit, i.e. when the body
   * (the lines after the first `@@` marker) has at most headLines + tailLines lines.
   *
   * @param fileInfo Parsed info for the file being sampled.
   * @param headLines Number of body lines to keep from the start.
   * @param tailLines Number of body lines to keep from the end.
   */
  private extractDiffSample(fileInfo: IDiffFileInfo, headLines: number, tailLines: number): string {
    const headCount = Math.max(0, headLines);
    const tailCount = Math.max(0, tailLines);
    const lines = fileInfo.diffContent.split('\n');

    if (lines.length <= headCount + tailCount) {
      // File is small enough to include fully
      return fileInfo.diffContent;
    }

    // Everything up to and including the first hunk marker is kept verbatim as the header
    const firstHunkIndex = lines.findIndex((line) => line.startsWith('@@'));
    const headerLines = firstHunkIndex >= 0 ? lines.slice(0, firstHunkIndex + 1) : [];
    const bodyLines = firstHunkIndex >= 0 ? lines.slice(firstHunkIndex + 1) : lines;

    if (bodyLines.length <= headCount + tailCount) {
      // Head and tail would overlap, so nothing would actually be omitted
      return fileInfo.diffContent;
    }

    const head = bodyLines.slice(0, headCount);
    // slice(-0) would return the whole array, so a zero tail is handled explicitly
    const tail = tailCount > 0 ? bodyLines.slice(-tailCount) : [];
    const omittedLines = bodyLines.length - headCount - tailCount;

    const parts: string[] = [];
    parts.push(`${this.getStatusEmoji(fileInfo.status)} FILE: ${fileInfo.filepath}`);
    parts.push(`CHANGES: +${fileInfo.linesAdded} lines, -${fileInfo.linesRemoved} lines (${fileInfo.totalLines} total)`);
    parts.push('');
    parts.push(...headerLines);
    parts.push(...head);
    parts.push('');
    parts.push(`[... ${omittedLines} lines omitted - use Read tool to see full file ...]`);
    parts.push('');
    parts.push(...tail);
    return parts.join('\n');
  }

  /**
   * Marker glyph placed in front of a file entry; each status has its own glyph so
   * added, deleted and modified files can be told apart in the output.
   */
  private getStatusEmoji(status: IDiffFileInfo['status']): string {
    switch (status) {
      case 'added':
        return '➕';
      case 'deleted':
        return '❌';
      default:
        return '📝';
    }
  }

  /**
   * Get file status prefix with emoji (marker followed by a single space).
   */
  private getFileStatusPrefix(fileInfo: IDiffFileInfo): string {
    return `${this.getStatusEmoji(fileInfo.status)} `;
  }

  /**
   * Extract filepath from diff content: the path on the first `+++ b/` line,
   * or 'unknown' when there is no such line.
   *
   * Not called from within this class; kept for compatibility.
   */
  private extractFilepathFromDiff(diffContent: string): string {
    const lines = diffContent.split('\n');
    for (const line of lines) {
      if (line.startsWith('+++ b/')) {
        return line.substring(6);
      }
    }
    return 'unknown';
  }

  /**
   * Format file info as metadata only: status marker, path and +/- line counts.
   */
  private formatMetadataOnly(fileInfo: IDiffFileInfo): string {
    return `${this.getStatusEmoji(fileInfo.status)} ${fileInfo.filepath} (+${fileInfo.linesAdded}, -${fileInfo.linesRemoved})`;
  }

  /**
   * Generate human-readable summary of processed diff.
   *
   * Reads the bucket sizes, totalFiles and totalTokens from the given result; a note
   * about excluded content is appended when any file is listed as metadata only.
   */
  private generateSummary(result: IProcessedDiff): string {
    const parts: string[] = [];
    parts.push(`Files changed: ${result.totalFiles} total`);
    parts.push(`- ${result.fullDiffs.length} included in full`);
    parts.push(`- ${result.summarizedDiffs.length} summarized (head/tail shown)`);
    parts.push(`- ${result.metadataOnly.length} metadata only`);
    parts.push(`Estimated tokens: ~${result.totalTokens.toLocaleString()}`);

    if (result.metadataOnly.length > 0) {
      parts.push('');
      parts.push('NOTE: Some files excluded to stay within token budget.');
      parts.push('Use Read tool with specific file paths to see full content.');
    }

    return parts.join('\n');
  }
}