// tsdoc/ts/context/iterative-context-builder.ts

import * as plugins from '../plugins.js';
import * as fs from 'fs';
import { logger } from '../logging.js';
import type {
  TaskType,
  IFileMetadata,
  IFileInfo,
  IIterativeContextResult,
  IIterationState,
  IFileSelectionDecision,
  IContextSufficiencyDecision,
  IIterativeConfig,
} from './types.js';
import { LazyFileLoader } from './lazy-file-loader.js';
import { ContextCache } from './context-cache.js';
import { ContextAnalyzer } from './context-analyzer.js';
import { ConfigManager } from './config-manager.js';

/**
 * Iterative context builder that uses AI to intelligently select files
 * across multiple iterations until sufficient context is gathered.
 *
 * Workflow (see {@link IterativeContextBuilder.buildContextIteratively}):
 * 1. scan the project for candidate files and analyze them,
 * 2. optionally seed the context with caller-supplied text (e.g. a git diff),
 * 3. repeatedly ask the model which files to load, load them while the token
 *    budget allows, and ask the model whether the context is now sufficient.
 *
 * `initialize()` must be awaited before `buildContextIteratively()` is called.
 */
export class IterativeContextBuilder {
  private readonly projectRoot: string;
  private readonly lazyLoader: LazyFileLoader;
  private readonly cache: ContextCache;
  private readonly analyzer: ContextAnalyzer;
  private readonly config: Required<IIterativeConfig>;
  private tokenBudget: number = 190000;
  // Assigned in initialize(); guarded at use in chatForJson().
  private openaiInstance!: plugins.smartai.OpenAiProvider;
  private externalOpenaiInstance?: plugins.smartai.OpenAiProvider;

  /**
   * Creates a new IterativeContextBuilder
   * @param projectRoot - Root directory of the project
   * @param config - Iterative configuration; missing fields fall back to defaults
   * @param openaiInstance - Optional pre-configured OpenAI provider instance
   */
  constructor(
    projectRoot: string,
    config?: Partial<IIterativeConfig>,
    openaiInstance?: plugins.smartai.OpenAiProvider
  ) {
    this.projectRoot = projectRoot;
    this.lazyLoader = new LazyFileLoader(projectRoot);
    this.cache = new ContextCache(projectRoot);
    this.analyzer = new ContextAnalyzer(projectRoot);
    this.externalOpenaiInstance = openaiInstance;

    // Default configuration
    this.config = {
      maxIterations: config?.maxIterations ?? 5,
      firstPassFileLimit: config?.firstPassFileLimit ?? 10,
      subsequentPassFileLimit: config?.subsequentPassFileLimit ?? 5,
      temperature: config?.temperature ?? 0.3,
      model: config?.model ?? 'gpt-4-turbo-preview',
    };
  }

  /**
   * Initialize the builder: prepares the cache, reads the token budget from the
   * ConfigManager and sets up the OpenAI provider.
   *
   * @throws Error when no provider was passed to the constructor and the
   *   `OPENAI_TOKEN` environment variable is not available.
   */
  public async initialize(): Promise<void> {
    await this.cache.init();
    const configManager = ConfigManager.getInstance();
    await configManager.initialize(this.projectRoot);
    this.tokenBudget = configManager.getMaxTokens();

    // Use external OpenAI instance if provided, otherwise create a new one
    if (this.externalOpenaiInstance) {
      this.openaiInstance = this.externalOpenaiInstance;
      return;
    }

    // Initialize OpenAI instance from environment
    const qenvInstance = new plugins.qenv.Qenv();
    const openaiToken = await qenvInstance.getEnvVarOnDemand('OPENAI_TOKEN');
    if (!openaiToken) {
      throw new Error('OPENAI_TOKEN environment variable is required for iterative context building');
    }
    this.openaiInstance = new plugins.smartai.OpenAiProvider({
      openaiToken,
    });
    await this.openaiInstance.start();
  }

  /**
   * Build context iteratively using AI decision making
   * @param taskType - Type of task being performed
   * @param additionalContext - Optional additional context (e.g., git diff for commit tasks)
   * @returns Complete iterative context result
   * @throws Error when the additional context still exceeds the hard token
   *   limit after truncation, or when the builder was not initialized.
   */
  public async buildContextIteratively(taskType: TaskType, additionalContext?: string): Promise<IIterativeContextResult> {
    const startTime = Date.now();
    logger.log('info', '🤖 Starting iterative context building...');
    logger.log('info', `   Task: ${taskType}, Budget: ${this.tokenBudget} tokens, Max iterations: ${this.config.maxIterations}`);

    // Phase 1: Scan project files for metadata
    logger.log('info', '📋 Scanning project files...');
    const metadata = await this.scanProjectFiles(taskType);
    const totalEstimatedTokens = metadata.reduce((sum, m) => sum + m.estimatedTokens, 0);
    logger.log('info', `   Found ${metadata.length} files (~${totalEstimatedTokens} estimated tokens)`);

    // Phase 2: Analyze files for initial prioritization
    logger.log('info', '🔍 Analyzing file dependencies and importance...');
    const analysis = await this.analyzer.analyze(metadata, taskType, []);
    logger.log('info', `   Analysis complete in ${analysis.analysisDuration}ms`);

    // Track state across iterations
    const iterations: IIterationState[] = [];
    let totalTokensUsed = 0;
    let apiCallCount = 0;
    let loadedContent = '';
    const includedFiles: IFileInfo[] = [];
    // Absolute paths of files already in the context, so a file requested
    // twice is neither appended twice nor charged twice against the budget.
    const loadedPaths = new Set<string>();

    // If additional context (e.g., git diff) is provided, prepend it
    if (additionalContext) {
      const diff = this.buildDiffSection(additionalContext);
      loadedContent = diff.section;
      totalTokensUsed += diff.tokens;
      logger.log('info', `📝 Added git diff to context (${diff.tokens.toLocaleString()} tokens)`);
    }

    // Phase 3: Iterative file selection and loading
    for (let iteration = 1; iteration <= this.config.maxIterations; iteration++) {
      const iterationStart = Date.now();
      logger.log('info', `\n🤔 Iteration ${iteration}/${this.config.maxIterations}: Asking AI which files to examine...`);

      const remainingBudget = this.tokenBudget - totalTokensUsed;
      logger.log('info', `   Token budget remaining: ${remainingBudget}/${this.tokenBudget} (${Math.round((remainingBudget / this.tokenBudget) * 100)}%)`);

      // Get AI decision on which files to load
      const decision = await this.getFileSelectionDecision(
        metadata,
        analysis.files.slice(0, 30), // Top 30 files by importance
        taskType,
        iteration,
        totalTokensUsed,
        remainingBudget,
        loadedContent
      );
      apiCallCount++;

      logger.log('info', `   AI reasoning: ${decision.reasoning}`);
      logger.log('info', `   AI requested ${decision.filesToLoad.length} files`);

      // Load requested files
      const iterationFiles: IFileInfo[] = [];
      let iterationTokens = 0;

      if (decision.filesToLoad.length > 0) {
        logger.log('info', '📥 Loading requested files...');

        for (const filePath of decision.filesToLoad) {
          try {
            const absolutePath = this.resolveProjectPath(filePath);
            if (loadedPaths.has(absolutePath)) {
              logger.log('info', `   ↷ ${filePath} - already loaded, skipping`);
              continue;
            }

            const fileInfo = await this.loadFile(absolutePath);
            // Budget accounting uses the token count of the raw file contents.
            const fileTokens = fileInfo.tokenCount ?? this.countTokens(fileInfo.contents);
            if (totalTokensUsed + fileTokens <= this.tokenBudget) {
              loadedContent += this.formatFileForContext(fileInfo);
              loadedPaths.add(absolutePath);
              includedFiles.push(fileInfo);
              iterationFiles.push(fileInfo);
              iterationTokens += fileTokens;
              totalTokensUsed += fileTokens;

              logger.log('info', `   ✓ ${fileInfo.relativePath} (${fileTokens} tokens)`);
            } else {
              logger.log('warn', `   ✗ ${fileInfo.relativePath} - would exceed budget, skipping`);
            }
          } catch (error) {
            // A single unreadable file must not abort the whole build.
            const reason = error instanceof Error ? error.message : String(error);
            logger.log('warn', `   ✗ Failed to load ${filePath}: ${reason}`);
          }
        }
      }

      // Record iteration state
      const iterationDuration = Date.now() - iterationStart;
      iterations.push({
        iteration,
        filesLoaded: iterationFiles,
        tokensUsed: iterationTokens,
        totalTokensUsed,
        decision,
        duration: iterationDuration,
      });

      logger.log('info', `   Iteration ${iteration} complete: ${iterationFiles.length} files loaded, ${iterationTokens} tokens used`);

      // Check if we should continue
      if (totalTokensUsed >= this.tokenBudget * 0.95) {
        logger.log('warn', '⚠️  Approaching token budget limit, stopping iterations');
        break;
      }

      // Ask AI if context is sufficient (pointless after the final iteration)
      if (iteration < this.config.maxIterations) {
        logger.log('info', '🤔 Asking AI if context is sufficient...');
        const sufficiencyDecision = await this.evaluateContextSufficiency(
          loadedContent,
          taskType,
          iteration,
          totalTokensUsed,
          this.tokenBudget - totalTokensUsed
        );
        apiCallCount++;

        logger.log('info', `   AI decision: ${sufficiencyDecision.sufficient ? '✅ SUFFICIENT' : '⏭️  NEEDS MORE'}`);
        logger.log('info', `   Reasoning: ${sufficiencyDecision.reasoning}`);

        if (sufficiencyDecision.sufficient) {
          logger.log('ok', '✅ Context building complete - AI determined context is sufficient');
          break;
        }
      }
    }

    const totalDuration = Date.now() - startTime;
    logger.log('ok', `\n✅ Iterative context building complete!`);
    logger.log('info', `   Files included: ${includedFiles.length}`);
    logger.log('info', `   Token usage: ${totalTokensUsed}/${this.tokenBudget} (${Math.round((totalTokensUsed / this.tokenBudget) * 100)}%)`);
    logger.log('info', `   Iterations: ${iterations.length}, API calls: ${apiCallCount}`);
    logger.log('info', `   Total duration: ${(totalDuration / 1000).toFixed(2)}s`);

    return {
      context: loadedContent,
      tokenCount: totalTokensUsed,
      includedFiles,
      trimmedFiles: [],
      excludedFiles: [],
      tokenSavings: 0,
      iterationCount: iterations.length,
      iterations,
      apiCallCount,
      totalDuration,
    };
  }

  /**
   * Wrap caller-supplied additional context (e.g. a git diff) in its context
   * section, truncating oversized input first.
   *
   * @param additionalContext - Raw text to embed
   * @returns The formatted section and its token count
   * @throws Error when the section still exceeds the hard token limit
   */
  private buildDiffSection(additionalContext: string): { section: string; tokens: number } {
    // CRITICAL SAFETY: Check raw string size BEFORE tokenization to prevent OOM
    const MAX_DIFF_CHARS = 500000; // ~125k tokens max (conservative 4 chars/token ratio)
    const MAX_DIFF_TOKENS = 150000; // Hard token limit for safety

    let diffText = additionalContext;

    // First check: raw character count
    if (diffText.length > MAX_DIFF_CHARS) {
      const originalSize = diffText.length;
      logger.log('warn', `⚠️  Git diff too large (${originalSize.toLocaleString()} chars > ${MAX_DIFF_CHARS.toLocaleString()} limit)`);
      logger.log('warn', `   This likely includes build artifacts (dist/, *.js.map, bundles, etc.)`);
      logger.log('warn', `   Truncating to first ${MAX_DIFF_CHARS.toLocaleString()} characters.`);
      logger.log('warn', `   Consider: git stash build files, improve .gitignore, or review uncommitted changes.`);

      diffText = diffText.substring(0, MAX_DIFF_CHARS) +
        '\n\n[... DIFF TRUNCATED - exceeded size limit of ' + MAX_DIFF_CHARS.toLocaleString() + ' chars ...]';
    }

    const section = `
====== GIT DIFF ======

${diffText}

====== END OF GIT DIFF ======
`;

    // Second check: actual token count after truncation
    const tokens = this.countTokens(section);

    if (tokens > MAX_DIFF_TOKENS) {
      logger.log('error', `❌ Git diff still too large after truncation (${tokens.toLocaleString()} tokens > ${MAX_DIFF_TOKENS.toLocaleString()} limit)`);
      throw new Error(
        `Git diff size (${tokens.toLocaleString()} tokens) exceeds maximum (${MAX_DIFF_TOKENS.toLocaleString()} tokens). ` +
        `This indicates massive uncommitted changes, likely build artifacts. ` +
        `Please commit or stash dist/, build/, or other generated files.`
      );
    }

    return { section, tokens };
  }

  /**
   * Scan project files based on task type. Always includes a fixed set of
   * project config/readme files in addition to the TypeScript globs.
   */
  private async scanProjectFiles(taskType: TaskType): Promise<IFileMetadata[]> {
    const configManager = ConfigManager.getInstance();
    const taskConfig = configManager.getTaskConfig(taskType);

    const includeGlobs = taskConfig?.includePaths?.map(p => `${p}/**/*.ts`) ?? [
      'ts/**/*.ts',
      'ts*/**/*.ts'
    ];

    const configGlobs = [
      'package.json',
      'readme.md',
      'readme.hints.md',
      'npmextra.json'
    ];

    return await this.lazyLoader.scanFiles([...configGlobs, ...includeGlobs]);
  }

  /**
   * Get AI decision on which files to load.
   *
   * The model's answer is validated: `files_to_load` is reduced to a list of
   * distinct, non-empty strings so that malformed replies cannot inject
   * non-path values into the loading loop.
   *
   * @throws SyntaxError when the reply contains no parseable JSON
   */
  private async getFileSelectionDecision(
    allMetadata: IFileMetadata[],
    analyzedFiles: any[],
    taskType: TaskType,
    iteration: number,
    tokensUsed: number,
    remainingBudget: number,
    loadedContent: string
  ): Promise<IFileSelectionDecision> {
    const isFirstIteration = iteration === 1;
    const fileLimit = isFirstIteration
      ? this.config.firstPassFileLimit
      : this.config.subsequentPassFileLimit;

    const userPrompt = this.buildFileSelectionPrompt(
      allMetadata,
      analyzedFiles,
      taskType,
      iteration,
      tokensUsed,
      remainingBudget,
      loadedContent,
      fileLimit
    );

    const parsed = await this.chatForJson(
      `You are an AI assistant that helps select the most relevant files for code analysis.
You must respond ONLY with valid JSON that can be parsed with JSON.parse().
Do not wrap the JSON in markdown code blocks or add any other text.`,
      userPrompt
    );

    const requested: unknown[] = Array.isArray(parsed.files_to_load) ? parsed.files_to_load : [];
    const filesToLoad = [
      ...new Set(
        requested
          .filter((entry: unknown): entry is string => typeof entry === 'string')
          .map(entry => entry.trim())
          .filter(entry => entry.length > 0)
      ),
    ];

    return {
      reasoning:
        typeof parsed.reasoning === 'string' && parsed.reasoning.length > 0
          ? parsed.reasoning
          : 'No reasoning provided',
      filesToLoad,
      estimatedTokensNeeded:
        typeof parsed.estimated_tokens_needed === 'number' ? parsed.estimated_tokens_needed : undefined,
    };
  }

  /**
   * Build prompt for file selection. Lists files that are already part of
   * `loadedContent` separately from those still available.
   */
  private buildFileSelectionPrompt(
    metadata: IFileMetadata[],
    analyzedFiles: any[],
    taskType: TaskType,
    iteration: number,
    tokensUsed: number,
    remainingBudget: number,
    loadedContent: string,
    fileLimit: number
  ): string {
    const taskDescriptions: Record<string, string> = {
      readme: 'generating a comprehensive README that explains the project\'s purpose, features, and API',
      commit: 'analyzing code changes to generate an intelligent commit message',
      description: 'generating a concise project description for package.json',
    };
    const taskDescription = taskDescriptions[taskType] ?? `performing a ${taskType} task`;

    const alreadyLoadedFiles = this.extractLoadedFilePaths(loadedContent);
    const alreadyLoaded = new Set(alreadyLoadedFiles);

    // First analysis entry per path wins, matching a linear "find".
    const importanceByPath = new Map<string, number>();
    for (const analyzed of analyzedFiles) {
      if (!importanceByPath.has(analyzed.path) && typeof analyzed.importanceScore === 'number') {
        importanceByPath.set(analyzed.path, analyzed.importanceScore);
      }
    }

    const availableFiles = metadata
      .filter(m => !alreadyLoaded.has(m.relativePath))
      .map(m => {
        const importance = importanceByPath.get(m.path);
        return `- ${m.relativePath} (${m.size} bytes, ~${m.estimatedTokens} tokens${importance !== undefined ? `, importance: ${importance.toFixed(2)}` : ''})`;
      })
      .join('\n');

    const totalBudget = tokensUsed + remainingBudget;
    // Guard against a zero budget, which would otherwise render "NaN%".
    const usedPercent = totalBudget > 0 ? Math.round((tokensUsed / totalBudget) * 100) : 0;

    return `You are building context for ${taskDescription} in a TypeScript project.

ITERATION: ${iteration}
TOKENS USED: ${tokensUsed}/${totalBudget} (${usedPercent}%)
REMAINING BUDGET: ${remainingBudget} tokens

${alreadyLoadedFiles.length > 0 ? `FILES ALREADY LOADED:\n${alreadyLoadedFiles.map(f => `- ${f}`).join('\n')}\n\n` : ''}AVAILABLE FILES (not yet loaded):
${availableFiles}

Your task: Select up to ${fileLimit} files that will give you the MOST understanding for this ${taskType} task.

${iteration === 1 ? `This is the FIRST iteration. Focus on:
- Main entry points (index.ts, main exports)
- Core classes and interfaces
- Package configuration
` : `This is iteration ${iteration}. You've already seen some files. Now focus on:
- Files that complement what you've already loaded
- Dependencies of already-loaded files
- Missing pieces for complete understanding
`}

Consider:
1. File importance scores (if provided)
2. File paths (ts/index.ts is likely more important than ts/internal/utils.ts)
3. Token efficiency (prefer smaller files if they provide good information)
4. Remaining budget (${remainingBudget} tokens)

Respond in JSON format:
{
  "reasoning": "Brief explanation of why you're selecting these files",
  "files_to_load": ["path/to/file1.ts", "path/to/file2.ts"],
  "estimated_tokens_needed": 15000
}`;
  }

  /**
   * Evaluate if current context is sufficient.
   *
   * Only the first 3000 characters of the context are shown to the model.
   * The context counts as sufficient only when the reply carries the JSON
   * boolean `true`; anything else (including the string "false") means
   * "needs more".
   *
   * @throws SyntaxError when the reply contains no parseable JSON
   */
  private async evaluateContextSufficiency(
    loadedContent: string,
    taskType: TaskType,
    iteration: number,
    tokensUsed: number,
    remainingBudget: number
  ): Promise<IContextSufficiencyDecision> {
    const PREVIEW_CHARS = 3000;
    const contextPreview = loadedContent.length > PREVIEW_CHARS
      ? `${loadedContent.substring(0, PREVIEW_CHARS)}... (truncated for brevity)`
      : loadedContent;
    // Count file START markers only; END and git-diff markers are not files.
    const loadedFileCount = this.extractLoadedFilePaths(loadedContent).length;

    const prompt = `You have been building context for a ${taskType} task across ${iteration} iterations.

CURRENT STATE:
- Tokens used: ${tokensUsed}
- Remaining budget: ${remainingBudget}
- Files loaded: ${loadedFileCount}

CONTEXT SO FAR:
${contextPreview}

Question: Do you have SUFFICIENT context to successfully complete the ${taskType} task?

Consider:
- For README: Do you understand the project's purpose, main features, API surface, and usage patterns?
- For commit: Do you understand what changed and why?
- For description: Do you understand the project's core value proposition?

Respond in JSON format:
{
  "sufficient": true or false,
  "reasoning": "Detailed explanation of your decision"
}`;

    const parsed = await this.chatForJson(
      `You are an AI assistant that evaluates whether gathered context is sufficient for a task.
You must respond ONLY with valid JSON that can be parsed with JSON.parse().
Do not wrap the JSON in markdown code blocks or add any other text.`,
      prompt
    );

    return {
      sufficient: parsed.sufficient === true,
      reasoning:
        typeof parsed.reasoning === 'string' && parsed.reasoning.length > 0
          ? parsed.reasoning
          : 'No reasoning provided',
    };
  }

  /**
   * Send a single-turn chat request and parse the reply as a JSON object.
   *
   * @throws Error when the provider has not been set up via initialize()
   * @throws SyntaxError when the reply contains no parseable JSON
   */
  private async chatForJson(systemMessage: string, userMessage: string): Promise<Record<string, unknown>> {
    if (!this.openaiInstance) {
      throw new Error('IterativeContextBuilder is not initialized - call initialize() before building context');
    }
    const response = await this.openaiInstance.chat({
      systemMessage,
      userMessage,
      messageHistory: [],
    });
    return this.parseJsonResponse(String(response.message ?? ''));
  }

  /**
   * Parse a model reply as a JSON object, tolerating markdown code fences
   * (with or without a language tag) and prose surrounding the JSON.
   *
   * @returns The parsed object, or an empty object when the reply is valid
   *   JSON but not an object (array, primitive, null)
   * @throws SyntaxError (the original JSON.parse error) when nothing parses
   */
  private parseJsonResponse(rawMessage: string): Record<string, unknown> {
    let text = rawMessage.trim();

    // Strip one surrounding ``` fence pair, e.g. "```json\n{...}\n```".
    const fenced = /^```[\w-]*\s*([\s\S]*?)\s*```$/.exec(text);
    if (fenced) {
      text = fenced[1] ?? '';
    }

    let parsed: unknown;
    try {
      parsed = JSON.parse(text);
    } catch (parseError) {
      // Fallback: parse the outermost {...} span if the model added prose.
      const start = text.indexOf('{');
      const end = text.lastIndexOf('}');
      if (start === -1 || end <= start) {
        throw parseError;
      }
      try {
        parsed = JSON.parse(text.slice(start, end + 1));
      } catch {
        throw parseError;
      }
    }

    return typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)
      ? (parsed as Record<string, unknown>)
      : {};
  }

  /**
   * List the relative paths of files embedded in a context string, in order,
   * by reading the START markers written by formatFileForContext().
   */
  private extractLoadedFilePaths(loadedContent: string): string[] {
    if (!loadedContent) {
      return [];
    }
    const paths: string[] = [];
    for (const marker of loadedContent.matchAll(/^====== START OF FILE (.+?) ======$/gm)) {
      if (marker[1]) {
        paths.push(marker[1]);
      }
    }
    return paths;
  }

  /**
   * Resolve a path to an absolute one. Relative paths are interpreted
   * relative to the project root (the form in which files are listed to the
   * model), not the process working directory.
   */
  private resolveProjectPath(filePath: string): string {
    return plugins.path.isAbsolute(filePath)
      ? plugins.path.normalize(filePath)
      : plugins.path.resolve(this.projectRoot, filePath);
  }

  /**
   * Load a single file with caching
   *
   * @param filePath - Absolute path, or path relative to the project root
   * @throws Error when the path points outside the project root, or when the
   *   file cannot be read
   */
  private async loadFile(filePath: string): Promise<IFileInfo> {
    const absolutePath = this.resolveProjectPath(filePath);
    const relativePath = plugins.path.relative(this.projectRoot, absolutePath);

    // Paths come from model output: never read (and upload) files outside the project.
    if (
      relativePath === '..' ||
      relativePath.startsWith(`..${plugins.path.sep}`) ||
      plugins.path.isAbsolute(relativePath)
    ) {
      throw new Error(`refusing to load file outside project root: ${filePath}`);
    }

    // Try cache first
    const cached = await this.cache.get(absolutePath);
    if (cached) {
      return {
        path: absolutePath,
        relativePath,
        contents: cached.contents,
        tokenCount: cached.tokenCount,
      };
    }

    // Load from disk
    const contents = plugins.smartfile.fs.toStringSync(absolutePath);
    const tokenCount = this.countTokens(contents);

    // Cache it
    const stats = await fs.promises.stat(absolutePath);
    await this.cache.set({
      path: absolutePath,
      contents,
      tokenCount,
      mtime: Math.floor(stats.mtimeMs),
      cachedAt: Date.now(),
    });

    return {
      path: absolutePath,
      relativePath,
      contents,
      tokenCount,
    };
  }

  /**
   * Format a file for inclusion in context. The START marker written here is
   * what extractLoadedFilePaths() looks for.
   */
  private formatFileForContext(file: IFileInfo): string {
    return `
====== START OF FILE ${file.relativePath} ======

${file.contents}

====== END OF FILE ${file.relativePath} ======
`;
  }

  /**
   * Count tokens in text. Falls back to a 4-characters-per-token estimate
   * when the tokenizer throws.
   */
  private countTokens(text: string): number {
    try {
      return plugins.gptTokenizer.encode(text).length;
    } catch {
      return Math.ceil(text.length / 4);
    }
  }
}