// xinvoice/ts/formats/pdf/pdf.extractor.ts

import {
  BaseXMLExtractor,
  StandardXMLExtractor,
  AssociatedFilesExtractor,
  TextXMLExtractor
} from './extractors/index.js';

/**
 * Main PDF extractor class that orchestrates the extraction process.
 *
 * Holds an ordered chain of specialized extractors and asks each one in turn
 * for the XML embedded in a PDF. The first non-empty result wins. A failure
 * (empty result or thrown error) in one extractor never prevents the
 * remaining extractors from being tried.
 */
export class PDFExtractor {
  /** Extractors in the order they are attempted; populated once in the constructor. */
  private readonly extractors: BaseXMLExtractor[] = [];

  /**
   * Builds the extractor chain, ordered by preference/likelihood of success.
   */
  constructor() {
    this.extractors.push(
      new StandardXMLExtractor(),     // Standard PDF/A-3 embedded files
      new AssociatedFilesExtractor(), // Associated files (ZUGFeRD v1, some Factur-X)
      new TextXMLExtractor()          // Text-based extraction (fallback)
    );
  }

  /**
   * Extract XML from a PDF buffer.
   *
   * Tries every configured extractor in sequence and returns the first truthy
   * result. An extractor that throws is logged and treated as a failed
   * attempt, so later (fallback) extractors still get their chance.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content, or null if no extractor produced any XML
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    console.log('Starting XML extraction from PDF...');

    for (const extractor of this.extractors) {
      const extractorName = extractor.constructor.name;
      console.log(`Trying extraction with ${extractorName}...`);

      try {
        const xml = await extractor.extractXml(pdfBuffer);
        if (xml) {
          console.log(`Successfully extracted XML using ${extractorName}`);
          return xml;
        }

        console.log(`Extraction with ${extractorName} failed, trying next method...`);
      } catch (error: unknown) {
        // Isolate the failure to this extractor: aborting here would skip the
        // fallback methods that exist precisely for PDFs the earlier ones
        // cannot handle.
        console.error(`Error extracting XML from PDF with ${extractorName}, trying next method:`, error);
      }
    }

    // Every extractor either returned nothing or threw.
    console.warn('All extraction methods failed, no valid XML found in PDF');
    return null;
  }
}