xinvoice/ts/formats/pdf/pdf.extractor.ts

import {
  BaseXMLExtractor,
  StandardXMLExtractor,
  AssociatedFilesExtractor,
  TextXMLExtractor
} from './extractors/index.js';

/**
 * Main PDF extractor class that orchestrates the extraction process.
 *
 * Holds an ordered chain of specialized extractors and asks each one in turn
 * for the XML embedded in a PDF; the first truthy result wins.
 */
export class PDFExtractor {
  /** Extractors in the order they are tried; populated once by the constructor. */
  private readonly extractors: BaseXMLExtractor[];

  /**
   * Builds the extractor chain in order of preference/likelihood of success.
   */
  constructor() {
    this.extractors = [
      new StandardXMLExtractor(), // Standard PDF/A-3 embedded files
      new AssociatedFilesExtractor(), // Associated files (ZUGFeRD v1, some Factur-X)
      new TextXMLExtractor(), // Text-based extraction (fallback)
    ];
  }

  /**
   * Extract XML from a PDF buffer.
   *
   * Tries every extractor of the chain in sequence and returns the first
   * truthy result. An extractor that throws is treated like one that found
   * nothing: the error is logged and the next extractor is tried, so a single
   * failing strategy cannot prevent the fallbacks from running.
   *
   * This method does not reject because of an extractor error.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content, or null if no extractor produced any
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    console.log('Starting XML extraction from PDF...');

    for (const extractor of this.extractors) {
      const extractorName = extractor.constructor.name;
      console.log(`Trying extraction with ${extractorName}...`);

      try {
        const xml = await extractor.extractXml(pdfBuffer);
        if (xml) {
          console.log(`Successfully extracted XML using ${extractorName}`);
          return xml;
        }
      } catch (error: unknown) {
        // Isolate the failure to this extractor; the remaining ones still get
        // their chance below.
        console.error('Error extracting XML from PDF:', error);
      }

      console.log(`Extraction with ${extractorName} failed, trying next method...`);
    }

    // Every extractor either returned a falsy value or threw.
    console.warn('All extraction methods failed, no valid XML found in PDF');
    return null;
  }
}
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`import {`
			`BaseXMLExtractor,`
			`StandardXMLExtractor,`
			`AssociatedFilesExtractor,`
			`TextXMLExtractor`
			`} from './extractors/index.js';`
update 2025-04-03 15:53:08 +00:00
			`/**`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`* Main PDF extractor class that orchestrates the extraction process`
			`* Uses multiple specialized extractors in sequence to maximize success rate`
update 2025-04-03 15:53:08 +00:00			`*/`
			`export class PDFExtractor {`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`private extractors: BaseXMLExtractor[] = [];`
update 2025-04-03 17:21:36 +00:00
			`/**`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`* Constructor initializes the chain of extractors`
update 2025-04-03 17:21:36 +00:00			`*/`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`constructor() {`
			`// Add extractors in order of preference/likelihood of success`
			`this.extractors.push(`
			`new StandardXMLExtractor(), // Standard PDF/A-3 embedded files`
			`new AssociatedFilesExtractor(), // Associated files (ZUGFeRD v1, some Factur-X)`
			`new TextXMLExtractor() // Text-based extraction (fallback)`
			`);`
update 2025-04-03 17:21:36 +00:00			`}`

			`/**`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`* Extract XML from a PDF buffer`
			`* Tries multiple extraction methods in sequence`
update 2025-04-03 17:21:36 +00:00			`* @param pdfBuffer PDF buffer`
			`* @returns XML content or null if not found`
			`*/`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`public async extractXml(pdfBuffer: Uint8Array \| Buffer): Promise<string \| null> {`
update 2025-04-03 17:21:36 +00:00			`try {`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`console.log('Starting XML extraction from PDF...');`
update 2025-04-03 17:21:36 +00:00
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`// Try each extractor in sequence`
			`for (const extractor of this.extractors) {`
			`const extractorName = extractor.constructor.name;`
			console.log(`Trying extraction with ${extractorName}...`);
update 2025-04-03 17:21:36 +00:00
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`const xml = await extractor.extractXml(pdfBuffer);`
			`if (xml) {`
			console.log(`Successfully extracted XML using ${extractorName}`);
			`return xml;`
update 2025-04-03 17:21:36 +00:00			`}`

feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			console.log(`Extraction with ${extractorName} failed, trying next method...`);
update 2025-04-03 17:21:36 +00:00			`}`

feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`// If all extractors fail, return null`
			`console.warn('All extraction methods failed, no valid XML found in PDF');`
			`return null;`
update 2025-04-03 17:21:36 +00:00			`} catch (error) {`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`console.error('Error extracting XML from PDF:', error);`
working 2025-04-03 16:41:10 +00:00			`return null;`
update 2025-04-03 15:53:08 +00:00			`}`
			`}`
update 2025-04-03 17:21:36 +00:00



update 2025-04-03 15:53:08 +00:00			`}`