ts/formats/pdf/pdf.extractor.ts

import {
  BaseXMLExtractor,
  StandardXMLExtractor,
  AssociatedFilesExtractor,
  TextXMLExtractor
} from './extractors/index.js';
import { FormatDetector } from '../utils/format.detector.js';
import { InvoiceFormat } from '../../interfaces/common.js';

/**
 * Error categories reported by PDFExtractor in the `error.type` field of a
 * failed PDFExtractResult. Each member's string value is a short
 * human-readable description of the category.
 */
export enum PDFExtractError {
  /** An unexpected exception escaped the extraction loop itself. */
  EXTRACT_ERROR = 'XML extraction failed',
  /** The supplied PDF buffer was missing or had zero length. */
  INVALID_INPUT = 'Invalid input parameters',
  /** Every extractor in the chain was tried and none produced XML. */
  NO_XML_FOUND = 'No XML found in PDF'
}

/**
 * Outcome of a PDF extraction attempt.
 *
 * On success `success` is true and `xml`, `format` and `extractorUsed` are
 * populated; on failure `success` is false and only `error` is populated.
 */
export interface PDFExtractResult {
  /** True when one of the extractors produced XML. */
  success: boolean;
  /** The extracted XML document (set on success only). */
  xml?: string;
  /** Invoice format reported by FormatDetector.detectFormat for `xml`. */
  format?: InvoiceFormat;
  /** Constructor name of the extractor that produced `xml`. */
  extractorUsed?: string;
  /** Failure details (set on failure only). */
  error?: {
    /** Category of the failure. */
    type: PDFExtractError;
    /** Human-readable description of what went wrong. */
    message: string;
    /** Underlying Error object, when one was captured. */
    originalError?: Error;
  };
}

/**
 * Main PDF extractor class that orchestrates the extraction process.
 *
 * Holds an ordered chain of specialized extractors and tries them one after
 * another; the first one that yields non-empty XML wins. Failures never
 * throw to the caller: they are reported through the returned
 * PDFExtractResult.
 */
export class PDFExtractor {
  /** Extraction strategies, in the order they are attempted. */
  private readonly extractors: BaseXMLExtractor[] = [];

  /**
   * Constructor initializes the chain of extractors
   */
  constructor() {
    // Add extractors in order of preference/likelihood of success
    this.extractors.push(
      new StandardXMLExtractor(),      // Standard PDF/A-3 embedded files
      new AssociatedFilesExtractor(),  // Associated files (ZUGFeRD v1, some Factur-X)
      new TextXMLExtractor()           // Text-based extraction (fallback)
    );
  }

  /**
   * Extract XML from a PDF buffer.
   *
   * Tries each extractor in sequence and returns as soon as one produces a
   * truthy XML string. An extractor that throws is logged and skipped; the
   * most recent such exception is kept so that, if no extractor succeeds,
   * the NO_XML_FOUND result can expose it as `error.originalError`.
   *
   * @param pdfBuffer PDF contents as a Uint8Array or Node.js Buffer
   * @returns Result with either the extracted XML (plus detected format and
   *          the name of the extractor used) or error information
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<PDFExtractResult> {
    try {
      console.log('Starting XML extraction from PDF...');

      // Validate input
      if (!pdfBuffer || pdfBuffer.length === 0) {
        return this.createErrorResult(PDFExtractError.INVALID_INPUT, 'PDF buffer is empty or undefined');
      }

      // Ensure buffer is Uint8Array (a Buffer is copied into a plain Uint8Array)
      const pdfBufferArray = Buffer.isBuffer(pdfBuffer) ? new Uint8Array(pdfBuffer) : pdfBuffer;

      // Most recent exception raised while trying an extractor; surfaced to
      // the caller if the whole chain ends up failing.
      let lastExtractorError: Error | undefined;

      // Try each extractor in sequence
      for (const extractor of this.extractors) {
        const extractorName = extractor.constructor.name;
        console.log(`Trying extraction with ${extractorName}...`);

        try {
          const xml = await extractor.extractXml(pdfBufferArray);

          if (xml) {
            console.log(`Successfully extracted XML using ${extractorName}`);

            // Detect format of the extracted XML. This runs inside the same
            // try block, so an exception from the detector is handled like
            // an extractor failure and the next extractor is tried.
            const format = FormatDetector.detectFormat(xml);

            return {
              success: true,
              xml,
              format,
              extractorUsed: extractorName
            };
          }

          console.log(`Extraction with ${extractorName} failed, trying next method...`);
        } catch (error: unknown) {
          // Remember the failure (normalizing non-Error throwables), log it,
          // and continue with the next extractor
          lastExtractorError = error instanceof Error ? error : new Error(String(error));
          console.warn(`Error using ${extractorName}: ${lastExtractorError.message}`);
        }
      }

      // If all extractors fail, return a no XML found error that carries the
      // last extractor exception (if any) as its cause
      return this.createErrorResult(
        PDFExtractError.NO_XML_FOUND,
        'All extraction methods failed, no valid XML found in PDF',
        lastExtractorError
      );
    } catch (error: unknown) {
      // Handle any unexpected errors raised outside the per-extractor try
      return this.createErrorResult(
        PDFExtractError.EXTRACT_ERROR,
        `Unexpected error during XML extraction: ${error instanceof Error ? error.message : String(error)}`,
        error instanceof Error ? error : undefined
      );
    }
  }

  /**
   * Create a PDF extract result with error information.
   *
   * Also writes the error (and the original error object, when given) to
   * `console.error`.
   *
   * @param type Error type
   * @param message Error message
   * @param originalError Original error object
   * @returns Error result with `success` set to false
   */
  private createErrorResult(
    type: PDFExtractError,
    message: string,
    originalError?: Error
  ): PDFExtractResult {
    console.error(`PDF Extractor Error (${type}): ${message}`);
    if (originalError) {
      console.error(originalError);
    }

    return {
      success: false,
      error: {
        type,
        message,
        originalError
      }
    };
  }
}
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`import {`
			`BaseXMLExtractor,`
			`StandardXMLExtractor,`
			`AssociatedFilesExtractor,`
			`TextXMLExtractor`
			`} from './extractors/index.js';`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`import { FormatDetector } from '../utils/format.detector.js';`
			`import { InvoiceFormat } from '../../interfaces/common.js';`

			`/**`
			`* Error types for PDF extraction operations`
			`*/`
			`export enum PDFExtractError {`
			`EXTRACT_ERROR = 'XML extraction failed',`
			`INVALID_INPUT = 'Invalid input parameters',`
			`NO_XML_FOUND = 'No XML found in PDF'`
			`}`

			`/**`
			`* Result of a PDF extraction operation`
			`*/`
			`export interface PDFExtractResult {`
			`success: boolean;`
			`xml?: string;`
			`format?: InvoiceFormat;`
			`extractorUsed?: string;`
			`error?: {`
			`type: PDFExtractError;`
			`message: string;`
			`originalError?: Error;`
			`};`
			`}`
update 2025-04-03 15:53:08 +00:00
			`/**`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`* Main PDF extractor class that orchestrates the extraction process`
			`* Uses multiple specialized extractors in sequence to maximize success rate`
update 2025-04-03 15:53:08 +00:00			`*/`
			`export class PDFExtractor {`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`private extractors: BaseXMLExtractor[] = [];`
update 2025-04-03 17:21:36 +00:00
			`/**`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`* Constructor initializes the chain of extractors`
update 2025-04-03 17:21:36 +00:00			`*/`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`constructor() {`
			`// Add extractors in order of preference/likelihood of success`
			`this.extractors.push(`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`new StandardXMLExtractor(), // Standard PDF/A-3 embedded files`
			`new AssociatedFilesExtractor(), // Associated files (ZUGFeRD v1, some Factur-X)`
			`new TextXMLExtractor() // Text-based extraction (fallback)`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`);`
update 2025-04-03 17:21:36 +00:00			`}`

			`/**`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`* Extract XML from a PDF buffer`
			`* Tries multiple extraction methods in sequence`
update 2025-04-03 17:21:36 +00:00			`* @param pdfBuffer PDF buffer`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`* @returns Result with either the extracted XML or error information`
update 2025-04-03 17:21:36 +00:00			`*/`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`public async extractXml(pdfBuffer: Uint8Array \| Buffer): Promise<PDFExtractResult> {`
update 2025-04-03 17:21:36 +00:00			`try {`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`console.log('Starting XML extraction from PDF...');`
update 2025-04-03 17:21:36 +00:00
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`// Validate input`
			`if (!pdfBuffer \|\| pdfBuffer.length === 0) {`
			`return this.createErrorResult(PDFExtractError.INVALID_INPUT, 'PDF buffer is empty or undefined');`
			`}`

			`// Ensure buffer is Uint8Array`
			`const pdfBufferArray = Buffer.isBuffer(pdfBuffer) ? new Uint8Array(pdfBuffer) : pdfBuffer;`

feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`// Try each extractor in sequence`
			`for (const extractor of this.extractors) {`
			`const extractorName = extractor.constructor.name;`
			console.log(`Trying extraction with ${extractorName}...`);
update 2025-04-03 17:21:36 +00:00
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`try {`
			`const xml = await extractor.extractXml(pdfBufferArray);`

			`if (xml) {`
			console.log(`Successfully extracted XML using ${extractorName}`);

			`// Detect format of the extracted XML`
			`const format = FormatDetector.detectFormat(xml);`

			`return {`
			`success: true,`
			`xml,`
			`format,`
			`extractorUsed: extractorName`
			`};`
			`}`

			console.log(`Extraction with ${extractorName} failed, trying next method...`);
			`} catch (error) {`
			`// Log error but continue with next extractor`
			console.warn(`Error using ${extractorName}: ${error instanceof Error ? error.message : String(error)}`);
update 2025-04-03 17:21:36 +00:00			`}`
			`}`

fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`// If all extractors fail, return a no XML found error`
			`return this.createErrorResult(`
			`PDFExtractError.NO_XML_FOUND,`
			`'All extraction methods failed, no valid XML found in PDF'`
			`);`
update 2025-04-03 17:21:36 +00:00			`} catch (error) {`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`// Handle any unexpected errors`
			`return this.createErrorResult(`
			`PDFExtractError.EXTRACT_ERROR,`
			`Unexpected error during XML extraction: ${error instanceof Error ? error.message : String(error)}`,
			`error instanceof Error ? error : undefined`
			`);`
update 2025-04-03 15:53:08 +00:00			`}`
			`}`
update 2025-04-03 17:21:36 +00:00
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`/**`
			`* Create a PDF extract result with error information`
			`* @param type Error type`
			`* @param message Error message`
			`* @param originalError Original error object`
			`* @returns Error result`
			`*/`
			`private createErrorResult(`
			`type: PDFExtractError,`
			`message: string,`
			`originalError?: Error`
			`): PDFExtractResult {`
			console.error(`PDF Extractor Error (${type}): ${message}`);
			`if (originalError) {`
			`console.error(originalError);`
			`}`

			`return {`
			`success: false,`
			`error: {`
			`type,`
			`message,`
			`originalError`
			`}`
			`};`
			`}`
			`}`