import { BaseXMLExtractor, StandardXMLExtractor, AssociatedFilesExtractor, TextXMLExtractor } from './extractors/index.js';

/**
 * Main PDF extractor class that orchestrates the extraction process.
 *
 * Holds an ordered chain of specialized extractors and tries them one after
 * another until one of them yields XML content.
 */
export class PDFExtractor {
  /** Extractors in the order they are attempted by {@link PDFExtractor.extractXml}. */
  private readonly extractors: BaseXMLExtractor[] = [];

  /**
   * Builds the extractor chain, ordered by preference/likelihood of success.
   */
  constructor() {
    this.extractors.push(
      new StandardXMLExtractor(), // Standard PDF/A-3 embedded files
      new AssociatedFilesExtractor(), // Associated files (ZUGFeRD v1, some Factur-X)
      new TextXMLExtractor() // Text-based extraction (fallback)
    );
  }

  /**
   * Extract XML from a PDF buffer.
   *
   * Each extractor in the chain is tried in order; the first truthy result is
   * returned. An extractor that throws is logged and treated as a failed
   * attempt so that the remaining extractors still get a chance to run.
   * This method does not propagate extractor errors to the caller.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content, or `null` if no extractor produced any
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    console.log('Starting XML extraction from PDF...');

    for (const extractor of this.extractors) {
      const extractorName = extractor.constructor.name;
      console.log(`Trying extraction with ${extractorName}...`);

      try {
        const xml = await extractor.extractXml(pdfBuffer);
        // Truthiness check on purpose: an empty string counts as "nothing found".
        if (xml) {
          console.log(`Successfully extracted XML using ${extractorName}`);
          return xml;
        }
      } catch (error: unknown) {
        // Isolate the failure to this extractor; a throw here must not
        // prevent the fallback extractors from being tried.
        console.error('Error extracting XML from PDF:', error);
      }

      console.log(`Extraction with ${extractorName} failed, trying next method...`);
    }

    // Every extractor either returned nothing or threw.
    console.warn('All extraction methods failed, no valid XML found in PDF');
    return null;
  }
}