import { BaseXMLExtractor, StandardXMLExtractor, AssociatedFilesExtractor, TextXMLExtractor } from './extractors/index.js';
import { FormatDetector } from '../utils/format.detector.js';
import { InvoiceFormat } from '../../interfaces/common.js';

/**
 * Error types for PDF extraction operations.
 * Each member's value is a human-readable description of the failure category.
 */
export enum PDFExtractError {
  EXTRACT_ERROR = 'XML extraction failed',
  INVALID_INPUT = 'Invalid input parameters',
  NO_XML_FOUND = 'No XML found in PDF'
}

/**
 * Result of a PDF extraction operation.
 */
export interface PDFExtractResult {
  /** True when one of the extractors returned XML. */
  success: boolean;
  /** The extracted XML; set only on success. */
  xml?: string;
  /** Format reported by FormatDetector for the extracted XML; set only on success. */
  format?: InvoiceFormat;
  /** Constructor name of the extractor that produced the XML; set only on success. */
  extractorUsed?: string;
  /** Failure details; set only when `success` is false. */
  error?: {
    type: PDFExtractError;
    message: string;
    originalError?: Error;
  };
}

/**
 * Main PDF extractor class that orchestrates the extraction process.
 * Uses multiple specialized extractors in sequence to maximize success rate.
 */
export class PDFExtractor {
  /** Extractors in the order they are tried; the first one returning XML wins. */
  private readonly extractors: BaseXMLExtractor[] = [];

  /**
   * Constructor initializes the chain of extractors.
   */
  constructor() {
    // Add extractors in order of preference/likelihood of success
    this.extractors.push(
      new StandardXMLExtractor(), // Standard PDF/A-3 embedded files
      new AssociatedFilesExtractor(), // Associated files (ZUGFeRD v1, some Factur-X)
      new TextXMLExtractor() // Text-based extraction (fallback)
    );
  }

  /**
   * Extract XML from a PDF buffer.
   * Tries each configured extractor in sequence and stops at the first one
   * that returns a non-empty XML string.
   *
   * Failures are reported through the returned result (`success: false` plus
   * `error`) rather than by throwing.
   *
   * @param pdfBuffer PDF buffer
   * @returns Result with either the extracted XML or error information
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<PDFExtractResult> {
    try {
      console.log('Starting XML extraction from PDF...');

      // Validate input
      if (!pdfBuffer || pdfBuffer.length === 0) {
        return this.createErrorResult(PDFExtractError.INVALID_INPUT, 'PDF buffer is empty or undefined');
      }

      // Ensure buffer is Uint8Array
      const pdfBufferArray = Buffer.isBuffer(pdfBuffer) ? new Uint8Array(pdfBuffer) : pdfBuffer;

      // Try each extractor in sequence
      for (const extractor of this.extractors) {
        const extractorName = extractor.constructor.name;
        console.log(`Trying extraction with ${extractorName}...`);

        try {
          const xml = await extractor.extractXml(pdfBufferArray);

          if (xml) {
            console.log(`Successfully extracted XML using ${extractorName}`);

            // Detect format of the extracted XML.
            // This runs inside the per-extractor try, so a throw here is
            // handled like an extractor failure and the next one is tried.
            const format = FormatDetector.detectFormat(xml);

            return {
              success: true,
              xml,
              format,
              extractorUsed: extractorName
            };
          }

          console.log(`Extraction with ${extractorName} failed, trying next method...`);
        } catch (error: unknown) {
          // Log error but continue with next extractor
          console.warn(`Error using ${extractorName}: ${PDFExtractor.describeError(error)}`);
        }
      }

      // If all extractors fail, return a no XML found error
      return this.createErrorResult(
        PDFExtractError.NO_XML_FOUND,
        'All extraction methods failed, no valid XML found in PDF'
      );
    } catch (error: unknown) {
      // Handle any unexpected errors
      return this.createErrorResult(
        PDFExtractError.EXTRACT_ERROR,
        `Unexpected error during XML extraction: ${PDFExtractor.describeError(error)}`,
        error instanceof Error ? error : undefined
      );
    }
  }

  /**
   * Turn an arbitrary thrown value into a printable message.
   * @param error Value caught by a `catch` clause
   * @returns `error.message` for Error instances, otherwise `String(error)`
   */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /**
   * Create a PDF extract result with error information.
   * Also writes the error (and the original error, if given) to `console.error`.
   * @param type Error type
   * @param message Error message
   * @param originalError Original error object
   * @returns Error result
   */
  private createErrorResult(
    type: PDFExtractError,
    message: string,
    originalError?: Error
  ): PDFExtractResult {
    console.error(`PDF Extractor Error (${type}): ${message}`);
    if (originalError) {
      console.error(originalError);
    }

    return {
      success: false,
      error: {
        type,
        message,
        originalError
      }
    };
  }
}