import {
  PDFDocument,
  PDFDict,
  PDFName,
  PDFRawStream,
  PDFArray,
  PDFString,
  PDFHexString,
} from 'pdf-lib';
import { BaseXMLExtractor } from './base.extractor.js';

/**
 * Collects every `Names` array reachable from a name-tree node.
 *
 * A name-tree root may carry its key/value pairs directly in `Names`, or
 * delegate to child nodes through `Kids`; both are followed here.
 *
 * @param node Name-tree node to start from
 * @param visited Nodes already walked, used to stop on cyclic `Kids` references
 * @returns All `Names` arrays found, in traversal order
 */
function collectNameTreeArrays(
  node: PDFDict,
  visited: Set<PDFDict> = new Set<PDFDict>(),
): PDFArray[] {
  if (visited.has(node)) {
    return [];
  }
  visited.add(node);

  const found: PDFArray[] = [];

  const names = node.lookup(PDFName.of('Names'));
  if (names instanceof PDFArray) {
    found.push(names);
  }

  const kids = node.lookup(PDFName.of('Kids'));
  if (kids instanceof PDFArray) {
    for (let i = 0; i < kids.size(); i++) {
      const kid = kids.lookup(i);
      if (kid instanceof PDFDict) {
        found.push(...collectNameTreeArrays(kid, visited));
      }
    }
  }

  return found;
}

/**
 * Standard PDF XML extractor that extracts XML from embedded files
 * Works with PDF/A-3 documents that follow the standard for embedding files
 */
export class StandardXMLExtractor extends BaseXMLExtractor {
  /**
   * Extract XML from a PDF buffer using standard PDF/A-3 embedded files.
   *
   * Walks the catalog's `Names` -> `EmbeddedFiles` name tree, picks entries
   * whose name matches a known invoice file name or looks like an XML/invoice
   * attachment, and hands the `EF`/`F` stream of the first usable one to
   * `extractXmlFromStream`. Any error is logged and reported as `null`
   * rather than thrown.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBuffer);

      // Get the catalog's Names dictionary (holds the document's name trees)
      const namesDictObj = pdfDoc.catalog.lookup(PDFName.of('Names'));
      if (!(namesDictObj instanceof PDFDict)) {
        console.warn('No Names dictionary found in PDF! This PDF does not contain embedded files.');
        return null;
      }

      // Get the root of the embedded files name tree
      const embeddedFilesDictObj = namesDictObj.lookup(PDFName.of('EmbeddedFiles'));
      if (!(embeddedFilesDictObj instanceof PDFDict)) {
        console.warn('No EmbeddedFiles dictionary found! This PDF does not contain embedded files.');
        return null;
      }

      // Gather the name arrays from the root and from any Kids nodes
      const nameArrays = collectNameTreeArrays(embeddedFilesDictObj);
      if (nameArrays.length === 0) {
        console.warn('No files specified in EmbeddedFiles dictionary!');
        return null;
      }

      // Try to find an XML file in the embedded files
      for (const filesSpecObj of nameArrays) {
        // Entries are laid out as [name, fileSpec, name, fileSpec, ...];
        // the bound skips a dangling name in an odd-length array.
        for (let i = 0; i + 1 < filesSpecObj.size(); i += 2) {
          const fileNameObj = filesSpecObj.lookup(i);
          const fileSpecObj = filesSpecObj.lookup(i + 1);

          // Names may be stored as literal strings or as hex strings
          const isNameString =
            fileNameObj instanceof PDFString || fileNameObj instanceof PDFHexString;
          if (!isNameString || !(fileSpecObj instanceof PDFDict)) {
            continue;
          }

          // Get the filename as string, lower-cased once for all comparisons
          const fileName = fileNameObj.decodeText();
          const lowerName = fileName.toLowerCase();

          // Check if it's a known invoice XML file name
          const isKnownFileName = this.knownFileNames.some(
            knownName => lowerName === knownName.toLowerCase()
          );

          // Check if it's any XML file or has invoice-related keywords
          const isXmlFile =
            lowerName.endsWith('.xml') ||
            lowerName.includes('zugferd') ||
            lowerName.includes('factur-x') ||
            lowerName.includes('xrechnung') ||
            lowerName.includes('invoice');

          if (!isKnownFileName && !isXmlFile) {
            continue;
          }

          const efDictObj = fileSpecObj.lookup(PDFName.of('EF'));
          if (!(efDictObj instanceof PDFDict)) {
            continue;
          }

          const fileStream = efDictObj.lookup(PDFName.of('F'));
          if (fileStream instanceof PDFRawStream) {
            const xmlContent = await this.extractXmlFromStream(fileStream, fileName);
            // Keep scanning if this attachment did not yield usable XML
            if (xmlContent) {
              return xmlContent;
            }
          }
        }
      }

      console.warn('No valid XML found in embedded files');
      return null;
    } catch (error) {
      console.error('Error in standard extraction:', error);
      return null;
    }
  }
}