import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from '../../../plugins.js';
import { BaseXMLExtractor } from './base.extractor.js';

/**
 * Associated files extractor for PDF/A-3 documents.
 *
 * Extracts XML from associated files (the `AF` entry in the document catalog).
 * Particularly useful for ZUGFeRD v1 and some Factur-X documents.
 */
export class AssociatedFilesExtractor extends BaseXMLExtractor {
  /**
   * Extract XML from a PDF buffer using the catalog's associated files.
   *
   * Walks the `AF` array in order and returns the first non-empty result
   * produced by `extractXmlFromStream` for a file whose name looks like an
   * invoice XML attachment. Any error raised while loading or walking the
   * document is logged and reported to the caller as `null`.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content, or `null` if nothing suitable was found or an error occurred
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBuffer);

      // Try to find associated files via the AF entry in the catalog
      const afArray = pdfDoc.catalog.lookup(PDFName.of('AF'));
      if (!(afArray instanceof PDFArray)) {
        console.warn('No AF (Associated Files) entry found in PDF catalog');
        return null;
      }

      // Substrings that mark a file name as invoice-related even without an .xml suffix
      const invoiceKeywords = ['zugferd', 'factur-x', 'xrechnung', 'invoice'];

      // Process each associated file
      for (let i = 0; i < afArray.size(); i++) {
        const fileSpec = afArray.lookup(i);
        if (!(fileSpec instanceof PDFDict)) {
          continue;
        }

        // Resolve the file name: prefer /F, but fall back to /UF when /F is
        // missing or is not a PDFString, instead of skipping the file outright.
        const primaryName = fileSpec.lookup(PDFName.of('F'));
        const fileNameObj =
          primaryName instanceof PDFString ? primaryName : fileSpec.lookup(PDFName.of('UF'));
        if (!(fileNameObj instanceof PDFString)) {
          continue;
        }

        const fileName = fileNameObj.decodeText();
        // Lower-case once; every comparison below is case-insensitive.
        const lowerName = fileName.toLowerCase();

        // Check if it's a known invoice XML file name
        const isKnownFileName = this.knownFileNames.some(
          knownName => lowerName === knownName.toLowerCase()
        );

        // Check if it's any XML file or has invoice-related keywords
        const isXmlFile =
          lowerName.endsWith('.xml') ||
          invoiceKeywords.some(keyword => lowerName.includes(keyword));

        if (!isKnownFileName && !isXmlFile) {
          continue;
        }

        // Get the embedded file dictionary
        const efDict = fileSpec.lookup(PDFName.of('EF'));
        if (!(efDict instanceof PDFDict)) {
          continue;
        }

        // Get the file stream: /EF may carry the stream under /F or /UF,
        // so try /UF when /F does not hold a raw stream.
        const primaryStream = efDict.lookup(PDFName.of('F'));
        const fileStream =
          primaryStream instanceof PDFRawStream ? primaryStream : efDict.lookup(PDFName.of('UF'));
        if (!(fileStream instanceof PDFRawStream)) {
          continue;
        }

        const xmlContent = await this.extractXmlFromStream(fileStream, fileName);
        if (xmlContent) {
          return xmlContent;
        }
      }

      console.warn('No valid XML found in associated files');
      return null;
    } catch (error) {
      // Best-effort extractor: report failure as "not found" rather than throwing.
      console.error('Error in associated files extraction:', error);
      return null;
    }
  }
}