import {
  PDFArray,
  PDFDict,
  PDFDocument,
  PDFHexString,
  PDFName,
  PDFRawStream,
  PDFString,
} from 'pdf-lib';
import * as pako from 'pako';

/**
 * Class for extracting XML from PDF files
 */
export class PDFExtractor {
  /**
   * Lower-case substrings that mark an embedded file as a candidate XML
   * attachment: the `.xml` extension plus the known standard file names.
   */
  private static readonly XML_NAME_HINTS: readonly string[] = [
    '.xml',
    'factur-x',
    'zugferd',
    'xrechnung',
  ];

  /**
   * Extracts XML from a PDF buffer.
   *
   * Walks catalog -> /Names -> /EmbeddedFiles -> /Names, picks the first
   * embedded file whose name looks like an XML attachment, and returns its
   * content decoded as UTF-8.
   *
   * NOTE(review): only a flat /Names array directly on the EmbeddedFiles
   * dictionary is read; name trees that are split into /Kids nodes are not
   * traversed.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   * @throws Rethrows (after logging) any error raised while loading the PDF
   *   or decompressing/decoding the embedded stream.
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBuffer);

      // Get the catalog's Names dictionary (the entry point to embedded files)
      const namesDictObj = pdfDoc.catalog.lookup(PDFName.of('Names'));
      if (!(namesDictObj instanceof PDFDict)) {
        console.warn('No Names dictionary found in PDF! This PDF does not contain embedded files.');
        return null;
      }

      const embeddedFilesDictObj = namesDictObj.lookup(PDFName.of('EmbeddedFiles'));
      if (!(embeddedFilesDictObj instanceof PDFDict)) {
        console.warn('No EmbeddedFiles dictionary found! This PDF does not contain embedded files.');
        return null;
      }

      const filesSpecObj = embeddedFilesDictObj.lookup(PDFName.of('Names'));
      if (!(filesSpecObj instanceof PDFArray)) {
        console.warn('No files specified in EmbeddedFiles dictionary!');
        return null;
      }

      // Try to find an XML file in the embedded files
      const attachment = PDFExtractor.findXmlAttachment(filesSpecObj);
      if (!attachment) {
        console.warn('No embedded XML file found in the PDF!');
        return null;
      }

      // Decompress (when needed) and decode the XML content
      const xmlBytes = PDFExtractor.decodeStream(attachment.stream);
      const xmlContent = new TextDecoder('utf-8').decode(xmlBytes);

      console.log(`Successfully extracted XML from PDF file. File name: ${attachment.fileName}`);
      return xmlContent;
    } catch (error) {
      console.error('Error extracting or parsing embedded XML from PDF:', error);
      throw error;
    }
  }

  /**
   * Scans a flat `[name, fileSpec, name, fileSpec, ...]` array and returns the
   * first entry whose name looks like XML and whose file specification holds a
   * raw embedded stream.
   *
   * @param names The /Names array of the EmbeddedFiles dictionary
   * @returns The matching stream with its decoded file name, or undefined
   */
  private static findXmlAttachment(
    names: PDFArray,
  ): { stream: PDFRawStream; fileName: string } | undefined {
    // Entries come in pairs; `i + 1 < size` skips a dangling key at the end.
    for (let i = 0; i + 1 < names.size(); i += 2) {
      const fileNameObj = names.lookup(i);
      const fileSpecObj = names.lookup(i + 1);

      // Keys may be written as literal strings or as hex strings.
      if (!(fileNameObj instanceof PDFString || fileNameObj instanceof PDFHexString)) {
        continue;
      }
      if (!(fileSpecObj instanceof PDFDict)) {
        continue;
      }

      // decodeText() yields the actual name; toString() would return the PDF
      // literal syntax (surrounding parentheses, escape sequences).
      const fileName = fileNameObj.decodeText();
      if (!PDFExtractor.looksLikeXml(fileName)) {
        continue;
      }

      const efDictObj = fileSpecObj.lookup(PDFName.of('EF'));
      if (!(efDictObj instanceof PDFDict)) {
        continue;
      }

      // Prefer /F; fall back to /UF for file specs that only carry that key.
      for (const key of ['F', 'UF']) {
        const maybeStream = efDictObj.lookup(PDFName.of(key));
        if (maybeStream instanceof PDFRawStream) {
          return { stream: maybeStream, fileName };
        }
      }
    }
    return undefined;
  }

  /**
   * Checks a file name against the extension and known standard file names.
   *
   * @param fileName Decoded embedded file name
   * @returns true when the lower-cased name contains one of the hints
   */
  private static looksLikeXml(fileName: string): boolean {
    const lowered = fileName.toLowerCase();
    return PDFExtractor.XML_NAME_HINTS.some((hint) => lowered.includes(hint));
  }

  /**
   * Returns the decoded bytes of an embedded file stream.
   *
   * A stream without a /Filter entry (or with an empty filter array) is
   * stored uncompressed and returned as-is; anything else is inflated with
   * pako, as before.
   *
   * @param stream Raw embedded file stream
   * @returns The stream's decoded content
   * @throws Whatever pako.inflate throws when the data is not valid
   *   zlib/deflate data (e.g. a filter other than FlateDecode was used).
   */
  private static decodeStream(stream: PDFRawStream): Uint8Array {
    // Pass the Uint8Array view itself: `.buffer` would expose the whole
    // backing ArrayBuffer and ignore the view's byteOffset/length.
    const contents = stream.getContents();

    const filter = stream.dict.lookup(PDFName.of('Filter'));
    const unfiltered =
      filter === undefined || (filter instanceof PDFArray && filter.size() === 0);
    if (unfiltered) {
      return contents;
    }

    return pako.inflate(contents);
  }
}