import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
import * as pako from 'pako';

/**
 * Extracts an embedded invoice XML document from a PDF file.
 *
 * Two strategies are tried in order:
 *  1. walk the PDF's `Names -> EmbeddedFiles -> Names` array with pdf-lib and
 *     decode the first attachment whose name looks like an XML invoice;
 *  2. decode the first bytes of the file as text and cut out an XML document
 *     by plain string search.
 *
 * Every method reports failure by returning `null` (after logging) instead of
 * throwing.
 */
export class PDFExtractor {
  /** Lower-cased substrings that mark an attachment name as a candidate XML invoice. */
  private static readonly XML_FILE_NAME_HINTS: readonly string[] = [
    '.xml',
    'factur-x',
    'zugferd',
    'xrechnung'
  ];

  /** Number of leading bytes the string-search fallback decodes and scans. */
  private static readonly FALLBACK_SCAN_BYTES = 10000;

  /**
   * Patterns whose presence triggers the string-search fallback.
   * NOTE(review): the element names in the last four patterns were lost in the
   * reviewed copy of this file (only `]*>` survived); they are restored here to
   * mirror the root elements used elsewhere in this class - confirm.
   */
  private static readonly XML_START_PATTERNS: readonly RegExp[] = [
    /<\?xml[^>]*\?>/i,
    /<CrossIndustryInvoice[^>]*>/i,
    /<Invoice[^>]*>/i,
    /<CreditNote[^>]*>/i,
    /<rsm:CrossIndustryInvoice[^>]*>/i
  ];

  /**
   * Closing root tags, in lookup priority order, used to find the document end.
   * NOTE(review): four literals existed in the original but their text was
   * lost; the names below are a reconstruction - confirm.
   */
  private static readonly ROOT_END_TAGS: readonly string[] = [
    '</CrossIndustryInvoice>',
    '</Invoice>',
    '</CreditNote>',
    '</rsm:CrossIndustryInvoice>'
  ];

  /**
   * Substrings of which at least one must occur in an accepted XML document.
   * NOTE(review): the original list was lost; reconstructed - confirm. Because
   * the check is a substring test, 'Invoice' also covers prefixed or longer
   * element names that contain it.
   */
  private static readonly KNOWN_ROOT_NAMES: readonly string[] = [
    'CrossIndustryInvoice',
    'Invoice',
    'CreditNote'
  ];

  /** Control characters whose presence means the text is really binary data. */
  private static readonly BINARY_MARKERS: readonly string[] = [
    '\u0000',
    '\u0001',
    '\u0002',
    '\u0003',
    '\u0004',
    '\u0005'
  ];

  /** Documents shorter than this many characters are rejected as implausible. */
  private static readonly MIN_XML_LENGTH = 100;

  /**
   * Extracts XML from a PDF buffer.
   * @param pdfBuffer PDF file contents
   * @returns the XML text, or null when no strategy yields XML that passes
   *          {@link PDFExtractor.isValidXml}
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      // First try the structured extraction through the embedded-files name tree.
      const standardXml = await this.standardExtraction(pdfBuffer);
      if (standardXml && this.isValidXml(standardXml)) {
        return standardXml;
      }

      // Fall back to scanning the raw bytes as text.
      const alternativeXml = await this.alternativeExtraction(pdfBuffer);
      if (alternativeXml && this.isValidXml(alternativeXml)) {
        return alternativeXml;
      }

      console.warn('All extraction methods failed, no valid XML found in PDF');
      return null;
    } catch (error) {
      console.error('Error extracting XML from PDF:', error);
      return null;
    }
  }

  /**
   * Standard extraction: parse the PDF with pdf-lib, locate an embedded XML
   * attachment and decode it.
   * @param pdfBuffer PDF file contents
   * @returns the XML text, or null if no valid embedded XML was found or the
   *          PDF could not be parsed
   */
  private async standardExtraction(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBuffer);

      const embedded = this.findEmbeddedXmlStream(pdfDoc);
      if (!embedded) {
        return null;
      }

      return this.decodeEmbeddedXml(embedded.stream, embedded.fileName);
    } catch (error) {
      console.error('Error in standard extraction:', error);
      return null;
    }
  }

  /**
   * Walks catalog `Names` -> `EmbeddedFiles` -> `Names` and returns the first
   * attachment whose name matches {@link PDFExtractor.XML_FILE_NAME_HINTS} and
   * whose file spec carries a raw stream under `EF` / `F`.
   * Only a flat `Names` array is read; entries that are not a
   * (PDFString, PDFDict) pair are skipped.
   * @param pdfDoc parsed PDF document
   * @returns the stream and its name, or null (after a warning) if none exists
   */
  private findEmbeddedXmlStream(
    pdfDoc: PDFDocument
  ): { stream: PDFRawStream; fileName: string } | null {
    const namesDictObj = pdfDoc.catalog.lookup(PDFName.of('Names'));
    if (!(namesDictObj instanceof PDFDict)) {
      console.warn('No Names dictionary found in PDF! This PDF does not contain embedded files.');
      return null;
    }

    const embeddedFilesDictObj = namesDictObj.lookup(PDFName.of('EmbeddedFiles'));
    if (!(embeddedFilesDictObj instanceof PDFDict)) {
      console.warn('No EmbeddedFiles dictionary found! This PDF does not contain embedded files.');
      return null;
    }

    const filesSpecObj = embeddedFilesDictObj.lookup(PDFName.of('Names'));
    if (!(filesSpecObj instanceof PDFArray)) {
      console.warn('No files specified in EmbeddedFiles dictionary!');
      return null;
    }

    // The array alternates name, file spec, name, file spec, ...; the bound on
    // i + 1 keeps a trailing unpaired entry from being looked up.
    const entryCount = filesSpecObj.size();
    for (let i = 0; i + 1 < entryCount; i += 2) {
      const fileNameObj = filesSpecObj.lookup(i);
      const fileSpecObj = filesSpecObj.lookup(i + 1);
      if (!(fileNameObj instanceof PDFString) || !(fileSpecObj instanceof PDFDict)) {
        continue;
      }

      const fileName = fileNameObj.toString();
      const lowerCaseName = fileName.toLowerCase();
      const looksLikeXml = PDFExtractor.XML_FILE_NAME_HINTS.some((hint) =>
        lowerCaseName.includes(hint)
      );
      if (!looksLikeXml) {
        continue;
      }

      const efDictObj = fileSpecObj.lookup(PDFName.of('EF'));
      if (!(efDictObj instanceof PDFDict)) {
        continue;
      }

      const maybeStream = efDictObj.lookup(PDFName.of('F'));
      if (maybeStream instanceof PDFRawStream) {
        return { stream: maybeStream, fileName };
      }
    }

    console.warn('No embedded XML file found in the PDF!');
    return null;
  }

  /**
   * Decodes an embedded stream to XML text. The inflated bytes are tried
   * first; if inflation throws or its result is not valid XML, the raw stream
   * bytes are decoded as UTF-8 instead.
   * @param xmlFile embedded file stream
   * @param xmlFileName attachment name, used only in log messages
   * @returns the XML text, or null if neither form passes validation
   */
  private decodeEmbeddedXml(xmlFile: PDFRawStream, xmlFileName: string): string | null {
    const decoder = new TextDecoder('utf-8');
    const rawBytes = xmlFile.getContents();

    let inflatedXml: string | null = null;
    try {
      // Pass the view itself: its `.buffer` is the whole backing ArrayBuffer,
      // which can contain bytes outside the stream's own range.
      inflatedXml = decoder.decode(pako.inflate(rawBytes));
    } catch (decompressError) {
      console.log('Decompression failed, trying without decompression...');
    }

    if (inflatedXml !== null) {
      if (this.isValidXml(inflatedXml)) {
        console.log(`Successfully extracted XML from PDF file. File name: ${xmlFileName}`);
        return inflatedXml;
      }
      console.log('Decompression succeeded but XML is not valid, trying without decompression...');
    }

    try {
      const rawXml = decoder.decode(rawBytes);
      if (this.isValidXml(rawXml)) {
        console.log(
          `Successfully extracted uncompressed XML from PDF file. File name: ${xmlFileName}`
        );
        return rawXml;
      }

      console.log(
        inflatedXml !== null
          ? 'Neither decompressed nor raw XML content is valid'
          : 'Uncompressed XML content is not valid'
      );
      return null;
    } catch (decodeError) {
      console.error('Error decoding XML content:', decodeError);
      return null;
    }
  }

  /**
   * Alternative extraction: decode the first
   * {@link PDFExtractor.FALLBACK_SCAN_BYTES} bytes as UTF-8 and, if any known
   * XML pattern occurs, cut the document out by string search.
   * @param pdfBuffer PDF file contents
   * @returns the XML text, or null if nothing was found in the scanned prefix
   */
  private async alternativeExtraction(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      // Copy only the scanned prefix instead of the whole file.
      const head = pdfBuffer.subarray(0, PDFExtractor.FALLBACK_SCAN_BYTES);
      const pdfString = Buffer.from(head).toString('utf8');

      for (const pattern of PDFExtractor.XML_START_PATTERNS) {
        const match = pdfString.match(pattern);
        if (!match) {
          continue;
        }

        console.log(`Found XML pattern in PDF: ${match[0]}`);

        const xmlContent = this.extractXmlFromString(pdfString);
        if (xmlContent) {
          console.log('Successfully extracted XML from PDF string');
          return xmlContent;
        }

        // The extraction does not depend on which pattern matched, so trying
        // the remaining patterns would repeat the same failed search.
        return null;
      }

      return null;
    } catch (error) {
      console.error('Error in alternative extraction:', error);
      return null;
    }
  }

  /**
   * Cuts an XML document out of a larger string: from the first `<?xml` up to
   * and including the first known closing root tag that follows it.
   * @param pdfString text to search
   * @returns the XML substring, or null if no start marker or no closing tag
   *          after it was found
   */
  private extractXmlFromString(pdfString: string): string | null {
    try {
      const xmlStartIndex = pdfString.indexOf('<?xml');
      if (xmlStartIndex === -1) {
        return null;
      }

      for (const endTag of PDFExtractor.ROOT_END_TAGS) {
        // Search from the declaration onwards; a closing tag located before it
        // would make substring() swap its arguments and return unrelated text.
        const endIndex = pdfString.indexOf(endTag, xmlStartIndex);
        if (endIndex !== -1) {
          return pdfString.substring(xmlStartIndex, endIndex + endTag.length);
        }
      }

      return null;
    } catch (error) {
      console.error('Error extracting XML from string:', error);
      return null;
    }
  }

  /**
   * Heuristic plausibility check; this is not an XML parser.
   * Accepts the text only if it contains an XML declaration marker, mentions
   * at least one known root element name, contains none of the control
   * characters U+0000..U+0005, and is at least
   * {@link PDFExtractor.MIN_XML_LENGTH} characters long.
   * @param xmlString XML string to check
   * @returns true if the string passes all checks
   */
  private isValidXml(xmlString: string): boolean {
    try {
      if (!xmlString.includes('<?xml')) {
        return false;
      }

      const hasKnownFormat = PDFExtractor.KNOWN_ROOT_NAMES.some((format) =>
        xmlString.includes(format)
      );
      if (!hasKnownFormat) {
        return false;
      }

      const hasBinaryData = PDFExtractor.BINARY_MARKERS.some((char) => xmlString.includes(char));
      if (hasBinaryData) {
        return false;
      }

      return xmlString.length >= PDFExtractor.MIN_XML_LENGTH;
    } catch (error) {
      console.error('Error validating XML:', error);
      return false;
    }
  }
}