import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString, pako } from '../../../plugins.js';

/**
 * Base class for PDF XML extractors with common functionality.
 *
 * Concrete extractors implement {@link BaseXMLExtractor.extractXml}; the
 * protected helpers here cover validating candidate XML, cutting an XML
 * document out of a larger string, and decoding an embedded PDF stream.
 */
export abstract class BaseXMLExtractor {
  /**
   * Known XML file names for different invoice formats
   */
  protected readonly knownFileNames = [
    'factur-x.xml',
    'zugferd-invoice.xml',
    'ZUGFeRD-invoice.xml',
    'xrechnung.xml'
  ];

  /**
   * Known XML formats to validate extracted content.
   * A candidate is accepted by `isValidXml` only if it contains at least one
   * of these markers as a substring.
   */
  protected readonly knownFormats = [
    'CrossIndustryInvoice',
    'CrossIndustryDocument',
    'Invoice',
    'CreditNote',
    'ubl:Invoice',
    'ubl:CreditNote',
    'rsm:CrossIndustryInvoice',
    'rsm:CrossIndustryDocument',
    'ram:CrossIndustryDocument',
    'urn:un:unece:uncefact',
    'urn:ferd:CrossIndustryDocument',
    'urn:zugferd',
    'urn:factur-x',
    'factur-x.eu',
    'ZUGFeRD'
  ];

  /**
   * Known XML end tags for extracting content from strings.
   *
   * NOTE(review): the nine literals had been reduced to empty strings by a
   * markup stripper. They are restored here as the closing tags of the root
   * elements named in `knownFormats`; confirm the exact set and order, since
   * `extractXmlFromString` uses the first entry that matches.
   */
  protected readonly knownEndTags = [
    '</CrossIndustryInvoice>',
    '</CrossIndustryDocument>',
    '</Invoice>',
    '</CreditNote>',
    '</rsm:CrossIndustryInvoice>',
    '</rsm:CrossIndustryDocument>',
    '</ram:CrossIndustryDocument>',
    '</ubl:Invoice>',
    '</ubl:CreditNote>'
  ];

  /**
   * Extract XML from a PDF buffer
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  public abstract extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null>;

  /**
   * Check if an XML string is valid
   *
   * This is a heuristic check, not a parse: the string must be non-empty,
   * contain an XML declaration, mention a known invoice format, contain none
   * of the control characters U+0000..U+0005, and be at least 100 characters
   * long.
   *
   * @param xmlString XML string to check
   * @returns True if the XML is valid
   */
  protected isValidXml(xmlString: string): boolean {
    try {
      // Basic checks for XML validity
      if (!xmlString || typeof xmlString !== 'string') {
        return false;
      }

      // Check that it contains an XML declaration (anywhere, not only at index 0)
      if (!xmlString.includes('<?xml')) {
        return false;
      }

      // Check if the XML string contains known invoice formats
      const hasKnownFormat = this.knownFormats.some(format => xmlString.includes(format));
      if (!hasKnownFormat) {
        return false;
      }

      // Check if the XML string contains binary data or invalid characters
      const invalidChars = ['\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005'];
      const hasBinaryData = invalidChars.some(char => xmlString.includes(char));
      if (hasBinaryData) {
        return false;
      }

      // Check if the XML string is too short
      if (xmlString.length < 100) {
        return false;
      }

      return true;
    } catch (error) {
      console.error('Error validating XML:', error);
      return false;
    }
  }

  /**
   * Extract XML from a string
   *
   * NOTE(review): everything after `text.indexOf('` was lost to the markup
   * stripper. The body below is reconstructed from the doc comment, the
   * `knownEndTags` field and `isValidXml`; confirm against the upstream file.
   *
   * @param text Text to extract XML from
   * @param startIndex Index to start extraction from
   * @returns XML content or null if not found
   */
  protected extractXmlFromString(text: string, startIndex: number = 0): string | null {
    try {
      // Find the start of the XML document
      const xmlStartIndex = text.indexOf('<?xml', startIndex);
      if (xmlStartIndex === -1) {
        return null;
      }

      // Find the end of the XML document: the first known end tag (in list
      // order) that occurs after the declaration
      let xmlEndIndex = -1;
      for (const endTag of this.knownEndTags) {
        const endIndex = text.indexOf(endTag, xmlStartIndex);
        if (endIndex !== -1) {
          xmlEndIndex = endIndex + endTag.length;
          break;
        }
      }

      if (xmlEndIndex === -1) {
        return null;
      }

      // Extract the XML content and only return it if it passes validation
      const xmlContent = text.substring(xmlStartIndex, xmlEndIndex);
      if (this.isValidXml(xmlContent)) {
        return xmlContent;
      }

      return null;
    } catch (error) {
      console.error('Error extracting XML from string:', error);
      return null;
    }
  }

  /**
   * Decode XML content from an embedded PDF stream.
   *
   * The stream bytes are first inflated with pako and decoded as UTF-8; if
   * inflation throws, or the inflated text fails `isValidXml`, the raw bytes
   * are decoded as UTF-8 and validated instead.
   *
   * NOTE(review): the original doc comment and signature were lost to the
   * markup stripper. The method name, `async` modifier and parameter types
   * are reconstructed from the surviving body (`stream.getContents()`,
   * `fileName` in the log messages, string/null returns); confirm against
   * the subclasses that call it.
   *
   * @param stream PDF stream holding the (possibly deflated) XML bytes
   * @param fileName Name of the embedded file, used only in log messages
   * @returns XML content or null if neither decoding yields valid XML
   */
  protected async extractXmlFromStream(stream: PDFRawStream, fileName: string): Promise<string | null> {
    try {
      // Pass the Uint8Array itself rather than its `.buffer`: a typed array
      // may be a view onto a larger ArrayBuffer, in which case `.buffer`
      // would expose unrelated bytes before or after the stream contents.
      const streamBytes = stream.getContents();

      // Try to decompress with pako
      try {
        const decompressedBytes = pako.inflate(streamBytes);
        const xmlContent = new TextDecoder('utf-8').decode(decompressedBytes);

        if (this.isValidXml(xmlContent)) {
          console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`);
          return xmlContent;
        }
      } catch (decompressError) {
        // Decompression failed, try without decompression
        console.log(`Decompression failed for ${fileName}, trying without decompression...`);
      }

      // Try without decompression
      const rawContent = new TextDecoder('utf-8').decode(streamBytes);

      if (this.isValidXml(rawContent)) {
        console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
        return rawContent;
      }

      return null;
    } catch (error) {
      console.error('Error extracting XML from stream:', error);
      return null;
    }
  }
}