feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic

2025-04-03 20:08:02 +00:00
parent b4a95de482
commit 46331c2bf6
28 changed files with 1191 additions and 294 deletions
@@ -0,0 +1,78 @@
+import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
+import { BaseXMLExtractor } from './base.extractor.js';
+
+/**
+ * Associated files extractor for PDF/A-3 documents.
+ * Extracts XML from associated files (the AF entry in the document catalog).
+ * Particularly useful for ZUGFeRD v1 and some Factur-X documents.
+ */
+export class AssociatedFilesExtractor extends BaseXMLExtractor {
+  /**
+   * Extract XML from a PDF buffer using associated files.
+   *
+   * Walks the catalog's AF array in order and returns the content of the first
+   * file specification whose name looks invoice-related and whose embedded
+   * stream is accepted by `extractXmlFromStream`.
+   *
+   * @param pdfBuffer PDF buffer
+   * @returns XML content or null if not found; errors are logged and reported as null
+   */
+  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
+    try {
+      const pdfDoc = await PDFDocument.load(pdfBuffer);
+
+      // Try to find associated files via the AF entry in the catalog
+      const afArray = pdfDoc.catalog.lookup(PDFName.of('AF'));
+      if (!(afArray instanceof PDFArray)) {
+        console.warn('No AF (Associated Files) entry found in PDF catalog');
+        return null;
+      }
+
+      // Process each associated file
+      for (let i = 0; i < afArray.size(); i++) {
+        const fileSpec = afArray.lookup(i);
+        if (!(fileSpec instanceof PDFDict)) {
+          continue;
+        }
+
+        const fileName = this.readFileName(fileSpec);
+        if (fileName === null || !this.isCandidateFileName(fileName)) {
+          continue;
+        }
+
+        // Get the embedded file dictionary
+        const efDict = fileSpec.lookup(PDFName.of('EF'));
+        if (!(efDict instanceof PDFDict)) {
+          continue;
+        }
+
+        // Get the file stream; only raw streams can be handed to the shared decoder
+        const fileStream = efDict.lookup(PDFName.of('F'));
+        if (!(fileStream instanceof PDFRawStream)) {
+          continue;
+        }
+
+        const xmlContent = await this.extractXmlFromStream(fileStream, fileName);
+        if (xmlContent) {
+          return xmlContent;
+        }
+      }
+
+      console.warn('No valid XML found in associated files');
+      return null;
+    } catch (error) {
+      console.error('Error in associated files extraction:', error);
+      return null;
+    }
+  }
+
+  /**
+   * Read the file name of a file specification dictionary.
+   *
+   * Prefers the F entry and falls back to UF. Each entry is checked on its own,
+   * so an F entry that exists but is not a PDFString does not hide a usable UF
+   * entry.
+   * NOTE(review): names stored as hex strings are still skipped because only
+   * PDFString is imported here - confirm whether PDFHexString should be accepted.
+   *
+   * @param fileSpec File specification dictionary from the AF array
+   * @returns Decoded file name, or null if neither entry is a PDFString
+   */
+  private readFileName(fileSpec: PDFDict): string | null {
+    for (const key of ['F', 'UF']) {
+      const candidate = fileSpec.lookup(PDFName.of(key));
+      if (candidate instanceof PDFString) {
+        return candidate.decodeText();
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Decide whether a file name is worth inspecting as invoice XML.
+   *
+   * @param fileName Decoded file name of the attachment
+   * @returns True for a known invoice file name, any `.xml` file, or a name
+   *          containing an invoice-related keyword (all case-insensitive)
+   */
+  private isCandidateFileName(fileName: string): boolean {
+    const lowerName = fileName.toLowerCase();
+
+    // Check if it's a known invoice XML file name
+    const isKnownFileName = this.knownFileNames.some(
+      knownName => lowerName === knownName.toLowerCase()
+    );
+
+    // Check if it's any XML file or has invoice-related keywords
+    const isXmlFile = lowerName.endsWith('.xml') ||
+                      lowerName.includes('zugferd') ||
+                      lowerName.includes('factur-x') ||
+                      lowerName.includes('xrechnung') ||
+                      lowerName.includes('invoice');
+
+    return isKnownFileName || isXmlFile;
+  }
+}
@@ -0,0 +1,177 @@
+import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
+import * as pako from 'pako';
+
+/**
+ * Base class for PDF XML extractors with common functionality:
+ * shared lookup tables, a heuristic XML validity check, substring-based XML
+ * extraction and decoding of embedded PDF streams.
+ */
+export abstract class BaseXMLExtractor {
+  /**
+   * Known XML file names for different invoice formats
+   */
+  protected readonly knownFileNames = [
+    'factur-x.xml',
+    'zugferd-invoice.xml',
+    'ZUGFeRD-invoice.xml',
+    'xrechnung.xml'
+  ];
+
+  /**
+   * Known XML formats to validate extracted content.
+   * A candidate is accepted if it contains at least one of these substrings.
+   */
+  protected readonly knownFormats = [
+    'CrossIndustryInvoice',
+    'CrossIndustryDocument',
+    'Invoice',
+    'CreditNote',
+    'ubl:Invoice',
+    'ubl:CreditNote',
+    'rsm:CrossIndustryInvoice',
+    'rsm:CrossIndustryDocument',
+    'ram:CrossIndustryDocument',
+    'urn:un:unece:uncefact',
+    'urn:ferd:CrossIndustryDocument',
+    'urn:zugferd',
+    'urn:factur-x',
+    'factur-x.eu',
+    'ZUGFeRD'
+  ];
+
+  /**
+   * Known XML end tags for extracting content from strings.
+   * The list is tried in order; the first tag that occurs wins.
+   */
+  protected readonly knownEndTags = [
+    '</CrossIndustryInvoice>',
+    '</CrossIndustryDocument>',
+    '</Invoice>',
+    '</CreditNote>',
+    '</rsm:CrossIndustryInvoice>',
+    '</rsm:CrossIndustryDocument>',
+    '</ram:CrossIndustryDocument>',
+    '</ubl:Invoice>',
+    '</ubl:CreditNote>'
+  ];
+
+  /**
+   * Extract XML from a PDF buffer
+   * @param pdfBuffer PDF buffer
+   * @returns XML content or null if not found
+   */
+  public abstract extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null>;
+
+  /**
+   * Heuristically check whether a string looks like invoice XML.
+   * This does not parse the XML; it only applies cheap substring checks.
+   *
+   * @param xmlString XML string to check
+   * @returns True if the string contains an XML declaration and a known format
+   *          marker, has no low control characters and is at least 100 characters long
+   */
+  protected isValidXml(xmlString: string): boolean {
+    try {
+      // Basic checks for XML validity
+      if (!xmlString || typeof xmlString !== 'string') {
+        return false;
+      }
+
+      // Check that an XML declaration is present somewhere in the string
+      // (not necessarily at position 0)
+      if (!xmlString.includes('<?xml')) {
+        return false;
+      }
+
+      // Check if the XML string contains known invoice formats
+      const hasKnownFormat = this.knownFormats.some(format => xmlString.includes(format));
+      if (!hasKnownFormat) {
+        return false;
+      }
+
+      // Check if the XML string contains binary data or invalid characters
+      const invalidChars = ['\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005'];
+      const hasBinaryData = invalidChars.some(char => xmlString.includes(char));
+      if (hasBinaryData) {
+        return false;
+      }
+
+      // Check if the XML string is too short
+      if (xmlString.length < 100) {
+        return false;
+      }
+
+      return true;
+    } catch (error) {
+      console.error('Error validating XML:', error);
+      return false;
+    }
+  }
+
+  /**
+   * Extract XML from a string.
+   *
+   * The result starts at the first `<?xml` found at or after `startIndex` and
+   * ends right after the first entry of `knownEndTags` (in list order) that
+   * occurs after that start.
+   *
+   * @param text Text to extract XML from
+   * @param startIndex Index to start extraction from
+   * @returns XML content or null if no declaration or no known end tag is found
+   */
+  protected extractXmlFromString(text: string, startIndex: number = 0): string | null {
+    try {
+      // Find the start of the XML document
+      const xmlStartIndex = text.indexOf('<?xml', startIndex);
+      if (xmlStartIndex === -1) {
+        return null;
+      }
+
+      // Try to find the end of the XML document
+      let xmlEndIndex = -1;
+      for (const endTag of this.knownEndTags) {
+        const endIndex = text.indexOf(endTag, xmlStartIndex);
+        if (endIndex !== -1) {
+          xmlEndIndex = endIndex + endTag.length;
+          break;
+        }
+      }
+
+      if (xmlEndIndex === -1) {
+        return null;
+      }
+
+      // Extract the XML content
+      return text.substring(xmlStartIndex, xmlEndIndex);
+    } catch (error) {
+      console.error('Error extracting XML from string:', error);
+      return null;
+    }
+  }
+
+  /**
+   * Decompress and decode XML content from a PDF stream.
+   *
+   * The stream bytes are first inflated with pako; if inflating throws or the
+   * inflated text is not accepted by `isValidXml`, the raw bytes are decoded
+   * as-is and validated instead.
+   *
+   * @param stream PDF stream containing XML data
+   * @param fileName Name of the file (for logging)
+   * @returns XML content or null if not valid
+   */
+  protected async extractXmlFromStream(stream: PDFRawStream, fileName: string): Promise<string | null> {
+    try {
+      // Work on the Uint8Array itself, never on its `.buffer`: the array may be
+      // a view onto a slice of a larger ArrayBuffer, in which case `.buffer`
+      // would hand pako bytes that do not belong to this stream.
+      const streamBytes = stream.getContents();
+
+      // Try to decompress with pako
+      try {
+        const decompressedBytes = pako.inflate(streamBytes);
+        const xmlContent = new TextDecoder('utf-8').decode(decompressedBytes);
+
+        if (this.isValidXml(xmlContent)) {
+          console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`);
+          return xmlContent;
+        }
+      } catch (decompressError) {
+        // Decompression failed, try without decompression
+        console.log(`Decompression failed for ${fileName}, trying without decompression...`);
+      }
+
+      // Try without decompression
+      const rawContent = new TextDecoder('utf-8').decode(streamBytes);
+
+      if (this.isValidXml(rawContent)) {
+        console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
+        return rawContent;
+      }
+
+      return null;
+    } catch (error) {
+      console.error('Error extracting XML from stream:', error);
+      return null;
+    }
+  }
+}
@@ -0,0 +1,4 @@
+export * from './base.extractor.js';
+export * from './standard.extractor.js';
+export * from './associated.extractor.js';
+export * from './text.extractor.js';
@@ -0,0 +1,86 @@
+import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
+import { BaseXMLExtractor } from './base.extractor.js';
+
+/**
+ * Standard PDF XML extractor that extracts XML from embedded files.
+ * Works with PDF/A-3 documents that follow the standard for embedding files
+ * (catalog -> Names -> EmbeddedFiles name tree).
+ */
+export class StandardXMLExtractor extends BaseXMLExtractor {
+  /**
+   * Extract XML from a PDF buffer using standard PDF/A-3 embedded files.
+   *
+   * Every name/file-specification pair of the EmbeddedFiles name tree is
+   * inspected, including pairs stored in child nodes referenced through Kids.
+   * The first attachment with an invoice-like name whose stream is accepted by
+   * `extractXmlFromStream` is returned.
+   *
+   * @param pdfBuffer PDF buffer
+   * @returns XML content or null if not found; errors are logged and reported as null
+   */
+  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
+    try {
+      const pdfDoc = await PDFDocument.load(pdfBuffer);
+
+      // Get the catalog's Names dictionary (holds the document's name trees)
+      const namesDictObj = pdfDoc.catalog.lookup(PDFName.of('Names'));
+      if (!(namesDictObj instanceof PDFDict)) {
+        console.warn('No Names dictionary found in PDF! This PDF does not contain embedded files.');
+        return null;
+      }
+
+      // Get the embedded files dictionary (root of the EmbeddedFiles name tree)
+      const embeddedFilesDictObj = namesDictObj.lookup(PDFName.of('EmbeddedFiles'));
+      if (!(embeddedFilesDictObj instanceof PDFDict)) {
+        console.warn('No EmbeddedFiles dictionary found! This PDF does not contain embedded files.');
+        return null;
+      }
+
+      // Collect the Names arrays of the root node and of all nodes below it
+      const namesArrays = this.collectNamesArrays(embeddedFilesDictObj, new Set<PDFDict>());
+      if (namesArrays.length === 0) {
+        console.warn('No files specified in EmbeddedFiles dictionary!');
+        return null;
+      }
+
+      // Try to find an XML file in the embedded files
+      for (const filesSpecObj of namesArrays) {
+        // Entries come in (name, file specification) pairs; a dangling last
+        // entry without a partner is ignored.
+        for (let i = 0; i + 1 < filesSpecObj.size(); i += 2) {
+          const fileNameObj = filesSpecObj.lookup(i);
+          const fileSpecObj = filesSpecObj.lookup(i + 1);
+
+          if (!(fileNameObj instanceof PDFString) || !(fileSpecObj instanceof PDFDict)) {
+            continue;
+          }
+
+          // Get the filename as string
+          const fileName = fileNameObj.decodeText();
+          if (!this.isCandidateFileName(fileName)) {
+            continue;
+          }
+
+          const efDictObj = fileSpecObj.lookup(PDFName.of('EF'));
+          if (!(efDictObj instanceof PDFDict)) {
+            continue;
+          }
+
+          const fileStream = efDictObj.lookup(PDFName.of('F'));
+          if (fileStream instanceof PDFRawStream) {
+            const xmlContent = await this.extractXmlFromStream(fileStream, fileName);
+            if (xmlContent) {
+              return xmlContent;
+            }
+          }
+        }
+      }
+
+      console.warn('No valid XML found in embedded files');
+      return null;
+    } catch (error) {
+      console.error('Error in standard extraction:', error);
+      return null;
+    }
+  }
+
+  /**
+   * Gather all Names arrays reachable from a name tree node.
+   *
+   * A node may carry its pairs directly in Names, delegate to child nodes via
+   * Kids, or both; the node's own array is returned before those of its kids.
+   *
+   * @param node Name tree node to inspect
+   * @param visited Nodes already inspected; guards against cyclic Kids
+   *                references in malformed documents
+   * @returns Names arrays in traversal order (possibly empty)
+   */
+  private collectNamesArrays(node: PDFDict, visited: Set<PDFDict>): PDFArray[] {
+    if (visited.has(node)) {
+      return [];
+    }
+    visited.add(node);
+
+    const collected: PDFArray[] = [];
+
+    const names = node.lookup(PDFName.of('Names'));
+    if (names instanceof PDFArray) {
+      collected.push(names);
+    }
+
+    const kids = node.lookup(PDFName.of('Kids'));
+    if (kids instanceof PDFArray) {
+      for (let i = 0; i < kids.size(); i++) {
+        const kid = kids.lookup(i);
+        if (kid instanceof PDFDict) {
+          collected.push(...this.collectNamesArrays(kid, visited));
+        }
+      }
+    }
+
+    return collected;
+  }
+
+  /**
+   * Decide whether an attachment name is worth inspecting as invoice XML.
+   *
+   * @param fileName Decoded attachment name
+   * @returns True for a known invoice file name, any `.xml` file, or a name
+   *          containing an invoice-related keyword (all case-insensitive)
+   */
+  private isCandidateFileName(fileName: string): boolean {
+    const lowerName = fileName.toLowerCase();
+
+    // Check if it's a known invoice XML file name
+    const isKnownFileName = this.knownFileNames.some(
+      knownName => lowerName === knownName.toLowerCase()
+    );
+
+    // Check if it's any XML file or has invoice-related keywords
+    const isXmlFile = lowerName.endsWith('.xml') ||
+                      lowerName.includes('zugferd') ||
+                      lowerName.includes('factur-x') ||
+                      lowerName.includes('xrechnung') ||
+                      lowerName.includes('invoice');
+
+    return isKnownFileName || isXmlFile;
+  }
+}
@@ -0,0 +1,55 @@
+import { BaseXMLExtractor } from './base.extractor.js';
+
+/**
+ * Text-based XML extractor for PDF documents.
+ * Extracts XML by searching for XML patterns in the raw PDF bytes decoded as text.
+ * Used as a fallback when other extraction methods fail; it can only find XML
+ * that is stored uncompressed within the scanned prefix of the file.
+ */
+export class TextXMLExtractor extends BaseXMLExtractor {
+  /**
+   * Number of leading bytes of the PDF that are decoded and searched.
+   */
+  private static readonly maxScanBytes = 50000;
+
+  /**
+   * Patterns that hint at invoice XML: the XML declaration itself followed by
+   * the opening tags of the supported root elements.
+   */
+  private static readonly xmlPatterns: readonly RegExp[] = [
+    /<\?xml[^>]*\?>/i,
+    /<CrossIndustryInvoice[^>]*>/i,
+    /<CrossIndustryDocument[^>]*>/i,
+    /<Invoice[^>]*>/i,
+    /<CreditNote[^>]*>/i,
+    /<rsm:CrossIndustryInvoice[^>]*>/i,
+    /<rsm:CrossIndustryDocument[^>]*>/i,
+    /<ram:CrossIndustryDocument[^>]*>/i,
+    /<ubl:Invoice[^>]*>/i,
+    /<ubl:CreditNote[^>]*>/i
+  ];
+
+  /**
+   * Extract XML from a PDF buffer by searching for XML patterns in the text.
+   *
+   * Patterns are tried in order. For each hit the extraction is anchored at the
+   * closest `<?xml` declaration at or before the hit, so that a match on a root
+   * element still yields the document it belongs to.
+   *
+   * @param pdfBuffer PDF buffer
+   * @returns XML content or null if not found; errors are logged and reported as null
+   */
+  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
+    try {
+      // Decode only the first maxScanBytes bytes. A view onto the caller's
+      // memory is used so the whole PDF is not copied just to read its prefix.
+      const scanLength = Math.min(pdfBuffer.byteLength, TextXMLExtractor.maxScanBytes);
+      const pdfString = Buffer.from(pdfBuffer.buffer, pdfBuffer.byteOffset, scanLength).toString('utf8');
+
+      // Look for common XML patterns in the PDF
+      for (const pattern of TextXMLExtractor.xmlPatterns) {
+        const match = pdfString.match(pattern);
+        if (!match || match.index === undefined) {
+          continue;
+        }
+        console.log(`Found XML pattern in PDF: ${match[0]}`);
+
+        // A root element is preceded by its XML declaration, while
+        // extractXmlFromString only searches forwards. Step back to the nearest
+        // declaration; if there is none, keep the match position as before.
+        const declarationIndex = pdfString.lastIndexOf('<?xml', match.index);
+        const searchStart = declarationIndex === -1 ? match.index : declarationIndex;
+
+        // Try to extract the XML content
+        const xmlContent = this.extractXmlFromString(pdfString, searchStart);
+        if (xmlContent && this.isValidXml(xmlContent)) {
+          console.log('Successfully extracted XML from PDF text');
+          return xmlContent;
+        }
+      }
+
+      console.warn('No valid XML found in PDF text');
+      return null;
+    } catch (error) {
+      console.error('Error in text-based extraction:', error);
+      return null;
+    }
+  }
+}