feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic

2025-04-03 20:08:02 +00:00
parent b4a95de482
commit 46331c2bf6
28 changed files with 1191 additions and 294 deletions
--- a/ts/formats/pdf/extractors/base.extractor.ts
+++ b/ts/formats/pdf/extractors/base.extractor.ts
@@ -0,0 +1,177 @@
+import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
+import * as pako from 'pako';
+
+/**
+ * Base class for PDF XML extractors with common functionality
+ */
+export abstract class BaseXMLExtractor {
+  /**
+   * Known XML file names for different invoice formats
+   */
+  protected readonly knownFileNames = [
+    'factur-x.xml',
+    'zugferd-invoice.xml',
+    'ZUGFeRD-invoice.xml',
+    'xrechnung.xml'
+  ];
+
+  /**
+   * Known format markers; isValidXml requires at least one of them to occur
+   */
+  protected readonly knownFormats = [
+    'CrossIndustryInvoice',
+    'CrossIndustryDocument',
+    'Invoice',
+    'CreditNote',
+    'ubl:Invoice',
+    'ubl:CreditNote',
+    'rsm:CrossIndustryInvoice',
+    'rsm:CrossIndustryDocument',
+    'ram:CrossIndustryDocument',
+    'urn:un:unece:uncefact',
+    'urn:ferd:CrossIndustryDocument',
+    'urn:zugferd',
+    'urn:factur-x',
+    'factur-x.eu',
+    'ZUGFeRD'
+  ];
+
+  /**
+   * Known closing tags; extractXmlFromString uses them to find the document end
+   */
+  protected readonly knownEndTags = [
+    '</CrossIndustryInvoice>',
+    '</CrossIndustryDocument>',
+    '</Invoice>',
+    '</CreditNote>',
+    '</rsm:CrossIndustryInvoice>',
+    '</rsm:CrossIndustryDocument>',
+    '</ram:CrossIndustryDocument>',
+    '</ubl:Invoice>',
+    '</ubl:CreditNote>'
+  ];
+
+  /**
+   * Extract XML from a PDF buffer
+   * @param pdfBuffer PDF buffer
+   * @returns XML content or null if not found
+   */
+  public abstract extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null>;
+
+  /**
+   * Heuristically check whether a string looks like embedded invoice XML.
+   * This is a plausibility check only; the text is never parsed.
+   * @param xmlString XML string to check
+   * @returns True for a string of at least 100 characters that contains `<?xml`
+   *   and a known format marker and has no U+0000..U+0005 characters
+   */
+  protected isValidXml(xmlString: string): boolean {
+    try {
+      // Reject non-strings (callers may bypass the type system) and anything
+      // shorter than 100 characters, which also covers the empty string
+      if (typeof xmlString !== 'string' || xmlString.length < 100) {
+        return false;
+      }
+
+      // An XML declaration must occur somewhere; it need not be at index 0
+      if (!xmlString.includes('<?xml')) {
+        return false;
+      }
+
+      // At least one known invoice format marker must occur
+      const hasKnownFormat = this.knownFormats.some(format => xmlString.includes(format));
+      if (!hasKnownFormat) {
+        return false;
+      }
+
+      // Control characters U+0000..U+0005 are taken as a sign of binary data
+      const invalidChars = ['\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005'];
+      const hasBinaryData = invalidChars.some(char => xmlString.includes(char));
+      return !hasBinaryData;
+    } catch (error) {
+      console.error('Error validating XML:', error);
+      return false;
+    }
+  }
+
+  /**
+   * Extract the first XML document found in a larger text.
+   * The document starts at the first `<?xml` at or after startIndex and ends
+   * with whichever known closing tag occurs earliest after that point.
+   * @param text Text to extract XML from
+   * @param startIndex Index to start extraction from
+   * @returns XML content or null if not found
+   */
+  protected extractXmlFromString(text: string, startIndex: number = 0): string | null {
+    try {
+      // Find the start of the XML document
+      const xmlStartIndex = text.indexOf('<?xml', startIndex);
+      if (xmlStartIndex === -1) {
+        return null;
+      }
+
+      // Use the closing tag that occurs earliest in the text, not the first match
+      // in list order, so a text holding several documents yields only the first
+      let earliestTagIndex = -1;
+      let xmlEndIndex = -1;
+      for (const endTag of this.knownEndTags) {
+        const tagIndex = text.indexOf(endTag, xmlStartIndex);
+        if (tagIndex !== -1 && (earliestTagIndex === -1 || tagIndex < earliestTagIndex)) {
+          earliestTagIndex = tagIndex;
+          xmlEndIndex = tagIndex + endTag.length;
+        }
+      }
+
+      if (xmlEndIndex === -1) {
+        return null;
+      }
+
+      // Extract the XML content
+      return text.substring(xmlStartIndex, xmlEndIndex);
+    } catch (error) {
+      console.error('Error extracting XML from string:', error);
+      return null;
+    }
+  }
+
+  /**
+   * Decompress and decode XML content from a PDF stream.
+   * The stream bytes are first inflated with pako; if inflating throws or the
+   * inflated text is not valid, the raw bytes are decoded and checked instead.
+   * @param stream PDF stream containing XML data
+   * @param fileName Name of the file (for logging)
+   * @returns XML content or null if not valid
+   */
+  protected async extractXmlFromStream(stream: PDFRawStream, fileName: string): Promise<string | null> {
+    try {
+      // Use the bytes returned by getContents() directly, not their `.buffer`:
+      // a typed array's backing ArrayBuffer can be larger than the view, which
+      // would feed unrelated bytes to the inflater
+      const streamBytes = stream.getContents();
+      const decoder = new TextDecoder('utf-8');
+
+      try {
+        const xmlContent = decoder.decode(pako.inflate(streamBytes));
+        if (this.isValidXml(xmlContent)) {
+          console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`);
+          return xmlContent;
+        }
+      } catch (decompressError) {
+        // Decompression failed, try without decompression
+        console.log(`Decompression failed for ${fileName}, trying without decompression...`);
+      }
+
+      // Try without decompression
+      const rawContent = decoder.decode(streamBytes);
+      if (this.isValidXml(rawContent)) {
+        console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
+        return rawContent;
+      }
+
+      return null;
+    } catch (error) {
+      console.error('Error extracting XML from stream:', error);
+      return null;
+    }
+  }
+}