feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic

2025-04-03 20:08:02 +00:00
parent b4a95de482
commit 46331c2bf6
28 changed files with 1191 additions and 294 deletions
@@ -1,30 +1,54 @@
-import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
-import * as pako from 'pako';
+import {
+  BaseXMLExtractor,
+  StandardXMLExtractor,
+  AssociatedFilesExtractor,
+  TextXMLExtractor
+} from './extractors/index.js';

 /**
- * Class for extracting XML from PDF files
+ * Main PDF extractor class that orchestrates the extraction process
+ * Uses multiple specialized extractors in sequence to maximize success rate
 */
 export class PDFExtractor {
+  private extractors: BaseXMLExtractor[] = [];
+
  /**
-   * Extracts XML from a PDF buffer
+   * Constructor initializes the chain of extractors
+   */
+  constructor() {
+    // Registered in order of preference: extractXml() tries them front to back and returns the first non-empty result
+    this.extractors.push(
+      new StandardXMLExtractor(),    // Standard PDF/A-3 embedded files
+      new AssociatedFilesExtractor(), // Associated files (ZUGFeRD v1, some Factur-X)
+      new TextXMLExtractor()          // Text-based extraction (fallback)
+    );
+  }
+
+  /**
+   * Extract XML from a PDF buffer
+   * Tries multiple extraction methods in sequence
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
-      // First try the standard extraction
-      const standardXml = await this.standardExtraction(pdfBuffer);
-      if (standardXml && this.isValidXml(standardXml)) {
-        return standardXml;
+      console.log('Starting XML extraction from PDF...');
+
+      // Try each extractor in sequence
+      for (const extractor of this.extractors) {
+        const extractorName = extractor.constructor.name;
+        console.log(`Trying extraction with ${extractorName}...`);
+
+        const xml = await extractor.extractXml(pdfBuffer);
+        if (xml) {
+          console.log(`Successfully extracted XML using ${extractorName}`);
+          return xml;
+        }
+
+        console.log(`Extraction with ${extractorName} failed, trying next method...`);
      }

-      // If standard extraction fails, try alternative methods
-      const alternativeXml = await this.alternativeExtraction(pdfBuffer);
-      if (alternativeXml && this.isValidXml(alternativeXml)) {
-        return alternativeXml;
-      }
-
-      // If all extraction methods fail, return null
+      // If all extractors fail, return null
      console.warn('All extraction methods failed, no valid XML found in PDF');
      return null;
    } catch (error) {
@@ -33,255 +57,7 @@ export class PDFExtractor {
    }
  }

-  /**
-   * Standard extraction method using PDF-lib
-   * @param pdfBuffer PDF buffer
-   * @returns XML content or null if not found
-   */
-  private async standardExtraction(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
-    try {
-      const pdfDoc = await PDFDocument.load(pdfBuffer);

-      // Get the document's metadata dictionary
-      const namesDictObj = pdfDoc.catalog.lookup(PDFName.of('Names'));
-      if (!(namesDictObj instanceof PDFDict)) {
-        console.warn('No Names dictionary found in PDF! This PDF does not contain embedded files.');
-        return null;
-      }

-      const embeddedFilesDictObj = namesDictObj.lookup(PDFName.of('EmbeddedFiles'));
-      if (!(embeddedFilesDictObj instanceof PDFDict)) {
-        console.warn('No EmbeddedFiles dictionary found! This PDF does not contain embedded files.');
-        return null;
-      }

-      const filesSpecObj = embeddedFilesDictObj.lookup(PDFName.of('Names'));
-      if (!(filesSpecObj instanceof PDFArray)) {
-        console.warn('No files specified in EmbeddedFiles dictionary!');
-        return null;
-      }
-
-      // Try to find an XML file in the embedded files
-      let xmlFile: PDFRawStream | undefined;
-      let xmlFileName: string | undefined;
-
-      for (let i = 0; i < filesSpecObj.size(); i += 2) {
-        const fileNameObj = filesSpecObj.lookup(i);
-        const fileSpecObj = filesSpecObj.lookup(i + 1);
-
-        if (!(fileNameObj instanceof PDFString)) {
-          continue;
-        }
-        if (!(fileSpecObj instanceof PDFDict)) {
-          continue;
-        }
-
-        // Get the filename as string
-        const fileName = fileNameObj.toString();
-
-        // Check if it's an XML file (checking both extension and known standard filenames)
-        if (fileName.toLowerCase().includes('.xml') ||
-            fileName.toLowerCase().includes('factur-x') ||
-            fileName.toLowerCase().includes('zugferd') ||
-            fileName.toLowerCase().includes('xrechnung')) {
-
-          const efDictObj = fileSpecObj.lookup(PDFName.of('EF'));
-          if (!(efDictObj instanceof PDFDict)) {
-            continue;
-          }
-
-          const maybeStream = efDictObj.lookup(PDFName.of('F'));
-          if (maybeStream instanceof PDFRawStream) {
-            // Found an XML file - save it
-            xmlFile = maybeStream;
-            xmlFileName = fileName;
-            break;
-          }
-        }
-      }
-
-      // If no XML file was found, return null
-      if (!xmlFile) {
-        console.warn('No embedded XML file found in the PDF!');
-        return null;
-      }
-
-      // Decompress and decode the XML content
-      try {
-        // Try to decompress with pako
-        const xmlCompressedBytes = xmlFile.getContents().buffer;
-        const xmlBytes = pako.inflate(xmlCompressedBytes);
-        const xmlContent = new TextDecoder('utf-8').decode(xmlBytes);
-
-        // Check if the XML content is valid
-        if (this.isValidXml(xmlContent)) {
-          console.log(`Successfully extracted XML from PDF file. File name: ${xmlFileName}`);
-          return xmlContent;
-        }
-
-        // If we get here, the XML content is not valid, try without decompression
-        console.log('Decompression succeeded but XML is not valid, trying without decompression...');
-        const rawXmlBytes = xmlFile.getContents();
-        const rawXmlContent = new TextDecoder('utf-8').decode(rawXmlBytes);
-
-        if (this.isValidXml(rawXmlContent)) {
-          console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${xmlFileName}`);
-          return rawXmlContent;
-        }
-
-        // If we get here, neither the decompressed nor the raw XML content is valid
-        console.log('Neither decompressed nor raw XML content is valid');
-        return null;
-      } catch (decompressError) {
-        // Decompression failed, try without decompression
-        console.log('Decompression failed, trying without decompression...');
-        try {
-          const xmlBytes = xmlFile.getContents();
-          const xmlContent = new TextDecoder('utf-8').decode(xmlBytes);
-
-          if (this.isValidXml(xmlContent)) {
-            console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${xmlFileName}`);
-            return xmlContent;
-          }
-
-          // If we get here, the XML content is not valid
-          console.log('Uncompressed XML content is not valid');
-          return null;
-        } catch (decodeError) {
-          console.error('Error decoding XML content:', decodeError);
-          return null;
-        }
-      }
-    } catch (error) {
-      console.error('Error in standard extraction:', error);
-      return null;
-    }
-  }
-
-  /**
-   * Alternative extraction method using string search
-   * @param pdfBuffer PDF buffer
-   * @returns XML content or null if not found
-   */
-  private async alternativeExtraction(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
-    try {
-      // Convert buffer to string and look for XML patterns
-      const pdfString = Buffer.from(pdfBuffer).toString('utf8', 0, Math.min(pdfBuffer.length, 10000));
-
-      // Look for common XML patterns in the PDF
-      const xmlPatterns = [
-        /<\?xml[^>]*\?>/i,
-        /<CrossIndustryInvoice[^>]*>/i,
-        /<Invoice[^>]*>/i,
-        /<CreditNote[^>]*>/i,
-        /<rsm:CrossIndustryInvoice[^>]*>/i
-      ];
-
-      for (const pattern of xmlPatterns) {
-        const match = pdfString.match(pattern);
-        if (match) {
-          console.log(`Found XML pattern in PDF: ${match[0]}`);
-
-          // Try to extract the XML content
-          const xmlContent = this.extractXmlFromString(pdfString);
-          if (xmlContent) {
-            console.log('Successfully extracted XML from PDF string');
-            return xmlContent;
-          }
-        }
-      }
-
-      return null;
-    } catch (error) {
-      console.error('Error in alternative extraction:', error);
-      return null;
-    }
-  }
-
-  /**
-   * Extracts XML from a string
-   * @param pdfString PDF string
-   * @returns XML content or null if not found
-   */
-  private extractXmlFromString(pdfString: string): string | null {
-    try {
-      // Look for XML start and end tags
-      const xmlStartIndex = pdfString.indexOf('<?xml');
-      if (xmlStartIndex === -1) {
-        return null;
-      }
-
-      // Try to find the end of the XML document
-      const possibleEndTags = [
-        '</CrossIndustryInvoice>',
-        '</Invoice>',
-        '</CreditNote>',
-        '</rsm:CrossIndustryInvoice>'
-      ];
-
-      let xmlEndIndex = -1;
-      for (const endTag of possibleEndTags) {
-        const endIndex = pdfString.indexOf(endTag);
-        if (endIndex !== -1) {
-          xmlEndIndex = endIndex + endTag.length;
-          break;
-        }
-      }
-
-      if (xmlEndIndex === -1) {
-        return null;
-      }
-
-      // Extract the XML content
-      return pdfString.substring(xmlStartIndex, xmlEndIndex);
-    } catch (error) {
-      console.error('Error extracting XML from string:', error);
-      return null;
-    }
-  }
-
-  /**
-   * Checks if an XML string is valid
-   * @param xmlString XML string to check
-   * @returns True if the XML is valid
-   */
-  private isValidXml(xmlString: string): boolean {
-    try {
-      // Check if the XML string contains basic XML structure
-      if (!xmlString.includes('<?xml')) {
-        return false;
-      }
-
-      // Check if the XML string contains known invoice formats
-      const knownFormats = [
-        'CrossIndustryInvoice',
-        'Invoice',
-        'CreditNote',
-        'ubl:Invoice',
-        'ubl:CreditNote'
-      ];
-
-      const hasKnownFormat = knownFormats.some(format => xmlString.includes(format));
-      if (!hasKnownFormat) {
-        return false;
-      }
-
-      // Check if the XML string contains binary data or invalid characters
-      const invalidChars = ['\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005'];
-      const hasBinaryData = invalidChars.some(char => xmlString.includes(char));
-      if (hasBinaryData) {
-        return false;
-      }
-
-      // Check if the XML string is too short
-      if (xmlString.length < 100) {
-        return false;
-      }
-
-      return true;
-    } catch (error) {
-      console.error('Error validating XML:', error);
-      return false;
-    }
-  }
 }