feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic

2025-04-03 20:08:02 +00:00
parent b4a95de482
commit 46331c2bf6
28 changed files with 1191 additions and 294 deletions
@@ -0,0 +1,86 @@
+import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString, PDFHexString } from 'pdf-lib';
+import { BaseXMLExtractor } from './base.extractor.js';
+
+/**
+ * Standard PDF XML extractor that extracts XML from embedded files.
+ * Works with PDF/A-3 documents that follow the standard for embedding files:
+ * it walks the catalog's /Names -> /EmbeddedFiles name tree and hands the first
+ * matching attachment stream to `extractXmlFromStream`.
+ */
+export class StandardXMLExtractor extends BaseXMLExtractor {
+  /**
+   * Extract XML from a PDF buffer using standard PDF/A-3 embedded files.
+   *
+   * Attachments are visited in name-tree order. An attachment is tried when its
+   * name equals one of `knownFileNames` (case-insensitive), ends in `.xml`, or
+   * contains an invoice-related keyword. The first attachment for which
+   * `extractXmlFromStream` returns a truthy value wins.
+   *
+   * @param pdfBuffer PDF buffer
+   * @returns XML content or null if not found; failures are logged, not thrown
+   */
+  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
+    try {
+      const pdfDoc = await PDFDocument.load(pdfBuffer);
+
+      // The catalog's /Names dictionary holds the document's name trees
+      const namesDictObj = pdfDoc.catalog.lookup(PDFName.of('Names'));
+      if (!(namesDictObj instanceof PDFDict)) {
+        console.warn('No Names dictionary found in PDF! This PDF does not contain embedded files.');
+        return null;
+      }
+
+      // Root node of the embedded-files name tree
+      const embeddedFilesDictObj = namesDictObj.lookup(PDFName.of('EmbeddedFiles'));
+      if (!(embeddedFilesDictObj instanceof PDFDict)) {
+        console.warn('No EmbeddedFiles dictionary found! This PDF does not contain embedded files.');
+        return null;
+      }
+
+      // Flatten the name tree: entries may sit directly in /Names or in /Kids nodes
+      const fileEntries = this.collectNameTreeEntries(embeddedFilesDictObj);
+      if (fileEntries.length === 0) {
+        console.warn('No files specified in EmbeddedFiles dictionary!');
+        return null;
+      }
+
+      // Substrings that mark an attachment as a likely invoice even without ".xml"
+      const invoiceKeywords = ['zugferd', 'factur-x', 'xrechnung', 'invoice'];
+
+      // Try to find an XML file in the embedded files
+      for (const [fileNameObj, fileSpecObj] of fileEntries) {
+        // Get the filename as string (lower-cased once for all comparisons)
+        const fileName = fileNameObj.decodeText();
+        const lowerName = fileName.toLowerCase();
+
+        // Check if it's a known invoice XML file name
+        const isKnownFileName = this.knownFileNames.some(
+          knownName => lowerName === knownName.toLowerCase()
+        );
+
+        // Check if it's any XML file or has invoice-related keywords
+        const looksLikeInvoiceXml =
+          lowerName.endsWith('.xml') ||
+          invoiceKeywords.some(keyword => lowerName.includes(keyword));
+
+        if (!isKnownFileName && !looksLikeInvoiceXml) {
+          continue;
+        }
+
+        const efDictObj = fileSpecObj.lookup(PDFName.of('EF'));
+        if (!(efDictObj instanceof PDFDict)) {
+          continue;
+        }
+
+        // /F is the usual key; some writers only provide the Unicode variant /UF
+        const fileStream =
+          efDictObj.lookup(PDFName.of('F')) ?? efDictObj.lookup(PDFName.of('UF'));
+        if (fileStream instanceof PDFRawStream) {
+          const xmlContent = await this.extractXmlFromStream(fileStream, fileName);
+          if (xmlContent) {
+            return xmlContent;
+          }
+        }
+      }
+
+      console.warn('No valid XML found in embedded files');
+      return null;
+    } catch (error) {
+      console.error('Error in standard extraction:', error);
+      return null;
+    }
+  }
+
+  /**
+   * Collect the (name, file specification) pairs of a PDF name tree in order.
+   *
+   * A name-tree node stores pairs in a /Names array and/or points to child
+   * nodes through /Kids. Names may be literal or hex strings. Pairs whose
+   * name is not a string or whose value is not a dictionary are skipped.
+   *
+   * @param root Root node of the name tree
+   * @returns Flattened list of [name, file spec dictionary] pairs
+   */
+  private collectNameTreeEntries(
+    root: PDFDict
+  ): Array<[PDFString | PDFHexString, PDFDict]> {
+    const entries: Array<[PDFString | PDFHexString, PDFDict]> = [];
+    // Guards against malformed PDFs whose /Kids reference an ancestor node
+    const visited = new Set<PDFDict>();
+
+    const visit = (node: PDFDict): void => {
+      if (visited.has(node)) {
+        return;
+      }
+      visited.add(node);
+
+      const names = node.lookup(PDFName.of('Names'));
+      if (names instanceof PDFArray) {
+        // The array alternates key, value, key, value, ...
+        for (let i = 0; i + 1 < names.size(); i += 2) {
+          const key = names.lookup(i);
+          const value = names.lookup(i + 1);
+          const isStringKey = key instanceof PDFString || key instanceof PDFHexString;
+          if (isStringKey && value instanceof PDFDict) {
+            entries.push([key, value]);
+          }
+        }
+      }
+
+      const kids = node.lookup(PDFName.of('Kids'));
+      if (kids instanceof PDFArray) {
+        for (let i = 0; i < kids.size(); i++) {
+          const kid = kids.lookup(i);
+          if (kid instanceof PDFDict) {
+            visit(kid);
+          }
+        }
+      }
+    };
+
+    visit(root);
+    return entries;
+  }
+}