feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic

2025-04-03 20:08:02 +00:00
parent b4a95de482
commit 46331c2bf6
28 changed files with 1191 additions and 294 deletions
@@ -0,0 +1,55 @@
+import { BaseXMLExtractor } from './base.extractor.js';
+
+/**
+ * Text-based XML extractor for PDF documents.
+ * Decodes the leading bytes of the PDF as text and searches it for XML patterns.
+ * Used as a fallback when other extraction methods fail.
+ */
+export class TextXMLExtractor extends BaseXMLExtractor {
+  /**
+   * Extract XML from a PDF buffer by searching for XML patterns in the text.
+   * Only the first 50,000 bytes are searched; errors are logged and yield null, never thrown.
+   * @param pdfBuffer PDF buffer
+   * @returns XML content or null if not found
+   */
+  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
+    try {
+      // Decode only the searched prefix via a zero-copy view; Buffer.from(pdfBuffer) would copy the whole PDF
+      const searchLength = Math.min(pdfBuffer.length, 50000);
+      const pdfString = Buffer.from(pdfBuffer.buffer, pdfBuffer.byteOffset, searchLength).toString('utf8');
+
+      // Look for common XML patterns in the PDF
+      const xmlPatterns = [
+        /<\?xml[^>]*\?>/i,
+        /<CrossIndustryInvoice[^>]*>/i,
+        /<CrossIndustryDocument[^>]*>/i,
+        /<Invoice[^>]*>/i,
+        /<CreditNote[^>]*>/i,
+        /<rsm:CrossIndustryInvoice[^>]*>/i,
+        /<rsm:CrossIndustryDocument[^>]*>/i,
+        /<ram:CrossIndustryDocument[^>]*>/i,
+        /<ubl:Invoice[^>]*>/i,
+        /<ubl:CreditNote[^>]*>/i
+      ];
+
+      for (const pattern of xmlPatterns) {
+        const match = pdfString.match(pattern);
+        if (match && match.index !== undefined) {
+          console.log(`Found XML pattern in PDF: ${match[0]}`);
+          // Extract starting at the match; if the result is not valid XML, fall through to the next pattern
+          const xmlContent = this.extractXmlFromString(pdfString, match.index);
+          if (xmlContent && this.isValidXml(xmlContent)) {
+            console.log('Successfully extracted XML from PDF text');
+            return xmlContent;
+          }
+        }
+      }
+
+      console.warn('No valid XML found in PDF text');
+      return null;
+    } catch (error) {
+      console.error('Error in text-based extraction:', error);
+      return null;
+    }
+  }
+}