update

2025-04-03 17:21:36 +00:00
parent 73617e46e4
commit b4a95de482
45 changed files with 4112 additions and 293 deletions
@@ -11,6 +11,34 @@ export class PDFExtractor {
   * @returns XML content or null if not found
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
+    try {
+      // First try the standard extraction
+      const standardXml = await this.standardExtraction(pdfBuffer);
+      if (standardXml && this.isValidXml(standardXml)) {
+        return standardXml;
+      }
+
+      // If standard extraction fails, try alternative methods
+      const alternativeXml = await this.alternativeExtraction(pdfBuffer);
+      if (alternativeXml && this.isValidXml(alternativeXml)) {
+        return alternativeXml;
+      }
+
+      // If all extraction methods fail, return null
+      console.warn('All extraction methods failed, no valid XML found in PDF');
+      return null;
+    } catch (error) {
+      console.error('Error extracting XML from PDF:', error);
+      return null;
+    }
+  }
+
+  /**
+   * Standard extraction method using PDF-lib
+   * @param pdfBuffer PDF buffer
+   * @returns XML content or null if not found
+   */
+  private async standardExtraction(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBuffer);

@@ -50,13 +78,13 @@ export class PDFExtractor {

        // Get the filename as string
        const fileName = fileNameObj.toString();
-        
+
        // Check if it's an XML file (checking both extension and known standard filenames)
-        if (fileName.toLowerCase().includes('.xml') || 
+        if (fileName.toLowerCase().includes('.xml') ||
            fileName.toLowerCase().includes('factur-x') ||
            fileName.toLowerCase().includes('zugferd') ||
            fileName.toLowerCase().includes('xrechnung')) {
-            
+
          const efDictObj = fileSpecObj.lookup(PDFName.of('EF'));
          if (!(efDictObj instanceof PDFDict)) {
            continue;
@@ -80,28 +108,180 @@ export class PDFExtractor {

      // Decompress and decode the XML content
      try {
+        // Try to decompress with pako
        const xmlCompressedBytes = xmlFile.getContents().buffer;
        const xmlBytes = pako.inflate(xmlCompressedBytes);
        const xmlContent = new TextDecoder('utf-8').decode(xmlBytes);

-        console.log(`Successfully extracted XML from PDF file. File name: ${xmlFileName}`);
-        return xmlContent;
+        // Check if the XML content is valid
+        if (this.isValidXml(xmlContent)) {
+          console.log(`Successfully extracted XML from PDF file. File name: ${xmlFileName}`);
+          return xmlContent;
+        }
+
+        // If we get here, the XML content is not valid, try without decompression
+        console.log('Decompression succeeded but XML is not valid, trying without decompression...');
+        const rawXmlBytes = xmlFile.getContents();
+        const rawXmlContent = new TextDecoder('utf-8').decode(rawXmlBytes);
+
+        if (this.isValidXml(rawXmlContent)) {
+          console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${xmlFileName}`);
+          return rawXmlContent;
+        }
+
+        // If we get here, neither the decompressed nor the raw XML content is valid
+        console.log('Neither decompressed nor raw XML content is valid');
+        return null;
      } catch (decompressError) {
-        // Try without decompression
+        // Decompression failed, try without decompression
        console.log('Decompression failed, trying without decompression...');
        try {
          const xmlBytes = xmlFile.getContents();
          const xmlContent = new TextDecoder('utf-8').decode(xmlBytes);
-          console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${xmlFileName}`);
-          return xmlContent;
+
+          if (this.isValidXml(xmlContent)) {
+            console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${xmlFileName}`);
+            return xmlContent;
+          }
+
+          // If we get here, the XML content is not valid
+          console.log('Uncompressed XML content is not valid');
+          return null;
        } catch (decodeError) {
          console.error('Error decoding XML content:', decodeError);
          return null;
        }
      }
    } catch (error) {
-      console.error('Error extracting or parsing embedded XML from PDF:', error);
+      console.error('Error in standard extraction:', error);
      return null;
    }
  }
+
+  /**
+   * Alternative extraction method using string search.
+   *
+   * Decodes the raw PDF bytes as text and looks for an XML invoice document
+   * stored in plain (uncompressed) form. The whole buffer is scanned: a
+   * fixed-size prefix window would miss any document that starts or ends
+   * beyond that window.
+   *
+   * @param pdfBuffer PDF buffer
+   * @returns XML content or null if not found
+   */
+  private async alternativeExtraction(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
+    try {
+      // Decode the complete buffer; byte sequences that are not valid UTF-8 are
+      // replaced during decoding and do not affect the ASCII markers searched below
+      const pdfString = Buffer.from(pdfBuffer).toString('utf8');
+
+      // Look for common XML patterns in the PDF
+      const xmlPatterns = [
+        /<\?xml[^>]*\?>/i,
+        /<CrossIndustryInvoice[^>]*>/i,
+        /<Invoice[^>]*>/i,
+        /<CreditNote[^>]*>/i,
+        /<rsm:CrossIndustryInvoice[^>]*>/i
+      ];
+
+      // The first pattern that is present is enough to justify an extraction attempt
+      let foundMarker: string | null = null;
+      for (const pattern of xmlPatterns) {
+        const match = pdfString.match(pattern);
+        if (match) {
+          foundMarker = match[0];
+          break;
+        }
+      }
+
+      if (foundMarker === null) {
+        return null;
+      }
+      console.log(`Found XML pattern in PDF: ${foundMarker}`);
+
+      // The extraction does not depend on which pattern matched, so it runs
+      // exactly once instead of once per matching pattern
+      const xmlContent = this.extractXmlFromString(pdfString);
+      if (xmlContent) {
+        console.log('Successfully extracted XML from PDF string');
+        return xmlContent;
+      }
+
+      return null;
+    } catch (error) {
+      console.error('Error in alternative extraction:', error);
+      return null;
+    }
+  }
+
+  /**
+   * Extracts an XML invoice document from a larger string.
+   *
+   * The document is taken to start at the first `<?xml` declaration and to end
+   * at the first occurrence, at or after that declaration, of a known closing
+   * root tag. Closing tags are tried in the order listed below and the first
+   * one found wins.
+   *
+   * @param pdfString PDF string
+   * @returns XML content, or null if there is no declaration or no known
+   *          closing root tag after it
+   */
+  private extractXmlFromString(pdfString: string): string | null {
+    try {
+      // Locate the XML declaration that marks the start of the document
+      const xmlStartIndex = pdfString.indexOf('<?xml');
+      if (xmlStartIndex === -1) {
+        return null;
+      }
+
+      // Closing root tags of the supported invoice formats
+      const possibleEndTags = [
+        '</CrossIndustryInvoice>',
+        '</Invoice>',
+        '</CreditNote>',
+        '</rsm:CrossIndustryInvoice>'
+      ];
+
+      for (const endTag of possibleEndTags) {
+        // Search from the declaration onwards: a closing tag located before it
+        // belongs to other content, and substring() would silently swap the
+        // reversed bounds and return unrelated text
+        const endIndex = pdfString.indexOf(endTag, xmlStartIndex);
+        if (endIndex !== -1) {
+          return pdfString.substring(xmlStartIndex, endIndex + endTag.length);
+        }
+      }
+
+      // Declaration found but no known closing root tag follows it
+      return null;
+    } catch (error) {
+      console.error('Error extracting XML from string:', error);
+      return null;
+    }
+  }
+
+  /**
+   * Heuristic plausibility check for extracted invoice XML; the string is not parsed.
+   * @param xmlString XML string to check
+   * @returns True only if the string contains an XML declaration, mentions a
+   *          known invoice format, contains none of the characters
+   *          U+0000..U+0005 and is not shorter than 100 characters
+   */
+  private isValidXml(xmlString: string): boolean {
+    // Names that identify the supported invoice formats
+    const invoiceMarkers = [
+      'CrossIndustryInvoice',
+      'Invoice',
+      'CreditNote',
+      'ubl:Invoice',
+      'ubl:CreditNote'
+    ];
+    // Control characters whose presence indicates binary data
+    const binaryMarkers = ['\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005'];
+
+    try {
+      // Checks run in order and stop at the first one that fails
+      return (
+        xmlString.includes('<?xml') &&
+        invoiceMarkers.some(marker => xmlString.includes(marker)) &&
+        !binaryMarkers.some(marker => xmlString.includes(marker)) &&
+        !(xmlString.length < 100)
+      );
+    } catch (error) {
+      console.error('Error validating XML:', error);
+      return false;
+    }
+  }
 }