fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata.

2025-04-04 12:14:41 +00:00
parent 68fd50fd4c
commit 5d43c1ce4e
15 changed files with 1957 additions and 418 deletions
--- a/ts/formats/pdf/extractors/base.extractor.ts
+++ b/ts/formats/pdf/extractors/base.extractor.ts
@@ -11,7 +11,10 @@ export abstract class BaseXMLExtractor {
    'factur-x.xml',
    'zugferd-invoice.xml',
    'ZUGFeRD-invoice.xml',
-    'xrechnung.xml'
+    'xrechnung.xml',
+    'ubl-invoice.xml',
+    'invoice.xml',
+    'metadata.xml'
  ];

  /**
@@ -32,7 +35,8 @@ export abstract class BaseXMLExtractor {
    'urn:zugferd',
    'urn:factur-x',
    'factur-x.eu',
-    'ZUGFeRD'
+    'ZUGFeRD',
+    'FatturaElettronica'
  ];

  /**
@@ -47,7 +51,8 @@ export abstract class BaseXMLExtractor {
    '</rsm:CrossIndustryDocument>',
    '</ram:CrossIndustryDocument>',
    '</ubl:Invoice>',
-    '</ubl:CreditNote>'
+    '</ubl:CreditNote>',
+    '</FatturaElettronica>'
  ];

  /**
@@ -69,21 +74,19 @@ export abstract class BaseXMLExtractor {
        return false;
      }

-      // Check if it starts with XML declaration
-      if (!xmlString.includes('<?xml')) {
+      // Check if it starts with XML declaration or a valid element
+      if (!xmlString.includes('<?xml') && !this.hasKnownXmlElement(xmlString)) {
        return false;
      }

      // Check if the XML string contains known invoice formats
-      const hasKnownFormat = this.knownFormats.some(format => xmlString.includes(format));
+      const hasKnownFormat = this.hasKnownFormat(xmlString);
      if (!hasKnownFormat) {
        return false;
      }

      // Check if the XML string contains binary data or invalid characters
-      const invalidChars = ['\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005'];
-      const hasBinaryData = invalidChars.some(char => xmlString.includes(char));
-      if (hasBinaryData) {
+      if (this.hasBinaryData(xmlString)) {
        return false;
      }

@@ -92,6 +95,11 @@ export abstract class BaseXMLExtractor {
        return false;
      }

+      // Check if XML has a proper structure (contains both opening and closing tags)
+      if (!this.hasProperXmlStructure(xmlString)) {
+        return false;
+      }
+
      return true;
    } catch (error) {
      console.error('Error validating XML:', error);
@@ -99,6 +107,85 @@ export abstract class BaseXMLExtractor {
    }
  }

+  /**
+   * Tests whether any known format identifier occurs directly after a `<`,
+   * i.e. as the start of an opening tag, anywhere in the string.
+   * @param xmlString XML string to check
+   * @returns True if at least one `<identifier` sequence is present
+   */
+  protected hasKnownXmlElement(xmlString: string): boolean {
+    // A plain substring search is used; prefixed tags such as `<rsm:Name`
+    // match only if the identifier itself carries the prefix.
+    const opensWith = (identifier: string): boolean =>
+      xmlString.includes(`<${identifier}`);
+
+    return this.knownFormats.some(opensWith);
+  }
+
+  /**
+   * Tests whether the string mentions any known invoice format identifier.
+   * The match is a case-sensitive substring search, so the identifier may
+   * appear in a tag name, a namespace URI, or plain text content.
+   * @param xmlString XML string to check
+   * @returns True if at least one known format identifier is present
+   */
+  protected hasKnownFormat(xmlString: string): boolean {
+    const mentions = (identifier: string): boolean =>
+      xmlString.includes(identifier);
+
+    return this.knownFormats.some(mentions);
+  }
+
+  /**
+   * Heuristic structure check: accepts a known root element pair, an XML
+   * declaration, or any opening-looking tag together with any closing tag.
+   * @param xmlString XML string to check
+   * @returns True if one of the structural heuristics matches
+   */
+  protected hasProperXmlStructure(xmlString: string): boolean {
+    // The start tag is the end tag minus its first '/', so it keeps the '>'
+    // and only matches an attribute-less opening tag such as `<ubl:Invoice>`.
+    const hasKnownPair = this.knownEndTags.some(closing =>
+      xmlString.includes(closing.replace('/', '')) && xmlString.includes(closing)
+    );
+    if (hasKnownPair) {
+      return true;
+    }
+
+    // Generic fallback: a complete declaration, or any tag plus any closing tag
+    const hasDeclaration = xmlString.includes('<?xml') && xmlString.includes('?>');
+    return hasDeclaration || (/<[^>]+>/.test(xmlString) && /<\/[^>]+>/.test(xmlString));
+  }
+
+  /**
+   * Check if the XML string contains binary data
+   *
+   * Two heuristics are applied:
+   * - any control character in the range U+0000 to U+0005 is present, or
+   * - more than 5% of all characters are control characters other than
+   *   tab, line feed and carriage return.
+   *
+   * @param xmlString XML string to check
+   * @returns True if the XML contains binary data
+   */
+  protected hasBinaryData(xmlString: string): boolean {
+    // A single null byte already marks the content as binary, so a separate
+    // test for runs of consecutive null bytes could never trigger and is
+    // unnecessary.
+    if (/[\u0000-\u0005]/.test(xmlString)) {
+      return true;
+    }
+
+    // Count control characters; tab (\x09), LF (\x0A) and CR (\x0D) are
+    // legitimate XML whitespace and therefore excluded from the class.
+    const nonPrintableCount = (xmlString.match(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g) ?? []).length;
+
+    // More than 5% non-printable characters indicates binary content.
+    // An empty string evaluates 0 > 0 and is reported as non-binary.
+    const maxNonPrintableRatio = 0.05;
+    return nonPrintableCount > xmlString.length * maxNonPrintableRatio;
+  }
+
  /**
   * Extract XML from a string
   * @param text Text to extract XML from
@@ -108,9 +195,22 @@ export abstract class BaseXMLExtractor {
  protected extractXmlFromString(text: string, startIndex: number = 0): string | null {
    try {
      // Find the start of the XML document
-      const xmlStartIndex = text.indexOf('<?xml', startIndex);
+      let xmlStartIndex = text.indexOf('<?xml', startIndex);
+      
+      // If no XML declaration, try to find known elements
      if (xmlStartIndex === -1) {
-        return null;
+        for (const format of this.knownFormats) {
+          const formatStartIndex = text.indexOf(`<${format.split(':').pop()}`, startIndex);
+          if (formatStartIndex !== -1) {
+            xmlStartIndex = formatStartIndex;
+            break;
+          }
+        }
+        
+        // Still didn't find any start marker
+        if (xmlStartIndex === -1) {
+          return null;
+        }
      }

      // Try to find the end of the XML document
@@ -123,12 +223,26 @@ export abstract class BaseXMLExtractor {
        }
      }

+      // If no known end tag found, try to use a heuristic approach
      if (xmlEndIndex === -1) {
-        return null;
+        // Try to find the last closing tag
+        const lastClosingTagMatch = text.slice(xmlStartIndex).match(/<\/[^>]+>(?!.*<\/[^>]+>)/);
+        if (lastClosingTagMatch && lastClosingTagMatch.index !== undefined) {
+          xmlEndIndex = xmlStartIndex + lastClosingTagMatch.index + lastClosingTagMatch[0].length;
+        } else {
+          return null;
+        }
      }

      // Extract the XML content
-      return text.substring(xmlStartIndex, xmlEndIndex);
+      const xmlContent = text.substring(xmlStartIndex, xmlEndIndex);
+      
+      // Validate the extracted content
+      if (this.isValidXml(xmlContent)) {
+        return xmlContent;
+      }
+      
+      return null;
    } catch (error) {
      console.error('Error extracting XML from string:', error);
      return null;
@@ -143,34 +257,99 @@ export abstract class BaseXMLExtractor {
   */
  protected async extractXmlFromStream(stream: PDFRawStream, fileName: string): Promise<string | null> {
    try {
-      // Try to decompress with pako
-      const compressedBytes = stream.getContents().buffer;
+      // Raw stream bytes as returned by getContents(); compressed or not is unknown here
+      const rawBytes = stream.getContents();
+      
+      // Attempt 1: decode and validate the bytes as-is (covers uncompressed content)
+      let xmlContent = this.tryDecodeBuffer(rawBytes);
+      if (xmlContent && this.isValidXml(xmlContent)) {
+        console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
+        return xmlContent;
+      }
+      
+      // Attempt 2: inflate the bytes first, then decode and validate the result
      try {
-        const decompressedBytes = pako.inflate(compressedBytes);
-        const xmlContent = new TextDecoder('utf-8').decode(decompressedBytes);
-
-        if (this.isValidXml(xmlContent)) {
-          console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`);
-          return xmlContent;
+        const decompressedBytes = this.tryDecompress(rawBytes);
+        if (decompressedBytes) {
+          xmlContent = this.tryDecodeBuffer(decompressedBytes);
+          if (xmlContent && this.isValidXml(xmlContent)) {
+            console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`);
+            return xmlContent;
+          }
        }
      } catch (decompressError) {
-        // Decompression failed, try without decompression
-        console.log(`Decompression failed for ${fileName}, trying without decompression...`);
+        console.log(`Decompression failed for ${fileName}: ${decompressError}`);
      }
-
-      // Try without decompression
-      const rawBytes = stream.getContents();
-      const rawContent = new TextDecoder('utf-8').decode(rawBytes);
-
-      if (this.isValidXml(rawContent)) {
-        console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
-        return rawContent;
-      }
-
+      
      return null;
    } catch (error) {
      console.error('Error extracting XML from stream:', error);
      return null;
    }
  }
-}
+  
+  /**
+   * Attempts to inflate a buffer with pako; failures are logged, not thrown.
+   * @param buffer Buffer to decompress
+   * @returns The inflated bytes, or null if pako could not inflate the buffer
+   */
+  protected tryDecompress(buffer: Uint8Array): Uint8Array | null {
+    let inflated: Uint8Array | null = null;
+    try {
+      inflated = pako.inflate(buffer);
+    } catch (error) {
+      // The input may simply be uncompressed or use another algorithm
+      console.warn('Pako decompression failed, might be uncompressed or using a different algorithm');
+    }
+    return inflated;
+  }
+  
+  /**
+   * Try to decode a buffer to a string using different encodings
+   * Strict UTF-8 is tried first; ISO-8859-1 is the fallback for invalid UTF-8.
+   * @param buffer Buffer to decode
+   * @returns Decoded string, or null if no decoding looks like XML
+   */
+  protected tryDecodeBuffer(buffer: Uint8Array): string | null {
+    try {
+      let content: string | null = null;
+      try {
+        // fatal: a lenient decoder would turn Latin-1 bytes into U+FFFD
+        content = new TextDecoder('utf-8', { fatal: true }).decode(buffer);
+      } catch (utf8Error) {
+        // Not well-formed UTF-8; fall through to the Latin-1 attempt
+      }
+      if (content !== null && this.isPlausibleXml(content)) {
+        return content;
+      }
+      content = this.decodeLatin1(buffer);
+      return this.isPlausibleXml(content) ? content : null;
+    } catch (error) {
+      console.warn('Error decoding buffer:', error);
+      return null;
+    }
+  }
+  
+  /**
+   * Maps every byte to the code point of the same value (ISO-8859-1).
+   * @param buffer Buffer to decode
+   * @returns Decoded string with exactly one character per input byte
+   */
+  protected decodeLatin1(buffer: Uint8Array): string {
+    let decoded = '';
+    for (const byte of buffer) decoded += String.fromCharCode(byte);
+    return decoded;
+  }
+  
+  /**
+   * Cheap pre-filter run before full validation: requires angle brackets
+   * plus either an XML declaration or a known format identifier.
+   * @param content String to check
+   * @returns True if the string is plausibly XML
+   */
+  protected isPlausibleXml(content: string): boolean {
+    const hasBrackets = content.includes('<') && content.includes('>');
+    if (!hasBrackets) return false;
+    return content.includes('<?xml') || this.knownFormats.some(id => content.includes(id));
+  }
+}