fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata.

2025-04-04 12:14:41 +00:00
parent 68fd50fd4c
commit 5d43c1ce4e
15 changed files with 1957 additions and 418 deletions
--- a/ts/formats/pdf/extractors/text.extractor.ts
+++ b/ts/formats/pdf/extractors/text.extractor.ts
@@ -6,50 +6,157 @@ import { BaseXMLExtractor } from './base.extractor.js';
 * Used as a fallback when other extraction methods fail
 */
 export class TextXMLExtractor extends BaseXMLExtractor {
+  // Number of bytes decoded and searched per chunk: 4 * 1024 * 1024 (4 MiB)
+  private readonly CHUNK_SIZE = 4 * 1024 * 1024;
+  
+  // Upper bound on chunks examined; together with CHUNK_SIZE this limits the search to the first 20 MiB of the buffer
+  private readonly MAX_CHUNKS = 5;
+  
+  // Literal markers matched case-sensitively via indexOf: the XML declaration opener and element opening-tag prefixes
+  private readonly XML_PATTERNS = [
+    '<?xml', 
+    '<CrossIndustryInvoice',
+    '<CrossIndustryDocument',
+    '<Invoice',
+    '<CreditNote',
+    '<rsm:CrossIndustryInvoice',
+    '<rsm:CrossIndustryDocument',
+    '<ram:CrossIndustryDocument',
+    '<ubl:Invoice',
+    '<ubl:CreditNote',
+    '<FatturaElettronica'
+  ];
+
  /**
   * Extract XML from a PDF buffer by searching for XML patterns in the text
+   * Delegates to extractXmlFromBufferChunked; any error thrown on the way is logged and reported as null
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
-      // Convert buffer to string and look for XML patterns
-      // Increase the search range to handle larger PDFs
-      const pdfString = Buffer.from(pdfBuffer).toString('utf8', 0, Math.min(pdfBuffer.length, 50000));
-
-      // Look for common XML patterns in the PDF
-      const xmlPatterns = [
-        /<\?xml[^>]*\?>/i,
-        /<CrossIndustryInvoice[^>]*>/i,
-        /<CrossIndustryDocument[^>]*>/i,
-        /<Invoice[^>]*>/i,
-        /<CreditNote[^>]*>/i,
-        /<rsm:CrossIndustryInvoice[^>]*>/i,
-        /<rsm:CrossIndustryDocument[^>]*>/i,
-        /<ram:CrossIndustryDocument[^>]*>/i,
-        /<ubl:Invoice[^>]*>/i,
-        /<ubl:CreditNote[^>]*>/i
-      ];
-
-      for (const pattern of xmlPatterns) {
-        const match = pdfString.match(pattern);
-        if (match && match.index !== undefined) {
-          console.log(`Found XML pattern in PDF: ${match[0]}`);
-          
-          // Try to extract the XML content
-          const xmlContent = this.extractXmlFromString(pdfString, match.index);
-          if (xmlContent && this.isValidXml(xmlContent)) {
-            console.log('Successfully extracted XML from PDF text');
-            return xmlContent;
-          }
-        }
-      }
-
-      console.warn('No valid XML found in PDF text');
-      return null;
+      console.log('Attempting text-based XML extraction from PDF...');
+      
+      // A Node Buffer is copied into a plain Uint8Array; any other Uint8Array input is used as-is
+      const buffer = Buffer.isBuffer(pdfBuffer) ? new Uint8Array(pdfBuffer) : pdfBuffer;
+      
+      // Search the bytes chunk by chunk; the helper is synchronous, so anything it throws reaches the catch below
+      return this.extractXmlFromBufferChunked(buffer);
    } catch (error) {
      console.error('Error in text-based extraction:', error);
      return null;
    }
  }
-}
+
+  /**
+   * Scans the buffer in consecutive, non-overlapping CHUNK_SIZE windows (at most MAX_CHUNKS) and returns the first hit from processChunk
+   * NOTE(review): each window is searched on its own, so XML that crosses a window boundary is never seen whole here - confirm this is acceptable
+   * @param buffer Bytes of the PDF to search
+   * @returns The first XML string returned by processChunk, or null when no examined chunk yields one
+   */
+  private extractXmlFromBufferChunked(buffer: Uint8Array): string | null {
+    // Stop after MAX_CHUNKS windows or as soon as the window start reaches the end of the buffer
+    for (let chunkIndex = 0; chunkIndex < this.MAX_CHUNKS; chunkIndex++) {
+      const startPos = chunkIndex * this.CHUNK_SIZE;
+      if (startPos >= buffer.length) break;
+      
+      const endPos = Math.min(startPos + this.CHUNK_SIZE, buffer.length);
+      const chunk = buffer.slice(startPos, endPos);
+      
+      // For a plain Uint8Array, slice above copied this window; pass it on together with its absolute start offset
+      const chunkResult = this.processChunk(chunk, startPos);
+      if (chunkResult) {
+        return chunkResult;
+      }
+    }
+    
+    console.warn('No valid XML found in any chunk of the PDF');
+    return null;
+  }
+
+  /**
+   * Searches one chunk for XML, decoding it as UTF-8 first and as Latin-1 only if the UTF-8 text yields nothing; errors are logged and give null
+   * @param chunk Chunk buffer to process
+   * @param chunkOffset Start of the chunk in the original buffer; used only in log messages
+   * @returns XML content from the first decoding that produces a validated match, or null
+   */
+  private processChunk(chunk: Uint8Array, chunkOffset: number): string | null {
+    try {
+      // First pass: lenient UTF-8 decode (decodeBufferToString uses a non-fatal TextDecoder)
+      const utf8String = this.decodeBufferToString(chunk, 'utf-8');
+      let xmlContent = this.searchForXmlInString(utf8String);
+      
+      if (xmlContent) {
+        console.log(`Found XML content in chunk at offset ${chunkOffset} using UTF-8 encoding`);
+        return xmlContent;
+      }
+      
+      // Second pass: Latin-1 maps every byte to exactly one character, so no bytes are replaced or merged while decoding
+      const latin1String = this.decodeBufferToString(chunk, 'latin1');
+      xmlContent = this.searchForXmlInString(latin1String);
+      
+      if (xmlContent) {
+        console.log(`Found XML content in chunk at offset ${chunkOffset} using Latin-1 encoding`);
+        return xmlContent;
+      }
+      
+      // Neither decoding produced validated XML for this chunk
+      return null;
+    } catch (error) {
+      console.warn(`Error processing chunk at offset ${chunkOffset}:`, error);
+      return null;
+    }
+  }
+
+  /**
+   * Decodes bytes to a string; never throws - a decoding error is logged and an empty string is returned instead
+   * @param buffer Buffer to decode
+   * @param encoding 'utf-8' decodes with a non-fatal TextDecoder; 'latin1' maps each byte to the code point of the same value
+   * @returns Decoded string, or '' if decoding threw
+   */
+  private decodeBufferToString(buffer: Uint8Array, encoding: 'utf-8' | 'latin1'): string {
+    try {
+      if (encoding === 'utf-8') {
+        return new TextDecoder('utf-8', { fatal: false }).decode(buffer);
+      } else {
+        // Latin-1 is decoded by hand: String.fromCharCode(byte) yields the code point equal to the byte value (0-255)
+        // TextDecoder is avoided: its 'latin1' label resolves to windows-1252 (WHATWG Encoding Standard), which remaps bytes 0x80-0x9F
+        return Array.from(buffer)
+          .map(byte => String.fromCharCode(byte))
+          .join('');
+      }
+    } catch (error) {
+      console.warn(`Error decoding buffer using ${encoding}:`, error);
+      // An empty result makes searchForXmlInString return null immediately, so the caller simply moves on
+      return '';
+    }
+  }
+
+  /**
+   * Looks up each XML_PATTERNS entry in order and returns the first candidate that extractXmlFromString produces and isValidXml accepts
+   * @param content String to search in; an empty string yields null
+   * @returns XML content or null if not found
+   */
+  private searchForXmlInString(content: string): string | null {
+    if (!content) return null;
+    
+    // Only the first occurrence of each pattern is tried (indexOf); a rejected candidate falls through to the next pattern
+    for (const pattern of this.XML_PATTERNS) {
+      const patternIndex = content.indexOf(pattern);
+      if (patternIndex !== -1) {
+        console.log(`Found XML pattern "${pattern}" at position ${patternIndex}`);
+        
+        // extractXmlFromString is defined outside this view; presumably returns the XML document starting at patternIndex - verify
+        const xmlContent = this.extractXmlFromString(content, patternIndex);
+        
+        // Accept the candidate only if it is non-empty and isValidXml (also defined outside this view) approves it
+        if (xmlContent && this.isValidXml(xmlContent)) {
+          console.log('Successfully extracted and validated XML from text');
+          return xmlContent;
+        }
+      }
+    }
+    
+    return null;
+  }
+}