xinvoice/ts/formats/pdf/extractors/text.extractor.ts

import { BaseXMLExtractor } from './base.extractor.js';

/**
 * Text-based XML extractor for PDF documents.
 *
 * Scans the raw PDF bytes for well-known invoice XML start markers and hands
 * each candidate position to the inherited `extractXmlFromString` /
 * `isValidXml` helpers (expected to come from BaseXMLExtractor, which is not
 * defined in this file). Intended as a fallback when other extraction
 * methods fail.
 */
export class TextXMLExtractor extends BaseXMLExtractor {
  // Size of the primary window scanned per iteration (4MB)
  private readonly CHUNK_SIZE = 4 * 1024 * 1024;

  // Maximum number of chunks to check (search covers roughly the first 20MB)
  private readonly MAX_CHUNKS = 5;

  // Extra bytes (1MB) appended to every window so that an XML document which
  // starts near the end of one chunk and ends in the next one is still seen
  // as a whole. Documents that straddle a boundary AND are larger than this
  // overlap can still be missed.
  private readonly CHUNK_OVERLAP = 1024 * 1024;

  // Upper bound on how many occurrences of a single pattern are tried per
  // decoded string; keeps the work bounded for hostile or degenerate input.
  private readonly MAX_CANDIDATES_PER_PATTERN = 16;

  // Number of bytes converted per String.fromCharCode call when decoding
  // Latin-1; small enough to stay far below engine argument-count limits.
  private readonly LATIN1_SLICE_SIZE = 8192;

  // Common XML patterns to look for, in priority order
  private readonly XML_PATTERNS = [
    '<?xml',
    '<CrossIndustryInvoice',
    '<CrossIndustryDocument',
    '<Invoice',
    '<CreditNote',
    '<rsm:CrossIndustryInvoice',
    '<rsm:CrossIndustryDocument',
    '<ram:CrossIndustryDocument',
    '<ubl:Invoice',
    '<ubl:CreditNote',
    '<FatturaElettronica'
  ];

  /**
   * Extract XML from a PDF buffer by searching for XML patterns in the text.
   * Uses a chunked approach so only a bounded window is decoded at a time.
   * Never throws: any error is logged and reported as `null`.
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      console.log('Attempting text-based XML extraction from PDF...');

      // A Node Buffer is itself a Uint8Array, so no copy is required. Wrap the
      // same memory in a plain Uint8Array view; this also avoids touching the
      // global `Buffer` at runtime.
      const buffer = new Uint8Array(
        pdfBuffer.buffer,
        pdfBuffer.byteOffset,
        pdfBuffer.byteLength
      );

      // Try extracting XML using the chunked approach
      return this.extractXmlFromBufferChunked(buffer);
    } catch (error) {
      console.error('Error in text-based extraction:', error);
      return null;
    }
  }

  /**
   * Extract XML from buffer using a chunked approach.
   * Windows start every CHUNK_SIZE bytes and extend CHUNK_OVERLAP bytes past
   * their nominal end, so content crossing a chunk boundary is not split.
   * At most MAX_CHUNKS windows are examined.
   * @param buffer Buffer to search in
   * @returns XML content or null if not found
   */
  private extractXmlFromBufferChunked(buffer: Uint8Array): string | null {
    for (let chunkIndex = 0; chunkIndex < this.MAX_CHUNKS; chunkIndex++) {
      const startPos = chunkIndex * this.CHUNK_SIZE;
      if (startPos >= buffer.length) break;

      const endPos = Math.min(
        startPos + this.CHUNK_SIZE + this.CHUNK_OVERLAP,
        buffer.length
      );

      // subarray() creates a view on the same memory; slice() would copy the
      // whole window for every iteration.
      const chunk = buffer.subarray(startPos, endPos);

      // Try to extract XML from this chunk
      const chunkResult = this.processChunk(chunk, startPos);
      if (chunkResult) {
        return chunkResult;
      }
    }

    console.warn('No valid XML found in any chunk of the PDF');
    return null;
  }

  /**
   * Process a single chunk of the PDF buffer.
   * The chunk is decoded as UTF-8 first; only if that yields no valid XML is
   * it decoded again as Latin-1 and searched a second time.
   * @param chunk Chunk buffer to process
   * @param chunkOffset Offset position of the chunk in the original buffer (used for logging only)
   * @returns XML content or null if not found
   */
  private processChunk(chunk: Uint8Array, chunkOffset: number): string | null {
    try {
      // First try UTF-8 encoding for this chunk
      const utf8String = this.decodeBufferToString(chunk, 'utf-8');
      let xmlContent = this.searchForXmlInString(utf8String);

      if (xmlContent) {
        console.log(`Found XML content in chunk at offset ${chunkOffset} using UTF-8 encoding`);
        return xmlContent;
      }

      // If UTF-8 fails, try Latin-1 (ISO-8859-1), which maps every byte to
      // exactly one character and therefore never produces replacement chars
      const latin1String = this.decodeBufferToString(chunk, 'latin1');
      xmlContent = this.searchForXmlInString(latin1String);

      if (xmlContent) {
        console.log(`Found XML content in chunk at offset ${chunkOffset} using Latin-1 encoding`);
        return xmlContent;
      }

      // No XML found in this chunk
      return null;
    } catch (error) {
      console.warn(`Error processing chunk at offset ${chunkOffset}:`, error);
      return null;
    }
  }

  /**
   * Safely decode a buffer to string using the specified encoding.
   * @param buffer Buffer to decode
   * @param encoding Encoding to use ('utf-8' or 'latin1')
   * @returns Decoded string, or an empty string if decoding throws
   */
  private decodeBufferToString(buffer: Uint8Array, encoding: 'utf-8' | 'latin1'): string {
    try {
      if (encoding === 'utf-8') {
        // fatal: false => malformed sequences become U+FFFD instead of throwing
        return new TextDecoder('utf-8', { fatal: false }).decode(buffer);
      }

      // Latin-1: bytes 0-255 map directly to code points 0-255. Convert in
      // fixed-size slices so we neither allocate one array element and one
      // string per byte nor exceed the argument limit of fromCharCode.
      const parts: string[] = [];
      for (let offset = 0; offset < buffer.length; offset += this.LATIN1_SLICE_SIZE) {
        const slice = buffer.subarray(offset, offset + this.LATIN1_SLICE_SIZE);
        parts.push(String.fromCharCode(...slice));
      }
      return parts.join('');
    } catch (error) {
      console.warn(`Error decoding buffer using ${encoding}:`, error);
      // Return empty string on error to allow processing to continue
      return '';
    }
  }

  /**
   * Search for XML patterns in a string.
   * Patterns are tried in XML_PATTERNS order. For each pattern, successive
   * occurrences are tried (up to MAX_CANDIDATES_PER_PATTERN) so that a first
   * hit which fails extraction or validation does not hide a later valid
   * document.
   * @param content String to search in
   * @returns XML content or null if not found
   */
  private searchForXmlInString(content: string): string | null {
    if (!content) return null;

    for (const pattern of this.XML_PATTERNS) {
      let patternIndex = content.indexOf(pattern);
      let attempts = 0;

      while (patternIndex !== -1 && attempts < this.MAX_CANDIDATES_PER_PATTERN) {
        attempts++;
        console.log(`Found XML pattern "${pattern}" at position ${patternIndex}`);

        // Try to extract the XML content starting from the pattern position
        const xmlContent = this.extractXmlFromString(content, patternIndex);

        // Validate the extracted content
        if (xmlContent && this.isValidXml(xmlContent)) {
          console.log('Successfully extracted and validated XML from text');
          return xmlContent;
        }

        // Candidate rejected: continue with the next occurrence of this pattern
        patternIndex = content.indexOf(pattern, patternIndex + pattern.length);
      }
    }

    return null;
  }
}
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`import { BaseXMLExtractor } from './base.extractor.js';`

			`/**`
			`* Text-based XML extractor for PDF documents`
			`* Extracts XML by searching for XML patterns in the PDF text`
			`* Used as a fallback when other extraction methods fail`
			`*/`
			`export class TextXMLExtractor extends BaseXMLExtractor {`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`// Maximum chunk size to process at once (4MB)`
			`private readonly CHUNK_SIZE = 4 * 1024 * 1024;`

			`// Maximum number of chunks to check (effective 20MB search limit)`
			`private readonly MAX_CHUNKS = 5;`

			`// Common XML patterns to look for`
			`private readonly XML_PATTERNS = [`
			`'<?xml',`
			`'<CrossIndustryInvoice',`
			`'<CrossIndustryDocument',`
			`'<Invoice',`
			`'<CreditNote',`
			`'<rsm:CrossIndustryInvoice',`
			`'<rsm:CrossIndustryDocument',`
			`'<ram:CrossIndustryDocument',`
			`'<ubl:Invoice',`
			`'<ubl:CreditNote',`
			`'<FatturaElettronica'`
			`];`

feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`/**`
			`* Extract XML from a PDF buffer by searching for XML patterns in the text`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`* Uses a chunked approach to handle large files efficiently`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`* @param pdfBuffer PDF buffer`
			`* @returns XML content or null if not found`
			`*/`
			`public async extractXml(pdfBuffer: Uint8Array \| Buffer): Promise<string \| null> {`
			`try {`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`console.log('Attempting text-based XML extraction from PDF...');`

			`// Convert Buffer to Uint8Array if needed`
			`const buffer = Buffer.isBuffer(pdfBuffer) ? new Uint8Array(pdfBuffer) : pdfBuffer;`

			`// Try extracting XML using the chunked approach`
			`return this.extractXmlFromBufferChunked(buffer);`
			`} catch (error) {`
			`console.error('Error in text-based extraction:', error);`
			`return null;`
			`}`
			`}`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`/**`
			`* Extract XML from buffer using a chunked approach`
			`* This helps avoid memory issues with large PDFs`
			`* @param buffer Buffer to search in`
			`* @returns XML content or null if not found`
			`*/`
			`private extractXmlFromBufferChunked(buffer: Uint8Array): string \| null {`
			`// Process the PDF in chunks`
			`for (let chunkIndex = 0; chunkIndex < this.MAX_CHUNKS; chunkIndex++) {`
			`const startPos = chunkIndex * this.CHUNK_SIZE;`
			`if (startPos >= buffer.length) break;`

			`const endPos = Math.min(startPos + this.CHUNK_SIZE, buffer.length);`
			`const chunk = buffer.slice(startPos, endPos);`

			`// Try to extract XML from this chunk`
			`const chunkResult = this.processChunk(chunk, startPos);`
			`if (chunkResult) {`
			`return chunkResult;`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`}`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`}`

			`console.warn('No valid XML found in any chunk of the PDF');`
			`return null;`
			`}`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`/**`
			`* Process a single chunk of the PDF buffer`
			`* @param chunk Chunk buffer to process`
			`* @param chunkOffset Offset position of the chunk in the original buffer`
			`* @returns XML content or null if not found`
			`*/`
			`private processChunk(chunk: Uint8Array, chunkOffset: number): string \| null {`
			`try {`
			`// First try UTF-8 encoding for this chunk`
			`const utf8String = this.decodeBufferToString(chunk, 'utf-8');`
			`let xmlContent = this.searchForXmlInString(utf8String);`

			`if (xmlContent) {`
			console.log(`Found XML content in chunk at offset ${chunkOffset} using UTF-8 encoding`);
			`return xmlContent;`
			`}`

			`// If UTF-8 fails, try Latin-1 (ISO-8859-1) which can handle binary better`
			`const latin1String = this.decodeBufferToString(chunk, 'latin1');`
			`xmlContent = this.searchForXmlInString(latin1String);`

			`if (xmlContent) {`
			console.log(`Found XML content in chunk at offset ${chunkOffset} using Latin-1 encoding`);
			`return xmlContent;`
			`}`

			`// No XML found in this chunk`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`return null;`
			`} catch (error) {`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			console.warn(`Error processing chunk at offset ${chunkOffset}:`, error);
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`return null;`
			`}`
			`}`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00
			`/**`
			`* Safely decode a buffer to string using the specified encoding`
			`* @param buffer Buffer to decode`
			`* @param encoding Encoding to use ('utf-8' or 'latin1')`
			`* @returns Decoded string`
			`*/`
			`private decodeBufferToString(buffer: Uint8Array, encoding: 'utf-8' \| 'latin1'): string {`
			`try {`
			`if (encoding === 'utf-8') {`
			`return new TextDecoder('utf-8', { fatal: false }).decode(buffer);`
			`} else {`
			`// For Latin-1 we can use a direct mapping (bytes 0-255 map directly to code points 0-255)`
			`// This is more reliable for binary data than TextDecoder for legacy encodings`
			`return Array.from(buffer)`
			`.map(byte => String.fromCharCode(byte))`
			`.join('');`
			`}`
			`} catch (error) {`
			console.warn(`Error decoding buffer using ${encoding}:`, error);
			`// Return empty string on error to allow processing to continue`
			`return '';`
			`}`
			`}`

			`/**`
			`* Search for XML patterns in a string`
			`* @param content String to search in`
			`* @returns XML content or null if not found`
			`*/`
			`private searchForXmlInString(content: string): string \| null {`
			`if (!content) return null;`

			`// Search for each XML pattern`
			`for (const pattern of this.XML_PATTERNS) {`
			`const patternIndex = content.indexOf(pattern);`
			`if (patternIndex !== -1) {`
			console.log(`Found XML pattern "${pattern}" at position ${patternIndex}`);

			`// Try to extract the XML content starting from the pattern position`
			`const xmlContent = this.extractXmlFromString(content, patternIndex);`

			`// Validate the extracted content`
			`if (xmlContent && this.isValidXml(xmlContent)) {`
			`console.log('Successfully extracted and validated XML from text');`
			`return xmlContent;`
			`}`
			`}`
			`}`

			`return null;`
			`}`
			`}`