einvoice/ts/formats/pdf/extractors/text.extractor.ts

import { BaseXMLExtractor } from './base.extractor.js';

/**
 * Text-based XML extractor for PDF documents.
 *
 * Decodes the raw PDF bytes as text and searches that text for markers that
 * typically start an embedded invoice XML document (an XML declaration or a
 * known invoice root element). Intended as a fallback when other extraction
 * methods fail.
 *
 * NOTE(review): this can only find XML that is stored uncompressed in the
 * PDF; content inside compressed streams is not visible to a text search.
 */
export class TextXMLExtractor extends BaseXMLExtractor {
  /**
   * Markers that may indicate the start of an XML document inside the PDF text.
   *
   * - The `g` flag lets `matchAll` visit every occurrence, not just the first.
   * - The `(?=[\s/>])` lookahead requires the element name to end right after
   *   the listed name, so `<Invoice ...>` matches but `<InvoiceLine>` does not.
   * - `<\?xml\s` requires whitespace after `xml`, so `<?xml-stylesheet ...?>`
   *   is not mistaken for an XML declaration.
   */
  private static readonly XML_START_PATTERNS: readonly RegExp[] = [
    /<\?xml\s[^>]*\?>/gi,
    /<CrossIndustryInvoice(?=[\s/>])[^>]*>/gi,
    /<CrossIndustryDocument(?=[\s/>])[^>]*>/gi,
    /<Invoice(?=[\s/>])[^>]*>/gi,
    /<CreditNote(?=[\s/>])[^>]*>/gi,
    /<rsm:CrossIndustryInvoice(?=[\s/>])[^>]*>/gi,
    /<rsm:CrossIndustryDocument(?=[\s/>])[^>]*>/gi,
    /<ram:CrossIndustryDocument(?=[\s/>])[^>]*>/gi,
    /<ubl:Invoice(?=[\s/>])[^>]*>/gi,
    /<ubl:CreditNote(?=[\s/>])[^>]*>/gi
  ];

  /**
   * Extract XML from a PDF buffer by searching for XML patterns in the text.
   *
   * The whole buffer is searched. For every pattern, each occurrence is tried
   * in document order until one yields content that passes validation.
   * Errors are logged and reported as `null`; this method does not throw.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      // Create a Buffer *view* over the same memory (no copy) and decode all of
      // it: embedded XML can sit anywhere in the file, not only near the start.
      const pdfString = Buffer.from(
        pdfBuffer.buffer,
        pdfBuffer.byteOffset,
        pdfBuffer.byteLength
      ).toString('utf8');

      for (const pattern of TextXMLExtractor.XML_START_PATTERNS) {
        // matchAll works on an internal clone of the regex, so the shared
        // static pattern's lastIndex is never mutated between calls.
        for (const match of pdfString.matchAll(pattern)) {
          if (match.index === undefined) {
            continue;
          }

          console.log(`Found XML pattern in PDF: ${match[0]}`);

          // Try to extract the XML content starting at this marker; both
          // helpers are inherited (not defined in this class).
          const xmlContent = this.extractXmlFromString(pdfString, match.index);
          if (xmlContent && this.isValidXml(xmlContent)) {
            console.log('Successfully extracted XML from PDF text');
            return xmlContent;
          }
        }
      }

      console.warn('No valid XML found in PDF text');
      return null;
    } catch (error) {
      // Best-effort fallback: report the failure and signal "not found".
      console.error('Error in text-based extraction:', error);
      return null;
    }
  }
}