// xinvoice/ts/formats/pdf/extractors/base.extractor.ts

import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString, pako } from '../../../plugins.js';

/**
 * Base class for PDF XML extractors with common functionality.
 *
 * Concrete extractors implement {@link BaseXMLExtractor.extractXml}; this class
 * supplies the shared lookup tables and helper routines for validating,
 * slicing and decoding candidate XML payloads.
 */
export abstract class BaseXMLExtractor {
  /**
   * Known XML file names for different invoice formats.
   * Not referenced inside this class; available to subclasses.
   */
  protected readonly knownFileNames = [
    'factur-x.xml',
    'zugferd-invoice.xml',
    'ZUGFeRD-invoice.xml',
    'xrechnung.xml'
  ];

  /**
   * Marker substrings (root element names and namespace fragments); a
   * candidate must contain at least one of them to pass `isValidXml`.
   */
  protected readonly knownFormats = [
    'CrossIndustryInvoice',
    'CrossIndustryDocument',
    'Invoice',
    'CreditNote',
    'ubl:Invoice',
    'ubl:CreditNote',
    'rsm:CrossIndustryInvoice',
    'rsm:CrossIndustryDocument',
    'ram:CrossIndustryDocument',
    'urn:un:unece:uncefact',
    'urn:ferd:CrossIndustryDocument',
    'urn:zugferd',
    'urn:factur-x',
    'factur-x.eu',
    'ZUGFeRD'
  ];

  /**
   * Closing tags that `extractXmlFromString` searches for to locate the end
   * of an XML document embedded in a larger string.
   */
  protected readonly knownEndTags = [
    '</CrossIndustryInvoice>',
    '</CrossIndustryDocument>',
    '</Invoice>',
    '</CreditNote>',
    '</rsm:CrossIndustryInvoice>',
    '</rsm:CrossIndustryDocument>',
    '</ram:CrossIndustryDocument>',
    '</ubl:Invoice>',
    '</ubl:CreditNote>'
  ];

  /**
   * Extract XML from a PDF buffer
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  public abstract extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null>;

  /**
   * Heuristically check whether a string looks like a supported invoice XML.
   *
   * This is not a parser. The string passes when it is non-empty, at least
   * 100 characters long, contains an `<?xml` declaration anywhere, contains at
   * least one entry of `knownFormats`, and contains no control character that
   * XML 1.0 forbids.
   *
   * @param xmlString XML string to check
   * @returns True if the string passes all checks, false otherwise
   */
  protected isValidXml(xmlString: string): boolean {
    try {
      if (!xmlString || typeof xmlString !== 'string') {
        return false;
      }

      // Anything shorter cannot be a meaningful invoice document.
      if (xmlString.length < 100) {
        return false;
      }

      // The declaration may be preceded by whitespace or other leading bytes,
      // so look for it anywhere rather than only at index 0.
      if (!xmlString.includes('<?xml')) {
        return false;
      }

      // At least one known root element / namespace marker must be present.
      if (!this.knownFormats.some(format => xmlString.includes(format))) {
        return false;
      }

      // XML 1.0 forbids all C0 control characters except TAB, LF and CR;
      // their presence indicates binary or wrongly decoded data.
      if (/[\u0000-\u0008\u000B\u000C\u000E-\u001F]/.test(xmlString)) {
        return false;
      }

      return true;
    } catch (error) {
      console.error('Error validating XML:', error);
      return false;
    }
  }

  /**
   * Cut an XML document out of a larger string.
   *
   * The document starts at the first `<?xml` at or after `startIndex` and ends
   * with the earliest occurrence (after that start) of any tag listed in
   * `knownEndTags`. Choosing the earliest closing tag, rather than the first
   * tag that matches in list order, keeps the result from spanning two
   * separate documents when the text contains more than one.
   *
   * @param text Text to extract XML from
   * @param startIndex Index to start extraction from
   * @returns XML content or null if no start or no known end tag is found
   */
  protected extractXmlFromString(text: string, startIndex: number = 0): string | null {
    try {
      const xmlStartIndex = text.indexOf('<?xml', startIndex);
      if (xmlStartIndex === -1) {
        return null;
      }

      // Track the closing tag that appears first in the text.
      let earliestTagIndex = -1;
      let xmlEndIndex = -1;
      for (const endTag of this.knownEndTags) {
        const tagIndex = text.indexOf(endTag, xmlStartIndex);
        if (tagIndex === -1) {
          continue;
        }
        if (earliestTagIndex === -1 || tagIndex < earliestTagIndex) {
          earliestTagIndex = tagIndex;
          xmlEndIndex = tagIndex + endTag.length;
        }
      }

      if (xmlEndIndex === -1) {
        return null;
      }

      return text.substring(xmlStartIndex, xmlEndIndex);
    } catch (error) {
      console.error('Error extracting XML from string:', error);
      return null;
    }
  }

  /**
   * Decompress and decode XML content from a PDF stream.
   *
   * The stream bytes are first inflated with pako and decoded as UTF-8; if
   * inflation throws or the result does not pass `isValidXml`, the raw bytes
   * are decoded as UTF-8 and validated instead.
   *
   * @param stream PDF stream containing XML data
   * @param fileName Name of the file (for logging)
   * @returns XML content or null if neither attempt yields valid XML
   */
  protected async extractXmlFromStream(stream: PDFRawStream, fileName: string): Promise<string | null> {
    try {
      // Use the typed-array view itself. Its `.buffer` is the whole backing
      // ArrayBuffer and ignores byteOffset/byteLength, so inflating it would
      // feed unrelated bytes to pako whenever the contents are a slice of a
      // larger buffer.
      const streamBytes = stream.getContents();

      try {
        const decompressedBytes = pako.inflate(streamBytes);
        const xmlContent = new TextDecoder('utf-8').decode(decompressedBytes);

        if (this.isValidXml(xmlContent)) {
          console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`);
          return xmlContent;
        }
      } catch (decompressError) {
        // Decompression failed, try without decompression
        console.log(`Decompression failed for ${fileName}, trying without decompression...`);
      }

      // Fall back to treating the stream as uncompressed text.
      const rawContent = new TextDecoder('utf-8').decode(streamBytes);

      if (this.isValidXml(rawContent)) {
        console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
        return rawContent;
      }

      return null;
    } catch (error) {
      console.error('Error extracting XML from stream:', error);
      return null;
    }
  }
}