ts/formats/pdf/extractors/base.extractor.ts

import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString, pako } from '../../../plugins.js';

/**
 * Base class for PDF XML extractors with common functionality.
 *
 * Provides the shared heuristics used to locate, decode and sanity-check
 * e-invoice XML found inside PDF data. Concrete extractors implement
 * {@link BaseXMLExtractor.extractXml} and call the protected helpers.
 *
 * None of the checks here parse XML; they are substring/character heuristics
 * meant to filter out obviously wrong candidates.
 */
export abstract class BaseXMLExtractor {
  /**
   * Known XML file names for different invoice formats.
   * Not referenced inside this class; exposed for subclasses.
   */
  protected readonly knownFileNames = [
    'factur-x.xml',
    'zugferd-invoice.xml',
    'ZUGFeRD-invoice.xml',
    'xrechnung.xml',
    'ubl-invoice.xml',
    'invoice.xml',
    'metadata.xml'
  ];

  /**
   * Known element names, prefixed element names and namespace fragments.
   * A candidate must contain at least one of these to be accepted.
   */
  protected readonly knownFormats = [
    'CrossIndustryInvoice',
    'CrossIndustryDocument',
    'Invoice',
    'CreditNote',
    'ubl:Invoice',
    'ubl:CreditNote',
    'rsm:CrossIndustryInvoice',
    'rsm:CrossIndustryDocument',
    'ram:CrossIndustryDocument',
    'urn:un:unece:uncefact',
    'urn:ferd:CrossIndustryDocument',
    'urn:zugferd',
    'urn:factur-x',
    'factur-x.eu',
    'ZUGFeRD',
    'FatturaElettronica'
  ];

  /**
   * Known closing tags of root elements, used to find where a document ends
   * when it is embedded in a larger string.
   */
  protected readonly knownEndTags = [
    '</CrossIndustryInvoice>',
    '</CrossIndustryDocument>',
    '</Invoice>',
    '</CreditNote>',
    '</rsm:CrossIndustryInvoice>',
    '</rsm:CrossIndustryDocument>',
    '</ram:CrossIndustryDocument>',
    '</ubl:Invoice>',
    '</ubl:CreditNote>',
    '</FatturaElettronica>'
  ];

  /**
   * Extract XML from a PDF buffer
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  public abstract extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null>;

  /**
   * Heuristically decide whether a string looks like a usable invoice XML.
   *
   * The string must be non-empty, contain an XML declaration or a known
   * opening element, mention a known format, be free of binary data, be at
   * least 100 characters long and pass {@link hasProperXmlStructure}.
   *
   * @param xmlString XML string to check
   * @returns True if every heuristic passes; false otherwise (never throws)
   */
  protected isValidXml(xmlString: string): boolean {
    try {
      // Guard against non-string values passed from untyped callers
      if (!xmlString || typeof xmlString !== 'string') {
        return false;
      }

      // Needs an XML declaration or at least one known opening element
      if (!xmlString.includes('<?xml') && !this.hasKnownXmlElement(xmlString)) {
        return false;
      }

      // Must mention a known invoice format somewhere
      if (!this.hasKnownFormat(xmlString)) {
        return false;
      }

      // Reject content that contains binary data or invalid characters
      if (this.hasBinaryData(xmlString)) {
        return false;
      }

      // Anything shorter than this cannot be a real invoice document
      if (xmlString.length < 100) {
        return false;
      }

      // Must contain matching tags (or at least basic XML markup)
      return this.hasProperXmlStructure(xmlString);
    } catch (error) {
      console.error('Error validating XML:', error);
      return false;
    }
  }

  /**
   * Check if the XML string contains the opening of a known element
   * @param xmlString XML string to check
   * @returns True if `<` directly followed by a known format occurs
   */
  protected hasKnownXmlElement(xmlString: string): boolean {
    return this.knownFormats.some(format => xmlString.includes(`<${format}`));
  }

  /**
   * Check if the XML string contains a known format
   * @param xmlString XML string to check
   * @returns True if any known format occurs anywhere in the string
   */
  protected hasKnownFormat(xmlString: string): boolean {
    return this.knownFormats.some(format => xmlString.includes(format));
  }

  /**
   * Check if the XML string has a proper structure
   *
   * First looks for a known root element that is both opened and closed. The
   * opening tag is matched by name only, so root elements carrying attributes
   * (e.g. `xmlns` declarations) are recognised. If no known root is found, a
   * lenient fallback accepts an XML declaration or any open/close tag pair.
   *
   * @param xmlString XML string to check
   * @returns True if the XML has a proper structure
   */
  protected hasProperXmlStructure(xmlString: string): boolean {
    for (const endTag of this.knownEndTags) {
      if (!xmlString.includes(endTag)) {
        continue;
      }
      // '</Name>' -> '<Name' (no trailing '>', so attributes are allowed)
      const openingPrefix = endTag.replace('/', '').replace(/>$/, '');
      if (this.hasOpeningTag(xmlString, openingPrefix)) {
        return true;
      }
    }

    // If no specific tag is found but it has a basic XML structure
    return (
      (xmlString.includes('<?xml') && xmlString.includes('?>')) ||
      (xmlString.match(/<[^>]+>/) !== null && xmlString.match(/<\/[^>]+>/) !== null)
    );
  }

  /**
   * Check if the XML string contains binary data
   *
   * @param xmlString XML string to check
   * @returns True if the string contains any of U+0000..U+0005, or if more
   *          than 5% of its characters are control characters other than
   *          tab, line feed and carriage return
   */
  protected hasBinaryData(xmlString: string): boolean {
    // A single one of these is enough (this also covers runs of NUL bytes)
    if (/[\u0000-\u0005]/.test(xmlString)) {
      return true;
    }

    // Check for high concentration of non-printable characters
    const nonPrintable = xmlString.match(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g);
    const nonPrintableCount = nonPrintable ? nonPrintable.length : 0;
    return nonPrintableCount > xmlString.length * 0.05; // More than 5% non-printable
  }

  /**
   * Extract an XML document embedded in a larger string
   *
   * The start is the first `<?xml` at or after `startIndex`, or failing that
   * the earliest opening of a known element (with or without its namespace
   * prefix). The end is the earliest known root closing tag after the start,
   * or failing that the last closing tag in the text. The extracted span is
   * returned only if it passes {@link isValidXml}.
   *
   * @param text Text to extract XML from
   * @param startIndex Index to start extraction from
   * @returns XML content or null if not found
   */
  protected extractXmlFromString(text: string, startIndex: number = 0): string | null {
    try {
      // Find the start of the XML document
      let xmlStartIndex = text.indexOf('<?xml', startIndex);

      // If no XML declaration, try to find known elements
      if (xmlStartIndex === -1) {
        xmlStartIndex = this.findKnownElementStart(text, startIndex);
        if (xmlStartIndex === -1) {
          return null;
        }
      }

      // Try to find the end of the XML document
      let xmlEndIndex = this.findKnownEndTagEnd(text, xmlStartIndex);

      // If no known end tag found, fall back to the last closing tag
      if (xmlEndIndex === -1) {
        xmlEndIndex = this.findLastClosingTagEnd(text, xmlStartIndex);
        if (xmlEndIndex === -1) {
          return null;
        }
      }

      const xmlContent = text.substring(xmlStartIndex, xmlEndIndex);

      // Validate the extracted content
      return this.isValidXml(xmlContent) ? xmlContent : null;
    } catch (error) {
      console.error('Error extracting XML from string:', error);
      return null;
    }
  }

  /**
   * Decode XML content from a PDF stream, with and without decompression
   *
   * The raw bytes are first interpreted as-is; if that does not yield valid
   * XML they are inflated and interpreted again. Filter entries of the stream
   * are not consulted here.
   *
   * @param stream PDF stream containing XML data
   * @param fileName Name of the file (for logging)
   * @returns XML content or null if not valid
   */
  protected async extractXmlFromStream(stream: PDFRawStream, fileName: string): Promise<string | null> {
    try {
      // Get the raw bytes from the stream
      const rawBytes = stream.getContents();

      // First try without decompression (in case the content is not compressed)
      let xmlContent = this.tryDecodeBuffer(rawBytes);
      if (xmlContent && this.isValidXml(xmlContent)) {
        console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
        return xmlContent;
      }

      // Try with decompression; the guard stays in case a subclass overrides
      // tryDecompress with an implementation that throws
      try {
        const decompressedBytes = this.tryDecompress(rawBytes);
        if (decompressedBytes) {
          xmlContent = this.tryDecodeBuffer(decompressedBytes);
          if (xmlContent && this.isValidXml(xmlContent)) {
            console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`);
            return xmlContent;
          }
        }
      } catch (decompressError) {
        console.log(`Decompression failed for ${fileName}: ${decompressError}`);
      }

      return null;
    } catch (error) {
      console.error('Error extracting XML from stream:', error);
      return null;
    }
  }

  /**
   * Try to decompress a buffer with pako's inflate
   * @param buffer Buffer to decompress
   * @returns Decompressed buffer or null if decompression failed
   */
  protected tryDecompress(buffer: Uint8Array): Uint8Array | null {
    try {
      // Try pako inflate (for deflate/zlib compression)
      return pako.inflate(buffer) ?? null;
    } catch (error) {
      // If pako fails, try other methods if needed
      console.warn('Pako decompression failed, might be uncompressed or using a different algorithm');
      return null;
    }
  }

  /**
   * Try to decode a buffer to a string using different encodings
   *
   * UTF-8 is decoded strictly: bytes that are not valid UTF-8 are treated as
   * "not UTF-8" instead of being replaced with U+FFFD, so that the ISO-8859-1
   * fallback is actually used for Latin-1 encoded documents.
   *
   * @param buffer Buffer to decode
   * @returns Decoded string or null if no decoding looks like XML
   */
  protected tryDecodeBuffer(buffer: Uint8Array): string | null {
    try {
      // Try UTF-8 first
      let content: string | null;
      try {
        content = new TextDecoder('utf-8', { fatal: true }).decode(buffer);
      } catch {
        // Invalid byte sequence: the data is not UTF-8
        content = null;
      }
      if (content !== null && this.isPlausibleXml(content)) {
        return content;
      }

      // Try ISO-8859-1 (Latin1)
      content = this.decodeLatin1(buffer);
      if (this.isPlausibleXml(content)) {
        return content;
      }

      return null;
    } catch (error) {
      console.warn('Error decoding buffer:', error);
      return null;
    }
  }

  /**
   * Decode a buffer using ISO-8859-1 (Latin1) encoding
   *
   * Every byte maps to the code point of the same value. `TextDecoder` is
   * deliberately not used: its 'latin1' label resolves to windows-1252, which
   * maps 0x80-0x9F differently.
   *
   * @param buffer Buffer to decode
   * @returns Decoded string
   */
  protected decodeLatin1(buffer: Uint8Array): string {
    // Convert in chunks to keep the argument list of fromCharCode bounded
    const chunkSize = 8192;
    let decoded = '';
    for (let offset = 0; offset < buffer.length; offset += chunkSize) {
      decoded += String.fromCharCode(...buffer.subarray(offset, offset + chunkSize));
    }
    return decoded;
  }

  /**
   * Check if a string is plausibly XML (quick check before validation)
   * @param content String to check
   * @returns True if the string has angle brackets and either an XML
   *          declaration or a known format
   */
  protected isPlausibleXml(content: string): boolean {
    if (!content.includes('<') || !content.includes('>')) {
      return false;
    }
    return content.includes('<?xml') ||
           this.knownFormats.some(format => content.includes(format));
  }

  /**
   * Find an opening tag whose name starts exactly at `openingPrefix`.
   * @param xmlString String to search
   * @param openingPrefix `<` plus the element name, e.g. `<rsm:CrossIndustryInvoice`
   * @returns True if the prefix occurs followed by `>`, `/` or whitespace
   */
  private hasOpeningTag(xmlString: string, openingPrefix: string): boolean {
    let position = xmlString.indexOf(openingPrefix);
    while (position !== -1) {
      const next = xmlString.charAt(position + openingPrefix.length);
      if (next === '>' || next === '/' || next === ' ' || next === '\t' || next === '\n' || next === '\r') {
        return true;
      }
      position = xmlString.indexOf(openingPrefix, position + 1);
    }
    return false;
  }

  /**
   * Find the earliest opening of any known element at or after `fromIndex`.
   * Each known format is tried both verbatim (keeping a namespace prefix such
   * as `rsm:`) and reduced to the part after its last colon.
   * @returns Index of the `<`, or -1 if no known element is present
   */
  private findKnownElementStart(text: string, fromIndex: number): number {
    let earliest = -1;
    for (const format of this.knownFormats) {
      const localName = format.split(':').pop() ?? format;
      const markers = localName === format ? [`<${format}`] : [`<${format}`, `<${localName}`];
      for (const marker of markers) {
        const position = text.indexOf(marker, fromIndex);
        if (position !== -1 && (earliest === -1 || position < earliest)) {
          earliest = position;
        }
      }
    }
    return earliest;
  }

  /**
   * Find the known root closing tag that occurs first at or after `fromIndex`.
   * @returns Index just past that closing tag, or -1 if none occurs
   */
  private findKnownEndTagEnd(text: string, fromIndex: number): number {
    let earliestStart = -1;
    let earliestEnd = -1;
    for (const endTag of this.knownEndTags) {
      const position = text.indexOf(endTag, fromIndex);
      if (position !== -1 && (earliestStart === -1 || position < earliestStart)) {
        earliestStart = position;
        earliestEnd = position + endTag.length;
      }
    }
    return earliestEnd;
  }

  /**
   * Find the last closing tag (`</` ... `>` with a non-empty name) that starts
   * at or after `fromIndex`. Works across line breaks.
   * @returns Index just past that closing tag, or -1 if there is none
   */
  private findLastClosingTagEnd(text: string, fromIndex: number): number {
    let open = text.lastIndexOf('</');
    while (open !== -1 && open >= fromIndex) {
      const close = text.indexOf('>', open + 2);
      // Require at least one character between '</' and '>'
      if (close > open + 2) {
        return close + 1;
      }
      if (open === 0) {
        break;
      }
      open = text.lastIndexOf('</', open - 1);
    }
    return -1;
  }
}
fix(readme): Update readme documentation: enhance feature summary, update installation instructions and usage examples, remove obsolete config details, and better clarify supported invoice formats. 2025-04-03 20:45:26 +00:00			`import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString, pako } from '../../../plugins.js';`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00
			`/**`
			`* Base class for PDF XML extractors with common functionality`
			`*/`
			`export abstract class BaseXMLExtractor {`
			`/**`
			`* Known XML file names for different invoice formats`
			`*/`
			`protected readonly knownFileNames = [`
			`'factur-x.xml',`
			`'zugferd-invoice.xml',`
			`'ZUGFeRD-invoice.xml',`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`'xrechnung.xml',`
			`'ubl-invoice.xml',`
			`'invoice.xml',`
			`'metadata.xml'`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`];`

			`/**`
			`* Known XML formats to validate extracted content`
			`*/`
			`protected readonly knownFormats = [`
			`'CrossIndustryInvoice',`
			`'CrossIndustryDocument',`
			`'Invoice',`
			`'CreditNote',`
			`'ubl:Invoice',`
			`'ubl:CreditNote',`
			`'rsm:CrossIndustryInvoice',`
			`'rsm:CrossIndustryDocument',`
			`'ram:CrossIndustryDocument',`
			`'urn:un:unece:uncefact',`
			`'urn:ferd:CrossIndustryDocument',`
			`'urn:zugferd',`
			`'urn:factur-x',`
			`'factur-x.eu',`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`'ZUGFeRD',`
			`'FatturaElettronica'`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`];`

			`/**`
			`* Known XML end tags for extracting content from strings`
			`*/`
			`protected readonly knownEndTags = [`
			`'</CrossIndustryInvoice>',`
			`'</CrossIndustryDocument>',`
			`'</Invoice>',`
			`'</CreditNote>',`
			`'</rsm:CrossIndustryInvoice>',`
			`'</rsm:CrossIndustryDocument>',`
			`'</ram:CrossIndustryDocument>',`
			`'</ubl:Invoice>',`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`'</ubl:CreditNote>',`
			`'</FatturaElettronica>'`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`];`

			`/**`
			`* Extract XML from a PDF buffer`
			`* @param pdfBuffer PDF buffer`
			`* @returns XML content or null if not found`
			`*/`
			`public abstract extractXml(pdfBuffer: Uint8Array \| Buffer): Promise<string \| null>;`

			`/**`
			`* Check if an XML string is valid`
			`* @param xmlString XML string to check`
			`* @returns True if the XML is valid`
			`*/`
			`protected isValidXml(xmlString: string): boolean {`
			`try {`
			`// Basic checks for XML validity`
			`if (!xmlString \|\| typeof xmlString !== 'string') {`
			`return false;`
			`}`

fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`// Check if it starts with XML declaration or a valid element`
			`if (!xmlString.includes('<?xml') && !this.hasKnownXmlElement(xmlString)) {`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`return false;`
			`}`

			`// Check if the XML string contains known invoice formats`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`const hasKnownFormat = this.hasKnownFormat(xmlString);`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`if (!hasKnownFormat) {`
			`return false;`
			`}`

			`// Check if the XML string contains binary data or invalid characters`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`if (this.hasBinaryData(xmlString)) {`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`return false;`
			`}`

			`// Check if the XML string is too short`
			`if (xmlString.length < 100) {`
			`return false;`
			`}`

fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`// Check if XML has a proper structure (contains both opening and closing tags)`
			`if (!this.hasProperXmlStructure(xmlString)) {`
			`return false;`
			`}`

feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`return true;`
			`} catch (error) {`
			`console.error('Error validating XML:', error);`
			`return false;`
			`}`
			`}`

fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`/**`
			`* Check if the XML string contains a known element`
			`* @param xmlString XML string to check`
			`* @returns True if the XML contains a known element`
			`*/`
			`protected hasKnownXmlElement(xmlString: string): boolean {`
			`for (const format of this.knownFormats) {`
			`// Check for opening tag of format`
			if (xmlString.includes(`<${format}`)) {
			`return true;`
			`}`
			`}`
			`return false;`
			`}`

			`/**`
			`* Check if the XML string contains a known format`
			`* @param xmlString XML string to check`
			`* @returns True if the XML contains a known format`
			`*/`
			`protected hasKnownFormat(xmlString: string): boolean {`
			`for (const format of this.knownFormats) {`
			`if (xmlString.includes(format)) {`
			`return true;`
			`}`
			`}`
			`return false;`
			`}`

			`/**`
			`* Check if the XML string has a proper structure`
			`* @param xmlString XML string to check`
			`* @returns True if the XML has a proper structure`
			`*/`
			`protected hasProperXmlStructure(xmlString: string): boolean {`
			`// Check for at least one matching opening and closing tag`
			`for (const endTag of this.knownEndTags) {`
			`const startTag = endTag.replace('/', '');`
			`if (xmlString.includes(startTag) && xmlString.includes(endTag)) {`
			`return true;`
			`}`
			`}`

			`// If no specific tag is found but it has a basic XML structure`
			`return (`
			`(xmlString.includes('<?xml') && xmlString.includes('?>')) \|\|`
			`(xmlString.match(/<[^>]+>/) !== null && xmlString.match(/<\/[^>]+>/) !== null)`
			`);`
			`}`

			`/**`
			`* Check if the XML string contains binary data`
			`* @param xmlString XML string to check`
			`* @returns True if the XML contains binary data`
			`*/`
			`protected hasBinaryData(xmlString: string): boolean {`
			`// Check for common binary data indicators`
			`const binaryChars = ['\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005'];`
			`const consecutiveNulls = '\u0000\u0000\u0000';`

			`// Check for control characters that shouldn't be in XML`
			`if (binaryChars.some(char => xmlString.includes(char))) {`
			`return true;`
			`}`

			`// Check for consecutive null bytes which indicate binary data`
			`if (xmlString.includes(consecutiveNulls)) {`
			`return true;`
			`}`

			`// Check for high concentration of non-printable characters`
			`const nonPrintableCount = (xmlString.match(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g) \|\| []).length;`
			`if (nonPrintableCount > xmlString.length * 0.05) { // More than 5% non-printable`
			`return true;`
			`}`

			`return false;`
			`}`

feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`/**`
			`* Extract XML from a string`
			`* @param text Text to extract XML from`
			`* @param startIndex Index to start extraction from`
			`* @returns XML content or null if not found`
			`*/`
			`protected extractXmlFromString(text: string, startIndex: number = 0): string \| null {`
			`try {`
			`// Find the start of the XML document`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`let xmlStartIndex = text.indexOf('<?xml', startIndex);`

			`// If no XML declaration, try to find known elements`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`if (xmlStartIndex === -1) {`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`for (const format of this.knownFormats) {`
			const formatStartIndex = text.indexOf(`<${format.split(':').pop()}`, startIndex);
			`if (formatStartIndex !== -1) {`
			`xmlStartIndex = formatStartIndex;`
			`break;`
			`}`
			`}`

			`// Still didn't find any start marker`
			`if (xmlStartIndex === -1) {`
			`return null;`
			`}`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`}`

			`// Try to find the end of the XML document`
			`let xmlEndIndex = -1;`
			`for (const endTag of this.knownEndTags) {`
			`const endIndex = text.indexOf(endTag, xmlStartIndex);`
			`if (endIndex !== -1) {`
			`xmlEndIndex = endIndex + endTag.length;`
			`break;`
			`}`
			`}`

fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`// If no known end tag found, try to use a heuristic approach`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`if (xmlEndIndex === -1) {`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`// Try to find the last closing tag`
			`const lastClosingTagMatch = text.slice(xmlStartIndex).match(/<\/[^>]+>(?!.*<\/[^>]+>)/);`
			`if (lastClosingTagMatch && lastClosingTagMatch.index !== undefined) {`
			`xmlEndIndex = xmlStartIndex + lastClosingTagMatch.index + lastClosingTagMatch[0].length;`
			`} else {`
			`return null;`
			`}`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`}`

			`// Extract the XML content`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`const xmlContent = text.substring(xmlStartIndex, xmlEndIndex);`

			`// Validate the extracted content`
			`if (this.isValidXml(xmlContent)) {`
			`return xmlContent;`
			`}`

			`return null;`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`} catch (error) {`
			`console.error('Error extracting XML from string:', error);`
			`return null;`
			`}`
			`}`

			`/**`
			`* Decompress and decode XML content from a PDF stream`
			`* @param stream PDF stream containing XML data`
			`* @param fileName Name of the file (for logging)`
			`* @returns XML content or null if not valid`
			`*/`
			`protected async extractXmlFromStream(stream: PDFRawStream, fileName: string): Promise<string \| null> {`
			`try {`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`// Get the raw bytes from the stream`
			`const rawBytes = stream.getContents();`

			`// First try without decompression (in case the content is not compressed)`
			`let xmlContent = this.tryDecodeBuffer(rawBytes);`
			`if (xmlContent && this.isValidXml(xmlContent)) {`
			console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
			`return xmlContent;`
			`}`

			`// Try with decompression`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`try {`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			`const decompressedBytes = this.tryDecompress(rawBytes);`
			`if (decompressedBytes) {`
			`xmlContent = this.tryDecodeBuffer(decompressedBytes);`
			`if (xmlContent && this.isValidXml(xmlContent)) {`
			console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`);
			`return xmlContent;`
			`}`
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`}`
			`} catch (decompressError) {`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00			console.log(`Decompression failed for ${fileName}: ${decompressError}`);
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`}`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00
feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic 2025-04-03 20:08:02 +00:00			`return null;`
			`} catch (error) {`
			`console.error('Error extracting XML from stream:', error);`
			`return null;`
			`}`
			`}`
fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. 2025-04-04 12:14:41 +00:00
			`/**`
			`* Try to decompress a buffer using different methods`
			`* @param buffer Buffer to decompress`
			`* @returns Decompressed buffer or null if decompression failed`
			`*/`
			`protected tryDecompress(buffer: Uint8Array): Uint8Array \| null {`
			`try {`
			`// Try pako inflate (for deflate/zlib compression)`
			`return pako.inflate(buffer);`
			`} catch (error) {`
			`// If pako fails, try other methods if needed`
			`console.warn('Pako decompression failed, might be uncompressed or using a different algorithm');`
			`return null;`
			`}`
			`}`

			`/**`
			`* Try to decode a buffer to a string using different encodings`
			`* @param buffer Buffer to decode`
			`* @returns Decoded string or null if decoding failed`
			`*/`
			`protected tryDecodeBuffer(buffer: Uint8Array): string \| null {`
			`try {`
			`// Try UTF-8 first`
			`let content = new TextDecoder('utf-8').decode(buffer);`
			`if (this.isPlausibleXml(content)) {`
			`return content;`
			`}`

			`// Try ISO-8859-1 (Latin1)`
			`content = this.decodeLatin1(buffer);`
			`if (this.isPlausibleXml(content)) {`
			`return content;`
			`}`

			`return null;`
			`} catch (error) {`
			`console.warn('Error decoding buffer:', error);`
			`return null;`
			`}`
			`}`

			`/**`
			`* Decode a buffer using ISO-8859-1 (Latin1) encoding`
			`* @param buffer Buffer to decode`
			`* @returns Decoded string`
			`*/`
			`protected decodeLatin1(buffer: Uint8Array): string {`
			`return Array.from(buffer)`
			`.map(byte => String.fromCharCode(byte))`
			`.join('');`
			`}`

			`/**`
			`* Check if a string is plausibly XML (quick check before validation)`
			`* @param content String to check`
			`* @returns True if the string is plausibly XML`
			`*/`
			`protected isPlausibleXml(content: string): boolean {`
			`return content.includes('<') &&`
			`content.includes('>') &&`
			`(content.includes('<?xml') \|\|`
			`this.knownFormats.some(format => content.includes(format)));`
			`}`
			`}`