// xinvoice/ts/formats/pdf/pdf.extractor.ts

import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
import * as pako from 'pako';

/**
 * Extracts embedded invoice XML from PDF files.
 *
 * Two strategies are tried in order: a structured lookup of the PDF's
 * embedded-files list through pdf-lib, then a raw scan of the PDF bytes for
 * an uncompressed XML document. Extraction is best-effort: failures are
 * logged to the console and reported to the caller as `null`.
 */
export class PDFExtractor {
  /** Lower-case file-name fragments that mark an embedded file as invoice XML. */
  private static readonly XML_FILE_NAME_HINTS = ['.xml', 'factur-x', 'zugferd', 'xrechnung'];

  /** Patterns whose presence in the raw PDF suggests an uncompressed XML document. */
  private static readonly XML_PATTERNS = [
    /<\?xml[^>]*\?>/i,
    /<CrossIndustryInvoice[^>]*>/i,
    /<Invoice[^>]*>/i,
    /<CreditNote[^>]*>/i,
    /<rsm:CrossIndustryInvoice[^>]*>/i
  ];

  /** Closing root tags, in priority order, used to find the end of the XML document. */
  private static readonly XML_END_TAGS = [
    '</CrossIndustryInvoice>',
    '</Invoice>',
    '</CreditNote>',
    '</rsm:CrossIndustryInvoice>',
    '</ubl:Invoice>',
    '</ubl:CreditNote>'
  ];

  /** Root element names of the invoice formats this extractor accepts. */
  private static readonly KNOWN_FORMATS = [
    'CrossIndustryInvoice',
    'Invoice',
    'CreditNote',
    'ubl:Invoice',
    'ubl:CreditNote'
  ];

  /**
   * Control characters that XML 1.0 does not allow (everything below U+0020
   * except tab, line feed and carriage return). Their presence means the
   * candidate text is binary data rather than XML.
   */
  // eslint-disable-next-line no-control-regex
  private static readonly ILLEGAL_XML_CONTROL_CHARS = /[\u0000-\u0008\u000B\u000C\u000E-\u001F]/;

  /** Candidates shorter than this are rejected as too short to be an invoice. */
  private static readonly MIN_XML_LENGTH = 100;

  /**
   * Extracts XML from a PDF buffer
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      // First try the standard extraction
      const standardXml = await this.standardExtraction(pdfBuffer);
      if (standardXml && this.isValidXml(standardXml)) {
        return standardXml;
      }

      // If standard extraction fails, try alternative methods
      const alternativeXml = await this.alternativeExtraction(pdfBuffer);
      if (alternativeXml && this.isValidXml(alternativeXml)) {
        return alternativeXml;
      }

      // If all extraction methods fail, return null
      console.warn('All extraction methods failed, no valid XML found in PDF');
      return null;
    } catch (error) {
      console.error('Error extracting XML from PDF:', error);
      return null;
    }
  }

  /**
   * Standard extraction method using PDF-lib.
   *
   * Walks catalog /Names -> /EmbeddedFiles -> /Names and takes the first
   * entry whose file name looks like invoice XML and whose /EF dictionary
   * holds a raw stream. Only a flat /Names array is handled; anything else
   * at that position yields `null`.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  private async standardExtraction(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBuffer);

      const namesDictObj = pdfDoc.catalog.lookup(PDFName.of('Names'));
      if (!(namesDictObj instanceof PDFDict)) {
        console.warn('No Names dictionary found in PDF! This PDF does not contain embedded files.');
        return null;
      }

      const embeddedFilesDictObj = namesDictObj.lookup(PDFName.of('EmbeddedFiles'));
      if (!(embeddedFilesDictObj instanceof PDFDict)) {
        console.warn('No EmbeddedFiles dictionary found! This PDF does not contain embedded files.');
        return null;
      }

      const filesSpecObj = embeddedFilesDictObj.lookup(PDFName.of('Names'));
      if (!(filesSpecObj instanceof PDFArray)) {
        console.warn('No files specified in EmbeddedFiles dictionary!');
        return null;
      }

      // Try to find an XML file in the embedded files
      let xmlFile: PDFRawStream | undefined;
      let xmlFileName: string | undefined;

      // The array alternates [name, file spec, name, file spec, ...];
      // `i + 1 < size` keeps a dangling trailing name from being paired with nothing.
      const entryCount = filesSpecObj.size();
      for (let i = 0; i + 1 < entryCount && !xmlFile; i += 2) {
        const fileNameObj = filesSpecObj.lookup(i);
        const fileSpecObj = filesSpecObj.lookup(i + 1);

        if (!(fileNameObj instanceof PDFString) || !(fileSpecObj instanceof PDFDict)) {
          continue;
        }

        // decodeText() yields the actual name; toString() would return the
        // serialized PDF literal including parentheses and escape sequences.
        const fileName = fileNameObj.decodeText();
        if (!this.looksLikeInvoiceXmlName(fileName)) {
          continue;
        }

        const efDictObj = fileSpecObj.lookup(PDFName.of('EF'));
        if (!(efDictObj instanceof PDFDict)) {
          continue;
        }

        // The embedded stream is normally under /F; /UF is checked as a fallback.
        for (const key of ['F', 'UF']) {
          const maybeStream = efDictObj.lookup(PDFName.of(key));
          if (maybeStream instanceof PDFRawStream) {
            xmlFile = maybeStream;
            xmlFileName = fileName;
            break;
          }
        }
      }

      // If no XML file was found, return null
      if (!xmlFile) {
        console.warn('No embedded XML file found in the PDF!');
        return null;
      }

      return this.decodeEmbeddedXml(xmlFile.getContents(), xmlFileName);
    } catch (error) {
      console.error('Error in standard extraction:', error);
      return null;
    }
  }

  /**
   * Checks whether an embedded file name looks like invoice XML
   * (by extension or by a known standard file name).
   * @param fileName Decoded embedded file name
   * @returns True if the name contains one of the known hints
   */
  private looksLikeInvoiceXmlName(fileName: string): boolean {
    const lowered = fileName.toLowerCase();
    return PDFExtractor.XML_FILE_NAME_HINTS.some(hint => lowered.includes(hint));
  }

  /**
   * Turns the bytes of an embedded file stream into XML text.
   * Inflating with pako is tried first; if that throws or the result is not
   * valid invoice XML, the bytes are decoded as they are.
   * @param streamBytes Stream contents as stored in the PDF
   * @param fileName Embedded file name, used for log messages only
   * @returns XML content or null if neither form validates
   */
  private decodeEmbeddedXml(streamBytes: Uint8Array, fileName: string | undefined): string | null {
    let inflateFailed = false;

    try {
      // Pass the typed array itself. Its `.buffer` is the whole backing
      // ArrayBuffer, which can be larger than the stream when the contents
      // are a view into a bigger allocation.
      const xmlBytes = pako.inflate(streamBytes);
      const xmlContent = new TextDecoder('utf-8').decode(xmlBytes);

      if (this.isValidXml(xmlContent)) {
        console.log(`Successfully extracted XML from PDF file. File name: ${fileName}`);
        return xmlContent;
      }

      console.log('Decompression succeeded but XML is not valid, trying without decompression...');
    } catch (decompressError) {
      inflateFailed = true;
      console.log('Decompression failed, trying without decompression...');
    }

    const rawXmlContent = new TextDecoder('utf-8').decode(streamBytes);
    if (this.isValidXml(rawXmlContent)) {
      console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
      return rawXmlContent;
    }

    console.log(
      inflateFailed
        ? 'Uncompressed XML content is not valid'
        : 'Neither decompressed nor raw XML content is valid'
    );
    return null;
  }

  /**
   * Alternative extraction method using string search over the whole PDF.
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  private async alternativeExtraction(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      // View over the caller's bytes (no copy). latin1 maps each byte to
      // exactly one code unit, so binary sections cannot corrupt or shift
      // the text being searched.
      const pdfBytes = Buffer.from(pdfBuffer.buffer, pdfBuffer.byteOffset, pdfBuffer.byteLength);
      const pdfString = pdfBytes.toString('latin1');

      // Look for common XML patterns in the PDF
      let firstMatch: RegExpMatchArray | null = null;
      for (const pattern of PDFExtractor.XML_PATTERNS) {
        firstMatch = pdfString.match(pattern);
        if (firstMatch) {
          break;
        }
      }
      if (!firstMatch) {
        return null;
      }
      console.log(`Found XML pattern in PDF: ${firstMatch[0]}`);

      const latin1Xml = this.extractXmlFromString(pdfString);
      if (!latin1Xml) {
        return null;
      }

      // Turn the matched span back into bytes and decode it as UTF-8.
      const xmlContent = Buffer.from(latin1Xml, 'latin1').toString('utf8');
      console.log('Successfully extracted XML from PDF string');
      return xmlContent;
    } catch (error) {
      console.error('Error in alternative extraction:', error);
      return null;
    }
  }

  /**
   * Extracts XML from a string: the span from the first `<?xml` up to and
   * including the first known closing root tag that follows it.
   * @param pdfString PDF string
   * @returns XML content or null if not found
   */
  private extractXmlFromString(pdfString: string): string | null {
    try {
      const xmlStartIndex = pdfString.indexOf('<?xml');
      if (xmlStartIndex === -1) {
        return null;
      }

      // Search from the declaration onwards: a closing tag that appears
      // before it belongs to something else and must not end the span.
      for (const endTag of PDFExtractor.XML_END_TAGS) {
        const endIndex = pdfString.indexOf(endTag, xmlStartIndex);
        if (endIndex !== -1) {
          return pdfString.substring(xmlStartIndex, endIndex + endTag.length);
        }
      }

      return null;
    } catch (error) {
      console.error('Error extracting XML from string:', error);
      return null;
    }
  }

  /**
   * Checks if an XML string is valid.
   *
   * This is a plausibility check, not an XML parse: the text must contain an
   * XML declaration and a known invoice root name, must not contain control
   * characters that XML 1.0 forbids, and must meet the minimum length.
   *
   * @param xmlString XML string to check
   * @returns True if the XML is valid
   */
  private isValidXml(xmlString: string): boolean {
    if (typeof xmlString !== 'string') {
      return false;
    }

    if (!xmlString.includes('<?xml')) {
      return false;
    }

    if (!PDFExtractor.KNOWN_FORMATS.some(format => xmlString.includes(format))) {
      return false;
    }

    if (PDFExtractor.ILLEGAL_XML_CONTROL_CHARS.test(xmlString)) {
      return false;
    }

    return xmlString.length >= PDFExtractor.MIN_XML_LENGTH;
  }
}
update 2025-04-03 15:53:08 +00:00			`import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';`
			`import * as pako from 'pako';`

			`/**`
			`* Class for extracting XML from PDF files`
			`*/`
			`export class PDFExtractor {`
			`/**`
			`* Extracts XML from a PDF buffer`
			`* @param pdfBuffer PDF buffer`
			`* @returns XML content or null if not found`
			`*/`
			`public async extractXml(pdfBuffer: Uint8Array \| Buffer): Promise<string \| null> {`
update 2025-04-03 17:21:36 +00:00			`try {`
			`// First try the standard extraction`
			`const standardXml = await this.standardExtraction(pdfBuffer);`
			`if (standardXml && this.isValidXml(standardXml)) {`
			`return standardXml;`
			`}`

			`// If standard extraction fails, try alternative methods`
			`const alternativeXml = await this.alternativeExtraction(pdfBuffer);`
			`if (alternativeXml && this.isValidXml(alternativeXml)) {`
			`return alternativeXml;`
			`}`

			`// If all extraction methods fail, return null`
			`console.warn('All extraction methods failed, no valid XML found in PDF');`
			`return null;`
			`} catch (error) {`
			`console.error('Error extracting XML from PDF:', error);`
			`return null;`
			`}`
			`}`

			`/**`
			`* Standard extraction method using PDF-lib`
			`* @param pdfBuffer PDF buffer`
			`* @returns XML content or null if not found`
			`*/`
			`private async standardExtraction(pdfBuffer: Uint8Array \| Buffer): Promise<string \| null> {`
update 2025-04-03 15:53:08 +00:00			`try {`
			`const pdfDoc = await PDFDocument.load(pdfBuffer);`

			`// Get the document's metadata dictionary`
			`const namesDictObj = pdfDoc.catalog.lookup(PDFName.of('Names'));`
			`if (!(namesDictObj instanceof PDFDict)) {`
			`console.warn('No Names dictionary found in PDF! This PDF does not contain embedded files.');`
			`return null;`
			`}`

			`const embeddedFilesDictObj = namesDictObj.lookup(PDFName.of('EmbeddedFiles'));`
			`if (!(embeddedFilesDictObj instanceof PDFDict)) {`
			`console.warn('No EmbeddedFiles dictionary found! This PDF does not contain embedded files.');`
			`return null;`
			`}`

			`const filesSpecObj = embeddedFilesDictObj.lookup(PDFName.of('Names'));`
			`if (!(filesSpecObj instanceof PDFArray)) {`
			`console.warn('No files specified in EmbeddedFiles dictionary!');`
			`return null;`
			`}`

			`// Try to find an XML file in the embedded files`
			`let xmlFile: PDFRawStream \| undefined;`
			`let xmlFileName: string \| undefined;`

			`for (let i = 0; i < filesSpecObj.size(); i += 2) {`
			`const fileNameObj = filesSpecObj.lookup(i);`
			`const fileSpecObj = filesSpecObj.lookup(i + 1);`

			`if (!(fileNameObj instanceof PDFString)) {`
			`continue;`
			`}`
			`if (!(fileSpecObj instanceof PDFDict)) {`
			`continue;`
			`}`

			`// Get the filename as string`
			`const fileName = fileNameObj.toString();`
update 2025-04-03 17:21:36 +00:00
update 2025-04-03 15:53:08 +00:00			`// Check if it's an XML file (checking both extension and known standard filenames)`
update 2025-04-03 17:21:36 +00:00			`if (fileName.toLowerCase().includes('.xml') \|\|`
update 2025-04-03 15:53:08 +00:00			`fileName.toLowerCase().includes('factur-x') \|\|`
			`fileName.toLowerCase().includes('zugferd') \|\|`
			`fileName.toLowerCase().includes('xrechnung')) {`
update 2025-04-03 17:21:36 +00:00
update 2025-04-03 15:53:08 +00:00			`const efDictObj = fileSpecObj.lookup(PDFName.of('EF'));`
			`if (!(efDictObj instanceof PDFDict)) {`
			`continue;`
			`}`

			`const maybeStream = efDictObj.lookup(PDFName.of('F'));`
			`if (maybeStream instanceof PDFRawStream) {`
			`// Found an XML file - save it`
			`xmlFile = maybeStream;`
			`xmlFileName = fileName;`
			`break;`
			`}`
			`}`
			`}`

			`// If no XML file was found, return null`
			`if (!xmlFile) {`
			`console.warn('No embedded XML file found in the PDF!');`
			`return null;`
			`}`

			`// Decompress and decode the XML content`
working 2025-04-03 16:41:10 +00:00			`try {`
update 2025-04-03 17:21:36 +00:00			`// Try to decompress with pako`
working 2025-04-03 16:41:10 +00:00			`const xmlCompressedBytes = xmlFile.getContents().buffer;`
			`const xmlBytes = pako.inflate(xmlCompressedBytes);`
			`const xmlContent = new TextDecoder('utf-8').decode(xmlBytes);`
update 2025-04-03 15:53:08 +00:00
update 2025-04-03 17:21:36 +00:00			`// Check if the XML content is valid`
			`if (this.isValidXml(xmlContent)) {`
			console.log(`Successfully extracted XML from PDF file. File name: ${xmlFileName}`);
			`return xmlContent;`
			`}`

			`// If we get here, the XML content is not valid, try without decompression`
			`console.log('Decompression succeeded but XML is not valid, trying without decompression...');`
			`const rawXmlBytes = xmlFile.getContents();`
			`const rawXmlContent = new TextDecoder('utf-8').decode(rawXmlBytes);`

			`if (this.isValidXml(rawXmlContent)) {`
			console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${xmlFileName}`);
			`return rawXmlContent;`
			`}`

			`// If we get here, neither the decompressed nor the raw XML content is valid`
			`console.log('Neither decompressed nor raw XML content is valid');`
			`return null;`
working 2025-04-03 16:41:10 +00:00			`} catch (decompressError) {`
update 2025-04-03 17:21:36 +00:00			`// Decompression failed, try without decompression`
working 2025-04-03 16:41:10 +00:00			`console.log('Decompression failed, trying without decompression...');`
			`try {`
			`const xmlBytes = xmlFile.getContents();`
			`const xmlContent = new TextDecoder('utf-8').decode(xmlBytes);`
update 2025-04-03 17:21:36 +00:00
			`if (this.isValidXml(xmlContent)) {`
			console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${xmlFileName}`);
			`return xmlContent;`
			`}`

			`// If we get here, the XML content is not valid`
			`console.log('Uncompressed XML content is not valid');`
			`return null;`
working 2025-04-03 16:41:10 +00:00			`} catch (decodeError) {`
			`console.error('Error decoding XML content:', decodeError);`
			`return null;`
			`}`
			`}`
update 2025-04-03 15:53:08 +00:00			`} catch (error) {`
update 2025-04-03 17:21:36 +00:00			`console.error('Error in standard extraction:', error);`
			`return null;`
			`}`
			`}`

			`/**`
			`* Alternative extraction method using string search`
			`* @param pdfBuffer PDF buffer`
			`* @returns XML content or null if not found`
			`*/`
			`private async alternativeExtraction(pdfBuffer: Uint8Array \| Buffer): Promise<string \| null> {`
			`try {`
			`// Convert buffer to string and look for XML patterns`
			`const pdfString = Buffer.from(pdfBuffer).toString('utf8', 0, Math.min(pdfBuffer.length, 10000));`

			`// Look for common XML patterns in the PDF`
			`const xmlPatterns = [`
			`/<\?xml[^>]*\?>/i,`
			`/<CrossIndustryInvoice[^>]*>/i,`
			`/<Invoice[^>]*>/i,`
			`/<CreditNote[^>]*>/i,`
			`/<rsm:CrossIndustryInvoice[^>]*>/i`
			`];`

			`for (const pattern of xmlPatterns) {`
			`const match = pdfString.match(pattern);`
			`if (match) {`
			console.log(`Found XML pattern in PDF: ${match[0]}`);

			`// Try to extract the XML content`
			`const xmlContent = this.extractXmlFromString(pdfString);`
			`if (xmlContent) {`
			`console.log('Successfully extracted XML from PDF string');`
			`return xmlContent;`
			`}`
			`}`
			`}`

			`return null;`
			`} catch (error) {`
			`console.error('Error in alternative extraction:', error);`
			`return null;`
			`}`
			`}`

			`/**`
			`* Extracts XML from a string`
			`* @param pdfString PDF string`
			`* @returns XML content or null if not found`
			`*/`
			`private extractXmlFromString(pdfString: string): string \| null {`
			`try {`
			`// Look for XML start and end tags`
			`const xmlStartIndex = pdfString.indexOf('<?xml');`
			`if (xmlStartIndex === -1) {`
			`return null;`
			`}`

			`// Try to find the end of the XML document`
			`const possibleEndTags = [`
			`'</CrossIndustryInvoice>',`
			`'</Invoice>',`
			`'</CreditNote>',`
			`'</rsm:CrossIndustryInvoice>'`
			`];`

			`let xmlEndIndex = -1;`
			`for (const endTag of possibleEndTags) {`
			`const endIndex = pdfString.indexOf(endTag);`
			`if (endIndex !== -1) {`
			`xmlEndIndex = endIndex + endTag.length;`
			`break;`
			`}`
			`}`

			`if (xmlEndIndex === -1) {`
			`return null;`
			`}`

			`// Extract the XML content`
			`return pdfString.substring(xmlStartIndex, xmlEndIndex);`
			`} catch (error) {`
			`console.error('Error extracting XML from string:', error);`
working 2025-04-03 16:41:10 +00:00			`return null;`
update 2025-04-03 15:53:08 +00:00			`}`
			`}`
update 2025-04-03 17:21:36 +00:00
			`/**`
			`* Checks if an XML string is valid`
			`* @param xmlString XML string to check`
			`* @returns True if the XML is valid`
			`*/`
			`private isValidXml(xmlString: string): boolean {`
			`try {`
			`// Check if the XML string contains basic XML structure`
			`if (!xmlString.includes('<?xml')) {`
			`return false;`
			`}`

			`// Check if the XML string contains known invoice formats`
			`const knownFormats = [`
			`'CrossIndustryInvoice',`
			`'Invoice',`
			`'CreditNote',`
			`'ubl:Invoice',`
			`'ubl:CreditNote'`
			`];`

			`const hasKnownFormat = knownFormats.some(format => xmlString.includes(format));`
			`if (!hasKnownFormat) {`
			`return false;`
			`}`

			`// Check if the XML string contains binary data or invalid characters`
			`const invalidChars = ['\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005'];`
			`const hasBinaryData = invalidChars.some(char => xmlString.includes(char));`
			`if (hasBinaryData) {`
			`return false;`
			`}`

			`// Check if the XML string is too short`
			`if (xmlString.length < 100) {`
			`return false;`
			`}`

			`return true;`
			`} catch (error) {`
			`console.error('Error validating XML:', error);`
			`return false;`
			`}`
			`}`
update 2025-04-03 15:53:08 +00:00			`}`