xinvoice/ts/formats/pdf/pdf.extractor.ts

import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
import * as pako from 'pako';

/**
 * Extracts an embedded XML invoice (for example Factur-X, ZUGFeRD or
 * XRechnung) from the embedded-files list of a PDF document.
 */
export class PDFExtractor {
  /** Lower-case substrings that mark an embedded file name as an invoice XML. */
  private static readonly XML_NAME_HINTS: readonly string[] = [
    '.xml',
    'factur-x',
    'zugferd',
    'xrechnung',
  ];

  /**
   * Extracts the first embedded XML file from a PDF buffer.
   *
   * Walks catalog `/Names` -> `/EmbeddedFiles` -> `/Names` and returns the
   * decoded content of the first entry whose name looks like an invoice XML.
   * Only a `/Names` array directly on the `/EmbeddedFiles` dictionary is read;
   * name trees that are split into `/Kids` nodes are not traversed.
   *
   * @param pdfBuffer Raw bytes of the PDF document
   * @returns The XML content decoded as UTF-8, or `null` when no matching
   *   embedded file is present
   * @throws Re-throws (after logging) any error raised while loading the PDF
   *   or decoding the embedded stream
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBuffer);

      // The catalog's /Names dictionary is where /EmbeddedFiles is registered
      const namesDictObj = pdfDoc.catalog.lookup(PDFName.of('Names'));
      if (!(namesDictObj instanceof PDFDict)) {
        console.warn('No Names dictionary found in PDF! This PDF does not contain embedded files.');
        return null;
      }

      const embeddedFilesDictObj = namesDictObj.lookup(PDFName.of('EmbeddedFiles'));
      if (!(embeddedFilesDictObj instanceof PDFDict)) {
        console.warn('No EmbeddedFiles dictionary found! This PDF does not contain embedded files.');
        return null;
      }

      const filesSpecObj = embeddedFilesDictObj.lookup(PDFName.of('Names'));
      if (!(filesSpecObj instanceof PDFArray)) {
        console.warn('No files specified in EmbeddedFiles dictionary!');
        return null;
      }

      // Try to find an XML file in the embedded files
      let xmlFile: PDFRawStream | undefined;
      let xmlFileName: string | undefined;

      // The array alternates [name, file spec, name, file spec, ...]. The
      // `i + 1` bound ignores a trailing name that has no file spec after it.
      for (let i = 0; i + 1 < filesSpecObj.size(); i += 2) {
        const fileName = PDFExtractor.decodeTextString(filesSpecObj.lookup(i));
        const fileSpecObj = filesSpecObj.lookup(i + 1);

        if (fileName === undefined || !(fileSpecObj instanceof PDFDict)) {
          continue;
        }
        if (!PDFExtractor.isXmlFileName(fileName)) {
          continue;
        }

        const efDictObj = fileSpecObj.lookup(PDFName.of('EF'));
        if (!(efDictObj instanceof PDFDict)) {
          continue;
        }

        const maybeStream = efDictObj.lookup(PDFName.of('F'));
        if (maybeStream instanceof PDFRawStream) {
          // Found an XML file - keep the first match
          xmlFile = maybeStream;
          xmlFileName = fileName;
          break;
        }
      }

      // If no XML file was found, return null
      if (!xmlFile) {
        console.warn('No embedded XML file found in the PDF!');
        return null;
      }

      // Decode the stream (inflating only when it is actually filtered)
      const xmlBytes = PDFExtractor.decodeStreamContents(xmlFile);
      const xmlContent = new TextDecoder('utf-8').decode(xmlBytes);

      console.log(`Successfully extracted XML from PDF file. File name: ${xmlFileName}`);

      return xmlContent;
    } catch (error) {
      console.error('Error extracting or parsing embedded XML from PDF:', error);
      throw error;
    }
  }

  /**
   * Decodes a PDF text-string object into a JavaScript string.
   *
   * The check is structural (any object exposing `decodeText()`), so a key
   * written as a hex string is accepted as well as a literal `PDFString`,
   * without this file having to import another pdf-lib class.
   *
   * @param obj Object looked up from the PDF
   * @returns The decoded text, or `undefined` when `obj` is not a text string
   */
  private static decodeTextString(obj: unknown): string | undefined {
    if (typeof obj !== 'object' || obj === null) {
      return undefined;
    }
    const candidate = obj as { decodeText?: unknown };
    if (typeof candidate.decodeText !== 'function') {
      return undefined;
    }
    // Call through the original object so `this` stays bound correctly
    return (obj as { decodeText(): string }).decodeText();
  }

  /**
   * Checks a file name against the known invoice XML hints.
   *
   * @param fileName Decoded embedded file name
   * @returns `true` when the lower-cased name contains any hint substring
   */
  private static isXmlFileName(fileName: string): boolean {
    const lowered = fileName.toLowerCase();
    return PDFExtractor.XML_NAME_HINTS.some((hint) => lowered.includes(hint));
  }

  /**
   * Returns the decoded bytes of an embedded file stream.
   *
   * A stream without a `/Filter` entry (or with an empty filter array) is
   * returned unchanged. Any filtered stream is inflated with pako, so a filter
   * that is not zlib-compatible surfaces as an inflate error.
   *
   * @param stream Embedded file stream taken from the file spec's `/EF` entry
   * @returns The decoded stream bytes
   */
  private static decodeStreamContents(stream: PDFRawStream): Uint8Array {
    // Pass the view itself: `.buffer` would expose the whole backing
    // ArrayBuffer, ignoring the view's byteOffset and byteLength.
    const contents = stream.getContents();

    const filter = stream.dict.lookup(PDFName.of('Filter'));
    const isFiltered =
      filter instanceof PDFName || (filter instanceof PDFArray && filter.size() > 0);

    return isFiltered ? pako.inflate(contents) : contents;
  }
}