update

2025-04-03 15:53:08 +00:00
parent 3e8b5c2869
commit 21650f1181
49 changed files with 4835 additions and 2878 deletions
--- a/ts/formats/pdf/pdf.embedder.ts
+++ b/ts/formats/pdf/pdf.embedder.ts
@@ -0,0 +1,77 @@
+import { PDFDocument } from 'pdf-lib';
+import type { IPdf } from '../../interfaces/common.js';
+
+/**
+ * Class for embedding XML into PDF files
+ */
+export class PDFEmbedder {
+  /**
+   * Attaches an XML document to an existing PDF as an embedded file.
+   *
+   * The attachment name is lowercased before it is written, and the file is
+   * registered with the `application/xml` MIME type.
+   * @param pdfBuffer Bytes of the PDF that receives the attachment
+   * @param xmlContent XML text to attach; encoded to bytes with TextEncoder
+   * @param filename Name of the attachment (defaults to `invoice.xml`)
+   * @param description Description stored alongside the attachment
+   * @returns Bytes of the saved PDF, including the attachment
+   * @throws Rethrows any error raised while loading, attaching or saving
+   */
+  public async embedXml(
+    pdfBuffer: Uint8Array | Buffer,
+    xmlContent: string,
+    filename: string = 'invoice.xml',
+    description: string = 'XML Invoice'
+  ): Promise<Uint8Array> {
+    try {
+      const pdfDoc = await PDFDocument.load(pdfBuffer);
+
+      // Hand pdf-lib raw bytes rather than the XML string itself
+      const xmlBytes = new TextEncoder().encode(xmlContent);
+
+      // Attachment names are always stored in lowercase
+      const attachmentName = filename.toLowerCase();
+
+      pdfDoc.attach(xmlBytes, attachmentName, {
+        mimeType: 'application/xml',
+        description,
+      });
+
+      // Serializing the document yields the bytes that carry the attachment
+      return await pdfDoc.save();
+    } catch (error) {
+      console.error('Error embedding XML into PDF:', error);
+      throw error;
+    }
+  }
+
+  /**
+   * Embeds XML into a PDF and wraps the result in an IPdf record.
+   *
+   * The embedding itself is delegated to embedXml; `metadata.textExtraction`
+   * is always set to an empty string.
+   * @param pdfBuffer Bytes of the PDF that receives the attachment
+   * @param xmlContent XML text to attach
+   * @param filename Name of the attachment
+   * @param description Description stored alongside the attachment
+   * @param pdfName Value for the record's `name` field
+   * @param pdfId Value for the record's `id` field; defaults to a timestamp-based id
+   * @returns IPdf record whose buffer holds the PDF with the attachment
+   */
+  public async createPdfWithXml(
+    pdfBuffer: Uint8Array | Buffer,
+    xmlContent: string,
+    filename: string = 'invoice.xml',
+    description: string = 'XML Invoice',
+    pdfName: string = 'invoice.pdf',
+    pdfId: string = `invoice-${Date.now()}`
+  ): Promise<IPdf> {
+    const buffer = await this.embedXml(pdfBuffer, xmlContent, filename, description);
+    return {
+      name: pdfName,
+      id: pdfId,
+      metadata: { textExtraction: '' },
+      buffer,
+    };
+  }
+}
--- a/ts/formats/pdf/pdf.extractor.ts
+++ b/ts/formats/pdf/pdf.extractor.ts
@@ -0,0 +1,94 @@
+import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString, PDFHexString } from 'pdf-lib';
+import * as pako from 'pako';
+
+/**
+ * Class for extracting XML from PDF files
+ */
+export class PDFExtractor {
+  /**
+   * Extracts the first embedded XML attachment from a PDF.
+   *
+   * Only the flat `Names` array of the catalog's EmbeddedFiles entry is
+   * searched. An attachment counts as XML when its name contains `.xml`,
+   * `factur-x`, `zugferd` or `xrechnung` (case-insensitive).
+   * @param pdfBuffer Bytes of the PDF to inspect
+   * @returns Decoded XML text, or null when no matching attachment exists
+   * @throws Rethrows any load or decoding error after logging it
+   */
+  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
+    try {
+      const pdfDoc = await PDFDocument.load(pdfBuffer);
+
+      // Embedded files are registered under the catalog's Names dictionary
+      const namesDict = pdfDoc.catalog.lookup(PDFName.of('Names'));
+      if (!(namesDict instanceof PDFDict)) {
+        console.warn('No Names dictionary found in PDF! This PDF does not contain embedded files.');
+        return null;
+      }
+
+      const embeddedFilesDict = namesDict.lookup(PDFName.of('EmbeddedFiles'));
+      if (!(embeddedFilesDict instanceof PDFDict)) {
+        console.warn('No EmbeddedFiles dictionary found! This PDF does not contain embedded files.');
+        return null;
+      }
+
+      const entries = embeddedFilesDict.lookup(PDFName.of('Names'));
+      if (!(entries instanceof PDFArray)) {
+        console.warn('No files specified in EmbeddedFiles dictionary!');
+        return null;
+      }
+
+      // The array alternates between names and file specifications
+      for (let i = 0; i < entries.size(); i += 2) {
+        const nameObj = entries.lookup(i);
+        const fileSpec = entries.lookup(i + 1);
+        // A name may be stored as a literal string or as a hex string; accept both
+        if (!(nameObj instanceof PDFString || nameObj instanceof PDFHexString)) {
+          continue;
+        }
+        if (!(fileSpec instanceof PDFDict)) {
+          continue;
+        }
+
+        // decodeText() yields the plain name; toString() would return PDF syntax such as "(a.xml)"
+        const fileName = nameObj.decodeText();
+        const lowered = fileName.toLowerCase();
+        const markers = ['.xml', 'factur-x', 'zugferd', 'xrechnung'];
+        if (!markers.some((marker) => lowered.includes(marker))) {
+          continue;
+        }
+
+        const efDict = fileSpec.lookup(PDFName.of('EF'));
+        if (!(efDict instanceof PDFDict)) {
+          continue;
+        }
+
+        const stream = efDict.lookup(PDFName.of('F'));
+        if (!(stream instanceof PDFRawStream)) {
+          continue;
+        }
+
+        const xmlContent = new TextDecoder('utf-8').decode(this.readStreamBytes(stream));
+        console.log(`Successfully extracted XML from PDF file. File name: ${fileName}`);
+        return xmlContent;
+      }
+
+      console.warn('No embedded XML file found in the PDF!');
+      return null;
+    } catch (error) {
+      console.error('Error extracting or parsing embedded XML from PDF:', error);
+      throw error;
+    }
+  }
+
+  /**
+   * Returns the bytes of an embedded file stream. A stream that declares a
+   * Filter is inflated (the filter is assumed to be FlateDecode — TODO confirm
+   * for other writers); a stream without one is returned as stored.
+   */
+  private readStreamBytes(stream: PDFRawStream): Uint8Array {
+    // Use the Uint8Array view itself: its `.buffer` can span more bytes than the view
+    const raw = stream.getContents();
+    return stream.dict.has(PDFName.of('Filter')) ? pako.inflate(raw) : raw;
+  }
+}