update
This commit is contained in:
77
ts/formats/pdf/pdf.embedder.ts
Normal file
77
ts/formats/pdf/pdf.embedder.ts
Normal file
@ -0,0 +1,77 @@
|
||||
import { PDFDocument } from 'pdf-lib';
|
||||
import type { IPdf } from '../../interfaces/common.js';
|
||||
|
||||
/**
 * Embeds XML documents into PDF files as file attachments, using pdf-lib.
 */
export class PDFEmbedder {
  /**
   * Attaches an XML document to a PDF and returns the re-serialized PDF.
   *
   * @param pdfBuffer Bytes of the PDF to load
   * @param xmlContent XML text to attach; encoded as UTF-8 before embedding
   * @param filename Name of the attachment; lower-cased before it is used
   * @param description Description stored alongside the attachment
   * @returns Bytes of the saved PDF, including the attachment
   * @throws Rethrows any error raised while loading, attaching or saving,
   *         after logging it to the console
   */
  public async embedXml(
    pdfBuffer: Uint8Array | Buffer,
    xmlContent: string,
    filename: string = 'invoice.xml',
    description: string = 'XML Invoice'
  ): Promise<Uint8Array> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBuffer);

      // TextEncoder always produces UTF-8 bytes.
      const xmlBytes = new TextEncoder().encode(xmlContent);

      // The attachment name is normalized to lowercase (the original code
      // notes this as "required by documentation"). A local is used so the
      // caller-visible parameter is not reassigned.
      const attachmentName = filename.toLowerCase();

      // attach() returns a Promise. Awaiting it keeps any failure inside this
      // try/catch instead of surfacing as an unhandled rejection, and makes
      // sure the attachment is registered before save() runs.
      await pdfDoc.attach(xmlBytes, attachmentName, {
        mimeType: 'application/xml',
        description,
      });

      return await pdfDoc.save();
    } catch (error) {
      console.error('Error embedding XML into PDF:', error);
      throw error;
    }
  }

  /**
   * Embeds XML into a PDF and wraps the result in an IPdf object.
   *
   * @param pdfBuffer Bytes of the PDF to load
   * @param xmlContent XML text to attach
   * @param filename Name of the attachment (forwarded to embedXml)
   * @param description Description of the attachment (forwarded to embedXml)
   * @param pdfName Value for the returned object's `name`
   * @param pdfId Value for the returned object's `id`; defaults to a
   *              timestamp-based identifier
   * @returns IPdf whose `buffer` holds the modified PDF bytes and whose
   *          `metadata.textExtraction` is an empty string
   * @throws Propagates any error thrown by embedXml
   */
  public async createPdfWithXml(
    pdfBuffer: Uint8Array | Buffer,
    xmlContent: string,
    filename: string = 'invoice.xml',
    description: string = 'XML Invoice',
    pdfName: string = 'invoice.pdf',
    pdfId: string = `invoice-${Date.now()}`
  ): Promise<IPdf> {
    const modifiedPdfBytes = await this.embedXml(pdfBuffer, xmlContent, filename, description);

    return {
      name: pdfName,
      id: pdfId,
      metadata: {
        textExtraction: '',
      },
      buffer: modifiedPdfBytes,
    };
  }
}
|
94
ts/formats/pdf/pdf.extractor.ts
Normal file
94
ts/formats/pdf/pdf.extractor.ts
Normal file
@ -0,0 +1,94 @@
|
||||
import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString, PDFHexString } from 'pdf-lib';
import * as pako from 'pako';
|
||||
|
||||
/**
 * Extracts an embedded XML attachment from PDF files, using pdf-lib to walk
 * the document's EmbeddedFiles name tree.
 */
export class PDFExtractor {
  /**
   * Extracts the first embedded XML attachment from a PDF.
   *
   * An attachment counts as XML when its name contains `.xml`, `factur-x`,
   * `zugferd` or `xrechnung` (case-insensitive).
   *
   * @param pdfBuffer Bytes of the PDF to inspect
   * @returns The attachment decoded as UTF-8 text, or null when the PDF has
   *          no matching embedded file
   * @throws Rethrows any error raised while loading the PDF or decoding the
   *         attachment, after logging it to the console
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBuffer);

      // Catalog -> /Names -> /EmbeddedFiles is where attachments are registered.
      const namesDictObj = pdfDoc.catalog.lookup(PDFName.of('Names'));
      if (!(namesDictObj instanceof PDFDict)) {
        console.warn('No Names dictionary found in PDF! This PDF does not contain embedded files.');
        return null;
      }

      const embeddedFilesDictObj = namesDictObj.lookup(PDFName.of('EmbeddedFiles'));
      if (!(embeddedFilesDictObj instanceof PDFDict)) {
        console.warn('No EmbeddedFiles dictionary found! This PDF does not contain embedded files.');
        return null;
      }

      // The name tree may keep its entries at the root (/Names) or spread
      // across child nodes (/Kids); gather every /Names array reachable.
      const namesArrays = this.collectNamesArrays(embeddedFilesDictObj, new Set<PDFDict>());
      if (namesArrays.length === 0) {
        console.warn('No files specified in EmbeddedFiles dictionary!');
        return null;
      }

      const match = this.findXmlAttachment(namesArrays);
      if (!match) {
        console.warn('No embedded XML file found in the PDF!');
        return null;
      }

      const xmlBytes = this.decodeStreamBytes(match.stream);
      const xmlContent = new TextDecoder('utf-8').decode(xmlBytes);

      console.log(`Successfully extracted XML from PDF file. File name: ${match.name}`);

      return xmlContent;
    } catch (error) {
      console.error('Error extracting or parsing embedded XML from PDF:', error);
      throw error;
    }
  }

  /**
   * Collects every /Names array in a name tree, descending through /Kids.
   * `seen` guards against cycles in malformed documents.
   */
  private collectNamesArrays(node: PDFDict, seen: Set<PDFDict>): PDFArray[] {
    if (seen.has(node)) {
      return [];
    }
    seen.add(node);

    const found: PDFArray[] = [];

    const names = node.lookup(PDFName.of('Names'));
    if (names instanceof PDFArray) {
      found.push(names);
    }

    const kids = node.lookup(PDFName.of('Kids'));
    if (kids instanceof PDFArray) {
      for (let i = 0; i < kids.size(); i++) {
        const kid = kids.lookup(i);
        if (kid instanceof PDFDict) {
          found.push(...this.collectNamesArrays(kid, seen));
        }
      }
    }

    return found;
  }

  /**
   * Scans (name, file specification) pairs and returns the first one whose
   * name looks like XML and whose file specification holds an embedded stream.
   */
  private findXmlAttachment(
    namesArrays: PDFArray[]
  ): { stream: PDFRawStream; name: string } | undefined {
    for (const namesArray of namesArrays) {
      // Entries come in pairs; `i + 1 < size` skips a dangling trailing key.
      for (let i = 0; i + 1 < namesArray.size(); i += 2) {
        const fileNameObj = namesArray.lookup(i);
        const fileSpecObj = namesArray.lookup(i + 1);

        // Keys may be literal strings or hex strings; pdf-lib itself writes
        // attachment names as hex strings.
        if (!(fileNameObj instanceof PDFString) && !(fileNameObj instanceof PDFHexString)) {
          continue;
        }
        if (!(fileSpecObj instanceof PDFDict)) {
          continue;
        }

        // decodeText() yields the actual name; toString() would return the
        // PDF syntax form including its delimiters.
        const fileName = fileNameObj.decodeText();
        if (!this.isXmlFileName(fileName)) {
          continue;
        }

        const efDictObj = fileSpecObj.lookup(PDFName.of('EF'));
        if (!(efDictObj instanceof PDFDict)) {
          continue;
        }

        // Prefer /F, falling back to /UF when /F is missing.
        const maybeStream =
          efDictObj.lookup(PDFName.of('F')) ?? efDictObj.lookup(PDFName.of('UF'));
        if (maybeStream instanceof PDFRawStream) {
          return { stream: maybeStream, name: fileName };
        }
      }
    }

    return undefined;
  }

  /**
   * Checks a name against the XML extension and the known invoice-standard
   * filename fragments, ignoring case.
   */
  private isXmlFileName(fileName: string): boolean {
    const lowered = fileName.toLowerCase();
    return ['.xml', 'factur-x', 'zugferd', 'xrechnung'].some((marker) => lowered.includes(marker));
  }

  /**
   * Returns the stream's payload, inflating it when the stream declares a filter.
   */
  private decodeStreamBytes(stream: PDFRawStream): Uint8Array {
    // Pass the byte view itself rather than its backing ArrayBuffer, which
    // may be larger than the view.
    const rawBytes = stream.getContents();

    // Without a /Filter entry the payload is stored uncompressed; inflating
    // it would fail.
    const filter = stream.dict.lookup(PDFName.of('Filter'));
    if (filter === undefined) {
      return rawBytes;
    }

    // Any declared filter is treated as Flate, as the original code did.
    // NOTE(review): other filters (or filter chains) are not handled here.
    return pako.inflate(rawBytes);
  }
}
|
Reference in New Issue
Block a user