xinvoice/ts/formats/pdf/pdf.extractor.ts
2025-04-03 15:53:08 +00:00

95 lines
3.2 KiB
TypeScript

import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
import * as pako from 'pako';
/**
* Class for extracting XML from PDF files
*/
export class PDFExtractor {
/**
* Extracts XML from a PDF buffer
* @param pdfBuffer PDF buffer
* @returns XML content or null if not found
*/
public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
try {
const pdfDoc = await PDFDocument.load(pdfBuffer);
// Get the document's metadata dictionary
const namesDictObj = pdfDoc.catalog.lookup(PDFName.of('Names'));
if (!(namesDictObj instanceof PDFDict)) {
console.warn('No Names dictionary found in PDF! This PDF does not contain embedded files.');
return null;
}
const embeddedFilesDictObj = namesDictObj.lookup(PDFName.of('EmbeddedFiles'));
if (!(embeddedFilesDictObj instanceof PDFDict)) {
console.warn('No EmbeddedFiles dictionary found! This PDF does not contain embedded files.');
return null;
}
const filesSpecObj = embeddedFilesDictObj.lookup(PDFName.of('Names'));
if (!(filesSpecObj instanceof PDFArray)) {
console.warn('No files specified in EmbeddedFiles dictionary!');
return null;
}
// Try to find an XML file in the embedded files
let xmlFile: PDFRawStream | undefined;
let xmlFileName: string | undefined;
for (let i = 0; i < filesSpecObj.size(); i += 2) {
const fileNameObj = filesSpecObj.lookup(i);
const fileSpecObj = filesSpecObj.lookup(i + 1);
if (!(fileNameObj instanceof PDFString)) {
continue;
}
if (!(fileSpecObj instanceof PDFDict)) {
continue;
}
// Get the filename as string
const fileName = fileNameObj.toString();
// Check if it's an XML file (checking both extension and known standard filenames)
if (fileName.toLowerCase().includes('.xml') ||
fileName.toLowerCase().includes('factur-x') ||
fileName.toLowerCase().includes('zugferd') ||
fileName.toLowerCase().includes('xrechnung')) {
const efDictObj = fileSpecObj.lookup(PDFName.of('EF'));
if (!(efDictObj instanceof PDFDict)) {
continue;
}
const maybeStream = efDictObj.lookup(PDFName.of('F'));
if (maybeStream instanceof PDFRawStream) {
// Found an XML file - save it
xmlFile = maybeStream;
xmlFileName = fileName;
break;
}
}
}
// If no XML file was found, return null
if (!xmlFile) {
console.warn('No embedded XML file found in the PDF!');
return null;
}
// Decompress and decode the XML content
const xmlCompressedBytes = xmlFile.getContents().buffer;
const xmlBytes = pako.inflate(xmlCompressedBytes);
const xmlContent = new TextDecoder('utf-8').decode(xmlBytes);
console.log(`Successfully extracted XML from PDF file. File name: ${xmlFileName}`);
return xmlContent;
} catch (error) {
console.error('Error extracting or parsing embedded XML from PDF:', error);
throw error;
}
}
}