95 lines
3.2 KiB
TypeScript
95 lines
3.2 KiB
TypeScript
import { PDFArray, PDFDict, PDFDocument, PDFHexString, PDFName, PDFRawStream, PDFString } from 'pdf-lib';
import * as pako from 'pako';
|
|
|
|
/**
 * Extracts an embedded XML attachment (for example a Factur-X, ZUGFeRD or
 * XRechnung invoice) from a PDF file.
 */
export class PDFExtractor {
  /**
   * Lower-case fragments that mark an attachment name as an XML candidate:
   * the generic `.xml` extension plus the well-known e-invoice file names.
   */
  private static readonly XML_NAME_MARKERS: ReadonlyArray<string> = [
    '.xml',
    'factur-x',
    'zugferd',
    'xrechnung',
  ];

  /**
   * Extracts the first embedded XML file from a PDF.
   *
   * @param pdfBuffer Raw bytes of the PDF document
   * @returns The decoded XML text, or `null` when the PDF has no embedded
   *          files or none of them looks like an XML attachment
   * @throws Rethrows (after logging) any error raised while parsing the PDF
   *         or decoding the attachment stream
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBuffer);

      // Catalog -> /Names -> /EmbeddedFiles is where attachments are registered.
      const namesDictObj = pdfDoc.catalog.lookup(PDFName.of('Names'));
      if (!(namesDictObj instanceof PDFDict)) {
        console.warn('No Names dictionary found in PDF! This PDF does not contain embedded files.');
        return null;
      }

      const embeddedFilesDictObj = namesDictObj.lookup(PDFName.of('EmbeddedFiles'));
      if (!(embeddedFilesDictObj instanceof PDFDict)) {
        console.warn('No EmbeddedFiles dictionary found! This PDF does not contain embedded files.');
        return null;
      }

      const nameArrays = this.collectNameArrays(embeddedFilesDictObj);
      if (nameArrays.length === 0) {
        console.warn('No files specified in EmbeddedFiles dictionary!');
        return null;
      }

      const attachment = this.findXmlAttachment(nameArrays);
      if (!attachment) {
        console.warn('No embedded XML file found in the PDF!');
        return null;
      }

      const xmlBytes = this.decodeStream(attachment.stream);
      const xmlContent = new TextDecoder('utf-8').decode(xmlBytes);

      console.log(`Successfully extracted XML from PDF file. File name: ${attachment.fileName}`);

      return xmlContent;
    } catch (error) {
      console.error('Error extracting or parsing embedded XML from PDF:', error);
      throw error;
    }
  }

  /**
   * Collects every `Names` array of a PDF name tree, in document order.
   *
   * A name tree node may hold its entries directly in `Names` or delegate to
   * child nodes through `Kids`, so both are followed. Visited nodes are
   * tracked so that a malformed, cyclic tree cannot loop forever.
   */
  private collectNameArrays(root: PDFDict): PDFArray[] {
    const leaves: PDFArray[] = [];
    const visited = new Set<PDFDict>();
    const pending: PDFDict[] = [root];

    while (pending.length > 0) {
      const node = pending.pop();
      if (node === undefined || visited.has(node)) {
        continue;
      }
      visited.add(node);

      const names = node.lookup(PDFName.of('Names'));
      if (names instanceof PDFArray) {
        leaves.push(names);
      }

      const kids = node.lookup(PDFName.of('Kids'));
      if (kids instanceof PDFArray) {
        // Push in reverse so the first kid is popped (and therefore visited) first.
        for (let i = kids.size() - 1; i >= 0; i--) {
          const kid = kids.lookup(i);
          if (kid instanceof PDFDict) {
            pending.push(kid);
          }
        }
      }
    }

    return leaves;
  }

  /**
   * Scans name-tree leaves (flat `[name, fileSpec, name, fileSpec, ...]`
   * arrays) and returns the first entry whose name looks like an XML file and
   * whose file specification carries an embedded stream.
   */
  private findXmlAttachment(
    nameArrays: PDFArray[],
  ): { fileName: string; stream: PDFRawStream } | undefined {
    for (const names of nameArrays) {
      for (let i = 0; i + 1 < names.size(); i += 2) {
        const fileNameObj = names.lookup(i);
        const fileSpecObj = names.lookup(i + 1);

        // Names may be written as literal or as hex strings; accept both.
        if (!(fileNameObj instanceof PDFString) && !(fileNameObj instanceof PDFHexString)) {
          continue;
        }
        if (!(fileSpecObj instanceof PDFDict)) {
          continue;
        }

        // decodeText() yields the actual name; toString() would return the
        // PDF source syntax (parentheses / hex digits) instead.
        const fileName = fileNameObj.decodeText();
        const lowerName = fileName.toLowerCase();
        const looksLikeXml = PDFExtractor.XML_NAME_MARKERS.some((marker) => lowerName.includes(marker));
        if (!looksLikeXml) {
          continue;
        }

        const efDictObj = fileSpecObj.lookup(PDFName.of('EF'));
        if (!(efDictObj instanceof PDFDict)) {
          continue;
        }

        // Prefer /F, fall back to /UF when /F is missing or not a stream.
        let streamObj = efDictObj.lookup(PDFName.of('F'));
        if (!(streamObj instanceof PDFRawStream)) {
          streamObj = efDictObj.lookup(PDFName.of('UF'));
        }
        if (streamObj instanceof PDFRawStream) {
          return { fileName, stream: streamObj };
        }
      }
    }

    return undefined;
  }

  /**
   * Returns the decoded bytes of an embedded file stream.
   *
   * The stream's `Filter` entry decides what has to be undone: no filter means
   * the bytes are stored as-is, `FlateDecode` means zlib-compressed. Any other
   * filter is rejected with an explicit error rather than a confusing zlib one.
   *
   * @throws Error when the stream uses a filter other than `FlateDecode`
   */
  private decodeStream(stream: PDFRawStream): Uint8Array {
    const filterObj = stream.dict.lookup(PDFName.of('Filter'));

    const filters: PDFName[] = [];
    if (filterObj instanceof PDFName) {
      filters.push(filterObj);
    } else if (filterObj instanceof PDFArray) {
      for (let i = 0; i < filterObj.size(); i++) {
        const entry = filterObj.lookup(i);
        if (!(entry instanceof PDFName)) {
          throw new Error('Malformed Filter entry on embedded XML file stream');
        }
        filters.push(entry);
      }
    }

    // Pass the Uint8Array itself: its `.buffer` may be larger than the view.
    let bytes: Uint8Array = stream.getContents();
    for (const filter of filters) {
      if (filter !== PDFName.of('FlateDecode')) {
        throw new Error(`Unsupported stream filter ${filter.toString()} on embedded XML file`);
      }
      bytes = pako.inflate(bytes);
    }

    return bytes;
  }
}
|