2025-04-03 15:53:08 +00:00
|
|
|
import { PDFArray, PDFDict, PDFDocument, PDFHexString, PDFName, PDFRawStream, PDFString } from 'pdf-lib';
import * as pako from 'pako';
|
|
|
|
|
|
|
|
/**
 * Class for extracting XML from PDF files.
 *
 * Two strategies are tried in order:
 *  1. `standardExtraction` walks the PDF's embedded-files name tree with pdf-lib
 *     and decodes the first attachment that yields valid invoice XML.
 *  2. `alternativeExtraction` scans the raw PDF bytes for a plain-text XML document.
 *
 * Every failure is logged and reported as `null`; no method throws to the caller.
 */
export class PDFExtractor {
  /**
   * Extracts XML from a PDF buffer
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      // First try the standard extraction
      const standardXml = await this.standardExtraction(pdfBuffer);
      if (standardXml && this.isValidXml(standardXml)) {
        return standardXml;
      }

      // If standard extraction fails, try alternative methods
      const alternativeXml = await this.alternativeExtraction(pdfBuffer);
      if (alternativeXml && this.isValidXml(alternativeXml)) {
        return alternativeXml;
      }

      // If all extraction methods fail, return null
      console.warn('All extraction methods failed, no valid XML found in PDF');
      return null;
    } catch (error) {
      console.error('Error extracting XML from PDF:', error);
      return null;
    }
  }

  /**
   * Standard extraction method using PDF-lib
   *
   * Looks up `Names -> EmbeddedFiles -> Names` in the document catalog and
   * iterates the (name, file specification) pairs of that array. Every
   * attachment whose name looks like an invoice XML file is decoded; the first
   * one that passes `isValidXml` is returned.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  private async standardExtraction(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBuffer);

      // Get the catalog's Names dictionary
      const namesDictObj = pdfDoc.catalog.lookup(PDFName.of('Names'));
      if (!(namesDictObj instanceof PDFDict)) {
        console.warn('No Names dictionary found in PDF! This PDF does not contain embedded files.');
        return null;
      }

      const embeddedFilesDictObj = namesDictObj.lookup(PDFName.of('EmbeddedFiles'));
      if (!(embeddedFilesDictObj instanceof PDFDict)) {
        console.warn('No EmbeddedFiles dictionary found! This PDF does not contain embedded files.');
        return null;
      }

      const filesSpecObj = embeddedFilesDictObj.lookup(PDFName.of('Names'));
      if (!(filesSpecObj instanceof PDFArray)) {
        console.warn('No files specified in EmbeddedFiles dictionary!');
        return null;
      }

      // The array holds alternating entries: [name0, spec0, name1, spec1, ...]
      let foundCandidate = false;
      for (let i = 0; i + 1 < filesSpecObj.size(); i += 2) {
        const fileNameObj = filesSpecObj.lookup(i);
        const fileSpecObj = filesSpecObj.lookup(i + 1);

        // Names may be stored as literal strings or as hex strings; accept both
        if (!(fileNameObj instanceof PDFString || fileNameObj instanceof PDFHexString)) {
          continue;
        }
        if (!(fileSpecObj instanceof PDFDict)) {
          continue;
        }

        // Decode to the actual text instead of the PDF-syntax representation
        const fileName = fileNameObj.decodeText();
        if (!this.looksLikeInvoiceXmlName(fileName)) {
          continue;
        }

        const efDictObj = fileSpecObj.lookup(PDFName.of('EF'));
        if (!(efDictObj instanceof PDFDict)) {
          continue;
        }

        const maybeStream = efDictObj.lookup(PDFName.of('F'));
        if (!(maybeStream instanceof PDFRawStream)) {
          continue;
        }

        // Found an XML candidate; keep scanning if its content turns out to be unusable
        foundCandidate = true;
        const xmlContent = this.decodeEmbeddedXml(maybeStream.getContents(), fileName);
        if (xmlContent !== null) {
          return xmlContent;
        }
      }

      if (!foundCandidate) {
        console.warn('No embedded XML file found in the PDF!');
      }
      return null;
    } catch (error) {
      console.error('Error in standard extraction:', error);
      return null;
    }
  }

  /**
   * Checks whether an attachment name suggests an invoice XML file
   * (checking both the extension and known standard filenames).
   * @param fileName Decoded attachment name
   * @returns True if the name contains `.xml`, `factur-x`, `zugferd` or `xrechnung` (case-insensitive)
   */
  private looksLikeInvoiceXmlName(fileName: string): boolean {
    const lowered = fileName.toLowerCase();
    return ['.xml', 'factur-x', 'zugferd', 'xrechnung'].some(marker => lowered.includes(marker));
  }

  /**
   * Decodes the bytes of an embedded file stream into XML text.
   *
   * Inflating with pako is attempted first; if that throws or the result is not
   * valid XML, the bytes are decoded as-is (stream stored without compression).
   *
   * @param streamBytes Raw stream contents
   * @param fileName Attachment name, used for log messages only
   * @returns Valid XML content or null
   */
  private decodeEmbeddedXml(streamBytes: Uint8Array, fileName: string): string | null {
    const decoder = new TextDecoder('utf-8');

    try {
      // Pass the Uint8Array itself: its `.buffer` may be larger than the view
      // (byteOffset / byteLength), which would feed unrelated bytes to inflate.
      const inflatedBytes = pako.inflate(streamBytes);
      const inflatedXml = decoder.decode(inflatedBytes);
      if (this.isValidXml(inflatedXml)) {
        console.log(`Successfully extracted XML from PDF file. File name: ${fileName}`);
        return inflatedXml;
      }
      console.log('Decompression succeeded but XML is not valid, trying without decompression...');
    } catch (decompressError) {
      console.log('Decompression failed, trying without decompression...');
    }

    try {
      const rawXml = decoder.decode(streamBytes);
      if (this.isValidXml(rawXml)) {
        console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
        return rawXml;
      }
      console.log('Neither decompressed nor raw XML content is valid');
      return null;
    } catch (decodeError) {
      console.error('Error decoding XML content:', decodeError);
      return null;
    }
  }

  /**
   * Alternative extraction method using string search
   *
   * The whole buffer is viewed as latin1 text so that every byte maps to exactly
   * one character and nothing is lost on binary data. The extracted slice is then
   * converted back to bytes and decoded as UTF-8.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  private async alternativeExtraction(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      // Zero-copy view over the caller's bytes (works for Buffer and Uint8Array alike)
      const bytes = Buffer.from(pdfBuffer.buffer, pdfBuffer.byteOffset, pdfBuffer.byteLength);
      const pdfString = bytes.toString('latin1');

      // Look for common XML patterns in the PDF
      const xmlPatterns = [
        /<\?xml[^>]*\?>/i,
        /<CrossIndustryInvoice[^>]*>/i,
        /<Invoice[^>]*>/i,
        /<CreditNote[^>]*>/i,
        /<rsm:CrossIndustryInvoice[^>]*>/i
      ];

      let firstMatch: RegExpMatchArray | null = null;
      for (const pattern of xmlPatterns) {
        firstMatch = pdfString.match(pattern);
        if (firstMatch) {
          break;
        }
      }
      if (!firstMatch) {
        return null;
      }
      console.log(`Found XML pattern in PDF: ${firstMatch[0]}`);

      // The extraction does not depend on which pattern matched, so run it once
      const latin1Xml = this.extractXmlFromString(pdfString);
      if (!latin1Xml) {
        return null;
      }

      // latin1 round-trips bytes losslessly; re-decode the slice as UTF-8
      const xmlContent = Buffer.from(latin1Xml, 'latin1').toString('utf8');
      console.log('Successfully extracted XML from PDF string');
      return xmlContent;
    } catch (error) {
      console.error('Error in alternative extraction:', error);
      return null;
    }
  }

  /**
   * Extracts XML from a string
   *
   * Returns the text from the first `<?xml` up to and including the first known
   * closing root tag that follows it.
   *
   * @param pdfString PDF string
   * @returns XML content or null if not found
   */
  private extractXmlFromString(pdfString: string): string | null {
    try {
      // Look for the XML declaration
      const xmlStartIndex = pdfString.indexOf('<?xml');
      if (xmlStartIndex === -1) {
        return null;
      }

      // Try to find the end of the XML document
      const possibleEndTags = [
        '</CrossIndustryInvoice>',
        '</Invoice>',
        '</CreditNote>',
        '</rsm:CrossIndustryInvoice>'
      ];

      let xmlEndIndex = -1;
      for (const endTag of possibleEndTags) {
        // Search only after the declaration; a closing tag located before it
        // would otherwise produce an inverted (start > end) range.
        const endIndex = pdfString.indexOf(endTag, xmlStartIndex);
        if (endIndex !== -1) {
          xmlEndIndex = endIndex + endTag.length;
          break;
        }
      }

      if (xmlEndIndex === -1) {
        return null;
      }

      // Extract the XML content
      return pdfString.substring(xmlStartIndex, xmlEndIndex);
    } catch (error) {
      console.error('Error extracting XML from string:', error);
      return null;
    }
  }

  /**
   * Checks if an XML string is valid
   *
   * This is a heuristic, not a parse: the string must contain `<?xml`, mention
   * one of the known invoice root names, contain none of the control characters
   * U+0000..U+0005, and be at least 100 characters long.
   *
   * @param xmlString XML string to check
   * @returns True if the XML is valid
   */
  private isValidXml(xmlString: string): boolean {
    try {
      // Check if the XML string contains basic XML structure
      if (!xmlString.includes('<?xml')) {
        return false;
      }

      // Check if the XML string contains known invoice formats
      const knownFormats = [
        'CrossIndustryInvoice',
        'Invoice',
        'CreditNote',
        'ubl:Invoice',
        'ubl:CreditNote'
      ];
      if (!knownFormats.some(format => xmlString.includes(format))) {
        return false;
      }

      // Check if the XML string contains binary data or invalid characters
      const invalidChars = ['\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005'];
      if (invalidChars.some(char => xmlString.includes(char))) {
        return false;
      }

      // Check if the XML string is too short
      if (xmlString.length < 100) {
        return false;
      }

      return true;
    } catch (error) {
      console.error('Error validating XML:', error);
      return false;
    }
  }
}
|