2025-04-03 21:07:21 +00:00
|
|
|
import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from '../../../plugins.js';
|
2025-04-03 20:08:02 +00:00
|
|
|
import { BaseXMLExtractor } from './base.extractor.js';
|
|
|
|
|
|
|
|
/**
 * Associated files extractor for PDF/A-3 documents.
 *
 * Extracts XML from associated files (the AF entry in the document catalog).
 * Particularly useful for ZUGFeRD v1 and some Factur-X documents.
 */
export class AssociatedFilesExtractor extends BaseXMLExtractor {
  /** Substrings that mark a file name as invoice-related even without an `.xml` suffix. */
  private static readonly invoiceNameHints: readonly string[] = [
    'zugferd',
    'factur-x',
    'xrechnung',
    'invoice',
  ];

  /**
   * Extract XML from a PDF buffer using associated files.
   *
   * Walks the catalog's AF array in order and returns the first embedded file
   * whose name looks like an invoice XML and whose stream yields content.
   * Failures are logged and reported as `null` rather than thrown.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBuffer);

      // Try to find associated files via the AF entry in the catalog
      const afArray = pdfDoc.catalog.lookup(PDFName.of('AF'));
      if (!(afArray instanceof PDFArray)) {
        console.warn('No AF (Associated Files) entry found in PDF catalog');
        return null;
      }

      // Process each associated file
      for (let i = 0; i < afArray.size(); i++) {
        const fileSpec = afArray.lookup(i);
        if (!(fileSpec instanceof PDFDict)) {
          continue;
        }

        const fileName = this.readFileSpecName(fileSpec);
        if (fileName === null || !this.isInvoiceXmlCandidate(fileName)) {
          continue;
        }

        // Get the embedded file dictionary
        const efDict = fileSpec.lookup(PDFName.of('EF'));
        if (!(efDict instanceof PDFDict)) {
          continue;
        }

        // Get the file stream
        const fileStream = efDict.lookup(PDFName.of('F'));
        if (!(fileStream instanceof PDFRawStream)) {
          continue;
        }

        const xmlContent = await this.extractXmlFromStream(fileStream, fileName);
        if (xmlContent) {
          return xmlContent;
        }
      }

      console.warn('No valid XML found in associated files');
      return null;
    } catch (error) {
      console.error('Error in associated files extraction:', error);
      return null;
    }
  }

  /**
   * Read the file name of a file specification dictionary.
   *
   * `F` is tried first, then `UF`. Each key is evaluated on its own, so an
   * unusable `F` entry no longer hides a usable `UF` entry. Any text object
   * exposing `decodeText()` is accepted, not only `PDFString`
   * (NOTE(review): presumably this covers pdf-lib's hex-encoded strings,
   * which are not imported in this file — hence the structural check).
   *
   * @param fileSpec File specification dictionary from the AF array
   * @returns Decoded file name, or null when neither key holds decodable text
   */
  private readFileSpecName(fileSpec: PDFDict): string | null {
    for (const key of ['F', 'UF']) {
      const candidate: unknown = fileSpec.lookup(PDFName.of(key));
      if (AssociatedFilesExtractor.hasDecodeText(candidate)) {
        return candidate.decodeText();
      }
    }
    return null;
  }

  /**
   * Decide whether a file name is worth extracting: either an exact
   * (case-insensitive) match of a known invoice file name, any `.xml` file,
   * or a name containing an invoice-related keyword.
   *
   * @param fileName Decoded embedded file name
   * @returns true when the file should be inspected for invoice XML
   */
  private isInvoiceXmlCandidate(fileName: string): boolean {
    // Lower-case once instead of once per comparison.
    const lowered = fileName.toLowerCase();

    const isKnownFileName = this.knownFileNames.some(
      knownName => lowered === knownName.toLowerCase()
    );
    if (isKnownFileName) {
      return true;
    }

    return (
      lowered.endsWith('.xml') ||
      AssociatedFilesExtractor.invoiceNameHints.some(hint => lowered.includes(hint))
    );
  }

  /**
   * Structural type guard for PDF text objects that can be decoded to a string.
   *
   * @param value Arbitrary looked-up PDF object (or undefined)
   * @returns true when `value` is an object with a callable `decodeText`
   */
  private static hasDecodeText(value: unknown): value is { decodeText(): string } {
    return (
      typeof value === 'object' &&
      value !== null &&
      'decodeText' in value &&
      typeof (value as { decodeText: unknown }).decodeText === 'function'
    );
  }
}
|