feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic
This commit is contained in:
86
ts/formats/pdf/extractors/standard.extractor.ts
Normal file
86
ts/formats/pdf/extractors/standard.extractor.ts
Normal file
@ -0,0 +1,86 @@
|
||||
import { PDFArray, PDFDict, PDFDocument, PDFHexString, PDFName, PDFRawStream, PDFString } from 'pdf-lib';
|
||||
import { BaseXMLExtractor } from './base.extractor.js';
|
||||
|
||||
/**
 * Standard PDF XML extractor that extracts XML from embedded files.
 * Works with PDF/A-3 documents that follow the standard for embedding files:
 * the catalog's /Names dictionary holds an /EmbeddedFiles name tree whose
 * entries map a file name to a file specification dictionary.
 */
export class StandardXMLExtractor extends BaseXMLExtractor {
  /**
   * Extract XML from a PDF buffer using standard PDF/A-3 embedded files.
   *
   * Embedded files are visited in name-tree order; the first one whose name
   * looks like an invoice XML file and whose stream yields XML content wins.
   * Any failure (including an unparsable PDF) is logged and reported as `null`
   * rather than thrown.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBuffer);

      // Get the catalog's Names dictionary (root of all document-level name trees)
      const namesDictObj = pdfDoc.catalog.lookup(PDFName.of('Names'));
      if (!(namesDictObj instanceof PDFDict)) {
        console.warn('No Names dictionary found in PDF! This PDF does not contain embedded files.');
        return null;
      }

      // Get the embedded files name tree
      const embeddedFilesDictObj = namesDictObj.lookup(PDFName.of('EmbeddedFiles'));
      if (!(embeddedFilesDictObj instanceof PDFDict)) {
        console.warn('No EmbeddedFiles dictionary found! This PDF does not contain embedded files.');
        return null;
      }

      // Gather every /Names array of the tree, including those on /Kids nodes
      const nameArrays = this.collectEmbeddedFileNameArrays(embeddedFilesDictObj);
      if (nameArrays.length === 0) {
        console.warn('No files specified in EmbeddedFiles dictionary!');
        return null;
      }

      // Substrings that mark a file as invoice-related even without an .xml extension
      const invoiceNameHints = ['zugferd', 'factur-x', 'xrechnung', 'invoice'];

      // Try to find an XML file in the embedded files
      for (const nameArray of nameArrays) {
        // Each array is a flat list of [name, fileSpec] pairs; ignore a dangling trailing key
        for (let i = 0; i + 1 < nameArray.size(); i += 2) {
          const fileNameObj = nameArray.lookup(i);
          const fileSpecObj = nameArray.lookup(i + 1);

          // Names may be literal strings or hex strings; both decode to text
          const isNameString =
            fileNameObj instanceof PDFString || fileNameObj instanceof PDFHexString;
          if (!isNameString || !(fileSpecObj instanceof PDFDict)) {
            continue;
          }

          // Get the filename as string (lower-cased once for all comparisons)
          const fileName = fileNameObj.decodeText();
          const lowerFileName = fileName.toLowerCase();

          // Check if it's a known invoice XML file name
          const isKnownFileName = this.knownFileNames.some(
            knownName => lowerFileName === knownName.toLowerCase()
          );

          // Check if it's any XML file or has invoice-related keywords
          const isXmlFile =
            lowerFileName.endsWith('.xml') ||
            invoiceNameHints.some(hint => lowerFileName.includes(hint));

          if (!isKnownFileName && !isXmlFile) {
            continue;
          }

          const efDictObj = fileSpecObj.lookup(PDFName.of('EF'));
          if (!(efDictObj instanceof PDFDict)) {
            continue;
          }

          // Prefer the /F stream; fall back to /UF when /F is absent or not a raw stream
          const primaryStream = efDictObj.lookup(PDFName.of('F'));
          const fileStream =
            primaryStream instanceof PDFRawStream
              ? primaryStream
              : efDictObj.lookup(PDFName.of('UF'));

          if (fileStream instanceof PDFRawStream) {
            const xmlContent = await this.extractXmlFromStream(fileStream, fileName);
            if (xmlContent) {
              return xmlContent;
            }
          }
        }
      }

      console.warn('No valid XML found in embedded files');
      return null;
    } catch (error) {
      console.error('Error in standard extraction:', error);
      return null;
    }
  }

  /**
   * Collect all /Names arrays of a PDF name tree in document order.
   *
   * A name tree node may carry its entries directly in /Names or delegate to
   * child nodes via /Kids, so the tree is walked depth-first. The walk is
   * iterative and remembers visited nodes so that a malformed, cyclic tree
   * cannot loop forever.
   *
   * @param root Root node of the name tree
   * @returns The /Names arrays found, root first, then kids in order
   */
  private collectEmbeddedFileNameArrays(root: PDFDict): PDFArray[] {
    const nameArrays: PDFArray[] = [];
    const visited = new Set<PDFDict>();
    const pending: PDFDict[] = [root];

    while (pending.length > 0) {
      const node = pending.pop();
      if (node === undefined || visited.has(node)) {
        continue;
      }
      visited.add(node);

      const names = node.lookup(PDFName.of('Names'));
      if (names instanceof PDFArray) {
        nameArrays.push(names);
      }

      const kids = node.lookup(PDFName.of('Kids'));
      if (kids instanceof PDFArray) {
        // Push in reverse so kids are popped (and therefore visited) in their listed order
        for (let i = kids.size() - 1; i >= 0; i--) {
          const kid = kids.lookup(i);
          if (kid instanceof PDFDict) {
            pending.push(kid);
          }
        }
      }
    }

    return nameArrays;
  }
}
|
Reference in New Issue
Block a user