feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic

This commit is contained in:
2025-04-03 20:08:02 +00:00
parent b4a95de482
commit 46331c2bf6
28 changed files with 1191 additions and 294 deletions

View File

@ -0,0 +1,78 @@
import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
import { BaseXMLExtractor } from './base.extractor.js';
/**
 * Associated files extractor for PDF/A-3 documents.
 *
 * Walks the `AF` (Associated Files) array of the PDF catalog and returns the
 * first attachment that decodes to valid invoice XML.
 * Particularly useful for ZUGFeRD v1 and some Factur-X documents.
 */
export class AssociatedFilesExtractor extends BaseXMLExtractor {
  /**
   * Extract XML from a PDF buffer using associated files
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found (errors are logged, never thrown)
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBuffer);

      // The AF entry of the catalog lists the file specifications associated with the document
      const afArray = pdfDoc.catalog.lookup(PDFName.of('AF'));
      if (!(afArray instanceof PDFArray)) {
        console.warn('No AF (Associated Files) entry found in PDF catalog');
        return null;
      }

      for (let i = 0; i < afArray.size(); i++) {
        const fileSpec = afArray.lookup(i);
        if (!(fileSpec instanceof PDFDict)) {
          continue;
        }

        const fileName = this.readFileName(fileSpec);
        if (fileName === null || !this.isCandidateFileName(fileName)) {
          continue;
        }

        // EF holds the embedded file streams; its F entry is the stream itself
        const efDict = fileSpec.lookup(PDFName.of('EF'));
        if (!(efDict instanceof PDFDict)) {
          continue;
        }
        const fileStream = efDict.lookup(PDFName.of('F'));
        if (!(fileStream instanceof PDFRawStream)) {
          continue;
        }

        const xmlContent = await this.extractXmlFromStream(fileStream, fileName);
        if (xmlContent) {
          return xmlContent;
        }
      }

      console.warn('No valid XML found in associated files');
      return null;
    } catch (error) {
      console.error('Error in associated files extraction:', error);
      return null;
    }
  }

  /**
   * Read the file name of a file specification dictionary.
   * `F` is preferred; `UF` is used when `F` is missing *or* is not a PDFString,
   * so a usable `UF` is no longer ignored just because `F` exists in another form.
   * @param fileSpec File specification dictionary
   * @returns Decoded file name or null if neither entry is a PDFString
   */
  private readFileName(fileSpec: PDFDict): string | null {
    for (const key of ['F', 'UF']) {
      const candidate = fileSpec.lookup(PDFName.of(key));
      if (candidate instanceof PDFString) {
        return candidate.decodeText();
      }
    }
    return null;
  }

  /**
   * Decide whether an attachment name looks like an invoice XML file.
   * @param fileName Attachment file name
   * @returns True for known invoice file names, any `.xml` file, or names
   *          containing an invoice-related keyword (case-insensitive)
   */
  private isCandidateFileName(fileName: string): boolean {
    const lowerName = fileName.toLowerCase();
    const isKnownFileName = this.knownFileNames.some(
      knownName => lowerName === knownName.toLowerCase()
    );
    if (isKnownFileName || lowerName.endsWith('.xml')) {
      return true;
    }
    return ['zugferd', 'factur-x', 'xrechnung', 'invoice'].some(keyword =>
      lowerName.includes(keyword)
    );
  }
}

View File

@ -0,0 +1,177 @@
import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
import * as pako from 'pako';
/**
 * Base class for PDF XML extractors with common functionality:
 * the lists of known invoice file names / root elements, a heuristic XML
 * validity check, substring extraction and stream decoding.
 */
export abstract class BaseXMLExtractor {
  /**
   * Characters that may not appear in an XML 1.0 document: every C0 control
   * character except TAB (U+0009), LF (U+000A) and CR (U+000D).
   * Their presence indicates binary data rather than XML text.
   */
  private static readonly FORBIDDEN_CONTROL_CHARS = /[\u0000-\u0008\u000B\u000C\u000E-\u001F]/;

  /**
   * Known XML file names for different invoice formats
   */
  protected readonly knownFileNames = [
    'factur-x.xml',
    'zugferd-invoice.xml',
    'ZUGFeRD-invoice.xml',
    'xrechnung.xml'
  ];

  /**
   * Known XML formats to validate extracted content
   */
  protected readonly knownFormats = [
    'CrossIndustryInvoice',
    'CrossIndustryDocument',
    'Invoice',
    'CreditNote',
    'ubl:Invoice',
    'ubl:CreditNote',
    'rsm:CrossIndustryInvoice',
    'rsm:CrossIndustryDocument',
    'ram:CrossIndustryDocument',
    'urn:un:unece:uncefact',
    'urn:ferd:CrossIndustryDocument',
    'urn:zugferd',
    'urn:factur-x',
    'factur-x.eu',
    'ZUGFeRD'
  ];

  /**
   * Known XML end tags for extracting content from strings
   */
  protected readonly knownEndTags = [
    '</CrossIndustryInvoice>',
    '</CrossIndustryDocument>',
    '</Invoice>',
    '</CreditNote>',
    '</rsm:CrossIndustryInvoice>',
    '</rsm:CrossIndustryDocument>',
    '</ram:CrossIndustryDocument>',
    '</ubl:Invoice>',
    '</ubl:CreditNote>'
  ];

  /**
   * Extract XML from a PDF buffer
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  public abstract extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null>;

  /**
   * Heuristically check whether a string looks like invoice XML.
   * This is not a parser: it only checks for an XML declaration, a known
   * format marker, the absence of forbidden control characters and a
   * minimum length of 100 characters.
   * @param xmlString XML string to check
   * @returns True if the string passes all checks
   */
  protected isValidXml(xmlString: string): boolean {
    try {
      if (!xmlString || typeof xmlString !== 'string') {
        return false;
      }
      // An XML declaration must be present somewhere in the string
      if (!xmlString.includes('<?xml')) {
        return false;
      }
      // At least one known invoice root element / namespace marker must appear
      if (!this.knownFormats.some(format => xmlString.includes(format))) {
        return false;
      }
      // Control characters outside TAB/LF/CR are illegal in XML 1.0 and signal binary data
      if (BaseXMLExtractor.FORBIDDEN_CONTROL_CHARS.test(xmlString)) {
        return false;
      }
      // Anything this short cannot be a real invoice
      if (xmlString.length < 100) {
        return false;
      }
      return true;
    } catch (error) {
      console.error('Error validating XML:', error);
      return false;
    }
  }

  /**
   * Extract an XML document from a larger string.
   * The document starts at the first `<?xml` at or after `startIndex` and ends
   * with the known end tag that occurs EARLIEST after that start, so the
   * result does not run on into later, unrelated markup.
   * @param text Text to extract XML from
   * @param startIndex Index to start extraction from
   * @returns XML content or null if no declaration or no known end tag is found
   */
  protected extractXmlFromString(text: string, startIndex: number = 0): string | null {
    try {
      const xmlStartIndex = text.indexOf('<?xml', startIndex);
      if (xmlStartIndex === -1) {
        return null;
      }

      let earliestTagIndex = -1;
      let xmlEndIndex = -1;
      for (const endTag of this.knownEndTags) {
        const tagIndex = text.indexOf(endTag, xmlStartIndex);
        if (tagIndex === -1) {
          continue;
        }
        if (earliestTagIndex === -1 || tagIndex < earliestTagIndex) {
          earliestTagIndex = tagIndex;
          xmlEndIndex = tagIndex + endTag.length;
        }
      }
      if (xmlEndIndex === -1) {
        return null;
      }

      return text.substring(xmlStartIndex, xmlEndIndex);
    } catch (error) {
      console.error('Error extracting XML from string:', error);
      return null;
    }
  }

  /**
   * Decompress and decode XML content from a PDF stream.
   * Inflation is attempted first; if it throws or the result is not valid
   * XML, the raw stream bytes are decoded as-is.
   * @param stream PDF stream containing XML data
   * @param fileName Name of the file (for logging)
   * @returns XML content or null if not valid
   */
  protected async extractXmlFromStream(stream: PDFRawStream, fileName: string): Promise<string | null> {
    try {
      // Use the Uint8Array view itself. Its `.buffer` is the whole backing
      // ArrayBuffer, which can be larger than the stream's own byte range.
      const rawBytes = stream.getContents();

      try {
        const decompressedBytes = pako.inflate(rawBytes);
        const xmlContent = new TextDecoder('utf-8').decode(decompressedBytes);
        if (this.isValidXml(xmlContent)) {
          console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`);
          return xmlContent;
        }
      } catch (decompressError) {
        // Decompression failed, try without decompression
        console.log(`Decompression failed for ${fileName}, trying without decompression...`);
      }

      // Try without decompression
      const rawContent = new TextDecoder('utf-8').decode(rawBytes);
      if (this.isValidXml(rawContent)) {
        console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
        return rawContent;
      }
      return null;
    } catch (error) {
      console.error('Error extracting XML from stream:', error);
      return null;
    }
  }
}

View File

@ -0,0 +1,4 @@
// Barrel module: re-exports everything from the base extractor and the three
// concrete extractor modules (standard, associated files, text).
export * from './base.extractor.js';
export * from './standard.extractor.js';
export * from './associated.extractor.js';
export * from './text.extractor.js';

View File

@ -0,0 +1,86 @@
import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
import { BaseXMLExtractor } from './base.extractor.js';
/**
 * Standard PDF XML extractor that extracts XML from embedded files.
 *
 * Reads the `EmbeddedFiles` name tree under the catalog's `Names` dictionary
 * and returns the first embedded file that decodes to valid invoice XML.
 * Works with PDF/A-3 documents that follow the standard for embedding files.
 */
export class StandardXMLExtractor extends BaseXMLExtractor {
  /**
   * Maximum nesting of `Kids` nodes followed while walking the name tree.
   * Guards against malformed PDFs whose `Kids` entries form a cycle.
   */
  private static readonly MAX_NAME_TREE_DEPTH = 32;

  /**
   * Extract XML from a PDF buffer using standard PDF/A-3 embedded files
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found (errors are logged, never thrown)
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBuffer);

      // Get the catalog's Names dictionary
      const namesDictObj = pdfDoc.catalog.lookup(PDFName.of('Names'));
      if (!(namesDictObj instanceof PDFDict)) {
        console.warn('No Names dictionary found in PDF! This PDF does not contain embedded files.');
        return null;
      }

      // Get the root of the embedded files name tree
      const embeddedFilesDictObj = namesDictObj.lookup(PDFName.of('EmbeddedFiles'));
      if (!(embeddedFilesDictObj instanceof PDFDict)) {
        console.warn('No EmbeddedFiles dictionary found! This PDF does not contain embedded files.');
        return null;
      }

      // A name tree keeps its entries in Names arrays, on the root and/or on nodes under Kids
      const nameArrays = this.collectNameArrays(embeddedFilesDictObj, 0);
      if (nameArrays.length === 0) {
        console.warn('No files specified in EmbeddedFiles dictionary!');
        return null;
      }

      for (const filesSpecObj of nameArrays) {
        // Entries are flat [name, fileSpec, name, fileSpec, ...] pairs
        for (let i = 0; i + 1 < filesSpecObj.size(); i += 2) {
          const fileNameObj = filesSpecObj.lookup(i);
          const fileSpecObj = filesSpecObj.lookup(i + 1);
          if (!(fileNameObj instanceof PDFString) || !(fileSpecObj instanceof PDFDict)) {
            continue;
          }

          const fileName = fileNameObj.decodeText();
          if (!this.isCandidateFileName(fileName)) {
            continue;
          }

          // EF holds the embedded file streams; its F entry is the stream itself
          const efDictObj = fileSpecObj.lookup(PDFName.of('EF'));
          if (!(efDictObj instanceof PDFDict)) {
            continue;
          }
          const fileStream = efDictObj.lookup(PDFName.of('F'));
          if (!(fileStream instanceof PDFRawStream)) {
            continue;
          }

          const xmlContent = await this.extractXmlFromStream(fileStream, fileName);
          if (xmlContent) {
            return xmlContent;
          }
        }
      }

      console.warn('No valid XML found in embedded files');
      return null;
    } catch (error) {
      console.error('Error in standard extraction:', error);
      return null;
    }
  }

  /**
   * Collect every `Names` array reachable from a name tree node, following
   * `Kids` recursively up to MAX_NAME_TREE_DEPTH levels.
   * @param node Name tree node (root or kid)
   * @param depth Current nesting level, 0 for the root
   * @returns Names arrays in document order; empty if none exist
   */
  private collectNameArrays(node: PDFDict, depth: number): PDFArray[] {
    const collected: PDFArray[] = [];
    if (depth > StandardXMLExtractor.MAX_NAME_TREE_DEPTH) {
      return collected;
    }

    const names = node.lookup(PDFName.of('Names'));
    if (names instanceof PDFArray) {
      collected.push(names);
    }

    const kids = node.lookup(PDFName.of('Kids'));
    if (kids instanceof PDFArray) {
      for (let i = 0; i < kids.size(); i++) {
        const kid = kids.lookup(i);
        if (kid instanceof PDFDict) {
          collected.push(...this.collectNameArrays(kid, depth + 1));
        }
      }
    }
    return collected;
  }

  /**
   * Decide whether an embedded file name looks like an invoice XML file.
   * @param fileName Embedded file name
   * @returns True for known invoice file names, any `.xml` file, or names
   *          containing an invoice-related keyword (case-insensitive)
   */
  private isCandidateFileName(fileName: string): boolean {
    const lowerName = fileName.toLowerCase();
    const isKnownFileName = this.knownFileNames.some(
      knownName => lowerName === knownName.toLowerCase()
    );
    if (isKnownFileName || lowerName.endsWith('.xml')) {
      return true;
    }
    return ['zugferd', 'factur-x', 'xrechnung', 'invoice'].some(keyword =>
      lowerName.includes(keyword)
    );
  }
}

View File

@ -0,0 +1,55 @@
import { BaseXMLExtractor } from './base.extractor.js';
/**
 * Text-based XML extractor for PDF documents.
 * Extracts XML by searching for XML patterns in the raw PDF bytes decoded as
 * UTF-8. Only the first SCAN_LIMIT bytes are examined.
 * Used as a fallback when other extraction methods fail.
 */
export class TextXMLExtractor extends BaseXMLExtractor {
  /** Number of leading PDF bytes that are decoded and searched */
  private static readonly SCAN_LIMIT = 50000;

  /** XML declaration and known invoice root elements, tried in this order */
  private static readonly XML_PATTERNS: readonly RegExp[] = [
    /<\?xml[^>]*\?>/i,
    /<CrossIndustryInvoice[^>]*>/i,
    /<CrossIndustryDocument[^>]*>/i,
    /<Invoice[^>]*>/i,
    /<CreditNote[^>]*>/i,
    /<rsm:CrossIndustryInvoice[^>]*>/i,
    /<rsm:CrossIndustryDocument[^>]*>/i,
    /<ram:CrossIndustryDocument[^>]*>/i,
    /<ubl:Invoice[^>]*>/i,
    /<ubl:CreditNote[^>]*>/i
  ];

  /**
   * Extract XML from a PDF buffer by searching for XML patterns in the text
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found (errors are logged, never thrown)
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      // Decode only the scan window; subarray() is a view, so the full PDF is never copied
      const scanLength = Math.min(pdfBuffer.length, TextXMLExtractor.SCAN_LIMIT);
      const pdfString = Buffer.from(pdfBuffer.subarray(0, scanLength)).toString('utf8');

      for (const pattern of TextXMLExtractor.XML_PATTERNS) {
        const match = pattern.exec(pdfString);
        if (!match) {
          continue;
        }
        console.log(`Found XML pattern in PDF: ${match[0]}`);

        // extractXmlFromString searches forward for `<?xml`. A root-element
        // match lies after its own declaration, so also retry from the
        // nearest declaration preceding the match.
        const startCandidates = [match.index];
        const precedingDeclaration = pdfString.lastIndexOf('<?xml', match.index);
        if (precedingDeclaration !== -1 && precedingDeclaration !== match.index) {
          startCandidates.push(precedingDeclaration);
        }

        for (const start of startCandidates) {
          const xmlContent = this.extractXmlFromString(pdfString, start);
          if (xmlContent && this.isValidXml(xmlContent)) {
            console.log('Successfully extracted XML from PDF text');
            return xmlContent;
          }
        }
      }

      console.warn('No valid XML found in PDF text');
      return null;
    } catch (error) {
      console.error('Error in text-based extraction:', error);
      return null;
    }
  }
}