feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic
This commit is contained in:
177
ts/formats/pdf/extractors/base.extractor.ts
Normal file
177
ts/formats/pdf/extractors/base.extractor.ts
Normal file
@ -0,0 +1,177 @@
|
||||
import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
|
||||
import * as pako from 'pako';
|
||||
|
||||
/**
 * Base class for PDF XML extractors with common functionality.
 *
 * Concrete extractors implement {@link BaseXMLExtractor.extractXml}; this class
 * supplies the shared lookup tables and the helpers for validating XML text,
 * slicing an XML document out of a larger string, and decoding a PDF stream.
 */
export abstract class BaseXMLExtractor {
  /**
   * Known XML file names for different invoice formats
   */
  protected readonly knownFileNames = [
    'factur-x.xml',
    'zugferd-invoice.xml',
    'ZUGFeRD-invoice.xml',
    'xrechnung.xml'
  ];

  /**
   * Known XML formats to validate extracted content.
   * A candidate string is accepted by `isValidXml` only if it contains at
   * least one of these markers as a plain substring.
   */
  protected readonly knownFormats = [
    'CrossIndustryInvoice',
    'CrossIndustryDocument',
    'Invoice',
    'CreditNote',
    'ubl:Invoice',
    'ubl:CreditNote',
    'rsm:CrossIndustryInvoice',
    'rsm:CrossIndustryDocument',
    'ram:CrossIndustryDocument',
    'urn:un:unece:uncefact',
    'urn:ferd:CrossIndustryDocument',
    'urn:zugferd',
    'urn:factur-x',
    'factur-x.eu',
    'ZUGFeRD'
  ];

  /**
   * Known XML end tags for extracting content from strings.
   * `extractXmlFromString` probes these in array order.
   */
  protected readonly knownEndTags = [
    '</CrossIndustryInvoice>',
    '</CrossIndustryDocument>',
    '</Invoice>',
    '</CreditNote>',
    '</rsm:CrossIndustryInvoice>',
    '</rsm:CrossIndustryDocument>',
    '</ram:CrossIndustryDocument>',
    '</ubl:Invoice>',
    '</ubl:CreditNote>'
  ];

  /**
   * Extract XML from a PDF buffer
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  public abstract extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null>;

  /**
   * Heuristically check whether a string looks like a supported invoice XML.
   * This is a set of cheap substring checks, not a real XML parse.
   * @param xmlString XML string to check
   * @returns True if every heuristic check passes
   */
  protected isValidXml(xmlString: string): boolean {
    try {
      // Reject empty / non-string input
      if (!xmlString || typeof xmlString !== 'string') {
        return false;
      }

      // An XML declaration must appear somewhere in the text
      // (not necessarily at the very start)
      if (!xmlString.includes('<?xml')) {
        return false;
      }

      // At least one known invoice format marker must be present
      const hasKnownFormat = this.knownFormats.some(format => xmlString.includes(format));
      if (!hasKnownFormat) {
        return false;
      }

      // Low control characters indicate binary data decoded as text
      const invalidChars = ['\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005'];
      const hasBinaryData = invalidChars.some(char => xmlString.includes(char));
      if (hasBinaryData) {
        return false;
      }

      // Anything shorter than 100 characters is treated as too short
      if (xmlString.length < 100) {
        return false;
      }

      return true;
    } catch (error) {
      console.error('Error validating XML:', error);
      return false;
    }
  }

  /**
   * Extract XML from a string
   * @param text Text to extract XML from
   * @param startIndex Index to start extraction from
   * @returns The substring from the first `<?xml` at or after `startIndex`
   *          through the matched end tag, or null if either is not found
   */
  protected extractXmlFromString(text: string, startIndex: number = 0): string | null {
    try {
      // Find the start of the XML document
      const xmlStartIndex = text.indexOf('<?xml', startIndex);
      if (xmlStartIndex === -1) {
        return null;
      }

      // Find the end of the XML document. The first entry of knownEndTags
      // (in array order) that occurs after the start wins, which is not
      // necessarily the end tag that appears earliest in the text.
      let xmlEndIndex = -1;
      for (const endTag of this.knownEndTags) {
        const endIndex = text.indexOf(endTag, xmlStartIndex);
        if (endIndex !== -1) {
          xmlEndIndex = endIndex + endTag.length;
          break;
        }
      }

      if (xmlEndIndex === -1) {
        return null;
      }

      // Extract the XML content
      return text.substring(xmlStartIndex, xmlEndIndex);
    } catch (error) {
      console.error('Error extracting XML from string:', error);
      return null;
    }
  }

  /**
   * Decompress and decode XML content from a PDF stream.
   * The stream bytes are first inflated with pako; if that fails or the
   * result does not pass `isValidXml`, the raw bytes are decoded as UTF-8.
   * @param stream PDF stream containing XML data
   * @param fileName Name of the file (for logging)
   * @returns XML content or null if not valid
   */
  protected async extractXmlFromStream(stream: PDFRawStream, fileName: string): Promise<string | null> {
    try {
      // Use the Uint8Array view itself. Its `.buffer` is the whole backing
      // ArrayBuffer and ignores byteOffset/byteLength, so inflating `.buffer`
      // reads the wrong bytes whenever the contents are a sub-view.
      const streamBytes = stream.getContents();

      // Try to decompress with pako
      try {
        const decompressedBytes = pako.inflate(streamBytes);
        const xmlContent = new TextDecoder('utf-8').decode(decompressedBytes);

        if (this.isValidXml(xmlContent)) {
          console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`);
          return xmlContent;
        }
      } catch (decompressError) {
        // Decompression failed, try without decompression
        console.log(`Decompression failed for ${fileName}, trying without decompression...`);
      }

      // Try without decompression
      const rawContent = new TextDecoder('utf-8').decode(streamBytes);

      if (this.isValidXml(rawContent)) {
        console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
        return rawContent;
      }

      return null;
    } catch (error) {
      console.error('Error extracting XML from stream:', error);
      return null;
    }
  }
}
|
Reference in New Issue
Block a user