2025-04-03 20:45:26 +00:00
|
|
|
import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString, pako } from '../../../plugins.js';
|
2025-04-03 20:08:02 +00:00
|
|
|
|
|
|
|
/**
 * Base class for PDF XML extractors with common functionality.
 *
 * Bundles the heuristics shared by concrete extractors: lists of well-known
 * attachment names / root elements / closing tags, a lightweight plausibility
 * check for extracted text, string-based extraction of an XML document from
 * surrounding text, and decoding (optionally after inflating) of PDF stream
 * contents. Subclasses implement {@link extractXml}.
 */
export abstract class BaseXMLExtractor {
  /**
   * Known XML file names for different invoice formats.
   * Not referenced by this base class itself; presumably consumed by subclasses.
   */
  protected readonly knownFileNames = [
    'factur-x.xml',
    'zugferd-invoice.xml',
    'ZUGFeRD-invoice.xml',
    'xrechnung.xml',
    'ubl-invoice.xml',
    'invoice.xml',
    'metadata.xml'
  ];

  /**
   * Known XML formats to validate extracted content.
   * Entries are matched as plain substrings (element names, prefixed element
   * names and namespace fragments).
   */
  protected readonly knownFormats = [
    'CrossIndustryInvoice',
    'CrossIndustryDocument',
    'Invoice',
    'CreditNote',
    'ubl:Invoice',
    'ubl:CreditNote',
    'rsm:CrossIndustryInvoice',
    'rsm:CrossIndustryDocument',
    'ram:CrossIndustryDocument',
    'urn:un:unece:uncefact',
    'urn:ferd:CrossIndustryDocument',
    'urn:zugferd',
    'urn:factur-x',
    'factur-x.eu',
    'ZUGFeRD',
    'FatturaElettronica'
  ];

  /**
   * Known XML end tags for extracting content from strings.
   * Each entry must have the exact form `</name>`.
   */
  protected readonly knownEndTags = [
    '</CrossIndustryInvoice>',
    '</CrossIndustryDocument>',
    '</Invoice>',
    '</CreditNote>',
    '</rsm:CrossIndustryInvoice>',
    '</rsm:CrossIndustryDocument>',
    '</ram:CrossIndustryDocument>',
    '</ubl:Invoice>',
    '</ubl:CreditNote>',
    '</FatturaElettronica>'
  ];

  /**
   * Extract XML from a PDF buffer
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  public abstract extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null>;

  /**
   * Check if an XML string is valid.
   *
   * This is a heuristic plausibility check, not an XML parse. The string must
   * be non-empty, contain an XML declaration or a known element, mention a
   * known format, contain no binary data, be at least 100 characters long and
   * have a basic tag structure.
   *
   * @param xmlString XML string to check
   * @returns True if the XML is valid
   */
  protected isValidXml(xmlString: string): boolean {
    try {
      // Basic checks for XML validity
      if (!xmlString || typeof xmlString !== 'string') {
        return false;
      }

      // Must contain an XML declaration or the opening of a known element
      if (!xmlString.includes('<?xml') && !this.hasKnownXmlElement(xmlString)) {
        return false;
      }

      // Must mention at least one known invoice format
      if (!this.hasKnownFormat(xmlString)) {
        return false;
      }

      // Must not contain binary data or invalid characters
      if (this.hasBinaryData(xmlString)) {
        return false;
      }

      // Anything shorter than 100 characters is considered too short
      if (xmlString.length < 100) {
        return false;
      }

      // Must have a proper structure (opening and closing tags)
      return this.hasProperXmlStructure(xmlString);
    } catch (error) {
      console.error('Error validating XML:', error);
      return false;
    }
  }

  /**
   * Check if the XML string contains a known element
   * @param xmlString XML string to check
   * @returns True if the XML contains the opening of a known element
   */
  protected hasKnownXmlElement(xmlString: string): boolean {
    return this.knownFormats.some((format) => xmlString.includes(`<${format}`));
  }

  /**
   * Check if the XML string contains a known format
   * @param xmlString XML string to check
   * @returns True if any known format occurs as a substring
   */
  protected hasKnownFormat(xmlString: string): boolean {
    return this.knownFormats.some((format) => xmlString.includes(format));
  }

  /**
   * Check if the XML string has a proper structure
   * @param xmlString XML string to check
   * @returns True if the XML has a proper structure
   */
  protected hasProperXmlStructure(xmlString: string): boolean {
    // Check for at least one matching opening and closing tag
    for (const endTag of this.knownEndTags) {
      // '</Name>' -> '<Name'. The trailing '>' is dropped on purpose so that
      // opening tags carrying attributes or namespace declarations match too.
      const startTag = `<${endTag.slice(2, -1)}`;
      if (xmlString.includes(startTag) && xmlString.includes(endTag)) {
        return true;
      }
    }

    // If no specific tag is found but it has a basic XML structure
    return (
      (xmlString.includes('<?xml') && xmlString.includes('?>')) ||
      (xmlString.match(/<[^>]+>/) !== null && xmlString.match(/<\/[^>]+>/) !== null)
    );
  }

  /**
   * Check if the XML string contains binary data
   * @param xmlString XML string to check
   * @returns True if the XML contains binary data
   */
  protected hasBinaryData(xmlString: string): boolean {
    // Any of U+0000..U+0005 is treated as a definite sign of binary content.
    // (A run of NULs is covered by this check as well.)
    if (/[\u0000-\u0005]/.test(xmlString)) {
      return true;
    }

    // Check for high concentration of non-printable characters
    // (control characters other than tab, line feed and carriage return)
    const nonPrintableCount = (xmlString.match(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g) || []).length;
    return nonPrintableCount > xmlString.length * 0.05; // More than 5% non-printable
  }

  /**
   * Extract XML from a string.
   *
   * The document start is the first XML declaration at or after `startIndex`,
   * or, failing that, the earliest opening of a known element. The end is the
   * first known end tag (in `knownEndTags` order) found after the start, or,
   * failing that, the last closing tag in the text.
   *
   * @param text Text to extract XML from
   * @param startIndex Index to start extraction from
   * @returns XML content or null if not found
   */
  protected extractXmlFromString(text: string, startIndex: number = 0): string | null {
    try {
      // Find the start of the XML document
      let xmlStartIndex = text.indexOf('<?xml', startIndex);

      // If no XML declaration, try to find known elements
      if (xmlStartIndex === -1) {
        xmlStartIndex = this.findKnownElementStart(text, startIndex);

        // Still didn't find any start marker
        if (xmlStartIndex === -1) {
          return null;
        }
      }

      // Try to find the end of the XML document
      let xmlEndIndex = -1;
      for (const endTag of this.knownEndTags) {
        const endIndex = text.indexOf(endTag, xmlStartIndex);
        if (endIndex !== -1) {
          xmlEndIndex = endIndex + endTag.length;
          break;
        }
      }

      // If no known end tag found, fall back to the last closing tag
      if (xmlEndIndex === -1) {
        xmlEndIndex = this.findLastClosingTagEnd(text, xmlStartIndex);
        if (xmlEndIndex === -1) {
          return null;
        }
      }

      // Extract and validate the XML content
      const xmlContent = text.substring(xmlStartIndex, xmlEndIndex);
      return this.isValidXml(xmlContent) ? xmlContent : null;
    } catch (error) {
      console.error('Error extracting XML from string:', error);
      return null;
    }
  }

  /**
   * Find the earliest opening of any known element at or after `fromIndex`.
   *
   * Uses the same `<${format}` probe as `hasKnownXmlElement`, so prefixed
   * roots such as `<ubl:Invoice` are found and the result agrees with what
   * `isValidXml` later requires for declaration-less content.
   *
   * @returns Index of the `<`, or -1 if no known element opens in the text
   */
  private findKnownElementStart(text: string, fromIndex: number): number {
    let earliest = -1;
    for (const format of this.knownFormats) {
      const candidate = text.indexOf(`<${format}`, fromIndex);
      if (candidate !== -1 && (earliest === -1 || candidate < earliest)) {
        earliest = candidate;
      }
    }
    return earliest;
  }

  /**
   * Find the end (exclusive) of the last closing tag `</...>` that starts at
   * or after `fromIndex`. Works across line breaks and scans linearly.
   *
   * @returns Index just past the closing `>`, or -1 if there is none
   */
  private findLastClosingTagEnd(text: string, fromIndex: number): number {
    let openIndex = text.lastIndexOf('</');
    while (openIndex !== -1 && openIndex >= fromIndex) {
      const closeIndex = text.indexOf('>', openIndex + 2);
      // Require at least one character between '</' and '>'
      if (closeIndex > openIndex + 2) {
        return closeIndex + 1;
      }
      // lastIndexOf clamps a negative position to 0, so stop explicitly
      openIndex = openIndex > 0 ? text.lastIndexOf('</', openIndex - 1) : -1;
    }
    return -1;
  }

  /**
   * Decompress and decode XML content from a PDF stream
   * @param stream PDF stream containing XML data
   * @param fileName Name of the file (for logging)
   * @returns XML content or null if not valid
   */
  protected async extractXmlFromStream(stream: PDFRawStream, fileName: string): Promise<string | null> {
    try {
      // Get the raw bytes from the stream
      const rawBytes = stream.getContents();

      // First try without decompression (in case the content is not compressed)
      let xmlContent = this.tryDecodeBuffer(rawBytes);
      if (xmlContent && this.isValidXml(xmlContent)) {
        console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
        return xmlContent;
      }

      // Try with decompression
      try {
        const decompressedBytes = this.tryDecompress(rawBytes);
        if (decompressedBytes) {
          xmlContent = this.tryDecodeBuffer(decompressedBytes);
          if (xmlContent && this.isValidXml(xmlContent)) {
            console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`);
            return xmlContent;
          }
        }
      } catch (decompressError) {
        // Decompression problems are non-fatal: report and fall through to null
        console.log(`Decompression failed for ${fileName}: ${decompressError}`);
      }

      return null;
    } catch (error) {
      console.error('Error extracting XML from stream:', error);
      return null;
    }
  }

  /**
   * Try to decompress a buffer using different methods
   * @param buffer Buffer to decompress
   * @returns Decompressed buffer or null if decompression failed
   */
  protected tryDecompress(buffer: Uint8Array): Uint8Array | null {
    try {
      // Try pako inflate (for deflate/zlib compression)
      return pako.inflate(buffer);
    } catch (error) {
      // If pako fails, try other methods if needed
      console.warn('Pako decompression failed, might be uncompressed or using a different algorithm');
      return null;
    }
  }

  /**
   * Try to decode a buffer to a string using different encodings.
   *
   * UTF-8 is decoded strictly. If the bytes are not well-formed UTF-8, or the
   * result does not look like XML, the buffer is decoded as ISO-8859-1
   * instead. A lenient UTF-8 decode would turn Latin-1 bytes into U+FFFD and
   * the fallback could never produce a different result.
   *
   * @param buffer Buffer to decode
   * @returns Decoded string or null if decoding failed
   */
  protected tryDecodeBuffer(buffer: Uint8Array): string | null {
    try {
      // Try UTF-8 first (fatal mode: malformed input throws instead of
      // being replaced with U+FFFD)
      let utf8Content: string | null = null;
      try {
        utf8Content = new TextDecoder('utf-8', { fatal: true }).decode(buffer);
      } catch {
        utf8Content = null;
      }
      if (utf8Content !== null && this.isPlausibleXml(utf8Content)) {
        return utf8Content;
      }

      // Try ISO-8859-1 (Latin1)
      const latin1Content = this.decodeLatin1(buffer);
      if (this.isPlausibleXml(latin1Content)) {
        return latin1Content;
      }

      return null;
    } catch (error) {
      console.warn('Error decoding buffer:', error);
      return null;
    }
  }

  /**
   * Decode a buffer using ISO-8859-1 (Latin1) encoding.
   * Each byte maps directly to the code point with the same value.
   * @param buffer Buffer to decode
   * @returns Decoded string
   */
  protected decodeLatin1(buffer: Uint8Array): string {
    return Array.from(buffer)
      .map((byte) => String.fromCharCode(byte))
      .join('');
  }

  /**
   * Check if a string is plausibly XML (quick check before validation)
   * @param content String to check
   * @returns True if the string is plausibly XML
   */
  protected isPlausibleXml(content: string): boolean {
    if (!content.includes('<') || !content.includes('>')) {
      return false;
    }
    return content.includes('<?xml') || this.knownFormats.some((format) => content.includes(format));
  }
}
|