fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata.

This commit is contained in:
2025-04-04 12:14:41 +00:00
parent 68fd50fd4c
commit 5d43c1ce4e
15 changed files with 1957 additions and 418 deletions

View File

@ -11,7 +11,10 @@ export abstract class BaseXMLExtractor {
'factur-x.xml',
'zugferd-invoice.xml',
'ZUGFeRD-invoice.xml',
'xrechnung.xml'
'xrechnung.xml',
'ubl-invoice.xml',
'invoice.xml',
'metadata.xml'
];
/**
@ -32,7 +35,8 @@ export abstract class BaseXMLExtractor {
'urn:zugferd',
'urn:factur-x',
'factur-x.eu',
'ZUGFeRD'
'ZUGFeRD',
'FatturaElettronica'
];
/**
@ -47,7 +51,8 @@ export abstract class BaseXMLExtractor {
'</rsm:CrossIndustryDocument>',
'</ram:CrossIndustryDocument>',
'</ubl:Invoice>',
'</ubl:CreditNote>'
'</ubl:CreditNote>',
'</FatturaElettronica>'
];
/**
@ -69,21 +74,19 @@ export abstract class BaseXMLExtractor {
return false;
}
// Check if it starts with XML declaration
if (!xmlString.includes('<?xml')) {
// Check if it starts with XML declaration or a valid element
if (!xmlString.includes('<?xml') && !this.hasKnownXmlElement(xmlString)) {
return false;
}
// Check if the XML string contains known invoice formats
const hasKnownFormat = this.knownFormats.some(format => xmlString.includes(format));
const hasKnownFormat = this.hasKnownFormat(xmlString);
if (!hasKnownFormat) {
return false;
}
// Check if the XML string contains binary data or invalid characters
const invalidChars = ['\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005'];
const hasBinaryData = invalidChars.some(char => xmlString.includes(char));
if (hasBinaryData) {
if (this.hasBinaryData(xmlString)) {
return false;
}
@ -92,6 +95,11 @@ export abstract class BaseXMLExtractor {
return false;
}
// Check if XML has a proper structure (contains both opening and closing tags)
if (!this.hasProperXmlStructure(xmlString)) {
return false;
}
return true;
} catch (error) {
console.error('Error validating XML:', error);
@ -99,6 +107,85 @@ export abstract class BaseXMLExtractor {
}
}
/**
 * Report whether any known format name appears directly after a `<`,
 * i.e. as the beginning of an element tag.
 * @param xmlString XML string to check
 * @returns True if `<` immediately followed by an entry of `knownFormats` occurs in the string
 */
protected hasKnownXmlElement(xmlString: string): boolean {
  // A tag opener is the literal "<" glued to the format name; no prefix handling is done here.
  return this.knownFormats.some(formatName => xmlString.includes(`<${formatName}`));
}
/**
 * Report whether the string mentions any of the known invoice format markers.
 * The match is a plain substring test, so a marker may appear anywhere
 * (element name, namespace URI, attribute value, ...).
 * @param xmlString XML string to check
 * @returns True if at least one entry of `knownFormats` is a substring of `xmlString`
 */
protected hasKnownFormat(xmlString: string): boolean {
  return this.knownFormats.some(marker => xmlString.includes(marker));
}
/**
 * Heuristic structural check: the string must contain something that looks
 * like an opening/closing tag pair or an XML declaration.
 * @param xmlString XML string to check
 * @returns True if a known start/end tag pair, a complete XML declaration,
 *          or any generic opening and closing tag is present
 */
protected hasProperXmlStructure(xmlString: string): boolean {
  // Preferred signal: one of the known root elements is both opened and closed.
  // The opening tag is derived by dropping the first "/" of the closing tag, so it
  // still ends with ">" and only matches start tags written without attributes.
  const hasKnownTagPair = this.knownEndTags.some(closingTag => {
    const openingTag = closingTag.replace('/', '');
    return xmlString.includes(openingTag) && xmlString.includes(closingTag);
  });
  if (hasKnownTagPair) {
    return true;
  }

  // Fallback 1: a complete XML declaration.
  if (xmlString.includes('<?xml') && xmlString.includes('?>')) {
    return true;
  }

  // Fallback 2: any tag-like token together with any closing-tag-like token.
  return /<[^>]+>/.test(xmlString) && /<\/[^>]+>/.test(xmlString);
}
/**
 * Check if the XML string contains binary data.
 *
 * Two signals are used:
 *  1. Any character in the range U+0000..U+0005 marks the string as binary immediately.
 *  2. Otherwise, the string is binary when more than 5% of its characters are other
 *     C0 control characters (U+0006..U+0008, U+000B, U+000C, U+000E..U+001F).
 *     Tab, line feed and carriage return are never counted.
 *
 * @param xmlString XML string to check
 * @returns True if the XML contains binary data
 */
protected hasBinaryData(xmlString: string): boolean {
  // A single low control character is enough. This also covers runs of NUL bytes,
  // so no separate "consecutive nulls" test is needed.
  if (/[\u0000-\u0005]/.test(xmlString)) {
    return true;
  }

  // U+0000..U+0005 are known to be absent here, so only the remaining
  // non-printable control characters need to be counted.
  const controlChars = xmlString.match(/[\x06-\x08\x0B\x0C\x0E-\x1F]/g);
  const controlCount = controlChars === null ? 0 : controlChars.length;

  // More than 5% non-printable characters indicates binary content.
  return controlCount > xmlString.length * 0.05;
}
/**
* Extract XML from a string
* @param text Text to extract XML from
@ -108,9 +195,22 @@ export abstract class BaseXMLExtractor {
protected extractXmlFromString(text: string, startIndex: number = 0): string | null {
try {
// Find the start of the XML document
const xmlStartIndex = text.indexOf('<?xml', startIndex);
let xmlStartIndex = text.indexOf('<?xml', startIndex);
// If no XML declaration, try to find known elements
if (xmlStartIndex === -1) {
return null;
for (const format of this.knownFormats) {
const formatStartIndex = text.indexOf(`<${format.split(':').pop()}`, startIndex);
if (formatStartIndex !== -1) {
xmlStartIndex = formatStartIndex;
break;
}
}
// Still didn't find any start marker
if (xmlStartIndex === -1) {
return null;
}
}
// Try to find the end of the XML document
@ -123,12 +223,26 @@ export abstract class BaseXMLExtractor {
}
}
// If no known end tag found, try to use a heuristic approach
if (xmlEndIndex === -1) {
return null;
// Try to find the last closing tag
const lastClosingTagMatch = text.slice(xmlStartIndex).match(/<\/[^>]+>(?!.*<\/[^>]+>)/);
if (lastClosingTagMatch && lastClosingTagMatch.index !== undefined) {
xmlEndIndex = xmlStartIndex + lastClosingTagMatch.index + lastClosingTagMatch[0].length;
} else {
return null;
}
}
// Extract the XML content
return text.substring(xmlStartIndex, xmlEndIndex);
const xmlContent = text.substring(xmlStartIndex, xmlEndIndex);
// Validate the extracted content
if (this.isValidXml(xmlContent)) {
return xmlContent;
}
return null;
} catch (error) {
console.error('Error extracting XML from string:', error);
return null;
@ -143,34 +257,99 @@ export abstract class BaseXMLExtractor {
*/
protected async extractXmlFromStream(stream: PDFRawStream, fileName: string): Promise<string | null> {
try {
// Try to decompress with pako
const compressedBytes = stream.getContents().buffer;
// Get the raw bytes from the stream
const rawBytes = stream.getContents();
// First try without decompression (in case the content is not compressed)
let xmlContent = this.tryDecodeBuffer(rawBytes);
if (xmlContent && this.isValidXml(xmlContent)) {
console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
return xmlContent;
}
// Try with decompression
try {
const decompressedBytes = pako.inflate(compressedBytes);
const xmlContent = new TextDecoder('utf-8').decode(decompressedBytes);
if (this.isValidXml(xmlContent)) {
console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`);
return xmlContent;
const decompressedBytes = this.tryDecompress(rawBytes);
if (decompressedBytes) {
xmlContent = this.tryDecodeBuffer(decompressedBytes);
if (xmlContent && this.isValidXml(xmlContent)) {
console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`);
return xmlContent;
}
}
} catch (decompressError) {
// Decompression failed, try without decompression
console.log(`Decompression failed for ${fileName}, trying without decompression...`);
console.log(`Decompression failed for ${fileName}: ${decompressError}`);
}
// Try without decompression
const rawBytes = stream.getContents();
const rawContent = new TextDecoder('utf-8').decode(rawBytes);
if (this.isValidXml(rawContent)) {
console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
return rawContent;
}
return null;
} catch (error) {
console.error('Error extracting XML from stream:', error);
return null;
}
}
}
/**
 * Attempt to inflate a buffer with pako.
 * Failures are reported with a warning and turned into a `null` result, never thrown.
 * @param buffer Buffer to decompress
 * @returns Decompressed buffer or null if decompression failed
 */
protected tryDecompress(buffer: Uint8Array): Uint8Array | null {
  let inflated: Uint8Array | null;
  try {
    // pako.inflate handles deflate/zlib compressed data
    inflated = pako.inflate(buffer);
  } catch (error) {
    // No alternative algorithm is attempted; the caller receives null.
    console.warn('Pako decompression failed, might be uncompressed or using a different algorithm');
    inflated = null;
  }
  return inflated;
}
/**
 * Try to decode a buffer to a string using different encodings.
 *
 * Decoding order:
 *  1. Strict UTF-8. A non-fatal decoder never fails and silently replaces invalid
 *     bytes with U+FFFD, which would make the Latin-1 fallback unreachable and
 *     corrupt non-ASCII characters of Latin-1 documents.
 *  2. ISO-8859-1 (Latin-1) when the bytes are not valid UTF-8.
 *  3. If the bytes are not valid UTF-8 but the document explicitly declares
 *     UTF-8, the data is treated as damaged UTF-8 and decoded leniently.
 *
 * @param buffer Buffer to decode
 * @returns Decoded string or null if no decoding yields plausible XML
 */
protected tryDecodeBuffer(buffer: Uint8Array): string | null {
  try {
    // Try UTF-8 first, rejecting malformed byte sequences instead of replacing them.
    let strictUtf8: string | null;
    try {
      strictUtf8 = new TextDecoder('utf-8', { fatal: true }).decode(buffer);
    } catch (utf8Error) {
      strictUtf8 = null;
    }
    if (strictUtf8 !== null && this.isPlausibleXml(strictUtf8)) {
      return strictUtf8;
    }

    // Try ISO-8859-1 (Latin1); this mapping cannot fail.
    const latin1 = this.decodeLatin1(buffer);
    if (!this.isPlausibleXml(latin1)) {
      return null;
    }

    // The declaration itself is ASCII, so it can be read safely from the Latin-1 view.
    const declaresUtf8 = /<\?xml[^>]*encoding\s*=\s*["']utf-?8["']/i.test(latin1);
    if (strictUtf8 === null && declaresUtf8) {
      // Declared UTF-8 with stray invalid bytes: keep the lenient UTF-8 reading.
      return new TextDecoder('utf-8').decode(buffer);
    }

    return latin1;
  } catch (error) {
    console.warn('Error decoding buffer:', error);
    return null;
  }
}
/**
 * Decode a buffer as ISO-8859-1 (Latin1) by mapping every byte value
 * to the character with the same code unit.
 * @param buffer Buffer to decode
 * @returns Decoded string, one character per input byte
 */
protected decodeLatin1(buffer: Uint8Array): string {
  const characters = Array.from(buffer, byteValue => String.fromCharCode(byteValue));
  return characters.join('');
}
/**
 * Cheap pre-filter used before full validation: does the string look like XML at all?
 * @param content String to check
 * @returns True if the string contains both `<` and `>` and either an XML
 *          declaration marker or one of the known format markers
 */
protected isPlausibleXml(content: string): boolean {
  // Without angle brackets there cannot be any markup.
  if (!content.includes('<') || !content.includes('>')) {
    return false;
  }
  // An XML declaration marker is sufficient on its own.
  if (content.includes('<?xml')) {
    return true;
  }
  // Otherwise require at least one known format marker.
  return this.knownFormats.some(marker => content.includes(marker));
}
}

View File

@ -6,50 +6,157 @@ import { BaseXMLExtractor } from './base.extractor.js';
* Used as a fallback when other extraction methods fail
*/
export class TextXMLExtractor extends BaseXMLExtractor {
// Maximum number of bytes decoded and searched at once (4 * 1024 * 1024 = 4 MiB per chunk)
private readonly CHUNK_SIZE = 4 * 1024 * 1024;
// Maximum number of chunks to check; together with CHUNK_SIZE this limits the
// text search to the first 20 MiB of the buffer
private readonly MAX_CHUNKS = 5;
// Literal start markers for embedded XML: the XML declaration plus known invoice
// root elements, with and without namespace prefixes. They are looked up as plain
// substrings and tried in the order listed here.
private readonly XML_PATTERNS = [
'<?xml',
'<CrossIndustryInvoice',
'<CrossIndustryDocument',
'<Invoice',
'<CreditNote',
'<rsm:CrossIndustryInvoice',
'<rsm:CrossIndustryDocument',
'<ram:CrossIndustryDocument',
'<ubl:Invoice',
'<ubl:CreditNote',
'<FatturaElettronica'
];
/**
* Extract XML from a PDF buffer by searching for XML patterns in the text
* Uses a chunked approach to handle large files efficiently
* @param pdfBuffer PDF buffer
* @returns XML content or null if not found
*/
public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
try {
// Convert buffer to string and look for XML patterns
// Increase the search range to handle larger PDFs
const pdfString = Buffer.from(pdfBuffer).toString('utf8', 0, Math.min(pdfBuffer.length, 50000));
// Look for common XML patterns in the PDF
const xmlPatterns = [
/<\?xml[^>]*\?>/i,
/<CrossIndustryInvoice[^>]*>/i,
/<CrossIndustryDocument[^>]*>/i,
/<Invoice[^>]*>/i,
/<CreditNote[^>]*>/i,
/<rsm:CrossIndustryInvoice[^>]*>/i,
/<rsm:CrossIndustryDocument[^>]*>/i,
/<ram:CrossIndustryDocument[^>]*>/i,
/<ubl:Invoice[^>]*>/i,
/<ubl:CreditNote[^>]*>/i
];
for (const pattern of xmlPatterns) {
const match = pdfString.match(pattern);
if (match && match.index !== undefined) {
console.log(`Found XML pattern in PDF: ${match[0]}`);
// Try to extract the XML content
const xmlContent = this.extractXmlFromString(pdfString, match.index);
if (xmlContent && this.isValidXml(xmlContent)) {
console.log('Successfully extracted XML from PDF text');
return xmlContent;
}
}
}
console.warn('No valid XML found in PDF text');
return null;
console.log('Attempting text-based XML extraction from PDF...');
// Convert Buffer to Uint8Array if needed
const buffer = Buffer.isBuffer(pdfBuffer) ? new Uint8Array(pdfBuffer) : pdfBuffer;
// Try extracting XML using the chunked approach
return this.extractXmlFromBufferChunked(buffer);
} catch (error) {
console.error('Error in text-based extraction:', error);
return null;
}
}
}
/**
 * Extract XML from buffer using a chunked approach.
 * This helps avoid memory issues with large PDFs.
 *
 * The buffer is scanned in windows of `CHUNK_SIZE` bytes that advance by half a
 * chunk, so consecutive windows overlap. With back-to-back windows, a document
 * that straddles a chunk boundary is never seen in one piece; with the overlap,
 * any document of up to half a chunk lies entirely inside at least one window.
 * Every window of the non-overlapping layout is still covered. The search is
 * limited to the first `CHUNK_SIZE * MAX_CHUNKS` bytes of the buffer.
 *
 * @param buffer Buffer to search in
 * @returns XML content or null if not found
 */
private extractXmlFromBufferChunked(buffer: Uint8Array): string | null {
  const searchLimit = Math.min(buffer.length, this.CHUNK_SIZE * this.MAX_CHUNKS);
  // Advance by half a chunk so neighbouring windows share half of their bytes.
  const stride = Math.max(1, Math.floor(this.CHUNK_SIZE / 2));

  for (let startPos = 0; startPos < searchLimit; startPos += stride) {
    const endPos = Math.min(startPos + this.CHUNK_SIZE, searchLimit);
    // subarray creates a view on the same memory instead of copying the chunk.
    const chunk = buffer.subarray(startPos, endPos);

    // Try to extract XML from this window
    const chunkResult = this.processChunk(chunk, startPos);
    if (chunkResult) {
      return chunkResult;
    }

    // This window already reached the end of the searchable range;
    // any further window would only repeat a part of it.
    if (endPos >= searchLimit) {
      break;
    }
  }

  console.warn('No valid XML found in any chunk of the PDF');
  return null;
}
/**
 * Search one chunk of the PDF buffer for embedded XML.
 * The chunk is decoded as UTF-8 first and, only if that yields nothing, as Latin-1.
 * Errors are logged as warnings and reported as "nothing found".
 * @param chunk Chunk buffer to process
 * @param chunkOffset Offset position of the chunk in the original buffer (used for log output only)
 * @returns XML content or null if not found
 */
private processChunk(chunk: Uint8Array, chunkOffset: number): string | null {
  // Encodings in the order they are attempted, with the label used in the log line.
  const attempts = [
    { encoding: 'utf-8', label: 'UTF-8' },
    { encoding: 'latin1', label: 'Latin-1' }
  ] as const;

  try {
    for (const { encoding, label } of attempts) {
      // Decoding happens lazily: Latin-1 is only produced when UTF-8 found nothing.
      const decoded = this.decodeBufferToString(chunk, encoding);
      const xmlContent = this.searchForXmlInString(decoded);
      if (xmlContent) {
        console.log(`Found XML content in chunk at offset ${chunkOffset} using ${label} encoding`);
        return xmlContent;
      }
    }
    // No XML found in this chunk
    return null;
  } catch (error) {
    console.warn(`Error processing chunk at offset ${chunkOffset}:`, error);
    return null;
  }
}
/**
 * Safely decode a buffer to string using the specified encoding.
 * Never throws: a decoding error is logged and an empty string is returned
 * so that processing can continue.
 * @param buffer Buffer to decode
 * @param encoding Encoding to use ('utf-8' or 'latin1')
 * @returns Decoded string, or '' if decoding failed
 */
private decodeBufferToString(buffer: Uint8Array, encoding: 'utf-8' | 'latin1'): string {
  try {
    if (encoding === 'utf-8') {
      // Lenient UTF-8: malformed sequences are replaced rather than rejected.
      return new TextDecoder('utf-8', { fatal: false }).decode(buffer);
    }
    // Latin-1 maps bytes 0-255 directly to code points 0-255. Reuse the inherited
    // helper instead of duplicating the byte-to-character mapping here.
    return this.decodeLatin1(buffer);
  } catch (error) {
    console.warn(`Error decoding buffer using ${encoding}:`, error);
    // Return empty string on error to allow processing to continue
    return '';
  }
}
/**
 * Search for XML patterns in a string.
 *
 * Patterns from `XML_PATTERNS` are tried in order. For each pattern, successive
 * occurrences are tried until one yields valid XML, so an unusable first hit no
 * longer hides a usable later one. The number of occurrences tried per pattern
 * is bounded to keep the cost predictable on large inputs.
 *
 * @param content String to search in
 * @returns XML content or null if not found
 */
private searchForXmlInString(content: string): string | null {
  if (!content) return null;

  // Upper bound on how many occurrences of a single pattern are examined.
  const maxAttemptsPerPattern = 16;

  // Search for each XML pattern
  for (const pattern of this.XML_PATTERNS) {
    let patternIndex = content.indexOf(pattern);

    for (let attempt = 0; patternIndex !== -1 && attempt < maxAttemptsPerPattern; attempt++) {
      console.log(`Found XML pattern "${pattern}" at position ${patternIndex}`);

      // Try to extract the XML content starting from the pattern position
      const xmlContent = this.extractXmlFromString(content, patternIndex);

      // Validate the extracted content
      if (xmlContent && this.isValidXml(xmlContent)) {
        console.log('Successfully extracted and validated XML from text');
        return xmlContent;
      }

      // This occurrence did not yield valid XML; continue after it.
      patternIndex = content.indexOf(pattern, patternIndex + pattern.length);
    }
  }

  return null;
}
}