2025-04-03 20:08:02 +00:00
|
|
|
import { BaseXMLExtractor } from './base.extractor.js';
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Text-based XML extractor for PDF documents
|
|
|
|
* Extracts XML by searching for XML patterns in the PDF text
|
|
|
|
* Used as a fallback when other extraction methods fail
|
|
|
|
*/
|
|
|
|
export class TextXMLExtractor extends BaseXMLExtractor {
  /** Distance in bytes between the start of two consecutive search windows (4 MB). */
  private readonly CHUNK_SIZE = 4 * 1024 * 1024;

  /** Maximum number of windows to inspect; window starts therefore stay below 20 MB. */
  private readonly MAX_CHUNKS = 5;

  /**
   * Extra bytes appended to every window (1 MB).
   * Without an overlap, a start pattern or an XML document that straddles a
   * window boundary is never seen in one piece. With it, any document no larger
   * than the overlap is fully contained in at least one window.
   */
  private readonly CHUNK_OVERLAP = 1024 * 1024;

  /** Upper bound on how many occurrences of a single pattern are tried per decoded window. */
  private readonly MAX_ATTEMPTS_PER_PATTERN = 32;

  /** Bytes converted per String.fromCharCode call; kept small to stay under engine argument limits. */
  private readonly LATIN1_BATCH_SIZE = 8192;

  /** Start markers to look for, tried in this order. */
  private readonly XML_PATTERNS = [
    '<?xml',
    '<CrossIndustryInvoice',
    '<CrossIndustryDocument',
    '<Invoice',
    '<CreditNote',
    '<rsm:CrossIndustryInvoice',
    '<rsm:CrossIndustryDocument',
    '<ram:CrossIndustryDocument',
    '<ubl:Invoice',
    '<ubl:CreditNote',
    '<FatturaElettronica'
  ];

  /**
   * Extract XML from a PDF buffer by searching for XML patterns in the text.
   * The buffer is scanned in overlapping windows so that only a bounded part
   * of a large file is decoded at any time.
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found (errors are logged and also yield null)
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      console.log('Attempting text-based XML extraction from PDF...');

      // A Node Buffer is itself a Uint8Array, so it is scanned as-is: no copy of
      // the whole PDF and no runtime dependency on the `Buffer` global.
      return this.extractXmlFromBufferChunked(pdfBuffer);
    } catch (error) {
      console.error('Error in text-based extraction:', error);
      return null;
    }
  }

  /**
   * Walk the buffer window by window and return the first XML document found.
   * Windows start every CHUNK_SIZE bytes and extend CHUNK_OVERLAP bytes past
   * the next start, so content crossing a boundary is still seen in one piece.
   * @param buffer Buffer to search in
   * @returns XML content or null if not found
   */
  private extractXmlFromBufferChunked(buffer: Uint8Array): string | null {
    for (let chunkIndex = 0; chunkIndex < this.MAX_CHUNKS; chunkIndex++) {
      const startPos = chunkIndex * this.CHUNK_SIZE;
      if (startPos >= buffer.length) break;

      const endPos = Math.min(startPos + this.CHUNK_SIZE + this.CHUNK_OVERLAP, buffer.length);

      // subarray() creates a view on the same memory; slice() would copy the window.
      const chunk = buffer.subarray(startPos, endPos);

      const chunkResult = this.processChunk(chunk, startPos);
      if (chunkResult) {
        return chunkResult;
      }
    }

    console.warn('No valid XML found in any chunk of the PDF');
    return null;
  }

  /**
   * Process a single window of the PDF buffer.
   * UTF-8 is tried first; Latin-1 is decoded only if the UTF-8 pass finds nothing.
   * @param chunk Window of bytes to process
   * @param chunkOffset Offset position of the window in the original buffer (used for logging)
   * @returns XML content or null if not found or if processing threw
   */
  private processChunk(chunk: Uint8Array, chunkOffset: number): string | null {
    const encodings = [
      { encoding: 'utf-8', label: 'UTF-8' },
      { encoding: 'latin1', label: 'Latin-1' }
    ] as const;

    try {
      for (const { encoding, label } of encodings) {
        const decoded = this.decodeBufferToString(chunk, encoding);
        const xmlContent = this.searchForXmlInString(decoded);

        if (xmlContent) {
          console.log(`Found XML content in chunk at offset ${chunkOffset} using ${label} encoding`);
          return xmlContent;
        }
      }

      // No XML found in this window
      return null;
    } catch (error) {
      console.warn(`Error processing chunk at offset ${chunkOffset}:`, error);
      return null;
    }
  }

  /**
   * Safely decode a buffer to string using the specified encoding.
   * @param buffer Buffer to decode
   * @param encoding Encoding to use ('utf-8' or 'latin1')
   * @returns Decoded string, or an empty string if decoding threw
   */
  private decodeBufferToString(buffer: Uint8Array, encoding: 'utf-8' | 'latin1'): string {
    try {
      if (encoding === 'utf-8') {
        // Non-fatal mode: malformed sequences become U+FFFD instead of throwing.
        return new TextDecoder('utf-8', { fatal: false }).decode(buffer);
      }

      // Latin-1: every byte 0-255 maps to the code point of the same value.
      // This is done by hand because TextDecoder's 'latin1' label resolves to
      // windows-1252, which remaps bytes 0x80-0x9F. Converting in small batches
      // avoids building one array entry and one string per byte of the window.
      const parts: string[] = [];
      for (let offset = 0; offset < buffer.length; offset += this.LATIN1_BATCH_SIZE) {
        const batch = buffer.subarray(offset, offset + this.LATIN1_BATCH_SIZE);
        parts.push(String.fromCharCode.apply(null, Array.from(batch)));
      }
      return parts.join('');
    } catch (error) {
      console.warn(`Error decoding buffer using ${encoding}:`, error);
      // Return empty string on error to allow processing to continue
      return '';
    }
  }

  /**
   * Search for XML patterns in a string.
   * Patterns are tried in XML_PATTERNS order. For each pattern, successive
   * occurrences are tried (up to MAX_ATTEMPTS_PER_PATTERN), so an early hit
   * that fails extraction or validation does not hide a later valid document.
   * Extraction and validation are delegated to `extractXmlFromString` and
   * `isValidXml`, which are not defined in this class (presumably inherited
   * from BaseXMLExtractor; verify there).
   * @param content String to search in
   * @returns XML content or null if not found
   */
  private searchForXmlInString(content: string): string | null {
    if (!content) return null;

    for (const pattern of this.XML_PATTERNS) {
      let attempts = 0;
      let patternIndex = content.indexOf(pattern);

      while (patternIndex !== -1 && attempts < this.MAX_ATTEMPTS_PER_PATTERN) {
        attempts++;
        console.log(`Found XML pattern "${pattern}" at position ${patternIndex}`);

        // Try to extract the XML content starting from the pattern position
        const xmlContent = this.extractXmlFromString(content, patternIndex);

        // Validate the extracted content
        if (xmlContent && this.isValidXml(xmlContent)) {
          console.log('Successfully extracted and validated XML from text');
          return xmlContent;
        }

        // This occurrence was unusable; continue after it.
        patternIndex = content.indexOf(pattern, patternIndex + pattern.length);
      }
    }

    return null;
  }
}
|