import { BaseXMLExtractor } from './base.extractor.js';

/**
 * Text-based XML extractor for PDF documents.
 *
 * Scans the raw PDF bytes for known XML start markers and, on a hit, delegates
 * to `extractXmlFromString` and `isValidXml` (called on `this`; not defined in
 * this class, so presumably inherited from BaseXMLExtractor).
 * Used as a fallback when other extraction methods fail.
 *
 * The buffer is examined in windows of CHUNK_SIZE bytes (plus a small
 * look-ahead) so that only a bounded portion of a large PDF is decoded to a
 * string at any one time.
 */
export class TextXMLExtractor extends BaseXMLExtractor {
  // Maximum chunk size to process at once (4MB)
  private readonly CHUNK_SIZE = 4 * 1024 * 1024;

  // Maximum number of chunks to check (effective 20MB search limit, plus the
  // look-ahead of the final chunk)
  private readonly MAX_CHUNKS = 5;

  // Bytes decoded past the end of each chunk. Without this look-ahead, a start
  // marker or an XML document that straddles a chunk boundary is cut in half
  // and can never be extracted. The size is a heuristic: documents that begin
  // within a chunk and end more than this far past its boundary are still
  // truncated.
  private readonly CHUNK_OVERLAP = 256 * 1024;

  // Number of bytes converted per String.fromCharCode call; kept small so the
  // spread argument list stays well below engine call-argument limits.
  private readonly LATIN1_BATCH_SIZE = 8192;

  // Common XML patterns to look for, tried in this order.
  // NOTE(review): in the copy of the file under review, everything between the
  // opening quote of the first entry and the body of the public entry point was
  // lost (looks like markup stripping). The entries below are a reconstruction
  // of typical e-invoice root elements — confirm against version control.
  private readonly XML_PATTERNS = [
    '<?xml',
    '<CrossIndustryInvoice',
    '<CrossIndustryDocument',
    '<Invoice',
    '<CreditNote',
    '<rsm:CrossIndustryInvoice',
    '<rsm:CrossIndustryDocument',
    '<ram:CrossIndustryDocument',
    '<ubl:Invoice',
    '<ubl:CreditNote',
    '<FatturaElettronica'
  ];

  /**
   * Extract XML from a PDF buffer by searching its bytes for XML patterns.
   *
   * NOTE(review): the method name, parameter type and `Promise` wrapper are
   * reconstructed (only the trailing `... | null>` of the signature and the
   * `pdfBuffer` identifier survived) — confirm they match the declaration in
   * BaseXMLExtractor.
   *
   * @param pdfBuffer PDF file contents
   * @returns XML content, or null if nothing valid was found or an error occurred
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      console.log('Attempting text-based XML extraction from PDF...');

      // Normalise to a plain Uint8Array. For a Buffer this creates a view over
      // the same memory instead of copying the whole PDF; the data is only read.
      const buffer = Buffer.isBuffer(pdfBuffer)
        ? new Uint8Array(pdfBuffer.buffer, pdfBuffer.byteOffset, pdfBuffer.byteLength)
        : pdfBuffer;

      // Try extracting XML using the chunked approach
      return this.extractXmlFromBufferChunked(buffer);
    } catch (error) {
      console.error('Error in text-based extraction:', error);
      return null;
    }
  }

  /**
   * Extract XML from buffer using a chunked approach.
   * Only the first MAX_CHUNKS chunks are examined; the first chunk that yields
   * validated XML wins.
   * @param buffer Buffer to search in
   * @returns XML content or null if not found
   */
  private extractXmlFromBufferChunked(buffer: Uint8Array): string | null {
    for (let chunkIndex = 0; chunkIndex < this.MAX_CHUNKS; chunkIndex++) {
      const startPos = chunkIndex * this.CHUNK_SIZE;
      if (startPos >= buffer.length) break;

      // Extend the window by CHUNK_OVERLAP so content crossing the boundary
      // is seen in one piece.
      const endPos = Math.min(startPos + this.CHUNK_SIZE + this.CHUNK_OVERLAP, buffer.length);

      // subarray() is a zero-copy view; slice() would duplicate the bytes.
      const chunk = buffer.subarray(startPos, endPos);

      // Try to extract XML from this chunk
      const chunkResult = this.processChunk(chunk, startPos);
      if (chunkResult) {
        return chunkResult;
      }
    }

    console.warn('No valid XML found in any chunk of the PDF');
    return null;
  }

  /**
   * Process a single chunk of the PDF buffer.
   * Tries a UTF-8 decode first and falls back to Latin-1; errors are logged
   * and reported as "not found" so the caller can continue with the next chunk.
   * @param chunk Chunk buffer to process
   * @param chunkOffset Offset position of the chunk in the original buffer (used for logging)
   * @returns XML content or null if not found
   */
  private processChunk(chunk: Uint8Array, chunkOffset: number): string | null {
    try {
      // First try UTF-8 encoding for this chunk
      const utf8String = this.decodeBufferToString(chunk, 'utf-8');
      let xmlContent = this.searchForXmlInString(utf8String);
      if (xmlContent) {
        console.log(`Found XML content in chunk at offset ${chunkOffset} using UTF-8 encoding`);
        return xmlContent;
      }

      // If UTF-8 fails, try Latin-1 (ISO-8859-1) which can handle binary better
      const latin1String = this.decodeBufferToString(chunk, 'latin1');
      xmlContent = this.searchForXmlInString(latin1String);
      if (xmlContent) {
        console.log(`Found XML content in chunk at offset ${chunkOffset} using Latin-1 encoding`);
        return xmlContent;
      }

      // No XML found in this chunk
      return null;
    } catch (error) {
      console.warn(`Error processing chunk at offset ${chunkOffset}:`, error);
      return null;
    }
  }

  /**
   * Safely decode a buffer to string using the specified encoding.
   * @param buffer Buffer to decode
   * @param encoding Encoding to use ('utf-8' or 'latin1')
   * @returns Decoded string, or an empty string if decoding throws
   */
  private decodeBufferToString(buffer: Uint8Array, encoding: 'utf-8' | 'latin1'): string {
    try {
      if (encoding === 'utf-8') {
        // fatal: false => malformed sequences become U+FFFD instead of throwing
        return new TextDecoder('utf-8', { fatal: false }).decode(buffer);
      }

      // Latin-1: bytes 0-255 map directly to code points 0-255. TextDecoder is
      // deliberately not used here: the WHATWG Encoding Standard treats the
      // "latin1" label as windows-1252, which remaps bytes 0x80-0x9F.
      // Converting in batches avoids materialising one array element per byte
      // of the chunk.
      const parts: string[] = [];
      for (let offset = 0; offset < buffer.length; offset += this.LATIN1_BATCH_SIZE) {
        const batch = buffer.subarray(offset, offset + this.LATIN1_BATCH_SIZE);
        parts.push(String.fromCharCode(...batch));
      }
      return parts.join('');
    } catch (error) {
      console.warn(`Error decoding buffer using ${encoding}:`, error);
      // Return empty string on error to allow processing to continue
      return '';
    }
  }

  /**
   * Search for XML patterns in a string.
   * For each pattern, only its first occurrence in `content` is tried.
   * @param content String to search in
   * @returns XML content or null if not found
   */
  private searchForXmlInString(content: string): string | null {
    if (!content) return null;

    // Search for each XML pattern
    for (const pattern of this.XML_PATTERNS) {
      const patternIndex = content.indexOf(pattern);
      if (patternIndex !== -1) {
        console.log(`Found XML pattern "${pattern}" at position ${patternIndex}`);

        // Try to extract the XML content starting from the pattern position
        const xmlContent = this.extractXmlFromString(content, patternIndex);

        // Validate the extracted content
        if (xmlContent && this.isValidXml(xmlContent)) {
          console.log('Successfully extracted and validated XML from text');
          return xmlContent;
        }
      }
    }

    return null;
  }
}