import { BaseXMLExtractor } from './base.extractor.js';

/** Maximum number of leading PDF bytes that are decoded and searched for XML. */
const MAX_SEARCH_BYTES = 50000;

/**
 * Patterns marking a possible start of an embedded XML document, tried in order.
 *
 * NOTE(review): apart from the XML declaration, the element names in this list
 * had been stripped from the source (every entry had degraded to `/]*>/i`,
 * which matches any `>` character). The names below are a reconstruction based
 * on common e-invoice root elements (CII / ZUGFeRD / Factur-X and UBL) —
 * confirm against the formats this project is expected to support.
 */
const XML_START_PATTERNS: readonly RegExp[] = [
  /<\?xml[^>]*\?>/i,
  /<CrossIndustryInvoice[^>]*>/i,
  /<CrossIndustryDocument[^>]*>/i,
  /<Invoice[^>]*>/i,
  /<CreditNote[^>]*>/i,
  /<rsm:CrossIndustryInvoice[^>]*>/i,
  /<rsm:CrossIndustryDocument[^>]*>/i,
  /<ram:CrossIndustryDocument[^>]*>/i,
  /<ubl:Invoice[^>]*>/i,
  /<ubl:CreditNote[^>]*>/i,
];

/**
 * Text-based XML extractor for PDF documents.
 *
 * Decodes the beginning of the PDF as text and searches it for XML start
 * patterns. Intended as a fallback when other extraction methods fail.
 */
export class TextXMLExtractor extends BaseXMLExtractor {
  /**
   * Extract XML from a PDF buffer by searching for XML patterns in its text.
   *
   * Only the first {@link MAX_SEARCH_BYTES} bytes are decoded; both the pattern
   * search and the subsequent extraction operate on that decoded prefix.
   * Errors are logged and reported as `null` rather than thrown.
   *
   * @param pdfBuffer PDF file contents
   * @returns The first candidate accepted by `isValidXml`, or `null` if none is found
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      // View the caller's bytes without copying the whole PDF; a plain
      // Buffer.from(uint8Array) would duplicate the entire input.
      const bytes = Buffer.isBuffer(pdfBuffer)
        ? pdfBuffer
        : Buffer.from(pdfBuffer.buffer, pdfBuffer.byteOffset, pdfBuffer.byteLength);

      const pdfString = bytes.toString('utf8', 0, Math.min(bytes.length, MAX_SEARCH_BYTES));

      for (const pattern of XML_START_PATTERNS) {
        const match = pattern.exec(pdfString);
        if (match === null) {
          continue;
        }

        console.log(`Found XML pattern in PDF: ${match[0]}`);

        // Let the base class cut the XML document out of the text, starting
        // at the position where the pattern matched.
        const xmlContent = this.extractXmlFromString(pdfString, match.index);
        if (xmlContent && this.isValidXml(xmlContent)) {
          console.log('Successfully extracted XML from PDF text');
          return xmlContent;
        }
      }

      console.warn('No valid XML found in PDF text');
      return null;
    } catch (error) {
      // Best-effort fallback extractor: report the failure and signal "not found".
      console.error('Error in text-based extraction:', error);
      return null;
    }
  }
}