56 lines
1.9 KiB
TypeScript
56 lines
1.9 KiB
TypeScript
import { BaseXMLExtractor } from './base.extractor.js';
|
|
|
|
/**
 * Text-based XML extractor for PDF documents.
 *
 * Decodes the leading bytes of the PDF as UTF-8 text and searches that text
 * for the opening tag of a known XML document. Intended as a fallback when
 * other extraction methods fail.
 */
export class TextXMLExtractor extends BaseXMLExtractor {
  /**
   * Upper bound on how many leading bytes of the PDF are decoded and searched.
   * XML that starts beyond this window is not found by this extractor.
   */
  private static readonly MAX_SEARCH_BYTES = 50000;

  /**
   * Opening-tag patterns, tried in this order. The first pattern whose match
   * yields valid XML wins. None of the patterns is global or sticky, so they
   * hold no `lastIndex` state and can safely be shared between calls.
   */
  private static readonly XML_START_PATTERNS: readonly RegExp[] = [
    /<\?xml[^>]*\?>/i,
    /<CrossIndustryInvoice[^>]*>/i,
    /<CrossIndustryDocument[^>]*>/i,
    /<Invoice[^>]*>/i,
    /<CreditNote[^>]*>/i,
    /<rsm:CrossIndustryInvoice[^>]*>/i,
    /<rsm:CrossIndustryDocument[^>]*>/i,
    /<ram:CrossIndustryDocument[^>]*>/i,
    /<ubl:Invoice[^>]*>/i,
    /<ubl:CreditNote[^>]*>/i
  ];

  /**
   * Extract XML from a PDF buffer by searching for XML patterns in its text.
   *
   * Only the first `MAX_SEARCH_BYTES` bytes are inspected. Any error raised
   * while searching is logged and reported as "not found" rather than thrown.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content, or null if no valid XML was found or an error occurred
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      // Slice before converting so that at most MAX_SEARCH_BYTES bytes are
      // copied, instead of duplicating the whole PDF just to read its head.
      const searchLength = Math.min(pdfBuffer.length, TextXMLExtractor.MAX_SEARCH_BYTES);
      const pdfString = Buffer.from(pdfBuffer.subarray(0, searchLength)).toString('utf8');

      for (const pattern of TextXMLExtractor.XML_START_PATTERNS) {
        // Only the first occurrence of each pattern is considered.
        const match = pattern.exec(pdfString);
        if (match === null) {
          continue;
        }

        console.log(`Found XML pattern in PDF: ${match[0]}`);

        // extractXmlFromString and isValidXml are not defined in this class;
        // presumably inherited from BaseXMLExtractor (verify there).
        const xmlContent = this.extractXmlFromString(pdfString, match.index);
        if (xmlContent && this.isValidXml(xmlContent)) {
          console.log('Successfully extracted XML from PDF text');
          return xmlContent;
        }
        // Otherwise fall through and try the next pattern.
      }

      console.warn('No valid XML found in PDF text');
      return null;
    } catch (error) {
      // Best-effort fallback: report failure as "no XML" instead of throwing.
      console.error('Error in text-based extraction:', error);
      return null;
    }
  }
}
|