xinvoice/ts/formats/pdf/extractors/text.extractor.ts

56 lines
1.9 KiB
TypeScript

import { BaseXMLExtractor } from './base.extractor.js';
/**
 * Text-based XML extractor for PDF documents.
 *
 * Decodes the leading bytes of the PDF as UTF-8 text and scans that text for
 * the opening markers of known invoice XML documents. Intended as a fallback
 * when other extraction methods fail.
 */
export class TextXMLExtractor extends BaseXMLExtractor {
  /**
   * Upper bound on how many leading bytes of the PDF are decoded and searched.
   * XML that starts beyond this offset is not seen by this extractor.
   */
  private static readonly MAX_SEARCH_BYTES = 50000;

  /**
   * Markers that may indicate the start of an embedded XML document.
   * They are tried in this order and all match case-insensitively.
   */
  private static readonly XML_START_PATTERNS: readonly RegExp[] = [
    /<\?xml[^>]*\?>/i,
    /<CrossIndustryInvoice[^>]*>/i,
    /<CrossIndustryDocument[^>]*>/i,
    /<Invoice[^>]*>/i,
    /<CreditNote[^>]*>/i,
    /<rsm:CrossIndustryInvoice[^>]*>/i,
    /<rsm:CrossIndustryDocument[^>]*>/i,
    /<ram:CrossIndustryDocument[^>]*>/i,
    /<ubl:Invoice[^>]*>/i,
    /<ubl:CreditNote[^>]*>/i
  ];

  /**
   * Extract XML from a PDF buffer by searching for XML patterns in its text.
   *
   * For each known marker, the first occurrence in the decoded text is handed
   * to `extractXmlFromString`, and the result is accepted only if `isValidXml`
   * approves it (both are inherited helpers, presumably from BaseXMLExtractor;
   * their exact behaviour is not visible in this file). The first accepted
   * candidate wins.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content, or null if nothing valid was found or an error occurred
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      const searchLength = Math.min(pdfBuffer.length, TextXMLExtractor.MAX_SEARCH_BYTES);

      // Wrap the caller's memory rather than calling Buffer.from(pdfBuffer):
      // that form copies the whole PDF even though only its head is decoded.
      const pdfString = Buffer.from(
        pdfBuffer.buffer,
        pdfBuffer.byteOffset,
        searchLength
      ).toString('utf8');

      for (const pattern of TextXMLExtractor.XML_START_PATTERNS) {
        // The patterns are non-global, so exec() reports the first occurrence.
        const match = pattern.exec(pdfString);
        if (match === null) {
          continue;
        }

        console.log(`Found XML pattern in PDF: ${match[0]}`);

        // Try to extract the XML content starting at the matched marker
        const xmlContent = this.extractXmlFromString(pdfString, match.index);
        if (xmlContent && this.isValidXml(xmlContent)) {
          console.log('Successfully extracted XML from PDF text');
          return xmlContent;
        }
      }

      console.warn('No valid XML found in PDF text');
      return null;
    } catch (error) {
      // Best-effort fallback: report the failure and signal "not found".
      console.error('Error in text-based extraction:', error);
      return null;
    }
  }
}