feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic
This commit is contained in:
55
ts/formats/pdf/extractors/text.extractor.ts
Normal file
55
ts/formats/pdf/extractors/text.extractor.ts
Normal file
@ -0,0 +1,55 @@
|
||||
import { BaseXMLExtractor } from './base.extractor.js';
|
||||
|
||||
/**
 * Text-based XML extractor for PDF documents.
 *
 * Decodes the leading bytes of the PDF as UTF-8 text and searches that text
 * for well-known invoice XML markers (XML declaration, CII and UBL root
 * elements). Used as a fallback when other extraction methods fail.
 */
export class TextXMLExtractor extends BaseXMLExtractor {
  /**
   * Extract XML from a PDF buffer by searching for XML patterns in the text.
   *
   * Patterns are tried in declaration order. For each pattern that matches,
   * the candidate XML is cut out starting at the match position and checked
   * with the `extractXmlFromString` / `isValidXml` helpers reached through
   * `this` (not defined in this class). The first candidate that passes is
   * returned; otherwise the next pattern is tried.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content, or null if no valid XML was found or an error was
   *          thrown while searching (errors are logged, not rethrown)
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      // Only the leading part of the PDF is scanned; XML located beyond this
      // many bytes is not found by this extractor.
      const maxSearchBytes = 50000;

      // `subarray` creates a view without copying and clamps the end to the
      // buffer length, so `Buffer.from` copies at most `maxSearchBytes` bytes
      // instead of duplicating the whole PDF before decoding.
      const searchWindow = Buffer.from(pdfBuffer.subarray(0, maxSearchBytes));
      const pdfString = searchWindow.toString('utf8');

      // Look for common XML patterns in the PDF
      const xmlPatterns = [
        /<\?xml[^>]*\?>/i,
        /<CrossIndustryInvoice[^>]*>/i,
        /<CrossIndustryDocument[^>]*>/i,
        /<Invoice[^>]*>/i,
        /<CreditNote[^>]*>/i,
        /<rsm:CrossIndustryInvoice[^>]*>/i,
        /<rsm:CrossIndustryDocument[^>]*>/i,
        /<ram:CrossIndustryDocument[^>]*>/i,
        /<ubl:Invoice[^>]*>/i,
        /<ubl:CreditNote[^>]*>/i
      ];

      for (const pattern of xmlPatterns) {
        const match = pdfString.match(pattern);
        if (match && match.index !== undefined) {
          console.log(`Found XML pattern in PDF: ${match[0]}`);

          // Try to extract the XML content starting at the match position
          const xmlContent = this.extractXmlFromString(pdfString, match.index);
          if (xmlContent && this.isValidXml(xmlContent)) {
            console.log('Successfully extracted XML from PDF text');
            return xmlContent;
          }
        }
      }

      console.warn('No valid XML found in PDF text');
      return null;
    } catch (error) {
      // Best-effort fallback: report the failure and signal "not found"
      console.error('Error in text-based extraction:', error);
      return null;
    }
  }
}
|
Reference in New Issue
Block a user