// xinvoice/ts/formats/pdf/pdf.extractor.ts

import {
BaseXMLExtractor,
StandardXMLExtractor,
AssociatedFilesExtractor,
TextXMLExtractor
} from './extractors/index.js';
2025-04-03 15:53:08 +00:00
/**
 * Main PDF extractor class that orchestrates the extraction process.
 *
 * Holds an ordered chain of specialized extractors and asks each one in turn
 * for the XML embedded in a PDF, returning the first non-empty result.
 */
export class PDFExtractor {
  /** Extractor chain, in the order in which the methods are attempted. */
  private readonly extractors: BaseXMLExtractor[] = [];

  /**
   * Initializes the chain of extractors.
   */
  constructor() {
    // Add extractors in order of preference/likelihood of success
    this.extractors.push(
      new StandardXMLExtractor(), // Standard PDF/A-3 embedded files
      new AssociatedFilesExtractor(), // Associated files (ZUGFeRD v1, some Factur-X)
      new TextXMLExtractor() // Text-based extraction (fallback)
    );
  }

  /**
   * Extract XML from a PDF buffer.
   *
   * Tries every extractor of the chain in sequence and stops at the first one
   * that yields a truthy result. An extractor that throws is logged and
   * treated like an unsuccessful attempt, so the remaining fallbacks still run.
   * This method never rejects.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    console.log('Starting XML extraction from PDF...');

    // Try each extractor in sequence
    for (const extractor of this.extractors) {
      const extractorName = extractor.constructor.name;
      console.log(`Trying extraction with ${extractorName}...`);

      try {
        const xml = await extractor.extractXml(pdfBuffer);
        if (xml) {
          console.log(`Successfully extracted XML using ${extractorName}`);
          return xml;
        }
        console.log(`Extraction with ${extractorName} failed, trying next method...`);
      } catch (error) {
        // Isolate the failure to this extractor: one broken strategy must not
        // prevent the later fallbacks from being attempted.
        console.error(`Error extracting XML from PDF with ${extractorName}:`, error);
      }
    }

    // If all extractors fail, return null
    console.warn('All extraction methods failed, no valid XML found in PDF');
    return null;
  }
}