xinvoice/ts/formats/pdf/pdf.extractor.ts

64 lines
1.8 KiB
TypeScript

import {
BaseXMLExtractor,
StandardXMLExtractor,
AssociatedFilesExtractor,
TextXMLExtractor
} from './extractors/index.js';
/**
 * Main PDF extractor class that orchestrates the extraction process.
 *
 * Holds an ordered chain of specialized extractors and asks each one in turn
 * for the embedded XML, stopping at the first one that yields a non-empty
 * result. A failure (or exception) in one extractor never prevents the
 * remaining extractors from being tried.
 */
export class PDFExtractor {
  /** Extraction strategies, ordered by preference/likelihood of success. */
  private readonly extractors: BaseXMLExtractor[] = [];

  /**
   * Builds the chain of extractors in the order they will be attempted.
   */
  constructor() {
    this.extractors.push(
      new StandardXMLExtractor(), // Standard PDF/A-3 embedded files
      new AssociatedFilesExtractor(), // Associated files (ZUGFeRD v1, some Factur-X)
      new TextXMLExtractor() // Text-based extraction (fallback)
    );
  }

  /**
   * Extract XML from a PDF buffer.
   *
   * Tries every configured extractor in sequence and returns the first truthy
   * result. An extractor that returns a falsy value (`null`, `undefined`, or
   * an empty string) or that throws is treated as "failed" and the next
   * extractor is tried.
   *
   * This method does not reject: errors raised by individual extractors are
   * logged and swallowed so the fallback chain can continue.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content, or `null` if no extractor produced any
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    console.log('Starting XML extraction from PDF...');

    for (const extractor of this.extractors) {
      const extractorName = extractor.constructor.name;

      // The try/catch is scoped to a single extractor on purpose: an exception
      // in one strategy must fall through to the next one instead of aborting
      // the whole chain.
      try {
        console.log(`Trying extraction with ${extractorName}...`);
        const xml = await extractor.extractXml(pdfBuffer);

        if (xml) {
          console.log(`Successfully extracted XML using ${extractorName}`);
          return xml;
        }

        console.log(`Extraction with ${extractorName} failed, trying next method...`);
      } catch (error: unknown) {
        console.error(`Error extracting XML from PDF with ${extractorName}:`, error);
      }
    }

    // Every extractor either returned nothing or threw.
    console.warn('All extraction methods failed, no valid XML found in PDF');
    return null;
  }
}