2025-04-03 20:08:02 +00:00
|
|
|
import {
|
|
|
|
BaseXMLExtractor,
|
|
|
|
StandardXMLExtractor,
|
|
|
|
AssociatedFilesExtractor,
|
|
|
|
TextXMLExtractor
|
|
|
|
} from './extractors/index.js';
|
2025-04-03 15:53:08 +00:00
|
|
|
|
|
|
|
/**
 * Main PDF extractor class that orchestrates the extraction process.
 *
 * Holds an ordered chain of specialized extractors and asks each one in turn
 * for the embedded XML, stopping at the first extractor that yields a
 * non-empty result.
 */
export class PDFExtractor {
  /** Extractors in the order they are tried (most preferred first). */
  private readonly extractors: BaseXMLExtractor[] = [];

  /**
   * Constructor initializes the chain of extractors.
   */
  constructor() {
    // Add extractors in order of preference/likelihood of success
    this.extractors.push(
      new StandardXMLExtractor(), // Standard PDF/A-3 embedded files
      new AssociatedFilesExtractor(), // Associated files (ZUGFeRD v1, some Factur-X)
      new TextXMLExtractor() // Text-based extraction (fallback)
    );
  }

  /**
   * Extract XML from a PDF buffer.
   *
   * Tries every configured extractor in sequence. An extractor that throws
   * (or whose promise rejects) is logged and treated like an extractor that
   * found nothing, so the remaining fallback extractors still get their turn.
   * This method itself never rejects.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content from the first extractor that produces a truthy
   *   result, or null if none of them does
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    console.log('Starting XML extraction from PDF...');

    // Try each extractor in sequence
    for (const extractor of this.extractors) {
      const extractorName = extractor.constructor.name;
      console.log(`Trying extraction with ${extractorName}...`);

      try {
        const xml = await extractor.extractXml(pdfBuffer);

        // Truthiness check on purpose: an empty string counts as "nothing found".
        if (xml) {
          console.log(`Successfully extracted XML using ${extractorName}`);
          return xml;
        }

        console.log(`Extraction with ${extractorName} failed, trying next method...`);
      } catch (error) {
        // Isolate the failure to this extractor; a crash in one strategy must
        // not prevent the remaining fallbacks from running.
        console.error(`Error extracting XML from PDF with ${extractorName}:`, error);
      }
    }

    // If all extractors fail, return null
    console.warn('All extraction methods failed, no valid XML found in PDF');
    return null;
  }
}
|