// xinvoice/ts/formats/pdf/pdf.extractor.ts

import {
BaseXMLExtractor,
StandardXMLExtractor,
AssociatedFilesExtractor,
TextXMLExtractor
} from './extractors/index.js';
import { FormatDetector } from '../utils/format.detector.js';
import { InvoiceFormat } from '../../interfaces/common.js';
/**
 * Failure categories reported by PDF XML extraction.
 * Every member's value is the human-readable description of that category.
 */
export enum PDFExtractError {
  /** Used by PDFExtractor for unexpected errors raised during extraction. */
  EXTRACT_ERROR = 'XML extraction failed',

  /** Used by PDFExtractor when the supplied PDF buffer is empty or undefined. */
  INVALID_INPUT = 'Invalid input parameters',

  /** Used by PDFExtractor when none of its extractors yields any XML. */
  NO_XML_FOUND = 'No XML found in PDF'
}
/**
 * Outcome of a PDF extraction operation.
 *
 * PDFExtractor fills `xml`, `format` and `extractorUsed` when `success` is
 * true, and fills `error` when `success` is false.
 */
export interface PDFExtractResult {
  /** Whether XML was extracted from the PDF. */
  success: boolean;

  /** The extracted XML document. */
  xml?: string;

  /** Invoice format detected for the extracted XML. */
  format?: InvoiceFormat;

  /** Constructor name of the extractor that produced `xml`. */
  extractorUsed?: string;

  /** Details of the failure. */
  error?: {
    /** Category of the failure. */
    type: PDFExtractError;

    /** Human-readable explanation of what went wrong. */
    message: string;

    /** The underlying Error, when one was caught. */
    originalError?: Error;
  };
}
/**
 * Main PDF extractor class that orchestrates the extraction process.
 * Uses multiple specialized extractors in sequence to maximize success rate.
 */
export class PDFExtractor {
  /** Extraction strategies, tried in array order by {@link extractXml}. */
  private readonly extractors: BaseXMLExtractor[];

  /**
   * Constructor initializes the chain of extractors.
   */
  constructor() {
    // Listed in order of preference/likelihood of success.
    this.extractors = [
      new StandardXMLExtractor(), // Standard PDF/A-3 embedded files
      new AssociatedFilesExtractor(), // Associated files (ZUGFeRD v1, some Factur-X)
      new TextXMLExtractor() // Text-based extraction (fallback)
    ];
  }

  /**
   * Extract XML from a PDF buffer.
   * Tries each extraction method in sequence and stops at the first one that
   * returns XML. Failures are reported through the returned result: the whole
   * body runs inside a try/catch that converts errors into an error result.
   *
   * @param pdfBuffer PDF buffer
   * @returns Result with either the extracted XML or error information
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<PDFExtractResult> {
    try {
      console.log('Starting XML extraction from PDF...');

      // Validate input
      if (!pdfBuffer || pdfBuffer.length === 0) {
        return this.createErrorResult(PDFExtractError.INVALID_INPUT, 'PDF buffer is empty or undefined');
      }

      // Ensure the extractors receive a plain Uint8Array. The `typeof` guard
      // keeps this line from throwing a ReferenceError in environments that
      // have no global `Buffer`; a Uint8Array input is then passed through.
      const pdfBufferArray: Uint8Array =
        typeof Buffer !== 'undefined' && Buffer.isBuffer(pdfBuffer)
          ? new Uint8Array(pdfBuffer)
          : pdfBuffer;

      // Try each extractor in sequence
      for (const extractor of this.extractors) {
        const extractorName = extractor.constructor.name;
        console.log(`Trying extraction with ${extractorName}...`);

        try {
          const xml = await extractor.extractXml(pdfBufferArray);

          if (xml) {
            console.log(`Successfully extracted XML using ${extractorName}`);

            // Detect format of the extracted XML
            const format = FormatDetector.detectFormat(xml);

            return {
              success: true,
              xml,
              format,
              extractorUsed: extractorName
            };
          }

          console.log(`Extraction with ${extractorName} failed, trying next method...`);
        } catch (error: unknown) {
          // A throwing extractor must not abort the chain: log and move on.
          console.warn(`Error using ${extractorName}: ${PDFExtractor.describeError(error)}`);
        }
      }

      // If all extractors fail, return a no XML found error
      return this.createErrorResult(
        PDFExtractError.NO_XML_FOUND,
        'All extraction methods failed, no valid XML found in PDF'
      );
    } catch (error: unknown) {
      // Handle any unexpected errors
      return this.createErrorResult(
        PDFExtractError.EXTRACT_ERROR,
        `Unexpected error during XML extraction: ${PDFExtractor.describeError(error)}`,
        error instanceof Error ? error : undefined
      );
    }
  }

  /**
   * Turn a caught value into a printable message.
   * @param error Value caught in a `catch` clause
   * @returns The `message` of an Error, otherwise the value converted with `String`
   */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /**
   * Create a PDF extract result with error information.
   * Also writes the error (and the original error, if given) to `console.error`.
   * @param type Error type
   * @param message Error message
   * @param originalError Original error object
   * @returns Error result
   */
  private createErrorResult(
    type: PDFExtractError,
    message: string,
    originalError?: Error
  ): PDFExtractResult {
    console.error(`PDF Extractor Error (${type}): ${message}`);

    if (originalError) {
      console.error(originalError);
    }

    return {
      success: false,
      error: {
        type,
        message,
        originalError
      }
    };
  }
}