141 lines
4.1 KiB
TypeScript
141 lines
4.1 KiB
TypeScript
import {
|
|
BaseXMLExtractor,
|
|
StandardXMLExtractor,
|
|
AssociatedFilesExtractor,
|
|
TextXMLExtractor
|
|
} from './extractors/index.js';
|
|
import { FormatDetector } from '../utils/format.detector.js';
|
|
import { InvoiceFormat } from '../../interfaces/common.js';
|
|
|
|
/**
 * Error categories reported by PDF extraction operations.
 *
 * Each member's string value is a human-readable label for the category.
 */
export enum PDFExtractError {
  /** An unexpected error interrupted the extraction process. */
  EXTRACT_ERROR = 'XML extraction failed',
  /** The caller supplied an unusable input, e.g. an empty or missing PDF buffer. */
  INVALID_INPUT = 'Invalid input parameters',
  /** Every extraction method ran but none produced XML. */
  NO_XML_FOUND = 'No XML found in PDF',
}
|
|
|
|
/**
 * Outcome of a PDF extraction operation.
 *
 * On success, `success` is `true` and the XML-related fields are populated;
 * on failure, `success` is `false` and `error` describes what went wrong.
 */
export interface PDFExtractResult {
  /** Whether XML was extracted from the PDF. */
  success: boolean;
  /** The extracted XML document (set on success). */
  xml?: string;
  /** Invoice format detected for the extracted XML (set on success). */
  format?: InvoiceFormat;
  /** Name of the extractor that produced the XML (set on success). */
  extractorUsed?: string;
  /** Failure details (set when `success` is `false`). */
  error?: {
    /** Category of the failure. */
    type: PDFExtractError;
    /** Human-readable description of the failure. */
    message: string;
    /** Underlying error object, when one was captured. */
    originalError?: Error;
  };
}
|
|
|
|
/**
 * Main PDF extractor class that orchestrates the extraction process.
 *
 * Runs several specialized extractors one after another and returns the
 * result of the first one that yields XML, to maximize the success rate.
 */
export class PDFExtractor {
  /** Extractors in the order they are attempted; filled once by the constructor. */
  private readonly extractors: BaseXMLExtractor[] = [];

  /**
   * Initializes the chain of extractors.
   */
  constructor() {
    // Add extractors in order of preference/likelihood of success
    this.extractors.push(
      new StandardXMLExtractor(), // Standard PDF/A-3 embedded files
      new AssociatedFilesExtractor(), // Associated files (ZUGFeRD v1, some Factur-X)
      new TextXMLExtractor() // Text-based extraction (fallback)
    );
  }

  /**
   * Extract XML from a PDF buffer.
   *
   * Tries each extraction method in sequence and stops at the first one that
   * returns a non-empty XML string. Failures are reported through the returned
   * result object rather than thrown: the whole body is wrapped in a
   * try/catch that converts unexpected errors into an error result.
   *
   * @param pdfBuffer PDF contents as a `Uint8Array` or Node.js `Buffer`
   * @returns On success: the XML, its detected format and the name of the
   *   extractor used. On failure: an error of type `INVALID_INPUT` (empty or
   *   missing buffer), `NO_XML_FOUND` (no extractor produced XML; if an
   *   extractor threw, the most recent such error is attached as
   *   `originalError`) or `EXTRACT_ERROR` (unexpected error).
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<PDFExtractResult> {
    try {
      console.log('Starting XML extraction from PDF...');

      // Validate input
      if (!pdfBuffer || pdfBuffer.length === 0) {
        return this.createErrorResult(PDFExtractError.INVALID_INPUT, 'PDF buffer is empty or undefined');
      }

      // Ensure the extractors receive a plain Uint8Array: a Buffer is copied
      // into a new Uint8Array, anything else is passed through untouched.
      const pdfBufferArray = Buffer.isBuffer(pdfBuffer) ? new Uint8Array(pdfBuffer) : pdfBuffer;

      // Most recent error thrown by an extractor, kept so that a final
      // "no XML found" result can still tell the caller what went wrong.
      let lastExtractorError: Error | undefined;

      // Try each extractor in sequence
      for (const extractor of this.extractors) {
        // The runtime class name doubles as the label in logs and in the result.
        const extractorName = extractor.constructor.name;
        console.log(`Trying extraction with ${extractorName}...`);

        try {
          const xml = await extractor.extractXml(pdfBufferArray);

          if (xml) {
            console.log(`Successfully extracted XML using ${extractorName}`);

            // Detect format of the extracted XML
            const format = FormatDetector.detectFormat(xml);

            return {
              success: true,
              xml,
              format,
              extractorUsed: extractorName
            };
          }

          console.log(`Extraction with ${extractorName} failed, trying next method...`);
        } catch (error) {
          // Log error but continue with next extractor
          console.warn(`Error using ${extractorName}: ${error instanceof Error ? error.message : String(error)}`);
          lastExtractorError = error instanceof Error ? error : new Error(String(error));
        }
      }

      // If all extractors fail, return a no XML found error
      return this.createErrorResult(
        PDFExtractError.NO_XML_FOUND,
        'All extraction methods failed, no valid XML found in PDF',
        lastExtractorError
      );
    } catch (error) {
      // Handle any unexpected errors
      return this.createErrorResult(
        PDFExtractError.EXTRACT_ERROR,
        `Unexpected error during XML extraction: ${error instanceof Error ? error.message : String(error)}`,
        error instanceof Error ? error : undefined
      );
    }
  }

  /**
   * Create a PDF extract result with error information.
   *
   * Also writes the error type and message (and the original error, if any)
   * to `console.error`.
   *
   * @param type Error type
   * @param message Error message
   * @param originalError Original error object, if one was captured
   * @returns A result with `success: false` and the given error details
   */
  private createErrorResult(
    type: PDFExtractError,
    message: string,
    originalError?: Error
  ): PDFExtractResult {
    console.error(`PDF Extractor Error (${type}): ${message}`);
    if (originalError) {
      console.error(originalError);
    }

    return {
      success: false,
      error: {
        type,
        message,
        originalError
      }
    };
  }
}