fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata.
This commit is contained in:
@ -4,6 +4,32 @@ import {
|
||||
AssociatedFilesExtractor,
|
||||
TextXMLExtractor
|
||||
} from './extractors/index.js';
|
||||
import { FormatDetector } from '../utils/format.detector.js';
|
||||
import { InvoiceFormat } from '../../interfaces/common.js';
|
||||
|
||||
/**
 * Error types for PDF extraction operations
 *
 * Each member's string value is a short human-readable label; it is
 * interpolated into the log line written when an error result is created.
 */
export enum PDFExtractError {
  /** An unexpected exception escaped the extraction process */
  EXTRACT_ERROR = 'XML extraction failed',
  /** The supplied PDF buffer was missing or empty */
  INVALID_INPUT = 'Invalid input parameters',
  /** Every extractor ran but none produced XML */
  NO_XML_FOUND = 'No XML found in PDF'
}
|
||||
|
||||
/**
 * Result of a PDF extraction operation
 *
 * When `success` is true the XML-related fields are populated; when it is
 * false the `error` field describes what went wrong.
 */
export interface PDFExtractResult {
  /** True when XML was extracted, false when `error` describes a failure */
  success: boolean;
  /** Extracted XML content (populated on success) */
  xml?: string;
  /** Invoice format detected for the extracted XML (populated on success) */
  format?: InvoiceFormat;
  /** Class name of the extractor that produced the XML (populated on success) */
  extractorUsed?: string;
  /** Failure details (populated when `success` is false) */
  error?: {
    /** Category of the failure */
    type: PDFExtractError;
    /** Human-readable description of the failure */
    message: string;
    /** Underlying exception, when the failure was caused by one */
    originalError?: Error;
  };
}
|
||||
|
||||
/**
|
||||
* Main PDF extractor class that orchestrates the extraction process
|
||||
@ -18,9 +44,9 @@ export class PDFExtractor {
|
||||
constructor() {
  // Add extractors in order of preference/likelihood of success.
  // extractXml tries them in exactly this order and stops at the first
  // one that yields XML, so the fallback must stay last.
  this.extractors.push(
    new StandardXMLExtractor(),     // Standard PDF/A-3 embedded files
    new AssociatedFilesExtractor(), // Associated files (ZUGFeRD v1, some Factur-X)
    new TextXMLExtractor()          // Text-based extraction (fallback)
  );
}
|
||||
|
||||
@ -28,36 +54,88 @@ export class PDFExtractor {
|
||||
* Extract XML from a PDF buffer
|
||||
* Tries multiple extraction methods in sequence
|
||||
* @param pdfBuffer PDF buffer
|
||||
* @returns Result with either the extracted XML or error information
|
||||
*/
|
||||
public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<PDFExtractResult> {
  // This method never throws: every failure path is reported through the
  // returned result object (success === false plus an `error` entry).
  try {
    console.log('Starting XML extraction from PDF...');

    // Validate input
    if (!pdfBuffer || pdfBuffer.length === 0) {
      return this.createErrorResult(PDFExtractError.INVALID_INPUT, 'PDF buffer is empty or undefined');
    }

    // Ensure buffer is Uint8Array
    const pdfBufferArray = Buffer.isBuffer(pdfBuffer) ? new Uint8Array(pdfBuffer) : pdfBuffer;

    // Try each extractor in sequence
    for (const extractor of this.extractors) {
      const extractorName = extractor.constructor.name;
      console.log(`Trying extraction with ${extractorName}...`);

      // Each extractor gets its own try/catch so that one extractor
      // throwing does not prevent the remaining ones from being tried.
      try {
        const xml = await extractor.extractXml(pdfBufferArray);

        if (xml) {
          console.log(`Successfully extracted XML using ${extractorName}`);

          // Detect format of the extracted XML
          const format = FormatDetector.detectFormat(xml);

          return {
            success: true,
            xml,
            format,
            extractorUsed: extractorName
          };
        }

        console.log(`Extraction with ${extractorName} failed, trying next method...`);
      } catch (error) {
        // Log error but continue with next extractor
        console.warn(`Error using ${extractorName}: ${error instanceof Error ? error.message : String(error)}`);
      }
    }

    // If all extractors fail, return a no XML found error
    return this.createErrorResult(
      PDFExtractError.NO_XML_FOUND,
      'All extraction methods failed, no valid XML found in PDF'
    );
  } catch (error) {
    // Handle any unexpected errors
    return this.createErrorResult(
      PDFExtractError.EXTRACT_ERROR,
      `Unexpected error during XML extraction: ${error instanceof Error ? error.message : String(error)}`,
      error instanceof Error ? error : undefined
    );
  }
}
|
||||
/**
 * Create a PDF extract result with error information
 *
 * Also writes the error to the console (type and message, followed by the
 * original error object when one is supplied) before building the result.
 *
 * @param type Error type
 * @param message Error message
 * @param originalError Original error object
 * @returns Error result with `success` set to false
 */
private createErrorResult(
  type: PDFExtractError,
  message: string,
  originalError?: Error
): PDFExtractResult {
  console.error(`PDF Extractor Error (${type}): ${message}`);
  if (originalError) {
    console.error(originalError);
  }

  return {
    success: false,
    error: {
      type,
      message,
      originalError
    }
  };
}
|
||||
}
|
Reference in New Issue
Block a user