fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata.
This commit is contained in:
@ -11,7 +11,10 @@ export abstract class BaseXMLExtractor {
|
||||
'factur-x.xml',
|
||||
'zugferd-invoice.xml',
|
||||
'ZUGFeRD-invoice.xml',
|
||||
'xrechnung.xml'
|
||||
'xrechnung.xml',
|
||||
'ubl-invoice.xml',
|
||||
'invoice.xml',
|
||||
'metadata.xml'
|
||||
];
|
||||
|
||||
/**
|
||||
@ -32,7 +35,8 @@ export abstract class BaseXMLExtractor {
|
||||
'urn:zugferd',
|
||||
'urn:factur-x',
|
||||
'factur-x.eu',
|
||||
'ZUGFeRD'
|
||||
'ZUGFeRD',
|
||||
'FatturaElettronica'
|
||||
];
|
||||
|
||||
/**
|
||||
@ -47,7 +51,8 @@ export abstract class BaseXMLExtractor {
|
||||
'</rsm:CrossIndustryDocument>',
|
||||
'</ram:CrossIndustryDocument>',
|
||||
'</ubl:Invoice>',
|
||||
'</ubl:CreditNote>'
|
||||
'</ubl:CreditNote>',
|
||||
'</FatturaElettronica>'
|
||||
];
|
||||
|
||||
/**
|
||||
@ -69,21 +74,19 @@ export abstract class BaseXMLExtractor {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if it starts with XML declaration
|
||||
if (!xmlString.includes('<?xml')) {
|
||||
// Check if it starts with XML declaration or a valid element
|
||||
if (!xmlString.includes('<?xml') && !this.hasKnownXmlElement(xmlString)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if the XML string contains known invoice formats
|
||||
const hasKnownFormat = this.knownFormats.some(format => xmlString.includes(format));
|
||||
const hasKnownFormat = this.hasKnownFormat(xmlString);
|
||||
if (!hasKnownFormat) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if the XML string contains binary data or invalid characters
|
||||
const invalidChars = ['\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005'];
|
||||
const hasBinaryData = invalidChars.some(char => xmlString.includes(char));
|
||||
if (hasBinaryData) {
|
||||
if (this.hasBinaryData(xmlString)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -92,6 +95,11 @@ export abstract class BaseXMLExtractor {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if XML has a proper structure (contains both opening and closing tags)
|
||||
if (!this.hasProperXmlStructure(xmlString)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
} catch (error) {
|
||||
console.error('Error validating XML:', error);
|
||||
@ -99,6 +107,85 @@ export abstract class BaseXMLExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the XML string contains a known element
|
||||
* @param xmlString XML string to check
|
||||
* @returns True if the XML contains a known element
|
||||
*/
|
||||
protected hasKnownXmlElement(xmlString: string): boolean {
  // A known element counts as present when some opening tag starts with one of
  // the known format markers. An optional namespace prefix is accepted in front
  // of the marker, so '<rsm:Marker' is recognised as well as '<Marker'; the
  // previous literal '<Marker' test could never match a prefixed tag.
  return this.knownFormats.some(format => {
    // Markers such as 'factur-x.eu' contain regex metacharacters; escape them
    // so they are matched literally.
    const escapedFormat = format.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const openingTag = new RegExp(`<(?:[A-Za-z_][\\w.-]*:)?${escapedFormat}`);
    return openingTag.test(xmlString);
  });
}
|
||||
|
||||
/**
|
||||
* Check if the XML string contains a known format
|
||||
* @param xmlString XML string to check
|
||||
* @returns True if the XML contains a known format
|
||||
*/
|
||||
protected hasKnownFormat(xmlString: string): boolean {
  // Plain substring search: true as soon as any known format marker occurs
  // anywhere in the text (not only inside a tag).
  return this.knownFormats.some(marker => xmlString.includes(marker));
}
|
||||
|
||||
/**
|
||||
* Check if the XML string has a proper structure
|
||||
* @param xmlString XML string to check
|
||||
* @returns True if the XML has a proper structure
|
||||
*/
|
||||
protected hasProperXmlStructure(xmlString: string): boolean {
  // Preferred signal: a known closing tag together with its opening counterpart.
  // NOTE(review): the opening tag is derived by dropping the '/', so it still
  // ends in '>' and only matches start tags written without attributes.
  const hasKnownTagPair = this.knownEndTags.some(closingTag => {
    const openingTag = closingTag.replace('/', '');
    return xmlString.includes(openingTag) && xmlString.includes(closingTag);
  });
  if (hasKnownTagPair) {
    return true;
  }

  // Fallback 1: an XML declaration that is both opened and closed.
  if (xmlString.includes('<?xml') && xmlString.includes('?>')) {
    return true;
  }

  // Fallback 2: at least one tag-like token and at least one closing-tag token.
  return /<[^>]+>/.test(xmlString) && /<\/[^>]+>/.test(xmlString);
}
|
||||
|
||||
/**
|
||||
* Check if the XML string contains binary data
|
||||
* @param xmlString XML string to check
|
||||
* @returns True if the XML contains binary data
|
||||
*/
|
||||
protected hasBinaryData(xmlString: string): boolean {
  // Any single control character in U+0000..U+0005 is treated as proof of
  // binary content. The former "three consecutive NUL characters" test was
  // removed: one NUL already returns true here, so that branch could never run.
  if (/[\u0000-\u0005]/.test(xmlString)) {
    return true;
  }

  // Otherwise flag the text when more than 5% of its characters are
  // non-printable control characters (tab, LF and CR are not counted).
  const nonPrintableCount = (xmlString.match(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g) || []).length;
  return nonPrintableCount > xmlString.length * 0.05;
}
|
||||
|
||||
/**
|
||||
* Extract XML from a string
|
||||
* @param text Text to extract XML from
|
||||
@ -108,9 +195,22 @@ export abstract class BaseXMLExtractor {
|
||||
protected extractXmlFromString(text: string, startIndex: number = 0): string | null {
|
||||
try {
|
||||
// Find the start of the XML document
|
||||
const xmlStartIndex = text.indexOf('<?xml', startIndex);
|
||||
let xmlStartIndex = text.indexOf('<?xml', startIndex);
|
||||
|
||||
// If no XML declaration, try to find known elements
|
||||
if (xmlStartIndex === -1) {
|
||||
return null;
|
||||
for (const format of this.knownFormats) {
|
||||
const formatStartIndex = text.indexOf(`<${format.split(':').pop()}`, startIndex);
|
||||
if (formatStartIndex !== -1) {
|
||||
xmlStartIndex = formatStartIndex;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Still didn't find any start marker
|
||||
if (xmlStartIndex === -1) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
// Try to find the end of the XML document
|
||||
@ -123,12 +223,26 @@ export abstract class BaseXMLExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
// If no known end tag found, try to use a heuristic approach
|
||||
if (xmlEndIndex === -1) {
|
||||
return null;
|
||||
// Try to find the last closing tag
|
||||
const lastClosingTagMatch = text.slice(xmlStartIndex).match(/<\/[^>]+>(?!.*<\/[^>]+>)/);
|
||||
if (lastClosingTagMatch && lastClosingTagMatch.index !== undefined) {
|
||||
xmlEndIndex = xmlStartIndex + lastClosingTagMatch.index + lastClosingTagMatch[0].length;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
// Extract the XML content
|
||||
return text.substring(xmlStartIndex, xmlEndIndex);
|
||||
const xmlContent = text.substring(xmlStartIndex, xmlEndIndex);
|
||||
|
||||
// Validate the extracted content
|
||||
if (this.isValidXml(xmlContent)) {
|
||||
return xmlContent;
|
||||
}
|
||||
|
||||
return null;
|
||||
} catch (error) {
|
||||
console.error('Error extracting XML from string:', error);
|
||||
return null;
|
||||
@ -143,34 +257,99 @@ export abstract class BaseXMLExtractor {
|
||||
*/
|
||||
protected async extractXmlFromStream(stream: PDFRawStream, fileName: string): Promise<string | null> {
|
||||
try {
|
||||
// Try to decompress with pako
|
||||
const compressedBytes = stream.getContents().buffer;
|
||||
// Get the raw bytes from the stream
|
||||
const rawBytes = stream.getContents();
|
||||
|
||||
// First try without decompression (in case the content is not compressed)
|
||||
let xmlContent = this.tryDecodeBuffer(rawBytes);
|
||||
if (xmlContent && this.isValidXml(xmlContent)) {
|
||||
console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
|
||||
return xmlContent;
|
||||
}
|
||||
|
||||
// Try with decompression
|
||||
try {
|
||||
const decompressedBytes = pako.inflate(compressedBytes);
|
||||
const xmlContent = new TextDecoder('utf-8').decode(decompressedBytes);
|
||||
|
||||
if (this.isValidXml(xmlContent)) {
|
||||
console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`);
|
||||
return xmlContent;
|
||||
const decompressedBytes = this.tryDecompress(rawBytes);
|
||||
if (decompressedBytes) {
|
||||
xmlContent = this.tryDecodeBuffer(decompressedBytes);
|
||||
if (xmlContent && this.isValidXml(xmlContent)) {
|
||||
console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`);
|
||||
return xmlContent;
|
||||
}
|
||||
}
|
||||
} catch (decompressError) {
|
||||
// Decompression failed, try without decompression
|
||||
console.log(`Decompression failed for ${fileName}, trying without decompression...`);
|
||||
console.log(`Decompression failed for ${fileName}: ${decompressError}`);
|
||||
}
|
||||
|
||||
// Try without decompression
|
||||
const rawBytes = stream.getContents();
|
||||
const rawContent = new TextDecoder('utf-8').decode(rawBytes);
|
||||
|
||||
if (this.isValidXml(rawContent)) {
|
||||
console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
|
||||
return rawContent;
|
||||
}
|
||||
|
||||
|
||||
return null;
|
||||
} catch (error) {
|
||||
console.error('Error extracting XML from stream:', error);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Try to decompress a buffer using different methods
|
||||
* @param buffer Buffer to decompress
|
||||
* @returns Decompressed buffer or null if decompression failed
|
||||
*/
|
||||
protected tryDecompress(buffer: Uint8Array): Uint8Array | null {
  let inflated: Uint8Array | null;
  try {
    // pako.inflate covers deflate/zlib data.
    inflated = pako.inflate(buffer);
  } catch (inflateError) {
    // Best effort: a failure is reported as null so the caller can move on.
    console.warn('Pako decompression failed, might be uncompressed or using a different algorithm');
    inflated = null;
  }
  return inflated;
}
|
||||
|
||||
/**
|
||||
* Try to decode a buffer to a string using different encodings
|
||||
* @param buffer Buffer to decode
|
||||
* @returns Decoded string or null if decoding failed
|
||||
*/
|
||||
protected tryDecodeBuffer(buffer: Uint8Array): string | null {
  try {
    // Decode UTF-8 strictly. A lenient decoder never fails (it substitutes
    // U+FFFD) and keeps every ASCII byte, so its output is "plausible" exactly
    // when the Latin-1 output is - which made the Latin-1 fallback below
    // unreachable and left replacement characters in non-UTF-8 documents.
    let utf8Text: string | null;
    try {
      utf8Text = new TextDecoder('utf-8', { fatal: true }).decode(buffer);
    } catch (utf8Error) {
      // Not valid UTF-8: fall through to the Latin-1 attempt.
      utf8Text = null;
    }
    if (utf8Text !== null && this.isPlausibleXml(utf8Text)) {
      return utf8Text;
    }

    // ISO-8859-1 (Latin1) maps every byte to a character, so this cannot fail.
    const latin1Text = this.decodeLatin1(buffer);
    return this.isPlausibleXml(latin1Text) ? latin1Text : null;
  } catch (error) {
    console.warn('Error decoding buffer:', error);
    return null;
  }
}
|
||||
|
||||
/**
|
||||
* Decode a buffer using ISO-8859-1 (Latin1) encoding
|
||||
* @param buffer Buffer to decode
|
||||
* @returns Decoded string
|
||||
*/
|
||||
protected decodeLatin1(buffer: Uint8Array): string {
  // Each byte value 0-255 becomes the character with the same code point.
  let decoded = '';
  for (const byte of buffer) {
    decoded += String.fromCharCode(byte);
  }
  return decoded;
}
|
||||
|
||||
/**
|
||||
* Check if a string is plausibly XML (quick check before validation)
|
||||
* @param content String to check
|
||||
* @returns True if the string is plausibly XML
|
||||
*/
|
||||
protected isPlausibleXml(content: string): boolean {
  // Without both angle brackets the text cannot contain a tag.
  if (!content.includes('<') || !content.includes('>')) {
    return false;
  }
  // An XML declaration is enough on its own...
  if (content.includes('<?xml')) {
    return true;
  }
  // ...otherwise require at least one known format marker.
  return this.knownFormats.some(marker => content.includes(marker));
}
|
||||
}
|
@ -6,50 +6,157 @@ import { BaseXMLExtractor } from './base.extractor.js';
|
||||
* Used as a fallback when other extraction methods fail
|
||||
*/
|
||||
export class TextXMLExtractor extends BaseXMLExtractor {
|
||||
// Maximum chunk size to process at once (4MB)
|
||||
private readonly CHUNK_SIZE = 4 * 1024 * 1024;
|
||||
|
||||
// Maximum number of chunks to check (effective 20MB search limit)
|
||||
private readonly MAX_CHUNKS = 5;
|
||||
|
||||
// Common XML patterns to look for
|
||||
private readonly XML_PATTERNS = [
|
||||
'<?xml',
|
||||
'<CrossIndustryInvoice',
|
||||
'<CrossIndustryDocument',
|
||||
'<Invoice',
|
||||
'<CreditNote',
|
||||
'<rsm:CrossIndustryInvoice',
|
||||
'<rsm:CrossIndustryDocument',
|
||||
'<ram:CrossIndustryDocument',
|
||||
'<ubl:Invoice',
|
||||
'<ubl:CreditNote',
|
||||
'<FatturaElettronica'
|
||||
];
|
||||
|
||||
/**
|
||||
* Extract XML from a PDF buffer by searching for XML patterns in the text
|
||||
* Uses a chunked approach to handle large files efficiently
|
||||
* @param pdfBuffer PDF buffer
|
||||
* @returns XML content or null if not found
|
||||
*/
|
||||
public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
|
||||
try {
|
||||
// Convert buffer to string and look for XML patterns
|
||||
// Increase the search range to handle larger PDFs
|
||||
const pdfString = Buffer.from(pdfBuffer).toString('utf8', 0, Math.min(pdfBuffer.length, 50000));
|
||||
|
||||
// Look for common XML patterns in the PDF
|
||||
const xmlPatterns = [
|
||||
/<\?xml[^>]*\?>/i,
|
||||
/<CrossIndustryInvoice[^>]*>/i,
|
||||
/<CrossIndustryDocument[^>]*>/i,
|
||||
/<Invoice[^>]*>/i,
|
||||
/<CreditNote[^>]*>/i,
|
||||
/<rsm:CrossIndustryInvoice[^>]*>/i,
|
||||
/<rsm:CrossIndustryDocument[^>]*>/i,
|
||||
/<ram:CrossIndustryDocument[^>]*>/i,
|
||||
/<ubl:Invoice[^>]*>/i,
|
||||
/<ubl:CreditNote[^>]*>/i
|
||||
];
|
||||
|
||||
for (const pattern of xmlPatterns) {
|
||||
const match = pdfString.match(pattern);
|
||||
if (match && match.index !== undefined) {
|
||||
console.log(`Found XML pattern in PDF: ${match[0]}`);
|
||||
|
||||
// Try to extract the XML content
|
||||
const xmlContent = this.extractXmlFromString(pdfString, match.index);
|
||||
if (xmlContent && this.isValidXml(xmlContent)) {
|
||||
console.log('Successfully extracted XML from PDF text');
|
||||
return xmlContent;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.warn('No valid XML found in PDF text');
|
||||
return null;
|
||||
console.log('Attempting text-based XML extraction from PDF...');
|
||||
|
||||
// Convert Buffer to Uint8Array if needed
|
||||
const buffer = Buffer.isBuffer(pdfBuffer) ? new Uint8Array(pdfBuffer) : pdfBuffer;
|
||||
|
||||
// Try extracting XML using the chunked approach
|
||||
return this.extractXmlFromBufferChunked(buffer);
|
||||
} catch (error) {
|
||||
console.error('Error in text-based extraction:', error);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract XML from buffer using a chunked approach
|
||||
* This helps avoid memory issues with large PDFs
|
||||
* @param buffer Buffer to search in
|
||||
* @returns XML content or null if not found
|
||||
*/
|
||||
private extractXmlFromBufferChunked(buffer: Uint8Array): string | null {
  // Walk the buffer in CHUNK_SIZE windows, giving up after MAX_CHUNKS windows.
  // NOTE(review): every window is searched on its own, so XML that starts in
  // one window and ends in the next is not seen as a whole - confirm this is
  // acceptable for PDFs larger than CHUNK_SIZE.
  for (let chunkIndex = 0; chunkIndex < this.MAX_CHUNKS; chunkIndex++) {
    const startPos = chunkIndex * this.CHUNK_SIZE;
    if (startPos >= buffer.length) break;

    const endPos = Math.min(startPos + this.CHUNK_SIZE, buffer.length);
    // subarray() is a view onto the same memory; the chunk is only read, so
    // copying it with slice() was unnecessary work.
    const chunk = buffer.subarray(startPos, endPos);

    const chunkResult = this.processChunk(chunk, startPos);
    if (chunkResult) {
      return chunkResult;
    }
  }

  console.warn('No valid XML found in any chunk of the PDF');
  return null;
}
|
||||
|
||||
/**
|
||||
* Process a single chunk of the PDF buffer
|
||||
* @param chunk Chunk buffer to process
|
||||
* @param chunkOffset Offset position of the chunk in the original buffer
|
||||
* @returns XML content or null if not found
|
||||
*/
|
||||
private processChunk(chunk: Uint8Array, chunkOffset: number): string | null {
  try {
    // Encodings are tried in order; the chunk is decoded lazily, so Latin-1 is
    // only attempted after the UTF-8 search came back empty.
    const attempts: Array<{ encoding: 'utf-8' | 'latin1'; label: string }> = [
      { encoding: 'utf-8', label: 'UTF-8' },
      { encoding: 'latin1', label: 'Latin-1' }
    ];

    for (const { encoding, label } of attempts) {
      const decoded = this.decodeBufferToString(chunk, encoding);
      const xmlContent = this.searchForXmlInString(decoded);
      if (xmlContent) {
        console.log(`Found XML content in chunk at offset ${chunkOffset} using ${label} encoding`);
        return xmlContent;
      }
    }

    // Neither decoding produced extractable XML.
    return null;
  } catch (error) {
    console.warn(`Error processing chunk at offset ${chunkOffset}:`, error);
    return null;
  }
}
|
||||
|
||||
/**
|
||||
* Safely decode a buffer to string using the specified encoding
|
||||
* @param buffer Buffer to decode
|
||||
* @param encoding Encoding to use ('utf-8' or 'latin1')
|
||||
* @returns Decoded string
|
||||
*/
|
||||
private decodeBufferToString(buffer: Uint8Array, encoding: 'utf-8' | 'latin1'): string {
  try {
    if (encoding === 'utf-8') {
      // Lenient decoding: invalid byte sequences are replaced, never thrown.
      return new TextDecoder('utf-8', { fatal: false }).decode(buffer);
    }
    // Latin-1 is a direct byte-to-code-point mapping. Use the inherited helper
    // rather than keeping a second copy of the same conversion here.
    return this.decodeLatin1(buffer);
  } catch (error) {
    console.warn(`Error decoding buffer using ${encoding}:`, error);
    // Return empty string on error to allow processing to continue
    return '';
  }
}
|
||||
|
||||
/**
|
||||
* Search for XML patterns in a string
|
||||
* @param content String to search in
|
||||
* @returns XML content or null if not found
|
||||
*/
|
||||
private searchForXmlInString(content: string): string | null {
  if (!content) return null;

  // Patterns are tried in declaration order; only the first occurrence of each
  // pattern is examined.
  for (const pattern of this.XML_PATTERNS) {
    const patternIndex = content.indexOf(pattern);
    if (patternIndex === -1) {
      continue;
    }
    console.log(`Found XML pattern "${pattern}" at position ${patternIndex}`);

    // Extract from the match position, then keep the candidate only if it validates.
    const candidate = this.extractXmlFromString(content, patternIndex);
    if (!candidate || !this.isValidXml(candidate)) {
      continue;
    }

    console.log('Successfully extracted and validated XML from text');
    return candidate;
  }

  return null;
}
|
||||
}
|
@ -1,8 +1,33 @@
|
||||
import { PDFDocument, AFRelationship } from '../../plugins.js';
|
||||
import type { IPdf } from '../../interfaces/common.js';
|
||||
|
||||
/**
|
||||
* Error types for PDF embedding operations
|
||||
*/
|
||||
export enum PDFEmbedError {
  /** The source PDF could not be loaded. */
  LOAD_ERROR = 'PDF loading failed',

  /** Attaching the XML to the PDF failed (also used for unexpected errors). */
  EMBED_ERROR = 'XML embedding failed',

  /** The modified PDF could not be saved. */
  SAVE_ERROR = 'PDF saving failed',

  /** The PDF buffer or XML content passed in was empty or missing. */
  INVALID_INPUT = 'Invalid input parameters',
}
|
||||
|
||||
/**
|
||||
* Result of a PDF embedding operation
|
||||
*/
|
||||
export interface PDFEmbedResult {
  /** True when the operation completed; false when `error` is populated. */
  success: boolean;

  /** Bytes of the modified PDF (set by a successful `embedXml`). */
  data?: Uint8Array;

  /** PDF object with the XML embedded (set by a successful `createPdfWithXml`). */
  pdf?: IPdf;

  /** Failure details; present only when `success` is false. */
  error?: {
    /** Category of the failure. */
    type: PDFEmbedError;
    /** Human-readable description of what went wrong. */
    message: string;
    /** The underlying error, when one was caught. */
    originalError?: Error;
  };
}
|
||||
|
||||
/**
|
||||
* Class for embedding XML into PDF files
|
||||
* Provides robust error handling and support for different PDF formats
|
||||
*/
|
||||
export class PDFEmbedder {
|
||||
/**
|
||||
@ -11,40 +36,92 @@ export class PDFEmbedder {
|
||||
* @param xmlContent XML content to embed
|
||||
* @param filename Filename for the embedded XML
|
||||
* @param description Description for the embedded XML
|
||||
* @returns Modified PDF buffer
|
||||
* @returns Result with either modified PDF buffer or error information
|
||||
*/
|
||||
public async embedXml(
|
||||
pdfBuffer: Uint8Array | Buffer,
|
||||
xmlContent: string,
|
||||
filename: string = 'invoice.xml',
|
||||
description: string = 'XML Invoice'
|
||||
): Promise<Uint8Array> {
|
||||
): Promise<PDFEmbedResult> {
|
||||
try {
|
||||
// Validate inputs
|
||||
if (!pdfBuffer || pdfBuffer.length === 0) {
|
||||
return this.createErrorResult(PDFEmbedError.INVALID_INPUT, 'PDF buffer is empty or undefined');
|
||||
}
|
||||
|
||||
if (!xmlContent) {
|
||||
return this.createErrorResult(PDFEmbedError.INVALID_INPUT, 'XML content is empty or undefined');
|
||||
}
|
||||
|
||||
// Ensure buffer is Uint8Array
|
||||
const pdfBufferArray = Buffer.isBuffer(pdfBuffer) ? new Uint8Array(pdfBuffer) : pdfBuffer;
|
||||
|
||||
// Load the PDF
|
||||
const pdfDoc = await PDFDocument.load(pdfBuffer);
|
||||
let pdfDoc: PDFDocument;
|
||||
try {
|
||||
pdfDoc = await PDFDocument.load(pdfBufferArray, {
|
||||
ignoreEncryption: true, // Try to load encrypted PDFs
|
||||
updateMetadata: false // Don't automatically update metadata
|
||||
});
|
||||
} catch (error) {
|
||||
return this.createErrorResult(
|
||||
PDFEmbedError.LOAD_ERROR,
|
||||
`Failed to load PDF: ${error instanceof Error ? error.message : String(error)}`,
|
||||
error instanceof Error ? error : undefined
|
||||
);
|
||||
}
|
||||
|
||||
// Normalize filename (lowercase with XML extension)
|
||||
filename = this.normalizeFilename(filename);
|
||||
|
||||
// Convert the XML string to a Uint8Array
|
||||
const xmlBuffer = new TextEncoder().encode(xmlContent);
|
||||
|
||||
// Make sure filename is lowercase (as required by documentation)
|
||||
filename = filename.toLowerCase();
|
||||
|
||||
// Use pdf-lib's .attach() to embed the XML
|
||||
pdfDoc.attach(xmlBuffer, filename, {
|
||||
mimeType: 'text/xml',
|
||||
description: description,
|
||||
creationDate: new Date(),
|
||||
modificationDate: new Date(),
|
||||
afRelationship: AFRelationship.Alternative,
|
||||
});
|
||||
try {
|
||||
// Use pdf-lib's .attach() to embed the XML
|
||||
pdfDoc.attach(xmlBuffer, filename, {
|
||||
mimeType: 'text/xml',
|
||||
description: description,
|
||||
creationDate: new Date(),
|
||||
modificationDate: new Date(),
|
||||
afRelationship: AFRelationship.Alternative,
|
||||
});
|
||||
} catch (error) {
|
||||
return this.createErrorResult(
|
||||
PDFEmbedError.EMBED_ERROR,
|
||||
`Failed to embed XML: ${error instanceof Error ? error.message : String(error)}`,
|
||||
error instanceof Error ? error : undefined
|
||||
);
|
||||
}
|
||||
|
||||
// Save the modified PDF
|
||||
const modifiedPdfBytes = await pdfDoc.save();
|
||||
let modifiedPdfBytes: Uint8Array;
|
||||
try {
|
||||
modifiedPdfBytes = await pdfDoc.save({
|
||||
addDefaultPage: false, // Don't add a page if the document is empty
|
||||
useObjectStreams: false, // Better compatibility with older PDF readers
|
||||
updateFieldAppearances: false // Don't update form fields
|
||||
});
|
||||
} catch (error) {
|
||||
return this.createErrorResult(
|
||||
PDFEmbedError.SAVE_ERROR,
|
||||
`Failed to save modified PDF: ${error instanceof Error ? error.message : String(error)}`,
|
||||
error instanceof Error ? error : undefined
|
||||
);
|
||||
}
|
||||
|
||||
return modifiedPdfBytes;
|
||||
return {
|
||||
success: true,
|
||||
data: modifiedPdfBytes
|
||||
};
|
||||
} catch (error) {
|
||||
console.error('Error embedding XML into PDF:', error);
|
||||
throw error;
|
||||
// Catch any uncaught errors
|
||||
return this.createErrorResult(
|
||||
PDFEmbedError.EMBED_ERROR,
|
||||
`Unexpected error during XML embedding: ${error instanceof Error ? error.message : String(error)}`,
|
||||
error instanceof Error ? error : undefined
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@ -56,7 +133,7 @@ export class PDFEmbedder {
|
||||
* @param description Description for the embedded XML
|
||||
* @param pdfName Name for the PDF
|
||||
* @param pdfId ID for the PDF
|
||||
* @returns IPdf object with embedded XML
|
||||
* @returns Result with either IPdf object or error information
|
||||
*/
|
||||
public async createPdfWithXml(
|
||||
pdfBuffer: Uint8Array | Buffer,
|
||||
@ -65,16 +142,101 @@ export class PDFEmbedder {
|
||||
description: string = 'XML Invoice',
|
||||
pdfName: string = 'invoice.pdf',
|
||||
pdfId: string = `invoice-${Date.now()}`
|
||||
): Promise<IPdf> {
|
||||
const modifiedPdfBytes = await this.embedXml(pdfBuffer, xmlContent, filename, description);
|
||||
): Promise<PDFEmbedResult> {
|
||||
// Embed XML into PDF
|
||||
const embedResult = await this.embedXml(pdfBuffer, xmlContent, filename, description);
|
||||
|
||||
// If embedding failed, return the error
|
||||
if (!embedResult.success || !embedResult.data) {
|
||||
return embedResult;
|
||||
}
|
||||
|
||||
return {
|
||||
// Create IPdf object
|
||||
const pdfObject: IPdf = {
|
||||
name: pdfName,
|
||||
id: pdfId,
|
||||
metadata: {
|
||||
textExtraction: ''
|
||||
textExtraction: '',
|
||||
format: this.detectPdfFormat(xmlContent),
|
||||
embeddedXml: {
|
||||
filename: filename,
|
||||
description: description
|
||||
}
|
||||
},
|
||||
buffer: modifiedPdfBytes
|
||||
buffer: embedResult.data
|
||||
};
|
||||
|
||||
return {
|
||||
success: true,
|
||||
pdf: pdfObject
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensures the filename is normalized according to PDF/A requirements
|
||||
* @param filename Filename to normalize
|
||||
* @returns Normalized filename
|
||||
*/
|
||||
private normalizeFilename(filename: string): string {
  const lowered = filename.toLowerCase();

  // Swap any existing extension for '.xml' (or append it when there is none).
  const withXmlExtension = lowered.endsWith('.xml')
    ? lowered
    : lowered.replace(/\.[^/.]+$/, '') + '.xml';

  // Everything outside a-z, 0-9, '_', '.' and '-' becomes an underscore.
  return withXmlExtension.replace(/[^a-z0-9_.-]/g, '_');
}
|
||||
|
||||
/**
|
||||
* Tries to detect the format of the XML content
|
||||
* @param xmlContent XML content
|
||||
* @returns Format string or undefined
|
||||
*/
|
||||
private detectPdfFormat(xmlContent: string): string | undefined {
  // Checks run from most specific to least specific; the first hit wins.
  if (xmlContent.includes('factur-x.eu') || xmlContent.includes('factur-x.xml')) {
    return 'factur-x';
  }
  if (xmlContent.includes('zugferd') || xmlContent.includes('ZUGFeRD')) {
    return 'zugferd';
  }
  if (xmlContent.includes('xrechnung')) {
    return 'xrechnung';
  }

  // UBL: require an element named exactly Invoice or CreditNote, with an
  // optional namespace prefix. The earlier '<Invoice' prefix test missed
  // '<ubl:Invoice' and also fired on longer names such as '<InvoiceCurrencyCode'.
  // The lookahead demands that the element name ends right after the keyword.
  if (/<(?:[A-Za-z_][\w.-]*:)?(?:Invoice|CreditNote)(?=[\s>\/])/.test(xmlContent)) {
    return 'ubl';
  }

  if (xmlContent.includes('FatturaElettronica')) {
    return 'fatturapa';
  }

  return undefined;
}
|
||||
|
||||
/**
|
||||
* Creates an error result object
|
||||
* @param type Error type
|
||||
* @param message Error message
|
||||
* @param originalError Original error object
|
||||
* @returns Error result
|
||||
*/
|
||||
private createErrorResult(
  type: PDFEmbedError,
  message: string,
  originalError?: Error
): PDFEmbedResult {
  // Log first so the failure is visible even if the caller ignores the result.
  console.error(`PDF Embedder Error (${type}): ${message}`);
  if (originalError) console.error(originalError);

  const failure: PDFEmbedResult = {
    success: false,
    error: { type, message, originalError }
  };
  return failure;
}
|
||||
}
|
@ -4,6 +4,32 @@ import {
|
||||
AssociatedFilesExtractor,
|
||||
TextXMLExtractor
|
||||
} from './extractors/index.js';
|
||||
import { FormatDetector } from '../utils/format.detector.js';
|
||||
import { InvoiceFormat } from '../../interfaces/common.js';
|
||||
|
||||
/**
|
||||
* Error types for PDF extraction operations
|
||||
*/
|
||||
export enum PDFExtractError {
  /** An unexpected error interrupted the extraction. */
  EXTRACT_ERROR = 'XML extraction failed',

  /** The PDF buffer passed in was empty or missing. */
  INVALID_INPUT = 'Invalid input parameters',

  /** Every extractor ran but none returned XML. */
  NO_XML_FOUND = 'No XML found in PDF',
}
|
||||
|
||||
/**
|
||||
* Result of a PDF extraction operation
|
||||
*/
|
||||
export interface PDFExtractResult {
  /** True when XML was extracted; false when `error` is populated. */
  success: boolean;

  /** The extracted XML text (set on success). */
  xml?: string;

  /** Invoice format detected for the extracted XML (set on success). */
  format?: InvoiceFormat;

  /** Constructor name of the extractor that produced the XML (set on success). */
  extractorUsed?: string;

  /** Failure details; present only when `success` is false. */
  error?: {
    /** Category of the failure. */
    type: PDFExtractError;
    /** Human-readable description of what went wrong. */
    message: string;
    /** The underlying error, when one was caught. */
    originalError?: Error;
  };
}
|
||||
|
||||
/**
|
||||
* Main PDF extractor class that orchestrates the extraction process
|
||||
@ -18,9 +44,9 @@ export class PDFExtractor {
|
||||
constructor() {
|
||||
// Add extractors in order of preference/likelihood of success
|
||||
this.extractors.push(
|
||||
new StandardXMLExtractor(), // Standard PDF/A-3 embedded files
|
||||
new AssociatedFilesExtractor(), // Associated files (ZUGFeRD v1, some Factur-X)
|
||||
new TextXMLExtractor() // Text-based extraction (fallback)
|
||||
new StandardXMLExtractor(), // Standard PDF/A-3 embedded files
|
||||
new AssociatedFilesExtractor(), // Associated files (ZUGFeRD v1, some Factur-X)
|
||||
new TextXMLExtractor() // Text-based extraction (fallback)
|
||||
);
|
||||
}
|
||||
|
||||
@ -28,36 +54,88 @@ export class PDFExtractor {
|
||||
* Extract XML from a PDF buffer
|
||||
* Tries multiple extraction methods in sequence
|
||||
* @param pdfBuffer PDF buffer
|
||||
* @returns XML content or null if not found
|
||||
* @returns Result with either the extracted XML or error information
|
||||
*/
|
||||
public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
|
||||
public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<PDFExtractResult> {
|
||||
try {
|
||||
console.log('Starting XML extraction from PDF...');
|
||||
|
||||
// Validate input
|
||||
if (!pdfBuffer || pdfBuffer.length === 0) {
|
||||
return this.createErrorResult(PDFExtractError.INVALID_INPUT, 'PDF buffer is empty or undefined');
|
||||
}
|
||||
|
||||
// Ensure buffer is Uint8Array
|
||||
const pdfBufferArray = Buffer.isBuffer(pdfBuffer) ? new Uint8Array(pdfBuffer) : pdfBuffer;
|
||||
|
||||
// Try each extractor in sequence
|
||||
for (const extractor of this.extractors) {
|
||||
const extractorName = extractor.constructor.name;
|
||||
console.log(`Trying extraction with ${extractorName}...`);
|
||||
|
||||
const xml = await extractor.extractXml(pdfBuffer);
|
||||
if (xml) {
|
||||
console.log(`Successfully extracted XML using ${extractorName}`);
|
||||
return xml;
|
||||
try {
|
||||
const xml = await extractor.extractXml(pdfBufferArray);
|
||||
|
||||
if (xml) {
|
||||
console.log(`Successfully extracted XML using ${extractorName}`);
|
||||
|
||||
// Detect format of the extracted XML
|
||||
const format = FormatDetector.detectFormat(xml);
|
||||
|
||||
return {
|
||||
success: true,
|
||||
xml,
|
||||
format,
|
||||
extractorUsed: extractorName
|
||||
};
|
||||
}
|
||||
|
||||
console.log(`Extraction with ${extractorName} failed, trying next method...`);
|
||||
} catch (error) {
|
||||
// Log error but continue with next extractor
|
||||
console.warn(`Error using ${extractorName}: ${error instanceof Error ? error.message : String(error)}`);
|
||||
}
|
||||
|
||||
console.log(`Extraction with ${extractorName} failed, trying next method...`);
|
||||
}
|
||||
|
||||
// If all extractors fail, return null
|
||||
console.warn('All extraction methods failed, no valid XML found in PDF');
|
||||
return null;
|
||||
// If all extractors fail, return a no XML found error
|
||||
return this.createErrorResult(
|
||||
PDFExtractError.NO_XML_FOUND,
|
||||
'All extraction methods failed, no valid XML found in PDF'
|
||||
);
|
||||
} catch (error) {
|
||||
console.error('Error extracting XML from PDF:', error);
|
||||
return null;
|
||||
// Handle any unexpected errors
|
||||
return this.createErrorResult(
|
||||
PDFExtractError.EXTRACT_ERROR,
|
||||
`Unexpected error during XML extraction: ${error instanceof Error ? error.message : String(error)}`,
|
||||
error instanceof Error ? error : undefined
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
/**
|
||||
* Create a PDF extract result with error information
|
||||
* @param type Error type
|
||||
* @param message Error message
|
||||
* @param originalError Original error object
|
||||
* @returns Error result
|
||||
*/
|
||||
private createErrorResult(
  type: PDFExtractError,
  message: string,
  originalError?: Error
): PDFExtractResult {
  // Log first so the failure is visible even if the caller ignores the result.
  console.error(`PDF Extractor Error (${type}): ${message}`);
  if (originalError) console.error(originalError);

  const failure: PDFExtractResult = {
    success: false,
    error: { type, message, originalError }
  };
  return failure;
}
|
||||
}
|
Reference in New Issue
Block a user