feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic
This commit is contained in:
78
ts/formats/pdf/extractors/associated.extractor.ts
Normal file
78
ts/formats/pdf/extractors/associated.extractor.ts
Normal file
@ -0,0 +1,78 @@
|
||||
import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
|
||||
import { BaseXMLExtractor } from './base.extractor.js';
|
||||
|
||||
/**
|
||||
* Associated files extractor for PDF/A-3 documents
|
||||
* Extracts XML from associated files (AF entry in the catalog)
|
||||
* Particularly useful for ZUGFeRD v1 and some Factur-X documents
|
||||
*/
|
||||
export class AssociatedFilesExtractor extends BaseXMLExtractor {
  /**
   * Extract XML from a PDF buffer using the catalog's `AF` (Associated Files) array.
   *
   * Entries are visited in array order. The first entry whose file name looks
   * like an invoice XML and whose embedded stream passes validation is returned.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content, or null when no associated file yields valid XML or
   *          the PDF cannot be processed (errors are logged, never thrown)
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBuffer);

      // Associated files are referenced from the AF entry of the catalog
      const afArray = pdfDoc.catalog.lookup(PDFName.of('AF'));
      if (!(afArray instanceof PDFArray)) {
        console.warn('No AF (Associated Files) entry found in PDF catalog');
        return null;
      }

      for (let i = 0; i < afArray.size(); i++) {
        const fileSpec = afArray.lookup(i);
        if (!(fileSpec instanceof PDFDict)) {
          continue;
        }

        const fileName = this.readFileName(fileSpec);
        if (fileName === null || !this.looksLikeInvoiceXml(fileName)) {
          continue;
        }

        const fileStream = this.readEmbeddedStream(fileSpec);
        if (fileStream === null) {
          continue;
        }

        const xmlContent = await this.extractXmlFromStream(fileStream, fileName);
        if (xmlContent) {
          return xmlContent;
        }
      }

      console.warn('No valid XML found in associated files');
      return null;
    } catch (error) {
      console.error('Error in associated files extraction:', error);
      return null;
    }
  }

  /**
   * Read the file name of a file specification dictionary.
   *
   * `F` is tried first, then `UF`. Each key is checked independently, so an
   * `F` entry of an unsupported type no longer prevents `UF` from being used.
   *
   * NOTE(review): names stored as hex strings are still not recognised because
   * only `PDFString` is accepted here — confirm whether that needs support.
   *
   * @param fileSpec File specification dictionary
   * @returns Decoded file name or null if neither key holds a PDFString
   */
  private readFileName(fileSpec: PDFDict): string | null {
    for (const key of ['F', 'UF']) {
      const candidate = fileSpec.lookup(PDFName.of(key));
      if (candidate instanceof PDFString) {
        return candidate.decodeText();
      }
    }
    return null;
  }

  /**
   * Locate the embedded file stream of a file specification dictionary.
   *
   * @param fileSpec File specification dictionary
   * @returns The stream stored under `EF/F` (or `EF/UF` as a fallback), or null
   */
  private readEmbeddedStream(fileSpec: PDFDict): PDFRawStream | null {
    const efDict = fileSpec.lookup(PDFName.of('EF'));
    if (!(efDict instanceof PDFDict)) {
      return null;
    }

    for (const key of ['F', 'UF']) {
      const candidate = efDict.lookup(PDFName.of(key));
      if (candidate instanceof PDFRawStream) {
        return candidate;
      }
    }
    return null;
  }

  /**
   * Decide whether a file name is worth trying as invoice XML.
   *
   * @param fileName Decoded file name
   * @returns True for a known invoice file name, any `.xml` file, or a name
   *          containing one of the invoice-related keywords
   */
  private looksLikeInvoiceXml(fileName: string): boolean {
    const lowered = fileName.toLowerCase();

    const isKnownFileName = this.knownFileNames.some(
      knownName => lowered === knownName.toLowerCase()
    );
    if (isKnownFileName || lowered.endsWith('.xml')) {
      return true;
    }

    return ['zugferd', 'factur-x', 'xrechnung', 'invoice'].some(keyword =>
      lowered.includes(keyword)
    );
  }
}
|
177
ts/formats/pdf/extractors/base.extractor.ts
Normal file
177
ts/formats/pdf/extractors/base.extractor.ts
Normal file
@ -0,0 +1,177 @@
|
||||
import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
|
||||
import * as pako from 'pako';
|
||||
|
||||
/**
|
||||
* Base class for PDF XML extractors with common functionality
|
||||
*/
|
||||
export abstract class BaseXMLExtractor {
  /**
   * Known XML file names for different invoice formats.
   * Subclasses compare against these case-insensitively.
   */
  protected readonly knownFileNames = [
    'factur-x.xml',
    'zugferd-invoice.xml',
    'ZUGFeRD-invoice.xml',
    'xrechnung.xml'
  ];

  /**
   * Markers (root element names and namespace fragments) of which at least one
   * must appear in extracted content for it to be accepted as invoice XML.
   */
  protected readonly knownFormats = [
    'CrossIndustryInvoice',
    'CrossIndustryDocument',
    'Invoice',
    'CreditNote',
    'ubl:Invoice',
    'ubl:CreditNote',
    'rsm:CrossIndustryInvoice',
    'rsm:CrossIndustryDocument',
    'ram:CrossIndustryDocument',
    'urn:un:unece:uncefact',
    'urn:ferd:CrossIndustryDocument',
    'urn:zugferd',
    'urn:factur-x',
    'factur-x.eu',
    'ZUGFeRD'
  ];

  /**
   * Known closing root tags used to find where an XML document ends inside a
   * larger string.
   */
  protected readonly knownEndTags = [
    '</CrossIndustryInvoice>',
    '</CrossIndustryDocument>',
    '</Invoice>',
    '</CreditNote>',
    '</rsm:CrossIndustryInvoice>',
    '</rsm:CrossIndustryDocument>',
    '</ram:CrossIndustryDocument>',
    '</ubl:Invoice>',
    '</ubl:CreditNote>'
  ];

  /**
   * Extract XML from a PDF buffer
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  public abstract extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null>;

  /**
   * Heuristically check whether a string looks like invoice XML.
   *
   * This is not a well-formedness check. The string must contain an XML
   * declaration and at least one known format marker, must be at least 100
   * characters long, and must not contain the control characters U+0000..U+0005.
   *
   * @param xmlString XML string to check
   * @returns True if the string passes all heuristics
   */
  protected isValidXml(xmlString: string): boolean {
    if (!xmlString || typeof xmlString !== 'string') {
      return false;
    }

    // Anything this short cannot be a complete invoice document
    if (xmlString.length < 100) {
      return false;
    }

    // The declaration only has to be present somewhere, not at position 0
    if (!xmlString.includes('<?xml')) {
      return false;
    }

    if (!this.knownFormats.some(format => xmlString.includes(format))) {
      return false;
    }

    // Low control characters indicate binary data decoded as text
    return !/[\u0000-\u0005]/.test(xmlString);
  }

  /**
   * Cut an XML document out of a larger string.
   *
   * The document starts at the first `<?xml` at or after `startIndex` and ends
   * with the known closing root tag that occurs earliest after that start.
   * Choosing the nearest tag, rather than the first tag in list order, keeps
   * the result to a single document when the text holds several.
   *
   * @param text Text to extract XML from
   * @param startIndex Index to start extraction from
   * @returns XML content or null if no declaration or closing tag is found
   */
  protected extractXmlFromString(text: string, startIndex: number = 0): string | null {
    try {
      const xmlStartIndex = text.indexOf('<?xml', startIndex);
      if (xmlStartIndex === -1) {
        return null;
      }

      let xmlEndIndex = -1;
      for (const endTag of this.knownEndTags) {
        const tagIndex = text.indexOf(endTag, xmlStartIndex);
        if (tagIndex === -1) {
          continue;
        }
        const candidateEnd = tagIndex + endTag.length;
        if (xmlEndIndex === -1 || candidateEnd < xmlEndIndex) {
          xmlEndIndex = candidateEnd;
        }
      }

      if (xmlEndIndex === -1) {
        return null;
      }

      return text.substring(xmlStartIndex, xmlEndIndex);
    } catch (error) {
      console.error('Error extracting XML from string:', error);
      return null;
    }
  }

  /**
   * Decompress and decode XML content from a PDF stream.
   *
   * Inflating is attempted first. If that throws, or the inflated text does not
   * validate, the raw stream bytes are decoded and validated instead.
   *
   * @param stream PDF stream containing XML data
   * @param fileName Name of the file (for logging)
   * @returns XML content or null if neither form validates
   */
  protected async extractXmlFromStream(stream: PDFRawStream, fileName: string): Promise<string | null> {
    try {
      // Use the typed-array view itself: its `.buffer` is the whole backing
      // ArrayBuffer, which can be larger than the stream contents.
      const streamBytes = stream.getContents();
      const decoder = new TextDecoder('utf-8');

      try {
        const decompressedBytes = pako.inflate(streamBytes);
        const xmlContent = decoder.decode(decompressedBytes);

        if (this.isValidXml(xmlContent)) {
          console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`);
          return xmlContent;
        }
      } catch (decompressError) {
        // Not compressed (or not inflatable): fall through to the raw bytes
        console.log(`Decompression failed for ${fileName}, trying without decompression...`);
      }

      const rawContent = decoder.decode(streamBytes);
      if (this.isValidXml(rawContent)) {
        console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
        return rawContent;
      }

      return null;
    } catch (error) {
      console.error('Error extracting XML from stream:', error);
      return null;
    }
  }
}
|
4
ts/formats/pdf/extractors/index.ts
Normal file
4
ts/formats/pdf/extractors/index.ts
Normal file
@ -0,0 +1,4 @@
|
||||
export * from './base.extractor.js';
|
||||
export * from './standard.extractor.js';
|
||||
export * from './associated.extractor.js';
|
||||
export * from './text.extractor.js';
|
86
ts/formats/pdf/extractors/standard.extractor.ts
Normal file
86
ts/formats/pdf/extractors/standard.extractor.ts
Normal file
@ -0,0 +1,86 @@
|
||||
import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
|
||||
import { BaseXMLExtractor } from './base.extractor.js';
|
||||
|
||||
/**
|
||||
* Standard PDF XML extractor that extracts XML from embedded files
|
||||
* Works with PDF/A-3 documents that follow the standard for embedding files
|
||||
*/
|
||||
export class StandardXMLExtractor extends BaseXMLExtractor {
  /**
   * Extract XML from a PDF buffer using the catalog's `Names/EmbeddedFiles`
   * name tree.
   *
   * Both layouts of the name tree are handled: a `Names` array directly on the
   * node, and intermediate nodes that reference children through `Kids`.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content, or null when no embedded file yields valid XML or
   *          the PDF cannot be processed (errors are logged, never thrown)
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBuffer);

      // The catalog's Names dictionary holds the document-level name trees
      const namesDictObj = pdfDoc.catalog.lookup(PDFName.of('Names'));
      if (!(namesDictObj instanceof PDFDict)) {
        console.warn('No Names dictionary found in PDF! This PDF does not contain embedded files.');
        return null;
      }

      // Root node of the embedded files name tree
      const embeddedFilesDictObj = namesDictObj.lookup(PDFName.of('EmbeddedFiles'));
      if (!(embeddedFilesDictObj instanceof PDFDict)) {
        console.warn('No EmbeddedFiles dictionary found! This PDF does not contain embedded files.');
        return null;
      }

      const hasNames = embeddedFilesDictObj.lookup(PDFName.of('Names')) instanceof PDFArray;
      const hasKids = embeddedFilesDictObj.lookup(PDFName.of('Kids')) instanceof PDFArray;
      if (!hasNames && !hasKids) {
        console.warn('No files specified in EmbeddedFiles dictionary!');
        return null;
      }

      const xmlContent = await this.searchNameTree(embeddedFilesDictObj, new Set<PDFDict>());
      if (xmlContent) {
        return xmlContent;
      }

      console.warn('No valid XML found in embedded files');
      return null;
    } catch (error) {
      console.error('Error in standard extraction:', error);
      return null;
    }
  }

  /**
   * Search one name tree node, then its children, for an invoice XML file.
   *
   * NOTE(review): keys stored as hex strings are skipped because only
   * `PDFString` keys are accepted — confirm whether that needs support.
   *
   * @param node Name tree node to inspect
   * @param visited Nodes already inspected; guards against cyclic `Kids` references
   * @returns XML content of the first matching file, or null
   */
  private async searchNameTree(node: PDFDict, visited: Set<PDFDict>): Promise<string | null> {
    if (visited.has(node)) {
      return null;
    }
    visited.add(node);

    // Leaf entries: alternating [name, file specification] pairs
    const names = node.lookup(PDFName.of('Names'));
    if (names instanceof PDFArray) {
      // `i + 1 < size` ignores a dangling key at the end of an odd-length array
      for (let i = 0; i + 1 < names.size(); i += 2) {
        const fileNameObj = names.lookup(i);
        const fileSpecObj = names.lookup(i + 1);
        if (!(fileNameObj instanceof PDFString) || !(fileSpecObj instanceof PDFDict)) {
          continue;
        }

        const fileName = fileNameObj.decodeText();
        if (!this.looksLikeInvoiceXml(fileName)) {
          continue;
        }

        const efDictObj = fileSpecObj.lookup(PDFName.of('EF'));
        if (!(efDictObj instanceof PDFDict)) {
          continue;
        }

        const fileStream = efDictObj.lookup(PDFName.of('F'));
        if (fileStream instanceof PDFRawStream) {
          const xmlContent = await this.extractXmlFromStream(fileStream, fileName);
          if (xmlContent) {
            return xmlContent;
          }
        }
      }
    }

    // Intermediate entries: child nodes of the name tree
    const kids = node.lookup(PDFName.of('Kids'));
    if (kids instanceof PDFArray) {
      for (let i = 0; i < kids.size(); i++) {
        const kid = kids.lookup(i);
        if (!(kid instanceof PDFDict)) {
          continue;
        }
        const xmlContent = await this.searchNameTree(kid, visited);
        if (xmlContent) {
          return xmlContent;
        }
      }
    }

    return null;
  }

  /**
   * Decide whether a file name is worth trying as invoice XML.
   *
   * @param fileName Decoded file name
   * @returns True for a known invoice file name, any `.xml` file, or a name
   *          containing one of the invoice-related keywords
   */
  private looksLikeInvoiceXml(fileName: string): boolean {
    const lowered = fileName.toLowerCase();

    const isKnownFileName = this.knownFileNames.some(
      knownName => lowered === knownName.toLowerCase()
    );
    if (isKnownFileName || lowered.endsWith('.xml')) {
      return true;
    }

    return ['zugferd', 'factur-x', 'xrechnung', 'invoice'].some(keyword =>
      lowered.includes(keyword)
    );
  }
}
|
55
ts/formats/pdf/extractors/text.extractor.ts
Normal file
55
ts/formats/pdf/extractors/text.extractor.ts
Normal file
@ -0,0 +1,55 @@
|
||||
import { BaseXMLExtractor } from './base.extractor.js';
|
||||
|
||||
/**
|
||||
* Text-based XML extractor for PDF documents
|
||||
* Extracts XML by searching for XML patterns in the PDF text
|
||||
* Used as a fallback when other extraction methods fail
|
||||
*/
|
||||
export class TextXMLExtractor extends BaseXMLExtractor {
  /** Number of leading bytes of the PDF that are decoded and scanned. */
  private static readonly MAX_SCAN_BYTES = 50000;

  /** Patterns that hint at the presence of XML in the decoded text. */
  private static readonly XML_PATTERNS: readonly RegExp[] = [
    /<\?xml[^>]*\?>/i,
    /<CrossIndustryInvoice[^>]*>/i,
    /<CrossIndustryDocument[^>]*>/i,
    /<Invoice[^>]*>/i,
    /<CreditNote[^>]*>/i,
    /<rsm:CrossIndustryInvoice[^>]*>/i,
    /<rsm:CrossIndustryDocument[^>]*>/i,
    /<ram:CrossIndustryDocument[^>]*>/i,
    /<ubl:Invoice[^>]*>/i,
    /<ubl:CreditNote[^>]*>/i
  ];

  /**
   * Extract XML from a PDF buffer by searching for XML patterns in the text.
   *
   * Only the first `MAX_SCAN_BYTES` bytes are decoded (as UTF-8) and scanned.
   * For every pattern that matches, extraction is tried from the nearest XML
   * declaration before the match, because the declaration precedes the root
   * element. If that fails, extraction is tried from the match position itself.
   *
   * @param pdfBuffer PDF buffer
   * @returns XML content, or null when nothing valid is found or an error
   *          occurs (errors are logged, never thrown)
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
      const scanLength = Math.min(pdfBuffer.length, TextXMLExtractor.MAX_SCAN_BYTES);
      const pdfString = Buffer.from(pdfBuffer).toString('utf8', 0, scanLength);

      for (const pattern of TextXMLExtractor.XML_PATTERNS) {
        const match = pattern.exec(pdfString);
        if (match === null) {
          continue;
        }

        console.log(`Found XML pattern in PDF: ${match[0]}`);

        // Prefer the declaration at or before the match, then the match itself
        const declarationIndex = pdfString.lastIndexOf('<?xml', match.index);
        const startCandidates =
          declarationIndex !== -1 && declarationIndex !== match.index
            ? [declarationIndex, match.index]
            : [match.index];

        for (const start of startCandidates) {
          const xmlContent = this.extractXmlFromString(pdfString, start);
          if (xmlContent && this.isValidXml(xmlContent)) {
            console.log('Successfully extracted XML from PDF text');
            return xmlContent;
          }
        }
      }

      console.warn('No valid XML found in PDF text');
      return null;
    } catch (error) {
      console.error('Error in text-based extraction:', error);
      return null;
    }
  }
}
|
@ -1,30 +1,54 @@
|
||||
import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
|
||||
import * as pako from 'pako';
|
||||
import {
|
||||
BaseXMLExtractor,
|
||||
StandardXMLExtractor,
|
||||
AssociatedFilesExtractor,
|
||||
TextXMLExtractor
|
||||
} from './extractors/index.js';
|
||||
|
||||
/**
|
||||
* Class for extracting XML from PDF files
|
||||
* Main PDF extractor class that orchestrates the extraction process
|
||||
* Uses multiple specialized extractors in sequence to maximize success rate
|
||||
*/
|
||||
export class PDFExtractor {
|
||||
private extractors: BaseXMLExtractor[] = [];
|
||||
|
||||
/**
|
||||
* Extracts XML from a PDF buffer
|
||||
* Constructor initializes the chain of extractors
|
||||
*/
|
||||
constructor() {
|
||||
// Add extractors in order of preference/likelihood of success
|
||||
this.extractors.push(
|
||||
new StandardXMLExtractor(), // Standard PDF/A-3 embedded files
|
||||
new AssociatedFilesExtractor(), // Associated files (ZUGFeRD v1, some Factur-X)
|
||||
new TextXMLExtractor() // Text-based extraction (fallback)
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract XML from a PDF buffer
|
||||
* Tries multiple extraction methods in sequence
|
||||
* @param pdfBuffer PDF buffer
|
||||
* @returns XML content or null if not found
|
||||
*/
|
||||
public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
|
||||
try {
|
||||
// First try the standard extraction
|
||||
const standardXml = await this.standardExtraction(pdfBuffer);
|
||||
if (standardXml && this.isValidXml(standardXml)) {
|
||||
return standardXml;
|
||||
console.log('Starting XML extraction from PDF...');
|
||||
|
||||
// Try each extractor in sequence
|
||||
for (const extractor of this.extractors) {
|
||||
const extractorName = extractor.constructor.name;
|
||||
console.log(`Trying extraction with ${extractorName}...`);
|
||||
|
||||
const xml = await extractor.extractXml(pdfBuffer);
|
||||
if (xml) {
|
||||
console.log(`Successfully extracted XML using ${extractorName}`);
|
||||
return xml;
|
||||
}
|
||||
|
||||
console.log(`Extraction with ${extractorName} failed, trying next method...`);
|
||||
}
|
||||
|
||||
// If standard extraction fails, try alternative methods
|
||||
const alternativeXml = await this.alternativeExtraction(pdfBuffer);
|
||||
if (alternativeXml && this.isValidXml(alternativeXml)) {
|
||||
return alternativeXml;
|
||||
}
|
||||
|
||||
// If all extraction methods fail, return null
|
||||
// If all extractors fail, return null
|
||||
console.warn('All extraction methods failed, no valid XML found in PDF');
|
||||
return null;
|
||||
} catch (error) {
|
||||
@ -33,255 +57,7 @@ export class PDFExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Standard extraction method using PDF-lib
|
||||
* @param pdfBuffer PDF buffer
|
||||
* @returns XML content or null if not found
|
||||
*/
|
||||
private async standardExtraction(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
|
||||
try {
|
||||
const pdfDoc = await PDFDocument.load(pdfBuffer);
|
||||
|
||||
// Get the document's metadata dictionary
|
||||
const namesDictObj = pdfDoc.catalog.lookup(PDFName.of('Names'));
|
||||
if (!(namesDictObj instanceof PDFDict)) {
|
||||
console.warn('No Names dictionary found in PDF! This PDF does not contain embedded files.');
|
||||
return null;
|
||||
}
|
||||
|
||||
const embeddedFilesDictObj = namesDictObj.lookup(PDFName.of('EmbeddedFiles'));
|
||||
if (!(embeddedFilesDictObj instanceof PDFDict)) {
|
||||
console.warn('No EmbeddedFiles dictionary found! This PDF does not contain embedded files.');
|
||||
return null;
|
||||
}
|
||||
|
||||
const filesSpecObj = embeddedFilesDictObj.lookup(PDFName.of('Names'));
|
||||
if (!(filesSpecObj instanceof PDFArray)) {
|
||||
console.warn('No files specified in EmbeddedFiles dictionary!');
|
||||
return null;
|
||||
}
|
||||
|
||||
// Try to find an XML file in the embedded files
|
||||
let xmlFile: PDFRawStream | undefined;
|
||||
let xmlFileName: string | undefined;
|
||||
|
||||
for (let i = 0; i < filesSpecObj.size(); i += 2) {
|
||||
const fileNameObj = filesSpecObj.lookup(i);
|
||||
const fileSpecObj = filesSpecObj.lookup(i + 1);
|
||||
|
||||
if (!(fileNameObj instanceof PDFString)) {
|
||||
continue;
|
||||
}
|
||||
if (!(fileSpecObj instanceof PDFDict)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Get the filename as string
|
||||
const fileName = fileNameObj.toString();
|
||||
|
||||
// Check if it's an XML file (checking both extension and known standard filenames)
|
||||
if (fileName.toLowerCase().includes('.xml') ||
|
||||
fileName.toLowerCase().includes('factur-x') ||
|
||||
fileName.toLowerCase().includes('zugferd') ||
|
||||
fileName.toLowerCase().includes('xrechnung')) {
|
||||
|
||||
const efDictObj = fileSpecObj.lookup(PDFName.of('EF'));
|
||||
if (!(efDictObj instanceof PDFDict)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const maybeStream = efDictObj.lookup(PDFName.of('F'));
|
||||
if (maybeStream instanceof PDFRawStream) {
|
||||
// Found an XML file - save it
|
||||
xmlFile = maybeStream;
|
||||
xmlFileName = fileName;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If no XML file was found, return null
|
||||
if (!xmlFile) {
|
||||
console.warn('No embedded XML file found in the PDF!');
|
||||
return null;
|
||||
}
|
||||
|
||||
// Decompress and decode the XML content
|
||||
try {
|
||||
// Try to decompress with pako
|
||||
const xmlCompressedBytes = xmlFile.getContents().buffer;
|
||||
const xmlBytes = pako.inflate(xmlCompressedBytes);
|
||||
const xmlContent = new TextDecoder('utf-8').decode(xmlBytes);
|
||||
|
||||
// Check if the XML content is valid
|
||||
if (this.isValidXml(xmlContent)) {
|
||||
console.log(`Successfully extracted XML from PDF file. File name: ${xmlFileName}`);
|
||||
return xmlContent;
|
||||
}
|
||||
|
||||
// If we get here, the XML content is not valid, try without decompression
|
||||
console.log('Decompression succeeded but XML is not valid, trying without decompression...');
|
||||
const rawXmlBytes = xmlFile.getContents();
|
||||
const rawXmlContent = new TextDecoder('utf-8').decode(rawXmlBytes);
|
||||
|
||||
if (this.isValidXml(rawXmlContent)) {
|
||||
console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${xmlFileName}`);
|
||||
return rawXmlContent;
|
||||
}
|
||||
|
||||
// If we get here, neither the decompressed nor the raw XML content is valid
|
||||
console.log('Neither decompressed nor raw XML content is valid');
|
||||
return null;
|
||||
} catch (decompressError) {
|
||||
// Decompression failed, try without decompression
|
||||
console.log('Decompression failed, trying without decompression...');
|
||||
try {
|
||||
const xmlBytes = xmlFile.getContents();
|
||||
const xmlContent = new TextDecoder('utf-8').decode(xmlBytes);
|
||||
|
||||
if (this.isValidXml(xmlContent)) {
|
||||
console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${xmlFileName}`);
|
||||
return xmlContent;
|
||||
}
|
||||
|
||||
// If we get here, the XML content is not valid
|
||||
console.log('Uncompressed XML content is not valid');
|
||||
return null;
|
||||
} catch (decodeError) {
|
||||
console.error('Error decoding XML content:', decodeError);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error in standard extraction:', error);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Alternative extraction method using string search
|
||||
* @param pdfBuffer PDF buffer
|
||||
* @returns XML content or null if not found
|
||||
*/
|
||||
private async alternativeExtraction(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
|
||||
try {
|
||||
// Convert buffer to string and look for XML patterns
|
||||
const pdfString = Buffer.from(pdfBuffer).toString('utf8', 0, Math.min(pdfBuffer.length, 10000));
|
||||
|
||||
// Look for common XML patterns in the PDF
|
||||
const xmlPatterns = [
|
||||
/<\?xml[^>]*\?>/i,
|
||||
/<CrossIndustryInvoice[^>]*>/i,
|
||||
/<Invoice[^>]*>/i,
|
||||
/<CreditNote[^>]*>/i,
|
||||
/<rsm:CrossIndustryInvoice[^>]*>/i
|
||||
];
|
||||
|
||||
for (const pattern of xmlPatterns) {
|
||||
const match = pdfString.match(pattern);
|
||||
if (match) {
|
||||
console.log(`Found XML pattern in PDF: ${match[0]}`);
|
||||
|
||||
// Try to extract the XML content
|
||||
const xmlContent = this.extractXmlFromString(pdfString);
|
||||
if (xmlContent) {
|
||||
console.log('Successfully extracted XML from PDF string');
|
||||
return xmlContent;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
} catch (error) {
|
||||
console.error('Error in alternative extraction:', error);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts XML from a string
|
||||
* @param pdfString PDF string
|
||||
* @returns XML content or null if not found
|
||||
*/
|
||||
private extractXmlFromString(pdfString: string): string | null {
|
||||
try {
|
||||
// Look for XML start and end tags
|
||||
const xmlStartIndex = pdfString.indexOf('<?xml');
|
||||
if (xmlStartIndex === -1) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Try to find the end of the XML document
|
||||
const possibleEndTags = [
|
||||
'</CrossIndustryInvoice>',
|
||||
'</Invoice>',
|
||||
'</CreditNote>',
|
||||
'</rsm:CrossIndustryInvoice>'
|
||||
];
|
||||
|
||||
let xmlEndIndex = -1;
|
||||
for (const endTag of possibleEndTags) {
|
||||
const endIndex = pdfString.indexOf(endTag);
|
||||
if (endIndex !== -1) {
|
||||
xmlEndIndex = endIndex + endTag.length;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (xmlEndIndex === -1) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Extract the XML content
|
||||
return pdfString.substring(xmlStartIndex, xmlEndIndex);
|
||||
} catch (error) {
|
||||
console.error('Error extracting XML from string:', error);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if an XML string is valid
|
||||
* @param xmlString XML string to check
|
||||
* @returns True if the XML is valid
|
||||
*/
|
||||
private isValidXml(xmlString: string): boolean {
|
||||
try {
|
||||
// Check if the XML string contains basic XML structure
|
||||
if (!xmlString.includes('<?xml')) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if the XML string contains known invoice formats
|
||||
const knownFormats = [
|
||||
'CrossIndustryInvoice',
|
||||
'Invoice',
|
||||
'CreditNote',
|
||||
'ubl:Invoice',
|
||||
'ubl:CreditNote'
|
||||
];
|
||||
|
||||
const hasKnownFormat = knownFormats.some(format => xmlString.includes(format));
|
||||
if (!hasKnownFormat) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if the XML string contains binary data or invalid characters
|
||||
const invalidChars = ['\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005'];
|
||||
const hasBinaryData = invalidChars.some(char => xmlString.includes(char));
|
||||
if (hasBinaryData) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if the XML string is too short
|
||||
if (xmlString.length < 100) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
} catch (error) {
|
||||
console.error('Error validating XML:', error);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user