fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata.

This commit is contained in:
2025-04-04 12:14:41 +00:00
parent 68fd50fd4c
commit 5d43c1ce4e
15 changed files with 1957 additions and 418 deletions

View File

@ -13,6 +13,18 @@ export class FormatDetector {
*/
public static detectFormat(xml: string): InvoiceFormat {
try {
// Quick check for empty or invalid XML
if (!xml || typeof xml !== 'string' || xml.trim().length === 0) {
return InvoiceFormat.UNKNOWN;
}
// Quick string-based pre-checks for performance
const quickCheck = FormatDetector.quickFormatCheck(xml);
if (quickCheck !== InvoiceFormat.UNKNOWN) {
return quickCheck;
}
// More thorough parsing-based checks
const doc = new DOMParser().parseFromString(xml, 'application/xml');
const root = doc.documentElement;
@ -21,106 +33,26 @@ export class FormatDetector {
}
// UBL detection (Invoice or CreditNote root element)
if (root.nodeName === 'Invoice' || root.nodeName === 'CreditNote') {
// For simplicity, we'll treat all UBL documents as XRechnung for now
// In a real implementation, we would check for specific customization IDs
return InvoiceFormat.XRECHNUNG;
if (FormatDetector.isUBLFormat(root)) {
// Check for XRechnung customization
if (FormatDetector.isXRechnungFormat(doc)) {
return InvoiceFormat.XRECHNUNG;
}
return InvoiceFormat.UBL;
}
// Factur-X/ZUGFeRD detection (CrossIndustryInvoice or CrossIndustryDocument root element)
if (root.nodeName === 'rsm:CrossIndustryInvoice' || root.nodeName === 'CrossIndustryInvoice' ||
root.nodeName.endsWith(':CrossIndustryInvoice')) {
// Set up namespaces for XPath queries (ZUGFeRD v2/Factur-X)
const namespaces = {
rsm: 'urn:un:unece:uncefact:data:standard:CrossIndustryInvoice:100',
ram: 'urn:un:unece:uncefact:data:standard:ReusableAggregateBusinessInformationEntity:100'
};
// Create XPath selector with namespaces
const select = xpath.useNamespaces(namespaces);
// Look for profile identifier
const profileNode = select(
'string(//rsm:ExchangedDocumentContext/ram:GuidelineSpecifiedDocumentContextParameter/ram:ID)',
doc
);
if (profileNode) {
const profileText = profileNode.toString();
// Check for ZUGFeRD profiles
if (profileText.includes('zugferd') ||
profileText === CII_PROFILE_IDS.ZUGFERD_BASIC ||
profileText === CII_PROFILE_IDS.ZUGFERD_COMFORT ||
profileText === CII_PROFILE_IDS.ZUGFERD_EXTENDED) {
return InvoiceFormat.ZUGFERD;
}
// Check for Factur-X profiles
if (profileText.includes('factur-x') ||
profileText === CII_PROFILE_IDS.FACTURX_MINIMUM ||
profileText === CII_PROFILE_IDS.FACTURX_BASIC ||
profileText === CII_PROFILE_IDS.FACTURX_EN16931) {
return InvoiceFormat.FACTURX;
}
}
// If we can't determine the specific CII format, default to generic CII
return InvoiceFormat.CII;
// Factur-X/ZUGFeRD detection (CrossIndustryInvoice root element)
if (FormatDetector.isCIIFormat(root)) {
return FormatDetector.detectCIIFormat(doc, xml);
}
// ZUGFeRD v1 detection (CrossIndustryDocument root element)
if (root.nodeName === 'rsm:CrossIndustryDocument' || root.nodeName === 'CrossIndustryDocument' ||
root.nodeName === 'ram:CrossIndustryDocument' || root.nodeName.endsWith(':CrossIndustryDocument')) {
// Check for ZUGFeRD v1 namespace in the document
const xmlString = xml.toString();
if (xmlString.includes('urn:ferd:CrossIndustryDocument:invoice:1p0') ||
xmlString.includes('urn:un:unece:uncefact:data:standard:ReusableAggregateBusinessInformationEntity:12') ||
xmlString.includes('urn:ferd:CrossIndustryDocument') ||
xmlString.includes('zugferd') ||
xmlString.includes('ZUGFeRD')) {
return InvoiceFormat.ZUGFERD;
}
// Set up namespaces for XPath queries (ZUGFeRD v1)
try {
const namespaces = {
rsm: ZUGFERD_V1_NAMESPACES.RSM,
ram: ZUGFERD_V1_NAMESPACES.RAM
};
// Create XPath selector with namespaces
const select = xpath.useNamespaces(namespaces);
// Look for profile identifier
const profileNode = select(
'string(//rsm:SpecifiedExchangedDocumentContext/ram:GuidelineSpecifiedDocumentContextParameter/ram:ID)',
doc
);
if (profileNode) {
const profileText = profileNode.toString();
// Check for ZUGFeRD v1 profiles
if (profileText.includes('ferd:CrossIndustryDocument:invoice:1p0') ||
profileText === CII_PROFILE_IDS.ZUGFERD_V1_BASIC ||
profileText === CII_PROFILE_IDS.ZUGFERD_V1_COMFORT ||
profileText === CII_PROFILE_IDS.ZUGFERD_V1_EXTENDED) {
return InvoiceFormat.ZUGFERD;
}
}
} catch (error) {
console.log('Error in ZUGFeRD v1 XPath detection:', error);
}
// If we can't determine the specific profile but it's a CrossIndustryDocument, it's likely ZUGFeRD v1
if (FormatDetector.isZUGFeRDV1Format(root)) {
return InvoiceFormat.ZUGFERD;
}
// FatturaPA detection would be implemented here
if (root.nodeName === 'FatturaElettronica' ||
(root.getAttribute('xmlns') && root.getAttribute('xmlns')!.includes('fatturapa.gov.it'))) {
// FatturaPA detection
if (FormatDetector.isFatturaPAFormat(root)) {
return InvoiceFormat.FATTURAPA;
}
@ -130,4 +62,241 @@ export class FormatDetector {
return InvoiceFormat.UNKNOWN;
}
}
}
/**
 * Cheap substring pre-screen that runs before any XML parsing.
 *
 * The input is lower-cased once, so every marker comparison below is
 * case-insensitive. Marker groups are tested in a fixed order
 * (Factur-X, ZUGFeRD, XRechnung, FatturaPA) and the first group that
 * matches decides the result.
 *
 * @param xml XML string
 * @returns The format whose markers were found, or UNKNOWN when none of the
 *          markers occur and a parsing-based analysis is required
 */
private static quickFormatCheck(xml: string): InvoiceFormat {
  const haystack = xml.toLowerCase();
  const containsAny = (markers: readonly string[]): boolean =>
    markers.some((marker) => haystack.includes(marker));

  // Factur-X: either a direct marker, or the EN 16931 URN together with
  // any mention of "factur-x".
  const en16931WithFacturX =
    haystack.includes('urn:cen.eu:en16931:2017') && haystack.includes('factur-x');
  if (containsAny(['factur-x.eu', 'factur-x.xml', 'factur-x:']) || en16931WithFacturX) {
    return InvoiceFormat.FACTURX;
  }

  // ZUGFeRD
  if (containsAny(['zugferd:', 'zugferd-invoice.xml', 'urn:ferd:', 'urn:zugferd'])) {
    return InvoiceFormat.ZUGFERD;
  }

  // XRechnung
  if (containsAny(['xrechnung', 'urn:xoev-de:kosit:standard:xrechnung'])) {
    return InvoiceFormat.XRECHNUNG;
  }

  // FatturaPA
  if (containsAny(['fatturapa', 'fattura elettronica', 'fatturaelettronica'])) {
    return InvoiceFormat.FATTURAPA;
  }

  // No marker matched; the caller has to inspect the parsed document.
  return InvoiceFormat.UNKNOWN;
}
/**
 * Tells whether the root element names a UBL document.
 *
 * The root qualifies when its node name is `Invoice` or `CreditNote`,
 * either bare or behind any namespace prefix (for example `ubl:Invoice`).
 *
 * @param root Root element
 * @returns True if the root element is a UBL Invoice or CreditNote
 */
private static isUBLFormat(root: Element): boolean {
  const tag = root.nodeName;
  const ublRootNames = ['Invoice', 'CreditNote'];
  // A bare name or any "<prefix>:<name>" form is accepted.
  return ublRootNames.some((name) => tag === name || tag.endsWith(`:${name}`));
}
/**
 * Checks whether a UBL document declares an XRechnung customization.
 *
 * Every `CustomizationID` element is inspected; the document counts as
 * XRechnung when the text of any of them contains "xrechnung"
 * (compared case-insensitively). Elements are located by the UBL
 * CommonBasicComponents namespace, so the prefix used in the document
 * does not matter; the literal `cbc:CustomizationID` tag name is also
 * queried for documents that use the prefix without a matching namespace
 * declaration.
 *
 * If the DOM lookup throws, the document is serialized and searched as a
 * plain string instead.
 *
 * @param doc XML document
 * @returns True if an XRechnung customization marker is present
 */
private static isXRechnungFormat(doc: Document): boolean {
  const cbcNamespace = 'urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2';
  const marker = 'xrechnung';

  try {
    // A node matched by both lookups is simply tested twice, which is harmless.
    const customizationNodes = [
      ...Array.from(doc.getElementsByTagNameNS(cbcNamespace, 'CustomizationID')),
      ...Array.from(doc.getElementsByTagName('cbc:CustomizationID'))
    ];

    return customizationNodes.some((node) =>
      (node.textContent || '').toLowerCase().includes(marker)
    );
  } catch (error) {
    console.warn('Error checking for XRechnung format:', error);

    // If direct DOM access fails, try a string-based approach
    const xmlStr = new XMLSerializer().serializeToString(doc);
    return xmlStr.toLowerCase().includes(marker);
  }
}
/**
 * Tells whether the root element is a `CrossIndustryInvoice`, the root
 * used by CII-based documents (Factur-X / ZUGFeRD v2+).
 *
 * The name is accepted bare or behind any namespace prefix
 * (for example `rsm:CrossIndustryInvoice`).
 *
 * @param root Root element
 * @returns True if the root element is a CrossIndustryInvoice
 */
private static isCIIFormat(root: Element): boolean {
  const ciiRootName = 'CrossIndustryInvoice';
  const tag = root.nodeName;

  if (tag === ciiRootName) {
    return true;
  }
  return tag.endsWith(`:${ciiRootName}`);
}
/**
 * Tells whether the root element is a `CrossIndustryDocument`, which this
 * detector treats as the ZUGFeRD v1 root.
 *
 * The name is accepted bare or behind any namespace prefix
 * (for example `rsm:CrossIndustryDocument` or `ram:CrossIndustryDocument`).
 *
 * @param root Root element
 * @returns True if the root element is a CrossIndustryDocument
 */
private static isZUGFeRDV1Format(root: Element): boolean {
  const v1RootName = 'CrossIndustryDocument';
  const tag = root.nodeName;

  if (tag === v1RootName) {
    return true;
  }
  return tag.endsWith(`:${v1RootName}`);
}
/**
 * Checks if the document is a FatturaPA format.
 *
 * The root qualifies when it is named `FatturaElettronica`, or when its
 * `xmlns` attribute contains `fatturapa.gov.it`.
 *
 * Always returns a real boolean: a missing or empty `xmlns` attribute
 * yields `false` rather than leaking `null` or `''` to the caller.
 *
 * @param root Root element
 * @returns True if it's a FatturaPA format
 */
private static isFatturaPAFormat(root: Element): boolean {
  if (root.nodeName === 'FatturaElettronica') {
    return true;
  }

  // Read the attribute once and narrow it explicitly instead of relying on
  // a non-null assertion.
  const defaultNamespace = root.getAttribute('xmlns');
  return typeof defaultNamespace === 'string' && defaultNamespace.includes('fatturapa.gov.it');
}
/**
 * Detects the specific CII format (Factur-X vs ZUGFeRD).
 *
 * Walks `ExchangedDocumentContext` ->
 * `GuidelineSpecifiedDocumentContextParameter` -> `ID` using DOM traversal
 * and classifies the first ID whose text identifies a known profile.
 * Each level is looked up by namespace URI (so the prefix chosen by the
 * document does not matter) and additionally by the literal tag name the
 * detector has always accepted; duplicates from the two lookups are removed.
 *
 * For each ID the ZUGFeRD profiles are tested before the Factur-X ones.
 * When no ID identifies a profile, or when traversal throws, the result
 * comes from string-based detection on the raw XML.
 *
 * @param doc XML document
 * @param xml Original XML string for fallback checks
 * @returns Detected format
 */
private static detectCIIFormat(doc: Document, xml: string): InvoiceFormat {
  const rsmNamespace = 'urn:un:unece:uncefact:data:standard:CrossIndustryInvoice:100';
  const ramNamespace =
    'urn:un:unece:uncefact:data:standard:ReusableAggregateBusinessInformationEntity:100';

  // Union of the namespace-aware lookup and the literal tag-name lookup,
  // with nodes matched by both reported only once.
  const findElements = (
    scope: Document | Element,
    namespaceUri: string,
    localName: string,
    literalTagName: string
  ) =>
    Array.from(
      new Set([
        ...Array.from(scope.getElementsByTagNameNS(namespaceUri, localName)),
        ...Array.from(scope.getElementsByTagName(literalTagName))
      ])
    );

  try {
    const contextNodes = findElements(
      doc,
      rsmNamespace,
      'ExchangedDocumentContext',
      'ExchangedDocumentContext'
    );

    for (const contextNode of contextNodes) {
      const guidelineNodes = findElements(
        contextNode,
        ramNamespace,
        'GuidelineSpecifiedDocumentContextParameter',
        'ram:GuidelineSpecifiedDocumentContextParameter'
      );

      for (const guidelineNode of guidelineNodes) {
        const idNodes = findElements(guidelineNode, ramNamespace, 'ID', 'ram:ID');

        for (const idNode of idNodes) {
          // Pretty-printed XML may pad the ID with whitespace, which would
          // defeat the exact profile comparisons below.
          const profileText = (idNode.textContent || '').trim();

          // Check for ZUGFeRD profiles
          if (
            profileText.includes('zugferd') ||
            profileText === CII_PROFILE_IDS.ZUGFERD_BASIC ||
            profileText === CII_PROFILE_IDS.ZUGFERD_COMFORT ||
            profileText === CII_PROFILE_IDS.ZUGFERD_EXTENDED
          ) {
            return InvoiceFormat.ZUGFERD;
          }

          // Check for Factur-X profiles
          if (
            profileText.includes('factur-x') ||
            profileText === CII_PROFILE_IDS.FACTURX_MINIMUM ||
            profileText === CII_PROFILE_IDS.FACTURX_BASIC ||
            profileText === CII_PROFILE_IDS.FACTURX_EN16931
          ) {
            return InvoiceFormat.FACTURX;
          }
        }
      }
    }

    // No context node or no recognizable profile ID: use string checking.
    return FormatDetector.detectCIIFormatFromString(xml);
  } catch (error) {
    console.warn('Error detecting CII format, falling back to string-based detection:', error);
    return FormatDetector.detectCIIFormatFromString(xml);
  }
}
/**
 * Last-resort CII classification based on substrings of the raw XML.
 *
 * Factur-X spellings are tested first, then ZUGFeRD spellings; the
 * comparison is case-sensitive and only the listed spellings are matched.
 *
 * @param xml XML string
 * @returns FACTURX or ZUGFERD when a matching spelling occurs, otherwise
 *          the generic CII format
 */
private static detectCIIFormatFromString(xml: string): InvoiceFormat {
  const mentions = (...spellings: string[]): boolean =>
    spellings.some((spelling) => xml.includes(spelling));

  if (mentions('factur-x', 'Factur-X')) {
    return InvoiceFormat.FACTURX;
  }

  // Anything that is neither Factur-X nor ZUGFeRD stays generic CII.
  return mentions('zugferd', 'ZUGFeRD') ? InvoiceFormat.ZUGFERD : InvoiceFormat.CII;
}
}