diff --git a/changelog.md b/changelog.md index 76fdc40..59034f6 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,15 @@ # Changelog +## 2025-04-04 - 4.1.6 - fix(core) +Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata. + +- Update loadPdf to capture extraction result details including detected format and improve error messaging +- Enhance TextXMLExtractor with a chunked approach using both UTF-8 and Latin-1 decoding for reliable text extraction +- Refactor PDFEmbedder to return a structured PDFEmbedResult with proper filename normalization and robust error handling +- Extend format detection logic by adding quickFormatCheck, isUBLFormat, isXRechnungFormat, isCIIFormat, isZUGFERDV1Format, and FatturaPA checks +- Introduce new validator classes (UBLValidator, XRechnungValidator, FatturaPAValidator) and a generic fallback validator in ValidatorFactory +- Update IPdf interface to include embedded XML metadata (format, filename, description) for better traceability + ## 2025-04-03 - 4.1.5 - fix(core) No uncommitted changes detected in the repository. The project files and functionality remain unchanged. diff --git a/test/output/corpus-summary.md b/test/output/corpus-summary.md index 1be8082..6276a59 100644 --- a/test/output/corpus-summary.md +++ b/test/output/corpus-summary.md @@ -1,6 +1,6 @@ # XInvoice Corpus Testing Summary -Generated on: 2025-04-03T21:33:20.326Z +Generated on: 2025-04-04T12:11:35.722Z ## Overall Summary diff --git a/test/output/test-invoice-with-xml.pdf b/test/output/test-invoice-with-xml.pdf index eb4203f..58392ac 100644 Binary files a/test/output/test-invoice-with-xml.pdf and b/test/output/test-invoice-with-xml.pdf differ diff --git a/test/output/xml-rechnung-corpus-results.json b/test/output/xml-rechnung-corpus-results.json index 1aa1ad5..0d58928 100644 --- a/test/output/xml-rechnung-corpus-results.json +++ b/test/output/xml-rechnung-corpus-results.json @@ -1,7 +1,7 @@ { "cii": { - "success": 27, - "fail": 0, + "success": 23, + "fail": 4, "details": [ { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/CII/EN16931_1_Teilrechnung.cii.xml", @@ -137,27 +137,27 @@ }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/CII/XRECHNUNG_Betriebskostenabrechnung.cii.xml", - "success": true, - "format": "cii", - "error": null + "success": false, + "format": "xrechnung", + "error": "Wrong format detected: xrechnung, expected: cii" }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/CII/XRECHNUNG_Einfach.cii.xml", - "success": true, - "format": "cii", - "error": null + "success": false, + "format": "xrechnung", + "error": "Wrong format detected: xrechnung, expected: cii" }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/CII/XRECHNUNG_Elektron.cii.xml", - "success": true, - "format": "cii", - "error": null + "success": false, + "format": "xrechnung", + "error": "Wrong format detected: xrechnung, expected: cii" }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/CII/XRECHNUNG_Reisekostenabrechnung.cii.xml", - "success": true, - "format": "cii", - "error": null + "success": false, + "format": "xrechnung", + "error": "Wrong format detected: xrechnung, expected: cii" }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/CII/not_validating_full_invoice_based_onTest_EeISI_300_CENfullmodel.cii.xml", @@ -174,133 +174,133 @@ { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_1_Teilrechnung.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_2_Teilrechnung.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_AbweichenderZahlungsempf.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_Betriebskostenabrechnung.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_Einfach.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_Einfach_DueDate.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_Einfach_negativePaymentDue.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_Elektron.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_ElektronischeAdresse.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_Gutschrift.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_Haftpflichtversicherung_Versicherungssteuer.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_Innergemeinschaftliche_Lieferungen.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_Kraftfahrversicherung_Bruttopreise.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_Miete.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_OEPNV.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_Physiotherapeut.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_Rabatte.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_RechnungsUebertragung.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_Rechnungskorrektur.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_Reisekostenabrechnung.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_SEPA_Prenotification.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/EN16931_Sachversicherung_berechneter_Steuersatz.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { @@ -330,13 +330,13 @@ { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/not_validating_full_invoice_based_onTest_EeISI_300_CENfullmodel.ubl.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/XML-Rechnung/UBL/ubl-tc434-creditnote1.xml", "success": true, - "format": "xrechnung", + "format": "ubl", "error": null } ] @@ -346,5 +346,5 @@ "fail": 0, "details": [] }, - "totalSuccessRate": 1 + "totalSuccessRate": 0.9272727272727272 } \ No newline at end of file diff --git a/test/output/zugferd-corpus-results.json b/test/output/zugferd-corpus-results.json index e22aa24..0f2504c 100644 --- a/test/output/zugferd-corpus-results.json +++ b/test/output/zugferd-corpus-results.json @@ -1,13 +1,13 @@ { "zugferdV1Correct": { - "success": 18, - "fail": 3, + "success": 21, + "fail": 0, "details": [ { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv1/correct/4s4u/additional-data-sample-1.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "zugferd", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv1/correct/Intarsys/ZUGFeRD_1p0_BASIC_Einfach.pdf", @@ -89,15 +89,15 @@ }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv1/correct/Mustangproject/MustangGnuaccountingBeispielRE-20140519_499.pdf", - "success": false, - "format": null, - "error": "Error: Unsupported invoice format: unknown" + "success": true, + "format": "zugferd", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv1/correct/Mustangproject/MustangGnuaccountingBeispielRE-20140522_501.pdf", - "success": false, - "format": null, - "error": "Error: Unsupported invoice format: unknown" + "success": true, + "format": "zugferd", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv1/correct/Mustangproject/MustangGnuaccountingBeispielRE-20140703_502.pdf", @@ -156,8 +156,8 @@ ] }, "zugferdV2Correct": { - "success": 48, - "fail": 30, + "success": 74, + "fail": 4, "details": [ { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/FNFE-factur-x-examples/Avoir_FR_type381_BASIC.pdf", @@ -221,183 +221,183 @@ }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/PHP_@gpFacturX/sample_inofficial_20190125_atgp_factur-x_v_1_0.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/BASIC/zugferd_2p0_BASIC_Einfach.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "zugferd", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/BASIC/zugferd_2p0_BASIC_Rechnungskorrektur.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "zugferd", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/BASIC/zugferd_2p0_BASIC_Taxifahrt.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "zugferd", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EN16931/zugferd_2p0_EN16931_1_Teilrechnung.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EN16931/zugferd_2p0_EN16931_2_Teilrechnung.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EN16931/zugferd_2p0_EN16931_AbweichenderZahlungsempf.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EN16931/zugferd_2p0_EN16931_Betriebskostenabrechnung.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EN16931/zugferd_2p0_EN16931_Einfach.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EN16931/zugferd_2p0_EN16931_Elektron.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EN16931/zugferd_2p0_EN16931_ElektronischeAdresse.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EN16931/zugferd_2p0_EN16931_Gutschrift.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EN16931/zugferd_2p0_EN16931_Haftpflichtversicherung_Versicherungssteuer.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EN16931/zugferd_2p0_EN16931_Innergemeinschaftliche_Lieferungen.pdf", "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "format": "xrechnung", + "error": "Wrong format detected: xrechnung" }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EN16931/zugferd_2p0_EN16931_Kraftfahrversicherung_Bruttopreise.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EN16931/zugferd_2p0_EN16931_Miete.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EN16931/zugferd_2p0_EN16931_OEPNV.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EN16931/zugferd_2p0_EN16931_Physiotherapeut.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EN16931/zugferd_2p0_EN16931_Rabatte.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EN16931/zugferd_2p0_EN16931_RechnungsUebertragung.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EN16931/zugferd_2p0_EN16931_Rechnungskorrektur.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EN16931/zugferd_2p0_EN16931_Reisekostenabrechnung.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EN16931/zugferd_2p0_EN16931_SEPA_Prenotification.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EN16931/zugferd_2p0_EN16931_Sachversicherung_berechneter_Steuersatz.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EXTENDED/zugferd_2p0_EXTENDED_Fremdwaehrung.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "zugferd", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EXTENDED/zugferd_2p0_EXTENDED_InnergemeinschLieferungMehrereBestellungen.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "zugferd", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EXTENDED/zugferd_2p0_EXTENDED_Kostenrechnung.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "facturx", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EXTENDED/zugferd_2p0_EXTENDED_Rechnungskorrektur.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "zugferd", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/EXTENDED/zugferd_2p0_EXTENDED_Warenrechnung.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "zugferd", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/intarsys/MINIMUM/zugferd_2p0_MINIMUM.pdf", - "success": false, - "format": null, - "error": "Error: No XML found in PDF" + "success": true, + "format": "zugferd", + "error": null }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/symtrax/Beispiele/BASIC/zugferd_2p1_BASIC_Einfach.pdf", @@ -455,9 +455,9 @@ }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/symtrax/Beispiele/EN16931/zugferd_2p1_EN16931_Betriebskostenabrechnung_XRechnung_embedded.pdf", - "success": true, - "format": "cii", - "error": null + "success": false, + "format": "xrechnung", + "error": "Wrong format detected: xrechnung" }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/symtrax/Beispiele/EN16931/zugferd_2p1_EN16931_Einfach.pdf", @@ -485,9 +485,9 @@ }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/symtrax/Beispiele/EN16931/zugferd_2p1_EN16931_Elektron_XRechnung.pdf", - "success": true, - "format": "cii", - "error": null + "success": false, + "format": "xrechnung", + "error": "Wrong format detected: xrechnung" }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/symtrax/Beispiele/EN16931/zugferd_2p1_EN16931_Elektron_embedded.pdf", @@ -569,9 +569,9 @@ }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/symtrax/Beispiele/EN16931/zugferd_2p1_EN16931_Reisekostenabrechnung_XRechnung_embedded.pdf", - "success": true, - "format": "cii", - "error": null + "success": false, + "format": "xrechnung", + "error": "Wrong format detected: xrechnung" }, { "file": "/mnt/data/lossless/fin.cx/xinvoice/test/assets/corpus/ZUGFeRDv2/correct/symtrax/Beispiele/EN16931/zugferd_2p1_EN16931_SEPA_Prenotification.pdf", @@ -749,5 +749,5 @@ } ] }, - "totalCorrectSuccessRate": 0.6666666666666666 + "totalCorrectSuccessRate": 0.9595959595959596 } \ No newline at end of file diff --git a/ts/00_commitinfo_data.ts b/ts/00_commitinfo_data.ts index 2f2cf27..08da188 100644 --- a/ts/00_commitinfo_data.ts +++ b/ts/00_commitinfo_data.ts @@ -3,6 +3,6 @@ */ export const commitinfo = { name: '@fin.cx/xinvoice', - version: '4.1.5', + version: '4.1.6', description: 'A TypeScript module for creating, manipulating, and embedding XML data within PDF files specifically tailored for xinvoice packages.' } diff --git a/ts/classes.xinvoice.ts b/ts/classes.xinvoice.ts index e5a41c7..7d4308b 100644 --- a/ts/classes.xinvoice.ts +++ b/ts/classes.xinvoice.ts @@ -189,34 +189,38 @@ export class XInvoice { public async loadPdf(pdfBuffer: Uint8Array | Buffer, validate: boolean = false): Promise { try { // Extract XML from PDF using the consolidated extractor - // which tries multiple extraction methods in sequence - const xmlContent = await this.pdfExtractor.extractXml(pdfBuffer); - + const extractResult = await this.pdfExtractor.extractXml(pdfBuffer); + // Store the PDF buffer this.pdf = { name: 'invoice.pdf', id: `invoice-${Date.now()}`, metadata: { - textExtraction: '' + textExtraction: '', + format: extractResult.success ? extractResult.format?.toString() : undefined }, buffer: pdfBuffer instanceof Buffer ? new Uint8Array(pdfBuffer) : pdfBuffer }; - - if (!xmlContent) { - // No XML found in PDF - console.warn('No XML found in PDF'); - throw new Error('No XML found in PDF'); + + // Handle extraction result + if (!extractResult.success || !extractResult.xml) { + const errorMessage = extractResult.error ? extractResult.error.message : 'Unknown error extracting XML from PDF'; + console.warn('XML extraction failed:', errorMessage); + throw new Error(`No XML found in PDF: ${errorMessage}`); } - + // Load the extracted XML - await this.loadXml(xmlContent, validate); - + await this.loadXml(extractResult.xml, validate); + + // Store the detected format + this.detectedFormat = extractResult.format || InvoiceFormat.UNKNOWN; + return this; } catch (error) { console.error('Error loading PDF:', error); throw error; } - } + } /** * Copies data from a TInvoice object @@ -281,7 +285,7 @@ export class XInvoice { valid: false, errors: [{ code: 'VAL-ERROR', - message: `Validation error: ${error.message}` + message: `Validation error: ${error instanceof Error ? error.message : String(error)}` }], level }; @@ -356,7 +360,7 @@ export class XInvoice { } // Embed XML into PDF - const modifiedPdf = await this.pdfEmbedder.createPdfWithXml( + const result = await this.pdfEmbedder.createPdfWithXml( this.pdf.buffer, xmlContent, filename, @@ -365,7 +369,14 @@ export class XInvoice { this.pdf.id ); - return modifiedPdf; + // Handle potential errors + if (!result.success || !result.pdf) { + const errorMessage = result.error ? result.error.message : 'Unknown error embedding XML into PDF'; + console.error('Error exporting PDF:', errorMessage); + throw new Error(`Failed to export PDF: ${errorMessage}`); + } + + return result.pdf; } /** @@ -392,4 +403,4 @@ export class XInvoice { public isFormat(format: InvoiceFormat): boolean { return this.detectedFormat === format; } -} +} \ No newline at end of file diff --git a/ts/formats/cii/zugferd/zugferd.encoder.ts b/ts/formats/cii/zugferd/zugferd.encoder.ts index 5f14973..dd5c7da 100644 --- a/ts/formats/cii/zugferd/zugferd.encoder.ts +++ b/ts/formats/cii/zugferd/zugferd.encoder.ts @@ -2,6 +2,7 @@ import { CIIBaseEncoder } from '../cii.encoder.js'; import type { TInvoice, TCreditNote, TDebitNote } from '../../../interfaces/common.js'; import { ZUGFERD_PROFILE_IDS } from './zugferd.types.js'; import { CIIProfile } from '../cii.types.js'; +import { DOMParser, XMLSerializer } from '../../../plugins.js'; /** * Encoder for ZUGFeRD invoice format @@ -19,12 +20,17 @@ export class ZUGFeRDEncoder extends CIIBaseEncoder { * @returns ZUGFeRD XML string */ protected async encodeCreditNote(creditNote: TCreditNote): Promise { - // Create XML root - const xml = this.createXmlRoot(); + // Create base XML + const xmlDoc = this.createBaseXml(); - // For now, return a basic XML structure - // In a real implementation, we would populate the XML with credit note data - return xml; + // Set document type code to credit note (381) + this.setDocumentTypeCode(xmlDoc, '381'); + + // Add common invoice data + this.addCommonInvoiceData(xmlDoc, creditNote); + + // Serialize to string + return new XMLSerializer().serializeToString(xmlDoc); } /** @@ -33,11 +39,616 @@ export class ZUGFeRDEncoder extends CIIBaseEncoder { * @returns ZUGFeRD XML string */ protected async encodeDebitNote(debitNote: TDebitNote): Promise { - // Create XML root - const xml = this.createXmlRoot(); + // Create base XML + const xmlDoc = this.createBaseXml(); - // For now, return a basic XML structure - // In a real implementation, we would populate the XML with debit note data - return xml; + // Set document type code to invoice (380) + this.setDocumentTypeCode(xmlDoc, '380'); + + // Add common invoice data + this.addCommonInvoiceData(xmlDoc, debitNote); + + // Serialize to string + return new XMLSerializer().serializeToString(xmlDoc); } -} + + /** + * Creates a base ZUGFeRD XML document + * @returns XML document with basic structure + */ + private createBaseXml(): Document { + // Create XML document from template + const xmlString = this.createXmlRoot(); + const doc = new DOMParser().parseFromString(xmlString, 'application/xml'); + + // Add ZUGFeRD profile + this.addProfile(doc); + + return doc; + } + + /** + * Adds ZUGFeRD profile information to the XML document + * @param doc XML document + */ + private addProfile(doc: Document): void { + // Get root element + const root = doc.documentElement; + + // Create context element if it doesn't exist + let contextElement = root.getElementsByTagName('rsm:ExchangedDocumentContext')[0]; + if (!contextElement) { + contextElement = doc.createElement('rsm:ExchangedDocumentContext'); + root.appendChild(contextElement); + } + + // Create guideline parameter element + const guidelineElement = doc.createElement('ram:GuidelineSpecifiedDocumentContextParameter'); + contextElement.appendChild(guidelineElement); + + // Add ID element with profile + const idElement = doc.createElement('ram:ID'); + + // Set profile based on the selected profile + let profileId = ZUGFERD_PROFILE_IDS.BASIC; + if (this.profile === CIIProfile.COMFORT) { + profileId = ZUGFERD_PROFILE_IDS.COMFORT; + } else if (this.profile === CIIProfile.EXTENDED) { + profileId = ZUGFERD_PROFILE_IDS.EXTENDED; + } + + idElement.textContent = profileId; + guidelineElement.appendChild(idElement); + } + + /** + * Sets the document type code in the XML document + * @param doc XML document + * @param typeCode Document type code (380 for invoice, 381 for credit note) + */ + private setDocumentTypeCode(doc: Document, typeCode: string): void { + // Get root element + const root = doc.documentElement; + + // Create document element if it doesn't exist + let documentElement = root.getElementsByTagName('rsm:ExchangedDocument')[0]; + if (!documentElement) { + documentElement = doc.createElement('rsm:ExchangedDocument'); + root.appendChild(documentElement); + } + + // Add type code element + const typeCodeElement = doc.createElement('ram:TypeCode'); + typeCodeElement.textContent = typeCode; + documentElement.appendChild(typeCodeElement); + } + + /** + * Adds common invoice data to the XML document + * @param doc XML document + * @param invoice Invoice data + */ + private addCommonInvoiceData(doc: Document, invoice: TInvoice): void { + // Get root element + const root = doc.documentElement; + + // Get document element or create it + let documentElement = root.getElementsByTagName('rsm:ExchangedDocument')[0]; + if (!documentElement) { + documentElement = doc.createElement('rsm:ExchangedDocument'); + root.appendChild(documentElement); + } + + // Add ID element + const idElement = doc.createElement('ram:ID'); + idElement.textContent = invoice.id; + documentElement.appendChild(idElement); + + // Add issue date element + const issueDateElement = doc.createElement('ram:IssueDateTime'); + const dateStringElement = doc.createElement('udt:DateTimeString'); + dateStringElement.setAttribute('format', '102'); // YYYYMMDD format + dateStringElement.textContent = this.formatDateYYYYMMDD(invoice.date); + issueDateElement.appendChild(dateStringElement); + documentElement.appendChild(issueDateElement); + + // Add notes if available + if (invoice.notes && invoice.notes.length > 0) { + for (const note of invoice.notes) { + const noteElement = doc.createElement('ram:IncludedNote'); + const contentElement = doc.createElement('ram:Content'); + contentElement.textContent = note; + noteElement.appendChild(contentElement); + documentElement.appendChild(noteElement); + } + } + + // Create transaction element if it doesn't exist + let transactionElement = root.getElementsByTagName('rsm:SupplyChainTradeTransaction')[0]; + if (!transactionElement) { + transactionElement = doc.createElement('rsm:SupplyChainTradeTransaction'); + root.appendChild(transactionElement); + } + + // Add agreement section with seller and buyer + this.addAgreementSection(doc, transactionElement, invoice); + + // Add delivery section + this.addDeliverySection(doc, transactionElement, invoice); + + // Add settlement section with payment terms and totals + this.addSettlementSection(doc, transactionElement, invoice); + + // Add line items + this.addLineItems(doc, transactionElement, invoice); + } + + /** + * Adds agreement section with seller and buyer information + * @param doc XML document + * @param transactionElement Transaction element + * @param invoice Invoice data + */ + private addAgreementSection(doc: Document, transactionElement: Element, invoice: TInvoice): void { + // Create agreement element + const agreementElement = doc.createElement('ram:ApplicableHeaderTradeAgreement'); + transactionElement.appendChild(agreementElement); + + // Add buyer reference if available + if (invoice.buyerReference) { + const buyerRefElement = doc.createElement('ram:BuyerReference'); + buyerRefElement.textContent = invoice.buyerReference; + agreementElement.appendChild(buyerRefElement); + } + + // Add seller + const sellerElement = doc.createElement('ram:SellerTradeParty'); + this.addPartyInfo(doc, sellerElement, invoice.from); + + // Add seller electronic address if available + if (invoice.electronicAddress && invoice.from.type === 'company') { + const contactElement = doc.createElement('ram:DefinedTradeContact'); + const uriElement = doc.createElement('ram:URIID'); + uriElement.setAttribute('schemeID', invoice.electronicAddress.scheme); + uriElement.textContent = invoice.electronicAddress.value; + contactElement.appendChild(uriElement); + sellerElement.appendChild(contactElement); + } + + agreementElement.appendChild(sellerElement); + + // Add buyer + const buyerElement = doc.createElement('ram:BuyerTradeParty'); + this.addPartyInfo(doc, buyerElement, invoice.to); + agreementElement.appendChild(buyerElement); + } + + /** + * Adds party information to an element + * @param doc XML document + * @param partyElement Party element + * @param party Party data + */ + private addPartyInfo(doc: Document, partyElement: Element, party: any): void { + // Add name + const nameElement = doc.createElement('ram:Name'); + nameElement.textContent = party.name; + partyElement.appendChild(nameElement); + + // Add postal address + const addressElement = doc.createElement('ram:PostalTradeAddress'); + + // Add address line 1 (street) + if (party.address.streetName) { + const line1Element = doc.createElement('ram:LineOne'); + line1Element.textContent = party.address.streetName; + addressElement.appendChild(line1Element); + } + + // Add address line 2 (house number) if present + if (party.address.houseNumber && party.address.houseNumber !== '0') { + const line2Element = doc.createElement('ram:LineTwo'); + line2Element.textContent = party.address.houseNumber; + addressElement.appendChild(line2Element); + } + + // Add postal code + if (party.address.postalCode) { + const postalCodeElement = doc.createElement('ram:PostcodeCode'); + postalCodeElement.textContent = party.address.postalCode; + addressElement.appendChild(postalCodeElement); + } + + // Add city + if (party.address.city) { + const cityElement = doc.createElement('ram:CityName'); + cityElement.textContent = party.address.city; + addressElement.appendChild(cityElement); + } + + // Add country + if (party.address.country || party.address.countryCode) { + const countryElement = doc.createElement('ram:CountryID'); + countryElement.textContent = party.address.countryCode || party.address.country; + addressElement.appendChild(countryElement); + } + + partyElement.appendChild(addressElement); + + // Add VAT ID if available + if (party.registrationDetails && party.registrationDetails.vatId) { + const taxRegistrationElement = doc.createElement('ram:SpecifiedTaxRegistration'); + const taxIdElement = doc.createElement('ram:ID'); + taxIdElement.setAttribute('schemeID', 'VA'); + taxIdElement.textContent = party.registrationDetails.vatId; + taxRegistrationElement.appendChild(taxIdElement); + partyElement.appendChild(taxRegistrationElement); + } + + // Add registration ID if available + if (party.registrationDetails && party.registrationDetails.registrationId) { + const regRegistrationElement = doc.createElement('ram:SpecifiedTaxRegistration'); + const regIdElement = doc.createElement('ram:ID'); + regIdElement.setAttribute('schemeID', 'FC'); + regIdElement.textContent = party.registrationDetails.registrationId; + regRegistrationElement.appendChild(regIdElement); + partyElement.appendChild(regRegistrationElement); + } + } + + /** + * Adds delivery section with delivery information + * @param doc XML document + * @param transactionElement Transaction element + * @param invoice Invoice data + */ + private addDeliverySection(doc: Document, transactionElement: Element, invoice: TInvoice): void { + // Create delivery element + const deliveryElement = doc.createElement('ram:ApplicableHeaderTradeDelivery'); + transactionElement.appendChild(deliveryElement); + + // Add delivery date if available + if (invoice.deliveryDate) { + const deliveryDateElement = doc.createElement('ram:ActualDeliverySupplyChainEvent'); + const occurrenceDateElement = doc.createElement('ram:OccurrenceDateTime'); + const dateStringElement = doc.createElement('udt:DateTimeString'); + dateStringElement.setAttribute('format', '102'); // YYYYMMDD format + dateStringElement.textContent = this.formatDateYYYYMMDD(invoice.deliveryDate); + occurrenceDateElement.appendChild(dateStringElement); + deliveryDateElement.appendChild(occurrenceDateElement); + deliveryElement.appendChild(deliveryDateElement); + } + + // Add period of performance if available + if (invoice.periodOfPerformance) { + const periodElement = doc.createElement('ram:BillingSpecifiedPeriod'); + + // Start date + if (invoice.periodOfPerformance.from) { + const startDateElement = doc.createElement('ram:StartDateTime'); + const startDateStringElement = doc.createElement('udt:DateTimeString'); + startDateStringElement.setAttribute('format', '102'); // YYYYMMDD format + startDateStringElement.textContent = this.formatDateYYYYMMDD(invoice.periodOfPerformance.from); + startDateElement.appendChild(startDateStringElement); + periodElement.appendChild(startDateElement); + } + + // End date + if (invoice.periodOfPerformance.to) { + const endDateElement = doc.createElement('ram:EndDateTime'); + const endDateStringElement = doc.createElement('udt:DateTimeString'); + endDateStringElement.setAttribute('format', '102'); // YYYYMMDD format + endDateStringElement.textContent = this.formatDateYYYYMMDD(invoice.periodOfPerformance.to); + endDateElement.appendChild(endDateStringElement); + periodElement.appendChild(endDateElement); + } + + deliveryElement.appendChild(periodElement); + } + } + + /** + * Adds settlement section with payment terms and totals + * @param doc XML document + * @param transactionElement Transaction element + * @param invoice Invoice data + */ + private addSettlementSection(doc: Document, transactionElement: Element, invoice: TInvoice): void { + // Create settlement element + const settlementElement = doc.createElement('ram:ApplicableHeaderTradeSettlement'); + transactionElement.appendChild(settlementElement); + + // Add currency + const currencyElement = doc.createElement('ram:InvoiceCurrencyCode'); + currencyElement.textContent = invoice.currency; + settlementElement.appendChild(currencyElement); + + // Add payment terms + const paymentTermsElement = doc.createElement('ram:SpecifiedTradePaymentTerms'); + + // Add payment instructions if available + if (invoice.paymentOptions) { + // Add payment instructions as description - this is generic enough to work with any payment type + const descriptionElement = doc.createElement('ram:Description'); + descriptionElement.textContent = `Due in ${invoice.dueInDays} days. ${invoice.paymentOptions.info || ''}`; + paymentTermsElement.appendChild(descriptionElement); + } + + // Add due date + const dueDateElement = doc.createElement('ram:DueDateDateTime'); + const dateStringElement = doc.createElement('udt:DateTimeString'); + dateStringElement.setAttribute('format', '102'); // YYYYMMDD format + + // Calculate due date + const dueDate = new Date(invoice.date); + dueDate.setDate(dueDate.getDate() + invoice.dueInDays); + + dateStringElement.textContent = this.formatDateYYYYMMDD(dueDate.getTime()); + dueDateElement.appendChild(dateStringElement); + paymentTermsElement.appendChild(dueDateElement); + + settlementElement.appendChild(paymentTermsElement); + + // Add payment means if available (using a generic approach) + if (invoice.paymentOptions) { + const paymentMeansElement = doc.createElement('ram:SpecifiedTradeSettlementPaymentMeans'); + + // Payment type code (58 for SEPA transfer as default) + const typeCodeElement = doc.createElement('ram:TypeCode'); + typeCodeElement.textContent = '58'; + paymentMeansElement.appendChild(typeCodeElement); + + // Information (optional) + if (invoice.paymentOptions.info) { + const infoElement = doc.createElement('ram:Information'); + infoElement.textContent = invoice.paymentOptions.info; + paymentMeansElement.appendChild(infoElement); + } + + // If payment details are available in a standard format + if (invoice.paymentOptions.sepaConnection.iban) { + // Payee account + const payeeAccountElement = doc.createElement('ram:PayeePartyCreditorFinancialAccount'); + const ibanElement = doc.createElement('ram:IBANID'); + ibanElement.textContent = invoice.paymentOptions.sepaConnection.iban; + payeeAccountElement.appendChild(ibanElement); + paymentMeansElement.appendChild(payeeAccountElement); + + // Payee financial institution if BIC available + if (invoice.paymentOptions.sepaConnection.bic) { + const institutionElement = doc.createElement('ram:PayeeSpecifiedCreditorFinancialInstitution'); + const bicElement = doc.createElement('ram:BICID'); + bicElement.textContent = invoice.paymentOptions.sepaConnection.bic; + institutionElement.appendChild(bicElement); + paymentMeansElement.appendChild(institutionElement); + } + } + + settlementElement.appendChild(paymentMeansElement); + } + + // Add tax details + this.addTaxDetails(doc, settlementElement, invoice); + + // Add totals + this.addMonetarySummation(doc, settlementElement, invoice); + } + + /** + * Adds tax details to the settlement section + * @param doc XML document + * @param settlementElement Settlement element + * @param invoice Invoice data + */ + private addTaxDetails(doc: Document, settlementElement: Element, invoice: TInvoice): void { + // Calculate tax categories and totals + const taxCategories = new Map(); // Map of VAT rate to net amount + + // Calculate from items + if (invoice.items) { + for (const item of invoice.items) { + const itemNetAmount = item.unitNetPrice * item.unitQuantity; + const vatRate = item.vatPercentage; + + const currentAmount = taxCategories.get(vatRate) || 0; + taxCategories.set(vatRate, currentAmount + itemNetAmount); + } + } + + // Add each tax category + for (const [rate, baseAmount] of taxCategories.entries()) { + const taxElement = doc.createElement('ram:ApplicableTradeTax'); + + // Calculate tax amount + const taxAmount = baseAmount * (rate / 100); + + // Add calculated amount + const calculatedAmountElement = doc.createElement('ram:CalculatedAmount'); + calculatedAmountElement.textContent = taxAmount.toFixed(2); + taxElement.appendChild(calculatedAmountElement); + + // Add type code (VAT) + const typeCodeElement = doc.createElement('ram:TypeCode'); + typeCodeElement.textContent = 'VAT'; + taxElement.appendChild(typeCodeElement); + + // Add basis amount + const basisAmountElement = doc.createElement('ram:BasisAmount'); + basisAmountElement.textContent = baseAmount.toFixed(2); + taxElement.appendChild(basisAmountElement); + + // Add category code + const categoryCodeElement = doc.createElement('ram:CategoryCode'); + categoryCodeElement.textContent = invoice.reverseCharge ? 'AE' : 'S'; + taxElement.appendChild(categoryCodeElement); + + // Add rate + const rateElement = doc.createElement('ram:RateApplicablePercent'); + rateElement.textContent = rate.toString(); + taxElement.appendChild(rateElement); + + settlementElement.appendChild(taxElement); + } + } + + /** + * Adds monetary summation to the settlement section + * @param doc XML document + * @param settlementElement Settlement element + * @param invoice Invoice data + */ + private addMonetarySummation(doc: Document, settlementElement: Element, invoice: TInvoice): void { + const monetarySummationElement = doc.createElement('ram:SpecifiedTradeSettlementHeaderMonetarySummation'); + + // Calculate totals + let totalNetAmount = 0; + let totalTaxAmount = 0; + + // Calculate from items + if (invoice.items) { + for (const item of invoice.items) { + const itemNetAmount = item.unitNetPrice * item.unitQuantity; + const itemTaxAmount = itemNetAmount * (item.vatPercentage / 100); + + totalNetAmount += itemNetAmount; + totalTaxAmount += itemTaxAmount; + } + } + + const totalGrossAmount = totalNetAmount + totalTaxAmount; + + // Add line total amount + const lineTotalElement = doc.createElement('ram:LineTotalAmount'); + lineTotalElement.textContent = totalNetAmount.toFixed(2); + monetarySummationElement.appendChild(lineTotalElement); + + // Add tax total amount + const taxTotalElement = doc.createElement('ram:TaxTotalAmount'); + taxTotalElement.textContent = totalTaxAmount.toFixed(2); + taxTotalElement.setAttribute('currencyID', invoice.currency); + monetarySummationElement.appendChild(taxTotalElement); + + // Add grand total amount + const grandTotalElement = doc.createElement('ram:GrandTotalAmount'); + grandTotalElement.textContent = totalGrossAmount.toFixed(2); + monetarySummationElement.appendChild(grandTotalElement); + + // Add due payable amount + const duePayableElement = doc.createElement('ram:DuePayableAmount'); + duePayableElement.textContent = totalGrossAmount.toFixed(2); + monetarySummationElement.appendChild(duePayableElement); + + settlementElement.appendChild(monetarySummationElement); + } + + /** + * Adds line items to the XML document + * @param doc XML document + * @param transactionElement Transaction element + * @param invoice Invoice data + */ + private addLineItems(doc: Document, transactionElement: Element, invoice: TInvoice): void { + // Add each line item + if (invoice.items) { + for (const item of invoice.items) { + // Create line item element + const lineItemElement = doc.createElement('ram:IncludedSupplyChainTradeLineItem'); + + // Add line ID + const lineIdElement = doc.createElement('ram:AssociatedDocumentLineDocument'); + const lineIdValueElement = doc.createElement('ram:LineID'); + lineIdValueElement.textContent = item.position.toString(); + lineIdElement.appendChild(lineIdValueElement); + lineItemElement.appendChild(lineIdElement); + + // Add product information + const productElement = doc.createElement('ram:SpecifiedTradeProduct'); + + // Add name + const nameElement = doc.createElement('ram:Name'); + nameElement.textContent = item.name; + productElement.appendChild(nameElement); + + // Add article number if available + if (item.articleNumber) { + const articleNumberElement = doc.createElement('ram:SellerAssignedID'); + articleNumberElement.textContent = item.articleNumber; + productElement.appendChild(articleNumberElement); + } + + lineItemElement.appendChild(productElement); + + // Add agreement information (price) + const agreementElement = doc.createElement('ram:SpecifiedLineTradeAgreement'); + const priceElement = doc.createElement('ram:NetPriceProductTradePrice'); + const chargeAmountElement = doc.createElement('ram:ChargeAmount'); + chargeAmountElement.textContent = item.unitNetPrice.toFixed(2); + priceElement.appendChild(chargeAmountElement); + agreementElement.appendChild(priceElement); + lineItemElement.appendChild(agreementElement); + + // Add delivery information (quantity) + const deliveryElement = doc.createElement('ram:SpecifiedLineTradeDelivery'); + const quantityElement = doc.createElement('ram:BilledQuantity'); + quantityElement.textContent = item.unitQuantity.toString(); + quantityElement.setAttribute('unitCode', item.unitType); + deliveryElement.appendChild(quantityElement); + lineItemElement.appendChild(deliveryElement); + + // Add settlement information (tax) + const settlementElement = doc.createElement('ram:SpecifiedLineTradeSettlement'); + + // Add tax information + const taxElement = doc.createElement('ram:ApplicableTradeTax'); + + // Add tax type code + const taxTypeCodeElement = doc.createElement('ram:TypeCode'); + taxTypeCodeElement.textContent = 'VAT'; + taxElement.appendChild(taxTypeCodeElement); + + // Add tax category code + const taxCategoryCodeElement = doc.createElement('ram:CategoryCode'); + taxCategoryCodeElement.textContent = invoice.reverseCharge ? 'AE' : 'S'; + taxElement.appendChild(taxCategoryCodeElement); + + // Add tax rate + const taxRateElement = doc.createElement('ram:RateApplicablePercent'); + taxRateElement.textContent = item.vatPercentage.toString(); + taxElement.appendChild(taxRateElement); + + settlementElement.appendChild(taxElement); + + // Add monetary summation + const monetarySummationElement = doc.createElement('ram:SpecifiedLineTradeSettlementMonetarySummation'); + + // Calculate item total + const itemNetAmount = item.unitNetPrice * item.unitQuantity; + + // Add line total amount + const lineTotalElement = doc.createElement('ram:LineTotalAmount'); + lineTotalElement.textContent = itemNetAmount.toFixed(2); + monetarySummationElement.appendChild(lineTotalElement); + + settlementElement.appendChild(monetarySummationElement); + + lineItemElement.appendChild(settlementElement); + + // Add line item to transaction + transactionElement.appendChild(lineItemElement); + } + } + } + + /** + * Formats a date as YYYYMMDD + * @param timestamp Timestamp to format + * @returns Formatted date string + */ + private formatDateYYYYMMDD(timestamp: number): string { + const date = new Date(timestamp); + const year = date.getFullYear(); + const month = (date.getMonth() + 1).toString().padStart(2, '0'); + const day = date.getDate().toString().padStart(2, '0'); + return `${year}${month}${day}`; + } +} \ No newline at end of file diff --git a/ts/formats/factories/validator.factory.ts b/ts/formats/factories/validator.factory.ts index beaa775..1a0b433 100644 --- a/ts/formats/factories/validator.factory.ts +++ b/ts/formats/factories/validator.factory.ts @@ -1,13 +1,181 @@ import { BaseValidator } from '../base/base.validator.js'; -import { InvoiceFormat } from '../../interfaces/common.js'; +import { InvoiceFormat, ValidationLevel } from '../../interfaces/common.js'; +import type { ValidationResult } from '../../interfaces/common.js'; import { FormatDetector } from '../utils/format.detector.js'; // Import specific validators -// import { UBLValidator } from '../ubl/ubl.validator.js'; -// import { XRechnungValidator } from '../ubl/xrechnung/xrechnung.validator.js'; +import { UBLBaseValidator } from '../ubl/ubl.validator.js'; import { FacturXValidator } from '../cii/facturx/facturx.validator.js'; import { ZUGFeRDValidator } from '../cii/zugferd/zugferd.validator.js'; +/** + * UBL validator implementation + * Provides validation for standard UBL documents + */ +class UBLValidator extends UBLBaseValidator { + protected validateStructure(): boolean { + // Basic validation to check for required UBL invoice elements + if (!this.doc) return false; + + let valid = true; + + // Check for required UBL elements + const requiredElements = [ + 'cbc:ID', + 'cbc:IssueDate', + 'cac:AccountingSupplierParty', + 'cac:AccountingCustomerParty' + ]; + + for (const element of requiredElements) { + if (!this.exists(`//${element}`)) { + this.addError( + 'UBL-STRUCT-1', + `Required element ${element} is missing`, + `/${element}` + ); + valid = false; + } + } + + return valid; + } + + protected validateBusinessRules(): boolean { + // Basic business rule validation for UBL + if (!this.doc) return false; + + let valid = true; + + // Check that issue date is present and valid + const issueDateText = this.getText('//cbc:IssueDate'); + if (!issueDateText) { + this.addError( + 'UBL-BUS-1', + 'Issue date is required', + '//cbc:IssueDate' + ); + valid = false; + } else { + const issueDate = new Date(issueDateText); + if (isNaN(issueDate.getTime())) { + this.addError( + 'UBL-BUS-2', + 'Issue date is not a valid date', + '//cbc:IssueDate' + ); + valid = false; + } + } + + // Check that at least one invoice line exists + if (!this.exists('//cac:InvoiceLine') && !this.exists('//cac:CreditNoteLine')) { + this.addError( + 'UBL-BUS-3', + 'At least one invoice line or credit note line is required', + '/' + ); + valid = false; + } + + return valid; + } +} + +/** + * XRechnung validator implementation + * Extends UBL validator with additional XRechnung specific validation rules + */ +class XRechnungValidator extends UBLValidator { + protected validateStructure(): boolean { + // Call the base UBL validation first + const baseValid = super.validateStructure(); + let valid = baseValid; + + // Check for XRechnung-specific elements + if (!this.exists('//cbc:CustomizationID[contains(text(), "xrechnung")]')) { + this.addError( + 'XRECH-STRUCT-1', + 'XRechnung customization ID is missing or invalid', + '//cbc:CustomizationID' + ); + valid = false; + } + + // Check for buyer reference which is mandatory in XRechnung + if (!this.exists('//cbc:BuyerReference')) { + this.addError( + 'XRECH-STRUCT-2', + 'BuyerReference is required in XRechnung', + '//' + ); + valid = false; + } + + return valid; + } + + protected validateBusinessRules(): boolean { + // Call the base UBL business rule validation + const baseValid = super.validateBusinessRules(); + let valid = baseValid; + + // German-specific validation rules + // Check for proper VAT ID structure for German VAT IDs + const supplierVatId = this.getText('//cac:AccountingSupplierParty//cbc:CompanyID[../cac:TaxScheme/cbc:ID="VAT"]'); + if (supplierVatId && supplierVatId.startsWith('DE') && !/^DE[0-9]{9}$/.test(supplierVatId)) { + this.addError( + 'XRECH-BUS-1', + 'German VAT ID format is invalid (must be DE followed by 9 digits)', + '//cac:AccountingSupplierParty//cbc:CompanyID' + ); + valid = false; + } + + return valid; + } +} + +/** + * FatturaPA validator implementation + * Basic implementation for Italian electronic invoices + */ +class FatturaPAValidator extends BaseValidator { + validate(level: ValidationLevel = ValidationLevel.SYNTAX): ValidationResult { + // Reset errors + this.errors = []; + + let valid = true; + + if (level === ValidationLevel.SYNTAX) { + valid = this.validateSchema(); + } else if (level === ValidationLevel.SEMANTIC || level === ValidationLevel.BUSINESS) { + valid = this.validateSchema() && this.validateBusinessRules(); + } + + return { + valid, + errors: this.errors, + level + }; + } + + protected validateSchema(): boolean { + // Basic schema validation for FatturaPA + if (!this.xml.includes('', '', '', - '' + '', + '' ]; /** @@ -69,21 +74,19 @@ export abstract class BaseXMLExtractor { return false; } - // Check if it starts with XML declaration - if (!xmlString.includes(' xmlString.includes(format)); + const hasKnownFormat = this.hasKnownFormat(xmlString); if (!hasKnownFormat) { return false; } // Check if the XML string contains binary data or invalid characters - const invalidChars = ['\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005']; - const hasBinaryData = invalidChars.some(char => xmlString.includes(char)); - if (hasBinaryData) { + if (this.hasBinaryData(xmlString)) { return false; } @@ -92,6 +95,11 @@ export abstract class BaseXMLExtractor { return false; } + // Check if XML has a proper structure (contains both opening and closing tags) + if (!this.hasProperXmlStructure(xmlString)) { + return false; + } + return true; } catch (error) { console.error('Error validating XML:', error); @@ -99,6 +107,85 @@ export abstract class BaseXMLExtractor { } } + /** + * Check if the XML string contains a known element + * @param xmlString XML string to check + * @returns True if the XML contains a known element + */ + protected hasKnownXmlElement(xmlString: string): boolean { + for (const format of this.knownFormats) { + // Check for opening tag of format + if (xmlString.includes(`<${format}`)) { + return true; + } + } + return false; + } + + /** + * Check if the XML string contains a known format + * @param xmlString XML string to check + * @returns True if the XML contains a known format + */ + protected hasKnownFormat(xmlString: string): boolean { + for (const format of this.knownFormats) { + if (xmlString.includes(format)) { + return true; + } + } + return false; + } + + /** + * Check if the XML string has a proper structure + * @param xmlString XML string to check + * @returns True if the XML has a proper structure + */ + protected hasProperXmlStructure(xmlString: string): boolean { + // Check for at least one matching opening and closing tag + for (const endTag of this.knownEndTags) { + const startTag = endTag.replace('/', ''); + if (xmlString.includes(startTag) && xmlString.includes(endTag)) { + return true; + } + } + + // If no specific tag is found but it has a basic XML structure + return ( + (xmlString.includes('')) || + (xmlString.match(/<[^>]+>/) !== null && xmlString.match(/<\/[^>]+>/) !== null) + ); + } + + /** + * Check if the XML string contains binary data + * @param xmlString XML string to check + * @returns True if the XML contains binary data + */ + protected hasBinaryData(xmlString: string): boolean { + // Check for common binary data indicators + const binaryChars = ['\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005']; + const consecutiveNulls = '\u0000\u0000\u0000'; + + // Check for control characters that shouldn't be in XML + if (binaryChars.some(char => xmlString.includes(char))) { + return true; + } + + // Check for consecutive null bytes which indicate binary data + if (xmlString.includes(consecutiveNulls)) { + return true; + } + + // Check for high concentration of non-printable characters + const nonPrintableCount = (xmlString.match(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g) || []).length; + if (nonPrintableCount > xmlString.length * 0.05) { // More than 5% non-printable + return true; + } + + return false; + } + /** * Extract XML from a string * @param text Text to extract XML from @@ -108,9 +195,22 @@ export abstract class BaseXMLExtractor { protected extractXmlFromString(text: string, startIndex: number = 0): string | null { try { // Find the start of the XML document - const xmlStartIndex = text.indexOf(']+>(?!.*<\/[^>]+>)/); + if (lastClosingTagMatch && lastClosingTagMatch.index !== undefined) { + xmlEndIndex = xmlStartIndex + lastClosingTagMatch.index + lastClosingTagMatch[0].length; + } else { + return null; + } } // Extract the XML content - return text.substring(xmlStartIndex, xmlEndIndex); + const xmlContent = text.substring(xmlStartIndex, xmlEndIndex); + + // Validate the extracted content + if (this.isValidXml(xmlContent)) { + return xmlContent; + } + + return null; } catch (error) { console.error('Error extracting XML from string:', error); return null; @@ -143,34 +257,99 @@ export abstract class BaseXMLExtractor { */ protected async extractXmlFromStream(stream: PDFRawStream, fileName: string): Promise { try { - // Try to decompress with pako - const compressedBytes = stream.getContents().buffer; + // Get the raw bytes from the stream + const rawBytes = stream.getContents(); + + // First try without decompression (in case the content is not compressed) + let xmlContent = this.tryDecodeBuffer(rawBytes); + if (xmlContent && this.isValidXml(xmlContent)) { + console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`); + return xmlContent; + } + + // Try with decompression try { - const decompressedBytes = pako.inflate(compressedBytes); - const xmlContent = new TextDecoder('utf-8').decode(decompressedBytes); - - if (this.isValidXml(xmlContent)) { - console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`); - return xmlContent; + const decompressedBytes = this.tryDecompress(rawBytes); + if (decompressedBytes) { + xmlContent = this.tryDecodeBuffer(decompressedBytes); + if (xmlContent && this.isValidXml(xmlContent)) { + console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`); + return xmlContent; + } } } catch (decompressError) { - // Decompression failed, try without decompression - console.log(`Decompression failed for ${fileName}, trying without decompression...`); + console.log(`Decompression failed for ${fileName}: ${decompressError}`); } - - // Try without decompression - const rawBytes = stream.getContents(); - const rawContent = new TextDecoder('utf-8').decode(rawBytes); - - if (this.isValidXml(rawContent)) { - console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`); - return rawContent; - } - + return null; } catch (error) { console.error('Error extracting XML from stream:', error); return null; } } -} + + /** + * Try to decompress a buffer using different methods + * @param buffer Buffer to decompress + * @returns Decompressed buffer or null if decompression failed + */ + protected tryDecompress(buffer: Uint8Array): Uint8Array | null { + try { + // Try pako inflate (for deflate/zlib compression) + return pako.inflate(buffer); + } catch (error) { + // If pako fails, try other methods if needed + console.warn('Pako decompression failed, might be uncompressed or using a different algorithm'); + return null; + } + } + + /** + * Try to decode a buffer to a string using different encodings + * @param buffer Buffer to decode + * @returns Decoded string or null if decoding failed + */ + protected tryDecodeBuffer(buffer: Uint8Array): string | null { + try { + // Try UTF-8 first + let content = new TextDecoder('utf-8').decode(buffer); + if (this.isPlausibleXml(content)) { + return content; + } + + // Try ISO-8859-1 (Latin1) + content = this.decodeLatin1(buffer); + if (this.isPlausibleXml(content)) { + return content; + } + + return null; + } catch (error) { + console.warn('Error decoding buffer:', error); + return null; + } + } + + /** + * Decode a buffer using ISO-8859-1 (Latin1) encoding + * @param buffer Buffer to decode + * @returns Decoded string + */ + protected decodeLatin1(buffer: Uint8Array): string { + return Array.from(buffer) + .map(byte => String.fromCharCode(byte)) + .join(''); + } + + /** + * Check if a string is plausibly XML (quick check before validation) + * @param content String to check + * @returns True if the string is plausibly XML + */ + protected isPlausibleXml(content: string): boolean { + return content.includes('<') && + content.includes('>') && + (content.includes(' content.includes(format))); + } +} \ No newline at end of file diff --git a/ts/formats/pdf/extractors/text.extractor.ts b/ts/formats/pdf/extractors/text.extractor.ts index 8fd4731..b4a58b0 100644 --- a/ts/formats/pdf/extractors/text.extractor.ts +++ b/ts/formats/pdf/extractors/text.extractor.ts @@ -6,50 +6,157 @@ import { BaseXMLExtractor } from './base.extractor.js'; * Used as a fallback when other extraction methods fail */ export class TextXMLExtractor extends BaseXMLExtractor { + // Maximum chunk size to process at once (4MB) + private readonly CHUNK_SIZE = 4 * 1024 * 1024; + + // Maximum number of chunks to check (effective 20MB search limit) + private readonly MAX_CHUNKS = 5; + + // Common XML patterns to look for + private readonly XML_PATTERNS = [ + ' { try { - // Convert buffer to string and look for XML patterns - // Increase the search range to handle larger PDFs - const pdfString = Buffer.from(pdfBuffer).toString('utf8', 0, Math.min(pdfBuffer.length, 50000)); - - // Look for common XML patterns in the PDF - const xmlPatterns = [ - /<\?xml[^>]*\?>/i, - /]*>/i, - /]*>/i, - /]*>/i, - /]*>/i, - /]*>/i, - /]*>/i, - /]*>/i, - /]*>/i, - /]*>/i - ]; - - for (const pattern of xmlPatterns) { - const match = pdfString.match(pattern); - if (match && match.index !== undefined) { - console.log(`Found XML pattern in PDF: ${match[0]}`); - - // Try to extract the XML content - const xmlContent = this.extractXmlFromString(pdfString, match.index); - if (xmlContent && this.isValidXml(xmlContent)) { - console.log('Successfully extracted XML from PDF text'); - return xmlContent; - } - } - } - - console.warn('No valid XML found in PDF text'); - return null; + console.log('Attempting text-based XML extraction from PDF...'); + + // Convert Buffer to Uint8Array if needed + const buffer = Buffer.isBuffer(pdfBuffer) ? new Uint8Array(pdfBuffer) : pdfBuffer; + + // Try extracting XML using the chunked approach + return this.extractXmlFromBufferChunked(buffer); } catch (error) { console.error('Error in text-based extraction:', error); return null; } } -} + + /** + * Extract XML from buffer using a chunked approach + * This helps avoid memory issues with large PDFs + * @param buffer Buffer to search in + * @returns XML content or null if not found + */ + private extractXmlFromBufferChunked(buffer: Uint8Array): string | null { + // Process the PDF in chunks + for (let chunkIndex = 0; chunkIndex < this.MAX_CHUNKS; chunkIndex++) { + const startPos = chunkIndex * this.CHUNK_SIZE; + if (startPos >= buffer.length) break; + + const endPos = Math.min(startPos + this.CHUNK_SIZE, buffer.length); + const chunk = buffer.slice(startPos, endPos); + + // Try to extract XML from this chunk + const chunkResult = this.processChunk(chunk, startPos); + if (chunkResult) { + return chunkResult; + } + } + + console.warn('No valid XML found in any chunk of the PDF'); + return null; + } + + /** + * Process a single chunk of the PDF buffer + * @param chunk Chunk buffer to process + * @param chunkOffset Offset position of the chunk in the original buffer + * @returns XML content or null if not found + */ + private processChunk(chunk: Uint8Array, chunkOffset: number): string | null { + try { + // First try UTF-8 encoding for this chunk + const utf8String = this.decodeBufferToString(chunk, 'utf-8'); + let xmlContent = this.searchForXmlInString(utf8String); + + if (xmlContent) { + console.log(`Found XML content in chunk at offset ${chunkOffset} using UTF-8 encoding`); + return xmlContent; + } + + // If UTF-8 fails, try Latin-1 (ISO-8859-1) which can handle binary better + const latin1String = this.decodeBufferToString(chunk, 'latin1'); + xmlContent = this.searchForXmlInString(latin1String); + + if (xmlContent) { + console.log(`Found XML content in chunk at offset ${chunkOffset} using Latin-1 encoding`); + return xmlContent; + } + + // No XML found in this chunk + return null; + } catch (error) { + console.warn(`Error processing chunk at offset ${chunkOffset}:`, error); + return null; + } + } + + /** + * Safely decode a buffer to string using the specified encoding + * @param buffer Buffer to decode + * @param encoding Encoding to use ('utf-8' or 'latin1') + * @returns Decoded string + */ + private decodeBufferToString(buffer: Uint8Array, encoding: 'utf-8' | 'latin1'): string { + try { + if (encoding === 'utf-8') { + return new TextDecoder('utf-8', { fatal: false }).decode(buffer); + } else { + // For Latin-1 we can use a direct mapping (bytes 0-255 map directly to code points 0-255) + // This is more reliable for binary data than TextDecoder for legacy encodings + return Array.from(buffer) + .map(byte => String.fromCharCode(byte)) + .join(''); + } + } catch (error) { + console.warn(`Error decoding buffer using ${encoding}:`, error); + // Return empty string on error to allow processing to continue + return ''; + } + } + + /** + * Search for XML patterns in a string + * @param content String to search in + * @returns XML content or null if not found + */ + private searchForXmlInString(content: string): string | null { + if (!content) return null; + + // Search for each XML pattern + for (const pattern of this.XML_PATTERNS) { + const patternIndex = content.indexOf(pattern); + if (patternIndex !== -1) { + console.log(`Found XML pattern "${pattern}" at position ${patternIndex}`); + + // Try to extract the XML content starting from the pattern position + const xmlContent = this.extractXmlFromString(content, patternIndex); + + // Validate the extracted content + if (xmlContent && this.isValidXml(xmlContent)) { + console.log('Successfully extracted and validated XML from text'); + return xmlContent; + } + } + } + + return null; + } +} \ No newline at end of file diff --git a/ts/formats/pdf/pdf.embedder.ts b/ts/formats/pdf/pdf.embedder.ts index 8f06db2..571a34e 100644 --- a/ts/formats/pdf/pdf.embedder.ts +++ b/ts/formats/pdf/pdf.embedder.ts @@ -1,8 +1,33 @@ import { PDFDocument, AFRelationship } from '../../plugins.js'; import type { IPdf } from '../../interfaces/common.js'; +/** + * Error types for PDF embedding operations + */ +export enum PDFEmbedError { + LOAD_ERROR = 'PDF loading failed', + EMBED_ERROR = 'XML embedding failed', + SAVE_ERROR = 'PDF saving failed', + INVALID_INPUT = 'Invalid input parameters' +} + +/** + * Result of a PDF embedding operation + */ +export interface PDFEmbedResult { + success: boolean; + data?: Uint8Array; + pdf?: IPdf; + error?: { + type: PDFEmbedError; + message: string; + originalError?: Error; + }; +} + /** * Class for embedding XML into PDF files + * Provides robust error handling and support for different PDF formats */ export class PDFEmbedder { /** @@ -11,40 +36,92 @@ export class PDFEmbedder { * @param xmlContent XML content to embed * @param filename Filename for the embedded XML * @param description Description for the embedded XML - * @returns Modified PDF buffer + * @returns Result with either modified PDF buffer or error information */ public async embedXml( pdfBuffer: Uint8Array | Buffer, xmlContent: string, filename: string = 'invoice.xml', description: string = 'XML Invoice' - ): Promise { + ): Promise { try { + // Validate inputs + if (!pdfBuffer || pdfBuffer.length === 0) { + return this.createErrorResult(PDFEmbedError.INVALID_INPUT, 'PDF buffer is empty or undefined'); + } + + if (!xmlContent) { + return this.createErrorResult(PDFEmbedError.INVALID_INPUT, 'XML content is empty or undefined'); + } + + // Ensure buffer is Uint8Array + const pdfBufferArray = Buffer.isBuffer(pdfBuffer) ? new Uint8Array(pdfBuffer) : pdfBuffer; + // Load the PDF - const pdfDoc = await PDFDocument.load(pdfBuffer); + let pdfDoc: PDFDocument; + try { + pdfDoc = await PDFDocument.load(pdfBufferArray, { + ignoreEncryption: true, // Try to load encrypted PDFs + updateMetadata: false // Don't automatically update metadata + }); + } catch (error) { + return this.createErrorResult( + PDFEmbedError.LOAD_ERROR, + `Failed to load PDF: ${error instanceof Error ? error.message : String(error)}`, + error instanceof Error ? error : undefined + ); + } + + // Normalize filename (lowercase with XML extension) + filename = this.normalizeFilename(filename); // Convert the XML string to a Uint8Array const xmlBuffer = new TextEncoder().encode(xmlContent); - // Make sure filename is lowercase (as required by documentation) - filename = filename.toLowerCase(); - - // Use pdf-lib's .attach() to embed the XML - pdfDoc.attach(xmlBuffer, filename, { - mimeType: 'text/xml', - description: description, - creationDate: new Date(), - modificationDate: new Date(), - afRelationship: AFRelationship.Alternative, - }); + try { + // Use pdf-lib's .attach() to embed the XML + pdfDoc.attach(xmlBuffer, filename, { + mimeType: 'text/xml', + description: description, + creationDate: new Date(), + modificationDate: new Date(), + afRelationship: AFRelationship.Alternative, + }); + } catch (error) { + return this.createErrorResult( + PDFEmbedError.EMBED_ERROR, + `Failed to embed XML: ${error instanceof Error ? error.message : String(error)}`, + error instanceof Error ? error : undefined + ); + } // Save the modified PDF - const modifiedPdfBytes = await pdfDoc.save(); + let modifiedPdfBytes: Uint8Array; + try { + modifiedPdfBytes = await pdfDoc.save({ + addDefaultPage: false, // Don't add a page if the document is empty + useObjectStreams: false, // Better compatibility with older PDF readers + updateFieldAppearances: false // Don't update form fields + }); + } catch (error) { + return this.createErrorResult( + PDFEmbedError.SAVE_ERROR, + `Failed to save modified PDF: ${error instanceof Error ? error.message : String(error)}`, + error instanceof Error ? error : undefined + ); + } - return modifiedPdfBytes; + return { + success: true, + data: modifiedPdfBytes + }; } catch (error) { - console.error('Error embedding XML into PDF:', error); - throw error; + // Catch any uncaught errors + return this.createErrorResult( + PDFEmbedError.EMBED_ERROR, + `Unexpected error during XML embedding: ${error instanceof Error ? error.message : String(error)}`, + error instanceof Error ? error : undefined + ); } } @@ -56,7 +133,7 @@ export class PDFEmbedder { * @param description Description for the embedded XML * @param pdfName Name for the PDF * @param pdfId ID for the PDF - * @returns IPdf object with embedded XML + * @returns Result with either IPdf object or error information */ public async createPdfWithXml( pdfBuffer: Uint8Array | Buffer, @@ -65,16 +142,101 @@ export class PDFEmbedder { description: string = 'XML Invoice', pdfName: string = 'invoice.pdf', pdfId: string = `invoice-${Date.now()}` - ): Promise { - const modifiedPdfBytes = await this.embedXml(pdfBuffer, xmlContent, filename, description); + ): Promise { + // Embed XML into PDF + const embedResult = await this.embedXml(pdfBuffer, xmlContent, filename, description); + + // If embedding failed, return the error + if (!embedResult.success || !embedResult.data) { + return embedResult; + } - return { + // Create IPdf object + const pdfObject: IPdf = { name: pdfName, id: pdfId, metadata: { - textExtraction: '' + textExtraction: '', + format: this.detectPdfFormat(xmlContent), + embeddedXml: { + filename: filename, + description: description + } }, - buffer: modifiedPdfBytes + buffer: embedResult.data + }; + + return { + success: true, + pdf: pdfObject }; } -} + + /** + * Ensures the filename is normalized according to PDF/A requirements + * @param filename Filename to normalize + * @returns Normalized filename + */ + private normalizeFilename(filename: string): string { + // Convert to lowercase + let normalized = filename.toLowerCase(); + + // Ensure it has .xml extension + if (!normalized.endsWith('.xml')) { + normalized = normalized.replace(/\.[^/.]+$/, '') + '.xml'; + } + + // Replace invalid characters + normalized = normalized.replace(/[^a-z0-9_.-]/g, '_'); + + return normalized; + } + + /** + * Tries to detect the format of the XML content + * @param xmlContent XML content + * @returns Format string or undefined + */ + private detectPdfFormat(xmlContent: string): string | undefined { + if (xmlContent.includes('factur-x.eu') || xmlContent.includes('factur-x.xml')) { + return 'factur-x'; + } else if (xmlContent.includes('zugferd') || xmlContent.includes('ZUGFeRD')) { + return 'zugferd'; + } else if (xmlContent.includes('xrechnung')) { + return 'xrechnung'; + } else if (xmlContent.includes(' { + public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise { try { console.log('Starting XML extraction from PDF...'); + // Validate input + if (!pdfBuffer || pdfBuffer.length === 0) { + return this.createErrorResult(PDFExtractError.INVALID_INPUT, 'PDF buffer is empty or undefined'); + } + + // Ensure buffer is Uint8Array + const pdfBufferArray = Buffer.isBuffer(pdfBuffer) ? new Uint8Array(pdfBuffer) : pdfBuffer; + // Try each extractor in sequence for (const extractor of this.extractors) { const extractorName = extractor.constructor.name; console.log(`Trying extraction with ${extractorName}...`); - const xml = await extractor.extractXml(pdfBuffer); - if (xml) { - console.log(`Successfully extracted XML using ${extractorName}`); - return xml; + try { + const xml = await extractor.extractXml(pdfBufferArray); + + if (xml) { + console.log(`Successfully extracted XML using ${extractorName}`); + + // Detect format of the extracted XML + const format = FormatDetector.detectFormat(xml); + + return { + success: true, + xml, + format, + extractorUsed: extractorName + }; + } + + console.log(`Extraction with ${extractorName} failed, trying next method...`); + } catch (error) { + // Log error but continue with next extractor + console.warn(`Error using ${extractorName}: ${error instanceof Error ? error.message : String(error)}`); } - - console.log(`Extraction with ${extractorName} failed, trying next method...`); } - // If all extractors fail, return null - console.warn('All extraction methods failed, no valid XML found in PDF'); - return null; + // If all extractors fail, return a no XML found error + return this.createErrorResult( + PDFExtractError.NO_XML_FOUND, + 'All extraction methods failed, no valid XML found in PDF' + ); } catch (error) { - console.error('Error extracting XML from PDF:', error); - return null; + // Handle any unexpected errors + return this.createErrorResult( + PDFExtractError.EXTRACT_ERROR, + `Unexpected error during XML extraction: ${error instanceof Error ? error.message : String(error)}`, + error instanceof Error ? error : undefined + ); } } - - - -} + /** + * Create a PDF extract result with error information + * @param type Error type + * @param message Error message + * @param originalError Original error object + * @returns Error result + */ + private createErrorResult( + type: PDFExtractError, + message: string, + originalError?: Error + ): PDFExtractResult { + console.error(`PDF Extractor Error (${type}): ${message}`); + if (originalError) { + console.error(originalError); + } + + return { + success: false, + error: { + type, + message, + originalError + } + }; + } +} \ No newline at end of file diff --git a/ts/formats/utils/format.detector.ts b/ts/formats/utils/format.detector.ts index 4a28a60..2befe76 100644 --- a/ts/formats/utils/format.detector.ts +++ b/ts/formats/utils/format.detector.ts @@ -13,6 +13,18 @@ export class FormatDetector { */ public static detectFormat(xml: string): InvoiceFormat { try { + // Quick check for empty or invalid XML + if (!xml || typeof xml !== 'string' || xml.trim().length === 0) { + return InvoiceFormat.UNKNOWN; + } + + // Quick string-based pre-checks for performance + const quickCheck = FormatDetector.quickFormatCheck(xml); + if (quickCheck !== InvoiceFormat.UNKNOWN) { + return quickCheck; + } + + // More thorough parsing-based checks const doc = new DOMParser().parseFromString(xml, 'application/xml'); const root = doc.documentElement; @@ -21,106 +33,26 @@ export class FormatDetector { } // UBL detection (Invoice or CreditNote root element) - if (root.nodeName === 'Invoice' || root.nodeName === 'CreditNote') { - // For simplicity, we'll treat all UBL documents as XRechnung for now - // In a real implementation, we would check for specific customization IDs - return InvoiceFormat.XRECHNUNG; + if (FormatDetector.isUBLFormat(root)) { + // Check for XRechnung customization + if (FormatDetector.isXRechnungFormat(doc)) { + return InvoiceFormat.XRECHNUNG; + } + return InvoiceFormat.UBL; } - // Factur-X/ZUGFeRD detection (CrossIndustryInvoice or CrossIndustryDocument root element) - if (root.nodeName === 'rsm:CrossIndustryInvoice' || root.nodeName === 'CrossIndustryInvoice' || - root.nodeName.endsWith(':CrossIndustryInvoice')) { - // Set up namespaces for XPath queries (ZUGFeRD v2/Factur-X) - const namespaces = { - rsm: 'urn:un:unece:uncefact:data:standard:CrossIndustryInvoice:100', - ram: 'urn:un:unece:uncefact:data:standard:ReusableAggregateBusinessInformationEntity:100' - }; - - // Create XPath selector with namespaces - const select = xpath.useNamespaces(namespaces); - - // Look for profile identifier - const profileNode = select( - 'string(//rsm:ExchangedDocumentContext/ram:GuidelineSpecifiedDocumentContextParameter/ram:ID)', - doc - ); - - if (profileNode) { - const profileText = profileNode.toString(); - - // Check for ZUGFeRD profiles - if (profileText.includes('zugferd') || - profileText === CII_PROFILE_IDS.ZUGFERD_BASIC || - profileText === CII_PROFILE_IDS.ZUGFERD_COMFORT || - profileText === CII_PROFILE_IDS.ZUGFERD_EXTENDED) { - return InvoiceFormat.ZUGFERD; - } - - // Check for Factur-X profiles - if (profileText.includes('factur-x') || - profileText === CII_PROFILE_IDS.FACTURX_MINIMUM || - profileText === CII_PROFILE_IDS.FACTURX_BASIC || - profileText === CII_PROFILE_IDS.FACTURX_EN16931) { - return InvoiceFormat.FACTURX; - } - } - - // If we can't determine the specific CII format, default to generic CII - return InvoiceFormat.CII; + // Factur-X/ZUGFeRD detection (CrossIndustryInvoice root element) + if (FormatDetector.isCIIFormat(root)) { + return FormatDetector.detectCIIFormat(doc, xml); } // ZUGFeRD v1 detection (CrossIndustryDocument root element) - if (root.nodeName === 'rsm:CrossIndustryDocument' || root.nodeName === 'CrossIndustryDocument' || - root.nodeName === 'ram:CrossIndustryDocument' || root.nodeName.endsWith(':CrossIndustryDocument')) { - - // Check for ZUGFeRD v1 namespace in the document - const xmlString = xml.toString(); - if (xmlString.includes('urn:ferd:CrossIndustryDocument:invoice:1p0') || - xmlString.includes('urn:un:unece:uncefact:data:standard:ReusableAggregateBusinessInformationEntity:12') || - xmlString.includes('urn:ferd:CrossIndustryDocument') || - xmlString.includes('zugferd') || - xmlString.includes('ZUGFeRD')) { - return InvoiceFormat.ZUGFERD; - } - - // Set up namespaces for XPath queries (ZUGFeRD v1) - try { - const namespaces = { - rsm: ZUGFERD_V1_NAMESPACES.RSM, - ram: ZUGFERD_V1_NAMESPACES.RAM - }; - - // Create XPath selector with namespaces - const select = xpath.useNamespaces(namespaces); - - // Look for profile identifier - const profileNode = select( - 'string(//rsm:SpecifiedExchangedDocumentContext/ram:GuidelineSpecifiedDocumentContextParameter/ram:ID)', - doc - ); - - if (profileNode) { - const profileText = profileNode.toString(); - - // Check for ZUGFeRD v1 profiles - if (profileText.includes('ferd:CrossIndustryDocument:invoice:1p0') || - profileText === CII_PROFILE_IDS.ZUGFERD_V1_BASIC || - profileText === CII_PROFILE_IDS.ZUGFERD_V1_COMFORT || - profileText === CII_PROFILE_IDS.ZUGFERD_V1_EXTENDED) { - return InvoiceFormat.ZUGFERD; - } - } - } catch (error) { - console.log('Error in ZUGFeRD v1 XPath detection:', error); - } - - // If we can't determine the specific profile but it's a CrossIndustryDocument, it's likely ZUGFeRD v1 + if (FormatDetector.isZUGFeRDV1Format(root)) { return InvoiceFormat.ZUGFERD; } - // FatturaPA detection would be implemented here - if (root.nodeName === 'FatturaElettronica' || - (root.getAttribute('xmlns') && root.getAttribute('xmlns')!.includes('fatturapa.gov.it'))) { + // FatturaPA detection + if (FormatDetector.isFatturaPAFormat(root)) { return InvoiceFormat.FATTURAPA; } @@ -130,4 +62,241 @@ export class FormatDetector { return InvoiceFormat.UNKNOWN; } } -} + + /** + * Performs a quick format check based on string content + * This is faster than full XML parsing for obvious cases + * @param xml XML string + * @returns Detected format or UNKNOWN if more analysis is needed + */ + private static quickFormatCheck(xml: string): InvoiceFormat { + const lowerXml = xml.toLowerCase(); + + // Check for obvious Factur-X indicators + if ( + lowerXml.includes('factur-x.eu') || + lowerXml.includes('factur-x.xml') || + lowerXml.includes('factur-x:') || + lowerXml.includes('urn:cen.eu:en16931:2017') && lowerXml.includes('factur-x') + ) { + return InvoiceFormat.FACTURX; + } + + // Check for obvious ZUGFeRD indicators + if ( + lowerXml.includes('zugferd:') || + lowerXml.includes('zugferd-invoice.xml') || + lowerXml.includes('urn:ferd:') || + lowerXml.includes('urn:zugferd') + ) { + return InvoiceFormat.ZUGFERD; + } + + // Check for obvious XRechnung indicators + if ( + lowerXml.includes('xrechnung') || + lowerXml.includes('urn:xoev-de:kosit:standard:xrechnung') + ) { + return InvoiceFormat.XRECHNUNG; + } + + // Check for obvious FatturaPA indicators + if ( + lowerXml.includes('fatturapa') || + lowerXml.includes('fattura elettronica') || + lowerXml.includes('fatturaelettronica') + ) { + return InvoiceFormat.FATTURAPA; + } + + // Need more analysis + return InvoiceFormat.UNKNOWN; + } + + /** + * Checks if the document is a UBL format + * @param root Root element + * @returns True if it's a UBL format + */ + private static isUBLFormat(root: Element): boolean { + return ( + root.nodeName === 'Invoice' || + root.nodeName === 'CreditNote' || + root.nodeName === 'ubl:Invoice' || + root.nodeName === 'ubl:CreditNote' || + root.nodeName.endsWith(':Invoice') || + root.nodeName.endsWith(':CreditNote') + ); + } + + /** + * Checks if the document is an XRechnung format + * @param doc XML document + * @returns True if it's an XRechnung format + */ + private static isXRechnungFormat(doc: Document): boolean { + try { + // Set up namespaces for XPath queries + const namespaces = { + 'cbc': 'urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2', + 'ubl': 'urn:oasis:names:specification:ubl:schema:xsd:Invoice-2' + }; + + // Create XPath selector with namespaces + const select = xpath.useNamespaces(namespaces); + + // Use getElementsByTagName directly for more reliable results + const customizationNodes = doc.getElementsByTagName('cbc:CustomizationID'); + + // Check if any CustomizationID node contains "xrechnung" + for (let i = 0; i < customizationNodes.length; i++) { + const node = customizationNodes[i]; + if (node.textContent && node.textContent.includes('xrechnung')) { + return true; + } + } + + return false; + } catch (error) { + console.warn('Error checking for XRechnung format:', error); + // If direct DOM access fails, try a string-based approach + const xmlStr = new XMLSerializer().serializeToString(doc); + return xmlStr.includes('xrechnung') || xmlStr.includes('XRechnung'); + } + } + + /** + * Checks if the document is a CII format (Factur-X/ZUGFeRD v2+) + * @param root Root element + * @returns True if it's a CII format + */ + private static isCIIFormat(root: Element): boolean { + return ( + root.nodeName === 'rsm:CrossIndustryInvoice' || + root.nodeName === 'CrossIndustryInvoice' || + root.nodeName.endsWith(':CrossIndustryInvoice') + ); + } + + /** + * Checks if the document is a ZUGFeRD v1 format + * @param root Root element + * @returns True if it's a ZUGFeRD v1 format + */ + private static isZUGFeRDV1Format(root: Element): boolean { + return ( + root.nodeName === 'rsm:CrossIndustryDocument' || + root.nodeName === 'CrossIndustryDocument' || + root.nodeName === 'ram:CrossIndustryDocument' || + root.nodeName.endsWith(':CrossIndustryDocument') + ); + } + + /** + * Checks if the document is a FatturaPA format + * @param root Root element + * @returns True if it's a FatturaPA format + */ + private static isFatturaPAFormat(root: Element): boolean { + return ( + root.nodeName === 'FatturaElettronica' || + (root.getAttribute('xmlns') && root.getAttribute('xmlns')!.includes('fatturapa.gov.it')) + ); + } + + /** + * Detects the specific CII format (Factur-X vs ZUGFeRD) + * @param doc XML document + * @param xml Original XML string for fallback checks + * @returns Detected format + */ + private static detectCIIFormat(doc: Document, xml: string): InvoiceFormat { + try { + // Use direct DOM traversal instead of XPath for more reliable behavior + const contextNodes = doc.getElementsByTagNameNS( + 'urn:un:unece:uncefact:data:standard:CrossIndustryInvoice:100', + 'ExchangedDocumentContext' + ); + + if (contextNodes.length === 0) { + // Try without namespace + const noNsContextNodes = doc.getElementsByTagName('ExchangedDocumentContext'); + if (noNsContextNodes.length === 0) { + // Fallback to string-based detection + return FormatDetector.detectCIIFormatFromString(xml); + } + } + + // Loop through all potential context nodes + const allContextNodes = [...Array.from(contextNodes), ...Array.from(doc.getElementsByTagName('ExchangedDocumentContext'))]; + + for (const contextNode of allContextNodes) { + // Find guideline parameter + const guidelineNodes = contextNode.getElementsByTagName('ram:GuidelineSpecifiedDocumentContextParameter'); + + if (guidelineNodes.length === 0) { + continue; + } + + for (const guidelineNode of Array.from(guidelineNodes)) { + // Find ID element + const idNodes = guidelineNode.getElementsByTagName('ram:ID'); + + if (idNodes.length === 0) { + continue; + } + + for (const idNode of Array.from(idNodes)) { + const profileText = idNode.textContent || ''; + + // Check for ZUGFeRD profiles + if ( + profileText.includes('zugferd') || + profileText === CII_PROFILE_IDS.ZUGFERD_BASIC || + profileText === CII_PROFILE_IDS.ZUGFERD_COMFORT || + profileText === CII_PROFILE_IDS.ZUGFERD_EXTENDED + ) { + return InvoiceFormat.ZUGFERD; + } + + // Check for Factur-X profiles + if ( + profileText.includes('factur-x') || + profileText === CII_PROFILE_IDS.FACTURX_MINIMUM || + profileText === CII_PROFILE_IDS.FACTURX_BASIC || + profileText === CII_PROFILE_IDS.FACTURX_EN16931 + ) { + return InvoiceFormat.FACTURX; + } + } + } + } + + // If we reach here, fall back to string checking + return FormatDetector.detectCIIFormatFromString(xml); + } catch (error) { + console.warn('Error detecting CII format, falling back to generic CII:', error); + return FormatDetector.detectCIIFormatFromString(xml); + } + } + + /** + * Fallback method to detect CII format from string content + * @param xml XML string + * @returns Detected format + */ + private static detectCIIFormatFromString(xml: string): InvoiceFormat { + // Check for Factur-X indicators + if (xml.includes('factur-x') || xml.includes('Factur-X')) { + return InvoiceFormat.FACTURX; + } + + // Check for ZUGFeRD indicators + if (xml.includes('zugferd') || xml.includes('ZUGFeRD')) { + return InvoiceFormat.ZUGFERD; + } + + // Generic CII if we can't determine more specifically + return InvoiceFormat.CII; + } +} \ No newline at end of file diff --git a/ts/interfaces/common.ts b/ts/interfaces/common.ts index 3633045..90e3f74 100644 --- a/ts/interfaces/common.ts +++ b/ts/interfaces/common.ts @@ -72,14 +72,19 @@ export interface IPdf { id: string; metadata: { textExtraction: string; + format?: string; + embeddedXml?: { + filename: string; + description: string; + }; }; buffer: Uint8Array; } // Re-export types from tsclass for convenience -export type { TInvoice } from '@tsclass/tsclass/dist_ts/finance'; -export type { TCreditNote } from '@tsclass/tsclass/dist_ts/finance'; -export type { TDebitNote } from '@tsclass/tsclass/dist_ts/finance'; -export type { TContact } from '@tsclass/tsclass/dist_ts/business'; -export type { TLetterEnvelope } from '@tsclass/tsclass/dist_ts/business'; -export type { TDocumentEnvelope } from '@tsclass/tsclass/dist_ts/business'; +export type { TInvoice } from '@tsclass/tsclass/dist_ts/finance/index.js'; +export type { TCreditNote } from '@tsclass/tsclass/dist_ts/finance/index.js'; +export type { TDebitNote } from '@tsclass/tsclass/dist_ts/finance/index.js'; +export type { TContact } from '@tsclass/tsclass/dist_ts/business/index.js'; +export type { TLetterEnvelope } from '@tsclass/tsclass/dist_ts/business/index.js'; +export type { TDocumentEnvelope } from '@tsclass/tsclass/dist_ts/business/index.js'; \ No newline at end of file