From 2ed419f6e43cc23a79bb4ce3cdd11e007592bc08 Mon Sep 17 00:00:00 2001 From: Juergen Kunz Date: Tue, 20 Jan 2026 17:14:26 +0000 Subject: [PATCH] feat(invoices): add line_items extraction and normalization for invoice parsing --- changelog.md | 9 +++++++ test/test.invoices.nanonets.ts | 46 +++++++++++++++++++++++++++++++--- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/changelog.md b/changelog.md index 1fbbd80..c2d11fa 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,14 @@ # Changelog +## 2026-01-20 - 1.16.0 - feat(invoices) +add line_items extraction and normalization for invoice parsing + +- Introduce ILineItem interface and add line_items array to IInvoice. +- Add extractLineItems helper to normalize item fields (position, product, description, quantity, unit_price, total_price). +- Include line_items in parsed invoice output and sample JSON in test, defaulting to [] when absent. +- Update logging to include extracted line item count. +- Clarify test instructions to extract items from invoice tables and skip subtotal/total rows. + ## 2026-01-20 - 1.15.3 - fix(tests(nanonets)) allow '/' when normalizing invoice strings in tests diff --git a/test/test.invoices.nanonets.ts b/test/test.invoices.nanonets.ts index 648bb29..cc104d5 100644 --- a/test/test.invoices.nanonets.ts +++ b/test/test.invoices.nanonets.ts @@ -42,6 +42,15 @@ const smartAi = new SmartAi({ // DualAgentOrchestrator for structured task execution let orchestrator: DualAgentOrchestrator; +interface ILineItem { + position: number; + product: string; + description: string; + quantity: number; + unit_price: number; + total_price: number; +} + interface IInvoice { invoice_number: string; invoice_date: string; @@ -50,6 +59,7 @@ interface IInvoice { net_amount: number; vat_amount: number; total_amount: number; + line_items: ILineItem[]; } interface IImageData { @@ -80,6 +90,7 @@ const JSON_EXTRACTION_PROMPT = `Extract key fields from the invoice. Return ONLY WHERE TO FIND DATA: - invoice_number, invoice_date, vendor_name: Look in the HEADER section at the TOP of PAGE 1 (near "Invoice no.", "Invoice date:", "Rechnungsnummer"). Use common sense. Btw. an invoice number might start on INV* . Also be sure to not omit special chars like / - and sp on. They are part of the invoice number. - net_amount, vat_amount, total_amount: Look in the SUMMARY section at the BOTTOM (look for "Total", "Amount due", "Gesamtbetrag") +- line_items: Look in the TABLE(s) with columns like Pos, Product, Description, Quantity, Unit Price, Price RULES: 1. Use common sense. @@ -89,9 +100,21 @@ RULES: 5. net_amount: Total before tax 6. vat_amount: Tax amount 7. total_amount: Final total with tax +8. line_items: Array of items from the invoice table. Skip subtotal/total rows. -JSON only: -{"invoice_number":"X","invoice_date":"YYYY-MM-DD","vendor_name":"X","currency":"EUR","net_amount":0,"vat_amount":0,"total_amount":0} +JSON format: +{ + "invoice_number": "X", + "invoice_date": "YYYY-MM-DD", + "vendor_name": "X", + "currency": "EUR", + "net_amount": 0, + "vat_amount": 0, + "total_amount": 0, + "line_items": [ + {"position": 1, "product": "X", "description": "X", "quantity": 1, "unit_price": 0, "total_price": 0} + ] +} Double check for valid JSON syntax. use the json validate tool. @@ -340,6 +363,21 @@ function extractCurrency(s: string | undefined): string { return 'EUR'; } +/** + * Extract and normalize line items array + */ +function extractLineItems(items: unknown): ILineItem[] { + if (!Array.isArray(items)) return []; + return items.map((item: Record, index: number) => ({ + position: typeof item.position === 'number' ? item.position : index + 1, + product: String(item.product || '').trim(), + description: String(item.description || '').trim(), + quantity: parseAmount(item.quantity as string | number) || 1, + unit_price: parseAmount(item.unit_price as string | number), + total_price: parseAmount(item.total_price as string | number), + })); +} + /** * Try to extract valid JSON from a response string */ @@ -448,6 +486,7 @@ ${JSON_EXTRACTION_PROMPT}`; net_amount: parseAmount(jsonData.net_amount as string | number), vat_amount: parseAmount(jsonData.vat_amount as string | number), total_amount: parseAmount(jsonData.total_amount as string | number), + line_items: extractLineItems(jsonData.line_items), }; } catch (error) { const elapsed = ((Date.now() - startTime) / 1000).toFixed(1); @@ -471,6 +510,7 @@ async function extractInvoice(markdown: string, docName: string): Promise