2 Commits

Author SHA1 Message Date
70913c4b3e v1.16.0
Some checks failed
Docker (tags) / security (push) Successful in 28s
Docker (tags) / test (push) Failing after 7m38s
Docker (tags) / release (push) Has been skipped
Docker (tags) / metadata (push) Has been skipped
2026-01-20 17:14:26 +00:00
2ed419f6e4 feat(invoices): add line_items extraction and normalization for invoice parsing 2026-01-20 17:14:26 +00:00
3 changed files with 53 additions and 4 deletions

View File

@@ -1,5 +1,14 @@
# Changelog
## 2026-01-20 - 1.16.0 - feat(invoices)
add line_items extraction and normalization for invoice parsing
- Introduce ILineItem interface and add line_items array to IInvoice.
- Add extractLineItems helper to normalize item fields (position, product, description, quantity, unit_price, total_price).
- Include line_items in the parsed invoice output and in the sample JSON of the test prompt; line_items defaults to [] when absent.
- Update logging to include extracted line item count.
- Clarify test instructions to extract items from invoice tables and skip subtotal/total rows.
## 2026-01-20 - 1.15.3 - fix(tests(nanonets))
allow '/' when normalizing invoice strings in tests

View File

@@ -1,6 +1,6 @@
{
"name": "@host.today/ht-docker-ai",
"version": "1.15.3",
"version": "1.16.0",
"type": "module",
"private": false,
"description": "Docker images for AI vision-language models including MiniCPM-V 4.5",

View File

@@ -42,6 +42,15 @@ const smartAi = new SmartAi({
// DualAgentOrchestrator for structured task execution
let orchestrator: DualAgentOrchestrator;
/**
 * One row of an invoice's item table, as returned inside IInvoice.line_items.
 * The extraction prompt describes the source columns as
 * "Pos, Product, Description, Quantity, Unit Price, Price".
 */
interface ILineItem {
// Row position; extractLineItems substitutes index + 1 when the parsed value is not a number.
position: number;
// Product text, trimmed; empty string when the parsed value is missing/falsy.
product: string;
// Description text, trimmed; empty string when the parsed value is missing/falsy.
description: string;
// Quantity run through parseAmount; extractLineItems falls back to 1 when that result is falsy.
quantity: number;
// Per-unit price, run through parseAmount.
unit_price: number;
// Price for the whole row, run through parseAmount.
total_price: number;
}
interface IInvoice {
invoice_number: string;
invoice_date: string;
@@ -50,6 +59,7 @@ interface IInvoice {
net_amount: number;
vat_amount: number;
total_amount: number;
line_items: ILineItem[];
}
interface IImageData {
@@ -80,6 +90,7 @@ const JSON_EXTRACTION_PROMPT = `Extract key fields from the invoice. Return ONLY
WHERE TO FIND DATA:
- invoice_number, invoice_date, vendor_name: Look in the HEADER section at the TOP of PAGE 1 (near "Invoice no.", "Invoice date:", "Rechnungsnummer"). Use common sense. Btw. an invoice number might start on INV* . Also be sure to not omit special chars like / - and sp on. They are part of the invoice number.
- net_amount, vat_amount, total_amount: Look in the SUMMARY section at the BOTTOM (look for "Total", "Amount due", "Gesamtbetrag")
- line_items: Look in the TABLE(s) with columns like Pos, Product, Description, Quantity, Unit Price, Price
RULES:
1. Use common sense.
@@ -89,9 +100,21 @@ RULES:
5. net_amount: Total before tax
6. vat_amount: Tax amount
7. total_amount: Final total with tax
8. line_items: Array of items from the invoice table. Skip subtotal/total rows.
JSON only:
{"invoice_number":"X","invoice_date":"YYYY-MM-DD","vendor_name":"X","currency":"EUR","net_amount":0,"vat_amount":0,"total_amount":0}
JSON format:
{
"invoice_number": "X",
"invoice_date": "YYYY-MM-DD",
"vendor_name": "X",
"currency": "EUR",
"net_amount": 0,
"vat_amount": 0,
"total_amount": 0,
"line_items": [
{"position": 1, "product": "X", "description": "X", "quantity": 1, "unit_price": 0, "total_price": 0}
]
}
Double check for valid JSON syntax. use the json validate tool.
@@ -340,6 +363,21 @@ function extractCurrency(s: string | undefined): string {
return 'EUR';
}
/**
 * Extract and normalize the line items array from parsed (untrusted) JSON output.
 *
 * @param items - Raw `line_items` value; anything that is not an array yields `[]`.
 * @returns One normalized ILineItem per object entry, in input order. Entries that
 *          are not plain objects (null, primitives, nested arrays) are skipped
 *          instead of throwing, so one malformed row cannot discard the invoice.
 */
function extractLineItems(items: unknown): ILineItem[] {
  if (!Array.isArray(items)) return [];

  // Objects/functions would stringify to noise like "[object Object]"; treat them as empty.
  // Primitives keep the previous `String(value || '').trim()` behavior.
  const toText = (value: unknown): string =>
    typeof value === 'object' || typeof value === 'function' ? '' : String(value || '').trim();

  const normalized: ILineItem[] = [];
  items.forEach((entry: unknown, index: number) => {
    // Property access on null/undefined throws, and primitives carry no fields to read.
    if (typeof entry !== 'object' || entry === null || Array.isArray(entry)) return;
    const item = entry as Record<string, unknown>;

    normalized.push({
      // Fall back to the 1-based index in the original array when no usable number is given.
      position:
        typeof item.position === 'number' && Number.isFinite(item.position)
          ? item.position
          : index + 1,
      product: toText(item.product),
      description: toText(item.description),
      // A falsy parse result (e.g. 0) is replaced by a quantity of 1.
      quantity: parseAmount(item.quantity as string | number) || 1,
      unit_price: parseAmount(item.unit_price as string | number),
      total_price: parseAmount(item.total_price as string | number),
    });
  });
  return normalized;
}
/**
* Try to extract valid JSON from a response string
*/
@@ -448,6 +486,7 @@ ${JSON_EXTRACTION_PROMPT}`;
net_amount: parseAmount(jsonData.net_amount as string | number),
vat_amount: parseAmount(jsonData.vat_amount as string | number),
total_amount: parseAmount(jsonData.total_amount as string | number),
line_items: extractLineItems(jsonData.line_items),
};
} catch (error) {
const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
@@ -471,6 +510,7 @@ async function extractInvoice(markdown: string, docName: string): Promise<IInvoi
net_amount: 0,
vat_amount: 0,
total_amount: 0,
line_items: [],
};
}
console.log(` [${docName}] Extracted: ${invoice.invoice_number}`);
@@ -703,7 +743,7 @@ for (const tc of testCases) {
const elapsedMs = Date.now() - startTime;
processingTimes.push(elapsedMs);
console.log(` Extracted: ${extracted.invoice_number} | ${extracted.invoice_date} | ${extracted.total_amount} ${extracted.currency}`);
console.log(` Extracted: ${extracted.invoice_number} | ${extracted.invoice_date} | ${extracted.total_amount} ${extracted.currency} | ${extracted.line_items.length} items`);
const result = compareInvoice(extracted, expected);