From e76768da558fbec989345f09bb34709c44bbf254 Mon Sep 17 00:00:00 2001 From: Juergen Kunz Date: Sun, 18 Jan 2026 04:50:57 +0000 Subject: [PATCH] feat(vision): process pages separately and make Qwen3-VL vision extraction more robust; add per-page parsing, safer JSON handling, reduced token usage, and multi-query invoice extraction --- changelog.md | 11 ++++ test/test.bankstatements.qwen3vl.ts | 64 ++++++++++++++------- test/test.invoices.qwen3vl.ts | 89 ++++++++++++++--------------- 3 files changed, 96 insertions(+), 68 deletions(-) diff --git a/changelog.md b/changelog.md index 8ce055c..e12527d 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,16 @@ # Changelog +## 2026-01-18 - 1.11.0 - feat(vision) +process pages separately and make Qwen3-VL vision extraction more robust; add per-page parsing, safer JSON handling, reduced token usage, and multi-query invoice extraction + +- Bank statements: split extraction into extractTransactionsFromPage and sequentially process pages to avoid thinking-token exhaustion +- Bank statements: reduced num_predict from 8000 to 4000, send single image per request, added per-page logging and non-throwing handling for empty or non-JSON responses +- Bank statements: catch JSON.parse errors and return empty array instead of throwing +- Invoices: introduced queryField to request single values and perform multiple simple queries (reduces model thinking usage) +- Invoices: reduced num_predict for invoice queries from 4000 to 500 and parse amounts robustly (handles European formats like 1.234,56) +- Invoices: normalize currency to uppercase 3-letter code, return safe defaults (empty strings / 0) instead of nulls, and parse net/vat/total with fallbacks +- General: simplified Ollama API error messages to avoid including response body content in thrown errors + ## 2026-01-18 - 1.10.1 - fix(tests) improve Qwen3-VL invoice extraction test by switching to non-stream API, adding model availability/pull checks, simplifying response parsing, and 
tightening model options diff --git a/test/test.bankstatements.qwen3vl.ts b/test/test.bankstatements.qwen3vl.ts index 3a0997d..5da801d 100644 --- a/test/test.bankstatements.qwen3vl.ts +++ b/test/test.bankstatements.qwen3vl.ts @@ -53,23 +53,14 @@ function convertPdfToImages(pdfPath: string): string[] { } /** - * Extract transactions using Qwen3-VL vision + * Extract transactions from a single page + * Processes one page at a time to minimize thinking tokens */ -async function extractTransactions(images: string[]): Promise { - console.log(` [Vision] Processing ${images.length} page(s) with Qwen3-VL`); - +async function extractTransactionsFromPage(image: string, pageNum: number): Promise { const prompt = `/no_think -Extract ALL transactions from this bank statement. - -Amount format: -- "- 21,47 €" = DEBIT = -21.47 -- "+ 1.000,00 €" = CREDIT = 1000.00 -- European format: comma is decimal separator - -For each transaction: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47} - -Return ONLY a JSON array, no explanation: -[{"date":"...","counterparty":"...","amount":0},...]`; +Extract transactions from this bank statement page. 
+Amount: "- 21,47 €" = -21.47, "+ 1.000,00 €" = 1000.00 (European format) +Return JSON array only: [{"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47},...]`; const response = await fetch(`${OLLAMA_URL}/api/chat`, { method: 'POST', @@ -79,26 +70,28 @@ Return ONLY a JSON array, no explanation: messages: [{ role: 'user', content: prompt, - images: images, + images: [image], }], stream: false, think: false, options: { - num_predict: 8000, + num_predict: 4000, temperature: 0.1, }, }), }); if (!response.ok) { - const err = await response.text(); - throw new Error(`Ollama API error: ${response.status} - ${err}`); + throw new Error(`Ollama API error: ${response.status}`); } const data = await response.json(); let content = data.message?.content || ''; - console.log(` [Vision] Got ${content.length} chars`); + if (!content) { + console.log(` [Page ${pageNum}] Empty response`); + return []; + } // Parse JSON array if (content.startsWith('```json')) content = content.slice(7); @@ -110,10 +103,37 @@ Return ONLY a JSON array, no explanation: const endIdx = content.lastIndexOf(']') + 1; if (startIdx < 0 || endIdx <= startIdx) { - throw new Error(`No JSON array found: ${content.substring(0, 300)}`); + console.log(` [Page ${pageNum}] No JSON array found`); + return []; } - return JSON.parse(content.substring(startIdx, endIdx)); + try { + const transactions = JSON.parse(content.substring(startIdx, endIdx)); + console.log(` [Page ${pageNum}] Found ${transactions.length} transactions`); + return transactions; + } catch { + console.log(` [Page ${pageNum}] JSON parse error`); + return []; + } +} + +/** + * Extract transactions using Qwen3-VL vision + * Processes each page separately to avoid thinking token exhaustion + */ +async function extractTransactions(images: string[]): Promise { + console.log(` [Vision] Processing ${images.length} page(s) with Qwen3-VL`); + + const allTransactions: ITransaction[] = []; + + // Process pages sequentially to avoid overwhelming the model + 
for (let i = 0; i < images.length; i++) { + const pageTransactions = await extractTransactionsFromPage(images[i], i + 1); + allTransactions.push(...pageTransactions); + } + + console.log(` [Vision] Total: ${allTransactions.length} transactions`); + return allTransactions; } /** diff --git a/test/test.invoices.qwen3vl.ts b/test/test.invoices.qwen3vl.ts index d39a7bd..c1552cf 100644 --- a/test/test.invoices.qwen3vl.ts +++ b/test/test.invoices.qwen3vl.ts @@ -56,26 +56,10 @@ function convertPdfToImages(pdfPath: string): string[] { } /** - * Extract invoice data directly from images using Qwen3-VL Vision - * Uses /no_think to disable reasoning mode for fast, direct JSON output + * Query Qwen3-VL for a single field + * Uses simple prompts to minimize thinking tokens */ -async function extractInvoiceFromImages(images: string[]): Promise { - console.log(` [Vision] Processing ${images.length} page(s) with Qwen3-VL`); - - // /no_think disables Qwen3's reasoning mode - crucial for getting direct output - const prompt = `/no_think -Look at this invoice and extract these fields. Reply with ONLY JSON, no explanation. 
- - invoice_number -- invoice_date (format: YYYY-MM-DD) -- vendor_name -- currency (EUR, USD, or GBP) -- net_amount -- vat_amount -- total_amount - -JSON: {"invoice_number":"...","invoice_date":"YYYY-MM-DD","vendor_name":"...","currency":"EUR","net_amount":0,"vat_amount":0,"total_amount":0}`; - +async function queryField(images: string[], question: string): Promise { const response = await fetch(`${OLLAMA_URL}/api/chat`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, @@ -83,51 +67,64 @@ JSON: {"invoice_number":"...","invoice_date":"YYYY-MM-DD","vendor_name":"...","c model: VISION_MODEL, messages: [{ role: 'user', - content: prompt, - images: images, // Pass all pages + content: `/no_think\n${question} Reply with just the value, nothing else.`, + images: images, }], stream: false, - think: false, // Disable thinking mode via API + think: false, options: { - num_predict: 4000, // Need enough tokens for model to finish thinking + output + num_predict: 500, temperature: 0.1, }, }), }); if (!response.ok) { - const err = await response.text(); - throw new Error(`Ollama API error: ${response.status} - ${err}`); + throw new Error(`Ollama API error: ${response.status}`); } const data = await response.json(); - let content = data.message?.content || ''; + return (data.message?.content || '').trim(); +} - console.log(` [Vision] Response (${content.length} chars): ${content.substring(0, 200)}...`); +/** + * Extract invoice data using multiple simple queries + * Each query asks for a single field (the three amounts share one query) to minimize thinking tokens + * (Qwen3's thinking mode uses all tokens on complex prompts) + */ +async function extractInvoiceFromImages(images: string[]): Promise { + console.log(` [Vision] Processing ${images.length} page(s) with Qwen3-VL (multi-query)`); - // Parse JSON from response - if (content.startsWith('```json')) content = content.slice(7); - else if (content.startsWith('```')) content = content.slice(3); - if (content.endsWith('```')) content = 
content.slice(0, -3); - content = content.trim(); + // Query each field separately to avoid excessive thinking tokens + const [invoiceNum, invoiceDate, vendor, currency, amounts] = await Promise.all([ + queryField(images, 'What is the invoice number on this document?'), + queryField(images, 'What is the invoice date? Format as YYYY-MM-DD.'), + queryField(images, 'What company issued this invoice?'), + queryField(images, 'What currency is used? Answer EUR, USD, or GBP.'), + queryField(images, 'What are the net amount, VAT amount, and total amount? Format: net,vat,total'), + ]); - const startIdx = content.indexOf('{'); - const endIdx = content.lastIndexOf('}') + 1; + console.log(` [Vision] Got: ${invoiceNum} | ${invoiceDate} | ${vendor} | ${currency}`); - if (startIdx < 0 || endIdx <= startIdx) { - throw new Error(`No JSON found: ${content.substring(0, 300)}`); - } - - const parsed = JSON.parse(content.substring(startIdx, endIdx)); + // Parse amounts (format: "net,vat,total" or similar) + const amountMatch = amounts.match(/([\d.,]+)/g) || []; + const parseAmount = (s: string): number => { + if (!s) return 0; + // Handle European format: 1.234,56 → 1234.56 + const normalized = s.includes(',') && s.indexOf(',') > s.lastIndexOf('.') + ? 
s.replace(/\./g, '').replace(',', '.') + : s.replace(/,/g, ''); + return parseFloat(normalized) || 0; + }; return { - invoice_number: parsed.invoice_number || null, - invoice_date: parsed.invoice_date || null, - vendor_name: parsed.vendor_name || null, - currency: parsed.currency || 'EUR', - net_amount: parseFloat(parsed.net_amount) || 0, - vat_amount: parseFloat(parsed.vat_amount) || 0, - total_amount: parseFloat(parsed.total_amount) || 0, + invoice_number: invoiceNum || '', + invoice_date: invoiceDate || '', + vendor_name: vendor || '', + currency: (currency || 'EUR').toUpperCase().replace(/[^A-Z]/g, '').slice(0, 3) || 'EUR', + net_amount: parseAmount(amountMatch[0] || ''), + vat_amount: parseAmount(amountMatch[1] || ''), + total_amount: parseAmount(amountMatch[2] || amountMatch[0] || ''), }; }