From 30c73b24c18601ef9639a0740085ea3181c79520 Mon Sep 17 00:00:00 2001 From: Juergen Kunz Date: Sat, 17 Jan 2026 21:50:09 +0000 Subject: [PATCH] feat(tests): use Qwen2.5 (Ollama) for invoice extraction tests and add helpers for model management; normalize dates and coerce numeric fields --- changelog.md | 10 +++ test/helpers/docker.ts | 63 +++++++++++++++ test/test.invoices.paddleocr-vl.ts | 126 +++++++++++++++++++++-------- 3 files changed, 165 insertions(+), 34 deletions(-) diff --git a/changelog.md b/changelog.md index 2c9229c..f95ac50 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,15 @@ # Changelog +## 2026-01-17 - 1.7.0 - feat(tests) +use Qwen2.5 (Ollama) for invoice extraction tests and add helpers for model management; normalize dates and coerce numeric fields + +- Added ensureOllamaModel and ensureQwen25 test helpers to pull/check Ollama models via localhost:11434 +- Updated invoices test to use qwen2.5:7b instead of MiniCPM and removed image payload from the text-only extraction step +- Increased Markdown truncate limit from 8000 to 12000 and reduced model num_predict from 2048 to 512 +- Rewrote extraction prompt to require strict JSON output and added post-processing to parse/convert numeric fields +- Added normalizeDate and improved compareInvoice to normalize dates and handle numeric formatting/tolerance +- Updated test setup to ensure Qwen2.5 is available and adjusted logging/messages to reflect the Qwen2.5-based workflow + ## 2026-01-17 - 1.6.0 - feat(paddleocr-vl) add PaddleOCR-VL full pipeline Docker image and API server, plus integration tests and docker helpers diff --git a/test/helpers/docker.ts b/test/helpers/docker.ts index fc67fb2..7715340 100644 --- a/test/helpers/docker.ts +++ b/test/helpers/docker.ts @@ -295,3 +295,66 @@ export async function ensurePaddleOcrVlFull(): Promise { } return ensureService(IMAGES.paddleocrVlFull); } + +/** + * Ensure an Ollama model is pulled and available + * Uses the MiniCPM container (which runs Ollama) to pull the model + */ +export async function ensureOllamaModel(modelName: string): Promise { + const OLLAMA_URL = 'http://localhost:11434'; + + console.log(`\n[Ollama] Ensuring model: ${modelName}`); + + // Check if model exists + try { + const response = await fetch(`${OLLAMA_URL}/api/tags`); + if (response.ok) { + const data = await response.json(); + const models = data.models || []; + const exists = models.some((m: { name: string }) => + m.name === modelName || m.name.startsWith(modelName.split(':')[0]) + ); + + if (exists) { + console.log(`[Ollama] Model already available: ${modelName}`); + return true; + } + } + } catch { + console.log(`[Ollama] Cannot check models, Ollama may not be running`); + return false; + } + + // Pull the model + console.log(`[Ollama] Pulling model: ${modelName} (this may take a while)...`); + try { + const response = await fetch(`${OLLAMA_URL}/api/pull`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ name: modelName, stream: false }), + }); + + if (response.ok) { + console.log(`[Ollama] Model pulled successfully: ${modelName}`); + return true; + } else { + console.log(`[Ollama] Failed to pull model: ${response.status}`); + return false; + } + } catch (err) { + console.log(`[Ollama] Error pulling model: ${err}`); + return false; + } +} + +/** + * Ensure Qwen2.5 7B model is available (for text-only JSON extraction) + */ +export async function ensureQwen25(): Promise { + // First ensure the Ollama service (MiniCPM container) is running + const ollamaOk = await ensureMiniCpm(); + if (!ollamaOk) return false; + + // Then ensure the Qwen2.5 model is pulled + return ensureOllamaModel('qwen2.5:7b'); +} diff --git a/test/test.invoices.paddleocr-vl.ts b/test/test.invoices.paddleocr-vl.ts index 7c2ff31..448f624 100644 --- a/test/test.invoices.paddleocr-vl.ts +++ b/test/test.invoices.paddleocr-vl.ts @@ -15,11 +15,12 @@ import * as fs from 'fs'; import * as path from 'path'; import { execSync } from 'child_process'; import * as os from 'os'; -import { ensurePaddleOcrVlFull, ensureMiniCpm } from './helpers/docker.js'; +import { ensurePaddleOcrVlFull, ensureQwen25 } from './helpers/docker.js'; const PADDLEOCR_VL_URL = 'http://localhost:8000'; const OLLAMA_URL = 'http://localhost:11434'; -const MINICPM_MODEL = 'minicpm-v:latest'; +// Use Qwen2.5 for text-only JSON extraction (not MiniCPM which is vision-focused) +const TEXT_MODEL = 'qwen2.5:7b'; interface IInvoice { invoice_number: string; @@ -87,42 +88,45 @@ async function parseDocument(imageBase64: string): Promise { } /** - * Extract invoice fields from structured Markdown using MiniCPM with image context + * Extract invoice fields from structured Markdown using Qwen2.5 (text-only model) */ -async function extractInvoiceFromMarkdown(markdown: string, images: string[]): Promise { +async function extractInvoiceFromMarkdown(markdown: string): Promise { // Truncate if too long - const truncated = markdown.length > 8000 ? markdown.slice(0, 8000) : markdown; + const truncated = markdown.length > 12000 ? markdown.slice(0, 12000) : markdown; console.log(` [Extract] Processing ${truncated.length} chars of Markdown`); - const prompt = `/nothink -You are an invoice parser. Extract fields from this invoice image. + const prompt = `You are an invoice data extractor. Extract the following fields from this OCR text and return ONLY a valid JSON object. Required fields: -- invoice_number: The invoice/receipt number -- invoice_date: Date in YYYY-MM-DD format +- invoice_number: The invoice/receipt/document number +- invoice_date: Date in YYYY-MM-DD format (convert from any format) - vendor_name: Company that issued the invoice -- currency: EUR, USD, etc. -- net_amount: Amount before tax -- vat_amount: Tax/VAT amount (0 if reverse charge) -- total_amount: Final amount due +- currency: EUR, USD, GBP, etc. +- net_amount: Amount before tax (number) +- vat_amount: Tax/VAT amount (number, use 0 if reverse charge or not shown) +- total_amount: Final total amount (number) -Return ONLY a JSON object like: -{"invoice_number":"123","invoice_date":"2022-01-28","vendor_name":"Adobe","currency":"EUR","net_amount":24.99,"vat_amount":0,"total_amount":24.99} +Example output format: +{"invoice_number":"INV-123","invoice_date":"2022-01-28","vendor_name":"Adobe","currency":"EUR","net_amount":24.99,"vat_amount":0,"total_amount":24.99} -Use null for missing strings, 0 for missing numbers. No explanation. +Rules: +- Return ONLY the JSON object, no explanation or markdown +- Use null for missing string fields +- Use 0 for missing numeric fields +- Convert dates to YYYY-MM-DD format (e.g., "28-JAN-2022" becomes "2022-01-28") +- Extract numbers without currency symbols -OCR text from the invoice (for reference): ---- +OCR Text: ${truncated} ----`; + +JSON:`; const payload = { - model: MINICPM_MODEL, + model: TEXT_MODEL, prompt, - images, // Send the actual image to MiniCPM stream: true, options: { - num_predict: 2048, + num_predict: 512, temperature: 0.1, }, }; @@ -173,26 +177,41 @@ ${truncated} } const jsonStr = fullText.substring(startIdx, endIdx); - return JSON.parse(jsonStr); + const parsed = JSON.parse(jsonStr); + + // Ensure numeric fields are actually numbers + return { + invoice_number: parsed.invoice_number || null, + invoice_date: parsed.invoice_date || null, + vendor_name: parsed.vendor_name || null, + currency: parsed.currency || 'EUR', + net_amount: parseFloat(parsed.net_amount) || 0, + vat_amount: parseFloat(parsed.vat_amount) || 0, + total_amount: parseFloat(parsed.total_amount) || 0, + }; } /** - * Single extraction pass: Parse with PaddleOCR-VL Full, extract with MiniCPM + * Single extraction pass: Parse with PaddleOCR-VL Full, extract with Qwen2.5 (text-only) */ async function extractOnce(images: string[], passNum: number): Promise { - // Parse document with full pipeline + // Parse document with full pipeline (PaddleOCR-VL) const markdown = await parseDocument(images[0]); console.log(` [Parse] Got ${markdown.split('\n').length} lines of Markdown`); - // Extract invoice fields from Markdown with image context - return extractInvoiceFromMarkdown(markdown, images); + // Extract invoice fields from Markdown using text-only model (no images) + return extractInvoiceFromMarkdown(markdown); } /** * Create a hash of invoice for comparison (using key fields) */ function hashInvoice(invoice: IInvoice): string { - return `${invoice.invoice_number}|${invoice.invoice_date}|${invoice.total_amount.toFixed(2)}`; + // Ensure total_amount is a number + const amount = typeof invoice.total_amount === 'number' + ? invoice.total_amount.toFixed(2) + : String(invoice.total_amount || 0); + return `${invoice.invoice_number}|${invoice.invoice_date}|${amount}`; } /** @@ -243,6 +262,43 @@ async function extractWithConsensus(images: string[], invoiceName: string, maxPa return best.invoice; } +/** + * Normalize date to YYYY-MM-DD format + */ +function normalizeDate(dateStr: string | null): string { + if (!dateStr) return ''; + + // Already in correct format + if (/^\d{4}-\d{2}-\d{2}$/.test(dateStr)) { + return dateStr; + } + + // Handle DD-MMM-YYYY format (e.g., "28-JUN-2022") + const monthMap: Record = { + JAN: '01', FEB: '02', MAR: '03', APR: '04', MAY: '05', JUN: '06', + JUL: '07', AUG: '08', SEP: '09', OCT: '10', NOV: '11', DEC: '12', + }; + + const match = dateStr.match(/^(\d{1,2})-([A-Z]{3})-(\d{4})$/i); + if (match) { + const day = match[1].padStart(2, '0'); + const month = monthMap[match[2].toUpperCase()] || '01'; + const year = match[3]; + return `${year}-${month}-${day}`; + } + + // Handle DD/MM/YYYY or DD.MM.YYYY + const match2 = dateStr.match(/^(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})$/); + if (match2) { + const day = match2[1].padStart(2, '0'); + const month = match2[2].padStart(2, '0'); + const year = match2[3]; + return `${year}-${month}-${day}`; + } + + return dateStr; +} + /** * Compare extracted invoice against expected */ @@ -259,8 +315,10 @@ function compareInvoice( errors.push(`invoice_number: expected "${expected.invoice_number}", got "${extracted.invoice_number}"`); } - // Compare date - if (extracted.invoice_date !== expected.invoice_date) { + // Compare date (normalize format first) + const extDate = normalizeDate(extracted.invoice_date); + const expDate = normalizeDate(expected.invoice_date); + if (extDate !== expDate) { errors.push(`invoice_date: expected "${expected.invoice_date}", got "${extracted.invoice_date}"`); } @@ -317,9 +375,9 @@ tap.test('setup: ensure Docker containers are running', async () => { const paddleOk = await ensurePaddleOcrVlFull(); expect(paddleOk).toBeTrue(); - // Ensure MiniCPM is running (for field extraction from Markdown) - const minicpmOk = await ensureMiniCpm(); - expect(minicpmOk).toBeTrue(); + // Ensure Qwen2.5 is available (for text-only JSON extraction) + const qwenOk = await ensureQwen25(); + expect(qwenOk).toBeTrue(); console.log('\n[Setup] All containers ready!\n'); }); @@ -380,7 +438,7 @@ tap.test('summary', async () => { console.log(`\n======================================================`); console.log(` Invoice Extraction Summary (PaddleOCR-VL Full)`); console.log(`======================================================`); - console.log(` Method: PaddleOCR-VL Full Pipeline -> MiniCPM`); + console.log(` Method: PaddleOCR-VL Full Pipeline -> Qwen2.5 (text-only)`); console.log(` Passed: ${passedCount}/${totalInvoices}`); console.log(` Failed: ${failedCount}/${totalInvoices}`); console.log(` Accuracy: ${accuracy.toFixed(1)}%`);