From 7c8f10497eb7c46ef8a065dc4f7d691c576016de Mon Sep 17 00:00:00 2001 From: Juergen Kunz Date: Sun, 18 Jan 2026 04:17:30 +0000 Subject: [PATCH] fix(tests): improve Qwen3-VL invoice extraction test by switching to non-stream API, adding model availability/pull checks, simplifying response parsing, and tightening model options --- changelog.md | 10 ++ test/test.invoices.qwen3vl.ts | 213 ++++++++++++++-------------------- 2 files changed, 96 insertions(+), 127 deletions(-) diff --git a/changelog.md b/changelog.md index 7e016f7..8ce055c 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,15 @@ # Changelog +## 2026-01-18 - 1.10.1 - fix(tests) +improve Qwen3-VL invoice extraction test by switching to non-stream API, adding model availability/pull checks, simplifying response parsing, and tightening model options + +- Replaced streaming reader logic with direct JSON parsing of the /api/chat response +- Added ensureQwen3Vl() to check for and, if missing, pull the qwen3-vl:8b model from Ollama +- Switched to ensureMiniCpm() to verify the Ollama service is running before model checks +- Used a /no_think prompt for direct JSON output; set temperature to 0.0 and num_predict to 512 +- Removed retry loop and streaming parsing; improved error messages to include response body +- Updated logging and test setup messages for clarity + ## 2026-01-18 - 1.10.0 - feat(vision) add Qwen3-VL vision model support with Dockerfile and tests; improve invoice OCR conversion and prompts; simplify extraction flow by removing consensus voting diff --git a/test/test.invoices.qwen3vl.ts b/test/test.invoices.qwen3vl.ts index e42cdd1..3b21d13 100644 --- a/test/test.invoices.qwen3vl.ts +++ b/test/test.invoices.qwen3vl.ts @@ -1,18 +1,17 @@ /** - * Invoice extraction using Qwen3-VL-8B Vision (Direct) + * Invoice extraction using Qwen3-VL 8B Vision (Direct) * - * Qwen3-VL 8B is a capable vision-language model that fits in 15GB VRAM: - * - Q4_K_M quantization (~5GB) - * - Good balance of speed and accuracy + * Single-step 
pipeline: PDF → Images → Qwen3-VL → JSON + * Uses /no_think to disable reasoning mode for fast, direct responses. * - * Pipeline: PDF → Images → Qwen3-VL → JSON + * Qwen3-VL outperforms PaddleOCR-VL on certain invoice formats. */ import { tap, expect } from '@git.zone/tstest/tapbundle'; import * as fs from 'fs'; import * as path from 'path'; import { execSync } from 'child_process'; import * as os from 'os'; -import { ensureQwen3Vl } from './helpers/docker.js'; +import { ensureMiniCpm } from './helpers/docker.js'; const OLLAMA_URL = 'http://localhost:11434'; const VISION_MODEL = 'qwen3-vl:8b'; @@ -57,144 +56,68 @@ function convertPdfToImages(pdfPath: string): string[] { } /** - * Single extraction attempt + * Extract invoice data directly from images using Qwen3-VL Vision + * Uses /no_think to disable reasoning mode for fast, direct JSON output */ -async function tryExtractOnce(images: string[], prompt: string): Promise { +async function extractInvoiceFromImages(images: string[]): Promise { + console.log(` [Vision] Processing ${images.length} page(s) with Qwen3-VL`); + + // /no_think disables Qwen3's reasoning mode - crucial for getting direct output + const prompt = `/no_think +Look at this invoice and extract these fields. Reply with ONLY JSON, no explanation. 
+ +- invoice_number +- invoice_date (format: YYYY-MM-DD) +- vendor_name +- currency (EUR, USD, or GBP) +- net_amount +- vat_amount +- total_amount + +JSON: {"invoice_number":"...","invoice_date":"YYYY-MM-DD","vendor_name":"...","currency":"EUR","net_amount":0,"vat_amount":0,"total_amount":0}`; + const response = await fetch(`${OLLAMA_URL}/api/chat`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ model: VISION_MODEL, - messages: [ - { - role: 'user', - content: prompt, - images: images, - }, - ], - stream: true, + messages: [{ + role: 'user', + content: prompt, + images: images, // Pass all pages + }], + stream: false, options: { - num_predict: 1024, - temperature: 0.1, // Slight randomness helps avoid stuck states + num_predict: 512, + temperature: 0.0, }, }), }); if (!response.ok) { - throw new Error(`Ollama API error: ${response.status}`); + const err = await response.text(); + throw new Error(`Ollama API error: ${response.status} - ${err}`); } - const reader = response.body?.getReader(); - if (!reader) { - throw new Error('No response body'); - } + const data = await response.json(); + let content = data.message?.content || ''; - const decoder = new TextDecoder(); - let fullText = ''; + console.log(` [Vision] Response (${content.length} chars): ${content.substring(0, 200)}...`); - while (true) { - const { done, value } = await reader.read(); - if (done) break; + // Parse JSON from response + if (content.startsWith('```json')) content = content.slice(7); + else if (content.startsWith('```')) content = content.slice(3); + if (content.endsWith('```')) content = content.slice(0, -3); + content = content.trim(); - const chunk = decoder.decode(value, { stream: true }); - const lines = chunk.split('\n').filter((l) => l.trim()); - - for (const line of lines) { - try { - const json = JSON.parse(line); - if (json.message?.content) { - fullText += json.message.content; - } - } catch { - // Skip invalid JSON lines - } - } - } - - 
return fullText; -} - -/** - * Extract invoice data directly from images using Qwen3-VL Vision - * Includes retry logic for empty responses - */ -async function extractInvoiceFromImages(images: string[]): Promise { - console.log(` [Vision] Processing ${images.length} page(s) with Qwen3-VL`); - - // JSON schema for structured output - force the model to output valid JSON - const invoiceSchema = { - type: 'object', - properties: { - invoice_number: { type: 'string' }, - invoice_date: { type: 'string' }, - vendor_name: { type: 'string' }, - currency: { type: 'string' }, - net_amount: { type: 'number' }, - vat_amount: { type: 'number' }, - total_amount: { type: 'number' }, - }, - required: ['invoice_number', 'invoice_date', 'vendor_name', 'currency', 'net_amount', 'vat_amount', 'total_amount'], - }; - - // Simple, direct prompt - don't overthink, just read the labeled fields - const prompt = `Extract invoice data from this image. Return JSON only. - -Find these fields: -- invoice_number: The invoice/document number -- invoice_date: Date in YYYY-MM-DD format -- vendor_name: Company issuing the invoice -- currency: EUR, USD, or GBP -- net_amount: Amount before tax -- vat_amount: Tax/VAT amount -- total_amount: Final total amount - -Return: {"invoice_number":"...", "invoice_date":"YYYY-MM-DD", "vendor_name":"...", "currency":"EUR", "net_amount":0.00, "vat_amount":0.00, "total_amount":0.00}`; - - // Retry logic for empty responses (model sometimes returns nothing) - const MAX_RETRIES = 3; - let fullText = ''; - - for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) { - fullText = await tryExtractOnce(images, prompt); - - if (fullText.trim().length > 0) { - console.log(` [Attempt ${attempt}] Got ${fullText.length} chars`); - break; - } - - console.log(` [Attempt ${attempt}] Empty response, retrying...`); - // Small delay before retry - await new Promise((r) => setTimeout(r, 1000)); - } - - if (fullText.trim().length === 0) { - throw new Error(`Model returned empty 
response after ${MAX_RETRIES} attempts`); - } - - // Parse JSON response - let jsonStr = fullText.trim(); - - if (jsonStr.startsWith('```json')) jsonStr = jsonStr.slice(7); - else if (jsonStr.startsWith('```')) jsonStr = jsonStr.slice(3); - if (jsonStr.endsWith('```')) jsonStr = jsonStr.slice(0, -3); - jsonStr = jsonStr.trim(); - - const startIdx = jsonStr.indexOf('{'); - const endIdx = jsonStr.lastIndexOf('}') + 1; + const startIdx = content.indexOf('{'); + const endIdx = content.lastIndexOf('}') + 1; if (startIdx < 0 || endIdx <= startIdx) { - throw new Error(`No JSON found in: ${fullText.substring(0, 500)}`); + throw new Error(`No JSON found: ${content.substring(0, 300)}`); } - const extractedJson = jsonStr.substring(startIdx, endIdx); - console.log(` [Debug] Extracted JSON: ${extractedJson.substring(0, 200)}...`); - - let parsed; - try { - parsed = JSON.parse(extractedJson); - } catch (e) { - throw new Error(`Invalid JSON: ${extractedJson.substring(0, 500)}`); - } + const parsed = JSON.parse(content.substring(startIdx, endIdx)); return { invoice_number: parsed.invoice_number || null, @@ -284,12 +207,48 @@ function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: strin return testCases.sort((a, b) => a.name.localeCompare(b.name)); } +/** + * Ensure Qwen3-VL 8B model is available + */ +async function ensureQwen3Vl(): Promise { + try { + const response = await fetch(`${OLLAMA_URL}/api/tags`); + if (response.ok) { + const data = await response.json(); + const models = data.models || []; + if (models.some((m: { name: string }) => m.name === VISION_MODEL)) { + console.log(`[Ollama] Model already available: ${VISION_MODEL}`); + return true; + } + } + } catch { + console.log('[Ollama] Cannot check models'); + return false; + } + + console.log(`[Ollama] Pulling model: ${VISION_MODEL}...`); + const pullResponse = await fetch(`${OLLAMA_URL}/api/pull`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ name: 
VISION_MODEL, stream: false }), + }); + + return pullResponse.ok; +} + // Tests tap.test('setup: ensure Qwen3-VL is running', async () => { - console.log('\n[Setup] Checking Qwen3-VL 8B (~5GB)...\n'); - const ok = await ensureQwen3Vl(); - expect(ok).toBeTrue(); + console.log('\n[Setup] Checking Qwen3-VL 8B...\n'); + + // Ensure Ollama service is running + const ollamaOk = await ensureMiniCpm(); + expect(ollamaOk).toBeTrue(); + + // Ensure Qwen3-VL 8B model + const visionOk = await ensureQwen3Vl(); + expect(visionOk).toBeTrue(); + console.log('\n[Setup] Ready!\n'); }); @@ -339,7 +298,7 @@ tap.test('summary', async () => { console.log(`\n======================================================`); console.log(` Invoice Extraction Summary (Qwen3-VL Vision)`); console.log(`======================================================`); - console.log(` Method: Qwen3-VL 8B (Direct Vision)`); + console.log(` Method: Qwen3-VL 8B Direct Vision (/no_think)`); console.log(` Passed: ${passedCount}/${total}`); console.log(` Failed: ${failedCount}/${total}`); console.log(` Accuracy: ${accuracy.toFixed(1)}%`);