/**
 * Bank statement extraction using Qwen3-VL 8B Vision (Direct)
 *
 * Single-step pipeline: PDF → Images → Qwen3-VL → JSON
 *
 * Key insights:
 * - Use /no_think in prompt + think:false in API to disable reasoning
 * - Need high num_predict (8000+) for many transactions
 * - Single pass extraction, no consensus needed
 */
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execSync } from 'child_process';
import * as os from 'os';
import { ensureMiniCpm } from './helpers/docker.js';

const OLLAMA_URL = 'http://localhost:11434';
const VISION_MODEL = 'qwen3-vl:8b';

interface ITransaction {
  date: string;
  counterparty: string;
  amount: number;
}

/**
 * Convert a PDF to PNG images (one base64-encoded string per page).
 * Requires ImageMagick's `convert` on the PATH.
 */
function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');
  try {
    execSync(
      `convert -density 150 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
      { stdio: 'pipe' }
    );
    // Numeric sort so page-10 sorts after page-2
    const files = fs
      .readdirSync(tempDir)
      .filter((f: string) => f.endsWith('.png'))
      .sort((a, b) => a.localeCompare(b, undefined, { numeric: true }));
    const images: string[] = [];
    for (const file of files) {
      const imagePath = path.join(tempDir, file);
      const imageData = fs.readFileSync(imagePath);
      images.push(imageData.toString('base64'));
    }
    return images;
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}

/**
 * Extract transactions from a single page.
 * Processes one page at a time to minimize thinking tokens.
 */
async function extractTransactionsFromPage(image: string, pageNum: number): Promise<ITransaction[]> {
  const prompt = `/no_think Extract transactions from this bank statement page.
Amount: "- 21,47 €" = -21.47, "+ 1.000,00 €" = 1000.00 (European format)
Return JSON array only: [{"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47},...]`;

  const response = await fetch(`${OLLAMA_URL}/api/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: VISION_MODEL,
      messages: [{
        role: 'user',
        content: prompt,
        images: [image],
      }],
      stream: false,
      think: false,
      options: {
        num_predict: 4000,
        temperature: 0.1,
      },
    }),
  });

  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }

  const data = await response.json();
  let content = data.message?.content || '';
  if (!content) {
    console.log(` [Page ${pageNum}] Empty response`);
    return [];
  }

  // Strip markdown code fences, then parse the JSON array
  if (content.startsWith('```json')) content = content.slice(7);
  else if (content.startsWith('```')) content = content.slice(3);
  if (content.endsWith('```')) content = content.slice(0, -3);
  content = content.trim();

  const startIdx = content.indexOf('[');
  const endIdx = content.lastIndexOf(']') + 1;
  if (startIdx < 0 || endIdx <= startIdx) {
    console.log(` [Page ${pageNum}] No JSON array found`);
    return [];
  }

  try {
    const transactions = JSON.parse(content.substring(startIdx, endIdx));
    console.log(` [Page ${pageNum}] Found ${transactions.length} transactions`);
    return transactions;
  } catch {
    console.log(` [Page ${pageNum}] JSON parse error`);
    return [];
  }
}

/**
 * Extract transactions using Qwen3-VL vision.
 * Processes each page separately to avoid thinking token exhaustion.
 */
async function extractTransactions(images: string[]): Promise<ITransaction[]> {
  console.log(` [Vision] Processing ${images.length} page(s) with Qwen3-VL`);
  const allTransactions: ITransaction[] = [];

  // Process pages sequentially to avoid overwhelming the model
  for (let i = 0; i < images.length; i++) {
    const pageTransactions = await extractTransactionsFromPage(images[i], i + 1);
    allTransactions.push(...pageTransactions);
  }

  console.log(` [Vision] Total: ${allTransactions.length} transactions`);
  return allTransactions;
}

/**
 * Compare extracted transactions against the expected list, position by position.
 */
function compareTransactions(
  extracted: ITransaction[],
  expected: ITransaction[]
): { matches: number; total: number; errors: string[] } {
  const errors: string[] = [];
  let matches = 0;

  for (let i = 0; i < expected.length; i++) {
    const exp = expected[i];
    const ext = extracted[i];
    if (!ext) {
      errors.push(`Missing transaction ${i}: ${exp.date} ${exp.counterparty}`);
      continue;
    }
    const dateMatch = ext.date === exp.date;
    const amountMatch = Math.abs(ext.amount - exp.amount) < 0.01;
    if (dateMatch && amountMatch) {
      matches++;
    } else {
      errors.push(`Mismatch at ${i}: expected ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`);
    }
  }

  if (extracted.length > expected.length) {
    errors.push(`Extra transactions: ${extracted.length - expected.length}`);
  }

  return { matches, total: expected.length, errors };
}

/**
 * Find test cases in .nogit/ (each <name>.pdf paired with a <name>.json of expected transactions).
 */
function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const testDir = path.join(process.cwd(), '.nogit');
  if (!fs.existsSync(testDir)) return [];

  const files = fs.readdirSync(testDir);
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];

  for (const pdf of files.filter((f: string) => f.endsWith('.pdf'))) {
    const baseName = pdf.replace('.pdf', '');
    const jsonFile = `${baseName}.json`;
    if (files.includes(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }

  return testCases.sort((a, b) => a.name.localeCompare(b.name));
}

/**
 * Ensure the Qwen3-VL model is available in Ollama, pulling it if necessary.
 */
async function ensureQwen3Vl(): Promise<boolean> {
  try {
    const response = await fetch(`${OLLAMA_URL}/api/tags`);
    if (response.ok) {
      const data = await response.json();
      const models = data.models || [];
      if (models.some((m: { name: string }) => m.name === VISION_MODEL)) {
        console.log(`[Ollama] Model available: ${VISION_MODEL}`);
        return true;
      }
    }
  } catch {
    return false;
  }

  console.log(`[Ollama] Pulling ${VISION_MODEL}...`);
  const pullResponse = await fetch(`${OLLAMA_URL}/api/pull`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ name: VISION_MODEL, stream: false }),
  });
  return pullResponse.ok;
}

// Tests
tap.test('setup: ensure Qwen3-VL is running', async () => {
  console.log('\n[Setup] Checking Qwen3-VL 8B...\n');

  const ollamaOk = await ensureMiniCpm();
  expect(ollamaOk).toBeTrue();

  const visionOk = await ensureQwen3Vl();
  expect(visionOk).toBeTrue();

  console.log('\n[Setup] Ready!\n');
});

const testCases = findTestCases();
console.log(`\nFound ${testCases.length} bank statement test cases (Qwen3-VL)\n`);

let passedCount = 0;
let failedCount = 0;

for (const testCase of testCases) {
  tap.test(`should extract: ${testCase.name}`, async () => {
    const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
    console.log(`\n=== ${testCase.name} ===`);
    console.log(`Expected: ${expected.length} transactions`);

    const images = convertPdfToImages(testCase.pdfPath);
    console.log(` Pages: ${images.length}`);

    const extracted = await extractTransactions(images);
    console.log(` Extracted: ${extracted.length} transactions`);

    const result = compareTransactions(extracted, expected);
    const accuracy = result.total > 0 ? result.matches / result.total : 0;

    if (accuracy >= 0.95 && extracted.length === expected.length) {
      passedCount++;
      console.log(` Result: PASS (${result.matches}/${result.total})`);
    } else {
      failedCount++;
      console.log(` Result: FAIL (${result.matches}/${result.total})`);
      result.errors.slice(0, 5).forEach((e) => console.log(` - ${e}`));
    }

    expect(accuracy).toBeGreaterThan(0.95);
    expect(extracted.length).toEqual(expected.length);
  });
}

tap.test('summary', async () => {
  const total = testCases.length;
  console.log(`\n======================================================`);
  console.log(` Bank Statement Summary (Qwen3-VL Vision)`);
  console.log(`======================================================`);
  console.log(` Passed: ${passedCount}/${total}`);
  console.log(` Failed: ${failedCount}/${total}`);
  console.log(`======================================================\n`);
});

export default tap.start();