/** * Bank statement extraction using Qwen3-VL 8B Vision (Direct) * * Multi-query approach: * 1. First ask how many transactions on each page * 2. Then query each transaction individually * Single pass, no consensus voting. */ import { tap, expect } from '@git.zone/tstest/tapbundle'; import * as fs from 'fs'; import * as path from 'path'; import { execSync } from 'child_process'; import * as os from 'os'; import { ensureMiniCpm } from './helpers/docker.js'; const OLLAMA_URL = 'http://localhost:11434'; const VISION_MODEL = 'qwen3-vl:8b'; interface ITransaction { date: string; counterparty: string; amount: number; } /** * Convert PDF to PNG images */ function convertPdfToImages(pdfPath: string): string[] { const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-')); const outputPattern = path.join(tempDir, 'page-%d.png'); try { execSync( `convert -density 150 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`, { stdio: 'pipe' } ); const files = fs.readdirSync(tempDir).filter((f: string) => f.endsWith('.png')).sort(); const images: string[] = []; for (const file of files) { const imagePath = path.join(tempDir, file); const imageData = fs.readFileSync(imagePath); images.push(imageData.toString('base64')); } return images; } finally { fs.rmSync(tempDir, { recursive: true, force: true }); } } /** * Query Qwen3-VL with a simple prompt */ async function queryVision(image: string, prompt: string): Promise { const response = await fetch(`${OLLAMA_URL}/api/chat`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ model: VISION_MODEL, messages: [{ role: 'user', content: prompt, images: [image], }], stream: false, options: { num_predict: 500, temperature: 0.1, }, }), }); if (!response.ok) { throw new Error(`Ollama API error: ${response.status}`); } const data = await response.json(); return (data.message?.content || '').trim(); } /** * Count transactions on a page */ async function countTransactions(image: string, pageNum: number): Promise { const response = await queryVision(image, `How many transaction rows are in this bank statement table? Count only the data rows (with dates like "01.01.2024" and amounts like "- 50,00 €"). Do NOT count the header row or summary/total rows. Answer with just the number, for example: 7` ); console.log(` [Page ${pageNum}] Count query response: "${response}"`); const match = response.match(/(\d+)/); const count = match ? parseInt(match[1], 10) : 0; console.log(` [Page ${pageNum}] Parsed count: ${count}`); return count; } /** * Get a single transaction by index (logs immediately when complete) */ async function getTransaction(image: string, index: number, pageNum: number): Promise { const response = await queryVision(image, `This is a bank statement. Look at transaction row #${index} in the table (counting from top, excluding headers). Extract this transaction's details: - Date in YYYY-MM-DD format - Counterparty/description name - Amount as number (negative for debits like "- 21,47 €" = -21.47, positive for credits like "+ 100,00 €" = 100.00) Answer in format: DATE|COUNTERPARTY|AMOUNT Example: 2024-01-15|Amazon|−25.99` ); // Parse the response const lines = response.split('\n').filter(l => l.includes('|')); const line = lines[lines.length - 1] || response; const parts = line.split('|').map(p => p.trim()); if (parts.length >= 3) { // Parse amount - handle various formats let amountStr = parts[2].replace(/[€$£\s]/g, '').replace('−', '-').replace('–', '-'); // European format: comma is decimal if (amountStr.includes(',')) { amountStr = amountStr.replace(/\./g, '').replace(',', '.'); } const amount = parseFloat(amountStr) || 0; const tx = { date: parts[0], counterparty: parts[1], amount: amount, }; // Log immediately as this transaction completes console.log(` [P${pageNum} Tx${index.toString().padStart(2, ' ')}] ${tx.date} | ${tx.counterparty.substring(0, 25).padEnd(25)} | ${tx.amount >= 0 ? '+' : ''}${tx.amount.toFixed(2)}`); return tx; } // Log raw response on parse failure console.log(` [P${pageNum} Tx${index.toString().padStart(2, ' ')}] PARSE FAILED: "${response.replace(/\n/g, ' ').substring(0, 60)}..."`); return null; } /** * Extract transactions from a single page using multi-query approach */ async function extractTransactionsFromPage(image: string, pageNum: number): Promise { // Step 1: Count transactions const count = await countTransactions(image, pageNum); if (count === 0) { return []; } // Step 2: Query each transaction (in batches to avoid overwhelming) // Each transaction logs itself as it completes const transactions: ITransaction[] = []; const batchSize = 5; for (let start = 1; start <= count; start += batchSize) { const end = Math.min(start + batchSize - 1, count); const indices = Array.from({ length: end - start + 1 }, (_, i) => start + i); // Query batch in parallel - each logs as it completes const results = await Promise.all( indices.map(i => getTransaction(image, i, pageNum)) ); for (const tx of results) { if (tx) { transactions.push(tx); } } } console.log(` [Page ${pageNum}] Complete: ${transactions.length}/${count} extracted`); return transactions; } /** * Extract all transactions from bank statement */ async function extractTransactions(images: string[]): Promise { console.log(` [Vision] Processing ${images.length} page(s) with Qwen3-VL (multi-query)`); const allTransactions: ITransaction[] = []; for (let i = 0; i < images.length; i++) { const pageTransactions = await extractTransactionsFromPage(images[i], i + 1); allTransactions.push(...pageTransactions); } console.log(` [Vision] Total: ${allTransactions.length} transactions`); return allTransactions; } /** * Compare transactions */ function compareTransactions( extracted: ITransaction[], expected: ITransaction[] ): { matches: number; total: number; errors: string[] } { const errors: string[] = []; let matches = 0; for (let i = 0; i < expected.length; i++) { const exp = expected[i]; const ext = extracted[i]; if (!ext) { errors.push(`Missing transaction ${i}: ${exp.date} ${exp.counterparty}`); continue; } const dateMatch = ext.date === exp.date; const amountMatch = Math.abs(ext.amount - exp.amount) < 0.01; if (dateMatch && amountMatch) { matches++; } else { errors.push(`Mismatch at ${i}: expected ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`); } } if (extracted.length > expected.length) { errors.push(`Extra transactions: ${extracted.length - expected.length}`); } return { matches, total: expected.length, errors }; } /** * Find test cases in .nogit/ */ function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> { const testDir = path.join(process.cwd(), '.nogit'); if (!fs.existsSync(testDir)) return []; const files = fs.readdirSync(testDir); const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = []; for (const pdf of files.filter((f: string) => f.endsWith('.pdf'))) { const baseName = pdf.replace('.pdf', ''); const jsonFile = `${baseName}.json`; if (files.includes(jsonFile)) { testCases.push({ name: baseName, pdfPath: path.join(testDir, pdf), jsonPath: path.join(testDir, jsonFile), }); } } return testCases.sort((a, b) => a.name.localeCompare(b.name)); } /** * Ensure Qwen3-VL model is available */ async function ensureQwen3Vl(): Promise { try { const response = await fetch(`${OLLAMA_URL}/api/tags`); if (response.ok) { const data = await response.json(); const models = data.models || []; if (models.some((m: { name: string }) => m.name === VISION_MODEL)) { console.log(`[Ollama] Model available: ${VISION_MODEL}`); return true; } } } catch { return false; } console.log(`[Ollama] Pulling ${VISION_MODEL}...`); const pullResponse = await fetch(`${OLLAMA_URL}/api/pull`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ name: VISION_MODEL, stream: false }), }); return pullResponse.ok; } // Tests tap.test('setup: ensure Qwen3-VL is running', async () => { console.log('\n[Setup] Checking Qwen3-VL 8B...\n'); const ollamaOk = await ensureMiniCpm(); expect(ollamaOk).toBeTrue(); const visionOk = await ensureQwen3Vl(); expect(visionOk).toBeTrue(); console.log('\n[Setup] Ready!\n'); }); const testCases = findTestCases(); console.log(`\nFound ${testCases.length} bank statement test cases (Qwen3-VL)\n`); let passedCount = 0; let failedCount = 0; for (const testCase of testCases) { tap.test(`should extract: ${testCase.name}`, async () => { const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8')); console.log(`\n=== ${testCase.name} ===`); console.log(`Expected: ${expected.length} transactions`); const images = convertPdfToImages(testCase.pdfPath); console.log(` Pages: ${images.length}`); const extracted = await extractTransactions(images); console.log(` Extracted: ${extracted.length} transactions`); const result = compareTransactions(extracted, expected); const accuracy = result.total > 0 ? result.matches / result.total : 0; if (accuracy >= 0.95 && extracted.length === expected.length) { passedCount++; console.log(` Result: PASS (${result.matches}/${result.total})`); } else { failedCount++; console.log(` Result: FAIL (${result.matches}/${result.total})`); result.errors.slice(0, 5).forEach((e) => console.log(` - ${e}`)); } expect(accuracy).toBeGreaterThan(0.95); expect(extracted.length).toEqual(expected.length); }); } tap.test('summary', async () => { const total = testCases.length; console.log(`\n======================================================`); console.log(` Bank Statement Summary (Qwen3-VL Vision)`); console.log(`======================================================`); console.log(` Method: Multi-query (count then extract each)`); console.log(` Passed: ${passedCount}/${total}`); console.log(` Failed: ${failedCount}/${total}`); console.log(`======================================================\n`); }); export default tap.start();