/**
 * Bank statement extraction using MiniCPM-V (visual extraction)
 *
 * JSON per-page approach with streaming output:
 * 1. Ask for structured JSON of all transactions per page
 * 2. Single pass extraction (no consensus)
 */
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execSync } from 'child_process';
import * as os from 'os';
import { ensureMiniCpm } from './helpers/docker.js';

const OLLAMA_URL = 'http://localhost:11434';
const MODEL = 'openbmb/minicpm-v4.5:q8_0';

/** A single bank statement transaction as compared by the tests. */
interface ITransaction {
  date: string;
  counterparty: string;
  amount: number;
}

const JSON_PROMPT = `Extract ALL transactions from this bank statement page as a JSON array.

IMPORTANT RULES:
1. Each transaction has: date, description/counterparty, and an amount
2. Amount is NEGATIVE for money going OUT (debits, payments, withdrawals)
3. Amount is POSITIVE for money coming IN (credits, deposits, refunds)
4. Date format: YYYY-MM-DD
5. Do NOT include: opening balance, closing balance, subtotals, headers, or summary rows
6. Only include actual transactions with a specific date and amount

Return ONLY this JSON format, no explanation:
[
  {"date": "2021-06-01", "counterparty": "COMPANY NAME", "amount": -25.99},
  {"date": "2021-06-02", "counterparty": "DEPOSIT FROM", "amount": 100.00}
]`;

/**
 * Numeric page index taken from a `page-<n>.png` file name.
 * Returns 0 when the name carries no trailing number.
 */
function pageIndexOf(fileName: string): number {
  const match = fileName.match(/(\d+)\.png$/);
  return match ? parseInt(match[1], 10) : 0;
}

/**
 * Convert PDF to PNG images using ImageMagick
 *
 * @param pdfPath path of the PDF handed to the `convert` command
 * @returns one base64-encoded PNG per page, in page order
 */
function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');

  try {
    execSync(
      `convert -density 300 -quality 100 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
      { stdio: 'pipe' }
    );

    // Sort by numeric page index: a plain string sort would place
    // "page-10.png" before "page-2.png" and scramble the page order.
    const files = fs
      .readdirSync(tempDir)
      .filter((f: string) => f.endsWith('.png'))
      .sort((a: string, b: string) => pageIndexOf(a) - pageIndexOf(b) || a.localeCompare(b));

    return files.map((file) => fs.readFileSync(path.join(tempDir, file)).toString('base64'));
  } finally {
    // The images are already held in memory as base64, so the files can go.
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}

/**
 * Query for JSON extraction with streaming output
 *
 * Sends one page image together with JSON_PROMPT to the chat endpoint and
 * echoes every streamed token to stdout while accumulating it.
 *
 * @param image base64-encoded page image
 * @param queryId label used as prefix for the streamed output
 * @returns the concatenated, trimmed model output
 * @throws Error when the API answers with a non-OK status or without a body
 */
async function queryJson(image: string, queryId: string): Promise<string> {
  const startTime = Date.now();
  const elapsedSeconds = (): string => ((Date.now() - startTime) / 1000).toFixed(1);

  process.stdout.write(` [${queryId}] `);

  const response = await fetch(`${OLLAMA_URL}/api/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: MODEL,
      messages: [
        {
          role: 'user',
          content: JSON_PROMPT,
          images: [image],
        },
      ],
      stream: true,
      options: {
        num_ctx: 32768,
        num_predict: 4000,
        temperature: 0.1,
      },
    }),
  });

  if (!response.ok) {
    process.stdout.write(`ERROR: ${response.status} (${elapsedSeconds()}s)\n`);
    throw new Error(`Ollama API error: ${response.status}`);
  }
  if (!response.body) {
    process.stdout.write(`ERROR: no response body (${elapsedSeconds()}s)\n`);
    throw new Error('Ollama API error: response has no body');
  }

  let content = '';

  // Handles one complete line of the newline-delimited JSON stream.
  const consumeLine = (line: string): void => {
    if (!line.trim()) return;
    try {
      const json = JSON.parse(line) as { message?: { content?: unknown } } | null;
      const token = json?.message?.content;
      if (typeof token === 'string' && token) {
        process.stdout.write(token);
        content += token;
      }
    } catch {
      // A complete line that is not valid JSON carries no token; skip it.
    }
  };

  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  // Network chunks do not align with line boundaries, so the unfinished tail
  // of each chunk is kept here until the rest of the line arrives.
  let pending = '';

  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      pending += decoder.decode(value, { stream: true });
      const lines = pending.split('\n');
      pending = lines.pop() ?? '';
      for (const line of lines) {
        consumeLine(line);
      }
    }
    // Flush bytes still buffered in the decoder and a last unterminated line.
    pending += decoder.decode();
    consumeLine(pending);
  } finally {
    process.stdout.write(` (${elapsedSeconds()}s)\n`);
  }

  return content.trim();
}

/**
 * Sanitize JSON string - fix common issues from vision model output
 *
 * @param jsonStr raw JSON-like text
 * @returns the text with the repairs below applied
 */
function sanitizeJson(jsonStr: string): string {
  let s = jsonStr;

  // Fix +number (e.g., +93.80 -> 93.80) - JSON doesn't allow + prefix
  // Handle various whitespace patterns
  s = s.replace(/"amount"\s*:\s*\+/g, '"amount": ');
  s = s.replace(/:\s*\+(\d)/g, ': $1');

  // Fix number format with dotted thousands separator (e.g., 1.000.00 -> 1000.00)
  // Pattern: "amount": X.XXX.XX where X.XXX is thousands and .XX is decimal
  s = s.replace(/"amount"\s*:\s*(-?)(\d{1,3})\.(\d{3})\.(\d{2})\b/g, '"amount": $1$2$3.$4');
  // Same with two thousands groups (e.g., 1.000.000.00 -> 1000000.00)
  s = s.replace(
    /"amount"\s*:\s*(-?)(\d{1,3})\.(\d{3})\.(\d{3})\.(\d{2})\b/g,
    '"amount": $1$2$3$4.$5'
  );

  // Fix trailing commas before ] or }
  s = s.replace(/,\s*([}\]])/g, '$1');

  // Fix unescaped newlines inside strings (replace with space)
  s = s.replace(/"([^"\\]*)\n([^"]*)"/g, '"$1 $2"');

  // Fix unescaped tabs inside strings
  s = s.replace(/"([^"\\]*)\t([^"]*)"/g, '"$1 $2"');

  // Fix unescaped backslashes. Valid escape pairs are consumed as a unit so
  // that an already escaped backslash ("\\x") is left alone; this also keeps
  // the function safe to apply twice to the same text.
  s = s.replace(/\\(["\\/bfnrtu])|\\/g, (match: string, escapeChar: string | undefined) =>
    escapeChar === undefined ? '\\\\' : match
  );

  // Fix common issues with counterparty names containing special chars
  s = s.replace(/"counterparty":\s*"([^"]*)'([^"]*)"/g, '"counterparty": "$1$2"');

  // Remove control characters except newlines (which we handle above)
  s = s.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g, ' ');

  return s;
}

/**
 * Map the entries of a parsed JSON array to transactions.
 * Entries that are not objects (e.g. `null`) are skipped.
 */
function toTransactions(entries: unknown[]): ITransaction[] {
  const transactions: ITransaction[] = [];
  for (const entry of entries) {
    if (typeof entry !== 'object' || entry === null) continue;
    const tx = entry as Record<string, unknown>;
    transactions.push({
      date: String(tx.date || ''),
      counterparty: String(tx.counterparty || tx.description || ''),
      amount: parseAmount(tx.amount),
    });
  }
  return transactions;
}

/**
 * Parse JSON response into transactions
 *
 * Tries, in order: the sanitized response (or the content of a markdown code
 * block inside it), the first `[` ... last `]` span, and finally an
 * object-by-object regex extraction.
 *
 * @param response raw model output
 * @param queryId label used as prefix for log lines
 * @returns the parsed transactions, or an empty array when nothing could be parsed
 */
function parseJsonResponse(response: string, queryId: string): ITransaction[] {
  console.log(` [${queryId}] Parsing response...`);

  // Try to find JSON in markdown code block
  const codeBlockMatch = response.match(/```(?:json)?\s*([\s\S]*?)```/);
  let jsonStr = codeBlockMatch ? codeBlockMatch[1].trim() : response.trim();

  if (codeBlockMatch) {
    console.log(` [${queryId}] Found JSON in code block`);
  }

  // Sanitize JSON (fix +number issue)
  jsonStr = sanitizeJson(jsonStr);

  try {
    const parsed: unknown = JSON.parse(jsonStr);
    if (Array.isArray(parsed)) {
      const txs = toTransactions(parsed);
      console.log(` [${queryId}] Parsed ${txs.length} transactions (direct)`);
      return txs;
    }
    console.log(` [${queryId}] Parsed JSON is not an array`);
  } catch (e: unknown) {
    const errMsg = e instanceof Error ? e.message : String(e);
    console.log(` [${queryId}] Direct parse failed: ${errMsg}`);

    // Log problematic section with context
    const posMatch = errMsg.match(/position (\d+)/);
    if (posMatch) {
      const pos = parseInt(posMatch[1], 10);
      const start = Math.max(0, pos - 40);
      const end = Math.min(jsonStr.length, pos + 40);
      const context = jsonStr.substring(start, end);
      const marker = ' '.repeat(pos - start) + '^';
      console.log(` [${queryId}] Context around error position ${pos}:`);
      console.log(` [${queryId}] ...${context}...`);
      console.log(` [${queryId}] ${marker}`);
    }

    // Try to find JSON array pattern
    const arrayMatch = jsonStr.match(/\[[\s\S]*\]/);
    if (arrayMatch) {
      console.log(` [${queryId}] Found array pattern, trying to parse...`);
      const sanitizedArray = sanitizeJson(arrayMatch[0]);
      try {
        const parsed: unknown = JSON.parse(sanitizedArray);
        if (Array.isArray(parsed)) {
          const txs = toTransactions(parsed);
          console.log(` [${queryId}] Parsed ${txs.length} transactions (array match)`);
          return txs;
        }
      } catch (e2: unknown) {
        const errMsg2 = e2 instanceof Error ? e2.message : String(e2);
        console.log(` [${queryId}] Array parse failed: ${errMsg2}`);
        const posMatch2 = errMsg2.match(/position (\d+)/);
        if (posMatch2) {
          const pos2 = parseInt(posMatch2[1], 10);
          console.log(
            ` [${queryId}] Context around error: ...${sanitizedArray.substring(Math.max(0, pos2 - 30), pos2 + 30)}...`
          );
        }

        // Try to extract individual objects from the malformed array
        console.log(` [${queryId}] Attempting object-by-object extraction...`);
        const extracted = extractTransactionsFromMalformedJson(sanitizedArray, queryId);
        if (extracted.length > 0) {
          console.log(
            ` [${queryId}] Recovered ${extracted.length} transactions via object extraction`
          );
          return extracted;
        }
      }
    } else {
      console.log(` [${queryId}] No array pattern found in response`);
      console.log(` [${queryId}] Raw response preview: ${response.substring(0, 200)}...`);
    }
  }

  console.log(` [${queryId}] PARSE FAILED - returning empty array`);
  return [];
}

/**
 * Extract transactions from malformed JSON by parsing objects individually
 *
 * @param jsonStr JSON-like text that failed to parse as a whole
 * @param queryId query label (accepted for parity with the other parsers; not used here)
 * @returns every transaction object the patterns could recover
 */
function extractTransactionsFromMalformedJson(jsonStr: string, queryId: string): ITransaction[] {
  const transactions: ITransaction[] = [];

  // Match individual transaction objects in the canonical field order
  const objectPattern =
    /\{\s*"date"\s*:\s*"([^"]+)"\s*,\s*"counterparty"\s*:\s*"([^"]+)"\s*,\s*"amount"\s*:\s*([+-]?\d+\.?\d*)\s*\}/g;

  for (const match of jsonStr.matchAll(objectPattern)) {
    transactions.push({
      date: match[1],
      counterparty: match[2],
      amount: parseFloat(match[3]),
    });
  }

  // Also try with different field orders (amount before counterparty, etc.)
  if (transactions.length === 0) {
    const altPattern = /\{\s*"date"\s*:\s*"([^"]+)"[^}]*"amount"\s*:\s*([+-]?\d+\.?\d*)[^}]*\}/g;

    for (const match of jsonStr.matchAll(altPattern)) {
      // Try to extract counterparty from the match
      const counterpartyMatch = match[0].match(/"counterparty"\s*:\s*"([^"]+)"/);
      const descMatch = match[0].match(/"description"\s*:\s*"([^"]+)"/);
      transactions.push({
        date: match[1],
        counterparty: counterpartyMatch?.[1] || descMatch?.[1] || 'UNKNOWN',
        amount: parseFloat(match[2]),
      });
    }
  }

  return transactions;
}

/**
 * Parse amount from various formats
 *
 * @param value number, or string such as "1.234,56", "1,234.56" or "€ -12,50"
 * @returns the numeric amount; 0 for unsupported types or unparsable text
 */
function parseAmount(value: unknown): number {
  if (typeof value === 'number') return value;
  if (typeof value !== 'string') return 0;

  let s = value.replace(/[€$£\s]/g, '').replace('−', '-').replace('–', '-');

  // European format: comma is decimal. A decimal comma can occur only once
  // and must come after any dot; several commas ("1,234,567") are thousands
  // separators.
  const firstComma = s.indexOf(',');
  const hasDecimalComma =
    firstComma !== -1 && firstComma === s.lastIndexOf(',') && firstComma > s.lastIndexOf('.');

  if (hasDecimalComma) {
    s = s.replace(/\./g, '').replace(',', '.');
  } else {
    s = s.replace(/,/g, '');
  }

  return parseFloat(s) || 0;
}

/**
 * Extract transactions from a single page (single pass)
 *
 * @param image base64-encoded page image
 * @param pageNum 1-based page number, used for log output
 * @returns the transactions parsed from the model response for this page
 */
async function extractTransactionsFromPage(
  image: string,
  pageNum: number
): Promise<ITransaction[]> {
  console.log(`\n ======== Page ${pageNum} ========`);

  const queryId = `P${pageNum}`;
  const response = await queryJson(image, queryId);
  const transactions = parseJsonResponse(response, queryId);

  console.log(` [Page ${pageNum}] Extracted ${transactions.length} transactions:`);
  transactions.forEach((tx, i) => {
    console.log(
      ` ${(i + 1).toString().padStart(2)}. ${tx.date} | ${tx.counterparty.substring(0, 30).padEnd(30)} | ${tx.amount >= 0 ? '+' : ''}${tx.amount.toFixed(2)}`
    );
  });

  return transactions;
}

/**
 * Extract all transactions from bank statement
 *
 * Pages are processed one after another and their transactions concatenated
 * in page order.
 *
 * @param images base64-encoded page images
 * @returns all transactions of all pages
 */
async function extractTransactions(images: string[]): Promise<ITransaction[]> {
  console.log(` [Vision] Processing ${images.length} page(s) with ${MODEL} (single pass)`);

  const allTransactions: ITransaction[] = [];

  for (let i = 0; i < images.length; i++) {
    const pageTransactions = await extractTransactionsFromPage(images[i], i + 1);
    allTransactions.push(...pageTransactions);
  }

  console.log(` [Vision] Total: ${allTransactions.length} transactions`);
  return allTransactions;
}

/**
 * Compare extracted transactions against expected
 *
 * The comparison is positional: entry i of `extracted` is checked against
 * entry i of `expected`. A pair matches when the dates are equal and the
 * amounts differ by less than 0.01; differing counterparty names of matching
 * pairs are reported as variations, not as errors.
 *
 * @returns match count, expected total, error messages and name variations
 */
function compareTransactions(
  extracted: ITransaction[],
  expected: ITransaction[]
): { matches: number; total: number; errors: string[]; variations: string[] } {
  const errors: string[] = [];
  const variations: string[] = [];
  let matches = 0;

  expected.forEach((exp, i) => {
    const ext = extracted[i];

    if (!ext) {
      errors.push(`Missing transaction ${i}: ${exp.date} ${exp.counterparty}`);
      return;
    }

    const dateMatch = ext.date === exp.date;
    const amountMatch = Math.abs(ext.amount - exp.amount) < 0.01;

    if (!dateMatch || !amountMatch) {
      errors.push(
        `Mismatch at ${i}: expected ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`
      );
      return;
    }

    matches++;
    // Track counterparty variations (date and amount match but name differs)
    if (ext.counterparty !== exp.counterparty) {
      variations.push(`[${i}] "${exp.counterparty}" → "${ext.counterparty}"`);
    }
  });

  if (extracted.length > expected.length) {
    errors.push(`Extra transactions: ${extracted.length - expected.length}`);
  }

  return { matches, total: expected.length, errors, variations };
}

/**
 * Find all test cases (PDF + JSON pairs) in .nogit/
 *
 * @returns one entry per `<name>.pdf` that has a sibling `<name>.json`, sorted by name
 */
function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const testDir = path.join(process.cwd(), '.nogit');
  if (!fs.existsSync(testDir)) {
    return [];
  }

  const pdfSuffix = '.pdf';
  const files = fs.readdirSync(testDir);
  const pdfFiles = files.filter((f: string) => f.endsWith(pdfSuffix));
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];

  for (const pdf of pdfFiles) {
    // Strip the suffix only; String.replace would remove the first ".pdf"
    // anywhere in the name.
    const baseName = pdf.slice(0, -pdfSuffix.length);
    const jsonFile = `${baseName}.json`;
    if (files.includes(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }

  return testCases.sort((a, b) => a.name.localeCompare(b.name));
}

// Tests

tap.test('setup: ensure Docker containers are running', async () => {
  console.log('\n[Setup] Checking Docker containers...\n');

  const minicpmOk = await ensureMiniCpm();
  expect(minicpmOk).toBeTrue();

  console.log('\n[Setup] All containers ready!\n');
});

tap.test('should have MiniCPM-V model loaded', async () => {
  const response = await fetch(`${OLLAMA_URL}/api/tags`);
  // Fail on the status itself rather than on a missing `models` property.
  expect(response.ok).toBeTrue();
  const data = (await response.json()) as { models: Array<{ name: string }> };
  const modelNames = data.models.map((m: { name: string }) => m.name);
  expect(modelNames.some((name: string) => name.includes('minicpm'))).toBeTrue();
});

const testCases = findTestCases();
console.log(`\nFound ${testCases.length} bank statement test cases (MiniCPM-V)\n`);

// Tallies reported by the final 'summary' test.
let passedCount = 0;
let failedCount = 0;

for (const testCase of testCases) {
  tap.test(`should extract: ${testCase.name}`, async () => {
    const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
    console.log(`\n=== ${testCase.name} ===`);
    console.log(`Expected: ${expected.length} transactions`);

    const images = convertPdfToImages(testCase.pdfPath);
    console.log(` Pages: ${images.length}`);

    const extracted = await extractTransactions(images);
    console.log(` Extracted: ${extracted.length} transactions`);

    const result = compareTransactions(extracted, expected);
    const perfectMatch = result.matches === result.total && extracted.length === expected.length;

    if (perfectMatch) {
      passedCount++;
      console.log(` Result: PASS (${result.matches}/${result.total})`);
    } else {
      failedCount++;
      console.log(` Result: FAIL (${result.matches}/${result.total})`);
      result.errors.slice(0, 10).forEach((e) => console.log(` - ${e}`));
    }

    // Log counterparty variations (names that differ but date/amount matched)
    if (result.variations.length > 0) {
      console.log(` Counterparty variations (${result.variations.length}):`);
      result.variations.forEach((v) => console.log(` ${v}`));
    }

    expect(result.matches).toEqual(result.total);
    expect(extracted.length).toEqual(expected.length);
  });
}

tap.test('summary', async () => {
  const total = testCases.length;
  console.log(`\n======================================================`);
  console.log(` Bank Statement Summary (${MODEL})`);
  console.log(`======================================================`);
  console.log(` Method: JSON per-page (single pass)`);
  console.log(` Passed: ${passedCount}/${total}`);
  console.log(` Failed: ${failedCount}/${total}`);
  console.log(`======================================================\n`);
});

export default tap.start();