Files
ht-docker-ai/test/test.bankstatements.qwen3vl.ts

346 lines
11 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Bank statement extraction using Qwen3-VL 8B Vision (Direct)
*
* Multi-query approach:
* 1. First ask how many transactions on each page
* 2. Then query each transaction individually
* Single pass, no consensus voting.
*/
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execSync } from 'child_process';
import * as os from 'os';
import { ensureMiniCpm } from './helpers/docker.js';
const OLLAMA_URL = 'http://localhost:11434';
const VISION_MODEL = 'qwen3-vl:8b';
interface ITransaction {
date: string;
counterparty: string;
amount: number;
}
/**
* Convert PDF to PNG images
*/
function convertPdfToImages(pdfPath: string): string[] {
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
const outputPattern = path.join(tempDir, 'page-%d.png');
try {
execSync(
`convert -density 150 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
{ stdio: 'pipe' }
);
const files = fs.readdirSync(tempDir).filter((f: string) => f.endsWith('.png')).sort();
const images: string[] = [];
for (const file of files) {
const imagePath = path.join(tempDir, file);
const imageData = fs.readFileSync(imagePath);
images.push(imageData.toString('base64'));
}
return images;
} finally {
fs.rmSync(tempDir, { recursive: true, force: true });
}
}
/**
* Query Qwen3-VL with a simple prompt
*/
async function queryVision(image: string, prompt: string): Promise<string> {
const response = await fetch(`${OLLAMA_URL}/api/chat`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
model: VISION_MODEL,
messages: [{
role: 'user',
content: prompt,
images: [image],
}],
stream: false,
options: {
num_predict: 500,
temperature: 0.1,
},
}),
});
if (!response.ok) {
throw new Error(`Ollama API error: ${response.status}`);
}
const data = await response.json();
return (data.message?.content || '').trim();
}
/**
* Count transactions on a page
*/
async function countTransactions(image: string, pageNum: number): Promise<number> {
const response = await queryVision(image,
`How many transaction rows are in this bank statement table?
Count only the data rows (with dates like "01.01.2024" and amounts like "- 50,00 €").
Do NOT count the header row or summary/total rows.
Answer with just the number, for example: 7`
);
console.log(` [Page ${pageNum}] Count query response: "${response}"`);
const match = response.match(/(\d+)/);
const count = match ? parseInt(match[1], 10) : 0;
console.log(` [Page ${pageNum}] Parsed count: ${count}`);
return count;
}
/**
* Get a single transaction by index (logs immediately when complete)
*/
async function getTransaction(image: string, index: number, pageNum: number): Promise<ITransaction | null> {
const response = await queryVision(image,
`This is a bank statement. Look at transaction row #${index} in the table (counting from top, excluding headers).
Extract this transaction's details:
- Date in YYYY-MM-DD format
- Counterparty/description name
- Amount as number (negative for debits like "- 21,47 €" = -21.47, positive for credits like "+ 100,00 €" = 100.00)
Answer in format: DATE|COUNTERPARTY|AMOUNT
Example: 2024-01-15|Amazon|25.99`
);
// Parse the response
const lines = response.split('\n').filter(l => l.includes('|'));
const line = lines[lines.length - 1] || response;
const parts = line.split('|').map(p => p.trim());
if (parts.length >= 3) {
// Parse amount - handle various formats
let amountStr = parts[2].replace(/[€$£\s]/g, '').replace('', '-').replace('', '-');
// European format: comma is decimal
if (amountStr.includes(',')) {
amountStr = amountStr.replace(/\./g, '').replace(',', '.');
}
const amount = parseFloat(amountStr) || 0;
const tx = {
date: parts[0],
counterparty: parts[1],
amount: amount,
};
// Log immediately as this transaction completes
console.log(` [P${pageNum} Tx${index.toString().padStart(2, ' ')}] ${tx.date} | ${tx.counterparty.substring(0, 25).padEnd(25)} | ${tx.amount >= 0 ? '+' : ''}${tx.amount.toFixed(2)}`);
return tx;
}
// Log raw response on parse failure
console.log(` [P${pageNum} Tx${index.toString().padStart(2, ' ')}] PARSE FAILED: "${response.replace(/\n/g, ' ').substring(0, 60)}..."`);
return null;
}
/**
* Extract transactions from a single page using multi-query approach
*/
async function extractTransactionsFromPage(image: string, pageNum: number): Promise<ITransaction[]> {
// Step 1: Count transactions
const count = await countTransactions(image, pageNum);
if (count === 0) {
return [];
}
// Step 2: Query each transaction (in batches to avoid overwhelming)
// Each transaction logs itself as it completes
const transactions: ITransaction[] = [];
const batchSize = 5;
for (let start = 1; start <= count; start += batchSize) {
const end = Math.min(start + batchSize - 1, count);
const indices = Array.from({ length: end - start + 1 }, (_, i) => start + i);
// Query batch in parallel - each logs as it completes
const results = await Promise.all(
indices.map(i => getTransaction(image, i, pageNum))
);
for (const tx of results) {
if (tx) {
transactions.push(tx);
}
}
}
console.log(` [Page ${pageNum}] Complete: ${transactions.length}/${count} extracted`);
return transactions;
}
/**
* Extract all transactions from bank statement
*/
async function extractTransactions(images: string[]): Promise<ITransaction[]> {
console.log(` [Vision] Processing ${images.length} page(s) with Qwen3-VL (multi-query)`);
const allTransactions: ITransaction[] = [];
for (let i = 0; i < images.length; i++) {
const pageTransactions = await extractTransactionsFromPage(images[i], i + 1);
allTransactions.push(...pageTransactions);
}
console.log(` [Vision] Total: ${allTransactions.length} transactions`);
return allTransactions;
}
/**
* Compare transactions
*/
function compareTransactions(
extracted: ITransaction[],
expected: ITransaction[]
): { matches: number; total: number; errors: string[] } {
const errors: string[] = [];
let matches = 0;
for (let i = 0; i < expected.length; i++) {
const exp = expected[i];
const ext = extracted[i];
if (!ext) {
errors.push(`Missing transaction ${i}: ${exp.date} ${exp.counterparty}`);
continue;
}
const dateMatch = ext.date === exp.date;
const amountMatch = Math.abs(ext.amount - exp.amount) < 0.01;
if (dateMatch && amountMatch) {
matches++;
} else {
errors.push(`Mismatch at ${i}: expected ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`);
}
}
if (extracted.length > expected.length) {
errors.push(`Extra transactions: ${extracted.length - expected.length}`);
}
return { matches, total: expected.length, errors };
}
/**
* Find test cases in .nogit/
*/
function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
const testDir = path.join(process.cwd(), '.nogit');
if (!fs.existsSync(testDir)) return [];
const files = fs.readdirSync(testDir);
const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];
for (const pdf of files.filter((f: string) => f.endsWith('.pdf'))) {
const baseName = pdf.replace('.pdf', '');
const jsonFile = `${baseName}.json`;
if (files.includes(jsonFile)) {
testCases.push({
name: baseName,
pdfPath: path.join(testDir, pdf),
jsonPath: path.join(testDir, jsonFile),
});
}
}
return testCases.sort((a, b) => a.name.localeCompare(b.name));
}
/**
* Ensure Qwen3-VL model is available
*/
async function ensureQwen3Vl(): Promise<boolean> {
try {
const response = await fetch(`${OLLAMA_URL}/api/tags`);
if (response.ok) {
const data = await response.json();
const models = data.models || [];
if (models.some((m: { name: string }) => m.name === VISION_MODEL)) {
console.log(`[Ollama] Model available: ${VISION_MODEL}`);
return true;
}
}
} catch {
return false;
}
console.log(`[Ollama] Pulling ${VISION_MODEL}...`);
const pullResponse = await fetch(`${OLLAMA_URL}/api/pull`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ name: VISION_MODEL, stream: false }),
});
return pullResponse.ok;
}
// Tests
tap.test('setup: ensure Qwen3-VL is running', async () => {
console.log('\n[Setup] Checking Qwen3-VL 8B...\n');
const ollamaOk = await ensureMiniCpm();
expect(ollamaOk).toBeTrue();
const visionOk = await ensureQwen3Vl();
expect(visionOk).toBeTrue();
console.log('\n[Setup] Ready!\n');
});
const testCases = findTestCases();
console.log(`\nFound ${testCases.length} bank statement test cases (Qwen3-VL)\n`);
let passedCount = 0;
let failedCount = 0;
for (const testCase of testCases) {
tap.test(`should extract: ${testCase.name}`, async () => {
const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
console.log(`\n=== ${testCase.name} ===`);
console.log(`Expected: ${expected.length} transactions`);
const images = convertPdfToImages(testCase.pdfPath);
console.log(` Pages: ${images.length}`);
const extracted = await extractTransactions(images);
console.log(` Extracted: ${extracted.length} transactions`);
const result = compareTransactions(extracted, expected);
const accuracy = result.total > 0 ? result.matches / result.total : 0;
if (accuracy >= 0.95 && extracted.length === expected.length) {
passedCount++;
console.log(` Result: PASS (${result.matches}/${result.total})`);
} else {
failedCount++;
console.log(` Result: FAIL (${result.matches}/${result.total})`);
result.errors.slice(0, 5).forEach((e) => console.log(` - ${e}`));
}
expect(accuracy).toBeGreaterThan(0.95);
expect(extracted.length).toEqual(expected.length);
});
}
tap.test('summary', async () => {
const total = testCases.length;
console.log(`\n======================================================`);
console.log(` Bank Statement Summary (Qwen3-VL Vision)`);
console.log(`======================================================`);
console.log(` Method: Multi-query (count then extract each)`);
console.log(` Passed: ${passedCount}/${total}`);
console.log(` Failed: ${failedCount}/${total}`);
console.log(`======================================================\n`);
});
export default tap.start();