Files
ht-docker-ai/test/test.bankstatements.minicpm.ts

537 lines
18 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Bank statement extraction using MiniCPM-V (visual extraction)
*
* JSON per-page approach:
* 1. Ask for structured JSON of all transactions per page
* 2. Consensus: extract twice, compare, retry if mismatch
*/
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execSync } from 'child_process';
import * as os from 'os';
import { ensureMiniCpm } from './helpers/docker.js';
const OLLAMA_URL = 'http://localhost:11434';
const MODEL = 'openbmb/minicpm-v4.5:q8_0';
interface ITransaction {
date: string;
counterparty: string;
amount: number;
}
const JSON_PROMPT = `Extract ALL transactions from this bank statement page as a JSON array.
IMPORTANT RULES:
1. Each transaction has: date, description/counterparty, and an amount
2. Amount is NEGATIVE for money going OUT (debits, payments, withdrawals)
3. Amount is POSITIVE for money coming IN (credits, deposits, refunds)
4. Date format: YYYY-MM-DD
5. Do NOT include: opening balance, closing balance, subtotals, headers, or summary rows
6. Only include actual transactions with a specific date and amount
Return ONLY this JSON format, no explanation:
[
{"date": "2021-06-01", "counterparty": "COMPANY NAME", "amount": -25.99},
{"date": "2021-06-02", "counterparty": "DEPOSIT FROM", "amount": 100.00}
]`;
/**
* Convert PDF to PNG images using ImageMagick
*/
function convertPdfToImages(pdfPath: string): string[] {
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
const outputPattern = path.join(tempDir, 'page-%d.png');
try {
execSync(
`convert -density 300 -quality 100 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
{ stdio: 'pipe' }
);
const files = fs.readdirSync(tempDir).filter((f: string) => f.endsWith('.png')).sort();
const images: string[] = [];
for (const file of files) {
const imagePath = path.join(tempDir, file);
const imageData = fs.readFileSync(imagePath);
images.push(imageData.toString('base64'));
}
return images;
} finally {
fs.rmSync(tempDir, { recursive: true, force: true });
}
}
/**
* Query for JSON extraction
*/
async function queryJson(image: string, queryId: string): Promise<string> {
console.log(` [${queryId}] Sending request to ${MODEL}...`);
const startTime = Date.now();
const response = await fetch(`${OLLAMA_URL}/api/chat`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
model: MODEL,
messages: [{
role: 'user',
content: JSON_PROMPT,
images: [image],
}],
stream: false,
options: {
num_predict: 4000,
temperature: 0.1,
},
}),
});
const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
if (!response.ok) {
console.log(` [${queryId}] ERROR: ${response.status} (${elapsed}s)`);
throw new Error(`Ollama API error: ${response.status}`);
}
const data = await response.json();
const content = (data.message?.content || '').trim();
console.log(` [${queryId}] Response received (${elapsed}s, ${content.length} chars)`);
return content;
}
/**
* Sanitize JSON string - fix common issues from vision model output
*/
function sanitizeJson(jsonStr: string): string {
let s = jsonStr;
// Fix +number (e.g., +93.80 -> 93.80) - JSON doesn't allow + prefix
// Handle various whitespace patterns
s = s.replace(/"amount"\s*:\s*\+/g, '"amount": ');
s = s.replace(/:\s*\+(\d)/g, ': $1');
// Fix European number format with thousands separator (e.g., 1.000.00 -> 1000.00)
// Pattern: "amount": X.XXX.XX where X.XXX is thousands and .XX is decimal
s = s.replace(/"amount"\s*:\s*(-?)(\d{1,3})\.(\d{3})\.(\d{2})\b/g, '"amount": $1$2$3.$4');
// Also handle larger numbers like 10.000.00
s = s.replace(/"amount"\s*:\s*(-?)(\d{1,3})\.(\d{3})\.(\d{3})\.(\d{2})\b/g, '"amount": $1$2$3$4.$5');
// Fix trailing commas before ] or }
s = s.replace(/,\s*([}\]])/g, '$1');
// Fix unescaped newlines inside strings (replace with space)
s = s.replace(/"([^"\\]*)\n([^"]*)"/g, '"$1 $2"');
// Fix unescaped tabs inside strings
s = s.replace(/"([^"\\]*)\t([^"]*)"/g, '"$1 $2"');
// Fix unescaped backslashes (but not already escaped ones)
s = s.replace(/\\(?!["\\/bfnrtu])/g, '\\\\');
// Fix common issues with counterparty names containing special chars
s = s.replace(/"counterparty":\s*"([^"]*)'([^"]*)"/g, '"counterparty": "$1$2"');
// Remove control characters except newlines (which we handle above)
s = s.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g, ' ');
return s;
}
/**
* Parse JSON response into transactions
*/
function parseJsonResponse(response: string, queryId: string): ITransaction[] {
console.log(` [${queryId}] Parsing response...`);
// Try to find JSON in markdown code block
const codeBlockMatch = response.match(/```(?:json)?\s*([\s\S]*?)```/);
let jsonStr = codeBlockMatch ? codeBlockMatch[1].trim() : response.trim();
if (codeBlockMatch) {
console.log(` [${queryId}] Found JSON in code block`);
}
// Sanitize JSON (fix +number issue)
jsonStr = sanitizeJson(jsonStr);
try {
const parsed = JSON.parse(jsonStr);
if (Array.isArray(parsed)) {
const txs = parsed.map(tx => ({
date: String(tx.date || ''),
counterparty: String(tx.counterparty || tx.description || ''),
amount: parseAmount(tx.amount),
}));
console.log(` [${queryId}] Parsed ${txs.length} transactions (direct)`);
return txs;
}
console.log(` [${queryId}] Parsed JSON is not an array`);
} catch (e) {
const errMsg = (e as Error).message;
console.log(` [${queryId}] Direct parse failed: ${errMsg}`);
// Log problematic section with context
const posMatch = errMsg.match(/position (\d+)/);
if (posMatch) {
const pos = parseInt(posMatch[1]);
const start = Math.max(0, pos - 40);
const end = Math.min(jsonStr.length, pos + 40);
const context = jsonStr.substring(start, end);
const marker = ' '.repeat(pos - start) + '^';
console.log(` [${queryId}] Context around error position ${pos}:`);
console.log(` [${queryId}] ...${context}...`);
console.log(` [${queryId}] ${marker}`);
}
// Try to find JSON array pattern
const arrayMatch = jsonStr.match(/\[[\s\S]*\]/);
if (arrayMatch) {
console.log(` [${queryId}] Found array pattern, trying to parse...`);
const sanitizedArray = sanitizeJson(arrayMatch[0]);
try {
const parsed = JSON.parse(sanitizedArray);
if (Array.isArray(parsed)) {
const txs = parsed.map(tx => ({
date: String(tx.date || ''),
counterparty: String(tx.counterparty || tx.description || ''),
amount: parseAmount(tx.amount),
}));
console.log(` [${queryId}] Parsed ${txs.length} transactions (array match)`);
return txs;
}
} catch (e2) {
const errMsg2 = (e2 as Error).message;
console.log(` [${queryId}] Array parse failed: ${errMsg2}`);
const posMatch2 = errMsg2.match(/position (\d+)/);
if (posMatch2) {
const pos2 = parseInt(posMatch2[1]);
console.log(` [${queryId}] Context around error: ...${sanitizedArray.substring(Math.max(0, pos2 - 30), pos2 + 30)}...`);
}
// Try to extract individual objects from the malformed array
console.log(` [${queryId}] Attempting object-by-object extraction...`);
const extracted = extractTransactionsFromMalformedJson(sanitizedArray, queryId);
if (extracted.length > 0) {
console.log(` [${queryId}] Recovered ${extracted.length} transactions via object extraction`);
return extracted;
}
}
} else {
console.log(` [${queryId}] No array pattern found in response`);
console.log(` [${queryId}] Raw response preview: ${response.substring(0, 200)}...`);
}
}
console.log(` [${queryId}] PARSE FAILED - returning empty array`);
return [];
}
/**
* Extract transactions from malformed JSON by parsing objects individually
*/
function extractTransactionsFromMalformedJson(jsonStr: string, queryId: string): ITransaction[] {
const transactions: ITransaction[] = [];
// Match individual transaction objects
const objectPattern = /\{\s*"date"\s*:\s*"([^"]+)"\s*,\s*"counterparty"\s*:\s*"([^"]+)"\s*,\s*"amount"\s*:\s*([+-]?\d+\.?\d*)\s*\}/g;
let match;
while ((match = objectPattern.exec(jsonStr)) !== null) {
transactions.push({
date: match[1],
counterparty: match[2],
amount: parseFloat(match[3]),
});
}
// Also try with different field orders (amount before counterparty, etc.)
if (transactions.length === 0) {
const altPattern = /\{\s*"date"\s*:\s*"([^"]+)"[^}]*"amount"\s*:\s*([+-]?\d+\.?\d*)[^}]*\}/g;
while ((match = altPattern.exec(jsonStr)) !== null) {
// Try to extract counterparty from the match
const counterpartyMatch = match[0].match(/"counterparty"\s*:\s*"([^"]+)"/);
const descMatch = match[0].match(/"description"\s*:\s*"([^"]+)"/);
transactions.push({
date: match[1],
counterparty: counterpartyMatch?.[1] || descMatch?.[1] || 'UNKNOWN',
amount: parseFloat(match[2]),
});
}
}
return transactions;
}
/**
* Parse amount from various formats
*/
function parseAmount(value: unknown): number {
if (typeof value === 'number') return value;
if (typeof value !== 'string') return 0;
let s = value.replace(/[€$£\s]/g, '').replace('', '-').replace('', '-');
// European format: comma is decimal
if (s.includes(',') && s.indexOf(',') > s.lastIndexOf('.')) {
s = s.replace(/\./g, '').replace(',', '.');
} else {
s = s.replace(/,/g, '');
}
return parseFloat(s) || 0;
}
/**
* Compare two transaction arrays for consensus
*/
function transactionArraysMatch(a: ITransaction[], b: ITransaction[]): boolean {
if (a.length !== b.length) return false;
for (let i = 0; i < a.length; i++) {
const dateMatch = a[i].date === b[i].date;
const amountMatch = Math.abs(a[i].amount - b[i].amount) < 0.01;
if (!dateMatch || !amountMatch) return false;
}
return true;
}
/**
* Compare two transaction arrays and log differences
*/
function compareAndLogDifferences(txs1: ITransaction[], txs2: ITransaction[], pageNum: number): void {
if (txs1.length !== txs2.length) {
console.log(` [Page ${pageNum}] Length mismatch: Q1=${txs1.length}, Q2=${txs2.length}`);
return;
}
for (let i = 0; i < txs1.length; i++) {
const dateMatch = txs1[i].date === txs2[i].date;
const amountMatch = Math.abs(txs1[i].amount - txs2[i].amount) < 0.01;
if (!dateMatch || !amountMatch) {
console.log(` [Page ${pageNum}] Tx ${i + 1} differs:`);
console.log(` Q1: ${txs1[i].date} | ${txs1[i].amount}`);
console.log(` Q2: ${txs2[i].date} | ${txs2[i].amount}`);
}
}
}
/**
* Extract transactions from a single page with consensus
*/
async function extractTransactionsFromPage(image: string, pageNum: number): Promise<ITransaction[]> {
const MAX_ATTEMPTS = 5;
console.log(`\n ======== Page ${pageNum} ========`);
console.log(` [Page ${pageNum}] Starting JSON extraction...`);
for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
console.log(`\n [Page ${pageNum}] --- Attempt ${attempt}/${MAX_ATTEMPTS} ---`);
// Extract twice in parallel
const q1Id = `P${pageNum}A${attempt}Q1`;
const q2Id = `P${pageNum}A${attempt}Q2`;
const [response1, response2] = await Promise.all([
queryJson(image, q1Id),
queryJson(image, q2Id),
]);
const txs1 = parseJsonResponse(response1, q1Id);
const txs2 = parseJsonResponse(response2, q2Id);
console.log(` [Page ${pageNum}] Results: Q1=${txs1.length} txs, Q2=${txs2.length} txs`);
if (txs1.length > 0 && transactionArraysMatch(txs1, txs2)) {
console.log(` [Page ${pageNum}] ✓ CONSENSUS REACHED: ${txs1.length} transactions`);
console.log(` [Page ${pageNum}] Transactions:`);
for (let i = 0; i < txs1.length; i++) {
const tx = txs1[i];
console.log(` ${(i + 1).toString().padStart(2)}. ${tx.date} | ${tx.counterparty.substring(0, 30).padEnd(30)} | ${tx.amount >= 0 ? '+' : ''}${tx.amount.toFixed(2)}`);
}
return txs1;
}
console.log(` [Page ${pageNum}] ✗ NO CONSENSUS`);
compareAndLogDifferences(txs1, txs2, pageNum);
if (attempt < MAX_ATTEMPTS) {
console.log(` [Page ${pageNum}] Retrying...`);
}
}
// Fallback: use last response
console.log(`\n [Page ${pageNum}] === FALLBACK (no consensus after ${MAX_ATTEMPTS} attempts) ===`);
const fallbackId = `P${pageNum}FALLBACK`;
const fallbackResponse = await queryJson(image, fallbackId);
const fallback = parseJsonResponse(fallbackResponse, fallbackId);
console.log(` [Page ${pageNum}] ~ FALLBACK RESULT: ${fallback.length} transactions`);
for (let i = 0; i < fallback.length; i++) {
const tx = fallback[i];
console.log(` ${(i + 1).toString().padStart(2)}. ${tx.date} | ${tx.counterparty.substring(0, 30).padEnd(30)} | ${tx.amount >= 0 ? '+' : ''}${tx.amount.toFixed(2)}`);
}
return fallback;
}
/**
* Extract all transactions from bank statement
*/
async function extractTransactions(images: string[]): Promise<ITransaction[]> {
console.log(` [Vision] Processing ${images.length} page(s) with ${MODEL} (JSON consensus)`);
const allTransactions: ITransaction[] = [];
for (let i = 0; i < images.length; i++) {
const pageTransactions = await extractTransactionsFromPage(images[i], i + 1);
allTransactions.push(...pageTransactions);
}
console.log(` [Vision] Total: ${allTransactions.length} transactions`);
return allTransactions;
}
/**
* Compare extracted transactions against expected
*/
function compareTransactions(
extracted: ITransaction[],
expected: ITransaction[]
): { matches: number; total: number; errors: string[]; variations: string[] } {
const errors: string[] = [];
const variations: string[] = [];
let matches = 0;
for (let i = 0; i < expected.length; i++) {
const exp = expected[i];
const ext = extracted[i];
if (!ext) {
errors.push(`Missing transaction ${i}: ${exp.date} ${exp.counterparty}`);
continue;
}
const dateMatch = ext.date === exp.date;
const amountMatch = Math.abs(ext.amount - exp.amount) < 0.01;
if (dateMatch && amountMatch) {
matches++;
// Track counterparty variations (date and amount match but name differs)
if (ext.counterparty !== exp.counterparty) {
variations.push(
`[${i}] "${exp.counterparty}" → "${ext.counterparty}"`
);
}
} else {
errors.push(
`Mismatch at ${i}: expected ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`
);
}
}
if (extracted.length > expected.length) {
errors.push(`Extra transactions: ${extracted.length - expected.length}`);
}
return { matches, total: expected.length, errors, variations };
}
/**
* Find all test cases (PDF + JSON pairs) in .nogit/
*/
function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
const testDir = path.join(process.cwd(), '.nogit');
if (!fs.existsSync(testDir)) {
return [];
}
const files = fs.readdirSync(testDir);
const pdfFiles = files.filter((f: string) => f.endsWith('.pdf'));
const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];
for (const pdf of pdfFiles) {
const baseName = pdf.replace('.pdf', '');
const jsonFile = `${baseName}.json`;
if (files.includes(jsonFile)) {
testCases.push({
name: baseName,
pdfPath: path.join(testDir, pdf),
jsonPath: path.join(testDir, jsonFile),
});
}
}
return testCases.sort((a, b) => a.name.localeCompare(b.name));
}
// Tests
tap.test('setup: ensure Docker containers are running', async () => {
console.log('\n[Setup] Checking Docker containers...\n');
const minicpmOk = await ensureMiniCpm();
expect(minicpmOk).toBeTrue();
console.log('\n[Setup] All containers ready!\n');
});
tap.test('should have MiniCPM-V model loaded', async () => {
const response = await fetch(`${OLLAMA_URL}/api/tags`);
const data = await response.json();
const modelNames = data.models.map((m: { name: string }) => m.name);
expect(modelNames.some((name: string) => name.includes('minicpm'))).toBeTrue();
});
const testCases = findTestCases();
console.log(`\nFound ${testCases.length} bank statement test cases (MiniCPM-V)\n`);
let passedCount = 0;
let failedCount = 0;
for (const testCase of testCases) {
tap.test(`should extract: ${testCase.name}`, async () => {
const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
console.log(`\n=== ${testCase.name} ===`);
console.log(`Expected: ${expected.length} transactions`);
const images = convertPdfToImages(testCase.pdfPath);
console.log(` Pages: ${images.length}`);
const extracted = await extractTransactions(images);
console.log(` Extracted: ${extracted.length} transactions`);
const result = compareTransactions(extracted, expected);
const perfectMatch = result.matches === result.total && extracted.length === expected.length;
if (perfectMatch) {
passedCount++;
console.log(` Result: PASS (${result.matches}/${result.total})`);
} else {
failedCount++;
console.log(` Result: FAIL (${result.matches}/${result.total})`);
result.errors.slice(0, 10).forEach((e) => console.log(` - ${e}`));
}
// Log counterparty variations (names that differ but date/amount matched)
if (result.variations.length > 0) {
console.log(` Counterparty variations (${result.variations.length}):`);
result.variations.forEach((v) => console.log(` ${v}`));
}
expect(result.matches).toEqual(result.total);
expect(extracted.length).toEqual(expected.length);
});
}
tap.test('summary', async () => {
const total = testCases.length;
console.log(`\n======================================================`);
console.log(` Bank Statement Summary (${MODEL})`);
console.log(`======================================================`);
console.log(` Method: JSON per-page + consensus`);
console.log(` Passed: ${passedCount}/${total}`);
console.log(` Failed: ${failedCount}/${total}`);
console.log(`======================================================\n`);
});
export default tap.start();