Files
ht-docker-ai/test/test.bankstatements.minicpm.ts
Juergen Kunz d384c1d79b feat(tests): integrate smartagent DualAgentOrchestrator with streaming support
- Update test.invoices.nanonets.ts to use DualAgentOrchestrator for JSON extraction
- Enable streaming token callback for real-time progress visibility
- Add markdown caching to avoid re-running Nanonets OCR for cached files
- Update test.bankstatements.minicpm.ts and test.invoices.minicpm.ts with streaming
- Update dependencies to @push.rocks/smartai@0.11.1 and @push.rocks/smartagent@1.2.8
2026-01-20 00:39:36 +00:00

489 lines
16 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Bank statement extraction using MiniCPM-V (visual extraction)
*
* JSON per-page approach with streaming output:
* 1. Ask for structured JSON of all transactions per page
* 2. Single pass extraction (no consensus)
*/
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execSync } from 'child_process';
import * as os from 'os';
import { ensureMiniCpm } from './helpers/docker.js';
const OLLAMA_URL = 'http://localhost:11434';
const MODEL = 'openbmb/minicpm-v4.5:q8_0';
interface ITransaction {
date: string;
counterparty: string;
amount: number;
}
const JSON_PROMPT = `Extract ALL transactions from this bank statement page as a JSON array.
IMPORTANT RULES:
1. Each transaction has: date, description/counterparty, and an amount
2. Amount is NEGATIVE for money going OUT (debits, payments, withdrawals)
3. Amount is POSITIVE for money coming IN (credits, deposits, refunds)
4. Date format: YYYY-MM-DD
5. Do NOT include: opening balance, closing balance, subtotals, headers, or summary rows
6. Only include actual transactions with a specific date and amount
Return ONLY this JSON format, no explanation:
[
{"date": "2021-06-01", "counterparty": "COMPANY NAME", "amount": -25.99},
{"date": "2021-06-02", "counterparty": "DEPOSIT FROM", "amount": 100.00}
]`;
/**
* Convert PDF to PNG images using ImageMagick
*/
function convertPdfToImages(pdfPath: string): string[] {
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
const outputPattern = path.join(tempDir, 'page-%d.png');
try {
execSync(
`convert -density 300 -quality 100 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
{ stdio: 'pipe' }
);
const files = fs.readdirSync(tempDir).filter((f: string) => f.endsWith('.png')).sort();
const images: string[] = [];
for (const file of files) {
const imagePath = path.join(tempDir, file);
const imageData = fs.readFileSync(imagePath);
images.push(imageData.toString('base64'));
}
return images;
} finally {
fs.rmSync(tempDir, { recursive: true, force: true });
}
}
/**
* Query for JSON extraction with streaming output
*/
async function queryJson(image: string, queryId: string): Promise<string> {
const startTime = Date.now();
process.stdout.write(` [${queryId}] `);
const response = await fetch(`${OLLAMA_URL}/api/chat`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
model: MODEL,
messages: [{
role: 'user',
content: JSON_PROMPT,
images: [image],
}],
stream: true,
options: {
num_ctx: 32768,
num_predict: 4000,
temperature: 0.1,
},
}),
});
if (!response.ok) {
const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
process.stdout.write(`ERROR: ${response.status} (${elapsed}s)\n`);
throw new Error(`Ollama API error: ${response.status}`);
}
let content = '';
const reader = response.body!.getReader();
const decoder = new TextDecoder();
try {
while (true) {
const { done, value } = await reader.read();
if (done) break;
const chunk = decoder.decode(value, { stream: true });
for (const line of chunk.split('\n').filter(l => l.trim())) {
try {
const json = JSON.parse(line);
const token = json.message?.content || '';
if (token) {
process.stdout.write(token);
content += token;
}
} catch {
// Ignore parse errors for partial chunks
}
}
}
} finally {
const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
process.stdout.write(` (${elapsed}s)\n`);
}
return content.trim();
}
/**
* Sanitize JSON string - fix common issues from vision model output
*/
function sanitizeJson(jsonStr: string): string {
let s = jsonStr;
// Fix +number (e.g., +93.80 -> 93.80) - JSON doesn't allow + prefix
// Handle various whitespace patterns
s = s.replace(/"amount"\s*:\s*\+/g, '"amount": ');
s = s.replace(/:\s*\+(\d)/g, ': $1');
// Fix European number format with thousands separator (e.g., 1.000.00 -> 1000.00)
// Pattern: "amount": X.XXX.XX where X.XXX is thousands and .XX is decimal
s = s.replace(/"amount"\s*:\s*(-?)(\d{1,3})\.(\d{3})\.(\d{2})\b/g, '"amount": $1$2$3.$4');
// Also handle larger numbers like 10.000.00
s = s.replace(/"amount"\s*:\s*(-?)(\d{1,3})\.(\d{3})\.(\d{3})\.(\d{2})\b/g, '"amount": $1$2$3$4.$5');
// Fix trailing commas before ] or }
s = s.replace(/,\s*([}\]])/g, '$1');
// Fix unescaped newlines inside strings (replace with space)
s = s.replace(/"([^"\\]*)\n([^"]*)"/g, '"$1 $2"');
// Fix unescaped tabs inside strings
s = s.replace(/"([^"\\]*)\t([^"]*)"/g, '"$1 $2"');
// Fix unescaped backslashes (but not already escaped ones)
s = s.replace(/\\(?!["\\/bfnrtu])/g, '\\\\');
// Fix common issues with counterparty names containing special chars
s = s.replace(/"counterparty":\s*"([^"]*)'([^"]*)"/g, '"counterparty": "$1$2"');
// Remove control characters except newlines (which we handle above)
s = s.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g, ' ');
return s;
}
/**
* Parse JSON response into transactions
*/
function parseJsonResponse(response: string, queryId: string): ITransaction[] {
console.log(` [${queryId}] Parsing response...`);
// Try to find JSON in markdown code block
const codeBlockMatch = response.match(/```(?:json)?\s*([\s\S]*?)```/);
let jsonStr = codeBlockMatch ? codeBlockMatch[1].trim() : response.trim();
if (codeBlockMatch) {
console.log(` [${queryId}] Found JSON in code block`);
}
// Sanitize JSON (fix +number issue)
jsonStr = sanitizeJson(jsonStr);
try {
const parsed = JSON.parse(jsonStr);
if (Array.isArray(parsed)) {
const txs = parsed.map(tx => ({
date: String(tx.date || ''),
counterparty: String(tx.counterparty || tx.description || ''),
amount: parseAmount(tx.amount),
}));
console.log(` [${queryId}] Parsed ${txs.length} transactions (direct)`);
return txs;
}
console.log(` [${queryId}] Parsed JSON is not an array`);
} catch (e) {
const errMsg = (e as Error).message;
console.log(` [${queryId}] Direct parse failed: ${errMsg}`);
// Log problematic section with context
const posMatch = errMsg.match(/position (\d+)/);
if (posMatch) {
const pos = parseInt(posMatch[1]);
const start = Math.max(0, pos - 40);
const end = Math.min(jsonStr.length, pos + 40);
const context = jsonStr.substring(start, end);
const marker = ' '.repeat(pos - start) + '^';
console.log(` [${queryId}] Context around error position ${pos}:`);
console.log(` [${queryId}] ...${context}...`);
console.log(` [${queryId}] ${marker}`);
}
// Try to find JSON array pattern
const arrayMatch = jsonStr.match(/\[[\s\S]*\]/);
if (arrayMatch) {
console.log(` [${queryId}] Found array pattern, trying to parse...`);
const sanitizedArray = sanitizeJson(arrayMatch[0]);
try {
const parsed = JSON.parse(sanitizedArray);
if (Array.isArray(parsed)) {
const txs = parsed.map(tx => ({
date: String(tx.date || ''),
counterparty: String(tx.counterparty || tx.description || ''),
amount: parseAmount(tx.amount),
}));
console.log(` [${queryId}] Parsed ${txs.length} transactions (array match)`);
return txs;
}
} catch (e2) {
const errMsg2 = (e2 as Error).message;
console.log(` [${queryId}] Array parse failed: ${errMsg2}`);
const posMatch2 = errMsg2.match(/position (\d+)/);
if (posMatch2) {
const pos2 = parseInt(posMatch2[1]);
console.log(` [${queryId}] Context around error: ...${sanitizedArray.substring(Math.max(0, pos2 - 30), pos2 + 30)}...`);
}
// Try to extract individual objects from the malformed array
console.log(` [${queryId}] Attempting object-by-object extraction...`);
const extracted = extractTransactionsFromMalformedJson(sanitizedArray, queryId);
if (extracted.length > 0) {
console.log(` [${queryId}] Recovered ${extracted.length} transactions via object extraction`);
return extracted;
}
}
} else {
console.log(` [${queryId}] No array pattern found in response`);
console.log(` [${queryId}] Raw response preview: ${response.substring(0, 200)}...`);
}
}
console.log(` [${queryId}] PARSE FAILED - returning empty array`);
return [];
}
/**
* Extract transactions from malformed JSON by parsing objects individually
*/
function extractTransactionsFromMalformedJson(jsonStr: string, queryId: string): ITransaction[] {
const transactions: ITransaction[] = [];
// Match individual transaction objects
const objectPattern = /\{\s*"date"\s*:\s*"([^"]+)"\s*,\s*"counterparty"\s*:\s*"([^"]+)"\s*,\s*"amount"\s*:\s*([+-]?\d+\.?\d*)\s*\}/g;
let match;
while ((match = objectPattern.exec(jsonStr)) !== null) {
transactions.push({
date: match[1],
counterparty: match[2],
amount: parseFloat(match[3]),
});
}
// Also try with different field orders (amount before counterparty, etc.)
if (transactions.length === 0) {
const altPattern = /\{\s*"date"\s*:\s*"([^"]+)"[^}]*"amount"\s*:\s*([+-]?\d+\.?\d*)[^}]*\}/g;
while ((match = altPattern.exec(jsonStr)) !== null) {
// Try to extract counterparty from the match
const counterpartyMatch = match[0].match(/"counterparty"\s*:\s*"([^"]+)"/);
const descMatch = match[0].match(/"description"\s*:\s*"([^"]+)"/);
transactions.push({
date: match[1],
counterparty: counterpartyMatch?.[1] || descMatch?.[1] || 'UNKNOWN',
amount: parseFloat(match[2]),
});
}
}
return transactions;
}
/**
* Parse amount from various formats
*/
function parseAmount(value: unknown): number {
if (typeof value === 'number') return value;
if (typeof value !== 'string') return 0;
let s = value.replace(/[€$£\s]/g, '').replace('', '-').replace('', '-');
// European format: comma is decimal
if (s.includes(',') && s.indexOf(',') > s.lastIndexOf('.')) {
s = s.replace(/\./g, '').replace(',', '.');
} else {
s = s.replace(/,/g, '');
}
return parseFloat(s) || 0;
}
/**
* Extract transactions from a single page (single pass)
*/
async function extractTransactionsFromPage(image: string, pageNum: number): Promise<ITransaction[]> {
console.log(`\n ======== Page ${pageNum} ========`);
const queryId = `P${pageNum}`;
const response = await queryJson(image, queryId);
const transactions = parseJsonResponse(response, queryId);
console.log(` [Page ${pageNum}] Extracted ${transactions.length} transactions:`);
for (let i = 0; i < transactions.length; i++) {
const tx = transactions[i];
console.log(` ${(i + 1).toString().padStart(2)}. ${tx.date} | ${tx.counterparty.substring(0, 30).padEnd(30)} | ${tx.amount >= 0 ? '+' : ''}${tx.amount.toFixed(2)}`);
}
return transactions;
}
/**
* Extract all transactions from bank statement
*/
async function extractTransactions(images: string[]): Promise<ITransaction[]> {
console.log(` [Vision] Processing ${images.length} page(s) with ${MODEL} (single pass)`);
const allTransactions: ITransaction[] = [];
for (let i = 0; i < images.length; i++) {
const pageTransactions = await extractTransactionsFromPage(images[i], i + 1);
allTransactions.push(...pageTransactions);
}
console.log(` [Vision] Total: ${allTransactions.length} transactions`);
return allTransactions;
}
/**
* Compare extracted transactions against expected
*/
function compareTransactions(
extracted: ITransaction[],
expected: ITransaction[]
): { matches: number; total: number; errors: string[]; variations: string[] } {
const errors: string[] = [];
const variations: string[] = [];
let matches = 0;
for (let i = 0; i < expected.length; i++) {
const exp = expected[i];
const ext = extracted[i];
if (!ext) {
errors.push(`Missing transaction ${i}: ${exp.date} ${exp.counterparty}`);
continue;
}
const dateMatch = ext.date === exp.date;
const amountMatch = Math.abs(ext.amount - exp.amount) < 0.01;
if (dateMatch && amountMatch) {
matches++;
// Track counterparty variations (date and amount match but name differs)
if (ext.counterparty !== exp.counterparty) {
variations.push(
`[${i}] "${exp.counterparty}" → "${ext.counterparty}"`
);
}
} else {
errors.push(
`Mismatch at ${i}: expected ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`
);
}
}
if (extracted.length > expected.length) {
errors.push(`Extra transactions: ${extracted.length - expected.length}`);
}
return { matches, total: expected.length, errors, variations };
}
/**
* Find all test cases (PDF + JSON pairs) in .nogit/
*/
function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
const testDir = path.join(process.cwd(), '.nogit');
if (!fs.existsSync(testDir)) {
return [];
}
const files = fs.readdirSync(testDir);
const pdfFiles = files.filter((f: string) => f.endsWith('.pdf'));
const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];
for (const pdf of pdfFiles) {
const baseName = pdf.replace('.pdf', '');
const jsonFile = `${baseName}.json`;
if (files.includes(jsonFile)) {
testCases.push({
name: baseName,
pdfPath: path.join(testDir, pdf),
jsonPath: path.join(testDir, jsonFile),
});
}
}
return testCases.sort((a, b) => a.name.localeCompare(b.name));
}
// Tests
tap.test('setup: ensure Docker containers are running', async () => {
console.log('\n[Setup] Checking Docker containers...\n');
const minicpmOk = await ensureMiniCpm();
expect(minicpmOk).toBeTrue();
console.log('\n[Setup] All containers ready!\n');
});
tap.test('should have MiniCPM-V model loaded', async () => {
const response = await fetch(`${OLLAMA_URL}/api/tags`);
const data = await response.json();
const modelNames = data.models.map((m: { name: string }) => m.name);
expect(modelNames.some((name: string) => name.includes('minicpm'))).toBeTrue();
});
const testCases = findTestCases();
console.log(`\nFound ${testCases.length} bank statement test cases (MiniCPM-V)\n`);
let passedCount = 0;
let failedCount = 0;
for (const testCase of testCases) {
tap.test(`should extract: ${testCase.name}`, async () => {
const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
console.log(`\n=== ${testCase.name} ===`);
console.log(`Expected: ${expected.length} transactions`);
const images = convertPdfToImages(testCase.pdfPath);
console.log(` Pages: ${images.length}`);
const extracted = await extractTransactions(images);
console.log(` Extracted: ${extracted.length} transactions`);
const result = compareTransactions(extracted, expected);
const perfectMatch = result.matches === result.total && extracted.length === expected.length;
if (perfectMatch) {
passedCount++;
console.log(` Result: PASS (${result.matches}/${result.total})`);
} else {
failedCount++;
console.log(` Result: FAIL (${result.matches}/${result.total})`);
result.errors.slice(0, 10).forEach((e) => console.log(` - ${e}`));
}
// Log counterparty variations (names that differ but date/amount matched)
if (result.variations.length > 0) {
console.log(` Counterparty variations (${result.variations.length}):`);
result.variations.forEach((v) => console.log(` ${v}`));
}
expect(result.matches).toEqual(result.total);
expect(extracted.length).toEqual(expected.length);
});
}
tap.test('summary', async () => {
const total = testCases.length;
console.log(`\n======================================================`);
console.log(` Bank Statement Summary (${MODEL})`);
console.log(`======================================================`);
console.log(` Method: JSON per-page (single pass)`);
console.log(` Passed: ${passedCount}/${total}`);
console.log(` Failed: ${failedCount}/${total}`);
console.log(`======================================================\n`);
});
export default tap.start();