/**
 * Bank statement extraction test using MiniCPM-V only (visual extraction)
 *
 * This tests MiniCPM-V's ability to extract bank transactions directly from images
 * without any OCR augmentation.
 */
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execFileSync, execSync } from 'child_process';
import * as os from 'os';
import { ensureMiniCpm } from './helpers/docker.js';

// Service URL (base address of the Ollama HTTP API used by every request below)
const OLLAMA_URL = 'http://localhost:11434';

// Model (tag sent as `model` in the generate request)
const MINICPM_MODEL = 'minicpm-v:latest';

// Prompt for MiniCPM-V visual extraction.
// It asks for a bare JSON array of {date, counterparty, amount} objects with
// European amounts normalised to signed decimals (debits negative).
const MINICPM_EXTRACT_PROMPT = `/nothink
You are a bank statement parser. Extract EVERY transaction from the table.

Read the Amount column carefully:
- "- 21,47 €" means DEBIT, output as: -21.47
- "+ 1.000,00 €" means CREDIT, output as: 1000.00
- European format: comma = decimal point

For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}

Do not skip any rows. Return ONLY the JSON array, no explanation.`;

/**
 * One bank statement row, in the shape the extraction prompt asks the model
 * to emit and that the expected-result JSON files are parsed as.
 */
interface ITransaction {
  /** Transaction date; the prompt requests the YYYY-MM-DD format. */
  date: string;
  /** Counterparty name as read from the statement. */
  counterparty: string;
  /** Signed amount; the prompt requests negative values for debits. */
  amount: number;
}

/**
 * Convert PDF to PNG images using ImageMagick
 *
 * Renders the PDF with the `convert` CLI (300 dpi, white background, alpha
 * removed) into a temporary directory, reads every produced PNG back and
 * removes the directory again, also when conversion fails.
 *
 * @param pdfPath Path of the PDF file to render.
 * @returns Base64-encoded PNG data, one entry per output file, in page order.
 * @throws When the `convert` process cannot be started or exits non-zero.
 */
function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');

  // Numeric page index taken from "page-<n>.png"; unknown names sort last.
  const pageIndex = (file: string): number => {
    const match = /(\d+)\.png$/.exec(file);
    return match ? Number(match[1]) : Number.MAX_SAFE_INTEGER;
  };

  try {
    // Argument vector instead of a shell string: paths containing quotes,
    // spaces or `$` are passed through verbatim.
    execFileSync(
      'convert',
      [
        '-density',
        '300',
        '-quality',
        '100',
        pdfPath,
        '-background',
        'white',
        '-alpha',
        'remove',
        outputPattern,
      ],
      { stdio: 'pipe' }
    );

    return fs
      .readdirSync(tempDir)
      .filter((f: string) => f.endsWith('.png'))
      // A plain string sort would put page-10.png before page-2.png.
      .sort((a, b) => pageIndex(a) - pageIndex(b) || a.localeCompare(b))
      .map((file) => fs.readFileSync(path.join(tempDir, file)).toString('base64'));
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}

/**
 * Extract using MiniCPM-V via Ollama
 *
 * Posts the images together with the extraction prompt to the streaming
 * `/api/generate` endpoint, echoes the generated text to the console line by
 * line while it arrives, and finally parses the outermost `[...]` span of the
 * accumulated text as JSON.
 *
 * @param images Base64-encoded page images.
 * @param passLabel Label used as prefix of the progress log line.
 * @returns The parsed JSON array. Its elements are NOT validated against
 *          ITransaction; callers must tolerate malformed rows.
 * @throws On a non-OK HTTP status, a missing response body, a response
 *         without a JSON array, or an array span that is not valid JSON.
 */
async function extractWithMiniCPM(images: string[], passLabel: string): Promise<ITransaction[]> {
  const payload = {
    model: MINICPM_MODEL,
    prompt: MINICPM_EXTRACT_PROMPT,
    images,
    stream: true,
    options: {
      num_predict: 16384,
      temperature: 0.1,
    },
  };

  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }

  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('No response body');
  }

  const decoder = new TextDecoder();
  let fullText = ''; // everything the model generated so far
  let lineBuffer = ''; // generated text not yet echoed to the console
  let pending = ''; // tail of the HTTP stream that is not a complete record yet

  // Handles one newline-delimited JSON record of the stream.
  const consumeRecord = (record: string): void => {
    if (!record.trim()) return;

    let parsed: unknown;
    try {
      parsed = JSON.parse(record);
    } catch {
      // Skip invalid JSON lines (best effort, same as before)
      return;
    }

    const token = (parsed as { response?: unknown } | null)?.response;
    if (typeof token !== 'string' || token === '') return;

    fullText += token;
    lineBuffer += token;

    if (lineBuffer.includes('\n')) {
      const parts = lineBuffer.split('\n');
      lineBuffer = parts.pop() ?? '';
      for (const part of parts) {
        console.log(part);
      }
    }
  };

  console.log(`[${passLabel}] Extracting with MiniCPM-V...`);

  while (true) {
    const { done, value } = await reader.read();

    // A network chunk may end in the middle of a record, so records are only
    // parsed once their terminating newline has arrived. At end of stream the
    // decoder is flushed and whatever is left is treated as the last record.
    pending += done ? decoder.decode() : decoder.decode(value, { stream: true });
    const records = pending.split('\n');
    pending = done ? '' : (records.pop() ?? '');
    records.forEach(consumeRecord);

    if (done) break;
  }

  if (lineBuffer) {
    console.log(lineBuffer);
  }
  console.log('');

  const startIdx = fullText.indexOf('[');
  const endIdx = fullText.lastIndexOf(']') + 1;

  if (startIdx < 0 || endIdx <= startIdx) {
    throw new Error('No JSON array found in response');
  }

  return JSON.parse(fullText.substring(startIdx, endIdx)) as ITransaction[];
}

/**
 * Create a hash of transactions for comparison
 *
 * The result is the sorted list of `date|amount` keys (amount with two
 * decimals) joined by `;`. It therefore does not depend on row order and
 * ignores the counterparty. An empty list yields the empty string.
 *
 * @param transactions Rows to fingerprint.
 * @returns The fingerprint string.
 */
function hashTransactions(transactions: ITransaction[]): string {
  const keys = transactions.map((t) => `${t.date}|${t.amount.toFixed(2)}`);
  keys.sort();
  return keys.join(';');
}

/**
 * Extract with consensus voting using MiniCPM-V only
 *
 * Runs up to `maxPasses` extraction passes. As soon as a pass produces a
 * transaction hash that an earlier pass already produced, that pass's
 * transactions are returned. Passes that throw are logged and skipped. If no
 * two passes agree, the earliest result with the most frequent hash is used.
 *
 * @param images Base64-encoded page images handed to every pass.
 * @param maxPasses Maximum number of extraction attempts (default 5).
 * @returns The agreed (or, failing that, most common) transaction list.
 * @throws Error('No valid results obtained') when every pass failed.
 */
async function extractWithConsensus(
  images: string[],
  maxPasses: number = 5
): Promise<ITransaction[]> {
  const results: Array<{ transactions: ITransaction[]; hash: string }> = [];
  const hashCounts = new Map<string, number>();

  // Records one pass and returns how often its hash has been seen so far.
  const addResult = (transactions: ITransaction[], passLabel: string): number => {
    const hash = hashTransactions(transactions);
    const seen = (hashCounts.get(hash) ?? 0) + 1;
    results.push({ transactions, hash });
    hashCounts.set(hash, seen);
    console.log(
      `[${passLabel}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)`
    );
    return seen;
  };

  console.log('[Setup] Using MiniCPM-V only');

  for (let pass = 1; pass <= maxPasses; pass++) {
    try {
      const transactions = await extractWithMiniCPM(images, `Pass ${pass} MiniCPM-V`);
      const count = addResult(transactions, `Pass ${pass} MiniCPM-V`);

      if (count >= 2) {
        console.log(`[Consensus] Reached after ${pass} passes`);
        return transactions;
      }

      console.log(`[Pass ${pass}] No consensus yet, trying again...`);
    } catch (err) {
      console.log(`[Pass ${pass}] Error: ${err}`);
    }
  }

  // No consensus reached - return the most common result.
  // Decide "nothing obtained" by the number of recorded results, not by the
  // hash: an empty transaction list legitimately hashes to ''.
  let best = results[0];
  if (!best) {
    throw new Error('No valid results obtained');
  }

  let bestCount = 0;
  for (const candidate of results) {
    const count = hashCounts.get(candidate.hash) ?? 0;
    if (count > bestCount) {
      bestCount = count;
      best = candidate;
    }
  }

  console.log(`[No consensus] Using most common result (${bestCount}/${maxPasses} passes)`);
  return best.transactions;
}

/**
 * Compare extracted transactions against expected
 *
 * Rows are compared by position: row i of `extracted` against row i of
 * `expected`. A row matches when the dates are identical and the amounts
 * differ by less than 0.01; the counterparty is not compared.
 *
 * @param extracted Rows produced by the model.
 * @param expected Reference rows.
 * @returns `matches` (number of matching rows), `total` (number of expected
 *          rows) and `errors` (one message per missing or mismatching row,
 *          plus one message when there are surplus extracted rows).
 */
function compareTransactions(
  extracted: ITransaction[],
  expected: ITransaction[]
): { matches: number; total: number; errors: string[] } {
  const errors: string[] = [];
  let matches = 0;

  expected.forEach((exp, i) => {
    const ext = extracted[i];

    if (!ext) {
      errors.push(`Missing transaction ${i}: ${exp.date} ${exp.counterparty}`);
      return;
    }

    const sameDate = ext.date === exp.date;
    const sameAmount = Math.abs(ext.amount - exp.amount) < 0.01;

    if (sameDate && sameAmount) {
      matches++;
      return;
    }

    errors.push(
      `Mismatch at ${i}: expected ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`
    );
  });

  const surplus = extracted.length - expected.length;
  if (surplus > 0) {
    errors.push(`Extra transactions: ${surplus}`);
  }

  return { matches, total: expected.length, errors };
}

/**
 * Find all test cases (PDF + JSON pairs) in .nogit/
 *
 * Looks in `<current working directory>/.nogit` for every `*.pdf` file that
 * has a sibling `<same name>.json` file.
 *
 * @returns One entry per pair with the shared base name and both absolute
 *          paths; an empty array when the directory does not exist.
 */
function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const pdfSuffix = '.pdf';
  const testDir = path.join(process.cwd(), '.nogit');
  if (!fs.existsSync(testDir)) {
    return [];
  }

  const files = fs.readdirSync(testDir);
  const available = new Set(files);
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];

  for (const pdf of files) {
    if (!pdf.endsWith(pdfSuffix)) continue;

    // Strip only the trailing extension; String.replace would remove the
    // first ".pdf" occurrence and mangle names such as "scan.pdf.v2.pdf".
    const baseName = pdf.slice(0, -pdfSuffix.length);
    const jsonFile = `${baseName}.json`;
    if (!available.has(jsonFile)) continue;

    testCases.push({
      name: baseName,
      pdfPath: path.join(testDir, pdf),
      jsonPath: path.join(testDir, jsonFile),
    });
  }

  return testCases;
}

// Tests

// Precondition: the MiniCPM container helper must report success before any
// extraction test can run.
tap.test('setup: ensure Docker containers are running', async () => {
  console.log('\n[Setup] Checking Docker containers...\n');

  // Ensure MiniCPM is running
  const minicpmOk = await ensureMiniCpm();
  expect(minicpmOk).toBeTrue();

  console.log('\n[Setup] All containers ready!\n');
});

// Precondition: the Ollama model listing must contain a MiniCPM-V 4.5 model.
tap.test('should have MiniCPM-V 4.5 model loaded', async () => {
  const response = await fetch(`${OLLAMA_URL}/api/tags`);
  // Fail with a clear assertion instead of a JSON/shape error further down.
  expect(response.ok).toBeTrue();

  const data = await response.json();
  const modelNames: string[] = data.models.map((m: { name: string }) => m.name);

  // NOTE(review): this checks for a name containing 'minicpm-v4.5', while the
  // generate requests in this file use MINICPM_MODEL ('minicpm-v:latest').
  // Confirm which model is intended and align the two.
  expect(modelNames.some((name: string) => name.includes('minicpm-v4.5'))).toBeTrue();
});

// Dynamic test for each PDF/JSON pair
const testCases = findTestCases();
console.log(`\nFound ${testCases.length} bank statement test cases (MiniCPM-V only)\n`);

for (const testCase of testCases) {
  // One test per fixture: render the PDF, extract with consensus voting and
  // compare against the expected JSON.
  tap.test(`should extract transactions from ${testCase.name}`, async () => {
    // Load expected transactions
    const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
    console.log(`\n=== ${testCase.name} ===`);
    console.log(`Expected: ${expected.length} transactions`);

    // Convert PDF to images
    console.log('Converting PDF to images...');
    const images = convertPdfToImages(testCase.pdfPath);
    console.log(`Converted: ${images.length} pages\n`);

    // Extract with consensus (MiniCPM-V only)
    const extracted = await extractWithConsensus(images);
    console.log(`\nFinal: ${extracted.length} transactions`);

    // Compare results
    const result = compareTransactions(extracted, expected);
    console.log(`Accuracy: ${result.matches}/${result.total}`);

    if (result.errors.length > 0) {
      console.log('Errors:');
      result.errors.forEach((e) => console.log(`  - ${e}`));
    }

    // Assert high accuracy.
    // An empty expected list would make matches/total NaN and fail the
    // threshold even for a correct (empty) extraction; the length assertion
    // below still catches surplus rows in that case.
    const accuracy = result.total === 0 ? 1 : result.matches / result.total;
    expect(accuracy).toBeGreaterThan(0.95);
    expect(extracted.length).toEqual(expected.length);
  });
}

export default tap.start();