/**
 * Bank statement extraction test using PaddleOCR-VL Full Pipeline
 *
 * This tests the complete PaddleOCR-VL pipeline for bank statements:
 * 1. PP-DocLayoutV2 for layout detection
 * 2. PaddleOCR-VL for recognition (tables with proper structure)
 * 3. Structured Markdown output with tables
 * 4. MiniCPM extracts transactions from structured tables
 *
 * The structured Markdown has properly formatted tables,
 * making it much easier for MiniCPM to extract transaction data.
 */
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execSync } from 'child_process';
import * as os from 'os';
import { ensurePaddleOcrVlFull, ensureMiniCpm } from './helpers/docker.js';

const PADDLEOCR_VL_URL = 'http://localhost:8000';
const OLLAMA_URL = 'http://localhost:11434';
const MINICPM_MODEL = 'minicpm-v:latest';

/** One bank-statement line item, as stored in the expected JSON and requested from the model. */
interface ITransaction {
  date: string;
  counterparty: string;
  amount: number;
}

/**
 * Convert PDF to PNG images using ImageMagick.
 *
 * Pages are rendered into a temporary directory that is always removed before
 * returning, so callers only ever see the in-memory image data.
 *
 * @param pdfPath Path of the PDF to render.
 * @returns One base64-encoded PNG per page, in page order.
 * @throws If the `convert` command fails (propagated from `execSync`).
 */
function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');

  // Numeric page index taken from a "page-<n>.png" file name (0 when absent).
  const pageIndex = (fileName: string): number =>
    Number.parseInt(/(\d+)\.png$/.exec(fileName)?.[1] ?? '0', 10);

  try {
    execSync(
      `convert -density 300 -quality 100 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
      { stdio: 'pipe' }
    );

    // Sort numerically: a plain string sort would put page-10 before page-2.
    const files = fs
      .readdirSync(tempDir)
      .filter((f: string) => f.endsWith('.png'))
      .sort((a, b) => pageIndex(a) - pageIndex(b) || a.localeCompare(b));

    return files.map((file) => fs.readFileSync(path.join(tempDir, file)).toString('base64'));
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}

/**
 * Parse document using PaddleOCR-VL Full Pipeline (returns structured Markdown).
 *
 * @param imageBase64 Base64-encoded page image.
 * @returns The Markdown from `result.markdown`, or an empty string when absent.
 * @throws On a non-OK HTTP status or when the response reports `success` as falsy.
 */
async function parseDocument(imageBase64: string): Promise<string> {
  const response = await fetch(`${PADDLEOCR_VL_URL}/parse`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      image: imageBase64,
      output_format: 'markdown',
    }),
  });

  if (!response.ok) {
    const text = await response.text();
    throw new Error(`PaddleOCR-VL API error: ${response.status} - ${text}`);
  }

  const data = await response.json();
  if (!data.success) {
    throw new Error(`PaddleOCR-VL error: ${data.error}`);
  }

  return data.result?.markdown || '';
}

/**
 * Coerce the model's parsed JSON into a list of well-formed transactions.
 *
 * Non-object entries are dropped so later comparisons never dereference
 * `null` or a scalar. String amounts are converted with `Number()`; anything
 * else that is not a number becomes `NaN`, which never matches an expected
 * amount.
 *
 * @throws If the parsed value is not an array.
 */
function normalizeTransactions(parsed: unknown): ITransaction[] {
  if (!Array.isArray(parsed)) {
    throw new Error('Model output is not a JSON array');
  }

  const transactions: ITransaction[] = [];
  for (const entry of parsed) {
    if (typeof entry !== 'object' || entry === null) continue;
    const { date, counterparty, amount } = entry as Record<string, unknown>;
    transactions.push({
      date: typeof date === 'string' ? date : '',
      counterparty: typeof counterparty === 'string' ? counterparty : '',
      amount:
        typeof amount === 'number' ? amount : typeof amount === 'string' ? Number(amount) : Number.NaN,
    });
  }
  return transactions;
}

/**
 * Extract transactions from structured Markdown using MiniCPM.
 *
 * Sends the prompt to Ollama's streaming generate endpoint, concatenates the
 * `response` fragments of the newline-delimited JSON stream, and parses the
 * span from the first `[` to the last `]` as the transaction array.
 *
 * @param markdown Structured Markdown for one page.
 * @returns The transactions the model reported for that page.
 * @throws On a non-OK HTTP status, a missing response body, when no JSON
 *   array is present in the output, or when that array is not valid JSON.
 */
async function extractTransactionsFromMarkdown(markdown: string): Promise<ITransaction[]> {
  console.log(` [Extract] Processing ${markdown.length} chars of Markdown`);

  // The prompt is sent as a single space-separated line.
  const prompt = [
    '/nothink',
    'Convert this bank statement to a JSON array of transactions.',
    'Read the Amount values carefully:',
    '- "- 21,47 €" means DEBIT, output as: -21.47',
    '- "+ 1.000,00 €" means CREDIT, output as: 1000.00',
    '- European format: comma = decimal point, dot = thousands',
    'For each transaction output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}',
    'Return ONLY the JSON array, no explanation.',
    'Document:',
    markdown,
  ].join(' ');

  const payload = {
    model: MINICPM_MODEL,
    prompt,
    stream: true,
    options: {
      num_predict: 16384,
      temperature: 0.1,
    },
  };

  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }

  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('No response body');
  }

  const decoder = new TextDecoder();
  let fullText = '';
  // Tail of the stream that has not been terminated by a newline yet.
  let pending = '';

  const consumeLine = (line: string): void => {
    if (!line.trim()) return;
    try {
      const json = JSON.parse(line);
      if (json.response) {
        fullText += json.response;
      }
    } catch {
      // Skip invalid JSON lines
    }
  };

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    // A network chunk can end in the middle of a JSON line, so only complete
    // lines are parsed and the remainder is carried over to the next read.
    pending += decoder.decode(value, { stream: true });
    const lines = pending.split('\n');
    pending = lines.pop() ?? '';
    for (const line of lines) {
      consumeLine(line);
    }
  }

  // Flush any bytes still buffered in the decoder plus the final unterminated line.
  pending += decoder.decode();
  consumeLine(pending);

  // Extract JSON array from response
  const startIdx = fullText.indexOf('[');
  const endIdx = fullText.lastIndexOf(']') + 1;

  if (startIdx < 0 || endIdx <= startIdx) {
    throw new Error(`No JSON array found in response: ${fullText.substring(0, 200)}`);
  }

  const jsonStr = fullText.substring(startIdx, endIdx);
  return normalizeTransactions(JSON.parse(jsonStr));
}

/**
 * Extract transactions from all pages of a bank statement.
 *
 * Pages are processed one after another. A failure while extracting
 * transactions from a page is logged and that page contributes nothing;
 * a failure in `parseDocument` is not caught here and propagates.
 *
 * @param images Base64-encoded page images.
 * @returns Transactions from all pages, concatenated in page order.
 */
async function extractAllTransactions(images: string[]): Promise<ITransaction[]> {
  const allTransactions: ITransaction[] = [];

  for (const [i, image] of images.entries()) {
    console.log(` Processing page ${i + 1}/${images.length}...`);

    // Parse with full pipeline
    const markdown = await parseDocument(image);
    console.log(` [Parse] Got ${markdown.split('\n').length} lines of Markdown`);

    // Extract transactions
    try {
      const transactions = await extractTransactionsFromMarkdown(markdown);
      console.log(` [Extracted] ${transactions.length} transactions`);
      allTransactions.push(...transactions);
    } catch (err) {
      console.log(` [Error] ${err}`);
    }
  }

  return allTransactions;
}

/**
 * Compare transactions - find matching transaction in expected list.
 *
 * A match needs an identical date, amounts within 0.02 of each other, and one
 * counterparty (lower-cased) containing the first 10 characters of the other.
 * This helper is not called anywhere else in this file.
 *
 * @returns The first matching expected transaction, or `undefined`.
 */
function findMatchingTransaction(
  tx: ITransaction,
  expectedList: ITransaction[]
): ITransaction | undefined {
  return expectedList.find((exp) => {
    const dateMatch = tx.date === exp.date;
    const amountMatch = Math.abs(tx.amount - exp.amount) < 0.02;
    const counterpartyMatch =
      tx.counterparty?.toLowerCase().includes(exp.counterparty?.toLowerCase().slice(0, 10)) ||
      exp.counterparty?.toLowerCase().includes(tx.counterparty?.toLowerCase().slice(0, 10));
    return dateMatch && amountMatch && counterpartyMatch;
  });
}

/**
 * Calculate extraction accuracy.
 *
 * Each extracted transaction is paired with the first not-yet-used expected
 * transaction that has the same date and an amount within 0.02. Counterparty
 * is not compared. Every expected entry can be matched at most once.
 *
 * @returns Match count, expected count, and the matched share of expected
 *   entries as a percentage (0 when `expected` is empty).
 */
function calculateAccuracy(
  extracted: ITransaction[],
  expected: ITransaction[]
): { matched: number; total: number; accuracy: number } {
  let matched = 0;
  const usedExpected = new Set<number>();

  for (const tx of extracted) {
    const hit = expected.findIndex(
      (exp, i) =>
        !usedExpected.has(i) && tx.date === exp.date && Math.abs(tx.amount - exp.amount) < 0.02
    );
    if (hit >= 0) {
      matched++;
      usedExpected.add(hit);
    }
  }

  return {
    matched,
    total: expected.length,
    accuracy: expected.length > 0 ? (matched / expected.length) * 100 : 0,
  };
}

/**
 * Find all test cases (PDF + JSON pairs) in .nogit/bankstatements/.
 *
 * A PDF is only included when a JSON file with the same base name sits next
 * to it.
 *
 * @returns Test cases sorted by name; empty when the directory does not exist.
 */
function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const testDir = path.join(process.cwd(), '.nogit/bankstatements');
  if (!fs.existsSync(testDir)) {
    return [];
  }

  const files = fs.readdirSync(testDir);
  const pdfFiles = files.filter((f) => f.endsWith('.pdf'));
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];

  for (const pdf of pdfFiles) {
    // Strip only the trailing extension; String.replace would remove the
    // first ".pdf" found anywhere in the name.
    const baseName = path.basename(pdf, '.pdf');
    const jsonFile = `${baseName}.json`;
    if (files.includes(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }

  testCases.sort((a, b) => a.name.localeCompare(b.name));
  return testCases;
}

// Tests

tap.test('setup: ensure Docker containers are running', async () => {
  console.log('\n[Setup] Checking Docker containers...\n');

  // Ensure PaddleOCR-VL Full Pipeline is running
  const paddleOk = await ensurePaddleOcrVlFull();
  expect(paddleOk).toBeTrue();

  // Ensure MiniCPM is running (for field extraction from Markdown)
  const minicpmOk = await ensureMiniCpm();
  expect(minicpmOk).toBeTrue();

  console.log('\n[Setup] All containers ready!\n');
});

// Dynamic test for each PDF/JSON pair
const testCases = findTestCases();
console.log(`\nFound ${testCases.length} bank statement test cases (PaddleOCR-VL Full Pipeline)\n`);

// Per-statement results, filled in by the dynamic tests and read by the summary.
const results: Array<{ name: string; accuracy: number; matched: number; total: number }> = [];

for (const testCase of testCases) {
  tap.test(`should extract bank statement: ${testCase.name}`, async () => {
    // Load expected data
    const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
    console.log(`\n=== ${testCase.name} ===`);
    console.log(`Expected: ${expected.length} transactions`);

    const startTime = Date.now();

    // Convert PDF to images
    const images = convertPdfToImages(testCase.pdfPath);
    console.log(` Pages: ${images.length}`);

    // Extract all transactions
    const extracted = await extractAllTransactions(images);

    const endTime = Date.now();
    const elapsedMs = endTime - startTime;

    // Calculate accuracy
    const accuracy = calculateAccuracy(extracted, expected);
    results.push({
      name: testCase.name,
      accuracy: accuracy.accuracy,
      matched: accuracy.matched,
      total: accuracy.total,
    });

    console.log(` Extracted: ${extracted.length} transactions`);
    console.log(` Matched: ${accuracy.matched}/${accuracy.total} (${accuracy.accuracy.toFixed(1)}%)`);
    console.log(` Time: ${(elapsedMs / 1000).toFixed(1)}s`);

    // Require strictly more than 50% accuracy (exactly 50% fails)
    expect(accuracy.accuracy).toBeGreaterThan(50);
  });
}

tap.test('summary', async () => {
  const totalStatements = results.length;
  const avgAccuracy =
    results.length > 0 ? results.reduce((a, b) => a + b.accuracy, 0) / results.length : 0;
  const totalMatched = results.reduce((a, b) => a + b.matched, 0);
  const totalExpected = results.reduce((a, b) => a + b.total, 0);

  console.log(`\n======================================================`);
  console.log(` Bank Statement Extraction Summary (PaddleOCR-VL Full)`);
  console.log(`======================================================`);
  console.log(` Method: PaddleOCR-VL Full Pipeline -> MiniCPM`);
  console.log(` Statements: ${totalStatements}`);
  console.log(` Transactions: ${totalMatched}/${totalExpected} matched`);
  console.log(` Avg accuracy: ${avgAccuracy.toFixed(1)}%`);
  console.log(`======================================================\n`);
});

export default tap.start();