import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execFileSync, execSync } from 'child_process';
import * as os from 'os';

// Service URLs
const OLLAMA_URL = 'http://localhost:11434';
const PADDLEOCR_VL_URL = 'http://localhost:8000';

// Models
const MINICPM_MODEL = 'openbmb/minicpm-v4.5:q8_0';
const PADDLEOCR_VL_MODEL = 'paddleocr-vl';

// Prompt for MiniCPM-V visual extraction
const MINICPM_EXTRACT_PROMPT = `/nothink You are a bank statement parser. Extract EVERY transaction from the table. Read the Amount column carefully: - "- 21,47 €" means DEBIT, output as: -21.47 - "+ 1.000,00 €" means CREDIT, output as: 1000.00 - European format: comma = decimal point For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47} Do not skip any rows. Return ONLY the JSON array, no explanation.`;

// Prompt for PaddleOCR-VL table extraction
const PADDLEOCR_VL_TABLE_PROMPT = `Table Recognition:`;

// Post-processing prompt to convert PaddleOCR-VL output to JSON.
// `{TABLE_DATA}` is a placeholder substituted in convertTableToTransactions().
const PADDLEOCR_VL_CONVERT_PROMPT = `/nothink Convert the following bank statement table data to JSON. Read the Amount values carefully: - "- 21,47 €" means DEBIT, output as: -21.47 - "+ 1.000,00 €" means CREDIT, output as: 1000.00 - European format: comma = decimal point For each transaction output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47} Return ONLY the JSON array, no explanation. \nTable data: --- {TABLE_DATA} ---`;

/**
 * One bank-statement row as requested from the models by the prompts above.
 */
interface ITransaction {
  date: string;
  counterparty: string;
  amount: number;
}

/**
 * Convert PDF to PNG images using ImageMagick.
 *
 * @param pdfPath Path of the PDF to rasterize.
 * @returns One base64-encoded PNG per page, in page order.
 * @throws If the `convert` command fails.
 */
function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');

  try {
    // Argument array instead of a shell string: paths containing quotes, `$`
    // or backticks are passed to ImageMagick verbatim.
    execFileSync(
      'convert',
      [
        '-density',
        '300',
        '-quality',
        '100',
        pdfPath,
        '-background',
        'white',
        '-alpha',
        'remove',
        outputPattern,
      ],
      { stdio: 'pipe' }
    );

    // Sort by the numeric page index; a plain string sort would place
    // "page-10.png" before "page-2.png".
    const pageIndex = (file: string): number => Number(/(\d+)\.png$/.exec(file)?.[1] ?? 0);
    const files = fs
      .readdirSync(tempDir)
      .filter((f: string) => f.endsWith('.png'))
      .sort((a, b) => pageIndex(a) - pageIndex(b) || a.localeCompare(b));

    return files.map((file) => fs.readFileSync(path.join(tempDir, file)).toString('base64'));
  } finally {
    // Always remove the temporary directory, even when conversion fails.
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}

/**
 * POST a payload to Ollama's /api/generate endpoint.
 *
 * @throws If the HTTP status is not OK.
 */
async function requestOllamaGenerate(payload: Record<string, unknown>): Promise<Response> {
  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }
  return response;
}

/**
 * Read a newline-delimited JSON stream to its end and concatenate the
 * `response` fields of all records.
 *
 * @param response Streaming HTTP response from Ollama.
 * @param onText Optional callback invoked with each text fragment as it arrives.
 * @returns The full generated text.
 * @throws If the response has no body or a record carries an `error` field.
 */
async function readOllamaStream(
  response: Response,
  onText?: (text: string) => void
): Promise<string> {
  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('No response body');
  }

  const decoder = new TextDecoder();
  let fullText = '';
  // Holds the trailing partial line of the previous chunk: a JSON record may be
  // split across network chunks and must be reassembled before parsing.
  let pending = '';

  const consumeLine = (line: string): void => {
    if (!line.trim()) {
      return;
    }
    let record: unknown;
    try {
      record = JSON.parse(line);
    } catch {
      // Skip invalid JSON lines
      return;
    }
    if (record === null || typeof record !== 'object') {
      return;
    }
    const { response: text, error } = record as { response?: unknown; error?: unknown };
    if (error) {
      throw new Error(`Ollama stream error: ${String(error)}`);
    }
    if (typeof text === 'string' && text) {
      fullText += text;
      onText?.(text);
    }
  };

  let completed = false;
  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      pending += decoder.decode(value, { stream: true });
      const lines = pending.split('\n');
      pending = lines.pop() ?? '';
      for (const line of lines) {
        consumeLine(line);
      }
    }
    // Flush bytes still buffered in the decoder and a final unterminated line.
    pending += decoder.decode();
    consumeLine(pending);
    completed = true;
  } finally {
    if (!completed) {
      // Abandon the rest of the stream when bailing out early.
      await reader.cancel().catch(() => undefined);
    }
  }

  return fullText;
}

/**
 * Extract the outermost `[...]` span from model output and validate it as a
 * list of transactions.
 *
 * @throws If no array is found, the JSON is invalid, or an entry lacks a string
 *   `date` or a finite numeric `amount`.
 */
function parseTransactionArray(text: string): ITransaction[] {
  const startIdx = text.indexOf('[');
  const endIdx = text.lastIndexOf(']') + 1;

  if (startIdx < 0 || endIdx <= startIdx) {
    throw new Error('No JSON array found in response');
  }

  const parsed: unknown = JSON.parse(text.substring(startIdx, endIdx));
  if (!Array.isArray(parsed)) {
    throw new Error('No JSON array found in response');
  }

  return parsed.map((entry: unknown, index: number): ITransaction => {
    if (entry === null || typeof entry !== 'object') {
      throw new Error(`Transaction ${index} is not an object`);
    }
    const { date, counterparty, amount } = entry as Record<string, unknown>;
    if (typeof date !== 'string') {
      throw new Error(`Transaction ${index} has no string "date"`);
    }
    if (typeof amount !== 'number' || !Number.isFinite(amount)) {
      throw new Error(`Transaction ${index} has no numeric "amount"`);
    }
    return {
      date,
      counterparty: typeof counterparty === 'string' ? counterparty : String(counterparty ?? ''),
      amount,
    };
  });
}

/**
 * Extract using MiniCPM-V via Ollama.
 *
 * Streams the generation, echoing completed output lines to the console as
 * they arrive, then parses the JSON array contained in the full text.
 *
 * @param images Base64-encoded page images.
 * @param passLabel Label used as a log prefix.
 */
async function extractWithMiniCPM(images: string[], passLabel: string): Promise<ITransaction[]> {
  const response = await requestOllamaGenerate({
    model: MINICPM_MODEL,
    prompt: MINICPM_EXTRACT_PROMPT,
    images,
    stream: true,
    options: {
      num_predict: 16384,
      temperature: 0.1,
    },
  });

  console.log(`[${passLabel}] Extracting with MiniCPM-V...`);

  // Echo the model output line by line; fragments rarely end on a newline.
  let echoBuffer = '';
  const fullText = await readOllamaStream(response, (text) => {
    echoBuffer += text;
    const parts = echoBuffer.split('\n');
    echoBuffer = parts.pop() ?? '';
    for (const part of parts) {
      console.log(part);
    }
  });

  if (echoBuffer) {
    console.log(echoBuffer);
  }
  console.log('');

  return parseTransactionArray(fullText);
}

/**
 * Extract table using PaddleOCR-VL via OpenAI-compatible API.
 *
 * @param imageBase64 One base64-encoded PNG page.
 * @returns The text content of the first choice, or '' when there is none.
 * @throws If the HTTP status is not OK.
 */
async function extractTableWithPaddleOCRVL(imageBase64: string): Promise<string> {
  const payload = {
    model: PADDLEOCR_VL_MODEL,
    messages: [
      {
        role: 'user',
        content: [
          {
            type: 'image_url',
            image_url: { url: `data:image/png;base64,${imageBase64}` },
          },
          {
            type: 'text',
            text: PADDLEOCR_VL_TABLE_PROMPT,
          },
        ],
      },
    ],
    temperature: 0.0,
    max_tokens: 8192,
  };

  const response = await fetch(`${PADDLEOCR_VL_URL}/v1/chat/completions`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    const text = await response.text();
    throw new Error(`PaddleOCR-VL API error: ${response.status} - ${text}`);
  }

  const data = (await response.json()) as {
    choices?: Array<{ message?: { content?: unknown } }>;
  };
  const content = data.choices?.[0]?.message?.content;
  return typeof content === 'string' ? content : '';
}

/**
 * Convert PaddleOCR-VL table output to transactions using MiniCPM-V.
 *
 * @param tableData Raw table text produced by PaddleOCR-VL.
 * @param passLabel Label used as a log prefix.
 */
async function convertTableToTransactions(
  tableData: string,
  passLabel: string
): Promise<ITransaction[]> {
  // Replacer function: with a plain string replacement, sequences such as "$&"
  // or "$'" inside the OCR text would be expanded as substitution patterns.
  const prompt = PADDLEOCR_VL_CONVERT_PROMPT.replace('{TABLE_DATA}', () => tableData);

  const response = await requestOllamaGenerate({
    model: MINICPM_MODEL,
    prompt,
    stream: true,
    options: {
      num_predict: 16384,
      temperature: 0.1,
    },
  });

  console.log(`[${passLabel}] Converting table data to JSON...`);

  const fullText = await readOllamaStream(response);
  return parseTransactionArray(fullText);
}

/**
 * Extract using PaddleOCR-VL (table recognition) + conversion.
 *
 * @param images Base64-encoded page images, processed sequentially.
 * @param passLabel Label used as a log prefix.
 * @throws If no page yields any table text.
 */
async function extractWithPaddleOCRVL(
  images: string[],
  passLabel: string
): Promise<ITransaction[]> {
  console.log(`[${passLabel}] Extracting tables with PaddleOCR-VL...`);

  // Extract table data from each page
  const tableDataParts: string[] = [];
  for (let i = 0; i < images.length; i++) {
    console.log(`[${passLabel}] Processing page ${i + 1}/${images.length}...`);
    const tableData = await extractTableWithPaddleOCRVL(images[i]);
    if (tableData.trim()) {
      tableDataParts.push(`--- Page ${i + 1} ---\n${tableData}`);
    }
  }

  const combinedTableData = tableDataParts.join('\n\n');
  console.log(`[${passLabel}] Got ${combinedTableData.length} chars of table data`);

  // Do not ask the conversion model to turn empty input into transactions.
  if (tableDataParts.length === 0) {
    throw new Error('PaddleOCR-VL returned no table data');
  }

  // Convert to transactions
  return convertTableToTransactions(combinedTableData, passLabel);
}

/**
 * Create a hash of transactions for comparison.
 *
 * Only date and amount (two decimals) take part, and entries are sorted, so
 * the hash ignores counterparty text and row order. An empty list hashes to ''.
 */
function hashTransactions(transactions: ITransaction[]): string {
  return transactions
    .map((t) => `${t.date}|${t.amount.toFixed(2)}`)
    .sort()
    .join(';');
}

/**
 * Check if PaddleOCR-VL service is available.
 *
 * @returns true when GET /health answers with an OK status within 5 seconds;
 *   false on any other status, timeout, or network error.
 */
async function isPaddleOCRVLAvailable(): Promise<boolean> {
  try {
    const response = await fetch(`${PADDLEOCR_VL_URL}/health`, {
      method: 'GET',
      signal: AbortSignal.timeout(5000),
    });
    return response.ok;
  } catch {
    return false;
  }
}

/**
 * Extract with dual-VLM consensus
 * Strategy:
 *   Pass 1 = MiniCPM-V visual extraction
 *   Pass 2 = PaddleOCR-VL table recognition (if available)
 *   Pass 3+ = MiniCPM-V visual (fallback)
 *
 * Returns as soon as two passes produce the same hash. If that never happens
 * within `maxPasses`, the most frequent result (earliest on ties) is returned.
 *
 * @param images Base64-encoded page images.
 * @param maxPasses Upper bound on the pass number.
 * @throws If every pass failed.
 */
async function extractWithConsensus(
  images: string[],
  maxPasses: number = 5
): Promise<ITransaction[]> {
  const results: Array<{ transactions: ITransaction[]; hash: string }> = [];
  const hashCounts: Map<string, number> = new Map();

  // Records a pass result and returns how often its hash has now been seen.
  const addResult = (transactions: ITransaction[], passLabel: string): number => {
    const hash = hashTransactions(transactions);
    const count = (hashCounts.get(hash) ?? 0) + 1;
    results.push({ transactions, hash });
    hashCounts.set(hash, count);
    console.log(
      `[${passLabel}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)`
    );
    return count;
  };

  // Check if PaddleOCR-VL is available
  const paddleOCRVLAvailable = await isPaddleOCRVLAvailable();
  if (paddleOCRVLAvailable) {
    console.log('[Setup] PaddleOCR-VL service available - using dual-VLM consensus');
  } else {
    console.log('[Setup] PaddleOCR-VL not available - using MiniCPM-V only');
  }

  // Pass 1: MiniCPM-V visual extraction
  try {
    const pass1Result = await extractWithMiniCPM(images, 'Pass 1 MiniCPM-V');
    addResult(pass1Result, 'Pass 1 MiniCPM-V');
  } catch (err) {
    console.log(`[Pass 1] Error: ${err}`);
  }

  // Pass 2: PaddleOCR-VL table recognition (if available)
  if (paddleOCRVLAvailable) {
    try {
      const pass2Result = await extractWithPaddleOCRVL(images, 'Pass 2 PaddleOCR-VL');
      const count = addResult(pass2Result, 'Pass 2 PaddleOCR-VL');
      if (count >= 2) {
        console.log('[Consensus] MiniCPM-V and PaddleOCR-VL extractions match!');
        return pass2Result;
      }
    } catch (err) {
      console.log(`[Pass 2 PaddleOCR-VL] Error: ${err}`);
    }
  }

  // Pass 3+: Continue with MiniCPM-V visual passes
  const startPass = paddleOCRVLAvailable ? 3 : 2;

  for (let pass = startPass; pass <= maxPasses; pass++) {
    try {
      const transactions = await extractWithMiniCPM(images, `Pass ${pass} MiniCPM-V`);
      const count = addResult(transactions, `Pass ${pass} MiniCPM-V`);

      if (count >= 2) {
        console.log(`[Consensus] Reached after ${pass} passes`);
        return transactions;
      }

      console.log(`[Pass ${pass}] No consensus yet, trying again...`);
    } catch (err) {
      console.log(`[Pass ${pass}] Error: ${err}`);
    }
  }

  // No consensus reached - return the most common result.
  // Emptiness is decided on `results`, not on the hash string: a valid empty
  // transaction list hashes to '' and must not be mistaken for "no results".
  if (results.length === 0) {
    throw new Error('No valid results obtained');
  }

  let best = results[0];
  let bestCount = 0;
  for (const candidate of results) {
    const count = hashCounts.get(candidate.hash) ?? 0;
    if (count > bestCount) {
      bestCount = count;
      best = candidate;
    }
  }

  console.log(`[No consensus] Using most common result (${bestCount}/${maxPasses} passes)`);
  return best.transactions;
}

/**
 * Compare extracted transactions against expected.
 *
 * Comparison is positional: entry i of `extracted` is checked against entry i
 * of `expected` on exact date and on amount within 0.01. Counterparty is not
 * compared.
 *
 * @returns Number of matching rows, number of expected rows, and messages for
 *   missing, mismatching, and surplus rows.
 */
function compareTransactions(
  extracted: ITransaction[],
  expected: ITransaction[]
): { matches: number; total: number; errors: string[] } {
  const errors: string[] = [];
  let matches = 0;

  for (let i = 0; i < expected.length; i++) {
    const exp = expected[i];
    const ext = extracted[i];

    if (!ext) {
      errors.push(`Missing transaction ${i}: ${exp.date} ${exp.counterparty}`);
      continue;
    }

    const dateMatch = ext.date === exp.date;
    const amountMatch = Math.abs(ext.amount - exp.amount) < 0.01;

    if (dateMatch && amountMatch) {
      matches++;
    } else {
      errors.push(
        `Mismatch at ${i}: expected ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`
      );
    }
  }

  if (extracted.length > expected.length) {
    errors.push(`Extra transactions: ${extracted.length - expected.length}`);
  }

  return { matches, total: expected.length, errors };
}

/**
 * Find all test cases (PDF + JSON pairs) in .nogit/
 *
 * A PDF counts as a test case only when a JSON file with the same base name
 * sits next to it. Returns an empty list when the directory does not exist.
 */
function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const testDir = path.join(process.cwd(), '.nogit');
  if (!fs.existsSync(testDir)) {
    return [];
  }

  const pdfSuffix = '.pdf';
  const files = fs.readdirSync(testDir);
  const pdfFiles = files.filter((f: string) => f.endsWith(pdfSuffix));
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];

  for (const pdf of pdfFiles) {
    // Strip the suffix only; String.replace would remove the first ".pdf"
    // occurrence, which is wrong for names such as "a.pdf.scan.pdf".
    const baseName = pdf.slice(0, -pdfSuffix.length);
    const jsonFile = `${baseName}.json`;
    if (files.includes(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }

  return testCases;
}

// Tests

tap.test('should connect to Ollama API', async () => {
  const response = await fetch(`${OLLAMA_URL}/api/tags`);
  expect(response.ok).toBeTrue();

  const data = (await response.json()) as { models?: unknown };
  expect(data.models).toBeArray();
});

tap.test('should have MiniCPM-V 4.5 model loaded', async () => {
  const response = await fetch(`${OLLAMA_URL}/api/tags`);
  const data = (await response.json()) as { models: Array<{ name: string }> };
  const modelNames = data.models.map((m: { name: string }) => m.name);

  expect(modelNames.some((name: string) => name.includes('minicpm-v4.5'))).toBeTrue();
});

tap.test('should check PaddleOCR-VL availability', async () => {
  const available = await isPaddleOCRVLAvailable();
  console.log(`PaddleOCR-VL available: ${available}`);
  // This test passes regardless - PaddleOCR-VL is optional
  expect(true).toBeTrue();
});

// Dynamic test for each PDF/JSON pair
const testCases = findTestCases();
for (const testCase of testCases) {
  tap.test(`should extract transactions from ${testCase.name}`, async () => {
    // Load expected transactions
    const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
    console.log(`\n=== ${testCase.name} ===`);
    console.log(`Expected: ${expected.length} transactions`);

    // Convert PDF to images
    console.log('Converting PDF to images...');
    const images = convertPdfToImages(testCase.pdfPath);
    console.log(`Converted: ${images.length} pages\n`);

    // Extract with dual-VLM consensus
    const extracted = await extractWithConsensus(images);
    console.log(`\nFinal: ${extracted.length} transactions`);

    // Compare results
    const result = compareTransactions(extracted, expected);
    console.log(`Accuracy: ${result.matches}/${result.total}`);

    if (result.errors.length > 0) {
      console.log('Errors:');
      result.errors.forEach((e) => console.log(`  - ${e}`));
    }

    // Assert high accuracy. With no expected rows the ratio would be NaN, so
    // treat that case as fully accurate and let the length check decide.
    const accuracy = result.total === 0 ? 1 : result.matches / result.total;
    expect(accuracy).toBeGreaterThan(0.95);
    expect(extracted.length).toEqual(expected.length);
  });
}

export default tap.start();