/**
 * Invoice extraction test using MiniCPM-V (visual) + PaddleOCR-VL (OCR augmentation)
 *
 * This is the combined approach that uses both models for best accuracy:
 * - MiniCPM-V for visual understanding
 * - PaddleOCR-VL for OCR text to augment prompts
 */
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execSync } from 'child_process';
import * as os from 'os';
import { ensurePaddleOcrVl, ensureMiniCpm } from './helpers/docker.js';

const OLLAMA_URL = 'http://localhost:11434';
const MODEL = 'minicpm-v:latest';
const PADDLEOCR_VL_URL = 'http://localhost:8000';

/** Fields extracted from an invoice and stored in the expected-result JSON files. */
interface IInvoice {
  invoice_number: string;
  invoice_date: string;
  vendor_name: string;
  currency: string;
  net_amount: number;
  vat_amount: number;
  total_amount: number;
}

/**
 * Extract OCR text from an image using PaddleOCR-VL (OpenAI-compatible API).
 *
 * Best-effort: returns an empty string when the service cannot be reached,
 * answers with a non-OK status, or returns no message content.
 *
 * @param imageBase64 Base64-encoded PNG image data (without the data-URL prefix).
 * @returns The OCR text, or `''` when none could be obtained.
 */
async function extractOcrText(imageBase64: string): Promise<string> {
  try {
    const response = await fetch(`${PADDLEOCR_VL_URL}/v1/chat/completions`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        model: 'paddleocr-vl',
        messages: [
          {
            role: 'user',
            content: [
              { type: 'image_url', image_url: { url: `data:image/png;base64,${imageBase64}` } },
              { type: 'text', text: 'OCR:' },
            ],
          },
        ],
        temperature: 0.0,
        max_tokens: 4096,
      }),
    });

    if (!response.ok) return '';

    const data = (await response.json()) as {
      choices?: Array<{ message?: { content?: string } }>;
    };
    return data.choices?.[0]?.message?.content || '';
  } catch {
    // PaddleOCR-VL unavailable
  }
  return '';
}

/**
 * Build the extraction prompt, optionally augmented with OCR text.
 *
 * @param ocrText OCR text to append for cross-referencing; pass `''` to get the
 *   base prompt only. Text longer than 4000 characters is truncated.
 * @returns The prompt string sent to the vision model.
 */
function buildPrompt(ocrText: string): string {
  // Lines are joined explicitly so the prompt layout does not depend on
  // source-file indentation.
  const base = [
    '/nothink',
    'You are an invoice parser. Extract the following fields from this invoice:',
    '',
    '1. invoice_number: The invoice/receipt number',
    '2. invoice_date: Date in YYYY-MM-DD format',
    '3. vendor_name: Company that issued the invoice',
    '4. currency: EUR, USD, etc.',
    '5. net_amount: Amount before tax (if shown)',
    '6. vat_amount: Tax/VAT amount (if shown, 0 if reverse charge or no tax)',
    '7. total_amount: Final amount due',
    '',
    'Return ONLY valid JSON in this exact format:',
    '{"invoice_number":"XXX","invoice_date":"YYYY-MM-DD","vendor_name":"Company Name","currency":"EUR","net_amount":100.00,"vat_amount":19.00,"total_amount":119.00}',
    '',
    'If a field is not visible, use null for strings or 0 for numbers.',
    'No explanation, just the JSON object.',
  ].join('\n');

  if (!ocrText) {
    return base;
  }

  // Limit OCR text to prevent context overflow
  const maxOcrLength = 4000;
  const truncatedOcr =
    ocrText.length > maxOcrLength
      ? ocrText.substring(0, maxOcrLength) + '\n... (truncated)'
      : ocrText;

  return [
    base,
    '',
    'OCR text extracted from the invoice (use for reference):',
    '---',
    truncatedOcr,
    '---',
    '',
    'Cross-reference the image with the OCR text above for accuracy.',
  ].join('\n');
}

/**
 * Convert a PDF to PNG images using ImageMagick (`convert`).
 *
 * Pages are rendered into a temporary directory, read back as base64 and the
 * directory is removed again, even when conversion fails.
 *
 * @param pdfPath Path of the PDF to convert.
 * @returns Base64-encoded PNG data, one entry per generated page, in page order.
 * @throws If the `convert` command exits with an error.
 */
function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');

  // Numeric page index taken from "page-<n>.png"; a plain string sort would
  // place page-10 before page-2.
  const pageIndex = (fileName: string): number => {
    const match = /(\d+)\.png$/.exec(fileName);
    return match ? Number(match[1]) : 0;
  };

  try {
    // NOTE(review): pdfPath is interpolated into a shell command line; callers
    // must only pass trusted paths (here: files discovered in the test directory).
    execSync(
      `convert -density 200 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
      { stdio: 'pipe' }
    );

    return fs
      .readdirSync(tempDir)
      .filter((f) => f.endsWith('.png'))
      .sort((a, b) => pageIndex(a) - pageIndex(b) || a.localeCompare(b))
      .map((file) => fs.readFileSync(path.join(tempDir, file)).toString('base64'));
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}

/**
 * Single extraction pass against the Ollama generate endpoint.
 *
 * The response is consumed as a stream of newline-delimited JSON records whose
 * `response` fragments are concatenated; the outermost `{...}` span of the
 * concatenated text is then parsed as the invoice.
 *
 * @param images Base64-encoded page images.
 * @param passNum Pass number supplied by the caller; not used in the request.
 * @param ocrText Optional OCR text to embed in the prompt.
 * @returns The parsed JSON object (not validated against `IInvoice`).
 * @throws If the HTTP request fails, the body is missing, no JSON object is
 *   found in the model output, or the JSON cannot be parsed.
 */
async function extractOnce(images: string[], passNum: number, ocrText: string = ''): Promise<IInvoice> {
  const payload = {
    model: MODEL,
    prompt: buildPrompt(ocrText),
    images,
    stream: true,
    options: {
      num_predict: 2048,
      temperature: 0.1,
    },
  };

  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }

  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('No response body');
  }

  const decoder = new TextDecoder();
  let fullText = '';
  // Carries an incomplete trailing line over to the next chunk: network chunk
  // boundaries do not have to coincide with record boundaries.
  let pending = '';

  const consumeLine = (line: string): void => {
    if (!line.trim()) return;
    try {
      const json = JSON.parse(line) as { response?: string };
      if (json.response) {
        fullText += json.response;
      }
    } catch {
      // Skip invalid JSON lines
    }
  };

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    pending += decoder.decode(value, { stream: true });
    const lines = pending.split('\n');
    pending = lines.pop() ?? '';
    for (const line of lines) {
      consumeLine(line);
    }
  }

  // Flush bytes still buffered in the decoder and a final unterminated record.
  pending += decoder.decode();
  consumeLine(pending);

  // Extract JSON from response
  const startIdx = fullText.indexOf('{');
  const endIdx = fullText.lastIndexOf('}') + 1;
  if (startIdx < 0 || endIdx <= startIdx) {
    throw new Error(`No JSON object found in response: ${fullText.substring(0, 200)}`);
  }

  const jsonStr = fullText.substring(startIdx, endIdx);
  return JSON.parse(jsonStr) as IInvoice;
}

/**
 * Create a hash of an invoice for comparison (using key fields).
 *
 * The total is normalised to two decimals when it is a finite number (or a
 * string that parses to one); any other value is stringified as-is so that
 * malformed model output still yields a key instead of throwing.
 *
 * @param invoice Invoice to fingerprint.
 * @returns `"<invoice_number>|<invoice_date>|<total>"`.
 */
function hashInvoice(invoice: IInvoice): string {
  const rawTotal: unknown = invoice.total_amount;
  const total = typeof rawTotal === 'string' ? Number.parseFloat(rawTotal) : rawTotal;
  const totalKey =
    typeof total === 'number' && Number.isFinite(total) ? total.toFixed(2) : String(rawTotal);
  return `${invoice.invoice_number}|${invoice.invoice_date}|${totalKey}`;
}

/** Outcome of one extraction pass: either an invoice or the error that was thrown. */
type TPassOutcome = { ok: true; invoice: IInvoice } | { ok: false; error: unknown };

/**
 * Extract with majority voting - run until 2 passes match.
 * Optimization: Pass 1 (no OCR) runs in parallel with OCR -> Pass 2 (with OCR).
 *
 * @param images Base64-encoded page images; OCR is run on the first one only.
 * @param invoiceName Name used in error messages.
 * @param maxPasses Maximum number of extraction passes (default 5).
 * @returns The first invoice whose hash was produced twice, or otherwise the
 *   most frequent result.
 * @throws If `images` is empty or no pass produced a usable result.
 */
async function extractWithConsensus(images: string[], invoiceName: string, maxPasses: number = 5): Promise<IInvoice> {
  const firstPage = images[0];
  if (firstPage === undefined) {
    throw new Error(`No page images for ${invoiceName}`);
  }

  const results: Array<{ invoice: IInvoice; hash: string }> = [];
  const hashCounts = new Map<string, number>();

  // Records a result and returns how often its hash has been seen so far.
  const addResult = (invoice: IInvoice, passLabel: string): number => {
    const hash = hashInvoice(invoice);
    const count = (hashCounts.get(hash) ?? 0) + 1;
    results.push({ invoice, hash });
    hashCounts.set(hash, count);
    console.log(` [${passLabel}] ${invoice.invoice_number} | ${invoice.invoice_date} | ${invoice.total_amount} ${invoice.currency}`);
    return count;
  };

  // Runs one pass and converts a thrown error into a value so that
  // Promise.all below never rejects.
  const attempt = async (passNum: number, ocr: string): Promise<TPassOutcome> => {
    try {
      return { ok: true, invoice: await extractOnce(images, passNum, ocr) };
    } catch (error) {
      return { ok: false, error };
    }
  };

  // OCR then immediately Pass 2
  const ocrThenPass2 = (async () => {
    const text = await extractOcrText(firstPage);
    if (text) {
      console.log(` [OCR] Extracted ${text.split('\n').length} text lines`);
    }
    return { text, outcome: await attempt(2, text) };
  })();

  // Wait for both to complete
  const [pass1, { text: ocrText, outcome: pass2 }] = await Promise.all([
    attempt(1, ''),
    ocrThenPass2,
  ]);

  // Process Pass 1 result
  if (!pass1.ok) {
    console.log(` [Pass 1] Error: ${pass1.error}`);
  } else if (addResult(pass1.invoice, 'Pass 1') >= 2) {
    console.log(` [Consensus] Reached after parallel passes`);
    return pass1.invoice;
  }

  // Process Pass 2 result
  if (!pass2.ok) {
    console.log(` [Pass 2+OCR] Error: ${pass2.error}`);
  } else if (addResult(pass2.invoice, 'Pass 2+OCR') >= 2) {
    console.log(` [Consensus] Reached after parallel passes`);
    return pass2.invoice;
  }

  // Continue with passes 3+ using OCR text if no consensus yet
  for (let pass = 3; pass <= maxPasses; pass++) {
    try {
      const invoice = await extractOnce(images, pass, ocrText);
      if (addResult(invoice, `Pass ${pass}+OCR`) >= 2) {
        console.log(` [Consensus] Reached after ${pass} passes`);
        return invoice;
      }
    } catch (err) {
      console.log(` [Pass ${pass}] Error: ${err}`);
    }
  }

  // No consensus reached - return the most common result (earliest wins ties)
  let best: { invoice: IInvoice; hash: string } | undefined;
  let bestCount = 0;
  for (const candidate of results) {
    const count = hashCounts.get(candidate.hash) ?? 0;
    if (count > bestCount) {
      bestCount = count;
      best = candidate;
    }
  }

  if (!best) {
    throw new Error(`No valid results for ${invoiceName}`);
  }

  console.log(` [No consensus] Using most common result (${bestCount}/${maxPasses} passes)`);
  return best.invoice;
}

/**
 * Compare extracted invoice against expected.
 *
 * Checks invoice number (whitespace- and case-insensitive), date (exact),
 * total amount (tolerance 0.02) and currency (case-insensitive).
 *
 * @param extracted Invoice produced by the model; fields may be missing or of
 *   the wrong type because the value comes from unvalidated JSON.
 * @param expected Reference invoice.
 * @returns `match` plus one human-readable message per mismatching field.
 */
function compareInvoice(
  extracted: IInvoice,
  expected: IInvoice
): { match: boolean; errors: string[] } {
  const errors: string[] = [];
  const amountTolerance = 0.02;

  // Stringify first so a numeric invoice number from the model cannot throw.
  const normalizeNumber = (value: unknown): string =>
    String(value ?? '').replace(/\s/g, '').toLowerCase();
  const normalizeCurrency = (value: unknown): string | undefined =>
    value == null ? undefined : String(value).toUpperCase();

  // Compare invoice number (normalize by removing spaces and case)
  if (normalizeNumber(extracted.invoice_number) !== normalizeNumber(expected.invoice_number)) {
    errors.push(`invoice_number: expected "${expected.invoice_number}", got "${extracted.invoice_number}"`);
  }

  // Compare date
  if (extracted.invoice_date !== expected.invoice_date) {
    errors.push(`invoice_date: expected "${expected.invoice_date}", got "${extracted.invoice_date}"`);
  }

  // Compare total amount (with tolerance). Written as a negated "<=" so that a
  // NaN difference (missing or non-numeric amount) is reported as a mismatch.
  const totalDiff = Math.abs(extracted.total_amount - expected.total_amount);
  if (!(totalDiff <= amountTolerance)) {
    errors.push(`total_amount: expected ${expected.total_amount}, got ${extracted.total_amount}`);
  }

  // Compare currency
  if (normalizeCurrency(extracted.currency) !== normalizeCurrency(expected.currency)) {
    errors.push(`currency: expected "${expected.currency}", got "${extracted.currency}"`);
  }

  return { match: errors.length === 0, errors };
}

/**
 * Find all test cases (PDF + JSON pairs) in .nogit/invoices/
 * Priority invoices (like vodafone) run first for quick feedback.
 *
 * @returns One entry per `<name>.pdf` that has a sibling `<name>.json`; an
 *   empty array when the directory does not exist.
 */
function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const testDir = path.join(process.cwd(), '.nogit/invoices');
  if (!fs.existsSync(testDir)) {
    return [];
  }

  const pdfExt = '.pdf';
  const files = fs.readdirSync(testDir);
  const fileSet = new Set(files);
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];

  for (const pdf of files) {
    if (!pdf.endsWith(pdfExt)) continue;

    // Strip the suffix only; String.replace would remove the first ".pdf"
    // occurrence anywhere in the name.
    const baseName = pdf.slice(0, -pdfExt.length);
    const jsonFile = `${baseName}.json`;
    if (fileSet.has(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }

  // Sort with priority invoices first, then alphabetically
  const priorityPrefixes = ['vodafone'];
  const priorityOf = (name: string): number =>
    priorityPrefixes.findIndex((prefix) => name.startsWith(prefix));

  testCases.sort((a, b) => {
    const aPriority = priorityOf(a.name);
    const bPriority = priorityOf(b.name);

    // Both have priority - sort by priority order
    if (aPriority >= 0 && bPriority >= 0) return aPriority - bPriority;
    // Only a has priority - a comes first
    if (aPriority >= 0) return -1;
    // Only b has priority - b comes first
    if (bPriority >= 0) return 1;
    // Neither has priority - alphabetical
    return a.name.localeCompare(b.name);
  });

  return testCases;
}

// Tests

tap.test('setup: ensure Docker containers are running', async () => {
  console.log('\n[Setup] Checking Docker containers...\n');

  // Ensure PaddleOCR-VL is running (auto-detects GPU/CPU)
  const paddleOk = await ensurePaddleOcrVl();
  expect(paddleOk).toBeTrue();

  // Ensure MiniCPM is running
  const minicpmOk = await ensureMiniCpm();
  expect(minicpmOk).toBeTrue();

  console.log('\n[Setup] All containers ready!\n');
});

tap.test('should have MiniCPM-V 4.5 model loaded', async () => {
  const response = await fetch(`${OLLAMA_URL}/api/tags`);
  const data = (await response.json()) as { models?: Array<{ name: string }> };
  const modelNames = (data.models ?? []).map((m) => m.name);
  // Assert on the exact model every extraction request uses, so a missing
  // model fails here rather than as an API error inside each invoice test.
  expect(modelNames.includes(MODEL)).toBeTrue();
});

// Dynamic test for each PDF/JSON pair
const testCases = findTestCases();
console.log(`\nFound ${testCases.length} invoice test cases\n`);

let passedCount = 0;
let failedCount = 0;
const processingTimes: number[] = [];

for (const testCase of testCases) {
  tap.test(`should extract invoice: ${testCase.name}`, async () => {
    // Load expected data
    const expected = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8')) as IInvoice;
    console.log(`\n=== ${testCase.name} ===`);
    console.log(`Expected: ${expected.invoice_number} | ${expected.invoice_date} | ${expected.total_amount} ${expected.currency}`);

    const startTime = Date.now();

    let extracted: IInvoice;
    try {
      // Convert PDF to images
      const images = convertPdfToImages(testCase.pdfPath);
      console.log(` Pages: ${images.length}`);

      // Extract with consensus voting
      extracted = await extractWithConsensus(images, testCase.name);
    } catch (err) {
      // Count hard failures too, so passed + failed adds up in the summary.
      failedCount++;
      throw err;
    }

    const elapsedMs = Date.now() - startTime;
    processingTimes.push(elapsedMs);

    // Compare results
    const result = compareInvoice(extracted, expected);

    if (result.match) {
      passedCount++;
      console.log(` Result: MATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
    } else {
      failedCount++;
      console.log(` Result: MISMATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
      result.errors.forEach((e) => console.log(` - ${e}`));
    }

    // Assert match
    expect(result.match).toBeTrue();
  });
}

tap.test('summary', async () => {
  const totalInvoices = testCases.length;
  const accuracy = totalInvoices > 0 ? (passedCount / totalInvoices) * 100 : 0;
  const totalTimeMs = processingTimes.reduce((a, b) => a + b, 0);
  const avgTimeMs = processingTimes.length > 0 ? totalTimeMs / processingTimes.length : 0;
  const avgTimeSec = avgTimeMs / 1000;
  const totalTimeSec = totalTimeMs / 1000;

  console.log(`\n========================================`);
  console.log(` Invoice Extraction Summary`);
  console.log(`========================================`);
  console.log(` Passed: ${passedCount}/${totalInvoices}`);
  console.log(` Failed: ${failedCount}/${totalInvoices}`);
  console.log(` Accuracy: ${accuracy.toFixed(1)}%`);
  console.log(`----------------------------------------`);
  console.log(` Total time: ${totalTimeSec.toFixed(1)}s`);
  console.log(` Avg per inv: ${avgTimeSec.toFixed(1)}s`);
  console.log(`========================================\n`);
});

export default tap.start();