/**
 * Invoice extraction test using PaddleOCR-VL Full Pipeline
 *
 * This tests the complete PaddleOCR-VL pipeline:
 * 1. PP-DocLayoutV2 for layout detection
 * 2. PaddleOCR-VL for recognition
 * 3. Structured Markdown output
 * 4. MiniCPM extracts invoice fields from structured Markdown
 *
 * The structured Markdown has proper tables and formatting,
 * making it much easier for MiniCPM to extract invoice data.
 */
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execSync } from 'child_process';
import * as os from 'os';
import { ensurePaddleOcrVlFull, ensureMiniCpm } from './helpers/docker.js';

const PADDLEOCR_VL_URL = 'http://localhost:8000';
const OLLAMA_URL = 'http://localhost:11434';
const MINICPM_MODEL = 'minicpm-v:latest';

/** Invoice fields requested from the model and stored in the expected-result JSON files. */
interface IInvoice {
  invoice_number: string;
  invoice_date: string;
  vendor_name: string;
  currency: string;
  net_amount: number;
  vat_amount: number;
  total_amount: number;
}

/**
 * Convert PDF to PNG images using ImageMagick.
 *
 * Pages are rendered into a temporary directory which is always removed
 * before returning, so only the in-memory base64 payloads survive.
 *
 * @param pdfPath Path of the PDF to rasterize.
 * @returns One base64-encoded PNG per page, in page order.
 * @throws If the `convert` command fails.
 */
function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');

  try {
    execSync(
      `convert -density 200 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
      { stdio: 'pipe' }
    );

    // Numeric-aware ordering: a plain lexicographic sort would place
    // "page-10.png" before "page-2.png" and scramble documents with 10+ pages.
    const files = fs
      .readdirSync(tempDir)
      .filter((f) => f.endsWith('.png'))
      .sort((a, b) => a.localeCompare(b, undefined, { numeric: true }));

    const images: string[] = [];
    for (const file of files) {
      const imagePath = path.join(tempDir, file);
      const imageData = fs.readFileSync(imagePath);
      images.push(imageData.toString('base64'));
    }

    return images;
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}

/**
 * Parse document using PaddleOCR-VL Full Pipeline (returns structured Markdown).
 *
 * @param imageBase64 Base64-encoded page image.
 * @returns The Markdown reported by the service, or '' when none is present.
 * @throws If the HTTP status is not OK or the service reports `success: false`.
 */
async function parseDocument(imageBase64: string): Promise<string> {
  const response = await fetch(`${PADDLEOCR_VL_URL}/parse`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      image: imageBase64,
      output_format: 'markdown',
    }),
  });

  if (!response.ok) {
    const text = await response.text();
    throw new Error(`PaddleOCR-VL API error: ${response.status} - ${text}`);
  }

  // Only the fields read below are described; the rest of the payload is ignored.
  const data = (await response.json()) as {
    success?: boolean;
    error?: string;
    result?: { markdown?: string };
  };

  if (!data.success) {
    throw new Error(`PaddleOCR-VL error: ${data.error}`);
  }

  return data.result?.markdown || '';
}

/**
 * Extract invoice fields from structured Markdown using MiniCPM with image context.
 *
 * The Markdown is capped at 8000 characters and embedded in the prompt as
 * reference text; the page images are sent alongside it. The streamed answer
 * is concatenated and the span from the first `{` to the last `}` is parsed.
 *
 * @param markdown Structured Markdown for the document.
 * @param images Base64-encoded page images forwarded to the model.
 * @returns The parsed JSON object. It is not validated against IInvoice, so
 *   fields may be missing or of a different type than declared.
 * @throws If the request fails, the response has no body, no JSON object is
 *   found in the answer, or the extracted span is not valid JSON.
 */
async function extractInvoiceFromMarkdown(markdown: string, images: string[]): Promise<IInvoice> {
  // Truncate if too long
  const truncated = markdown.length > 8000 ? markdown.slice(0, 8000) : markdown;
  console.log(` [Extract] Processing ${truncated.length} chars of Markdown`);

  const prompt = `/nothink
You are an invoice parser. Extract fields from this invoice image.

Required fields:
- invoice_number: The invoice/receipt number
- invoice_date: Date in YYYY-MM-DD format
- vendor_name: Company that issued the invoice
- currency: EUR, USD, etc.
- net_amount: Amount before tax
- vat_amount: Tax/VAT amount (0 if reverse charge)
- total_amount: Final amount due

Return ONLY a JSON object like:
{"invoice_number":"123","invoice_date":"2022-01-28","vendor_name":"Adobe","currency":"EUR","net_amount":24.99,"vat_amount":0,"total_amount":24.99}

Use null for missing strings, 0 for missing numbers. No explanation.

OCR text from the invoice (for reference):
---
${truncated}
---`;

  const payload = {
    model: MINICPM_MODEL,
    prompt,
    images, // Send the actual image to MiniCPM
    stream: true,
    options: {
      num_predict: 2048,
      temperature: 0.1,
    },
  };

  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }

  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('No response body');
  }

  const decoder = new TextDecoder();
  let fullText = '';
  // Holds the trailing partial line of the previous chunk. Network chunks are
  // not aligned to line boundaries, so a JSON line may arrive in two pieces;
  // parsing each chunk on its own would silently drop those tokens.
  let pending = '';

  const consumeLine = (line: string): void => {
    if (!line.trim()) {
      return;
    }
    try {
      const json = JSON.parse(line);
      if (json.response) {
        fullText += json.response;
      }
    } catch {
      // Skip invalid JSON lines
    }
  };

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    pending += decoder.decode(value, { stream: true });
    const lines = pending.split('\n');
    // The last element is either '' (chunk ended on a newline) or an
    // incomplete line that must wait for the next chunk.
    pending = lines.pop() ?? '';
    for (const line of lines) {
      consumeLine(line);
    }
  }

  // Flush bytes buffered inside the decoder and any final unterminated line.
  pending += decoder.decode();
  consumeLine(pending);

  // Extract JSON from response
  const startIdx = fullText.indexOf('{');
  const endIdx = fullText.lastIndexOf('}') + 1;

  if (startIdx < 0 || endIdx <= startIdx) {
    throw new Error(`No JSON object found in response: ${fullText.substring(0, 200)}`);
  }

  const jsonStr = fullText.substring(startIdx, endIdx);
  return JSON.parse(jsonStr);
}

/**
 * Single extraction pass: Parse with PaddleOCR-VL Full, extract with MiniCPM.
 *
 * Only the first page is sent to PaddleOCR-VL; all pages are forwarded to
 * MiniCPM as image context.
 *
 * @param images Base64-encoded page images.
 * @param passNum Pass number supplied by the caller (currently unused).
 * @throws If `images` is empty or either service call fails.
 */
async function extractOnce(images: string[], passNum: number): Promise<IInvoice> {
  const firstPage = images[0];
  if (firstPage === undefined) {
    throw new Error('No page images to parse');
  }

  // Parse document with full pipeline
  const markdown = await parseDocument(firstPage);
  console.log(` [Parse] Got ${markdown.split('\n').length} lines of Markdown`);

  // Extract invoice fields from Markdown with image context
  return extractInvoiceFromMarkdown(markdown, images);
}

/**
 * Create a hash of invoice for comparison (using key fields).
 *
 * The total is coerced with Number() because the invoice comes from
 * unvalidated model output; calling toFixed on null or a string would throw.
 */
function hashInvoice(invoice: IInvoice): string {
  return `${invoice.invoice_number}|${invoice.invoice_date}|${Number(invoice.total_amount).toFixed(2)}`;
}

/**
 * Extract with consensus voting.
 *
 * Runs up to `maxPasses` extraction passes and returns as soon as two passes
 * produce the same hash (see hashInvoice). Failed passes are logged and
 * skipped. Without agreement, the most frequent result is returned.
 *
 * @param images Base64-encoded page images.
 * @param invoiceName Name used in the error message when every pass fails.
 * @param maxPasses Maximum number of extraction attempts.
 * @throws If no pass produced a result.
 */
async function extractWithConsensus(images: string[], invoiceName: string, maxPasses: number = 5): Promise<IInvoice> {
  const results: Array<{ invoice: IInvoice; hash: string }> = [];
  const hashCounts: Map<string, number> = new Map();

  const addResult = (invoice: IInvoice, passLabel: string): number => {
    const hash = hashInvoice(invoice);
    const count = (hashCounts.get(hash) || 0) + 1;
    results.push({ invoice, hash });
    hashCounts.set(hash, count);
    console.log(` [${passLabel}] ${invoice.invoice_number} | ${invoice.invoice_date} | ${invoice.total_amount} ${invoice.currency}`);
    return count;
  };

  for (let pass = 1; pass <= maxPasses; pass++) {
    try {
      const invoice = await extractOnce(images, pass);
      const count = addResult(invoice, `Pass ${pass}`);

      if (count >= 2) {
        console.log(` [Consensus] Reached after ${pass} passes`);
        return invoice;
      }
    } catch (err) {
      console.log(` [Pass ${pass}] Error: ${err}`);
    }
  }

  // No consensus reached - return the most common result
  let bestHash = '';
  let bestCount = 0;
  for (const [hash, count] of hashCounts) {
    if (count > bestCount) {
      bestCount = count;
      bestHash = hash;
    }
  }

  const best = results.find((r) => r.hash === bestHash);
  if (!bestHash || !best) {
    throw new Error(`No valid results for ${invoiceName}`);
  }

  console.log(` [No consensus] Using most common result (${bestCount}/${maxPasses} passes)`);
  return best.invoice;
}

/**
 * Compare extracted invoice against expected.
 *
 * Checks invoice number, date, total amount and currency; vendor name, net
 * amount and VAT amount are not compared.
 *
 * @returns `match` is true when no field differs; `errors` lists each mismatch.
 */
function compareInvoice(
  extracted: IInvoice,
  expected: IInvoice
): { match: boolean; errors: string[] } {
  const errors: string[] = [];

  // Compare invoice number (normalize by removing spaces and case)
  const extNum = extracted.invoice_number?.replace(/\s/g, '').toLowerCase() || '';
  const expNum = expected.invoice_number?.replace(/\s/g, '').toLowerCase() || '';
  if (extNum !== expNum) {
    errors.push(`invoice_number: expected "${expected.invoice_number}", got "${extracted.invoice_number}"`);
  }

  // Compare date
  if (extracted.invoice_date !== expected.invoice_date) {
    errors.push(`invoice_date: expected "${expected.invoice_date}", got "${extracted.invoice_date}"`);
  }

  // Compare total amount (with tolerance). Written as a negated "<=" so that a
  // NaN difference (missing or non-numeric total) counts as a mismatch rather
  // than slipping through a "> tolerance" test.
  const totalDiff = Math.abs(extracted.total_amount - expected.total_amount);
  if (!(totalDiff <= 0.02)) {
    errors.push(`total_amount: expected ${expected.total_amount}, got ${extracted.total_amount}`);
  }

  // Compare currency
  if (extracted.currency?.toUpperCase() !== expected.currency?.toUpperCase()) {
    errors.push(`currency: expected "${expected.currency}", got "${extracted.currency}"`);
  }

  return { match: errors.length === 0, errors };
}

/**
 * Find all test cases (PDF + JSON pairs) in .nogit/invoices/
 *
 * A PDF becomes a test case only when a JSON file with the same base name
 * sits next to it. Returns an empty list if the directory does not exist.
 */
function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const testDir = path.join(process.cwd(), '.nogit/invoices');
  if (!fs.existsSync(testDir)) {
    return [];
  }

  const files = fs.readdirSync(testDir);
  const pdfFiles = files.filter((f) => f.endsWith('.pdf'));
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];

  for (const pdf of pdfFiles) {
    // Strip only the trailing extension; replacing the first ".pdf" occurrence
    // would mangle names that contain ".pdf" earlier in the string.
    const baseName = path.basename(pdf, '.pdf');
    const jsonFile = `${baseName}.json`;
    if (files.includes(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }

  // Sort alphabetically
  testCases.sort((a, b) => a.name.localeCompare(b.name));

  return testCases;
}

// Tests

tap.test('setup: ensure Docker containers are running', async () => {
  console.log('\n[Setup] Checking Docker containers...\n');

  // Ensure PaddleOCR-VL Full Pipeline is running
  const paddleOk = await ensurePaddleOcrVlFull();
  expect(paddleOk).toBeTrue();

  // Ensure MiniCPM is running (for field extraction from Markdown)
  const minicpmOk = await ensureMiniCpm();
  expect(minicpmOk).toBeTrue();

  console.log('\n[Setup] All containers ready!\n');
});

// Dynamic test for each PDF/JSON pair
const testCases = findTestCases();
console.log(`\nFound ${testCases.length} invoice test cases (PaddleOCR-VL Full Pipeline)\n`);

let passedCount = 0;
let failedCount = 0;
const processingTimes: number[] = [];

for (const testCase of testCases) {
  tap.test(`should extract invoice: ${testCase.name}`, async () => {
    // Load expected data
    const expected: IInvoice = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
    console.log(`\n=== ${testCase.name} ===`);
    console.log(`Expected: ${expected.invoice_number} | ${expected.invoice_date} | ${expected.total_amount} ${expected.currency}`);

    const startTime = Date.now();

    // Convert PDF to images
    const images = convertPdfToImages(testCase.pdfPath);
    console.log(` Pages: ${images.length}`);

    // Extract with consensus voting (PaddleOCR-VL Full -> MiniCPM)
    const extracted = await extractWithConsensus(images, testCase.name);

    const endTime = Date.now();
    const elapsedMs = endTime - startTime;
    processingTimes.push(elapsedMs);

    // Compare results
    const result = compareInvoice(extracted, expected);

    if (result.match) {
      passedCount++;
      console.log(` Result: MATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
    } else {
      failedCount++;
      console.log(` Result: MISMATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
      result.errors.forEach((e) => console.log(` - ${e}`));
    }

    // Assert match
    expect(result.match).toBeTrue();
  });
}

tap.test('summary', async () => {
  const totalInvoices = testCases.length;
  const accuracy = totalInvoices > 0 ? (passedCount / totalInvoices) * 100 : 0;
  const totalTimeMs = processingTimes.reduce((a, b) => a + b, 0);
  const avgTimeMs = processingTimes.length > 0 ? totalTimeMs / processingTimes.length : 0;
  const avgTimeSec = avgTimeMs / 1000;
  const totalTimeSec = totalTimeMs / 1000;

  console.log(`\n======================================================`);
  console.log(` Invoice Extraction Summary (PaddleOCR-VL Full)`);
  console.log(`======================================================`);
  console.log(` Method: PaddleOCR-VL Full Pipeline -> MiniCPM`);
  console.log(` Passed: ${passedCount}/${totalInvoices}`);
  console.log(` Failed: ${failedCount}/${totalInvoices}`);
  console.log(` Accuracy: ${accuracy.toFixed(1)}%`);
  console.log(`------------------------------------------------------`);
  console.log(` Total time: ${totalTimeSec.toFixed(1)}s`);
  console.log(` Avg per inv: ${avgTimeSec.toFixed(1)}s`);
  console.log(`======================================================\n`);
});

export default tap.start();