/**
 * Invoice extraction test using PaddleOCR-VL Full Pipeline
 *
 * This tests the complete PaddleOCR-VL pipeline:
 * 1. PP-DocLayoutV2 for layout detection
 * 2. PaddleOCR-VL for recognition
 * 3. Structured HTML output (semantic tags with proper tables)
 * 4. Qwen2.5 extracts invoice fields from structured HTML
 *
 * HTML output is used instead of Markdown because:
 * - HTML tags are unambiguous (no parser variations)
 * - LLMs are heavily trained on web/HTML data
 * - Semantic tags (header, footer, section) provide clear structure
 */
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execFileSync } from 'child_process';
import * as os from 'os';
import { ensurePaddleOcrVlFull, ensureQwen25 } from './helpers/docker.js';

const PADDLEOCR_VL_URL = 'http://localhost:8000';
const OLLAMA_URL = 'http://localhost:11434';

// Use Qwen2.5 for text-only JSON extraction (not MiniCPM which is vision-focused)
const TEXT_MODEL = 'qwen2.5:7b';

// Upper bound on the number of HTML characters forwarded to the text model.
const MAX_HTML_CHARS = 32000;

interface IInvoice {
  invoice_number: string;
  invoice_date: string;
  vendor_name: string;
  currency: string;
  net_amount: number;
  vat_amount: number;
  total_amount: number;
}

/**
 * Convert a PDF to PNG images using ImageMagick.
 *
 * Pages are rendered into a temporary directory, read back as base64 strings
 * in page order, and the directory is removed afterwards (even on failure).
 *
 * @param pdfPath Path of the PDF to rasterize.
 * @returns One base64-encoded PNG per page, in page order.
 * @throws If the `convert` command fails.
 */
function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');

  try {
    // Pass arguments as an array (no shell) so paths containing quotes, `$`
    // or other shell metacharacters cannot break or inject into the command.
    execFileSync(
      'convert',
      [
        '-density', '200',
        '-quality', '90',
        pdfPath,
        '-background', 'white',
        '-alpha', 'remove',
        outputPattern,
      ],
      { stdio: 'pipe' }
    );

    // Numeric-aware sort: a plain lexicographic sort would put "page-10.png"
    // before "page-2.png" and scramble documents with more than ten pages.
    const pageFiles = fs
      .readdirSync(tempDir)
      .filter((f) => f.endsWith('.png'))
      .sort((a, b) => a.localeCompare(b, undefined, { numeric: true }));

    return pageFiles.map((file) =>
      fs.readFileSync(path.join(tempDir, file)).toString('base64')
    );
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}

/**
 * Parse document using PaddleOCR-VL Full Pipeline (returns structured HTML).
 *
 * @param imageBase64 Base64-encoded page image.
 * @returns The `result.html` field of the response, or '' when it is absent.
 * @throws On a non-2xx HTTP status or when the response has `success` falsy.
 */
async function parseDocument(imageBase64: string): Promise<string> {
  const response = await fetch(`${PADDLEOCR_VL_URL}/parse`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      image: imageBase64,
      output_format: 'html',
    }),
  });

  if (!response.ok) {
    const text = await response.text();
    throw new Error(`PaddleOCR-VL API error: ${response.status} - ${text}`);
  }

  const data = await response.json();
  if (!data.success) {
    throw new Error(`PaddleOCR-VL error: ${data.error}`);
  }

  return data.result?.html || '';
}

/**
 * Extract invoice fields using simple direct prompt.
 * The OCR output has clearly labeled fields - just ask the LLM to read them.
 *
 * The HTML is truncated to MAX_HTML_CHARS, sent to the Ollama chat endpoint
 * with a JSON schema as `format`, and the streamed answer is reassembled and
 * parsed.
 *
 * @param html Structured HTML of the invoice (all pages).
 * @returns The normalized invoice. String fields fall back to `null` at
 *   runtime when the model omits them; `currency` falls back to 'EUR' and
 *   amounts to 0.
 * @throws On a non-2xx status, a missing response body, or when no valid JSON
 *   object can be found in the model output.
 */
async function extractInvoiceFromHtml(html: string): Promise<IInvoice> {
  // OCR output is already good - just truncate if too long
  const truncated = html.length > MAX_HTML_CHARS ? html.slice(0, MAX_HTML_CHARS) : html;
  console.log(` [Extract] ${truncated.length} chars of HTML`);

  // JSON schema for structured output
  const invoiceSchema = {
    type: 'object',
    properties: {
      invoice_number: { type: 'string' },
      invoice_date: { type: 'string' },
      vendor_name: { type: 'string' },
      currency: { type: 'string' },
      net_amount: { type: 'number' },
      vat_amount: { type: 'number' },
      total_amount: { type: 'number' },
    },
    required: ['invoice_number', 'invoice_date', 'vendor_name', 'currency', 'net_amount', 'vat_amount', 'total_amount'],
  };

  // Simple, direct prompt - the OCR output already has labeled fields
  const systemPrompt = 'You read invoice HTML and extract labeled fields. \nReturn JSON only.';

  const userPrompt = [
    'Extract from this invoice HTML:',
    '- invoice_number: Find "Invoice no.", "Invoice #", "Invoice", "Rechnung", "Document No" and extract the value',
    '- invoice_date: Find "Invoice date", "Date", "Datum" and convert to YYYY-MM-DD format',
    '- vendor_name: The company name issuing the invoice (in header/letterhead)',
    '- currency: EUR, USD, or GBP (look for € $ £ symbols or text)',
    '- total_amount: Find "Total", "Grand Total", "Amount Due", "Gesamtbetrag" - the FINAL total amount',
    '- net_amount: Amount before VAT/tax (Subtotal, Net)',
    '- vat_amount: VAT/tax amount',
    'HTML:',
    truncated,
    'Return ONLY valid JSON:',
    '{"invoice_number":"...", "invoice_date":"YYYY-MM-DD", "vendor_name":"...", "currency":"EUR", "net_amount":0, "vat_amount":0, "total_amount":0}',
  ].join(' ');

  const response = await fetch(`${OLLAMA_URL}/api/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: TEXT_MODEL,
      messages: [
        { role: 'system', content: systemPrompt },
        { role: 'user', content: userPrompt },
      ],
      format: invoiceSchema,
      stream: true,
      options: { num_predict: 512, temperature: 0.0 },
    }),
  });

  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }

  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('No response body');
  }

  const decoder = new TextDecoder();
  let fullText = '';
  // Holds the trailing partial line of the previous chunk. Network chunks do
  // not align with newline-delimited JSON records, so a record may be split
  // across two reads and must be reassembled before parsing.
  let pending = '';

  const consumeLine = (line: string): void => {
    if (!line.trim()) return;
    try {
      const json = JSON.parse(line);
      if (json.message?.content) {
        fullText += json.message.content;
      } else if (json.response) {
        fullText += json.response;
      }
    } catch {
      // Skip invalid JSON lines
    }
  };

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    pending += decoder.decode(value, { stream: true });
    const lines = pending.split('\n');
    // The last element is either '' (chunk ended on a newline) or an
    // incomplete record; keep it for the next iteration.
    pending = lines.pop() ?? '';
    for (const line of lines) {
      consumeLine(line);
    }
  }

  // Flush any bytes still held by the decoder and the final unterminated line.
  pending += decoder.decode();
  consumeLine(pending);

  // Extract JSON from response
  let jsonStr = fullText.trim();

  // Remove markdown code block if present
  if (jsonStr.startsWith('```json')) {
    jsonStr = jsonStr.slice(7);
  } else if (jsonStr.startsWith('```')) {
    jsonStr = jsonStr.slice(3);
  }
  if (jsonStr.endsWith('```')) {
    jsonStr = jsonStr.slice(0, -3);
  }
  jsonStr = jsonStr.trim();

  // Find JSON object boundaries
  const startIdx = jsonStr.indexOf('{');
  const endIdx = jsonStr.lastIndexOf('}') + 1;
  if (startIdx < 0 || endIdx <= startIdx) {
    throw new Error(`No JSON object found in response: ${fullText.substring(0, 200)}`);
  }
  jsonStr = jsonStr.substring(startIdx, endIdx);

  let parsed;
  try {
    parsed = JSON.parse(jsonStr);
  } catch {
    throw new Error(`Invalid JSON: ${jsonStr.substring(0, 200)}`);
  }

  // Normalize response to expected format
  return {
    invoice_number: parsed.invoice_number || null,
    invoice_date: parsed.invoice_date || null,
    vendor_name: parsed.vendor_name || null,
    currency: parsed.currency || 'EUR',
    net_amount: parseFloat(parsed.net_amount) || 0,
    vat_amount: parseFloat(parsed.vat_amount) || 0,
    total_amount: parseFloat(parsed.total_amount) || 0,
  };
}

/**
 * Single extraction pass: Parse with PaddleOCR-VL Full, extract with Qwen2.5 (text-only).
 * Processes ALL pages and concatenates HTML for multi-page invoice support.
 *
 * @param images Base64-encoded page images, in page order.
 * @param passNum Pass number supplied by the caller (currently unused here).
 * @returns The invoice extracted from the concatenated HTML.
 */
async function extractOnce(images: string[], passNum: number): Promise<IInvoice> {
  // Parse ALL pages and concatenate HTML with page markers
  const htmlParts: string[] = [];

  for (let i = 0; i < images.length; i++) {
    const pageHtml = await parseDocument(images[i]);
    // Add page marker for context
    htmlParts.push(`<!-- Page ${i + 1} -->\n${pageHtml}`);
  }

  const fullHtml = htmlParts.join('\n\n');
  console.log(` [Parse] Got ${fullHtml.split('\n').length} lines from ${images.length} page(s)`);

  // Extract invoice fields from HTML using text-only model (no images)
  return extractInvoiceFromHtml(fullHtml);
}

/**
 * Create a hash of invoice for comparison (using key fields).
 *
 * @returns `invoice_number|invoice_date|total_amount`, with the amount
 *   formatted to two decimals when it is a number.
 */
function hashInvoice(invoice: IInvoice): string {
  // Ensure total_amount is a number
  const amount =
    typeof invoice.total_amount === 'number'
      ? invoice.total_amount.toFixed(2)
      : String(invoice.total_amount || 0);
  return `${invoice.invoice_number}|${invoice.invoice_date}|${amount}`;
}

/**
 * Extract with consensus voting.
 *
 * Runs up to `maxPasses` extraction passes and returns as soon as two passes
 * produce the same hash (see hashInvoice). Failed passes are logged and
 * skipped. If no two passes agree, the most frequent result is returned.
 *
 * @param images Base64-encoded page images.
 * @param invoiceName Name used in the error message.
 * @param maxPasses Maximum number of extraction attempts.
 * @throws If every pass failed.
 */
async function extractWithConsensus(images: string[], invoiceName: string, maxPasses: number = 5): Promise<IInvoice> {
  const results: Array<{ invoice: IInvoice; hash: string }> = [];
  const hashCounts: Map<string, number> = new Map();

  // Records one result and returns how often its hash has been seen so far.
  const addResult = (invoice: IInvoice, passLabel: string): number => {
    const hash = hashInvoice(invoice);
    const seen = (hashCounts.get(hash) ?? 0) + 1;
    results.push({ invoice, hash });
    hashCounts.set(hash, seen);
    console.log(` [${passLabel}] ${invoice.invoice_number} | ${invoice.invoice_date} | ${invoice.total_amount} ${invoice.currency}`);
    return seen;
  };

  for (let pass = 1; pass <= maxPasses; pass++) {
    try {
      const invoice = await extractOnce(images, pass);
      const count = addResult(invoice, `Pass ${pass}`);

      if (count >= 2) {
        console.log(` [Consensus] Reached after ${pass} passes`);
        return invoice;
      }
    } catch (err) {
      console.log(` [Pass ${pass}] Error: ${err}`);
    }
  }

  // No consensus reached - return the most common result
  let bestHash = '';
  let bestCount = 0;
  for (const [hash, count] of hashCounts) {
    if (count > bestCount) {
      bestCount = count;
      bestHash = hash;
    }
  }

  const best = results.find((r) => r.hash === bestHash);
  if (!best) {
    throw new Error(`No valid results for ${invoiceName}`);
  }

  console.log(` [No consensus] Using most common result (${bestCount}/${maxPasses} passes)`);
  return best.invoice;
}

/**
 * Normalize date to YYYY-MM-DD format.
 *
 * Recognizes YYYY-MM-DD (returned as is), DD-MMM-YYYY (e.g. "28-JUN-2022")
 * and DD/MM/YYYY or DD.MM.YYYY. Any other input is returned unchanged;
 * a null/empty input yields ''.
 */
function normalizeDate(dateStr: string | null): string {
  if (!dateStr) return '';

  // Already in correct format
  if (/^\d{4}-\d{2}-\d{2}$/.test(dateStr)) {
    return dateStr;
  }

  // Handle DD-MMM-YYYY format (e.g., "28-JUN-2022")
  const monthMap: Record<string, string> = {
    JAN: '01', FEB: '02', MAR: '03', APR: '04', MAY: '05', JUN: '06',
    JUL: '07', AUG: '08', SEP: '09', OCT: '10', NOV: '11', DEC: '12',
  };

  const match = dateStr.match(/^(\d{1,2})-([A-Z]{3})-(\d{4})$/i);
  if (match) {
    const day = match[1].padStart(2, '0');
    // Unknown three-letter abbreviations fall back to January.
    const month = monthMap[match[2].toUpperCase()] || '01';
    const year = match[3];
    return `${year}-${month}-${day}`;
  }

  // Handle DD/MM/YYYY or DD.MM.YYYY
  const match2 = dateStr.match(/^(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})$/);
  if (match2) {
    const day = match2[1].padStart(2, '0');
    const month = match2[2].padStart(2, '0');
    const year = match2[3];
    return `${year}-${month}-${day}`;
  }

  return dateStr;
}

/**
 * Compare extracted invoice against expected.
 *
 * Checks invoice number (ignoring whitespace and case), date (after
 * normalizeDate), total amount (tolerance 0.02) and currency (ignoring case).
 *
 * @returns `match` is true when no field differs; `errors` lists each mismatch.
 */
function compareInvoice(
  extracted: IInvoice,
  expected: IInvoice
): { match: boolean; errors: string[] } {
  const errors: string[] = [];

  // Compare invoice number (normalize by removing spaces and case)
  const extNum = extracted.invoice_number?.replace(/\s/g, '').toLowerCase() || '';
  const expNum = expected.invoice_number?.replace(/\s/g, '').toLowerCase() || '';
  if (extNum !== expNum) {
    errors.push(`invoice_number: expected "${expected.invoice_number}", got "${extracted.invoice_number}"`);
  }

  // Compare date (normalize format first)
  const extDate = normalizeDate(extracted.invoice_date);
  const expDate = normalizeDate(expected.invoice_date);
  if (extDate !== expDate) {
    errors.push(`invoice_date: expected "${expected.invoice_date}", got "${extracted.invoice_date}"`);
  }

  // Compare total amount (with tolerance). Written as a negated `<=` so that
  // a NaN difference (e.g. a missing expected amount) counts as a mismatch
  // instead of slipping through a `>` comparison.
  const amountDiff = Math.abs(extracted.total_amount - expected.total_amount);
  if (!(amountDiff <= 0.02)) {
    errors.push(`total_amount: expected ${expected.total_amount}, got ${extracted.total_amount}`);
  }

  // Compare currency
  if (extracted.currency?.toUpperCase() !== expected.currency?.toUpperCase()) {
    errors.push(`currency: expected "${expected.currency}", got "${extracted.currency}"`);
  }

  return { match: errors.length === 0, errors };
}

/**
 * Find all test cases (PDF + JSON pairs) in .nogit/invoices/.
 *
 * A PDF is included only when a JSON file with the same base name exists in
 * the same directory.
 *
 * @returns Test cases sorted alphabetically by name; empty when the directory
 *   does not exist.
 */
function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const testDir = path.join(process.cwd(), '.nogit/invoices');
  if (!fs.existsSync(testDir)) {
    return [];
  }

  const pdfExt = '.pdf';
  const files = fs.readdirSync(testDir);
  const pdfFiles = files.filter((f) => f.endsWith(pdfExt));
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];

  for (const pdf of pdfFiles) {
    // Strip only the trailing extension; String.replace would remove the
    // first ".pdf" occurrence, which is wrong for names like "a.pdf.v2.pdf".
    const baseName = pdf.slice(0, -pdfExt.length);
    const jsonFile = `${baseName}.json`;
    if (files.includes(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }

  // Sort alphabetically
  testCases.sort((a, b) => a.name.localeCompare(b.name));

  return testCases;
}

// Tests

tap.test('setup: ensure Docker containers are running', async () => {
  console.log('\n[Setup] Checking Docker containers...\n');

  // Ensure PaddleOCR-VL Full Pipeline is running
  const paddleOk = await ensurePaddleOcrVlFull();
  expect(paddleOk).toBeTrue();

  // Ensure Qwen2.5 is available (for text-only JSON extraction)
  const qwenOk = await ensureQwen25();
  expect(qwenOk).toBeTrue();

  console.log('\n[Setup] All containers ready!\n');
});

// Dynamic test for each PDF/JSON pair
const testCases = findTestCases();
console.log(`\nFound ${testCases.length} invoice test cases (PaddleOCR-VL Full Pipeline)\n`);

let passedCount = 0;
let failedCount = 0;
const processingTimes: number[] = [];

for (const testCase of testCases) {
  tap.test(`should extract invoice: ${testCase.name}`, async () => {
    // Load expected data
    const expected: IInvoice = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
    console.log(`\n=== ${testCase.name} ===`);
    console.log(`Expected: ${expected.invoice_number} | ${expected.invoice_date} | ${expected.total_amount} ${expected.currency}`);

    const startTime = Date.now();

    // Convert PDF to images
    const images = convertPdfToImages(testCase.pdfPath);
    console.log(` Pages: ${images.length}`);

    // Extract with consensus voting (PaddleOCR-VL Full -> Qwen2.5)
    const extracted = await extractWithConsensus(images, testCase.name);

    const endTime = Date.now();
    const elapsedMs = endTime - startTime;
    processingTimes.push(elapsedMs);

    // Compare results
    const result = compareInvoice(extracted, expected);

    if (result.match) {
      passedCount++;
      console.log(` Result: MATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
    } else {
      failedCount++;
      console.log(` Result: MISMATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
      result.errors.forEach((e) => console.log(` - ${e}`));
    }

    // Assert match
    expect(result.match).toBeTrue();
  });
}

tap.test('summary', async () => {
  const totalInvoices = testCases.length;
  const accuracy = totalInvoices > 0 ? (passedCount / totalInvoices) * 100 : 0;
  const totalTimeMs = processingTimes.reduce((a, b) => a + b, 0);
  const avgTimeMs = processingTimes.length > 0 ? totalTimeMs / processingTimes.length : 0;
  const avgTimeSec = avgTimeMs / 1000;
  const totalTimeSec = totalTimeMs / 1000;

  console.log(`\n======================================================`);
  console.log(` Invoice Extraction Summary (PaddleOCR-VL Full)`);
  console.log(`======================================================`);
  console.log(` Method: PaddleOCR-VL Full Pipeline (HTML) -> Qwen2.5 (text-only)`);
  console.log(` Passed: ${passedCount}/${totalInvoices}`);
  console.log(` Failed: ${failedCount}/${totalInvoices}`);
  console.log(` Accuracy: ${accuracy.toFixed(1)}%`);
  console.log(`------------------------------------------------------`);
  console.log(` Total time: ${totalTimeSec.toFixed(1)}s`);
  console.log(` Avg per inv: ${avgTimeSec.toFixed(1)}s`);
  console.log(`======================================================\n`);
});

export default tap.start();