diff --git a/changelog.md b/changelog.md index 81874e8..cd84d9c 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,15 @@ # Changelog +## 2026-01-18 - 1.9.0 - feat(tests) +add Ministral 3 vision tests and improve invoice extraction pipeline to use Ollama chat schema, sanitization, and multi-page support + +- Add new vision-based test suites for Ministral 3: test/test.invoices.ministral3.ts and test/test.bankstatements.ministral3.ts (model ministral-3:8b). +- Introduce ensureMinistral3() helper in test/helpers/docker.ts that ensures the Ollama service (MiniCPM container) is running and the ministral-3:8b model is pulled. +- Switch invoice extraction from Ollama /api/generate to /api/chat with a JSON schema (format); the streamed response is read from message.content. +- Improve HTML handling: add sanitizeHtml() to remove OCR artifacts, concatenate multi-page HTML with page markers, and raise the HTML truncation limit from 16000 to 32000 characters. +- Enhance response parsing: strip Markdown code fences, robustly locate JSON object boundaries, and provide clearer JSON parse errors. +- Add PDF->PNG conversion (ImageMagick) and direct image-based extraction flow for vision model tests. 
+ ## 2026-01-18 - 1.8.0 - feat(paddleocr-vl) add structured HTML output and table parsing for PaddleOCR-VL, update API, tests, and README diff --git a/test/helpers/docker.ts b/test/helpers/docker.ts index 49b8075..e60192a 100644 --- a/test/helpers/docker.ts +++ b/test/helpers/docker.ts @@ -358,3 +358,16 @@ export async function ensureQwen25(): Promise { // Then ensure the Qwen2.5 model is pulled return ensureOllamaModel('qwen2.5:7b'); } + +/** + * Ensure Ministral 3 8B model is available (for structured JSON extraction) + * Ministral 3 has native JSON output support and OCR-style document extraction + */ +export async function ensureMinistral3(): Promise { + // First ensure the Ollama service (MiniCPM container) is running + const ollamaOk = await ensureMiniCpm(); + if (!ollamaOk) return false; + + // Then ensure the Ministral 3 8B model is pulled + return ensureOllamaModel('ministral-3:8b'); +} diff --git a/test/test.bankstatements.ministral3.ts b/test/test.bankstatements.ministral3.ts new file mode 100644 index 0000000..c309773 --- /dev/null +++ b/test/test.bankstatements.ministral3.ts @@ -0,0 +1,348 @@ +/** + * Bank Statement extraction using Ministral 3 Vision (Direct) + * + * NO OCR pipeline needed - Ministral 3 has built-in vision encoder: + * 1. Convert PDF to images + * 2. Send images directly to Ministral 3 via Ollama + * 3. 
Extract transactions as structured JSON + */ +import { tap, expect } from '@git.zone/tstest/tapbundle'; +import * as fs from 'fs'; +import * as path from 'path'; +import { execSync } from 'child_process'; +import * as os from 'os'; +import { ensureMinistral3 } from './helpers/docker.js'; + +const OLLAMA_URL = 'http://localhost:11434'; +const VISION_MODEL = 'ministral-3:8b'; + +interface ITransaction { + date: string; + counterparty: string; + amount: number; +} + +/** + * Convert PDF to PNG images using ImageMagick + */ +function convertPdfToImages(pdfPath: string): string[] { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-')); + const outputPattern = path.join(tempDir, 'page-%d.png'); + + try { + execSync( + `convert -density 200 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`, + { stdio: 'pipe' } + ); + + const files = fs.readdirSync(tempDir).filter((f) => f.endsWith('.png')).sort(); + const images: string[] = []; + + for (const file of files) { + const imagePath = path.join(tempDir, file); + const imageData = fs.readFileSync(imagePath); + images.push(imageData.toString('base64')); + } + + return images; + } finally { + fs.rmSync(tempDir, { recursive: true, force: true }); + } +} + +/** + * Extract transactions from a single page image using Ministral 3 Vision + */ +async function extractTransactionsFromPage(image: string, pageNum: number): Promise { + console.log(` [Vision] Processing page ${pageNum}`); + + // JSON schema for array of transactions + const transactionSchema = { + type: 'array', + items: { + type: 'object', + properties: { + date: { type: 'string', description: 'Transaction date in YYYY-MM-DD format' }, + counterparty: { type: 'string', description: 'Name of the other party' }, + amount: { type: 'number', description: 'Amount (negative for debits, positive for credits)' }, + }, + required: ['date', 'counterparty', 'amount'], + }, + }; + + const prompt = `Extract ALL bank transactions from this 
bank statement page. + +For each transaction, extract: +- date: Transaction date in YYYY-MM-DD format +- counterparty: The name/description of the other party (merchant, payee, etc.) +- amount: The amount as a number (NEGATIVE for debits/expenses, POSITIVE for credits/income) + +Return a JSON array of transactions. If no transactions visible, return empty array []. +Example: [{"date":"2021-06-01","counterparty":"AMAZON","amount":-50.00}]`; + + const response = await fetch(`${OLLAMA_URL}/api/chat`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + model: VISION_MODEL, + messages: [ + { + role: 'user', + content: prompt, + images: [image], + }, + ], + format: transactionSchema, + stream: true, + options: { + num_predict: 4096, // Bank statements can have many transactions + temperature: 0.0, + }, + }), + }); + + if (!response.ok) { + throw new Error(`Ollama API error: ${response.status}`); + } + + const reader = response.body?.getReader(); + if (!reader) { + throw new Error('No response body'); + } + + const decoder = new TextDecoder(); + let fullText = ''; + + while (true) { + const { done, value } = await reader.read(); + if (done) break; + + const chunk = decoder.decode(value, { stream: true }); + const lines = chunk.split('\n').filter((l) => l.trim()); + + for (const line of lines) { + try { + const json = JSON.parse(line); + if (json.message?.content) { + fullText += json.message.content; + } + } catch { + // Skip invalid JSON lines + } + } + } + + // Parse JSON response + let jsonStr = fullText.trim(); + + if (jsonStr.startsWith('```json')) jsonStr = jsonStr.slice(7); + else if (jsonStr.startsWith('```')) jsonStr = jsonStr.slice(3); + if (jsonStr.endsWith('```')) jsonStr = jsonStr.slice(0, -3); + jsonStr = jsonStr.trim(); + + // Find array boundaries + const startIdx = jsonStr.indexOf('['); + const endIdx = jsonStr.lastIndexOf(']') + 1; + + if (startIdx < 0 || endIdx <= startIdx) { + console.log(` [Page 
${pageNum}] No transactions found`); + return []; + } + + try { + const parsed = JSON.parse(jsonStr.substring(startIdx, endIdx)); + console.log(` [Page ${pageNum}] Found ${parsed.length} transactions`); + return parsed.map((t: { date?: string; counterparty?: string; amount?: number }) => ({ + date: t.date || '', + counterparty: t.counterparty || '', + amount: parseFloat(String(t.amount)) || 0, + })); + } catch (e) { + console.log(` [Page ${pageNum}] Parse error: ${e}`); + return []; + } +} + +/** + * Extract all transactions from all pages + */ +async function extractAllTransactions(images: string[]): Promise { + const allTransactions: ITransaction[] = []; + + for (let i = 0; i < images.length; i++) { + const pageTransactions = await extractTransactionsFromPage(images[i], i + 1); + allTransactions.push(...pageTransactions); + } + + return allTransactions; +} + +/** + * Normalize date to YYYY-MM-DD + */ +function normalizeDate(dateStr: string): string { + if (!dateStr) return ''; + if (/^\d{4}-\d{2}-\d{2}$/.test(dateStr)) return dateStr; + + // Handle DD/MM/YYYY or DD.MM.YYYY + const match = dateStr.match(/^(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})$/); + if (match) { + return `${match[3]}-${match[2].padStart(2, '0')}-${match[1].padStart(2, '0')}`; + } + + return dateStr; +} + +/** + * Compare extracted transactions vs expected + */ +function compareTransactions( + extracted: ITransaction[], + expected: ITransaction[] +): { matchRate: number; matched: number; missed: number; extra: number; errors: string[] } { + const errors: string[] = []; + let matched = 0; + + // Normalize all dates + const normalizedExtracted = extracted.map((t) => ({ + ...t, + date: normalizeDate(t.date), + counterparty: t.counterparty.toUpperCase().trim(), + })); + + const normalizedExpected = expected.map((t) => ({ + ...t, + date: normalizeDate(t.date), + counterparty: t.counterparty.toUpperCase().trim(), + })); + + // Try to match each expected transaction + const matchedIndices = new Set(); + + 
for (const exp of normalizedExpected) { + let found = false; + + for (let i = 0; i < normalizedExtracted.length; i++) { + if (matchedIndices.has(i)) continue; + + const ext = normalizedExtracted[i]; + + // Match by date + amount (counterparty names can vary) + if (ext.date === exp.date && Math.abs(ext.amount - exp.amount) < 0.02) { + matched++; + matchedIndices.add(i); + found = true; + break; + } + } + + if (!found) { + errors.push(`Missing: ${exp.date} | ${exp.counterparty} | ${exp.amount}`); + } + } + + const missed = expected.length - matched; + const extra = extracted.length - matched; + const matchRate = expected.length > 0 ? (matched / expected.length) * 100 : 0; + + return { matchRate, matched, missed, extra, errors }; +} + +/** + * Find test cases (PDF + JSON pairs in .nogit/) + */ +function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> { + const testDir = path.join(process.cwd(), '.nogit'); + if (!fs.existsSync(testDir)) return []; + + const files = fs.readdirSync(testDir); + const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = []; + + for (const pdf of files.filter((f) => f.endsWith('.pdf'))) { + const baseName = pdf.replace('.pdf', ''); + const jsonFile = `${baseName}.json`; + if (files.includes(jsonFile)) { + // Skip invoice files - only bank statements + if (!baseName.includes('invoice')) { + testCases.push({ + name: baseName, + pdfPath: path.join(testDir, pdf), + jsonPath: path.join(testDir, jsonFile), + }); + } + } + } + + return testCases.sort((a, b) => a.name.localeCompare(b.name)); +} + +// Tests + +tap.test('setup: ensure Ministral 3 is running', async () => { + console.log('\n[Setup] Checking Ministral 3...\n'); + const ok = await ensureMinistral3(); + expect(ok).toBeTrue(); + console.log('\n[Setup] Ready!\n'); +}); + +const testCases = findTestCases(); +console.log(`\nFound ${testCases.length} bank statement test cases (Ministral 3 Vision)\n`); + +let totalMatched = 0; +let totalExpected = 
0; +const times: number[] = []; + +for (const testCase of testCases) { + tap.test(`should extract bank statement: ${testCase.name}`, async () => { + const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8')); + console.log(`\n=== ${testCase.name} ===`); + console.log(`Expected: ${expected.length} transactions`); + + const start = Date.now(); + const images = convertPdfToImages(testCase.pdfPath); + console.log(` Pages: ${images.length}`); + + const extracted = await extractAllTransactions(images); + const elapsed = Date.now() - start; + times.push(elapsed); + + console.log(` Extracted: ${extracted.length} transactions`); + + const result = compareTransactions(extracted, expected); + totalMatched += result.matched; + totalExpected += expected.length; + + console.log(` Match rate: ${result.matchRate.toFixed(1)}% (${result.matched}/${expected.length})`); + console.log(` Missed: ${result.missed}, Extra: ${result.extra}`); + console.log(` Time: ${(elapsed / 1000).toFixed(1)}s`); + + if (result.errors.length > 0 && result.errors.length <= 5) { + result.errors.forEach((e) => console.log(` - ${e}`)); + } else if (result.errors.length > 5) { + console.log(` (${result.errors.length} missing transactions)`); + } + + // Consider it a pass if we match at least 70% of transactions + expect(result.matchRate).toBeGreaterThan(70); + }); +} + +tap.test('summary', async () => { + const overallMatchRate = totalExpected > 0 ? (totalMatched / totalExpected) * 100 : 0; + const totalTime = times.reduce((a, b) => a + b, 0) / 1000; + const avgTime = times.length > 0 ? 
totalTime / times.length : 0; + + console.log(`\n======================================================`); + console.log(` Bank Statement Extraction Summary (Ministral 3)`); + console.log(`======================================================`); + console.log(` Method: Ministral 3 8B Vision (Direct)`); + console.log(` Statements: ${testCases.length}`); + console.log(` Matched: ${totalMatched}/${totalExpected} transactions`); + console.log(` Match rate: ${overallMatchRate.toFixed(1)}%`); + console.log(`------------------------------------------------------`); + console.log(` Total time: ${totalTime.toFixed(1)}s`); + console.log(` Avg per stmt: ${avgTime.toFixed(1)}s`); + console.log(`======================================================\n`); +}); + +export default tap.start(); diff --git a/test/test.invoices.ministral3.ts b/test/test.invoices.ministral3.ts new file mode 100644 index 0000000..6d53d28 --- /dev/null +++ b/test/test.invoices.ministral3.ts @@ -0,0 +1,355 @@ +/** + * Invoice extraction using Ministral 3 Vision (Direct) + * + * NO PaddleOCR needed - Ministral 3 has built-in vision encoder: + * 1. Convert PDF to images + * 2. Send images directly to Ministral 3 via Ollama + * 3. Extract structured JSON with native schema support + * + * This is the simplest possible pipeline. 
+ */ +import { tap, expect } from '@git.zone/tstest/tapbundle'; +import * as fs from 'fs'; +import * as path from 'path'; +import { execSync } from 'child_process'; +import * as os from 'os'; +import { ensureMinistral3 } from './helpers/docker.js'; + +const OLLAMA_URL = 'http://localhost:11434'; +const VISION_MODEL = 'ministral-3:8b'; + +interface IInvoice { + invoice_number: string; + invoice_date: string; + vendor_name: string; + currency: string; + net_amount: number; + vat_amount: number; + total_amount: number; +} + +/** + * Convert PDF to PNG images using ImageMagick + */ +function convertPdfToImages(pdfPath: string): string[] { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-')); + const outputPattern = path.join(tempDir, 'page-%d.png'); + + try { + execSync( + `convert -density 200 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`, + { stdio: 'pipe' } + ); + + const files = fs.readdirSync(tempDir).filter((f) => f.endsWith('.png')).sort(); + const images: string[] = []; + + for (const file of files) { + const imagePath = path.join(tempDir, file); + const imageData = fs.readFileSync(imagePath); + images.push(imageData.toString('base64')); + } + + return images; + } finally { + fs.rmSync(tempDir, { recursive: true, force: true }); + } +} + +/** + * Extract invoice data directly from images using Ministral 3 Vision + */ +async function extractInvoiceFromImages(images: string[]): Promise { + console.log(` [Vision] Processing ${images.length} page(s) with Ministral 3`); + + // JSON schema for structured output + const invoiceSchema = { + type: 'object', + properties: { + invoice_number: { type: 'string' }, + invoice_date: { type: 'string' }, + vendor_name: { type: 'string' }, + currency: { type: 'string' }, + net_amount: { type: 'number' }, + vat_amount: { type: 'number' }, + total_amount: { type: 'number' }, + }, + required: ['invoice_number', 'invoice_date', 'vendor_name', 'currency', 'net_amount', 'vat_amount', 
'total_amount'], + }; + + const prompt = `Extract invoice data from this document image(s). + +Find and return: +- invoice_number: The invoice number/ID (look for "Invoice No", "Invoice #", "Rechnung Nr") +- invoice_date: The invoice date in YYYY-MM-DD format +- vendor_name: The company issuing the invoice (in letterhead) +- currency: EUR, USD, or GBP +- total_amount: The FINAL total amount due +- net_amount: Amount before VAT/tax +- vat_amount: VAT/tax amount + +Return ONLY valid JSON.`; + + const response = await fetch(`${OLLAMA_URL}/api/chat`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + model: VISION_MODEL, + messages: [ + { + role: 'user', + content: prompt, + images: images, // Send all page images + }, + ], + format: invoiceSchema, + stream: true, + options: { + num_predict: 512, + temperature: 0.0, + }, + }), + }); + + if (!response.ok) { + throw new Error(`Ollama API error: ${response.status}`); + } + + const reader = response.body?.getReader(); + if (!reader) { + throw new Error('No response body'); + } + + const decoder = new TextDecoder(); + let fullText = ''; + + while (true) { + const { done, value } = await reader.read(); + if (done) break; + + const chunk = decoder.decode(value, { stream: true }); + const lines = chunk.split('\n').filter((l) => l.trim()); + + for (const line of lines) { + try { + const json = JSON.parse(line); + if (json.message?.content) { + fullText += json.message.content; + } + } catch { + // Skip invalid JSON lines + } + } + } + + // Parse JSON response + let jsonStr = fullText.trim(); + + if (jsonStr.startsWith('```json')) jsonStr = jsonStr.slice(7); + else if (jsonStr.startsWith('```')) jsonStr = jsonStr.slice(3); + if (jsonStr.endsWith('```')) jsonStr = jsonStr.slice(0, -3); + jsonStr = jsonStr.trim(); + + const startIdx = jsonStr.indexOf('{'); + const endIdx = jsonStr.lastIndexOf('}') + 1; + + if (startIdx < 0 || endIdx <= startIdx) { + throw new Error(`No JSON found: 
${fullText.substring(0, 200)}`); + } + + const parsed = JSON.parse(jsonStr.substring(startIdx, endIdx)); + + return { + invoice_number: parsed.invoice_number || null, + invoice_date: parsed.invoice_date || null, + vendor_name: parsed.vendor_name || null, + currency: parsed.currency || 'EUR', + net_amount: parseFloat(parsed.net_amount) || 0, + vat_amount: parseFloat(parsed.vat_amount) || 0, + total_amount: parseFloat(parsed.total_amount) || 0, + }; +} + +/** + * Extract with consensus voting (2 agreeing passes) + */ +async function extractWithConsensus(images: string[], name: string, maxPasses: number = 3): Promise { + const results: Array<{ invoice: IInvoice; hash: string }> = []; + const hashCounts: Map = new Map(); + + for (let pass = 1; pass <= maxPasses; pass++) { + try { + const invoice = await extractInvoiceFromImages(images); + const hash = `${invoice.invoice_number}|${invoice.invoice_date}|${invoice.total_amount?.toFixed(2)}`; + results.push({ invoice, hash }); + hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1); + + console.log(` [Pass ${pass}] ${invoice.invoice_number} | ${invoice.invoice_date} | ${invoice.total_amount} ${invoice.currency}`); + + if (hashCounts.get(hash)! 
>= 2) { + console.log(` [Consensus] Reached after ${pass} passes`); + return invoice; + } + } catch (err) { + console.log(` [Pass ${pass}] Error: ${err}`); + } + } + + // Return most common result + let bestHash = ''; + let bestCount = 0; + for (const [hash, count] of hashCounts) { + if (count > bestCount) { + bestCount = count; + bestHash = hash; + } + } + + if (!bestHash) throw new Error(`No valid results for ${name}`); + + console.log(` [No consensus] Using best result (${bestCount}/${maxPasses})`); + return results.find((r) => r.hash === bestHash)!.invoice; +} + +/** + * Normalize date to YYYY-MM-DD + */ +function normalizeDate(dateStr: string | null): string { + if (!dateStr) return ''; + if (/^\d{4}-\d{2}-\d{2}$/.test(dateStr)) return dateStr; + + const monthMap: Record = { + JAN: '01', FEB: '02', MAR: '03', APR: '04', MAY: '05', JUN: '06', + JUL: '07', AUG: '08', SEP: '09', OCT: '10', NOV: '11', DEC: '12', + }; + + let match = dateStr.match(/^(\d{1,2})-([A-Z]{3})-(\d{4})$/i); + if (match) { + return `${match[3]}-${monthMap[match[2].toUpperCase()] || '01'}-${match[1].padStart(2, '0')}`; + } + + match = dateStr.match(/^(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})$/); + if (match) { + return `${match[3]}-${match[2].padStart(2, '0')}-${match[1].padStart(2, '0')}`; + } + + return dateStr; +} + +/** + * Compare extracted vs expected + */ +function compareInvoice(extracted: IInvoice, expected: IInvoice): { match: boolean; errors: string[] } { + const errors: string[] = []; + + const extNum = extracted.invoice_number?.replace(/\s/g, '').toLowerCase() || ''; + const expNum = expected.invoice_number?.replace(/\s/g, '').toLowerCase() || ''; + if (extNum !== expNum) { + errors.push(`invoice_number: expected "${expected.invoice_number}", got "${extracted.invoice_number}"`); + } + + if (normalizeDate(extracted.invoice_date) !== normalizeDate(expected.invoice_date)) { + errors.push(`invoice_date: expected "${expected.invoice_date}", got "${extracted.invoice_date}"`); + } + + if 
(Math.abs(extracted.total_amount - expected.total_amount) > 0.02) { + errors.push(`total_amount: expected ${expected.total_amount}, got ${extracted.total_amount}`); + } + + if (extracted.currency?.toUpperCase() !== expected.currency?.toUpperCase()) { + errors.push(`currency: expected "${expected.currency}", got "${extracted.currency}"`); + } + + return { match: errors.length === 0, errors }; +} + +/** + * Find test cases + */ +function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> { + const testDir = path.join(process.cwd(), '.nogit/invoices'); + if (!fs.existsSync(testDir)) return []; + + const files = fs.readdirSync(testDir); + const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = []; + + for (const pdf of files.filter((f) => f.endsWith('.pdf'))) { + const baseName = pdf.replace('.pdf', ''); + const jsonFile = `${baseName}.json`; + if (files.includes(jsonFile)) { + testCases.push({ + name: baseName, + pdfPath: path.join(testDir, pdf), + jsonPath: path.join(testDir, jsonFile), + }); + } + } + + return testCases.sort((a, b) => a.name.localeCompare(b.name)); +} + +// Tests + +tap.test('setup: ensure Ministral 3 is running', async () => { + console.log('\n[Setup] Checking Ministral 3...\n'); + const ok = await ensureMinistral3(); + expect(ok).toBeTrue(); + console.log('\n[Setup] Ready!\n'); +}); + +const testCases = findTestCases(); +console.log(`\nFound ${testCases.length} invoice test cases (Ministral 3 Vision Direct)\n`); + +let passedCount = 0; +let failedCount = 0; +const times: number[] = []; + +for (const testCase of testCases) { + tap.test(`should extract invoice: ${testCase.name}`, async () => { + const expected: IInvoice = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8')); + console.log(`\n=== ${testCase.name} ===`); + console.log(`Expected: ${expected.invoice_number} | ${expected.invoice_date} | ${expected.total_amount} ${expected.currency}`); + + const start = Date.now(); + const images = 
convertPdfToImages(testCase.pdfPath); + console.log(` Pages: ${images.length}`); + + const extracted = await extractWithConsensus(images, testCase.name); + const elapsed = Date.now() - start; + times.push(elapsed); + + const result = compareInvoice(extracted, expected); + + if (result.match) { + passedCount++; + console.log(` Result: MATCH (${(elapsed / 1000).toFixed(1)}s)`); + } else { + failedCount++; + console.log(` Result: MISMATCH (${(elapsed / 1000).toFixed(1)}s)`); + result.errors.forEach((e) => console.log(` - ${e}`)); + } + + expect(result.match).toBeTrue(); + }); +} + +tap.test('summary', async () => { + const total = testCases.length; + const accuracy = total > 0 ? (passedCount / total) * 100 : 0; + const totalTime = times.reduce((a, b) => a + b, 0) / 1000; + const avgTime = times.length > 0 ? totalTime / times.length : 0; + + console.log(`\n======================================================`); + console.log(` Invoice Extraction Summary (Ministral 3 Vision)`); + console.log(`======================================================`); + console.log(` Method: Ministral 3 8B Vision (Direct)`); + console.log(` Passed: ${passedCount}/${total}`); + console.log(` Failed: ${failedCount}/${total}`); + console.log(` Accuracy: ${accuracy.toFixed(1)}%`); + console.log(`------------------------------------------------------`); + console.log(` Total time: ${totalTime.toFixed(1)}s`); + console.log(` Avg per inv: ${avgTime.toFixed(1)}s`); + console.log(`======================================================\n`); +}); + +export default tap.start(); diff --git a/test/test.invoices.paddleocr-vl.ts b/test/test.invoices.paddleocr-vl.ts index 9d01750..6db104e 100644 --- a/test/test.invoices.paddleocr-vl.ts +++ b/test/test.invoices.paddleocr-vl.ts @@ -90,61 +90,71 @@ async function parseDocument(imageBase64: string): Promise { } /** - * Extract invoice fields from structured HTML using Qwen2.5 (text-only model) + * Sanitize HTML to remove OCR artifacts that confuse the LLM + 
* Minimal cleaning - only remove truly problematic patterns + */ +function sanitizeHtml(html: string): string { + // Remove excessively repeated characters (OCR glitches) + let sanitized = html.replace(/(\d)\1{20,}/g, '$1...'); + // Remove extremely long strings (corrupted data) + sanitized = sanitized.replace(/\b[A-Za-z0-9]{50,}\b/g, '[OCR_ARTIFACT]'); + return sanitized; +} + +/** + * Extract invoice fields using simple direct prompt + * The OCR output has clearly labeled fields - just ask the LLM to read them */ async function extractInvoiceFromHtml(html: string): Promise { - // Truncate if too long (HTML is more valuable per byte, allow more) - const truncated = html.length > 16000 ? html.slice(0, 16000) : html; - console.log(` [Extract] Processing ${truncated.length} chars of HTML`); + const sanitized = sanitizeHtml(html); + const truncated = sanitized.length > 32000 ? sanitized.slice(0, 32000) : sanitized; + console.log(` [Extract] ${truncated.length} chars of HTML`); - const prompt = `You are an invoice data extractor. Extract the following fields from this HTML document (OCR output with semantic structure) and return ONLY a valid JSON object. - -The HTML uses semantic tags: -- with / for structured tables (invoice line items, totals) --
for document header (company info, invoice number) --
for document footer (payment terms, legal text) --
for table regions -- data-type and data-y attributes indicate block type and vertical position - -Required fields: -- invoice_number: The invoice/receipt/document number -- invoice_date: Date in YYYY-MM-DD format (convert from any format) -- vendor_name: Company that issued the invoice -- currency: EUR, USD, GBP, etc. -- net_amount: Amount before tax (number) -- vat_amount: Tax/VAT amount (number, use 0 if reverse charge or not shown) -- total_amount: Final total amount (number) - -Example output format: -{"invoice_number":"INV-123","invoice_date":"2022-01-28","vendor_name":"Adobe","currency":"EUR","net_amount":24.99,"vat_amount":0,"total_amount":24.99} - -Rules: -- Return ONLY the JSON object, no explanation or markdown -- Use null for missing string fields -- Use 0 for missing numeric fields -- Convert dates to YYYY-MM-DD format (e.g., "28-JAN-2022" becomes "2022-01-28") -- Extract numbers without currency symbols -- Look for totals in
sections, especially rows with "Total", "Amount Due", "Grand Total" - -HTML Document: -${truncated} - -JSON:`; - - const payload = { - model: TEXT_MODEL, - prompt, - stream: true, - options: { - num_predict: 512, - temperature: 0.1, + // JSON schema for structured output + const invoiceSchema = { + type: 'object', + properties: { + invoice_number: { type: 'string' }, + invoice_date: { type: 'string' }, + vendor_name: { type: 'string' }, + currency: { type: 'string' }, + net_amount: { type: 'number' }, + vat_amount: { type: 'number' }, + total_amount: { type: 'number' }, }, + required: ['invoice_number', 'invoice_date', 'vendor_name', 'currency', 'net_amount', 'vat_amount', 'total_amount'], }; - const response = await fetch(`${OLLAMA_URL}/api/generate`, { + // Simple, direct prompt - the OCR output already has labeled fields + const systemPrompt = `You read invoice HTML and extract labeled fields. Return JSON only.`; + + const userPrompt = `Extract from this invoice HTML: +- invoice_number: Find "Invoice no.", "Invoice #", "Invoice", "Rechnung", "Document No" and extract the value +- invoice_date: Find "Invoice date", "Date", "Datum" and convert to YYYY-MM-DD format +- vendor_name: The company name issuing the invoice (in header/letterhead) +- currency: EUR, USD, or GBP (look for € $ £ symbols or text) +- total_amount: Find "Total", "Grand Total", "Amount Due", "Gesamtbetrag" - the FINAL total amount +- net_amount: Amount before VAT/tax (Subtotal, Net) +- vat_amount: VAT/tax amount + +HTML: +${truncated} + +Return ONLY valid JSON: {"invoice_number":"...", "invoice_date":"YYYY-MM-DD", "vendor_name":"...", "currency":"EUR", "net_amount":0, "vat_amount":0, "total_amount":0}`; + + const response = await fetch(`${OLLAMA_URL}/api/chat`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify(payload), + body: JSON.stringify({ + model: TEXT_MODEL, + messages: [ + { role: 'system', content: systemPrompt }, + { role: 'user', content: 
userPrompt }, + ], + format: invoiceSchema, + stream: true, + options: { num_predict: 512, temperature: 0.0 }, + }), }); if (!response.ok) { @@ -169,7 +179,9 @@ JSON:`; for (const line of lines) { try { const json = JSON.parse(line); - if (json.response) { + if (json.message?.content) { + fullText += json.message.content; + } else if (json.response) { fullText += json.response; } } catch { @@ -179,17 +191,37 @@ JSON:`; } // Extract JSON from response - const startIdx = fullText.indexOf('{'); - const endIdx = fullText.lastIndexOf('}') + 1; + let jsonStr = fullText.trim(); + + // Remove markdown code block if present + if (jsonStr.startsWith('```json')) { + jsonStr = jsonStr.slice(7); + } else if (jsonStr.startsWith('```')) { + jsonStr = jsonStr.slice(3); + } + if (jsonStr.endsWith('```')) { + jsonStr = jsonStr.slice(0, -3); + } + jsonStr = jsonStr.trim(); + + // Find JSON object boundaries + const startIdx = jsonStr.indexOf('{'); + const endIdx = jsonStr.lastIndexOf('}') + 1; if (startIdx < 0 || endIdx <= startIdx) { throw new Error(`No JSON object found in response: ${fullText.substring(0, 200)}`); } - const jsonStr = fullText.substring(startIdx, endIdx); - const parsed = JSON.parse(jsonStr); + jsonStr = jsonStr.substring(startIdx, endIdx); - // Ensure numeric fields are actually numbers + let parsed; + try { + parsed = JSON.parse(jsonStr); + } catch (e) { + throw new Error(`Invalid JSON: ${jsonStr.substring(0, 200)}`); + } + + // Normalize response to expected format return { invoice_number: parsed.invoice_number || null, invoice_date: parsed.invoice_date || null, @@ -203,14 +235,23 @@ JSON:`; /** * Single extraction pass: Parse with PaddleOCR-VL Full, extract with Qwen2.5 (text-only) + * Processes ALL pages and concatenates HTML for multi-page invoice support */ async function extractOnce(images: string[], passNum: number): Promise { - // Parse document with full pipeline (PaddleOCR-VL) -> returns HTML - const html = await parseDocument(images[0]); - 
console.log(` [Parse] Got ${html.split('\n').length} lines of HTML`); + // Parse ALL pages and concatenate HTML with page markers + const htmlParts: string[] = []; + + for (let i = 0; i < images.length; i++) { + const pageHtml = await parseDocument(images[i]); + // Add page marker for context + htmlParts.push(`\n${pageHtml}`); + } + + const fullHtml = htmlParts.join('\n\n'); + console.log(` [Parse] Got ${fullHtml.split('\n').length} lines from ${images.length} page(s)`); // Extract invoice fields from HTML using text-only model (no images) - return extractInvoiceFromHtml(html); + return extractInvoiceFromHtml(fullHtml); } /**