|
|
|
|
@@ -15,11 +15,12 @@ import * as fs from 'fs';
|
|
|
|
|
import * as path from 'path';
|
|
|
|
|
import { execSync } from 'child_process';
|
|
|
|
|
import * as os from 'os';
|
|
|
|
|
import { ensurePaddleOcrVlFull, ensureMiniCpm } from './helpers/docker.js';
|
|
|
|
|
import { ensurePaddleOcrVlFull, ensureQwen25 } from './helpers/docker.js';
|
|
|
|
|
|
|
|
|
|
// Local HTTP endpoint on port 8000.
// NOTE(review): presumably the PaddleOCR-VL service called by parseDocument — its usage is not visible in this excerpt; confirm.
const PADDLEOCR_VL_URL = 'http://localhost:8000';
|
|
|
|
|
// Local HTTP endpoint on port 11434.
// NOTE(review): presumably the Ollama server that serves the models below — the request code is not visible in this excerpt; confirm.
const OLLAMA_URL = 'http://localhost:11434';
|
|
|
|
|
// Model tag for MiniCPM-V; appears as a `model` value in the extraction request payload.
const MINICPM_MODEL = 'minicpm-v:latest';
|
|
|
|
|
// Use Qwen2.5 for text-only JSON extraction (not MiniCPM, which is vision-focused).
// TEXT_MODEL also appears as a `model` value in the extraction request payload.
|
|
|
|
|
const TEXT_MODEL = 'qwen2.5:7b';
|
|
|
|
|
|
|
|
|
|
interface IInvoice {
|
|
|
|
|
invoice_number: string;
|
|
|
|
|
@@ -87,42 +88,45 @@ async function parseDocument(imageBase64: string): Promise<string> {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Extract invoice fields from structured Markdown using MiniCPM with image context
|
|
|
|
|
* Extract invoice fields from structured Markdown using Qwen2.5 (text-only model)
|
|
|
|
|
*/
|
|
|
|
|
async function extractInvoiceFromMarkdown(markdown: string, images: string[]): Promise<IInvoice> {
|
|
|
|
|
async function extractInvoiceFromMarkdown(markdown: string): Promise<IInvoice> {
|
|
|
|
|
// Truncate if too long
|
|
|
|
|
const truncated = markdown.length > 8000 ? markdown.slice(0, 8000) : markdown;
|
|
|
|
|
const truncated = markdown.length > 12000 ? markdown.slice(0, 12000) : markdown;
|
|
|
|
|
console.log(` [Extract] Processing ${truncated.length} chars of Markdown`);
|
|
|
|
|
|
|
|
|
|
const prompt = `/nothink
|
|
|
|
|
You are an invoice parser. Extract fields from this invoice image.
|
|
|
|
|
const prompt = `You are an invoice data extractor. Extract the following fields from this OCR text and return ONLY a valid JSON object.
|
|
|
|
|
|
|
|
|
|
Required fields:
|
|
|
|
|
- invoice_number: The invoice/receipt number
|
|
|
|
|
- invoice_date: Date in YYYY-MM-DD format
|
|
|
|
|
- invoice_number: The invoice/receipt/document number
|
|
|
|
|
- invoice_date: Date in YYYY-MM-DD format (convert from any format)
|
|
|
|
|
- vendor_name: Company that issued the invoice
|
|
|
|
|
- currency: EUR, USD, etc.
|
|
|
|
|
- net_amount: Amount before tax
|
|
|
|
|
- vat_amount: Tax/VAT amount (0 if reverse charge)
|
|
|
|
|
- total_amount: Final amount due
|
|
|
|
|
- currency: EUR, USD, GBP, etc.
|
|
|
|
|
- net_amount: Amount before tax (number)
|
|
|
|
|
- vat_amount: Tax/VAT amount (number, use 0 if reverse charge or not shown)
|
|
|
|
|
- total_amount: Final total amount (number)
|
|
|
|
|
|
|
|
|
|
Return ONLY a JSON object like:
|
|
|
|
|
{"invoice_number":"123","invoice_date":"2022-01-28","vendor_name":"Adobe","currency":"EUR","net_amount":24.99,"vat_amount":0,"total_amount":24.99}
|
|
|
|
|
Example output format:
|
|
|
|
|
{"invoice_number":"INV-123","invoice_date":"2022-01-28","vendor_name":"Adobe","currency":"EUR","net_amount":24.99,"vat_amount":0,"total_amount":24.99}
|
|
|
|
|
|
|
|
|
|
Use null for missing strings, 0 for missing numbers. No explanation.
|
|
|
|
|
Rules:
|
|
|
|
|
- Return ONLY the JSON object, no explanation or markdown
|
|
|
|
|
- Use null for missing string fields
|
|
|
|
|
- Use 0 for missing numeric fields
|
|
|
|
|
- Convert dates to YYYY-MM-DD format (e.g., "28-JAN-2022" becomes "2022-01-28")
|
|
|
|
|
- Extract numbers without currency symbols
|
|
|
|
|
|
|
|
|
|
OCR text from the invoice (for reference):
|
|
|
|
|
---
|
|
|
|
|
OCR Text:
|
|
|
|
|
${truncated}
|
|
|
|
|
---`;
|
|
|
|
|
|
|
|
|
|
JSON:`;
|
|
|
|
|
|
|
|
|
|
const payload = {
|
|
|
|
|
model: MINICPM_MODEL,
|
|
|
|
|
model: TEXT_MODEL,
|
|
|
|
|
prompt,
|
|
|
|
|
images, // Send the actual image to MiniCPM
|
|
|
|
|
stream: true,
|
|
|
|
|
options: {
|
|
|
|
|
num_predict: 2048,
|
|
|
|
|
num_predict: 512,
|
|
|
|
|
temperature: 0.1,
|
|
|
|
|
},
|
|
|
|
|
};
|
|
|
|
|
@@ -173,26 +177,41 @@ ${truncated}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const jsonStr = fullText.substring(startIdx, endIdx);
|
|
|
|
|
return JSON.parse(jsonStr);
|
|
|
|
|
const parsed = JSON.parse(jsonStr);
|
|
|
|
|
|
|
|
|
|
// Ensure numeric fields are actually numbers
|
|
|
|
|
return {
|
|
|
|
|
invoice_number: parsed.invoice_number || null,
|
|
|
|
|
invoice_date: parsed.invoice_date || null,
|
|
|
|
|
vendor_name: parsed.vendor_name || null,
|
|
|
|
|
currency: parsed.currency || 'EUR',
|
|
|
|
|
net_amount: parseFloat(parsed.net_amount) || 0,
|
|
|
|
|
vat_amount: parseFloat(parsed.vat_amount) || 0,
|
|
|
|
|
total_amount: parseFloat(parsed.total_amount) || 0,
|
|
|
|
|
};
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Single extraction pass: parse the document with PaddleOCR-VL Full, then
 * extract the invoice fields from the resulting Markdown with the text-only
 * model.
 *
 * Only the first entry of `images` is parsed. The Markdown is the sole input
 * to the field extraction step; no image is forwarded to it.
 *
 * @param images - Page images for the invoice; only `images[0]` is read here.
 * @param passNum - Pass number supplied by the caller. It is not used in this
 *   function body and is kept so existing call sites stay valid.
 * @returns The invoice fields returned by `extractInvoiceFromMarkdown`.
 */
async function extractOnce(images: string[], passNum: number): Promise<IInvoice> {
  // Parse document with full pipeline (PaddleOCR-VL)
  const markdown = await parseDocument(images[0]);
  console.log(` [Parse] Got ${markdown.split('\n').length} lines of Markdown`);

  // Extract invoice fields from Markdown using text-only model (no images).
  // The earlier two-argument call that also passed `images` was stale: it
  // shadowed this return and no longer matched the one-parameter signature.
  return extractInvoiceFromMarkdown(markdown);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Create a comparison key for an invoice from its key fields.
 *
 * The key has the form `invoice_number|invoice_date|total_amount`, with the
 * amount rendered to two decimals whenever it can be read as a number. That
 * way `24.9` and `"24.9"` produce the same key.
 *
 * @param invoice - Invoice to derive the key from. `total_amount` is declared
 *   numeric but is defended against string/missing values at runtime, since
 *   the object may originate from parsed model output.
 * @returns The pipe-separated key string.
 */
function hashInvoice(invoice: IInvoice): string {
  // Read through `unknown` so the runtime checks below are not optimized away
  // by the declared (numeric) field type.
  const raw: unknown = invoice.total_amount;

  // Numeric strings are parsed so they format the same way as real numbers.
  const numeric = typeof raw === 'number' ? raw : Number.parseFloat(String(raw ?? ''));

  // Unparseable values keep their raw text (or 0 when missing/empty) so that
  // distinct garbage values do not collapse into one key.
  const amount = Number.isFinite(numeric) ? numeric.toFixed(2) : String(raw || 0);

  return `${invoice.invoice_number}|${invoice.invoice_date}|${amount}`;
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
@@ -243,6 +262,43 @@ async function extractWithConsensus(images: string[], invoiceName: string, maxPa
|
|
|
|
|
return best.invoice;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Normalize a date string to YYYY-MM-DD format.
 *
 * Recognized inputs (after trimming surrounding whitespace):
 * - `YYYY-MM-DD` — returned as is
 * - `DD-MMM-YYYY` with an English three-letter month, any case (e.g. "28-JUN-2022")
 * - `DD/MM/YYYY` or `DD.MM.YYYY` (day first)
 *
 * Anything else, including a `DD-MMM-YYYY` string whose month abbreviation
 * is not recognized, is returned trimmed but otherwise unchanged, so callers
 * never see a date that was not in the input.
 *
 * @param dateStr - Raw date text, or null when no date was extracted.
 * @returns The normalized date, the trimmed input when the format is not
 *   recognized, or an empty string for null/empty input.
 */
function normalizeDate(dateStr: string | null): string {
  if (!dateStr) return '';

  // OCR/LLM output may carry stray whitespace that would defeat the anchored patterns.
  const value = dateStr.trim();

  // Already in correct format
  if (/^\d{4}-\d{2}-\d{2}$/.test(value)) {
    return value;
  }

  // Handle DD-MMM-YYYY format (e.g., "28-JUN-2022")
  const monthMap: Record<string, string> = {
    JAN: '01', FEB: '02', MAR: '03', APR: '04', MAY: '05', JUN: '06',
    JUL: '07', AUG: '08', SEP: '09', OCT: '10', NOV: '11', DEC: '12',
  };

  const named = /^(\d{1,2})-([A-Z]{3})-(\d{4})$/i.exec(value);
  if (named) {
    const month = monthMap[named[2].toUpperCase()];
    // Unknown abbreviation: leave the text alone instead of inventing January.
    if (!month) {
      return value;
    }
    return `${named[3]}-${month}-${named[1].padStart(2, '0')}`;
  }

  // Handle DD/MM/YYYY or DD.MM.YYYY
  const numeric = /^(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})$/.exec(value);
  if (numeric) {
    const day = numeric[1].padStart(2, '0');
    const month = numeric[2].padStart(2, '0');
    return `${numeric[3]}-${month}-${day}`;
  }

  return value;
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Compare extracted invoice against expected
|
|
|
|
|
*/
|
|
|
|
|
@@ -259,8 +315,10 @@ function compareInvoice(
|
|
|
|
|
errors.push(`invoice_number: expected "${expected.invoice_number}", got "${extracted.invoice_number}"`);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Compare date
|
|
|
|
|
if (extracted.invoice_date !== expected.invoice_date) {
|
|
|
|
|
// Compare date (normalize format first)
|
|
|
|
|
const extDate = normalizeDate(extracted.invoice_date);
|
|
|
|
|
const expDate = normalizeDate(expected.invoice_date);
|
|
|
|
|
if (extDate !== expDate) {
|
|
|
|
|
errors.push(`invoice_date: expected "${expected.invoice_date}", got "${extracted.invoice_date}"`);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -317,9 +375,9 @@ tap.test('setup: ensure Docker containers are running', async () => {
|
|
|
|
|
const paddleOk = await ensurePaddleOcrVlFull();
|
|
|
|
|
expect(paddleOk).toBeTrue();
|
|
|
|
|
|
|
|
|
|
// Ensure MiniCPM is running (for field extraction from Markdown)
|
|
|
|
|
const minicpmOk = await ensureMiniCpm();
|
|
|
|
|
expect(minicpmOk).toBeTrue();
|
|
|
|
|
// Ensure Qwen2.5 is available (for text-only JSON extraction)
|
|
|
|
|
const qwenOk = await ensureQwen25();
|
|
|
|
|
expect(qwenOk).toBeTrue();
|
|
|
|
|
|
|
|
|
|
console.log('\n[Setup] All containers ready!\n');
|
|
|
|
|
});
|
|
|
|
|
@@ -380,7 +438,7 @@ tap.test('summary', async () => {
|
|
|
|
|
console.log(`\n======================================================`);
|
|
|
|
|
console.log(` Invoice Extraction Summary (PaddleOCR-VL Full)`);
|
|
|
|
|
console.log(`======================================================`);
|
|
|
|
|
console.log(` Method: PaddleOCR-VL Full Pipeline -> MiniCPM`);
|
|
|
|
|
console.log(` Method: PaddleOCR-VL Full Pipeline -> Qwen2.5 (text-only)`);
|
|
|
|
|
console.log(` Passed: ${passedCount}/${totalInvoices}`);
|
|
|
|
|
console.log(` Failed: ${failedCount}/${totalInvoices}`);
|
|
|
|
|
console.log(` Accuracy: ${accuracy.toFixed(1)}%`);
|
|
|
|
|
|