feat(vision): add Qwen3-VL vision model support with Dockerfile and tests; improve invoice OCR conversion and prompts; simplify extraction flow by removing consensus voting

This commit is contained in:
2026-01-18 03:35:05 +00:00
parent d237ad19f4
commit 3780105c6f
6 changed files with 435 additions and 70 deletions

View File

@@ -36,8 +36,9 @@ function convertPdfToImages(pdfPath: string): string[] {
const outputPattern = path.join(tempDir, 'page-%d.png');
try {
// High quality conversion: 300 DPI, max quality, sharpen for better OCR
execSync(
`convert -density 200 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
`convert -density 300 -quality 100 "${pdfPath}" -background white -alpha remove -sharpen 0x1 "${outputPattern}"`,
{ stdio: 'pipe' }
);
@@ -77,18 +78,35 @@ async function extractInvoiceFromImages(images: string[]): Promise<IInvoice> {
required: ['invoice_number', 'invoice_date', 'vendor_name', 'currency', 'net_amount', 'vat_amount', 'total_amount'],
};
const prompt = `Extract invoice data from this document image(s).
const prompt = `You are an expert invoice data extraction system. Carefully analyze this invoice document and extract the following fields with high precision.
Find and return:
- invoice_number: The invoice number/ID (look for "Invoice No", "Invoice #", "Rechnung Nr")
- invoice_date: The invoice date in YYYY-MM-DD format
- vendor_name: The company issuing the invoice (in letterhead)
- currency: EUR, USD, or GBP
- total_amount: The FINAL total amount due
- net_amount: Amount before VAT/tax
- vat_amount: VAT/tax amount
INVOICE NUMBER:
- Look for labels: "Invoice No", "Invoice #", "Invoice Number", "Rechnung Nr", "Rechnungsnummer", "Document No", "Bill No", "Reference"
- Usually alphanumeric, often starts with letters (e.g., R0014359508, INV-2024-001)
- Located near the top of the invoice
Return ONLY valid JSON.`;
INVOICE DATE:
- Look for labels: "Invoice Date", "Date", "Datum", "Rechnungsdatum", "Issue Date", "Bill Date"
- Convert ANY date format to YYYY-MM-DD (e.g., 14/10/2021 → 2021-10-14, Oct 14, 2021 → 2021-10-14)
- Usually near the invoice number
VENDOR NAME:
- The company ISSUING the invoice (not the recipient)
- Found in letterhead, logo area, or header - typically the largest/most prominent company name
- Examples: "Hetzner Online GmbH", "Adobe Inc", "DigitalOcean LLC"
CURRENCY:
- Detect from symbols: € = EUR, $ = USD, £ = GBP
- Or from text: "EUR", "USD", "GBP"
- Default to EUR if unclear
AMOUNTS (Critical - read carefully!):
- total_amount: The FINAL amount due/payable - look for "Total", "Grand Total", "Amount Due", "Balance Due", "Gesamtbetrag", "Endbetrag"
- net_amount: Subtotal BEFORE tax - look for "Subtotal", "Net", "Netto", "excl. VAT"
- vat_amount: Tax amount - look for "VAT", "Tax", "MwSt", "USt", "19%", "20%"
- For multi-page invoices: the FINAL totals are usually on the LAST page
Return ONLY valid JSON with the extracted values.`;
const response = await fetch(`${OLLAMA_URL}/api/chat`, {
method: 'POST',
@@ -105,7 +123,7 @@ Return ONLY valid JSON.`;
format: invoiceSchema,
stream: true,
options: {
num_predict: 512,
num_predict: 1024,
temperature: 0.0,
},
}),
@@ -170,46 +188,6 @@ Return ONLY valid JSON.`;
};
}
/**
 * Repeatedly run invoice extraction and return a result as soon as two passes agree.
 *
 * Two passes "agree" when their invoice number, invoice date and total amount
 * (rounded to two decimals) produce the same fingerprint. A pass that throws is
 * logged and skipped. If no fingerprint is seen twice within `maxPasses`, the
 * most frequent result is returned; ties go to the one seen first.
 *
 * @param images - Page images handed unchanged to `extractInvoiceFromImages`.
 * @param name - Label used only in the error message when every pass fails.
 * @param maxPasses - Upper bound on extraction attempts (default 3).
 * @returns The invoice from the pass that completed the consensus, or the first
 *   invoice of the most frequent fingerprint when no consensus was reached.
 * @throws Error if no pass produced a result.
 */
async function extractWithConsensus(images: string[], name: string, maxPasses: number = 3): Promise<IInvoice> {
  // One entry per fingerprint: the first invoice that produced it and how many passes matched.
  // Map keeps insertion order, so iteration below visits fingerprints in first-seen order.
  const tally = new Map<string, { first: IInvoice; votes: number }>();

  for (let attempt = 1; attempt <= maxPasses; attempt += 1) {
    try {
      const candidate = await extractInvoiceFromImages(images);
      const fingerprint = `${candidate.invoice_number}|${candidate.invoice_date}|${candidate.total_amount?.toFixed(2)}`;

      const seen = tally.get(fingerprint);
      const votes = (seen === undefined ? 0 : seen.votes) + 1;
      tally.set(fingerprint, { first: seen === undefined ? candidate : seen.first, votes });

      console.log(` [Pass ${attempt}] ${candidate.invoice_number} | ${candidate.invoice_date} | ${candidate.total_amount} ${candidate.currency}`);

      if (votes >= 2) {
        console.log(` [Consensus] Reached after ${attempt} passes`);
        // Hand back the invoice from the pass that completed the agreement.
        return candidate;
      }
    } catch (err) {
      // A failed pass does not abort the run; remaining passes may still succeed.
      console.log(` [Pass ${attempt}] Error: ${err}`);
    }
  }

  // No consensus: pick the fingerprint with the most votes (strictly greater, so first-seen wins ties).
  let winner: { first: IInvoice; votes: number } | undefined;
  for (const entry of tally.values()) {
    if (winner === undefined || entry.votes > winner.votes) {
      winner = entry;
    }
  }

  if (winner === undefined) {
    throw new Error(`No valid results for ${name}`);
  }

  console.log(` [No consensus] Using best result (${winner.votes}/${maxPasses})`);
  return winner.first;
}
/**
* Normalize date to YYYY-MM-DD
@@ -314,7 +292,8 @@ for (const testCase of testCases) {
const images = convertPdfToImages(testCase.pdfPath);
console.log(` Pages: ${images.length}`);
const extracted = await extractWithConsensus(images, testCase.name);
const extracted = await extractInvoiceFromImages(images);
console.log(` Extracted: ${extracted.invoice_number} | ${extracted.invoice_date} | ${extracted.total_amount} ${extracted.currency}`);
const elapsed = Date.now() - start;
times.push(elapsed);