feat(vision): add Qwen3-VL vision model support with Dockerfile and tests; improve invoice OCR conversion and prompts; simplify extraction flow by removing consensus voting
This commit is contained in:
@@ -36,8 +36,9 @@ function convertPdfToImages(pdfPath: string): string[] {
|
||||
const outputPattern = path.join(tempDir, 'page-%d.png');
|
||||
|
||||
try {
|
||||
+ // High quality conversion: 300 DPI, max quality, sharpen for better OCR
execSync(
- `convert -density 200 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
+ `convert -density 300 -quality 100 "${pdfPath}" -background white -alpha remove -sharpen 0x1 "${outputPattern}"`,
|
||||
{ stdio: 'pipe' }
|
||||
);
|
||||
|
||||
@@ -77,18 +78,35 @@ async function extractInvoiceFromImages(images: string[]): Promise<IInvoice> {
|
||||
required: ['invoice_number', 'invoice_date', 'vendor_name', 'currency', 'net_amount', 'vat_amount', 'total_amount'],
|
||||
};
|
||||
|
||||
- const prompt = `Extract invoice data from this document image(s).
+ const prompt = `You are an expert invoice data extraction system. Carefully analyze this invoice document and extract the following fields with high precision.

- Find and return:
- - invoice_number: The invoice number/ID (look for "Invoice No", "Invoice #", "Rechnung Nr")
- - invoice_date: The invoice date in YYYY-MM-DD format
- - vendor_name: The company issuing the invoice (in letterhead)
- - currency: EUR, USD, or GBP
- - total_amount: The FINAL total amount due
- - net_amount: Amount before VAT/tax
- - vat_amount: VAT/tax amount
+ INVOICE NUMBER:
+ - Look for labels: "Invoice No", "Invoice #", "Invoice Number", "Rechnung Nr", "Rechnungsnummer", "Document No", "Bill No", "Reference"
+ - Usually alphanumeric, often starts with letters (e.g., R0014359508, INV-2024-001)
+ - Located near the top of the invoice

- Return ONLY valid JSON.`;
+ INVOICE DATE:
+ - Look for labels: "Invoice Date", "Date", "Datum", "Rechnungsdatum", "Issue Date", "Bill Date"
+ - Convert ANY date format to YYYY-MM-DD (e.g., 14/10/2021 → 2021-10-14, Oct 14, 2021 → 2021-10-14)
+ - Usually near the invoice number
+
+ VENDOR NAME:
+ - The company ISSUING the invoice (not the recipient)
+ - Found in letterhead, logo area, or header - typically the largest/most prominent company name
+ - Examples: "Hetzner Online GmbH", "Adobe Inc", "DigitalOcean LLC"
+
+ CURRENCY:
+ - Detect from symbols: € = EUR, $ = USD, £ = GBP
+ - Or from text: "EUR", "USD", "GBP"
+ - Default to EUR if unclear
+
+ AMOUNTS (Critical - read carefully!):
+ - total_amount: The FINAL amount due/payable - look for "Total", "Grand Total", "Amount Due", "Balance Due", "Gesamtbetrag", "Endbetrag"
+ - net_amount: Subtotal BEFORE tax - look for "Subtotal", "Net", "Netto", "excl. VAT"
+ - vat_amount: Tax amount - look for "VAT", "Tax", "MwSt", "USt", "19%", "20%"
+ - For multi-page invoices: the FINAL totals are usually on the LAST page
+
+ Return ONLY valid JSON with the extracted values.`;
|
||||
|
||||
const response = await fetch(`${OLLAMA_URL}/api/chat`, {
|
||||
method: 'POST',
|
||||
@@ -105,7 +123,7 @@ Return ONLY valid JSON.`;
|
||||
format: invoiceSchema,
|
||||
stream: true,
|
||||
options: {
|
||||
- num_predict: 512,
+ num_predict: 1024,
|
||||
temperature: 0.0,
|
||||
},
|
||||
}),
|
||||
@@ -170,46 +188,6 @@ Return ONLY valid JSON.`;
|
||||
};
|
||||
}
|
||||
|
||||
- /**
- * Extract with consensus voting (2 agreeing passes)
- */
- async function extractWithConsensus(images: string[], name: string, maxPasses: number = 3): Promise<IInvoice> {
- const results: Array<{ invoice: IInvoice; hash: string }> = [];
- const hashCounts: Map<string, number> = new Map();
-
- for (let pass = 1; pass <= maxPasses; pass++) {
- try {
- const invoice = await extractInvoiceFromImages(images);
- const hash = `${invoice.invoice_number}|${invoice.invoice_date}|${invoice.total_amount?.toFixed(2)}`;
- results.push({ invoice, hash });
- hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1);
-
- console.log(` [Pass ${pass}] ${invoice.invoice_number} | ${invoice.invoice_date} | ${invoice.total_amount} ${invoice.currency}`);
-
- if (hashCounts.get(hash)! >= 2) {
- console.log(` [Consensus] Reached after ${pass} passes`);
- return invoice;
- }
- } catch (err) {
- console.log(` [Pass ${pass}] Error: ${err}`);
- }
- }
-
- // Return most common result
- let bestHash = '';
- let bestCount = 0;
- for (const [hash, count] of hashCounts) {
- if (count > bestCount) {
- bestCount = count;
- bestHash = hash;
- }
- }
-
- if (!bestHash) throw new Error(`No valid results for ${name}`);
-
- console.log(` [No consensus] Using best result (${bestCount}/${maxPasses})`);
- return results.find((r) => r.hash === bestHash)!.invoice;
- }
|
||||
|
||||
/**
|
||||
* Normalize date to YYYY-MM-DD
|
||||
@@ -314,7 +292,8 @@ for (const testCase of testCases) {
|
||||
const images = convertPdfToImages(testCase.pdfPath);
|
||||
console.log(` Pages: ${images.length}`);
|
||||
|
||||
- const extracted = await extractWithConsensus(images, testCase.name);
+ const extracted = await extractInvoiceFromImages(images);
+ console.log(` Extracted: ${extracted.invoice_number} | ${extracted.invoice_date} | ${extracted.total_amount} ${extracted.currency}`);
|
||||
const elapsed = Date.now() - start;
|
||||
times.push(elapsed);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user