feat(invoices): add hybrid OCR + vision invoice/document parsing with PaddleOCR, consensus voting, and prompt/test refactors

2026-01-16 14:24:37 +00:00
parent acded2a165
commit 82358b2d5d
4 changed files with 380 additions and 109 deletions
--- a/test/test.node.ts
+++ b/test/test.node.ts
@@ -6,8 +6,11 @@ import * as os from 'os';

 const OLLAMA_URL = 'http://localhost:11434';
 const MODEL = 'openbmb/minicpm-v4.5:q8_0';
+const PADDLEOCR_URL = 'http://localhost:5000'; // Base URL of the PaddleOCR HTTP service; extractOcrText POSTs to its /ocr endpoint

-const EXTRACT_PROMPT = `You are a bank statement parser. Extract EVERY transaction from the table.
+// Prompt for visual extraction (with images)
+const VISUAL_EXTRACT_PROMPT = `/nothink
+You are a bank statement parser. Extract EVERY transaction from the table.

 Read the Amount column carefully:
 - "- 21,47 €" means DEBIT, output as: -21.47
@@ -18,6 +21,60 @@ For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}

 Do not skip any rows. Return ONLY the JSON array, no explanation.`;

+// Prompt for OCR-only extraction (no images); buildOcrOnlyPrompt appends the OCR text after this instruction block
+const OCR_EXTRACT_PROMPT = `/nothink
+You are a bank statement parser. Extract EVERY transaction from the OCR text below.
+
+Read the Amount values carefully:
+- "- 21,47 €" means DEBIT, output as: -21.47
+- "+ 1.000,00 €" means CREDIT, output as: 1000.00
+- European format: comma = decimal point
+
+For each transaction output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
+
+Do not skip any transactions. Return ONLY the JSON array, no explanation.`;
+
+/**
+ * Build the OCR-only prompt: OCR_EXTRACT_PROMPT followed by the OCR text (capped at maxOcrLength) between '---' fences
+ */
+function buildOcrOnlyPrompt(ocrText: string): string {
+  // Limit OCR text to prevent context overflow; text past the cap is dropped and flagged with a marker line
+  const maxOcrLength = 12000; // compared against String.length, i.e. UTF-16 code units
+  const truncatedOcr = ocrText.length > maxOcrLength
+    ? ocrText.substring(0, maxOcrLength) + '\n... (truncated)'
+    : ocrText;
+
+  return `${OCR_EXTRACT_PROMPT}
+
+OCR text from bank statement:
+---
+${truncatedOcr}
+---`;
+}
+
+/**
+ * Extract OCR text from a base64 image using PaddleOCR; best-effort: resolves to '' on any failure instead of throwing
+ */
+async function extractOcrText(imageBase64: string): Promise<string> {
+  try {
+    const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ image: imageBase64 }),
+    });
+
+    if (!response.ok) return ''; // non-OK HTTP status is treated as "no OCR text"
+
+    const data = await response.json();
+    if (data.success && data.results) {
+      return data.results.map((r: { text: string }) => r.text).join('\n'); // each result's text on its own line
+    }
+  } catch {
+    // PaddleOCR unavailable (fetch/JSON failure or unexpected response shape) - fall through to ''
+  }
+  return '';
+}
+
 interface ITransaction {
  date: string;
  counterparty: string;
@@ -53,12 +110,12 @@ function convertPdfToImages(pdfPath: string): string[] {
 }

 /**
- * Single extraction pass
+ * Visual extraction pass (with images)
 */
-async function extractOnce(images: string[], passNum: number): Promise<ITransaction[]> {
+async function extractVisual(images: string[], passLabel: string): Promise<ITransaction[]> {
  const payload = {
    model: MODEL,
-    prompt: EXTRACT_PROMPT,
+    prompt: VISUAL_EXTRACT_PROMPT,
    images,
    stream: true,
    options: {
@@ -67,6 +124,31 @@ async function extractOnce(images: string[], passNum: number): Promise<ITransact
    },
  };

+  return doExtraction(payload, passLabel);
+}
+
+/**
+ * OCR-only extraction pass (no images, just text): sends the prompt from buildOcrOnlyPrompt through doExtraction
+ */
+async function extractFromOcr(ocrText: string, passLabel: string): Promise<ITransaction[]> {
+  const payload = {
+    model: MODEL,
+    prompt: buildOcrOnlyPrompt(ocrText),
+    stream: true, // doExtraction consumes the response incrementally via reader.read()
+    options: {
+      num_predict: 16384,
+      temperature: 0.1,
+    },
+  };
+
+  return doExtraction(payload, passLabel); // passLabel is used as the log prefix inside doExtraction
+}
+
+/**
+ * Common extraction logic
+ */
+async function doExtraction(payload: object, passLabel: string): Promise<ITransaction[]> {
+
  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
@@ -86,7 +168,7 @@ async function extractOnce(images: string[], passNum: number): Promise<ITransact
  let fullText = '';
  let lineBuffer = '';

-  console.log(`[Pass ${passNum}] Extracting...`);
+  console.log(`[${passLabel}] Extracting...`);

  while (true) {
    const { done, value } = await reader.read();
@@ -144,30 +226,78 @@ function hashTransactions(transactions: ITransaction[]): string {

 /**
 * Extract with majority voting - run until 2 passes match
+ * Strategy: Pass 1 = Visual (images), Pass 2 = OCR-only (text), Pass 3+ = Visual
 */
 async function extractWithConsensus(images: string[], maxPasses: number = 5): Promise<ITransaction[]> {
  const results: Array<{ transactions: ITransaction[]; hash: string }> = [];
  const hashCounts: Map<string, number> = new Map();

-  for (let pass = 1; pass <= maxPasses; pass++) {
-    const transactions = await extractOnce(images, pass);
+  const addResult = (transactions: ITransaction[], passLabel: string): number => {
    const hash = hashTransactions(transactions);
-
    results.push({ transactions, hash });
    hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1);
+    console.log(`[${passLabel}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)`);
+    return hashCounts.get(hash)!;
+  };

-    console.log(`[Pass ${pass}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)`);
+  // Run Pass 1 (Visual) in parallel with OCR extraction
+  let ocrText = '';
+  const pass1Promise = extractVisual(images, 'Pass 1 Visual').catch((err) => ({ error: err }));

-    // Check if we have consensus (2+ matching)
-    const count = hashCounts.get(hash)!;
-    if (count >= 2) {
-      console.log(`[Consensus] Reached after ${pass} passes (${count} matching results)`);
-      return transactions;
+  // Extract OCR from all pages
+  const ocrPromise = (async () => {
+    const ocrTexts: string[] = [];
+    for (let i = 0; i < images.length; i++) {
+      const pageOcr = await extractOcrText(images[i]);
+      if (pageOcr) {
+        ocrTexts.push(`--- Page ${i + 1} ---\n${pageOcr}`);
+      }
    }
+    ocrText = ocrTexts.join('\n\n');
+    if (ocrText) {
+      console.log(`[OCR] Extracted text from ${ocrTexts.length} page(s)`);
+    }
+    return ocrText;
+  })();
+
+  // Wait for Pass 1 and OCR to complete
+  const [pass1Result] = await Promise.all([pass1Promise, ocrPromise]);
+
+  // Process Pass 1 result
+  if ('error' in pass1Result) {
+    console.log(`[Pass 1] Error: ${(pass1Result as { error: unknown }).error}`);
+  } else {
+    addResult(pass1Result as ITransaction[], 'Pass 1 Visual');
+  }
+
+  // Pass 2: OCR-only (no images) - faster, different approach
+  if (ocrText) {
+    try {
+      const pass2Result = await extractFromOcr(ocrText, 'Pass 2 OCR-only');
+      const count = addResult(pass2Result, 'Pass 2 OCR-only');
+      if (count >= 2) {
+        console.log(`[Consensus] Visual and OCR extractions match!`);
+        return pass2Result;
+      }
+    } catch (err) {
+      console.log(`[Pass 2 OCR-only] Error: ${err}`);
+    }
+  }
+
+  // Continue with visual passes 3+ if no consensus yet
+  for (let pass = 3; pass <= maxPasses; pass++) {
+    try {
+      const transactions = await extractVisual(images, `Pass ${pass} Visual`);
+      const count = addResult(transactions, `Pass ${pass} Visual`);
+
+      if (count >= 2) {
+        console.log(`[Consensus] Reached after ${pass} passes`);
+        return transactions;
+      }

-    // After 2 passes, if no match yet, continue
-    if (pass >= 2) {
      console.log(`[Pass ${pass}] No consensus yet, trying again...`);
+    } catch (err) {
+      console.log(`[Pass ${pass}] Error: ${err}`);
    }
  }

@@ -181,6 +311,10 @@ async function extractWithConsensus(images: string[], maxPasses: number = 5): Pr
    }
  }

+  if (!bestHash) {
+    throw new Error('No valid results obtained');
+  }
+
  const best = results.find((r) => r.hash === bestHash)!;
  console.log(`[No consensus] Using most common result (${bestCount}/${maxPasses} passes)`);
  return best.transactions;