feat(vision): process pages separately and make Qwen3-VL vision extraction more robust

Add per-page parsing and safer JSON handling, reduce token usage, and move invoice extraction to a multi-query approach.

2026-01-18 04:50:57 +00:00
parent 63d72a52c9
commit e76768da55
3 changed files with 96 additions and 68 deletions
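For orientation, a minimal sketch of how the reworked extraction is intended to be driven. The call site is not part of this diff: processStatement is an illustrative name, while convertPdfToImages, extractTransactions, and ITransaction are taken from the file changed below, and the base64 assumption simply follows from what the Ollama images field expects.

// Hypothetical driver (not in this commit), assuming the functions shown in the diff below.
// convertPdfToImages() is assumed to return one base64-encoded image per PDF page, as the
// Ollama `images` field expects; extractTransactions() then queries Qwen3-VL once per page
// and concatenates the per-page results.
async function processStatement(pdfPath: string): Promise<ITransaction[]> {
  const pages = convertPdfToImages(pdfPath);   // one image string per statement page
  return extractTransactions(pages);           // sequential per-page extraction
}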


@@ -53,23 +53,14 @@ function convertPdfToImages(pdfPath: string): string[] {
 }
 /**
- * Extract transactions using Qwen3-VL vision
+ * Extract transactions from a single page
+ * Processes one page at a time to minimize thinking tokens
  */
-async function extractTransactions(images: string[]): Promise<ITransaction[]> {
-  console.log(` [Vision] Processing ${images.length} page(s) with Qwen3-VL`);
+async function extractTransactionsFromPage(image: string, pageNum: number): Promise<ITransaction[]> {
   const prompt = `/no_think
-Extract ALL transactions from this bank statement.
-Amount format:
-- "- 21,47 €" = DEBIT = -21.47
-- "+ 1.000,00 €" = CREDIT = 1000.00
-- European format: comma is decimal separator
-For each transaction: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
-Return ONLY a JSON array, no explanation:
-[{"date":"...","counterparty":"...","amount":0},...]`;
+Extract transactions from this bank statement page.
+Amount: "- 21,47 €" = -21.47, "+ 1.000,00 €" = 1000.00 (European format)
+Return JSON array only: [{"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47},...]`;
   const response = await fetch(`${OLLAMA_URL}/api/chat`, {
     method: 'POST',
@@ -79,26 +70,28 @@ Return ONLY a JSON array, no explanation:
       messages: [{
         role: 'user',
         content: prompt,
-        images: images,
+        images: [image],
       }],
       stream: false,
       think: false,
       options: {
-        num_predict: 8000,
+        num_predict: 4000,
         temperature: 0.1,
       },
     }),
   });
   if (!response.ok) {
-    const err = await response.text();
-    throw new Error(`Ollama API error: ${response.status} - ${err}`);
+    throw new Error(`Ollama API error: ${response.status}`);
   }
   const data = await response.json();
   let content = data.message?.content || '';
-  console.log(` [Vision] Got ${content.length} chars`);
+  if (!content) {
+    console.log(` [Page ${pageNum}] Empty response`);
+    return [];
+  }
   // Parse JSON array
   if (content.startsWith('```json')) content = content.slice(7);
@@ -110,10 +103,37 @@ Return ONLY a JSON array, no explanation:
   const endIdx = content.lastIndexOf(']') + 1;
   if (startIdx < 0 || endIdx <= startIdx) {
-    throw new Error(`No JSON array found: ${content.substring(0, 300)}`);
+    console.log(` [Page ${pageNum}] No JSON array found`);
+    return [];
   }
-  return JSON.parse(content.substring(startIdx, endIdx));
+  try {
+    const transactions = JSON.parse(content.substring(startIdx, endIdx));
+    console.log(` [Page ${pageNum}] Found ${transactions.length} transactions`);
+    return transactions;
+  } catch {
+    console.log(` [Page ${pageNum}] JSON parse error`);
+    return [];
+  }
 }
+/**
+ * Extract transactions using Qwen3-VL vision
+ * Processes each page separately to avoid thinking token exhaustion
+ */
+async function extractTransactions(images: string[]): Promise<ITransaction[]> {
+  console.log(` [Vision] Processing ${images.length} page(s) with Qwen3-VL`);
+  const allTransactions: ITransaction[] = [];
+  // Process pages sequentially to avoid overwhelming the model
+  for (let i = 0; i < images.length; i++) {
+    const pageTransactions = await extractTransactionsFromPage(images[i], i + 1);
+    allTransactions.push(...pageTransactions);
+  }
+  console.log(` [Vision] Total: ${allTransactions.length} transactions`);
+  return allTransactions;
+}
 /**