Date: 2026-01-16 16:21:44 +00:00
parent 3c5cf578a5
commit 15ac1fcf67
13 changed files with 873 additions and 805 deletions


@@ -4,12 +4,16 @@ import * as path from 'path';
import { execSync } from 'child_process';
import * as os from 'os';
// Service URLs
const OLLAMA_URL = 'http://localhost:11434';
const MODEL = 'openbmb/minicpm-v4.5:q8_0';
const PADDLEOCR_URL = 'http://localhost:5000';
const PADDLEOCR_VL_URL = 'http://localhost:8000';
// Prompt for visual extraction (with images)
const VISUAL_EXTRACT_PROMPT = `/nothink
// Models
const MINICPM_MODEL = 'openbmb/minicpm-v4.5:q8_0';
const PADDLEOCR_VL_MODEL = 'paddleocr-vl';
// Prompt for MiniCPM-V visual extraction
const MINICPM_EXTRACT_PROMPT = `/nothink
You are a bank statement parser. Extract EVERY transaction from the table.
Read the Amount column carefully:
@@ -21,9 +25,12 @@ For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
Do not skip any rows. Return ONLY the JSON array, no explanation.`;
// Prompt for OCR-only extraction (no images)
const OCR_EXTRACT_PROMPT = `/nothink
You are a bank statement parser. Extract EVERY transaction from the OCR text below.
// Prompt for PaddleOCR-VL table extraction
const PADDLEOCR_VL_TABLE_PROMPT = `Table Recognition:`;
// Post-processing prompt to convert PaddleOCR-VL output to JSON
const PADDLEOCR_VL_CONVERT_PROMPT = `/nothink
Convert the following bank statement table data to JSON.
Read the Amount values carefully:
- "- 21,47 €" means DEBIT, output as: -21.47
@@ -32,48 +39,12 @@ Read the Amount values carefully:
For each transaction output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
Do not skip any transactions. Return ONLY the JSON array, no explanation.`;
Return ONLY the JSON array, no explanation.
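For reference, the amount convention the prompts describe maps European-formatted strings to signed JSON numbers. Below is a minimal sketch of that mapping as code, using a hypothetical parseEuroAmount helper that is not part of this file; the model performs this conversion itself, and the credit case is assumed to mirror the visible debit rule.

// Hypothetical helper (not in the diff): converts a European-formatted amount
// string such as "- 21,47 €" into the signed number the prompts ask for.
function parseEuroAmount(raw: string): number {
  // Strip the currency symbol and whitespace, e.g. "- 1.250,00 €" -> "-1.250,00"
  const cleaned = raw.replace(/€/g, '').replace(/\s+/g, '');
  // Drop thousands separators ("."), then switch the decimal comma to a dot.
  const normalized = cleaned.replace(/\./g, '').replace(',', '.');
  return Number(normalized);
}

// parseEuroAmount('- 21,47 €')    -> -21.47 (debit)
// parseEuroAmount('+ 1.250,00 €') -> 1250   (credit, assuming a "+" prefix)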
/**
* Build prompt for OCR-only extraction (no images)
*/
function buildOcrOnlyPrompt(ocrText: string): string {
// Limit OCR text to prevent context overflow
const maxOcrLength = 12000;
const truncatedOcr = ocrText.length > maxOcrLength
? ocrText.substring(0, maxOcrLength) + '\n... (truncated)'
: ocrText;
return `${OCR_EXTRACT_PROMPT}
OCR text from bank statement:
Table data:
---
${truncatedOcr}
{TABLE_DATA}
---`;
}
/**
* Extract OCR text from an image using PaddleOCR
*/
async function extractOcrText(imageBase64: string): Promise<string> {
try {
const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ image: imageBase64 }),
});
if (!response.ok) return '';
const data = await response.json();
if (data.success && data.results) {
return data.results.map((r: { text: string }) => r.text).join('\n');
}
} catch {
// PaddleOCR unavailable
}
return '';
}
interface ITransaction {
date: string;
@@ -94,7 +65,7 @@ function convertPdfToImages(pdfPath: string): string[] {
{ stdio: 'pipe' }
);
const files = fs.readdirSync(tempDir).filter((f) => f.endsWith('.png')).sort();
const files = fs.readdirSync(tempDir).filter((f: string) => f.endsWith('.png')).sort();
const images: string[] = [];
for (const file of files) {
@@ -110,12 +81,12 @@ function convertPdfToImages(pdfPath: string): string[] {
}
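The actual PDF-to-PNG conversion command sits above this hunk and is not part of the change; a plausible sketch of that elided call, assuming poppler's pdftoppm writes page-N.png files into a temp directory (the -r 150 resolution is an assumption):

import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
import { execSync } from 'child_process';

// Sketch only: the real command is unchanged and folded out of the hunk.
// Assumes pdftoppm (poppler-utils) is on PATH and emits page-1.png, page-2.png, ...
const pdfPath = '/path/to/statement.pdf'; // placeholder input
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-pages-'));
execSync(
  `pdftoppm -png -r 150 "${pdfPath}" "${path.join(tempDir, 'page')}"`,
  { stdio: 'pipe' }
);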
/**
* Visual extraction pass (with images)
* Extract using MiniCPM-V via Ollama
*/
async function extractVisual(images: string[], passLabel: string): Promise<ITransaction[]> {
async function extractWithMiniCPM(images: string[], passLabel: string): Promise<ITransaction[]> {
const payload = {
model: MODEL,
prompt: VISUAL_EXTRACT_PROMPT,
model: MINICPM_MODEL,
prompt: MINICPM_EXTRACT_PROMPT,
images,
stream: true,
options: {
@@ -124,31 +95,6 @@ async function extractVisual(images: string[], passLabel: string): Promise<ITran
},
};
return doExtraction(payload, passLabel);
}
/**
* OCR-only extraction pass (no images, just text)
*/
async function extractFromOcr(ocrText: string, passLabel: string): Promise<ITransaction[]> {
const payload = {
model: MODEL,
prompt: buildOcrOnlyPrompt(ocrText),
stream: true,
options: {
num_predict: 16384,
temperature: 0.1,
},
};
return doExtraction(payload, passLabel);
}
/**
* Common extraction logic
*/
async function doExtraction(payload: object, passLabel: string): Promise<ITransaction[]> {
const response = await fetch(`${OLLAMA_URL}/api/generate`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
@@ -168,7 +114,7 @@ async function doExtraction(payload: object, passLabel: string): Promise<ITransa
let fullText = '';
let lineBuffer = '';
console.log(`[${passLabel}] Extracting...`);
console.log(`[${passLabel}] Extracting with MiniCPM-V...`);
while (true) {
const { done, value } = await reader.read();
@@ -184,7 +130,6 @@ async function doExtraction(payload: object, passLabel: string): Promise<ITransa
fullText += json.response;
lineBuffer += json.response;
// Print complete lines
if (lineBuffer.includes('\n')) {
const parts = lineBuffer.split('\n');
for (let i = 0; i < parts.length - 1; i++) {
@@ -214,6 +159,140 @@ async function doExtraction(payload: object, passLabel: string): Promise<ITransa
return JSON.parse(fullText.substring(startIdx, endIdx));
}
/**
* Extract table using PaddleOCR-VL via OpenAI-compatible API
*/
async function extractTableWithPaddleOCRVL(imageBase64: string): Promise<string> {
const payload = {
model: PADDLEOCR_VL_MODEL,
messages: [
{
role: 'user',
content: [
{
type: 'image_url',
image_url: { url: `data:image/png;base64,${imageBase64}` },
},
{
type: 'text',
text: PADDLEOCR_VL_TABLE_PROMPT,
},
],
},
],
temperature: 0.0,
max_tokens: 8192,
};
const response = await fetch(`${PADDLEOCR_VL_URL}/v1/chat/completions`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(payload),
});
if (!response.ok) {
const text = await response.text();
throw new Error(`PaddleOCR-VL API error: ${response.status} - ${text}`);
}
const data = await response.json();
return data.choices?.[0]?.message?.content || '';
}
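Because the endpoint is OpenAI-compatible, the same request could also be issued through the official openai SDK instead of raw fetch; a sketch under the assumption that the local PaddleOCR-VL server accepts any API key value:

import OpenAI from 'openai';

// Sketch only: the same payload as extractTableWithPaddleOCRVL, sent via the SDK.
const client = new OpenAI({ baseURL: 'http://localhost:8000/v1', apiKey: 'not-needed' });

async function recognizeTable(imageBase64: string): Promise<string> {
  const completion = await client.chat.completions.create({
    model: 'paddleocr-vl',
    messages: [
      {
        role: 'user',
        content: [
          { type: 'image_url', image_url: { url: `data:image/png;base64,${imageBase64}` } },
          { type: 'text', text: 'Table Recognition:' },
        ],
      },
    ],
    temperature: 0.0,
    max_tokens: 8192,
  });
  return completion.choices[0]?.message?.content ?? '';
}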
/**
* Convert PaddleOCR-VL table output to transactions using MiniCPM-V
*/
async function convertTableToTransactions(
tableData: string,
passLabel: string
): Promise<ITransaction[]> {
const prompt = PADDLEOCR_VL_CONVERT_PROMPT.replace('{TABLE_DATA}', tableData);
const payload = {
model: MINICPM_MODEL,
prompt,
stream: true,
options: {
num_predict: 16384,
temperature: 0.1,
},
};
const response = await fetch(`${OLLAMA_URL}/api/generate`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(payload),
});
if (!response.ok) {
throw new Error(`Ollama API error: ${response.status}`);
}
const reader = response.body?.getReader();
if (!reader) {
throw new Error('No response body');
}
const decoder = new TextDecoder();
let fullText = '';
console.log(`[${passLabel}] Converting table data to JSON...`);
while (true) {
const { done, value } = await reader.read();
if (done) break;
const chunk = decoder.decode(value, { stream: true });
const lines = chunk.split('\n').filter((l) => l.trim());
for (const line of lines) {
try {
const json = JSON.parse(line);
if (json.response) {
fullText += json.response;
}
} catch {
// Skip invalid JSON lines
}
}
}
const startIdx = fullText.indexOf('[');
const endIdx = fullText.lastIndexOf(']') + 1;
if (startIdx < 0 || endIdx <= startIdx) {
throw new Error('No JSON array found in response');
}
return JSON.parse(fullText.substring(startIdx, endIdx));
}
/**
* Extract using PaddleOCR-VL (table recognition) + conversion
*/
async function extractWithPaddleOCRVL(
images: string[],
passLabel: string
): Promise<ITransaction[]> {
console.log(`[${passLabel}] Extracting tables with PaddleOCR-VL...`);
// Extract table data from each page
const tableDataParts: string[] = [];
for (let i = 0; i < images.length; i++) {
console.log(`[${passLabel}] Processing page ${i + 1}/${images.length}...`);
const tableData = await extractTableWithPaddleOCRVL(images[i]);
if (tableData.trim()) {
tableDataParts.push(`--- Page ${i + 1} ---\n${tableData}`);
}
}
const combinedTableData = tableDataParts.join('\n\n');
console.log(`[${passLabel}] Got ${combinedTableData.length} chars of table data`);
// Convert to transactions
return convertTableToTransactions(combinedTableData, passLabel);
}
/**
* Create a hash of transactions for comparison
*/
@@ -225,10 +304,31 @@ function hashTransactions(transactions: ITransaction[]): string {
}
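The body of hashTransactions is unchanged and therefore folded out of the hunk; a minimal sketch of one way such a comparison key can be built, assuming consensus only compares the ordered (date, counterparty, amount) tuples. The helper name and normalization below are illustrative, not the file's actual implementation.

interface ITransaction {
  date: string;
  counterparty: string; // field names follow the prompt output format
  amount: number;
}

// Sketch only: builds a deterministic key so two extraction passes can be
// compared cheaply; the real hashTransactions may differ.
function hashTransactionsSketch(transactions: ITransaction[]): string {
  return transactions
    .map((t) => `${t.date}|${t.counterparty.trim().toUpperCase()}|${t.amount.toFixed(2)}`)
    .join(';');
}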
/**
* Extract with majority voting - run until 2 passes match
* Strategy: Pass 1 = Visual (images), Pass 2 = OCR-only (text), Pass 3+ = Visual
* Check if PaddleOCR-VL service is available
*/
async function extractWithConsensus(images: string[], maxPasses: number = 5): Promise<ITransaction[]> {
async function isPaddleOCRVLAvailable(): Promise<boolean> {
try {
const response = await fetch(`${PADDLEOCR_VL_URL}/health`, {
method: 'GET',
signal: AbortSignal.timeout(5000),
});
return response.ok;
} catch {
return false;
}
}
/**
* Extract with dual-VLM consensus
* Strategy:
* Pass 1 = MiniCPM-V visual extraction
* Pass 2 = PaddleOCR-VL table recognition (if available)
* Pass 3+ = MiniCPM-V visual (fallback)
*/
async function extractWithConsensus(
images: string[],
maxPasses: number = 5
): Promise<ITransaction[]> {
const results: Array<{ transactions: ITransaction[]; hash: string }> = [];
const hashCounts: Map<string, number> = new Map();
@@ -236,59 +336,48 @@ async function extractWithConsensus(images: string[], maxPasses: number = 5): Pr
const hash = hashTransactions(transactions);
results.push({ transactions, hash });
hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1);
console.log(`[${passLabel}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)`);
console.log(
`[${passLabel}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)`
);
return hashCounts.get(hash)!;
};
// Run Pass 1 (Visual) in parallel with OCR extraction
let ocrText = '';
const pass1Promise = extractVisual(images, 'Pass 1 Visual').catch((err) => ({ error: err }));
// Extract OCR from all pages
const ocrPromise = (async () => {
const ocrTexts: string[] = [];
for (let i = 0; i < images.length; i++) {
const pageOcr = await extractOcrText(images[i]);
if (pageOcr) {
ocrTexts.push(`--- Page ${i + 1} ---\n${pageOcr}`);
}
}
ocrText = ocrTexts.join('\n\n');
if (ocrText) {
console.log(`[OCR] Extracted text from ${ocrTexts.length} page(s)`);
}
return ocrText;
})();
// Wait for Pass 1 and OCR to complete
const [pass1Result] = await Promise.all([pass1Promise, ocrPromise]);
// Process Pass 1 result
if ('error' in pass1Result) {
console.log(`[Pass 1] Error: ${(pass1Result as { error: unknown }).error}`);
// Check if PaddleOCR-VL is available
const paddleOCRVLAvailable = await isPaddleOCRVLAvailable();
if (paddleOCRVLAvailable) {
console.log('[Setup] PaddleOCR-VL service available - using dual-VLM consensus');
} else {
addResult(pass1Result as ITransaction[], 'Pass 1 Visual');
console.log('[Setup] PaddleOCR-VL not available - using MiniCPM-V only');
}
// Pass 2: OCR-only (no images) - faster, different approach
if (ocrText) {
// Pass 1: MiniCPM-V visual extraction
try {
const pass1Result = await extractWithMiniCPM(images, 'Pass 1 MiniCPM-V');
addResult(pass1Result, 'Pass 1 MiniCPM-V');
} catch (err) {
console.log(`[Pass 1] Error: ${err}`);
}
// Pass 2: PaddleOCR-VL table recognition (if available)
if (paddleOCRVLAvailable) {
try {
const pass2Result = await extractFromOcr(ocrText, 'Pass 2 OCR-only');
const count = addResult(pass2Result, 'Pass 2 OCR-only');
const pass2Result = await extractWithPaddleOCRVL(images, 'Pass 2 PaddleOCR-VL');
const count = addResult(pass2Result, 'Pass 2 PaddleOCR-VL');
if (count >= 2) {
console.log(`[Consensus] Visual and OCR extractions match!`);
console.log('[Consensus] MiniCPM-V and PaddleOCR-VL extractions match!');
return pass2Result;
}
} catch (err) {
console.log(`[Pass 2 OCR-only] Error: ${err}`);
console.log(`[Pass 2 PaddleOCR-VL] Error: ${err}`);
}
}
// Continue with visual passes 3+ if no consensus yet
for (let pass = 3; pass <= maxPasses; pass++) {
// Pass 3+: Continue with MiniCPM-V visual passes
const startPass = paddleOCRVLAvailable ? 3 : 2;
for (let pass = startPass; pass <= maxPasses; pass++) {
try {
const transactions = await extractVisual(images, `Pass ${pass} Visual`);
const count = addResult(transactions, `Pass ${pass} Visual`);
const transactions = await extractWithMiniCPM(images, `Pass ${pass} MiniCPM-V`);
const count = addResult(transactions, `Pass ${pass} MiniCPM-V`);
if (count >= 2) {
console.log(`[Consensus] Reached after ${pass} passes`);
@@ -368,7 +457,7 @@ function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: strin
}
const files = fs.readdirSync(testDir);
const pdfFiles = files.filter((f) => f.endsWith('.pdf'));
const pdfFiles = files.filter((f: string) => f.endsWith('.pdf'));
const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];
for (const pdf of pdfFiles) {
@@ -402,6 +491,13 @@ tap.test('should have MiniCPM-V 4.5 model loaded', async () => {
expect(modelNames.some((name: string) => name.includes('minicpm-v4.5'))).toBeTrue();
});
tap.test('should check PaddleOCR-VL availability', async () => {
const available = await isPaddleOCRVLAvailable();
console.log(`PaddleOCR-VL available: ${available}`);
// This test passes regardless - PaddleOCR-VL is optional
expect(true).toBeTrue();
});
// Dynamic test for each PDF/JSON pair
const testCases = findTestCases();
for (const testCase of testCases) {
@@ -416,7 +512,7 @@ for (const testCase of testCases) {
const images = convertPdfToImages(testCase.pdfPath);
console.log(`Converted: ${images.length} pages\n`);
// Extract with consensus voting
// Extract with dual-VLM consensus
const extracted = await extractWithConsensus(images);
console.log(`\nFinal: ${extracted.length} transactions`);
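The assertions that follow are below the visible hunk; a rough sketch of how the extracted transactions might be checked against the expected JSON fixture, assuming the fixture beside each PDF holds the same ITransaction[] shape (everything here is assumed except the expect(...).toBeTrue() style taken from the visible test code):

// Sketch only: the real assertions sit outside this hunk.
const expected: ITransaction[] = JSON.parse(
  fs.readFileSync(testCase.jsonPath, 'utf8')
);
expect(extracted.length === expected.length).toBeTrue();
for (let i = 0; i < expected.length; i++) {
  expect(extracted[i].date === expected[i].date).toBeTrue();
  // Allow tiny float drift when comparing amounts
  expect(Math.abs(extracted[i].amount - expected[i].amount) < 0.005).toBeTrue();
}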