feat(docker-images): add vLLM-based Nanonets-OCR2-3B image, Qwen3-VL Ollama image and refactor build/docs/tests to use new runtime/layout

2026-01-19 21:05:51 +00:00
parent b58bcabc76
commit 08728ada4d
14 changed files with 1492 additions and 1126 deletions
--- a/test/test.invoices.nanonets.ts
+++ b/test/test.invoices.nanonets.ts
@@ -1,7 +1,7 @@
 /**
- * Invoice extraction using Nanonets-OCR-s + GPT-OSS 20B (sequential two-stage pipeline)
+ * Invoice extraction using Nanonets-OCR2-3B + GPT-OSS 20B (sequential two-stage pipeline)
 *
- * Stage 1: Nanonets-OCR-s converts ALL document pages to markdown (stop after completion)
+ * Stage 1: Nanonets-OCR2-3B converts ALL document pages to markdown (stop after completion)
 * Stage 2: GPT-OSS 20B extracts structured JSON from saved markdown (after Nanonets stops)
 *
 * This approach avoids GPU contention by running services sequentially.
@@ -14,7 +14,7 @@ import * as os from 'os';
 import { ensureNanonetsOcr, ensureMiniCpm, isContainerRunning } from './helpers/docker.js';

 const NANONETS_URL = 'http://localhost:8000/v1';
-const NANONETS_MODEL = 'nanonets/Nanonets-OCR-s';
+const NANONETS_MODEL = 'nanonets/Nanonets-OCR2-3B';

 const OLLAMA_URL = 'http://localhost:11434';
 const EXTRACTION_MODEL = 'gpt-oss:20b';
@@ -92,28 +92,11 @@ function estimateVisualTokens(width: number, height: number): number {
 }

 /**
- * Batch images to fit within context window
+ * Split images into single-image batches (one page per batch) for reliable processing
 */
 function batchImages(images: IImageData[]): IImageData[][] {
-  const batches: IImageData[][] = [];
-  let currentBatch: IImageData[] = [];
-  let currentTokens = 0;
-
-  for (const img of images) {
-    const imgTokens = estimateVisualTokens(img.width, img.height);
-
-    if (currentTokens + imgTokens > MAX_VISUAL_TOKENS && currentBatch.length > 0) {
-      batches.push(currentBatch);
-      currentBatch = [img];
-      currentTokens = imgTokens;
-    } else {
-      currentBatch.push(img);
-      currentTokens += imgTokens;
-    }
-  }
-  if (currentBatch.length > 0) batches.push(currentBatch);
-
-  return batches;
+  // One page per batch for reliable processing
+  return images.map(img => [img]);
 }

 /**
@@ -194,6 +177,7 @@ async function convertBatchToMarkdown(batch: IImageData[]): Promise<string> {
      max_tokens: 4096 * batch.length,  // Scale output tokens with batch size
      temperature: 0.0,
    }),
+    signal: AbortSignal.timeout(600000),  // 10 minute timeout for OCR
  });

  const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);