feat(docker-images): add vLLM-based Nanonets-OCR2-3B image, Qwen3-VL Ollama image and refactor build/docs/tests to use new runtime/layout

This commit is contained in:
2026-01-19 21:05:51 +00:00
parent b58bcabc76
commit 08728ada4d
14 changed files with 1492 additions and 1126 deletions

View File

@@ -1,7 +1,7 @@
/**
* Bank statement extraction using Nanonets-OCR2-3B + GPT-OSS 20B (sequential two-stage pipeline)
*
* Stage 1: Nanonets-OCR2-3B converts ALL document pages to markdown (stop after completion)
* Stage 2: GPT-OSS 20B extracts structured JSON from saved markdown (after Nanonets stops)
*
* This approach avoids GPU contention by running services sequentially.
@@ -14,7 +14,7 @@ import * as os from 'os';
import { ensureNanonetsOcr, ensureMiniCpm, removeContainer, isContainerRunning } from './helpers/docker.js';
// Stage 1 (OCR): OpenAI-style `/v1` endpoint and model id used for page-to-markdown conversion.
const NANONETS_URL = 'http://localhost:8000/v1';
const NANONETS_MODEL = 'nanonets/Nanonets-OCR2-3B';

// Stage 2 (extraction): local Ollama endpoint and the model that turns markdown into JSON.
// NOTE(review): presumably EXTRACTION_MODEL is served through OLLAMA_URL — confirm against the caller.
const OLLAMA_URL = 'http://localhost:11434';
const EXTRACTION_MODEL = 'gpt-oss:20b';
@@ -69,28 +69,11 @@ function estimateVisualTokens(width: number, height: number): number {
}
/**
 * Split page images into OCR batches, one page per batch.
 *
 * Every image is wrapped in its own single-element batch so that pages are
 * processed one at a time for reliability. Input order is preserved, the
 * input array is not mutated, and an empty input yields an empty result.
 *
 * @param images - Page images to split into batches.
 * @returns One single-image batch per input image, in the original order.
 */
function batchImages(images: IImageData[]): IImageData[][] {
  // One page per batch for reliable processing
  return images.map((img) => [img]);
}
/**
@@ -171,6 +154,7 @@ async function convertBatchToMarkdown(batch: IImageData[]): Promise<string> {
max_tokens: 4096 * batch.length, // Scale output tokens with batch size
temperature: 0.0,
}),
signal: AbortSignal.timeout(600000), // 10 minute timeout for OCR
});
const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);