From 09ea7440e8de209563be03c54c7b487aa4875417 Mon Sep 17 00:00:00 2001 From: Juergen Kunz Date: Sun, 18 Jan 2026 15:54:16 +0000 Subject: [PATCH] update --- Dockerfile_nanonets_ocr | 33 ++ readme.hints.md | 89 ++++ test/helpers/docker.ts | 25 ++ test/test.bankstatements.nanonets.ts | 530 ++++++++++++++++++++++++ test/test.invoices.nanonets.ts | 584 +++++++++++++++++++++++++++ 5 files changed, 1261 insertions(+) create mode 100644 Dockerfile_nanonets_ocr create mode 100644 test/test.bankstatements.nanonets.ts create mode 100644 test/test.invoices.nanonets.ts diff --git a/Dockerfile_nanonets_ocr b/Dockerfile_nanonets_ocr new file mode 100644 index 0000000..01c378e --- /dev/null +++ b/Dockerfile_nanonets_ocr @@ -0,0 +1,33 @@ +# Nanonets-OCR-s Vision Language Model +# Based on Qwen2.5-VL-3B, fine-tuned for document OCR +# ~8-10GB VRAM, outputs structured markdown with semantic tags +# +# Build: docker build -f Dockerfile_nanonets_ocr -t nanonets-ocr . +# Run: docker run --gpus all -p 8000:8000 -v ht-huggingface-cache:/root/.cache/huggingface nanonets-ocr + +FROM vllm/vllm-openai:latest + +LABEL maintainer="Task Venture Capital GmbH " +LABEL description="Nanonets-OCR-s - Document OCR optimized Vision Language Model" +LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai" + +# Environment configuration +ENV MODEL_NAME="nanonets/Nanonets-OCR-s" +ENV HOST="0.0.0.0" +ENV PORT="8000" +ENV MAX_MODEL_LEN="8192" +ENV GPU_MEMORY_UTILIZATION="0.9" + +# Expose OpenAI-compatible API port +EXPOSE 8000 + +# Health check - vLLM exposes /health endpoint +HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=5 \ + CMD curl -f http://localhost:8000/health || exit 1 + +# Start vLLM server with Nanonets-OCR-s model +CMD ["--model", "nanonets/Nanonets-OCR-s", \ + "--trust-remote-code", \ + "--max-model-len", "8192", \ + "--host", "0.0.0.0", \ + "--port", "8000"] diff --git a/readme.hints.md b/readme.hints.md index 0ae6085..f673fb9 100644 --- 
a/readme.hints.md +++ b/readme.hints.md @@ -244,8 +244,97 @@ The bank statement extraction uses a dual-VLM consensus approach: --- +## Nanonets-OCR-s + +### Overview + +Nanonets-OCR-s is a Qwen2.5-VL-3B model fine-tuned specifically for document OCR tasks. It outputs structured markdown with semantic tags. + +**Key features:** +- Based on Qwen2.5-VL-3B (~4B parameters) +- Fine-tuned for document OCR +- Outputs markdown with semantic HTML tags +- ~8-10GB VRAM (fits comfortably in 16GB) + +### Docker Images + +| Tag | Description | +|-----|-------------| +| `nanonets-ocr` | GPU variant using vLLM (OpenAI-compatible API) | + +### API Endpoints (OpenAI-compatible via vLLM) + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/health` | GET | Health check | +| `/v1/models` | GET | List available models | +| `/v1/chat/completions` | POST | OpenAI-compatible chat completions | + +### Request/Response Format + +**POST /v1/chat/completions (OpenAI-compatible)** +```json +{ + "model": "nanonets/Nanonets-OCR-s", + "messages": [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}, + {"type": "text", "text": "Extract the text from the above document..."} + ] + } + ], + "temperature": 0.0, + "max_tokens": 4096 +} +``` + +### Nanonets OCR Prompt + +The model is designed to work with a specific prompt format: +``` +Extract the text from the above document as if you were reading it naturally. +Return the tables in html format. +Return the equations in LaTeX representation. +If there is an image in the document and image caption is not present, add a small description inside the <img></img> tag. +Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. +Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number>. +``` + +### Performance + +- **GPU (vLLM)**: ~3-8 seconds per page +- **VRAM usage**: ~8-10GB + +### Two-Stage Pipeline (Nanonets + Qwen3) + +The Nanonets tests use a two-stage pipeline: +1. 
**Stage 1**: Nanonets-OCR-s converts images to markdown (via vLLM on port 8000) +2. **Stage 2**: Qwen3 8B extracts structured JSON from markdown (via Ollama on port 11434) + +**GPU Limitation**: Both vLLM and Ollama require significant GPU memory. On a single-GPU system: +- Running both simultaneously causes memory contention +- For single GPU: Run services sequentially (stop Nanonets before Qwen3) +- For multi-GPU: Assign each service to a different GPU + +**Sequential Execution**: +```bash +# Step 1: Run Nanonets OCR (converts to markdown) +docker start nanonets-test +# ... perform OCR ... +docker stop nanonets-test + +# Step 2: Run Qwen3 extraction (from markdown) via the Ollama container (minicpm-test) +docker start minicpm-test +# ... extract JSON ... +``` + +--- + ## Related Resources - [Ollama Documentation](https://ollama.ai/docs) - [MiniCPM-V GitHub](https://github.com/OpenBMB/MiniCPM-V) - [Ollama API Reference](https://github.com/ollama/ollama/blob/main/docs/api.md) +- [Nanonets-OCR-s on HuggingFace](https://huggingface.co/nanonets/Nanonets-OCR-s) diff --git a/test/helpers/docker.ts b/test/helpers/docker.ts index 9fc7703..7c8f77c 100644 --- a/test/helpers/docker.ts +++ b/test/helpers/docker.ts @@ -7,6 +7,7 @@ const PROJECT_CONTAINERS = [ 'paddleocr-vl-cpu-test', 'paddleocr-vl-full-test', 'minicpm-test', + 'nanonets-test', ]; // Image configurations @@ -74,6 +75,19 @@ export const IMAGES = { healthEndpoint: 'http://localhost:8000/health', healthTimeout: 600000, // 10 minutes for model loading (vLLM + PP-DocLayoutV2) } as IImageConfig, + + // Nanonets-OCR-s - Document OCR optimized VLM (Qwen2.5-VL-3B fine-tuned) + nanonetsOcr: { + name: 'nanonets-ocr', + dockerfile: 'Dockerfile_nanonets_ocr', + buildContext: '.', + containerName: 'nanonets-test', + ports: ['8000:8000'], + volumes: ['ht-huggingface-cache:/root/.cache/huggingface'], + gpus: true, + healthEndpoint: 'http://localhost:8000/health', + healthTimeout: 300000, // 5 minutes for model loading + } as IImageConfig, }; /** @@ -383,3 +397,14 @@ 
export async function ensureQwen3Vl(): Promise { // Then ensure Qwen3-VL 8B is pulled return ensureOllamaModel('qwen3-vl:8b'); } + +/** + * Ensure Nanonets-OCR-s service is running (via vLLM) + * Document OCR optimized VLM based on Qwen2.5-VL-3B + */ +export async function ensureNanonetsOcr(): Promise { + if (!isGpuAvailable()) { + console.log('[Docker] WARNING: Nanonets-OCR-s requires GPU, but none detected'); + } + return ensureService(IMAGES.nanonetsOcr); +} diff --git a/test/test.bankstatements.nanonets.ts b/test/test.bankstatements.nanonets.ts new file mode 100644 index 0000000..287b891 --- /dev/null +++ b/test/test.bankstatements.nanonets.ts @@ -0,0 +1,530 @@ +/** + * Bank statement extraction using Nanonets-OCR-s + Qwen3 (two-stage pipeline) + * + * Stage 1: Nanonets-OCR-s converts document pages to markdown (its strength) + * Stage 2: Qwen3 extracts structured JSON from the combined markdown + * + * This leverages each model's strengths: + * - Nanonets: Document OCR with semantic tags + * - Qwen3: Text understanding and JSON extraction + */ +import { tap, expect } from '@git.zone/tstest/tapbundle'; +import * as fs from 'fs'; +import * as path from 'path'; +import { execSync } from 'child_process'; +import * as os from 'os'; +import { ensureNanonetsOcr, ensureMiniCpm } from './helpers/docker.js'; + +const NANONETS_URL = 'http://localhost:8000/v1'; +const NANONETS_MODEL = 'nanonets/Nanonets-OCR-s'; + +const OLLAMA_URL = 'http://localhost:11434'; +const QWEN_MODEL = 'qwen3:8b'; + +interface ITransaction { + date: string; + counterparty: string; + amount: number; +} + +// Nanonets-specific prompt for document OCR to markdown +const NANONETS_OCR_PROMPT = `Extract the text from the above document as if you were reading it naturally. +Return the tables in html format. +Return the equations in LaTeX representation. +If there is an image in the document and image caption is not present, add a small description inside tag. +Watermarks should be wrapped in brackets. 
Ex: OFFICIAL COPY. +Page numbers should be wrapped in brackets. Ex: 14.`; + +// JSON extraction prompt for Qwen3 +const JSON_EXTRACTION_PROMPT = `You are a financial data extractor. Below is a bank statement converted to text/markdown. Extract ALL transactions from it as a JSON array. + +IMPORTANT RULES: +1. Each transaction has: date, description/counterparty, and an amount +2. Amount is NEGATIVE for money going OUT (debits, payments, withdrawals) +3. Amount is POSITIVE for money coming IN (credits, deposits, refunds) +4. Date format: YYYY-MM-DD +5. Do NOT include: opening balance, closing balance, subtotals, headers, or summary rows +6. Only include actual transactions with a specific date and amount + +Return ONLY this JSON format, no explanation: +[ + {"date": "2021-06-01", "counterparty": "COMPANY NAME", "amount": -25.99}, + {"date": "2021-06-02", "counterparty": "DEPOSIT FROM", "amount": 100.00} +] + +BANK STATEMENT TEXT: +`; + +/** + * Convert PDF to PNG images using ImageMagick + */ +function convertPdfToImages(pdfPath: string): string[] { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-')); + const outputPattern = path.join(tempDir, 'page-%d.png'); + + try { + // Use 150 DPI to keep images within model's context length + execSync( + `convert -density 150 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`, + { stdio: 'pipe' } + ); + + const files = fs.readdirSync(tempDir).filter((f: string) => f.endsWith('.png')).sort(); + const images: string[] = []; + + for (const file of files) { + const imagePath = path.join(tempDir, file); + const imageData = fs.readFileSync(imagePath); + images.push(imageData.toString('base64')); + } + + return images; + } finally { + fs.rmSync(tempDir, { recursive: true, force: true }); + } +} + +/** + * Stage 1: Convert a single page to markdown using Nanonets-OCR-s + */ +async function convertPageToMarkdown(image: string, pageNum: number): Promise { + console.log(` [Nanonets] 
Converting page ${pageNum} to markdown...`); + const startTime = Date.now(); + + const response = await fetch(`${NANONETS_URL}/chat/completions`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'Authorization': 'Bearer dummy', + }, + body: JSON.stringify({ + model: NANONETS_MODEL, + messages: [{ + role: 'user', + content: [ + { type: 'image_url', image_url: { url: `data:image/png;base64,${image}` }}, + { type: 'text', text: NANONETS_OCR_PROMPT }, + ], + }], + max_tokens: 4096, + temperature: 0.0, + }), + }); + + const elapsed = ((Date.now() - startTime) / 1000).toFixed(1); + + if (!response.ok) { + const errorText = await response.text(); + console.log(` [Nanonets] ERROR page ${pageNum}: ${response.status} - ${errorText}`); + throw new Error(`Nanonets API error: ${response.status}`); + } + + const data = await response.json(); + const content = (data.choices?.[0]?.message?.content || '').trim(); + console.log(` [Nanonets] Page ${pageNum} converted (${elapsed}s, ${content.length} chars)`); + return content; +} + +/** + * Stage 1: Convert all pages to markdown using Nanonets-OCR-s + */ +async function convertDocumentToMarkdown(images: string[]): Promise { + console.log(` [Stage 1] Converting ${images.length} page(s) to markdown with Nanonets-OCR-s...`); + + const markdownPages: string[] = []; + + for (let i = 0; i < images.length; i++) { + const markdown = await convertPageToMarkdown(images[i], i + 1); + markdownPages.push(`--- PAGE ${i + 1} ---\n${markdown}`); + } + + const fullMarkdown = markdownPages.join('\n\n'); + console.log(` [Stage 1] Complete: ${fullMarkdown.length} chars total`); + return fullMarkdown; +} + +/** + * Ensure Qwen3 model is available + */ +async function ensureQwen3(): Promise { + try { + const response = await fetch(`${OLLAMA_URL}/api/tags`); + if (response.ok) { + const data = await response.json(); + const models = data.models || []; + if (models.some((m: { name: string }) => m.name === QWEN_MODEL)) { + 
console.log(` [Ollama] Model available: ${QWEN_MODEL}`); + return true; + } + } + } catch { + return false; + } + + console.log(` [Ollama] Pulling ${QWEN_MODEL}...`); + const pullResponse = await fetch(`${OLLAMA_URL}/api/pull`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ name: QWEN_MODEL, stream: false }), + }); + + return pullResponse.ok; +} + +/** + * Stage 2: Extract transactions from markdown using Qwen3 + */ +async function extractTransactionsFromMarkdown(markdown: string, queryId: string): Promise { + console.log(` [${queryId}] Sending markdown to ${QWEN_MODEL}...`); + const startTime = Date.now(); + + const response = await fetch(`${OLLAMA_URL}/api/chat`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + model: QWEN_MODEL, + messages: [{ + role: 'user', + content: JSON_EXTRACTION_PROMPT + markdown, + }], + stream: false, + options: { + num_predict: 8000, + temperature: 0.1, + }, + }), + }); + + const elapsed = ((Date.now() - startTime) / 1000).toFixed(1); + + if (!response.ok) { + console.log(` [${queryId}] ERROR: ${response.status} (${elapsed}s)`); + throw new Error(`Ollama API error: ${response.status}`); + } + + const data = await response.json(); + const content = (data.message?.content || '').trim(); + console.log(` [${queryId}] Response received (${elapsed}s, ${content.length} chars)`); + + return parseJsonResponse(content, queryId); +} + +/** + * Sanitize JSON string - fix common issues from LLM output + */ +function sanitizeJson(jsonStr: string): string { + let s = jsonStr; + + // Fix +number (e.g., +93.80 -> 93.80) - JSON doesn't allow + prefix + s = s.replace(/"amount"\s*:\s*\+/g, '"amount": '); + s = s.replace(/:\s*\+(\d)/g, ': $1'); + + // Fix European number format with thousands separator + s = s.replace(/"amount"\s*:\s*(-?)(\d{1,3})\.(\d{3})\.(\d{2})\b/g, '"amount": $1$2$3.$4'); + + // Fix trailing commas before ] or } + s = 
s.replace(/,\s*([}\]])/g, '$1'); + + // Fix unescaped newlines/tabs inside strings + s = s.replace(/"([^"\\]*)\n([^"]*)"/g, '"$1 $2"'); + s = s.replace(/"([^"\\]*)\t([^"]*)"/g, '"$1 $2"'); + + // Remove control characters + s = s.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g, ' '); + + return s; +} + +/** + * Parse amount from various formats + */ +function parseAmount(value: unknown): number { + if (typeof value === 'number') return value; + if (typeof value !== 'string') return 0; + + let s = value.replace(/[€$£\s]/g, '').replace('−', '-').replace('–', '-'); + // European format: comma is decimal + if (s.includes(',') && s.indexOf(',') > s.lastIndexOf('.')) { + s = s.replace(/\./g, '').replace(',', '.'); + } else { + s = s.replace(/,/g, ''); + } + return parseFloat(s) || 0; +} + +/** + * Parse JSON response into transactions + */ +function parseJsonResponse(response: string, queryId: string): ITransaction[] { + console.log(` [${queryId}] Parsing response...`); + + // Remove thinking tags if present (Qwen3 may include ...) + let cleanResponse = response.replace(/[\s\S]*?<\/think>/g, '').trim(); + + // Try to find JSON in markdown code block + const codeBlockMatch = cleanResponse.match(/```(?:json)?\s*([\s\S]*?)```/); + let jsonStr = codeBlockMatch ? 
codeBlockMatch[1].trim() : cleanResponse; + + // Sanitize JSON + jsonStr = sanitizeJson(jsonStr); + + try { + const parsed = JSON.parse(jsonStr); + if (Array.isArray(parsed)) { + const txs = parsed.map(tx => ({ + date: String(tx.date || ''), + counterparty: String(tx.counterparty || tx.description || ''), + amount: parseAmount(tx.amount), + })); + console.log(` [${queryId}] Parsed ${txs.length} transactions`); + return txs; + } + } catch (e) { + console.log(` [${queryId}] Direct parse failed: ${(e as Error).message}`); + + // Try to find JSON array pattern + const arrayMatch = jsonStr.match(/\[[\s\S]*\]/); + if (arrayMatch) { + try { + const parsed = JSON.parse(sanitizeJson(arrayMatch[0])); + if (Array.isArray(parsed)) { + const txs = parsed.map(tx => ({ + date: String(tx.date || ''), + counterparty: String(tx.counterparty || tx.description || ''), + amount: parseAmount(tx.amount), + })); + console.log(` [${queryId}] Parsed ${txs.length} transactions (array match)`); + return txs; + } + } catch (e2) { + console.log(` [${queryId}] Array parse failed: ${(e2 as Error).message}`); + } + } + } + + console.log(` [${queryId}] PARSE FAILED - returning empty array`); + return []; +} + +/** + * Compare two transaction arrays for consensus + */ +function transactionArraysMatch(a: ITransaction[], b: ITransaction[]): boolean { + if (a.length !== b.length) return false; + + for (let i = 0; i < a.length; i++) { + const dateMatch = a[i].date === b[i].date; + const amountMatch = Math.abs(a[i].amount - b[i].amount) < 0.01; + if (!dateMatch || !amountMatch) return false; + } + + return true; +} + +/** + * Stage 2: Extract transactions using Qwen3 with consensus + */ +async function extractWithConsensus(markdown: string): Promise { + const MAX_ATTEMPTS = 3; + console.log(` [Stage 2] Extracting transactions with ${QWEN_MODEL} (consensus)...`); + + for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) { + console.log(`\n [Stage 2] --- Attempt ${attempt}/${MAX_ATTEMPTS} ---`); + + // 
Extract twice in parallel + const [txs1, txs2] = await Promise.all([ + extractTransactionsFromMarkdown(markdown, `A${attempt}Q1`), + extractTransactionsFromMarkdown(markdown, `A${attempt}Q2`), + ]); + + console.log(` [Stage 2] Results: Q1=${txs1.length} txs, Q2=${txs2.length} txs`); + + if (txs1.length > 0 && transactionArraysMatch(txs1, txs2)) { + console.log(` [Stage 2] CONSENSUS REACHED: ${txs1.length} transactions`); + return txs1; + } + + console.log(` [Stage 2] NO CONSENSUS`); + } + + // Fallback: use last response + console.log(`\n [Stage 2] === FALLBACK ===`); + const fallback = await extractTransactionsFromMarkdown(markdown, 'FALLBACK'); + console.log(` [Stage 2] ~ FALLBACK RESULT: ${fallback.length} transactions`); + return fallback; +} + +/** + * Full pipeline: PDF -> Images -> Markdown -> JSON + */ +async function extractTransactions(images: string[]): Promise { + // Stage 1: Convert to markdown + const markdown = await convertDocumentToMarkdown(images); + + // Stage 2: Extract transactions with consensus + const transactions = await extractWithConsensus(markdown); + + // Log all transactions + console.log(`\n [Result] Extracted ${transactions.length} transactions:`); + for (let i = 0; i < transactions.length; i++) { + const tx = transactions[i]; + console.log(` ${(i + 1).toString().padStart(2)}. ${tx.date} | ${tx.counterparty.substring(0, 30).padEnd(30)} | ${tx.amount >= 0 ? 
'+' : ''}${tx.amount.toFixed(2)}`); + } + + return transactions; +} + +/** + * Compare extracted transactions against expected + */ +function compareTransactions( + extracted: ITransaction[], + expected: ITransaction[] +): { matches: number; total: number; errors: string[]; variations: string[] } { + const errors: string[] = []; + const variations: string[] = []; + let matches = 0; + + for (let i = 0; i < expected.length; i++) { + const exp = expected[i]; + const ext = extracted[i]; + + if (!ext) { + errors.push(`Missing transaction ${i}: ${exp.date} ${exp.counterparty}`); + continue; + } + + const dateMatch = ext.date === exp.date; + const amountMatch = Math.abs(ext.amount - exp.amount) < 0.01; + + if (dateMatch && amountMatch) { + matches++; + if (ext.counterparty !== exp.counterparty) { + variations.push(`[${i}] "${exp.counterparty}" -> "${ext.counterparty}"`); + } + } else { + errors.push(`Mismatch at ${i}: expected ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`); + } + } + + if (extracted.length > expected.length) { + errors.push(`Extra transactions: ${extracted.length - expected.length}`); + } + + return { matches, total: expected.length, errors, variations }; +} + +/** + * Find all test cases (PDF + JSON pairs) in .nogit/ + */ +function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> { + const testDir = path.join(process.cwd(), '.nogit'); + if (!fs.existsSync(testDir)) { + return []; + } + + const files = fs.readdirSync(testDir); + const pdfFiles = files.filter((f: string) => f.endsWith('.pdf')); + const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = []; + + for (const pdf of pdfFiles) { + const baseName = pdf.replace('.pdf', ''); + const jsonFile = `${baseName}.json`; + if (files.includes(jsonFile)) { + testCases.push({ + name: baseName, + pdfPath: path.join(testDir, pdf), + jsonPath: path.join(testDir, jsonFile), + }); + } + } + + return testCases.sort((a, b) => a.name.localeCompare(b.name)); 
+} + +// Tests + +tap.test('setup: ensure containers are running', async () => { + console.log('\n[Setup] Checking Docker containers...\n'); + + // Nanonets for OCR + const nanonetsOk = await ensureNanonetsOcr(); + expect(nanonetsOk).toBeTrue(); + + // Ollama for Qwen3 + const ollamaOk = await ensureMiniCpm(); + expect(ollamaOk).toBeTrue(); + + // Qwen3 model + const qwenOk = await ensureQwen3(); + expect(qwenOk).toBeTrue(); + + console.log('\n[Setup] All containers ready!\n'); +}); + +tap.test('should have models available', async () => { + // Check Nanonets + const nanonetsResp = await fetch(`${NANONETS_URL}/models`); + expect(nanonetsResp.ok).toBeTrue(); + + // Check Qwen3 + const ollamaResp = await fetch(`${OLLAMA_URL}/api/tags`); + expect(ollamaResp.ok).toBeTrue(); + const data = await ollamaResp.json(); + const modelNames = data.models.map((m: { name: string }) => m.name); + expect(modelNames.some((name: string) => name.includes('qwen3'))).toBeTrue(); +}); + +const testCases = findTestCases(); +console.log(`\nFound ${testCases.length} bank statement test cases (Nanonets + Qwen3)\n`); + +let passedCount = 0; +let failedCount = 0; + +for (const testCase of testCases) { + tap.test(`should extract: ${testCase.name}`, async () => { + const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8')); + console.log(`\n=== ${testCase.name} ===`); + console.log(`Expected: ${expected.length} transactions`); + + const images = convertPdfToImages(testCase.pdfPath); + console.log(` Pages: ${images.length}`); + + const extracted = await extractTransactions(images); + console.log(` Extracted: ${extracted.length} transactions`); + + const result = compareTransactions(extracted, expected); + const perfectMatch = result.matches === result.total && extracted.length === expected.length; + + if (perfectMatch) { + passedCount++; + console.log(` Result: PASS (${result.matches}/${result.total})`); + } else { + failedCount++; + console.log(` Result: FAIL 
(${result.matches}/${result.total})`); + result.errors.slice(0, 10).forEach((e) => console.log(` - ${e}`)); + } + + if (result.variations.length > 0) { + console.log(` Counterparty variations (${result.variations.length}):`); + result.variations.slice(0, 5).forEach((v) => console.log(` ${v}`)); + } + + expect(result.matches).toEqual(result.total); + expect(extracted.length).toEqual(expected.length); + }); +} + +tap.test('summary', async () => { + const total = testCases.length; + console.log(`\n======================================================`); + console.log(` Bank Statement Summary (Nanonets + Qwen3 Pipeline)`); + console.log(`======================================================`); + console.log(` Stage 1: Nanonets-OCR-s (document -> markdown)`); + console.log(` Stage 2: Qwen3 8B (markdown -> JSON)`); + console.log(` Passed: ${passedCount}/${total}`); + console.log(` Failed: ${failedCount}/${total}`); + console.log(`======================================================\n`); +}); + +export default tap.start(); diff --git a/test/test.invoices.nanonets.ts b/test/test.invoices.nanonets.ts new file mode 100644 index 0000000..f09978e --- /dev/null +++ b/test/test.invoices.nanonets.ts @@ -0,0 +1,584 @@ +/** + * Invoice extraction using Nanonets-OCR-s + Qwen3 (two-stage pipeline) + * + * Stage 1: Nanonets-OCR-s converts document pages to markdown (its strength) + * Stage 2: Qwen3 extracts structured JSON from the combined markdown + * + * This leverages each model's strengths: + * - Nanonets: Document OCR with semantic tags + * - Qwen3: Text understanding and JSON extraction + */ +import { tap, expect } from '@git.zone/tstest/tapbundle'; +import * as fs from 'fs'; +import * as path from 'path'; +import { execSync } from 'child_process'; +import * as os from 'os'; +import { ensureNanonetsOcr, ensureMiniCpm } from './helpers/docker.js'; + +const NANONETS_URL = 'http://localhost:8000/v1'; +const NANONETS_MODEL = 'nanonets/Nanonets-OCR-s'; + +const OLLAMA_URL = 
'http://localhost:11434'; +const QWEN_MODEL = 'qwen3:8b'; + +interface IInvoice { + invoice_number: string; + invoice_date: string; + vendor_name: string; + currency: string; + net_amount: number; + vat_amount: number; + total_amount: number; +} + +// Nanonets-specific prompt for document OCR to markdown +const NANONETS_OCR_PROMPT = `Extract the text from the above document as if you were reading it naturally. +Return the tables in html format. +Return the equations in LaTeX representation. +If there is an image in the document and image caption is not present, add a small description inside tag. +Watermarks should be wrapped in brackets. Ex: OFFICIAL COPY. +Page numbers should be wrapped in brackets. Ex: 14.`; + +// JSON extraction prompt for Qwen3 +const JSON_EXTRACTION_PROMPT = `You are an invoice data extractor. Below is an invoice document converted to text/markdown. Extract the key invoice fields as JSON. + +IMPORTANT RULES: +1. invoice_number: The unique invoice/document number (NOT VAT ID, NOT customer ID) +2. invoice_date: Format as YYYY-MM-DD +3. vendor_name: The company that issued the invoice +4. currency: EUR, USD, or GBP +5. net_amount: Amount before tax +6. vat_amount: Tax/VAT amount +7. 
total_amount: Final total (gross amount) + +Return ONLY this JSON format, no explanation: +{ + "invoice_number": "INV-2024-001", + "invoice_date": "2024-01-15", + "vendor_name": "Company Name", + "currency": "EUR", + "net_amount": 100.00, + "vat_amount": 19.00, + "total_amount": 119.00 +} + +INVOICE TEXT: +`; + +/** + * Convert PDF to PNG images using ImageMagick + */ +function convertPdfToImages(pdfPath: string): string[] { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-')); + const outputPattern = path.join(tempDir, 'page-%d.png'); + + try { + // Use 150 DPI to keep images within model's context length + execSync( + `convert -density 150 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`, + { stdio: 'pipe' } + ); + + const files = fs.readdirSync(tempDir).filter((f) => f.endsWith('.png')).sort(); + const images: string[] = []; + + for (const file of files) { + const imagePath = path.join(tempDir, file); + const imageData = fs.readFileSync(imagePath); + images.push(imageData.toString('base64')); + } + + return images; + } finally { + fs.rmSync(tempDir, { recursive: true, force: true }); + } +} + +/** + * Stage 1: Convert a single page to markdown using Nanonets-OCR-s + */ +async function convertPageToMarkdown(image: string, pageNum: number): Promise { + console.log(` [Nanonets] Converting page ${pageNum} to markdown...`); + const startTime = Date.now(); + + const response = await fetch(`${NANONETS_URL}/chat/completions`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'Authorization': 'Bearer dummy', + }, + body: JSON.stringify({ + model: NANONETS_MODEL, + messages: [{ + role: 'user', + content: [ + { type: 'image_url', image_url: { url: `data:image/png;base64,${image}` }}, + { type: 'text', text: NANONETS_OCR_PROMPT }, + ], + }], + max_tokens: 4096, + temperature: 0.0, + }), + }); + + const elapsed = ((Date.now() - startTime) / 1000).toFixed(1); + + if (!response.ok) { + const errorText = 
await response.text(); + console.log(` [Nanonets] ERROR page ${pageNum}: ${response.status} - ${errorText}`); + throw new Error(`Nanonets API error: ${response.status}`); + } + + const data = await response.json(); + const content = (data.choices?.[0]?.message?.content || '').trim(); + console.log(` [Nanonets] Page ${pageNum} converted (${elapsed}s, ${content.length} chars)`); + return content; +} + +/** + * Stage 1: Convert all pages to markdown using Nanonets-OCR-s + */ +async function convertDocumentToMarkdown(images: string[]): Promise { + console.log(` [Stage 1] Converting ${images.length} page(s) to markdown with Nanonets-OCR-s...`); + + const markdownPages: string[] = []; + + for (let i = 0; i < images.length; i++) { + const markdown = await convertPageToMarkdown(images[i], i + 1); + markdownPages.push(`--- PAGE ${i + 1} ---\n${markdown}`); + } + + const fullMarkdown = markdownPages.join('\n\n'); + console.log(` [Stage 1] Complete: ${fullMarkdown.length} chars total`); + return fullMarkdown; +} + +/** + * Ensure Qwen3 model is available + */ +async function ensureQwen3(): Promise { + try { + const response = await fetch(`${OLLAMA_URL}/api/tags`); + if (response.ok) { + const data = await response.json(); + const models = data.models || []; + if (models.some((m: { name: string }) => m.name === QWEN_MODEL)) { + console.log(` [Ollama] Model available: ${QWEN_MODEL}`); + return true; + } + } + } catch { + return false; + } + + console.log(` [Ollama] Pulling ${QWEN_MODEL}...`); + const pullResponse = await fetch(`${OLLAMA_URL}/api/pull`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ name: QWEN_MODEL, stream: false }), + }); + + return pullResponse.ok; +} + +/** + * Parse amount from string (handles European format) + */ +function parseAmount(s: string | number | undefined): number { + if (s === undefined || s === null) return 0; + if (typeof s === 'number') return s; + const match = s.match(/([\d.,]+)/); + if 
(!match) return 0; + const numStr = match[1]; + // Handle European format: 1.234,56 -> 1234.56 + const normalized = numStr.includes(',') && numStr.indexOf(',') > numStr.lastIndexOf('.') + ? numStr.replace(/\./g, '').replace(',', '.') + : numStr.replace(/,/g, ''); + return parseFloat(normalized) || 0; +} + +/** + * Extract invoice number from potentially verbose response + */ +function extractInvoiceNumber(s: string | undefined): string { + if (!s) return ''; + let clean = s.replace(/\*\*/g, '').replace(/`/g, '').trim(); + const patterns = [ + /\b([A-Z]{2,3}\d{10,})\b/i, // IEE2022006460244 + /\b([A-Z]\d{8,})\b/i, // R0014359508 + /\b(INV[-\s]?\d{4}[-\s]?\d+)\b/i, // INV-2024-001 + /\b(\d{7,})\b/, // 1579087430 + ]; + for (const pattern of patterns) { + const match = clean.match(pattern); + if (match) return match[1]; + } + return clean.replace(/[^A-Z0-9-]/gi, '').trim() || clean; +} + +/** + * Extract date (YYYY-MM-DD) from response + */ +function extractDate(s: string | undefined): string { + if (!s) return ''; + let clean = s.replace(/\*\*/g, '').replace(/`/g, '').trim(); + const isoMatch = clean.match(/(\d{4}-\d{2}-\d{2})/); + if (isoMatch) return isoMatch[1]; + // Try DD/MM/YYYY or DD.MM.YYYY + const dmyMatch = clean.match(/(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})/); + if (dmyMatch) { + return `${dmyMatch[3]}-${dmyMatch[2].padStart(2, '0')}-${dmyMatch[1].padStart(2, '0')}`; + } + return clean.replace(/[^\d-]/g, '').trim(); +} + +/** + * Extract currency + */ +function extractCurrency(s: string | undefined): string { + if (!s) return 'EUR'; + const upper = s.toUpperCase(); + if (upper.includes('EUR') || upper.includes('€')) return 'EUR'; + if (upper.includes('USD') || upper.includes('$')) return 'USD'; + if (upper.includes('GBP') || upper.includes('£')) return 'GBP'; + return 'EUR'; +} + +/** + * Extract JSON from response (handles markdown code blocks) + */ +function extractJsonFromResponse(response: string): Record | null { + // Remove thinking tags if present 
(Qwen3 may include ...)
  // NOTE(review): the pattern below has no opening-tag anchor, so everything up to
  // and including each closing </think> tag is removed — confirm this is intended.
  let cleanResponse = response.replace(/[\s\S]*?<\/think>/g, '').trim();

  // Try to find JSON in markdown code block
  const codeBlockMatch = cleanResponse.match(/```(?:json)?\s*([\s\S]*?)```/);
  const jsonStr = codeBlockMatch ? codeBlockMatch[1].trim() : cleanResponse;

  try {
    return JSON.parse(jsonStr);
  } catch {
    // Try to find JSON object pattern
    const jsonMatch = jsonStr.match(/\{[\s\S]*\}/);
    if (jsonMatch) {
      try {
        return JSON.parse(jsonMatch[0]);
      } catch {
        return null;
      }
    }
    return null;
  }
}

/**
 * Parse a model response into an IInvoice.
 *
 * The JSON payload is located via extractJsonFromResponse and every field is
 * passed through the matching normalizer (invoice number, date, currency,
 * amounts). Markdown emphasis markers and backticks are stripped from the
 * vendor name.
 *
 * @param response Raw text returned by the model.
 * @returns The normalized invoice, or null when no JSON could be extracted.
 */
function parseJsonToInvoice(response: string): IInvoice | null {
  const parsed = extractJsonFromResponse(response);
  if (!parsed) return null;

  return {
    invoice_number: extractInvoiceNumber(String(parsed.invoice_number || '')),
    invoice_date: extractDate(String(parsed.invoice_date || '')),
    vendor_name: String(parsed.vendor_name || '').replace(/\*\*/g, '').replace(/`/g, '').trim(),
    currency: extractCurrency(String(parsed.currency || '')),
    net_amount: parseAmount(parsed.net_amount as string | number),
    vat_amount: parseAmount(parsed.vat_amount as string | number),
    total_amount: parseAmount(parsed.total_amount as string | number),
  };
}

/**
 * Stage 2: Extract invoice from markdown using Qwen3.
 *
 * Sends one non-streaming chat request to the Ollama endpoint and parses the
 * answer.
 *
 * @param markdown Markdown produced by the OCR stage.
 * @param queryId Label used to tag the log lines of this request.
 * @returns The parsed invoice, or null when the answer contained no usable JSON.
 * @throws Error when the Ollama API answers with a non-2xx status.
 */
async function extractInvoiceFromMarkdown(markdown: string, queryId: string): Promise<IInvoice | null> {
  console.log(` [${queryId}] Sending markdown to ${QWEN_MODEL}...`);
  const startTime = Date.now();

  const response = await fetch(`${OLLAMA_URL}/api/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: QWEN_MODEL,
      messages: [{
        role: 'user',
        content: JSON_EXTRACTION_PROMPT + markdown,
      }],
      stream: false,
      options: {
        num_predict: 2000,
        temperature: 0.1,
      },
    }),
  });

  // Measured when the response headers arrive, before the body is read.
  const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);

  if (!response.ok) {
    console.log(` [${queryId}] ERROR: ${response.status} (${elapsed}s)`);
    throw new Error(`Ollama API error: ${response.status}`);
  }

  const data = await response.json();
  const content = (data.message?.content || '').trim();
  console.log(` [${queryId}] Response received (${elapsed}s, ${content.length} chars)`);

  return parseJsonToInvoice(content);
}

/**
 * Compare two invoices for consensus (key fields must match).
 *
 * Invoice numbers are compared case-insensitively, dates exactly, and totals
 * must differ by less than 0.02. A NaN total never matches.
 */
function invoicesMatch(a: IInvoice, b: IInvoice): boolean {
  const numMatch = a.invoice_number.toLowerCase() === b.invoice_number.toLowerCase();
  const dateMatch = a.invoice_date === b.invoice_date;
  const totalMatch = Math.abs(a.total_amount - b.total_amount) < 0.02;
  return numMatch && dateMatch && totalMatch;
}

/**
 * Stage 2: Extract invoice using Qwen3 with consensus.
 *
 * Each attempt runs two independent extractions; the first attempt whose two
 * results agree (see invoicesMatch) wins. When no attempt reaches consensus,
 * one extra extraction is requested. If that one cannot be parsed either, the
 * most recent successfully parsed extraction is returned, and only when
 * nothing was ever parsed is an empty invoice (currency 'EUR') returned.
 *
 * @param markdown Markdown produced by the OCR stage.
 * @returns Always an invoice; never null.
 * @throws Error propagated from extractInvoiceFromMarkdown on API failure.
 */
async function extractWithConsensus(markdown: string): Promise<IInvoice> {
  const MAX_ATTEMPTS = 3;
  // Most recent extraction that parsed, kept as a better last resort than an empty invoice.
  let lastParsed: IInvoice | null = null;
  console.log(` [Stage 2] Extracting invoice with ${QWEN_MODEL} (consensus)...`);

  for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
    console.log(`\n [Stage 2] --- Attempt ${attempt}/${MAX_ATTEMPTS} ---`);

    // Extract twice
    const inv1 = await extractInvoiceFromMarkdown(markdown, `A${attempt}Q1`);
    const inv2 = await extractInvoiceFromMarkdown(markdown, `A${attempt}Q2`);
    lastParsed = inv2 ?? inv1 ?? lastParsed;

    if (!inv1 || !inv2) {
      console.log(` [Stage 2] Parsing failed, retrying...`);
      continue;
    }

    console.log(` [Stage 2] Q1: ${inv1.invoice_number} | ${inv1.invoice_date} | ${inv1.total_amount} ${inv1.currency}`);
    console.log(` [Stage 2] Q2: ${inv2.invoice_number} | ${inv2.invoice_date} | ${inv2.total_amount} ${inv2.currency}`);

    if (invoicesMatch(inv1, inv2)) {
      console.log(` [Stage 2] CONSENSUS REACHED`);
      return inv2;
    }

    console.log(` [Stage 2] NO CONSENSUS`);
  }

  // Fallback: one more extraction, accepted without a consensus check
  console.log(`\n [Stage 2] === FALLBACK ===`);
  const fallback = await extractInvoiceFromMarkdown(markdown, 'FALLBACK');

  if (fallback) {
    console.log(` [Stage 2] ~ FALLBACK: ${fallback.invoice_number} | ${fallback.invoice_date} | ${fallback.total_amount}`);
    return fallback;
  }

  // The fallback request was unparsable: reuse the last extraction that did parse
  if (lastParsed) {
    console.log(` [Stage 2] ~ FALLBACK (last parsed): ${lastParsed.invoice_number} | ${lastParsed.invoice_date} | ${lastParsed.total_amount}`);
    return lastParsed;
  }

  // Return empty invoice if all else fails
  return {
    invoice_number: '',
    invoice_date: '',
    vendor_name: '',
    currency: 'EUR',
    net_amount: 0,
    vat_amount: 0,
    total_amount: 0,
  };
}

/**
 * Full pipeline: PDF -> Images -> Markdown -> JSON
 *
 * @param images Page images of one document, as produced by convertPdfToImages.
 * @returns The invoice extracted from the document.
 */
async function extractInvoice(images: string[]): Promise<IInvoice> {
  // Stage 1: Convert to markdown
  const markdown = await convertDocumentToMarkdown(images);

  // Stage 2: Extract invoice with consensus
  const invoice = await extractWithConsensus(markdown);

  return invoice;
}

/**
 * Normalize date to YYYY-MM-DD.
 *
 * Recognized inputs: YYYY-MM-DD (returned as is), D-MON-YYYY with a
 * three-letter month (case-insensitive), and D/M/YYYY or D.M.YYYY, read
 * day-first. Anything else is returned unchanged; null or empty input yields
 * an empty string.
 *
 * @param dateStr Date string to normalize.
 * @returns The normalized date, the input itself when unrecognized, or ''.
 */
function normalizeDate(dateStr: string | null): string {
  if (!dateStr) return '';
  if (/^\d{4}-\d{2}-\d{2}$/.test(dateStr)) return dateStr;

  const monthMap: Record<string, string> = {
    JAN: '01', FEB: '02', MAR: '03', APR: '04', MAY: '05', JUN: '06',
    JUL: '07', AUG: '08', SEP: '09', OCT: '10', NOV: '11', DEC: '12',
  };

  let match = dateStr.match(/^(\d{1,2})-([A-Z]{3})-(\d{4})$/i);
  if (match) {
    // An unknown three-letter month abbreviation falls back to '01'.
    return `${match[3]}-${monthMap[match[2].toUpperCase()] || '01'}-${match[1].padStart(2, '0')}`;
  }

  match = dateStr.match(/^(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})$/);
  if (match) {
    return `${match[3]}-${match[2].padStart(2, '0')}-${match[1].padStart(2, '0')}`;
  }

  return dateStr;
}

/**
 * Compare extracted invoice against expected.
 *
 * Checks invoice number (ignoring whitespace and case), date (after
 * normalizeDate), total amount (tolerance 0.02) and currency (ignoring case).
 *
 * @param extracted Invoice produced by the pipeline.
 * @param expected Reference invoice loaded from the test fixture.
 * @returns `match` is true when `errors` is empty; `errors` has one message per mismatching field.
 */
function compareInvoice(
  extracted: IInvoice,
  expected: IInvoice
): { match: boolean; errors: string[] } {
  const errors: string[] = [];

  // Compare invoice number (normalize by removing spaces and case)
  const extNum = extracted.invoice_number?.replace(/\s/g, '').toLowerCase() || '';
  const expNum = expected.invoice_number?.replace(/\s/g, '').toLowerCase() || '';
  if (extNum !== expNum) {
    errors.push(`invoice_number: expected "${expected.invoice_number}", got "${extracted.invoice_number}"`);
  }

  // Compare date
  if (normalizeDate(extracted.invoice_date) !== normalizeDate(expected.invoice_date)) {
    errors.push(`invoice_date: expected "${expected.invoice_date}", got "${extracted.invoice_date}"`);
  }

  // Compare total amount (with tolerance). Written as a negated "<=" so that a
  // NaN or missing total is reported as a mismatch instead of slipping through.
  const totalDiff = Math.abs(extracted.total_amount - expected.total_amount);
  if (!(totalDiff <= 0.02)) {
    errors.push(`total_amount: expected ${expected.total_amount}, got ${extracted.total_amount}`);
  }

  // Compare currency
  if (extracted.currency?.toUpperCase() !== expected.currency?.toUpperCase()) {
    errors.push(`currency: expected "${expected.currency}", got "${extracted.currency}"`);
  }

  return { match: errors.length === 0, errors };
}

/**
 * Find all test cases (PDF + JSON pairs) in .nogit/invoices/
 *
 * A PDF is only included when a JSON file with the same base name sits next
 * to it.
 *
 * @returns The test cases sorted by name; empty when the directory does not exist.
 */
function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const testDir = path.join(process.cwd(), '.nogit/invoices');
  if (!fs.existsSync(testDir)) {
    return [];
  }

  const files = fs.readdirSync(testDir);
  const pdfFiles = files.filter((f) => f.endsWith('.pdf'));
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];

  for (const pdf of pdfFiles) {
    // Strip only the trailing extension; a '.pdf' earlier in the name is kept.
    const baseName = path.basename(pdf, '.pdf');
    const jsonFile = `${baseName}.json`;
    if (files.includes(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }

  testCases.sort((a, b) => a.name.localeCompare(b.name));
  return testCases;
}

// Tests

tap.test('setup: ensure containers are running', async () => {
  console.log('\n[Setup] Checking Docker containers...\n');

  // Nanonets for OCR
  const nanonetsOk = await ensureNanonetsOcr();
  expect(nanonetsOk).toBeTrue();

  // Ollama for Qwen3
  const ollamaOk = await ensureMiniCpm();
  expect(ollamaOk).toBeTrue();

  // Qwen3 model
  const qwenOk = await ensureQwen3();
  expect(qwenOk).toBeTrue();

  console.log('\n[Setup] All containers ready!\n');
});

tap.test('should have models available', async () => {
  // Check Nanonets
  const nanonetsResp = await fetch(`${NANONETS_URL}/models`);
  expect(nanonetsResp.ok).toBeTrue();

  // Check Qwen3
  const ollamaResp = await fetch(`${OLLAMA_URL}/api/tags`);
  expect(ollamaResp.ok).toBeTrue();
  const data = await ollamaResp.json();
  const modelNames = data.models.map((m: { name: string }) => m.name);
  expect(modelNames.some((name: string) => name.includes('qwen3'))).toBeTrue();
});

// Test cases are discovered at module load so that one tap test is registered per invoice.
const testCases = findTestCases();
console.log(`\nFound ${testCases.length} invoice test cases (Nanonets + Qwen3)\n`);

// Counters shared between the per-invoice tests and the summary test.
let passedCount = 0;
let failedCount = 0;
const processingTimes: number[] = [];

for (const testCase of testCases) {
  tap.test(`should extract invoice: ${testCase.name}`, async () => {
    const expected: IInvoice = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
    console.log(`\n=== ${testCase.name} ===`);
    console.log(`Expected: ${expected.invoice_number} | ${expected.invoice_date} | ${expected.total_amount} ${expected.currency}`);

    const startTime = Date.now();
    const images = convertPdfToImages(testCase.pdfPath);
    console.log(` Pages: ${images.length}`);

    const extracted = await extractInvoice(images);
    console.log(` Extracted: ${extracted.invoice_number} | ${extracted.invoice_date} | ${extracted.total_amount} ${extracted.currency}`);

    // Covers PDF rendering plus both pipeline stages.
    const elapsedMs = Date.now() - startTime;
    processingTimes.push(elapsedMs);

    const result = compareInvoice(extracted, expected);

    if (result.match) {
      passedCount++;
      console.log(` Result: MATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
    } else {
      failedCount++;
      console.log(` Result: MISMATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
      result.errors.forEach((e) => console.log(` - ${e}`));
    }

    expect(result.match).toBeTrue();
  });
}

tap.test('summary', async () => {
  const totalInvoices = testCases.length;
  const accuracy = totalInvoices > 0 ? (passedCount / totalInvoices) * 100 : 0;
  const totalTimeMs = processingTimes.reduce((a, b) => a + b, 0);
  const avgTimeSec = processingTimes.length > 0 ? totalTimeMs / processingTimes.length / 1000 : 0;

  console.log(`\n========================================`);
  console.log(` Invoice Extraction Summary`);
  console.log(` (Nanonets + Qwen3 Pipeline)`);
  console.log(`========================================`);
  console.log(` Stage 1: Nanonets-OCR-s (doc -> md)`);
  console.log(` Stage 2: Qwen3 8B (md -> JSON)`);
  console.log(` Passed: ${passedCount}/${totalInvoices}`);
  console.log(` Failed: ${failedCount}/${totalInvoices}`);
  console.log(` Accuracy: ${accuracy.toFixed(1)}%`);
  console.log(`----------------------------------------`);
  console.log(` Total time: ${(totalTimeMs / 1000).toFixed(1)}s`);
  console.log(` Avg per inv: ${avgTimeSec.toFixed(1)}s`);
  console.log(`========================================\n`);
});

export default tap.start();