diff --git a/changelog.md b/changelog.md
index 6fd5a68..4d4471a 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,5 +1,15 @@
 # Changelog
 
+## 2026-01-18 - 1.13.2 - fix(tests)
+stabilize OCR extraction tests and manage GPU containers
+
+- Add stopAllGpuContainers() and call it before starting GPU images to free GPU memory.
+- Remove the PaddleOCR-VL image configs and their ensure helpers from the docker test helper to simplify the images list.
+- Split the invoice/bank statement tests into two sequential stages: Stage 1 runs Nanonets OCR to produce markdown files; Stage 2 stops Nanonets and runs model extraction from the saved markdown (avoids GPU contention).
+- Introduce temporary markdown directory handling and cleanup; add stopNanonets() and container-running checks in the tests.
+- Switch the bank statement extraction model from qwen3:8b to gpt-oss:20b; add request timeouts and improve logging/console output across tests.
+- Refactor extractWithConsensus and the extraction functions to accept document identifiers; improve error messages and JSON extraction robustness.
+
 ## 2026-01-18 - 1.13.1 - fix(image_support_files)
 remove PaddleOCR-VL server scripts from image_support_files
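The sequential hand-off described in the 1.13.2 notes above, reduced to a sketch. This is illustrative only, not part of the commit; `runSequentially`, `ocr`, and `extract` are hypothetical stand-ins for the staged tests in the diffs below:

```ts
// Sketch: GPU services run one at a time, handing off via markdown files on disk.
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import { execSync } from 'child_process';

const MD_DIR = path.join(os.tmpdir(), 'ocr-markdown');

async function runSequentially(
  docs: string[],
  ocr: (pdf: string) => Promise<string>,        // hypothetical Stage 1 worker
  extract: (md: string) => Promise<unknown>,    // hypothetical Stage 2 worker
): Promise<void> {
  fs.mkdirSync(MD_DIR, { recursive: true });
  for (const doc of docs) {
    // Stage 1: only the OCR container holds the GPU here.
    fs.writeFileSync(path.join(MD_DIR, path.basename(doc) + '.md'), await ocr(doc));
  }
  execSync('docker stop nanonets-test || true'); // free VRAM before Stage 2
  for (const doc of docs) {
    const md = fs.readFileSync(path.join(MD_DIR, path.basename(doc) + '.md'), 'utf-8');
    await extract(md); // Stage 2: the extraction model is the sole GPU tenant.
  }
}
```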
diff --git a/test/helpers/docker.ts b/test/helpers/docker.ts
index 7c8f77c..dec42ef 100644
--- a/test/helpers/docker.ts
+++ b/test/helpers/docker.ts
@@ -2,10 +2,6 @@ import { execSync } from 'child_process';
 
 // Project container names (only manage these)
 const PROJECT_CONTAINERS = [
-  'paddleocr-vl-test',
-  'paddleocr-vl-gpu-test',
-  'paddleocr-vl-cpu-test',
-  'paddleocr-vl-full-test',
   'minicpm-test',
   'nanonets-test',
 ];
@@ -24,30 +20,6 @@ export interface IImageConfig {
 }
 
 export const IMAGES = {
-  paddleocrVlGpu: {
-    name: 'paddleocr-vl-gpu',
-    dockerfile: 'Dockerfile_paddleocr_vl_gpu',
-    buildContext: '.',
-    containerName: 'paddleocr-vl-test',
-    ports: ['8000:8000'],
-    volumes: ['ht-huggingface-cache:/root/.cache/huggingface'],
-    gpus: true,
-    healthEndpoint: 'http://localhost:8000/health',
-    healthTimeout: 300000, // 5 minutes for model loading
-  } as IImageConfig,
-
-  paddleocrVlCpu: {
-    name: 'paddleocr-vl-cpu',
-    dockerfile: 'Dockerfile_paddleocr_vl_cpu',
-    buildContext: '.',
-    containerName: 'paddleocr-vl-test',
-    ports: ['8000:8000'],
-    volumes: ['ht-huggingface-cache:/root/.cache/huggingface'],
-    gpus: false,
-    healthEndpoint: 'http://localhost:8000/health',
-    healthTimeout: 300000,
-  } as IImageConfig,
-
   minicpm: {
     name: 'minicpm45v',
     dockerfile: 'Dockerfile_minicpm45v_gpu',
@@ -60,22 +32,6 @@ export const IMAGES = {
     healthTimeout: 120000,
   } as IImageConfig,
 
-  // Full PaddleOCR-VL pipeline with PP-DocLayoutV2 + structured JSON output
-  paddleocrVlFull: {
-    name: 'paddleocr-vl-full',
-    dockerfile: 'Dockerfile_paddleocr_vl_full',
-    buildContext: '.',
-    containerName: 'paddleocr-vl-full-test',
-    ports: ['8000:8000'],
-    volumes: [
-      'ht-huggingface-cache:/root/.cache/huggingface',
-      'ht-paddleocr-cache:/root/.paddleocr',
-    ],
-    gpus: true,
-    healthEndpoint: 'http://localhost:8000/health',
-    healthTimeout: 600000, // 10 minutes for model loading (vLLM + PP-DocLayoutV2)
-  } as IImageConfig,
-
   // Nanonets-OCR-s - Document OCR optimized VLM (Qwen2.5-VL-3B fine-tuned)
   nanonetsOcr: {
     name: 'nanonets-ocr',
@@ -140,7 +96,7 @@ export function removeContainer(containerName: string): void {
 }
 
 /**
- * Stop all project containers that conflict with the required one
+ * Stop all project containers that conflict with the required one (port-based)
  */
 export function stopConflictingContainers(requiredContainer: string, requiredPort: string): void {
   // Stop project containers using the same port
@@ -158,6 +114,24 @@ export function stopConflictingContainers(requiredContainer: string, requiredPor
   }
 }
 
+/**
+ * Stop all GPU-consuming project containers (for GPU memory management)
+ * This ensures GPU memory is freed before starting a new GPU service
+ */
+export function stopAllGpuContainers(exceptContainer?: string): void {
+  for (const container of PROJECT_CONTAINERS) {
+    if (container === exceptContainer) continue;
+
+    if (isContainerRunning(container)) {
+      console.log(`[Docker] Stopping GPU container: ${container}`);
+      exec(`docker stop ${container}`, true);
+    }
+  }
+  // Brief pause to allow GPU memory to be released
+  execSync('sleep 2');
+}
+
 /**
  * Build a Docker image
  */
@@ -234,6 +208,11 @@ export async function ensureService(config: IImageConfig): Promise<boolean> {
     buildImage(config);
   }
 
+  // For GPU services, stop ALL other GPU containers to free GPU memory
+  if (config.gpus) {
+    stopAllGpuContainers(config.containerName);
+  }
+
   // Stop conflicting containers on the same port
   const mainPort = config.ports[0];
   stopConflictingContainers(config.containerName, mainPort);
@@ -254,21 +233,7 @@ export async function ensureService(config: IImageConfig): Promise<boolean> {
 }
 
 /**
- * Ensure PaddleOCR-VL GPU service is running
- */
-export async function ensurePaddleOcrVlGpu(): Promise<boolean> {
-  return ensureService(IMAGES.paddleocrVlGpu);
-}
-
-/**
- * Ensure PaddleOCR-VL CPU service is running
- */
-export async function ensurePaddleOcrVlCpu(): Promise<boolean> {
-  return ensureService(IMAGES.paddleocrVlCpu);
-}
-
-/**
- * Ensure MiniCPM service is running
+ * Ensure MiniCPM service is running (Ollama with GPU)
  */
 export async function ensureMiniCpm(): Promise<boolean> {
   return ensureService(IMAGES.minicpm);
@@ -286,30 +251,6 @@ export function isGpuAvailable(): boolean {
   }
 }
 
-/**
- * Ensure PaddleOCR-VL service (auto-detect GPU/CPU)
- */
-export async function ensurePaddleOcrVl(): Promise<boolean> {
-  if (isGpuAvailable()) {
-    console.log('[Docker] GPU detected, using GPU image');
-    return ensurePaddleOcrVlGpu();
-  } else {
-    console.log('[Docker] No GPU detected, using CPU image');
-    return ensurePaddleOcrVlCpu();
-  }
-}
-
-/**
- * Ensure PaddleOCR-VL Full Pipeline service (PP-DocLayoutV2 + structured output)
- * This is the recommended service for production use - outputs structured JSON/Markdown
- */
-export async function ensurePaddleOcrVlFull(): Promise<boolean> {
-  if (!isGpuAvailable()) {
-    console.log('[Docker] WARNING: Full pipeline requires GPU, but none detected');
-  }
-  return ensureService(IMAGES.paddleocrVlFull);
-}
-
 /**
  * Ensure an Ollama model is pulled and available
  * Uses the MiniCPM container (which runs Ollama) to pull the model
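A minimal usage sketch for the updated helper, assuming the exports shown above (the test files below go through thin wrappers like `ensureNanonetsOcr()` instead of calling `ensureService()` directly):

```ts
import { IMAGES, ensureService, stopAllGpuContainers, isGpuAvailable } from './helpers/docker.js';

async function setup(): Promise<void> {
  if (!isGpuAvailable()) throw new Error('GPU required for this test run');

  // ensureService() now calls stopAllGpuContainers(config.containerName)
  // internally for GPU images, so starting Nanonets implicitly evicts any
  // other project container that is holding VRAM.
  const ok = await ensureService(IMAGES.nanonetsOcr);
  if (!ok) throw new Error('nanonets-ocr service did not become healthy');

  // An explicit call is only needed to free the GPU without starting
  // a replacement service:
  stopAllGpuContainers();
}
```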
diff --git a/test/test.bankstatements.nanonets.ts b/test/test.bankstatements.nanonets.ts
index 287b891..0a945af 100644
--- a/test/test.bankstatements.nanonets.ts
+++ b/test/test.bankstatements.nanonets.ts
@@ -1,25 +1,26 @@
 /**
- * Bank statement extraction using Nanonets-OCR-s + Qwen3 (two-stage pipeline)
+ * Bank statement extraction using Nanonets-OCR-s + GPT-OSS 20B (sequential two-stage pipeline)
  *
- * Stage 1: Nanonets-OCR-s converts document pages to markdown (its strength)
- * Stage 2: Qwen3 extracts structured JSON from the combined markdown
+ * Stage 1: Nanonets-OCR-s converts ALL document pages to markdown (stop after completion)
+ * Stage 2: GPT-OSS 20B extracts structured JSON from saved markdown (after Nanonets stops)
  *
- * This leverages each model's strengths:
- * - Nanonets: Document OCR with semantic tags
- * - Qwen3: Text understanding and JSON extraction
+ * This approach avoids GPU contention by running the services sequentially.
  */
 import { tap, expect } from '@git.zone/tstest/tapbundle';
 import * as fs from 'fs';
 import * as path from 'path';
 import { execSync } from 'child_process';
 import * as os from 'os';
-import { ensureNanonetsOcr, ensureMiniCpm } from './helpers/docker.js';
+import { ensureNanonetsOcr, ensureMiniCpm, removeContainer, isContainerRunning } from './helpers/docker.js';
 
 const NANONETS_URL = 'http://localhost:8000/v1';
 const NANONETS_MODEL = 'nanonets/Nanonets-OCR-s';
 const OLLAMA_URL = 'http://localhost:11434';
-const QWEN_MODEL = 'qwen3:8b';
+const EXTRACTION_MODEL = 'gpt-oss:20b';
+
+// Temp directory for storing markdown between stages
+const TEMP_MD_DIR = path.join(os.tmpdir(), 'nanonets-markdown');
 
 interface ITransaction {
   date: string;
@@ -27,6 +28,14 @@ interface ITransaction {
   amount: number;
 }
 
+interface ITestCase {
+  name: string;
+  pdfPath: string;
+  jsonPath: string;
+  markdownPath?: string;
+  images?: string[];
+}
+
 // Nanonets-specific prompt for document OCR to markdown
 const NANONETS_OCR_PROMPT = `Extract the text from the above document as if you were reading it naturally.
 Return the tables in html format.
@@ -35,24 +44,10 @@
 If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag.
 Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>.
 Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number>.`;
 
-// JSON extraction prompt for Qwen3
-const JSON_EXTRACTION_PROMPT = `You are a financial data extractor. Below is a bank statement converted to text/markdown. Extract ALL transactions from it as a JSON array.
+// JSON extraction prompt for GPT-OSS 20B
+const JSON_EXTRACTION_PROMPT = `Extract ALL transactions from this bank statement as JSON array. Each transaction: {"date": "YYYY-MM-DD", "counterparty": "NAME", "amount": -25.99}. Amount negative for debits, positive for credits. Only include actual transactions, not balances. Return ONLY JSON array, no explanation.
 
-IMPORTANT RULES:
-1. Each transaction has: date, description/counterparty, and an amount
-2. Amount is NEGATIVE for money going OUT (debits, payments, withdrawals)
-3. Amount is POSITIVE for money coming IN (credits, deposits, refunds)
-4. Date format: YYYY-MM-DD
-5. Do NOT include: opening balance, closing balance, subtotals, headers, or summary rows
-6. Only include actual transactions with a specific date and amount
-
-Return ONLY this JSON format, no explanation:
-[
-  {"date": "2021-06-01", "counterparty": "COMPANY NAME", "amount": -25.99},
-  {"date": "2021-06-02", "counterparty": "DEPOSIT FROM", "amount": 100.00}
-]
-
-BANK STATEMENT TEXT:
+STATEMENT:
 `;
 
 /**
@@ -63,7 +58,6 @@ function convertPdfToImages(pdfPath: string): string[] {
   const outputPattern = path.join(tempDir, 'page-%d.png');
 
   try {
-    // Use 150 DPI to keep images within model's context length
     execSync(
      `convert -density 150 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
      { stdio: 'pipe' }
@@ -85,10 +79,9 @@
 }
 
 /**
- * Stage 1: Convert a single page to markdown using Nanonets-OCR-s
+ * Convert a single page to markdown using Nanonets-OCR-s
  */
 async function convertPageToMarkdown(image: string, pageNum: number): Promise<string> {
-  console.log(`  [Nanonets] Converting page ${pageNum} to markdown...`);
   const startTime = Date.now();
 
   const response = await fetch(`${NANONETS_URL}/chat/completions`, {
@@ -115,21 +108,20 @@
 }
 
 /**
- * Stage 1: Convert all pages to markdown using Nanonets
+ * Convert all pages to markdown using Nanonets
  */
-async function convertDocumentToMarkdown(images: string[]): Promise<string> {
-  console.log(`  [Stage 1] Converting ${images.length} page(s) to markdown with Nanonets-OCR-s...`);
+async function convertDocumentToMarkdown(images: string[], docName: string): Promise<string> {
+  console.log(`  [${docName}] Converting ${images.length} page(s)...`);
 
   const markdownPages: string[] = [];
@@ -139,21 +131,55 @@
   }
 
   const fullMarkdown = markdownPages.join('\n\n');
-  console.log(`  [Stage 1] Complete: ${fullMarkdown.length} chars total`);
+  console.log(`  [${docName}] Complete: ${fullMarkdown.length} chars total`);
 
   return fullMarkdown;
 }
 
 /**
- * Ensure Qwen3 model is available
+ * Stop Nanonets container
  */
-async function ensureQwen3(): Promise<boolean> {
+function stopNanonets(): void {
+  console.log('  [Docker] Stopping Nanonets container...');
+  try {
+    execSync('docker stop nanonets-test 2>/dev/null || true', { stdio: 'pipe' });
+    // Wait for GPU memory to be released
+    execSync('sleep 5', { stdio: 'pipe' });
+    console.log('  [Docker] Nanonets stopped');
+  } catch {
+    console.log('  [Docker] Nanonets was not running');
+  }
+}
+
+/**
+ * Ensure GPT-OSS 20B model is available and warmed up
+ */
+async function ensureExtractionModel(): Promise<boolean> {
   try {
     const response = await fetch(`${OLLAMA_URL}/api/tags`);
     if (response.ok) {
       const data = await response.json();
       const models = data.models || [];
-      if (models.some((m: { name: string }) => m.name === QWEN_MODEL)) {
-        console.log(`  [Ollama] Model available: ${QWEN_MODEL}`);
+      if (models.some((m: { name: string }) => m.name === EXTRACTION_MODEL)) {
+        console.log(`  [Ollama] Model available: ${EXTRACTION_MODEL}`);
+
+        // Warmup: send a simple request to ensure model is loaded
+        console.log(`  [Ollama] Warming up model...`);
+        const warmupResponse = await fetch(`${OLLAMA_URL}/api/chat`, {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({
+            model: EXTRACTION_MODEL,
+            messages: [{ role: 'user', content: 'Return: [{"test": 1}]' }],
+            stream: false,
+          }),
+          signal: AbortSignal.timeout(120000),
+        });
+
+        if (warmupResponse.ok) {
+          const warmupData = await warmupResponse.json();
+          console.log(`  [Ollama] Warmup complete (${warmupData.message?.content?.length || 0} chars)`);
+        }
+
         return true;
       }
     }
@@ -161,77 +187,92 @@ async function ensureQwen3(): Promise<boolean> {
     return false;
   }
 
-  console.log(`  [Ollama] Pulling ${QWEN_MODEL}...`);
+  console.log(`  [Ollama] Pulling ${EXTRACTION_MODEL}...`);
   const pullResponse = await fetch(`${OLLAMA_URL}/api/pull`, {
     method: 'POST',
     headers: { 'Content-Type': 'application/json' },
-    body: JSON.stringify({ name: QWEN_MODEL, stream: false }),
+    body: JSON.stringify({ name: EXTRACTION_MODEL, stream: false }),
   });
 
   return pullResponse.ok;
 }
 
 /**
- * Stage 2: Extract transactions from markdown using Qwen3
+ * Extract transactions from markdown using GPT-OSS 20B (streaming)
  */
 async function extractTransactionsFromMarkdown(markdown: string, queryId: string): Promise<ITransaction[]> {
-  console.log(`  [${queryId}] Sending markdown to ${QWEN_MODEL}...`);
+  console.log(`  [${queryId}] Sending to ${EXTRACTION_MODEL}...`);
+  console.log(`  [${queryId}] Markdown length: ${markdown.length}`);
   const startTime = Date.now();
 
+  const fullPrompt = JSON_EXTRACTION_PROMPT + markdown;
+  console.log(`  [${queryId}] Prompt preview: ${fullPrompt.substring(0, 200)}...`);
+
   const response = await fetch(`${OLLAMA_URL}/api/chat`, {
     method: 'POST',
     headers: { 'Content-Type': 'application/json' },
     body: JSON.stringify({
-      model: QWEN_MODEL,
+      model: EXTRACTION_MODEL,
       messages: [{
         role: 'user',
-        content: JSON_EXTRACTION_PROMPT + markdown,
+        content: fullPrompt,
       }],
-      stream: false,
-      options: {
-        num_predict: 8000,
-        temperature: 0.1,
-      },
+      stream: true,
     }),
+    signal: AbortSignal.timeout(600000), // 10 minute timeout
   });
 
-  const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
-
   if (!response.ok) {
+    const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
     console.log(`  [${queryId}] ERROR: ${response.status} (${elapsed}s)`);
     throw new Error(`Ollama API error: ${response.status}`);
   }
 
-  const data = await response.json();
-  const content = (data.message?.content || '').trim();
-  console.log(`  [${queryId}] Response received (${elapsed}s, ${content.length} chars)`);
+  // Stream the response and log to console
+  let content = '';
+  const reader = response.body!.getReader();
+  const decoder = new TextDecoder();
+
+  process.stdout.write(`  [${queryId}] `);
+
+  while (true) {
+    const { done, value } = await reader.read();
+    if (done) break;
+
+    const chunk = decoder.decode(value, { stream: true });
+    // Each line is a JSON object
+    for (const line of chunk.split('\n').filter(l => l.trim())) {
+      try {
+        const json = JSON.parse(line);
+        const token = json.message?.content || '';
+        if (token) {
+          process.stdout.write(token);
+          content += token;
+        }
+      } catch {
+        // Ignore parse errors for partial chunks
+      }
+    }
+  }
+
+  const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
+  console.log(`\n  [${queryId}] Done: ${content.length} chars (${elapsed}s)`);
 
   return parseJsonResponse(content, queryId);
 }
 
 /**
- * Sanitize JSON string - fix common issues from LLM output
+ * Sanitize JSON string
  */
 function sanitizeJson(jsonStr: string): string {
   let s = jsonStr;
-
-  // Fix +number (e.g., +93.80 -> 93.80) - JSON doesn't allow + prefix
   s = s.replace(/"amount"\s*:\s*\+/g, '"amount": ');
   s = s.replace(/:\s*\+(\d)/g, ': $1');
-
-  // Fix European number format with thousands separator
   s = s.replace(/"amount"\s*:\s*(-?)(\d{1,3})\.(\d{3})\.(\d{2})\b/g, '"amount": $1$2$3.$4');
-
-  // Fix trailing commas before ] or }
   s = s.replace(/,\s*([}\]])/g, '$1');
-
-  // Fix unescaped newlines/tabs inside strings
   s = s.replace(/"([^"\\]*)\n([^"]*)"/g, '"$1 $2"');
   s = s.replace(/"([^"\\]*)\t([^"]*)"/g, '"$1 $2"');
-
-  // Remove control characters
   s = s.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g, ' ');
 
   return s;
 }
@@ -243,7 +284,6 @@ function parseAmount(value: unknown): number {
   if (typeof value !== 'string') return 0;
 
   let s = value.replace(/[€$£\s]/g, '').replace('−', '-').replace('–', '-');
-  // European format: comma is decimal
   if (s.includes(',') && s.indexOf(',') > s.lastIndexOf('.')) {
     s = s.replace(/\./g, '').replace(',', '.');
   } else {
@@ -256,16 +296,14 @@ function parseAmount(value: unknown): number {
  * Parse JSON response into transactions
  */
 function parseJsonResponse(response: string, queryId: string): ITransaction[] {
-  console.log(`  [${queryId}] Parsing response...`);
-
-  // Remove thinking tags if present (Qwen3 may include <think>...</think>)
+  // Remove thinking tags if present
   let cleanResponse = response.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
 
-  // Try to find JSON in markdown code block
+  // Debug: show what we're working with
+  console.log(`  [${queryId}] Response preview: ${cleanResponse.substring(0, 300)}...`);
+
   const codeBlockMatch = cleanResponse.match(/```(?:json)?\s*([\s\S]*?)```/);
   let jsonStr = codeBlockMatch ? codeBlockMatch[1].trim() : cleanResponse;
-
-  // Sanitize JSON
   jsonStr = sanitizeJson(jsonStr);
 
   try {
@@ -280,11 +318,10 @@ function parseJsonResponse(response: string, queryId: string): ITransaction[] {
       return txs;
     }
   } catch (e) {
-    console.log(`  [${queryId}] Direct parse failed: ${(e as Error).message}`);
-
-    // Try to find JSON array pattern
+    // Try to find a JSON array in the text
     const arrayMatch = jsonStr.match(/\[[\s\S]*\]/);
     if (arrayMatch) {
+      console.log(`  [${queryId}] Array match found: ${arrayMatch[0].length} chars`);
       try {
         const parsed = JSON.parse(sanitizeJson(arrayMatch[0]));
         if (Array.isArray(parsed)) {
@@ -296,93 +333,36 @@ function parseJsonResponse(response: string, queryId: string): ITransaction[] {
           console.log(`  [${queryId}] Parsed ${txs.length} transactions (array match)`);
           return txs;
         }
-      } catch (e2) {
-        console.log(`  [${queryId}] Array parse failed: ${(e2 as Error).message}`);
+      } catch (innerErr) {
+        console.log(`  [${queryId}] Array parse error: ${(innerErr as Error).message}`);
       }
+    } else {
+      console.log(`  [${queryId}] No JSON array found in response`);
     }
   }
 
-  console.log(`  [${queryId}] PARSE FAILED - returning empty array`);
+  console.log(`  [${queryId}] PARSE FAILED`);
   return [];
 }
 
 /**
- * Compare two transaction arrays for consensus
+ * Extract transactions (single pass)
  */
-function transactionArraysMatch(a: ITransaction[], b: ITransaction[]): boolean {
-  if (a.length !== b.length) return false;
-
-  for (let i = 0; i < a.length; i++) {
-    const dateMatch = a[i].date === b[i].date;
-    const amountMatch = Math.abs(a[i].amount - b[i].amount) < 0.01;
-    if (!dateMatch || !amountMatch) return false;
-  }
-
-  return true;
+async function extractTransactions(markdown: string, docName: string): Promise<ITransaction[]> {
+  console.log(`  [${docName}] Extracting...`);
+  const txs = await extractTransactionsFromMarkdown(markdown, docName);
+  console.log(`  [${docName}] Extracted ${txs.length} transactions`);
+  return txs;
 }
 
 /**
- * Stage 2: Extract transactions using Qwen3 with consensus
- */
-async function extractWithConsensus(markdown: string): Promise<ITransaction[]> {
-  const MAX_ATTEMPTS = 3;
-  console.log(`  [Stage 2] Extracting transactions with ${QWEN_MODEL} (consensus)...`);
-
-  for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
-    console.log(`\n  [Stage 2] --- Attempt ${attempt}/${MAX_ATTEMPTS} ---`);
-
-    // Extract twice in parallel
-    const [txs1, txs2] = await Promise.all([
-      extractTransactionsFromMarkdown(markdown, `A${attempt}Q1`),
-      extractTransactionsFromMarkdown(markdown, `A${attempt}Q2`),
-    ]);
-
-    console.log(`  [Stage 2] Results: Q1=${txs1.length} txs, Q2=${txs2.length} txs`);
-
-    if (txs1.length > 0 && transactionArraysMatch(txs1, txs2)) {
-      console.log(`  [Stage 2] CONSENSUS REACHED: ${txs1.length} transactions`);
-      return txs1;
-    }
-
-    console.log(`  [Stage 2] NO CONSENSUS`);
-  }
-
-  // Fallback: use last response
-  console.log(`\n  [Stage 2] === FALLBACK ===`);
-  const fallback = await extractTransactionsFromMarkdown(markdown, 'FALLBACK');
-  console.log(`  [Stage 2] ~ FALLBACK RESULT: ${fallback.length} transactions`);
-  return fallback;
-}
-
-/**
- * Full pipeline: PDF -> Images -> Markdown -> JSON
- */
-async function extractTransactions(images: string[]): Promise<ITransaction[]> {
-  // Stage 1: Convert to markdown
-  const markdown = await convertDocumentToMarkdown(images);
-
-  // Stage 2: Extract transactions with consensus
-  const transactions = await extractWithConsensus(markdown);
-
-  // Log all transactions
-  console.log(`\n  [Result] Extracted ${transactions.length} transactions:`);
-  for (let i = 0; i < transactions.length; i++) {
-    const tx = transactions[i];
-    console.log(`  ${(i + 1).toString().padStart(2)}. ${tx.date} | ${tx.counterparty.substring(0, 30).padEnd(30)} | ${tx.amount >= 0 ? '+' : ''}${tx.amount.toFixed(2)}`);
-  }
-
-  return transactions;
-}
-
-/**
- * Compare extracted transactions against expected
+ * Compare transactions
  */
 function compareTransactions(
   extracted: ITransaction[],
   expected: ITransaction[]
-): { matches: number; total: number; errors: string[]; variations: string[] } {
+): { matches: number; total: number; errors: string[] } {
   const errors: string[] = [];
-  const variations: string[] = [];
   let matches = 0;
 
   for (let i = 0; i < expected.length; i++) {
@@ -390,7 +370,7 @@ function compareTransactions(
     const ext = extracted[i];
 
     if (!ext) {
-      errors.push(`Missing transaction ${i}: ${exp.date} ${exp.counterparty}`);
+      errors.push(`Missing tx ${i}: ${exp.date} ${exp.counterparty}`);
       continue;
     }
 
@@ -399,11 +379,8 @@ function compareTransactions(
 
     if (dateMatch && amountMatch) {
       matches++;
-      if (ext.counterparty !== exp.counterparty) {
-        variations.push(`[${i}] "${exp.counterparty}" -> "${ext.counterparty}"`);
-      }
     } else {
-      errors.push(`Mismatch at ${i}: expected ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`);
+      errors.push(`Mismatch ${i}: exp ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`);
     }
   }
 
@@ -411,23 +388,20 @@ function compareTransactions(
     errors.push(`Extra transactions: ${extracted.length - expected.length}`);
   }
 
-  return { matches, total: expected.length, errors, variations };
+  return { matches, total: expected.length, errors };
 }
 
 /**
- * Find all test cases (PDF + JSON pairs) in .nogit/
+ * Find all test cases
  */
-function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
+function findTestCases(): ITestCase[] {
   const testDir = path.join(process.cwd(), '.nogit');
-  if (!fs.existsSync(testDir)) {
-    return [];
-  }
+  if (!fs.existsSync(testDir)) return [];
 
   const files = fs.readdirSync(testDir);
-  const pdfFiles = files.filter((f: string) => f.endsWith('.pdf'));
-  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];
+  const testCases: ITestCase[] = [];
 
-  for (const pdf of pdfFiles) {
+  for (const pdf of files.filter((f: string) => f.endsWith('.pdf'))) {
     const baseName = pdf.replace('.pdf', '');
     const jsonFile = `${baseName}.json`;
     if (files.includes(jsonFile)) {
@@ -442,72 +416,142 @@ function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: strin
   return testCases.sort((a, b) => a.name.localeCompare(b.name));
 }
 
-// Tests
+// ============ TESTS ============
 
-tap.test('setup: ensure containers are running', async () => {
-  console.log('\n[Setup] Checking Docker containers...\n');
+const testCases = findTestCases();
+console.log(`\nFound ${testCases.length} bank statement test cases\n`);
 
-  // Nanonets for OCR
-  const nanonetsOk = await ensureNanonetsOcr();
-  expect(nanonetsOk).toBeTrue();
+// Ensure temp directory exists
+if (!fs.existsSync(TEMP_MD_DIR)) {
+  fs.mkdirSync(TEMP_MD_DIR, { recursive: true });
+}
+
+// -------- STAGE 1: OCR with Nanonets --------
+
+// Check if all markdown files already exist
+function allMarkdownFilesExist(): boolean {
+  for (const tc of testCases) {
+    const mdPath = path.join(TEMP_MD_DIR, `${tc.name}.md`);
+    if (!fs.existsSync(mdPath)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Track whether we need to run Stage 1
+let stage1Needed = !allMarkdownFilesExist();
+
+tap.test('Stage 1: Setup Nanonets', async () => {
+  console.log('\n========== STAGE 1: Nanonets OCR ==========\n');
+
+  if (!stage1Needed) {
+    console.log('  [SKIP] All markdown files already exist, skipping Nanonets setup');
+    return;
+  }
+
+  const ok = await ensureNanonetsOcr();
+  expect(ok).toBeTrue();
+});
+
+tap.test('Stage 1: Convert all documents to markdown', async () => {
+  if (!stage1Needed) {
+    console.log('  [SKIP] Using existing markdown files from previous run\n');
+    // Load existing markdown paths
+    for (const tc of testCases) {
+      tc.markdownPath = path.join(TEMP_MD_DIR, `${tc.name}.md`);
+      console.log(`  Loaded: ${tc.markdownPath}`);
+    }
+    return;
+  }
+
+  console.log('\n  Converting all PDFs to markdown with Nanonets-OCR-s...\n');
+
+  for (const tc of testCases) {
+    console.log(`\n  === ${tc.name} ===`);
+
+    // Convert PDF to images
+    const images = convertPdfToImages(tc.pdfPath);
+    console.log(`  Pages: ${images.length}`);
+
+    // Convert to markdown
+    const markdown = await convertDocumentToMarkdown(images, tc.name);
+
+    // Save markdown to temp file
+    const mdPath = path.join(TEMP_MD_DIR, `${tc.name}.md`);
+    fs.writeFileSync(mdPath, markdown);
+    tc.markdownPath = mdPath;
+    console.log(`  Saved: ${mdPath}`);
+  }
+
+  console.log('\n  Stage 1 complete: All documents converted to markdown\n');
+});
+
+tap.test('Stage 1: Stop Nanonets', async () => {
+  if (!stage1Needed) {
+    console.log('  [SKIP] Nanonets was not started');
+    return;
+  }
+
+  stopNanonets();
+  // Verify it's stopped
+  await new Promise(resolve => setTimeout(resolve, 3000));
+  expect(isContainerRunning('nanonets-test')).toBeFalse();
+});
+
+// -------- STAGE 2: Extraction with GPT-OSS 20B --------
+
+tap.test('Stage 2: Setup Ollama + GPT-OSS 20B', async () => {
+  console.log('\n========== STAGE 2: GPT-OSS 20B Extraction ==========\n');
 
-  // Ollama for Qwen3
   const ollamaOk = await ensureMiniCpm();
   expect(ollamaOk).toBeTrue();
 
-  // Qwen3 model
-  const qwenOk = await ensureQwen3();
-  expect(qwenOk).toBeTrue();
-
-  console.log('\n[Setup] All containers ready!\n');
+  const extractionOk = await ensureExtractionModel();
+  expect(extractionOk).toBeTrue();
 });
 
-tap.test('should have models available', async () => {
-  // Check Nanonets
-  const nanonetsResp = await fetch(`${NANONETS_URL}/models`);
-  expect(nanonetsResp.ok).toBeTrue();
-
-  // Check Qwen3
-  const ollamaResp = await fetch(`${OLLAMA_URL}/api/tags`);
-  expect(ollamaResp.ok).toBeTrue();
-  const data = await ollamaResp.json();
-  const modelNames = data.models.map((m: { name: string }) => m.name);
-  expect(modelNames.some((name: string) => name.includes('qwen3'))).toBeTrue();
-});
-
-const testCases = findTestCases();
-console.log(`\nFound ${testCases.length} bank statement test cases (Nanonets + Qwen3)\n`);
-
 let passedCount = 0;
 let failedCount = 0;
 
-for (const testCase of testCases) {
-  tap.test(`should extract: ${testCase.name}`, async () => {
-    const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
-    console.log(`\n=== ${testCase.name} ===`);
-    console.log(`Expected: ${expected.length} transactions`);
+for (const tc of testCases) {
+  tap.test(`Stage 2: Extract ${tc.name}`, async () => {
+    const expected: ITransaction[] = JSON.parse(fs.readFileSync(tc.jsonPath, 'utf-8'));
+    console.log(`\n  === ${tc.name} ===`);
+    console.log(`  Expected: ${expected.length} transactions`);
 
-    const images = convertPdfToImages(testCase.pdfPath);
-    console.log(`  Pages: ${images.length}`);
+    // Load saved markdown
+    const mdPath = path.join(TEMP_MD_DIR, `${tc.name}.md`);
+    if (!fs.existsSync(mdPath)) {
+      throw new Error(`Markdown not found: ${mdPath}. Run Stage 1 first.`);
+    }
+    const markdown = fs.readFileSync(mdPath, 'utf-8');
+    console.log(`  Markdown: ${markdown.length} chars`);
 
-    const extracted = await extractTransactions(images);
-    console.log(`  Extracted: ${extracted.length} transactions`);
+    // Extract transactions (single pass)
+    const extracted = await extractTransactions(markdown, tc.name);
 
-    const result = compareTransactions(extracted, expected);
-    const perfectMatch = result.matches === result.total && extracted.length === expected.length;
-
-    if (perfectMatch) {
-      passedCount++;
-      console.log(`  Result: PASS (${result.matches}/${result.total})`);
-    } else {
-      failedCount++;
-      console.log(`  Result: FAIL (${result.matches}/${result.total})`);
-      result.errors.slice(0, 10).forEach((e) => console.log(`    - ${e}`));
+    // Log results
+    console.log(`  Extracted: ${extracted.length} transactions`);
+    for (let i = 0; i < Math.min(extracted.length, 5); i++) {
+      const tx = extracted[i];
+      console.log(`    ${i + 1}. ${tx.date} | ${tx.counterparty.substring(0, 25).padEnd(25)} | ${tx.amount >= 0 ? '+' : ''}${tx.amount.toFixed(2)}`);
+    }
+    if (extracted.length > 5) {
+      console.log(`    ... and ${extracted.length - 5} more`);
     }
 
-    if (result.variations.length > 0) {
-      console.log(`  Counterparty variations (${result.variations.length}):`);
-      result.variations.slice(0, 5).forEach((v) => console.log(`    ${v}`));
+    // Compare
+    const result = compareTransactions(extracted, expected);
+    const pass = result.matches === result.total && extracted.length === expected.length;
+
+    if (pass) {
+      passedCount++;
+      console.log(`  Result: PASS (${result.matches}/${result.total})`);
+    } else {
+      failedCount++;
+      console.log(`  Result: FAIL (${result.matches}/${result.total})`);
+      result.errors.slice(0, 5).forEach(e => console.log(`    - ${e}`));
     }
 
     expect(result.matches).toEqual(result.total);
@@ -515,16 +559,27 @@
   });
 }
 
-tap.test('summary', async () => {
-  const total = testCases.length;
+tap.test('Summary', async () => {
   console.log(`\n======================================================`);
-  console.log(`  Bank Statement Summary (Nanonets + Qwen3 Pipeline)`);
+  console.log(`  Bank Statement Summary (Nanonets + GPT-OSS 20B Sequential)`);
   console.log(`======================================================`);
   console.log(`  Stage 1: Nanonets-OCR-s (document -> markdown)`);
-  console.log(`  Stage 2: Qwen3 8B (markdown -> JSON)`);
-  console.log(`  Passed: ${passedCount}/${total}`);
-  console.log(`  Failed: ${failedCount}/${total}`);
+  console.log(`  Stage 2: GPT-OSS 20B (markdown -> JSON)`);
+  console.log(`  Passed: ${passedCount}/${testCases.length}`);
+  console.log(`  Failed: ${failedCount}/${testCases.length}`);
   console.log(`======================================================\n`);
+
+  // Only clean up temp files if ALL tests passed
+  if (failedCount === 0 && passedCount === testCases.length) {
+    try {
+      fs.rmSync(TEMP_MD_DIR, { recursive: true, force: true });
+      console.log(`  Cleaned up temp directory: ${TEMP_MD_DIR}\n`);
+    } catch {
+      // Ignore
+    }
+  } else {
+    console.log(`  Keeping temp directory for debugging: ${TEMP_MD_DIR}\n`);
  }
 });
 
 export default tap.start();
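The streaming reader added to the bank statement test parses Ollama's NDJSON chat responses by hand. Here is that loop isolated as a sketch, written against Ollama's documented `/api/chat` streaming format (function name and parameters are mine, not part of the commit):

```ts
// Read an Ollama streaming chat response: one JSON object per line,
// each carrying a message.content token, until the stream ends.
async function streamChat(url: string, model: string, prompt: string): Promise<string> {
  const response = await fetch(`${url}/api/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ model, messages: [{ role: 'user', content: prompt }], stream: true }),
  });
  if (!response.ok || !response.body) throw new Error(`Ollama error: ${response.status}`);

  let content = '';
  let buffered = '';
  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    buffered += decoder.decode(value, { stream: true });
    const lines = buffered.split('\n');
    buffered = lines.pop() ?? ''; // keep any partial trailing line for the next chunk
    for (const line of lines.filter((l) => l.trim())) {
      content += JSON.parse(line).message?.content || '';
    }
  }
  return content;
}
```

Buffering the trailing partial line avoids the silent `JSON.parse` failures that the in-test version tolerates with its try/catch when a JSON object is split across network chunks.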
diff --git a/test/test.invoices.nanonets.ts b/test/test.invoices.nanonets.ts
index f09978e..aba6b35 100644
--- a/test/test.invoices.nanonets.ts
+++ b/test/test.invoices.nanonets.ts
@@ -1,19 +1,17 @@
 /**
- * Invoice extraction using Nanonets-OCR-s + Qwen3 (two-stage pipeline)
+ * Invoice extraction using Nanonets-OCR-s + Qwen3 (sequential two-stage pipeline)
  *
- * Stage 1: Nanonets-OCR-s converts document pages to markdown (its strength)
- * Stage 2: Qwen3 extracts structured JSON from the combined markdown
+ * Stage 1: Nanonets-OCR-s converts ALL document pages to markdown (stop after completion)
+ * Stage 2: Qwen3 extracts structured JSON from saved markdown (after Nanonets stops)
  *
- * This leverages each model's strengths:
- * - Nanonets: Document OCR with semantic tags
- * - Qwen3: Text understanding and JSON extraction
+ * This approach avoids GPU contention by running the services sequentially.
  */
 import { tap, expect } from '@git.zone/tstest/tapbundle';
 import * as fs from 'fs';
 import * as path from 'path';
 import { execSync } from 'child_process';
 import * as os from 'os';
-import { ensureNanonetsOcr, ensureMiniCpm } from './helpers/docker.js';
+import { ensureNanonetsOcr, ensureMiniCpm, isContainerRunning } from './helpers/docker.js';
 
 const NANONETS_URL = 'http://localhost:8000/v1';
 const NANONETS_MODEL = 'nanonets/Nanonets-OCR-s';
@@ -21,6 +19,9 @@ const NANONETS_MODEL = 'nanonets/Nanonets-OCR-s';
 const OLLAMA_URL = 'http://localhost:11434';
 const QWEN_MODEL = 'qwen3:8b';
 
+// Temp directory for storing markdown between stages
+const TEMP_MD_DIR = path.join(os.tmpdir(), 'nanonets-invoices-markdown');
+
 interface IInvoice {
   invoice_number: string;
   invoice_date: string;
@@ -31,6 +32,13 @@ interface IInvoice {
   total_amount: number;
 }
 
+interface ITestCase {
+  name: string;
+  pdfPath: string;
+  jsonPath: string;
+  markdownPath?: string;
+}
+
 // Nanonets-specific prompt for document OCR to markdown
 const NANONETS_OCR_PROMPT = `Extract the text from the above document as if you were reading it naturally.
 Return the tables in html format.
@@ -66,14 +74,13 @@ INVOICE TEXT:
 `;
 
 /**
- * Convert PDF to PNG images using ImageMagick
+ * Convert PDF to PNG images
  */
 function convertPdfToImages(pdfPath: string): string[] {
   const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
   const outputPattern = path.join(tempDir, 'page-%d.png');
 
   try {
-    // Use 150 DPI to keep images within model's context length
     execSync(
      `convert -density 150 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
      { stdio: 'pipe' }
@@ -95,10 +102,9 @@
 }
 
 /**
- * Stage 1: Convert a single page to markdown using Nanonets-OCR-s
+ * Convert a single page to markdown using Nanonets-OCR-s
  */
 async function convertPageToMarkdown(image: string, pageNum: number): Promise<string> {
-  console.log(`  [Nanonets] Converting page ${pageNum} to markdown...`);
   const startTime = Date.now();
 
   const response = await fetch(`${NANONETS_URL}/chat/completions`, {
@@ -125,21 +131,20 @@
 }
 
 /**
- * Stage 1: Convert all pages to markdown using Nanonets
+ * Convert all pages to markdown using Nanonets
  */
-async function convertDocumentToMarkdown(images: string[]): Promise<string> {
-  console.log(`  [Stage 1] Converting ${images.length} page(s) to markdown with Nanonets-OCR-s...`);
+async function convertDocumentToMarkdown(images: string[], docName: string): Promise<string> {
+  console.log(`  [${docName}] Converting ${images.length} page(s)...`);
 
   const markdownPages: string[] = [];
@@ -149,10 +154,24 @@
   }
 
   const fullMarkdown = markdownPages.join('\n\n');
-  console.log(`  [Stage 1] Complete: ${fullMarkdown.length} chars total`);
+  console.log(`  [${docName}] Complete: ${fullMarkdown.length} chars total`);
 
   return fullMarkdown;
 }
 
+/**
+ * Stop Nanonets container
+ */
+function stopNanonets(): void {
+  console.log('  [Docker] Stopping Nanonets container...');
+  try {
+    execSync('docker stop nanonets-test 2>/dev/null || true', { stdio: 'pipe' });
+    execSync('sleep 5', { stdio: 'pipe' });
+    console.log('  [Docker] Nanonets stopped');
+  } catch {
+    console.log('  [Docker] Nanonets was not running');
+  }
+}
+
 /**
  * Ensure Qwen3 model is available
  */
@@ -190,7 +209,6 @@ function parseAmount(s: string | number | undefined): number {
   const match = s.match(/([\d.,]+)/);
   if (!match) return 0;
   const numStr = match[1];
-  // Handle European format: 1.234,56 -> 1234.56
   const normalized = numStr.includes(',') && numStr.indexOf(',') > numStr.lastIndexOf('.')
     ? numStr.replace(/\./g, '').replace(',', '.')
     : numStr.replace(/,/g, '');
@@ -204,10 +222,10 @@ function extractInvoiceNumber(s: string | undefined): string {
   if (!s) return '';
   let clean = s.replace(/\*\*/g, '').replace(/`/g, '').trim();
   const patterns = [
-    /\b([A-Z]{2,3}\d{10,})\b/i, // IEE2022006460244
-    /\b([A-Z]\d{8,})\b/i, // R0014359508
-    /\b(INV[-\s]?\d{4}[-\s]?\d+)\b/i, // INV-2024-001
-    /\b(\d{7,})\b/, // 1579087430
+    /\b([A-Z]{2,3}\d{10,})\b/i,
+    /\b([A-Z]\d{8,})\b/i,
+    /\b(INV[-\s]?\d{4}[-\s]?\d+)\b/i,
+    /\b(\d{7,})\b/,
   ];
   for (const pattern of patterns) {
     const match = clean.match(pattern);
@@ -224,7 +242,6 @@ function extractDate(s: string | undefined): string {
   let clean = s.replace(/\*\*/g, '').replace(/`/g, '').trim();
   const isoMatch = clean.match(/(\d{4}-\d{2}-\d{2})/);
   if (isoMatch) return isoMatch[1];
-  // Try DD/MM/YYYY or DD.MM.YYYY
   const dmyMatch = clean.match(/(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})/);
   if (dmyMatch) {
     return `${dmyMatch[3]}-${dmyMatch[2].padStart(2, '0')}-${dmyMatch[1].padStart(2, '0')}`;
@@ -245,20 +262,16 @@ function extractCurrency(s: string | undefined): string {
 }
 
 /**
- * Extract JSON from response (handles markdown code blocks)
+ * Extract JSON from response
  */
 function extractJsonFromResponse(response: string): Record<string, unknown> | null {
-  // Remove thinking tags if present (Qwen3 may include <think>...</think>)
   let cleanResponse = response.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
-
-  // Try to find JSON in markdown code block
   const codeBlockMatch = cleanResponse.match(/```(?:json)?\s*([\s\S]*?)```/);
   const jsonStr = codeBlockMatch ? codeBlockMatch[1].trim() : cleanResponse;
 
   try {
     return JSON.parse(jsonStr);
   } catch {
-    // Try to find JSON object pattern
     const jsonMatch = jsonStr.match(/\{[\s\S]*\}/);
     if (jsonMatch) {
       try {
@@ -290,15 +303,16 @@ function parseJsonToInvoice(response: string): IInvoice | null {
 }
 
 /**
- * Stage 2: Extract invoice from markdown using Qwen3
+ * Extract invoice from markdown using Qwen3
  */
 async function extractInvoiceFromMarkdown(markdown: string, queryId: string): Promise<IInvoice | null> {
-  console.log(`  [${queryId}] Sending markdown to ${QWEN_MODEL}...`);
+  console.log(`  [${queryId}] Sending to ${QWEN_MODEL}...`);
   const startTime = Date.now();
 
   const response = await fetch(`${OLLAMA_URL}/api/chat`, {
     method: 'POST',
     headers: { 'Content-Type': 'application/json' },
+    signal: AbortSignal.timeout(600000), // 10 minute timeout for large documents
     body: JSON.stringify({
       model: QWEN_MODEL,
       messages: [{
@@ -322,13 +336,13 @@ async function extractInvoiceFromMarkdown(markdown: string, queryId: string): Pr
 
   const data = await response.json();
   const content = (data.message?.content || '').trim();
-  console.log(`  [${queryId}] Response received (${elapsed}s, ${content.length} chars)`);
+  console.log(`  [${queryId}] Response: ${content.length} chars (${elapsed}s)`);
 
   return parseJsonToInvoice(content);
 }
 
 /**
- * Compare two invoices for consensus (key fields must match)
+ * Compare two invoices for consensus
  */
 function invoicesMatch(a: IInvoice, b: IInvoice): boolean {
   const numMatch = a.invoice_number.toLowerCase() === b.invoice_number.toLowerCase();
@@ -338,45 +352,39 @@ function invoicesMatch(a: IInvoice, b: IInvoice): boolean {
 }
 
 /**
- * Stage 2: Extract invoice using Qwen3 with consensus
+ * Extract with consensus
  */
-async function extractWithConsensus(markdown: string): Promise<IInvoice> {
+async function extractWithConsensus(markdown: string, docName: string): Promise<IInvoice> {
   const MAX_ATTEMPTS = 3;
-  console.log(`  [Stage 2] Extracting invoice with ${QWEN_MODEL} (consensus)...`);
 
   for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
-    console.log(`\n  [Stage 2] --- Attempt ${attempt}/${MAX_ATTEMPTS} ---`);
+    console.log(`  [${docName}] Attempt ${attempt}/${MAX_ATTEMPTS}`);
 
-    // Extract twice
-    const inv1 = await extractInvoiceFromMarkdown(markdown, `A${attempt}Q1`);
-    const inv2 = await extractInvoiceFromMarkdown(markdown, `A${attempt}Q2`);
+    const inv1 = await extractInvoiceFromMarkdown(markdown, `${docName}-A${attempt}Q1`);
+    const inv2 = await extractInvoiceFromMarkdown(markdown, `${docName}-A${attempt}Q2`);
 
     if (!inv1 || !inv2) {
-      console.log(`  [Stage 2] Parsing failed, retrying...`);
+      console.log(`  [${docName}] Parsing failed, retrying...`);
       continue;
     }
 
-    console.log(`  [Stage 2] Q1: ${inv1.invoice_number} | ${inv1.invoice_date} | ${inv1.total_amount} ${inv1.currency}`);
-    console.log(`  [Stage 2] Q2: ${inv2.invoice_number} | ${inv2.invoice_date} | ${inv2.total_amount} ${inv2.currency}`);
+    console.log(`  [${docName}] Q1: ${inv1.invoice_number} | ${inv1.invoice_date} | ${inv1.total_amount}`);
+    console.log(`  [${docName}] Q2: ${inv2.invoice_number} | ${inv2.invoice_date} | ${inv2.total_amount}`);
 
     if (invoicesMatch(inv1, inv2)) {
-      console.log(`  [Stage 2] CONSENSUS REACHED`);
+      console.log(`  [${docName}] CONSENSUS`);
       return inv2;
     }
-
-    console.log(`  [Stage 2] NO CONSENSUS`);
+    console.log(`  [${docName}] No consensus`);
   }
 
-  // Fallback: use last response
-  console.log(`\n  [Stage 2] === FALLBACK ===`);
-  const fallback = await extractInvoiceFromMarkdown(markdown, 'FALLBACK');
-
+  // Fallback
+  const fallback = await extractInvoiceFromMarkdown(markdown, `${docName}-FALLBACK`);
   if (fallback) {
-    console.log(`  [Stage 2] ~ FALLBACK: ${fallback.invoice_number} | ${fallback.invoice_date} | ${fallback.total_amount}`);
+    console.log(`  [${docName}] FALLBACK: ${fallback.invoice_number} | ${fallback.invoice_date} | ${fallback.total_amount}`);
     return fallback;
   }
 
-  // Return empty invoice if all else fails
   return {
     invoice_number: '',
     invoice_date: '',
@@ -388,19 +396,6 @@ async function extractWithConsensus(markdown: string, docName: string): Promise
   };
 }
 
-/**
- * Full pipeline: PDF -> Images -> Markdown -> JSON
- */
-async function extractInvoice(images: string[]): Promise<IInvoice> {
-  // Stage 1: Convert to markdown
-  const markdown = await convertDocumentToMarkdown(images);
-
-  // Stage 2: Extract invoice with consensus
-  const invoice = await extractWithConsensus(markdown);
-
-  return invoice;
-}
-
 /**
  * Normalize date to YYYY-MM-DD
  */
@@ -435,45 +430,38 @@ function compareInvoice(
 ): { match: boolean; errors: string[] } {
   const errors: string[] = [];
 
-  // Compare invoice number (normalize by removing spaces and case)
   const extNum = extracted.invoice_number?.replace(/\s/g, '').toLowerCase() || '';
   const expNum = expected.invoice_number?.replace(/\s/g, '').toLowerCase() || '';
   if (extNum !== expNum) {
-    errors.push(`invoice_number: expected "${expected.invoice_number}", got "${extracted.invoice_number}"`);
+    errors.push(`invoice_number: exp "${expected.invoice_number}", got "${extracted.invoice_number}"`);
   }
 
-  // Compare date
   if (normalizeDate(extracted.invoice_date) !== normalizeDate(expected.invoice_date)) {
-    errors.push(`invoice_date: expected "${expected.invoice_date}", got "${extracted.invoice_date}"`);
+    errors.push(`invoice_date: exp "${expected.invoice_date}", got "${extracted.invoice_date}"`);
   }
 
-  // Compare total amount (with tolerance)
   if (Math.abs(extracted.total_amount - expected.total_amount) > 0.02) {
-    errors.push(`total_amount: expected ${expected.total_amount}, got ${extracted.total_amount}`);
+    errors.push(`total_amount: exp ${expected.total_amount}, got ${extracted.total_amount}`);
   }
 
-  // Compare currency
   if (extracted.currency?.toUpperCase() !== expected.currency?.toUpperCase()) {
-    errors.push(`currency: expected "${expected.currency}", got "${extracted.currency}"`);
+    errors.push(`currency: exp "${expected.currency}", got "${extracted.currency}"`);
   }
 
   return { match: errors.length === 0, errors };
 }
 
 /**
- * Find all test cases (PDF + JSON pairs) in .nogit/invoices/
+ * Find all test cases
  */
-function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
+function findTestCases(): ITestCase[] {
   const testDir = path.join(process.cwd(), '.nogit/invoices');
-  if (!fs.existsSync(testDir)) {
-    return [];
-  }
+  if (!fs.existsSync(testDir)) return [];
 
   const files = fs.readdirSync(testDir);
-  const pdfFiles = files.filter((f) => f.endsWith('.pdf'));
-  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];
+  const testCases: ITestCase[] = [];
 
-  for (const pdf of pdfFiles) {
+  for (const pdf of files.filter((f) => f.endsWith('.pdf'))) {
     const baseName = pdf.replace('.pdf', '');
     const jsonFile = `${baseName}.json`;
     if (files.includes(jsonFile)) {
@@ -485,90 +473,114 @@ function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: strin
     }
   }
 
-  testCases.sort((a, b) => a.name.localeCompare(b.name));
-  return testCases;
+  return testCases.sort((a, b) => a.name.localeCompare(b.name));
 }
 
-// Tests
+// ============ TESTS ============
 
-tap.test('setup: ensure containers are running', async () => {
-  console.log('\n[Setup] Checking Docker containers...\n');
+const testCases = findTestCases();
+console.log(`\nFound ${testCases.length} invoice test cases\n`);
 
-  // Nanonets for OCR
-  const nanonetsOk = await ensureNanonetsOcr();
-  expect(nanonetsOk).toBeTrue();
+// Ensure temp directory exists
+if (!fs.existsSync(TEMP_MD_DIR)) {
+  fs.mkdirSync(TEMP_MD_DIR, { recursive: true });
+}
+
+// -------- STAGE 1: OCR with Nanonets --------
+
+tap.test('Stage 1: Setup Nanonets', async () => {
+  console.log('\n========== STAGE 1: Nanonets OCR ==========\n');
+  const ok = await ensureNanonetsOcr();
+  expect(ok).toBeTrue();
+});
+
+tap.test('Stage 1: Convert all invoices to markdown', async () => {
+  console.log('\n  Converting all invoice PDFs to markdown with Nanonets-OCR-s...\n');
+
+  for (const tc of testCases) {
+    console.log(`\n  === ${tc.name} ===`);
+
+    const images = convertPdfToImages(tc.pdfPath);
+    console.log(`  Pages: ${images.length}`);
+
+    const markdown = await convertDocumentToMarkdown(images, tc.name);
+
+    const mdPath = path.join(TEMP_MD_DIR, `${tc.name}.md`);
+    fs.writeFileSync(mdPath, markdown);
+    tc.markdownPath = mdPath;
+    console.log(`  Saved: ${mdPath}`);
+  }
+
+  console.log('\n  Stage 1 complete: All invoices converted to markdown\n');
+});
+
+tap.test('Stage 1: Stop Nanonets', async () => {
+  stopNanonets();
+  await new Promise(resolve => setTimeout(resolve, 3000));
+  expect(isContainerRunning('nanonets-test')).toBeFalse();
+});
+
+// -------- STAGE 2: Extraction with Qwen3 --------
+
+tap.test('Stage 2: Setup Ollama + Qwen3', async () => {
+  console.log('\n========== STAGE 2: Qwen3 Extraction ==========\n');
 
-  // Ollama for Qwen3
   const ollamaOk = await ensureMiniCpm();
   expect(ollamaOk).toBeTrue();
 
-  // Qwen3 model
   const qwenOk = await ensureQwen3();
   expect(qwenOk).toBeTrue();
-
-  console.log('\n[Setup] All containers ready!\n');
 });
 
-tap.test('should have models available', async () => {
-  // Check Nanonets
-  const nanonetsResp = await fetch(`${NANONETS_URL}/models`);
-  expect(nanonetsResp.ok).toBeTrue();
-
-  // Check Qwen3
-  const ollamaResp = await fetch(`${OLLAMA_URL}/api/tags`);
-  expect(ollamaResp.ok).toBeTrue();
-  const data = await ollamaResp.json();
-  const modelNames = data.models.map((m: { name: string }) => m.name);
-  expect(modelNames.some((name: string) => name.includes('qwen3'))).toBeTrue();
-});
-
-const testCases = findTestCases();
-console.log(`\nFound ${testCases.length} invoice test cases (Nanonets + Qwen3)\n`);
-
 let passedCount = 0;
 let failedCount = 0;
 const processingTimes: number[] = [];
 
-for (const testCase of testCases) {
-  tap.test(`should extract invoice: ${testCase.name}`, async () => {
-    const expected: IInvoice = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
-    console.log(`\n=== ${testCase.name} ===`);
-    console.log(`Expected: ${expected.invoice_number} | ${expected.invoice_date} | ${expected.total_amount} ${expected.currency}`);
+for (const tc of testCases) {
+  tap.test(`Stage 2: Extract ${tc.name}`, async () => {
+    const expected: IInvoice = JSON.parse(fs.readFileSync(tc.jsonPath, 'utf-8'));
+    console.log(`\n  === ${tc.name} ===`);
+    console.log(`  Expected: ${expected.invoice_number} | ${expected.invoice_date} | ${expected.total_amount} ${expected.currency}`);
 
     const startTime = Date.now();
-    const images = convertPdfToImages(testCase.pdfPath);
-    console.log(`  Pages: ${images.length}`);
 
-    const extracted = await extractInvoice(images);
-    console.log(`  Extracted: ${extracted.invoice_number} | ${extracted.invoice_date} | ${extracted.total_amount} ${extracted.currency}`);
+    const mdPath = path.join(TEMP_MD_DIR, `${tc.name}.md`);
+    if (!fs.existsSync(mdPath)) {
+      throw new Error(`Markdown not found: ${mdPath}. Run Stage 1 first.`);
+    }
+    const markdown = fs.readFileSync(mdPath, 'utf-8');
+    console.log(`  Markdown: ${markdown.length} chars`);
+
+    const extracted = await extractWithConsensus(markdown, tc.name);
 
     const elapsedMs = Date.now() - startTime;
     processingTimes.push(elapsedMs);
 
+    console.log(`  Extracted: ${extracted.invoice_number} | ${extracted.invoice_date} | ${extracted.total_amount} ${extracted.currency}`);
+
     const result = compareInvoice(extracted, expected);
 
     if (result.match) {
       passedCount++;
-      console.log(`  Result: MATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
+      console.log(`  Result: MATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
     } else {
       failedCount++;
-      console.log(`  Result: MISMATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
-      result.errors.forEach((e) => console.log(`    - ${e}`));
+      console.log(`  Result: MISMATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
+      result.errors.forEach(e => console.log(`    - ${e}`));
     }
 
     expect(result.match).toBeTrue();
   });
 }
 
-tap.test('summary', async () => {
+tap.test('Summary', async () => {
   const totalInvoices = testCases.length;
   const accuracy = totalInvoices > 0 ? (passedCount / totalInvoices) * 100 : 0;
   const totalTimeMs = processingTimes.reduce((a, b) => a + b, 0);
   const avgTimeSec = processingTimes.length > 0 ? totalTimeMs / processingTimes.length / 1000 : 0;
 
   console.log(`\n========================================`);
-  console.log(`  Invoice Extraction Summary`);
-  console.log(`  (Nanonets + Qwen3 Pipeline)`);
+  console.log(`  Invoice Summary (Nanonets + Qwen3)`);
   console.log(`========================================`);
   console.log(`  Stage 1: Nanonets-OCR-s (doc -> md)`);
   console.log(`  Stage 2: Qwen3 8B (md -> JSON)`);
@@ -579,6 +591,14 @@ tap.test('summary', async () => {
   console.log(`  Total time: ${(totalTimeMs / 1000).toFixed(1)}s`);
   console.log(`  Avg per inv: ${avgTimeSec.toFixed(1)}s`);
   console.log(`========================================\n`);
+
+  // Clean up temp files
+  try {
+    fs.rmSync(TEMP_MD_DIR, { recursive: true, force: true });
+    console.log(`  Cleaned up temp directory: ${TEMP_MD_DIR}\n`);
+  } catch {
+    // Ignore
+  }
 });
 
 export default tap.start();