From d91df70fff112f71c3899ad0b5126dadadd28f3c Mon Sep 17 00:00:00 2001 From: Juergen Kunz Date: Sun, 18 Jan 2026 13:56:46 +0000 Subject: [PATCH] feat(tests): revamp tests and remove legacy Dockerfiles: adopt JSON/consensus workflows, switch MiniCPM model, and delete deprecated Docker/test variants --- Dockerfile_minicpm45v_cpu | 27 - Dockerfile_paddleocr_vl_cpu | 57 -- Dockerfile_paddleocr_vl_full | 90 --- Dockerfile_paddleocr_vl_gpu | 71 --- changelog.md | 9 + .../paddleocr_vl_entrypoint.sh | 19 - .../paddleocr_vl_full_entrypoint.sh | 12 - test/test.bankstatements.combined.ts | 549 ------------------ test/test.bankstatements.minicpm.ts | 398 ++++++++++--- test/test.bankstatements.ministral3.ts | 348 ----------- test/test.bankstatements.paddleocr-vl.ts | 346 ----------- test/test.invoices.combined.ts | 455 --------------- test/test.invoices.minicpm.ts | 479 ++++++++------- test/test.invoices.ministral3.ts | 334 ----------- test/test.invoices.paddleocr-vl.ts | 490 ---------------- 15 files changed, 542 insertions(+), 3142 deletions(-) delete mode 100644 Dockerfile_minicpm45v_cpu delete mode 100644 Dockerfile_paddleocr_vl_cpu delete mode 100644 Dockerfile_paddleocr_vl_full delete mode 100644 Dockerfile_paddleocr_vl_gpu delete mode 100644 image_support_files/paddleocr_vl_entrypoint.sh delete mode 100644 image_support_files/paddleocr_vl_full_entrypoint.sh delete mode 100644 test/test.bankstatements.combined.ts delete mode 100644 test/test.bankstatements.ministral3.ts delete mode 100644 test/test.bankstatements.paddleocr-vl.ts delete mode 100644 test/test.invoices.combined.ts delete mode 100644 test/test.invoices.ministral3.ts delete mode 100644 test/test.invoices.paddleocr-vl.ts diff --git a/Dockerfile_minicpm45v_cpu b/Dockerfile_minicpm45v_cpu deleted file mode 100644 index f2c4159..0000000 --- a/Dockerfile_minicpm45v_cpu +++ /dev/null @@ -1,27 +0,0 @@ -# MiniCPM-V 4.5 CPU Variant -# Vision-Language Model optimized for CPU-only inference -FROM ollama/ollama:latest - -LABEL maintainer="Task Venture Capital GmbH " -LABEL description="MiniCPM-V 4.5 Vision-Language Model - CPU optimized (GGUF)" -LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai" - -# Environment configuration for CPU-only mode -ENV MODEL_NAME="minicpm-v" -ENV OLLAMA_HOST="0.0.0.0" -ENV OLLAMA_ORIGINS="*" -# Disable GPU usage for CPU-only variant -ENV CUDA_VISIBLE_DEVICES="" - -# Copy and setup entrypoint -COPY image_support_files/minicpm45v_entrypoint.sh /usr/local/bin/docker-entrypoint.sh -RUN chmod +x /usr/local/bin/docker-entrypoint.sh - -# Expose Ollama API port -EXPOSE 11434 - -# Health check -HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \ - CMD curl -f http://localhost:11434/api/tags || exit 1 - -ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"] diff --git a/Dockerfile_paddleocr_vl_cpu b/Dockerfile_paddleocr_vl_cpu deleted file mode 100644 index 842527a..0000000 --- a/Dockerfile_paddleocr_vl_cpu +++ /dev/null @@ -1,57 +0,0 @@ -# PaddleOCR-VL CPU Variant -# Vision-Language Model for document parsing using transformers (slower, no GPU required) -FROM python:3.11-slim-bookworm - -LABEL maintainer="Task Venture Capital GmbH " -LABEL description="PaddleOCR-VL 0.9B CPU - Vision-Language Model for document parsing" -LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai" - -# Environment configuration -ENV PYTHONUNBUFFERED=1 -ENV HF_HOME=/root/.cache/huggingface -ENV CUDA_VISIBLE_DEVICES="" -ENV SERVER_PORT=8000 -ENV SERVER_HOST=0.0.0.0 - -# Set working directory -WORKDIR /app - -# Install system dependencies -RUN apt-get update && apt-get install -y --no-install-recommends \ - libgl1-mesa-glx \ - libglib2.0-0 \ - libgomp1 \ - curl \ - git \ - && rm -rf /var/lib/apt/lists/* - -# Install Python dependencies -RUN pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir \ - torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cpu && \ - pip install --no-cache-dir \ - transformers \ - accelerate \ - safetensors \ - pillow \ - fastapi \ - uvicorn[standard] \ - python-multipart \ - httpx \ - protobuf \ - sentencepiece \ - einops - -# Copy server files -COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py -COPY image_support_files/paddleocr_vl_entrypoint.sh /usr/local/bin/paddleocr-vl-cpu-entrypoint.sh -RUN chmod +x /usr/local/bin/paddleocr-vl-cpu-entrypoint.sh - -# Expose API port -EXPOSE 8000 - -# Health check (longer start-period for CPU + model download) -HEALTHCHECK --interval=30s --timeout=10s --start-period=600s --retries=3 \ - CMD curl -f http://localhost:8000/health || exit 1 - -ENTRYPOINT ["/usr/local/bin/paddleocr-vl-cpu-entrypoint.sh"] diff --git a/Dockerfile_paddleocr_vl_full b/Dockerfile_paddleocr_vl_full deleted file mode 100644 index 81f613f..0000000 --- a/Dockerfile_paddleocr_vl_full +++ /dev/null @@ -1,90 +0,0 @@ -# PaddleOCR-VL Full Pipeline (PP-DocLayoutV2 + PaddleOCR-VL + Structured Output) -# Self-contained GPU image with complete document parsing pipeline -FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 - -LABEL maintainer="Task Venture Capital GmbH " -LABEL description="PaddleOCR-VL Full Pipeline - Layout Detection + VL Recognition + JSON/Markdown Output" -LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai" - -# Environment configuration -ENV DEBIAN_FRONTEND=noninteractive -ENV PYTHONUNBUFFERED=1 -ENV HF_HOME=/root/.cache/huggingface -ENV PADDLEOCR_HOME=/root/.paddleocr -ENV SERVER_PORT=8000 -ENV SERVER_HOST=0.0.0.0 -ENV VLM_PORT=8080 - -# Set working directory -WORKDIR /app - -# Install system dependencies -RUN apt-get update && apt-get install -y --no-install-recommends \ - python3.11 \ - python3.11-venv \ - python3.11-dev \ - python3-pip \ - libgl1-mesa-glx \ - libglib2.0-0 \ - libgomp1 \ - libsm6 \ - libxext6 \ - libxrender1 \ - curl \ - git \ - wget \ - && rm -rf /var/lib/apt/lists/* \ - && update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 \ - && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 - -# Create and activate virtual environment -RUN python -m venv /opt/venv -ENV PATH="/opt/venv/bin:$PATH" - -# Upgrade pip -RUN pip install --no-cache-dir --upgrade pip setuptools wheel - -# Install PaddlePaddle GPU (CUDA 12.x) -RUN pip install --no-cache-dir \ - paddlepaddle-gpu==3.2.1 \ - --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ - -# Install PaddleOCR with doc-parser (includes PP-DocLayoutV2) -RUN pip install --no-cache-dir \ - "paddleocr[doc-parser]" \ - safetensors - -# Install PyTorch with CUDA support -RUN pip install --no-cache-dir \ - torch==2.5.1 \ - torchvision \ - --index-url https://download.pytorch.org/whl/cu124 - -# Install transformers for PaddleOCR-VL inference (no vLLM - use local inference) -# PaddleOCR-VL requires transformers>=4.55.0 for use_kernel_forward_from_hub -RUN pip install --no-cache-dir \ - transformers>=4.55.0 \ - accelerate \ - hf-kernels - -# Install our API server dependencies -RUN pip install --no-cache-dir \ - fastapi \ - uvicorn[standard] \ - python-multipart \ - httpx \ - pillow - -# Copy server files -COPY image_support_files/paddleocr_vl_full_server.py /app/server.py -COPY image_support_files/paddleocr_vl_full_entrypoint.sh /usr/local/bin/entrypoint.sh -RUN chmod +x /usr/local/bin/entrypoint.sh - -# Expose ports (8000 = API, 8080 = internal VLM server) -EXPOSE 8000 - -# Health check -HEALTHCHECK --interval=30s --timeout=10s --start-period=600s --retries=3 \ - CMD curl -f http://localhost:8000/health || exit 1 - -ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] diff --git a/Dockerfile_paddleocr_vl_gpu b/Dockerfile_paddleocr_vl_gpu deleted file mode 100644 index 5628ef2..0000000 --- a/Dockerfile_paddleocr_vl_gpu +++ /dev/null @@ -1,71 +0,0 @@ -# PaddleOCR-VL GPU Variant (Transformers-based, not vLLM) -# Vision-Language Model for document parsing using transformers with CUDA -FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04 - -LABEL maintainer="Task Venture Capital GmbH " -LABEL description="PaddleOCR-VL 0.9B GPU - Vision-Language Model using transformers" -LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai" - -# Environment configuration -ENV DEBIAN_FRONTEND=noninteractive -ENV PYTHONUNBUFFERED=1 -ENV HF_HOME=/root/.cache/huggingface -ENV SERVER_PORT=8000 -ENV SERVER_HOST=0.0.0.0 - -# Set working directory -WORKDIR /app - -# Install system dependencies -RUN apt-get update && apt-get install -y --no-install-recommends \ - python3.11 \ - python3.11-venv \ - python3.11-dev \ - python3-pip \ - libgl1-mesa-glx \ - libglib2.0-0 \ - libgomp1 \ - curl \ - git \ - && rm -rf /var/lib/apt/lists/* \ - && update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 \ - && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 - -# Create and activate virtual environment -RUN python -m venv /opt/venv -ENV PATH="/opt/venv/bin:$PATH" - -# Install PyTorch with CUDA support -RUN pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir \ - torch==2.5.1 \ - torchvision \ - --index-url https://download.pytorch.org/whl/cu124 - -# Install Python dependencies (transformers-based, not vLLM) -RUN pip install --no-cache-dir \ - transformers \ - accelerate \ - safetensors \ - pillow \ - fastapi \ - uvicorn[standard] \ - python-multipart \ - httpx \ - protobuf \ - sentencepiece \ - einops - -# Copy server files (same as CPU variant - it auto-detects CUDA) -COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py -COPY image_support_files/paddleocr_vl_entrypoint.sh /usr/local/bin/paddleocr-vl-entrypoint.sh -RUN chmod +x /usr/local/bin/paddleocr-vl-entrypoint.sh - -# Expose API port -EXPOSE 8000 - -# Health check -HEALTHCHECK --interval=30s --timeout=10s --start-period=300s --retries=3 \ - CMD curl -f http://localhost:8000/health || exit 1 - -ENTRYPOINT ["/usr/local/bin/paddleocr-vl-entrypoint.sh"] diff --git a/changelog.md b/changelog.md index 1177871..e631d16 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,14 @@ # Changelog +## 2026-01-18 - 1.13.0 - feat(tests) +revamp tests and remove legacy Dockerfiles: adopt JSON/consensus workflows, switch MiniCPM model, and delete deprecated Docker/test variants + +- Removed multiple Dockerfiles and related entrypoints for MiniCPM and PaddleOCR-VL (cpu/gpu/full), cleaning up legacy image recipes. +- Pruned many older test files (combined, ministral3, paddleocr-vl, and several invoice/test variants) to consolidate the test suite. +- Updated bank statement MiniCPM test: now uses MODEL='openbmb/minicpm-v4.5:q8_0', JSON per-page extraction prompt, consensus retry logic, expanded logging, and stricter result matching. +- Updated invoice MiniCPM test: switched to a consensus flow (fast JSON pass + thinking pass), increased PDF conversion quality, endpoints migrated to chat-style API calls with image-in-message payloads, and improved finalization logic. +- API usage changed from /api/generate to /api/chat with message-based payloads and embedded images — CI and local test runners will need model availability and possible pipeline adjustments. + ## 2026-01-18 - 1.12.0 - feat(tests) switch vision tests to multi-query extraction (count then per-row/field queries) and add logging/summaries diff --git a/image_support_files/paddleocr_vl_entrypoint.sh b/image_support_files/paddleocr_vl_entrypoint.sh deleted file mode 100644 index fc23695..0000000 --- a/image_support_files/paddleocr_vl_entrypoint.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -set -e - -echo "===================================" -echo "PaddleOCR-VL Server (CPU)" -echo "===================================" - -HOST="${SERVER_HOST:-0.0.0.0}" -PORT="${SERVER_PORT:-8000}" - -echo "Host: ${HOST}" -echo "Port: ${PORT}" -echo "Device: CPU (no GPU)" -echo "" - -echo "Starting PaddleOCR-VL CPU server..." -echo "===================================" - -exec python /app/paddleocr_vl_server.py diff --git a/image_support_files/paddleocr_vl_full_entrypoint.sh b/image_support_files/paddleocr_vl_full_entrypoint.sh deleted file mode 100644 index 1a75ed0..0000000 --- a/image_support_files/paddleocr_vl_full_entrypoint.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -set -e - -echo "Starting PaddleOCR-VL Full Pipeline Server (Transformers backend)..." - -# Environment -SERVER_PORT=${SERVER_PORT:-8000} -SERVER_HOST=${SERVER_HOST:-0.0.0.0} - -# Start our API server directly (no vLLM - uses local transformers inference) -echo "Starting API server on port $SERVER_PORT..." -exec python /app/server.py diff --git a/test/test.bankstatements.combined.ts b/test/test.bankstatements.combined.ts deleted file mode 100644 index 4a238e6..0000000 --- a/test/test.bankstatements.combined.ts +++ /dev/null @@ -1,549 +0,0 @@ -/** - * Bank statement extraction test using MiniCPM-V (visual) + PaddleOCR-VL (table recognition) - * - * This is the combined/dual-VLM approach that uses both models for consensus: - * - MiniCPM-V for visual extraction - * - PaddleOCR-VL for table recognition - */ -import { tap, expect } from '@git.zone/tstest/tapbundle'; -import * as fs from 'fs'; -import * as path from 'path'; -import { execSync } from 'child_process'; -import * as os from 'os'; -import { ensurePaddleOcrVl, ensureMiniCpm } from './helpers/docker.js'; - -// Service URLs -const OLLAMA_URL = 'http://localhost:11434'; -const PADDLEOCR_VL_URL = 'http://localhost:8000'; - -// Models -const MINICPM_MODEL = 'minicpm-v:latest'; -const PADDLEOCR_VL_MODEL = 'paddleocr-vl'; - -// Prompt for MiniCPM-V visual extraction -const MINICPM_EXTRACT_PROMPT = `/nothink -You are a bank statement parser. Extract EVERY transaction from the table. - -Read the Amount column carefully: -- "- 21,47 €" means DEBIT, output as: -21.47 -- "+ 1.000,00 €" means CREDIT, output as: 1000.00 -- European format: comma = decimal point - -For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47} - -Do not skip any rows. Return ONLY the JSON array, no explanation.`; - -// Prompt for PaddleOCR-VL table extraction -const PADDLEOCR_VL_TABLE_PROMPT = `Table Recognition:`; - -// Post-processing prompt to convert PaddleOCR-VL output to JSON -const PADDLEOCR_VL_CONVERT_PROMPT = `/nothink -Convert the following bank statement table data to JSON. - -Read the Amount values carefully: -- "- 21,47 €" means DEBIT, output as: -21.47 -- "+ 1.000,00 €" means CREDIT, output as: 1000.00 -- European format: comma = decimal point - -For each transaction output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47} - -Return ONLY the JSON array, no explanation. - -Table data: ---- -{TABLE_DATA} ----`; - -interface ITransaction { - date: string; - counterparty: string; - amount: number; -} - -/** - * Convert PDF to PNG images using ImageMagick - */ -function convertPdfToImages(pdfPath: string): string[] { - const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-')); - const outputPattern = path.join(tempDir, 'page-%d.png'); - - try { - execSync( - `convert -density 300 -quality 100 "${pdfPath}" -background white -alpha remove "${outputPattern}"`, - { stdio: 'pipe' } - ); - - const files = fs.readdirSync(tempDir).filter((f: string) => f.endsWith('.png')).sort(); - const images: string[] = []; - - for (const file of files) { - const imagePath = path.join(tempDir, file); - const imageData = fs.readFileSync(imagePath); - images.push(imageData.toString('base64')); - } - - return images; - } finally { - fs.rmSync(tempDir, { recursive: true, force: true }); - } -} - -/** - * Extract using MiniCPM-V via Ollama - */ -async function extractWithMiniCPM(images: string[], passLabel: string): Promise { - const payload = { - model: MINICPM_MODEL, - prompt: MINICPM_EXTRACT_PROMPT, - images, - stream: true, - options: { - num_predict: 16384, - temperature: 0.1, - }, - }; - - const response = await fetch(`${OLLAMA_URL}/api/generate`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify(payload), - }); - - if (!response.ok) { - throw new Error(`Ollama API error: ${response.status}`); - } - - const reader = response.body?.getReader(); - if (!reader) { - throw new Error('No response body'); - } - - const decoder = new TextDecoder(); - let fullText = ''; - let lineBuffer = ''; - - console.log(`[${passLabel}] Extracting with MiniCPM-V...`); - - while (true) { - const { done, value } = await reader.read(); - if (done) break; - - const chunk = decoder.decode(value, { stream: true }); - const lines = chunk.split('\n').filter((l) => l.trim()); - - for (const line of lines) { - try { - const json = JSON.parse(line); - if (json.response) { - fullText += json.response; - lineBuffer += json.response; - - if (lineBuffer.includes('\n')) { - const parts = lineBuffer.split('\n'); - for (let i = 0; i < parts.length - 1; i++) { - console.log(parts[i]); - } - lineBuffer = parts[parts.length - 1]; - } - } - } catch { - // Skip invalid JSON lines - } - } - } - - if (lineBuffer) { - console.log(lineBuffer); - } - console.log(''); - - const startIdx = fullText.indexOf('['); - const endIdx = fullText.lastIndexOf(']') + 1; - - if (startIdx < 0 || endIdx <= startIdx) { - throw new Error('No JSON array found in response'); - } - - return JSON.parse(fullText.substring(startIdx, endIdx)); -} - -/** - * Extract table using PaddleOCR-VL via OpenAI-compatible API - */ -async function extractTableWithPaddleOCRVL(imageBase64: string): Promise { - const payload = { - model: PADDLEOCR_VL_MODEL, - messages: [ - { - role: 'user', - content: [ - { - type: 'image_url', - image_url: { url: `data:image/png;base64,${imageBase64}` }, - }, - { - type: 'text', - text: PADDLEOCR_VL_TABLE_PROMPT, - }, - ], - }, - ], - temperature: 0.0, - max_tokens: 8192, - }; - - const response = await fetch(`${PADDLEOCR_VL_URL}/v1/chat/completions`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify(payload), - }); - - if (!response.ok) { - const text = await response.text(); - throw new Error(`PaddleOCR-VL API error: ${response.status} - ${text}`); - } - - const data = await response.json(); - return data.choices?.[0]?.message?.content || ''; -} - -/** - * Convert PaddleOCR-VL table output to transactions using MiniCPM-V - */ -async function convertTableToTransactions( - tableData: string, - passLabel: string -): Promise { - const prompt = PADDLEOCR_VL_CONVERT_PROMPT.replace('{TABLE_DATA}', tableData); - - const payload = { - model: MINICPM_MODEL, - prompt, - stream: true, - options: { - num_predict: 16384, - temperature: 0.1, - }, - }; - - const response = await fetch(`${OLLAMA_URL}/api/generate`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify(payload), - }); - - if (!response.ok) { - throw new Error(`Ollama API error: ${response.status}`); - } - - const reader = response.body?.getReader(); - if (!reader) { - throw new Error('No response body'); - } - - const decoder = new TextDecoder(); - let fullText = ''; - - console.log(`[${passLabel}] Converting table data to JSON...`); - - while (true) { - const { done, value } = await reader.read(); - if (done) break; - - const chunk = decoder.decode(value, { stream: true }); - const lines = chunk.split('\n').filter((l) => l.trim()); - - for (const line of lines) { - try { - const json = JSON.parse(line); - if (json.response) { - fullText += json.response; - } - } catch { - // Skip invalid JSON lines - } - } - } - - const startIdx = fullText.indexOf('['); - const endIdx = fullText.lastIndexOf(']') + 1; - - if (startIdx < 0 || endIdx <= startIdx) { - throw new Error('No JSON array found in response'); - } - - return JSON.parse(fullText.substring(startIdx, endIdx)); -} - -/** - * Extract using PaddleOCR-VL (table recognition) + conversion - */ -async function extractWithPaddleOCRVL( - images: string[], - passLabel: string -): Promise { - console.log(`[${passLabel}] Extracting tables with PaddleOCR-VL...`); - - // Extract table data from each page - const tableDataParts: string[] = []; - for (let i = 0; i < images.length; i++) { - console.log(`[${passLabel}] Processing page ${i + 1}/${images.length}...`); - const tableData = await extractTableWithPaddleOCRVL(images[i]); - if (tableData.trim()) { - tableDataParts.push(`--- Page ${i + 1} ---\n${tableData}`); - } - } - - const combinedTableData = tableDataParts.join('\n\n'); - console.log(`[${passLabel}] Got ${combinedTableData.length} chars of table data`); - - // Convert to transactions - return convertTableToTransactions(combinedTableData, passLabel); -} - -/** - * Create a hash of transactions for comparison - */ -function hashTransactions(transactions: ITransaction[]): string { - return transactions - .map((t) => `${t.date}|${t.amount.toFixed(2)}`) - .sort() - .join(';'); -} - -/** - * Check if PaddleOCR-VL service is available - */ -async function isPaddleOCRVLAvailable(): Promise { - try { - const response = await fetch(`${PADDLEOCR_VL_URL}/health`, { - method: 'GET', - signal: AbortSignal.timeout(5000), - }); - return response.ok; - } catch { - return false; - } -} - -/** - * Extract with dual-VLM consensus - * Strategy: - * Pass 1 = MiniCPM-V visual extraction - * Pass 2 = PaddleOCR-VL table recognition (if available) - * Pass 3+ = MiniCPM-V visual (fallback) - */ -async function extractWithConsensus( - images: string[], - maxPasses: number = 5 -): Promise { - const results: Array<{ transactions: ITransaction[]; hash: string }> = []; - const hashCounts: Map = new Map(); - - const addResult = (transactions: ITransaction[], passLabel: string): number => { - const hash = hashTransactions(transactions); - results.push({ transactions, hash }); - hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1); - console.log( - `[${passLabel}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)` - ); - return hashCounts.get(hash)!; - }; - - // Check if PaddleOCR-VL is available - const paddleOCRVLAvailable = await isPaddleOCRVLAvailable(); - if (paddleOCRVLAvailable) { - console.log('[Setup] PaddleOCR-VL service available - using dual-VLM consensus'); - } else { - console.log('[Setup] PaddleOCR-VL not available - using MiniCPM-V only'); - } - - // Pass 1: MiniCPM-V visual extraction - try { - const pass1Result = await extractWithMiniCPM(images, 'Pass 1 MiniCPM-V'); - addResult(pass1Result, 'Pass 1 MiniCPM-V'); - } catch (err) { - console.log(`[Pass 1] Error: ${err}`); - } - - // Pass 2: PaddleOCR-VL table recognition (if available) - if (paddleOCRVLAvailable) { - try { - const pass2Result = await extractWithPaddleOCRVL(images, 'Pass 2 PaddleOCR-VL'); - const count = addResult(pass2Result, 'Pass 2 PaddleOCR-VL'); - if (count >= 2) { - console.log('[Consensus] MiniCPM-V and PaddleOCR-VL extractions match!'); - return pass2Result; - } - } catch (err) { - console.log(`[Pass 2 PaddleOCR-VL] Error: ${err}`); - } - } - - // Pass 3+: Continue with MiniCPM-V visual passes - const startPass = paddleOCRVLAvailable ? 3 : 2; - for (let pass = startPass; pass <= maxPasses; pass++) { - try { - const transactions = await extractWithMiniCPM(images, `Pass ${pass} MiniCPM-V`); - const count = addResult(transactions, `Pass ${pass} MiniCPM-V`); - - if (count >= 2) { - console.log(`[Consensus] Reached after ${pass} passes`); - return transactions; - } - - console.log(`[Pass ${pass}] No consensus yet, trying again...`); - } catch (err) { - console.log(`[Pass ${pass}] Error: ${err}`); - } - } - - // No consensus reached - return the most common result - let bestHash = ''; - let bestCount = 0; - for (const [hash, count] of hashCounts) { - if (count > bestCount) { - bestCount = count; - bestHash = hash; - } - } - - if (!bestHash) { - throw new Error('No valid results obtained'); - } - - const best = results.find((r) => r.hash === bestHash)!; - console.log(`[No consensus] Using most common result (${bestCount}/${maxPasses} passes)`); - return best.transactions; -} - -/** - * Compare extracted transactions against expected - */ -function compareTransactions( - extracted: ITransaction[], - expected: ITransaction[] -): { matches: number; total: number; errors: string[] } { - const errors: string[] = []; - let matches = 0; - - for (let i = 0; i < expected.length; i++) { - const exp = expected[i]; - const ext = extracted[i]; - - if (!ext) { - errors.push(`Missing transaction ${i}: ${exp.date} ${exp.counterparty}`); - continue; - } - - const dateMatch = ext.date === exp.date; - const amountMatch = Math.abs(ext.amount - exp.amount) < 0.01; - - if (dateMatch && amountMatch) { - matches++; - } else { - errors.push( - `Mismatch at ${i}: expected ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}` - ); - } - } - - if (extracted.length > expected.length) { - errors.push(`Extra transactions: ${extracted.length - expected.length}`); - } - - return { matches, total: expected.length, errors }; -} - -/** - * Find all test cases (PDF + JSON pairs) in .nogit/ - */ -function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> { - const testDir = path.join(process.cwd(), '.nogit'); - if (!fs.existsSync(testDir)) { - return []; - } - - const files = fs.readdirSync(testDir); - const pdfFiles = files.filter((f: string) => f.endsWith('.pdf')); - const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = []; - - for (const pdf of pdfFiles) { - const baseName = pdf.replace('.pdf', ''); - const jsonFile = `${baseName}.json`; - if (files.includes(jsonFile)) { - testCases.push({ - name: baseName, - pdfPath: path.join(testDir, pdf), - jsonPath: path.join(testDir, jsonFile), - }); - } - } - - return testCases; -} - -// Tests - -tap.test('setup: ensure Docker containers are running', async () => { - console.log('\n[Setup] Checking Docker containers...\n'); - - // Ensure PaddleOCR-VL is running (auto-detects GPU/CPU) - const paddleOk = await ensurePaddleOcrVl(); - expect(paddleOk).toBeTrue(); - - // Ensure MiniCPM is running - const minicpmOk = await ensureMiniCpm(); - expect(minicpmOk).toBeTrue(); - - console.log('\n[Setup] All containers ready!\n'); -}); - -tap.test('should have MiniCPM-V 4.5 model loaded', async () => { - const response = await fetch(`${OLLAMA_URL}/api/tags`); - const data = await response.json(); - const modelNames = data.models.map((m: { name: string }) => m.name); - expect(modelNames.some((name: string) => name.includes('minicpm-v4.5'))).toBeTrue(); -}); - -tap.test('should check PaddleOCR-VL availability', async () => { - const available = await isPaddleOCRVLAvailable(); - console.log(`PaddleOCR-VL available: ${available}`); - expect(available).toBeTrue(); -}); - -// Dynamic test for each PDF/JSON pair -const testCases = findTestCases(); -for (const testCase of testCases) { - tap.test(`should extract transactions from ${testCase.name}`, async () => { - // Load expected transactions - const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8')); - console.log(`\n=== ${testCase.name} ===`); - console.log(`Expected: ${expected.length} transactions`); - - // Convert PDF to images - console.log('Converting PDF to images...'); - const images = convertPdfToImages(testCase.pdfPath); - console.log(`Converted: ${images.length} pages\n`); - - // Extract with dual-VLM consensus - const extracted = await extractWithConsensus(images); - console.log(`\nFinal: ${extracted.length} transactions`); - - // Compare results - const result = compareTransactions(extracted, expected); - console.log(`Accuracy: ${result.matches}/${result.total}`); - - if (result.errors.length > 0) { - console.log('Errors:'); - result.errors.forEach((e) => console.log(` - ${e}`)); - } - - // Assert high accuracy - const accuracy = result.matches / result.total; - expect(accuracy).toBeGreaterThan(0.95); - expect(extracted.length).toEqual(expected.length); - }); -} - -export default tap.start(); diff --git a/test/test.bankstatements.minicpm.ts b/test/test.bankstatements.minicpm.ts index 2343d0d..c3e9c50 100644 --- a/test/test.bankstatements.minicpm.ts +++ b/test/test.bankstatements.minicpm.ts @@ -1,10 +1,9 @@ /** * Bank statement extraction using MiniCPM-V (visual extraction) * - * Multi-query approach with thinking DISABLED for speed: - * 1. First ask how many transactions on each page - * 2. Then query each transaction individually - * Single pass, no consensus voting. + * JSON per-page approach: + * 1. Ask for structured JSON of all transactions per page + * 2. Consensus: extract twice, compare, retry if mismatch */ import { tap, expect } from '@git.zone/tstest/tapbundle'; import * as fs from 'fs'; @@ -14,7 +13,7 @@ import * as os from 'os'; import { ensureMiniCpm } from './helpers/docker.js'; const OLLAMA_URL = 'http://localhost:11434'; -const MODEL = 'minicpm-v:latest'; +const MODEL = 'openbmb/minicpm-v4.5:q8_0'; interface ITransaction { date: string; @@ -22,6 +21,22 @@ interface ITransaction { amount: number; } +const JSON_PROMPT = `Extract ALL transactions from this bank statement page as a JSON array. + +IMPORTANT RULES: +1. Each transaction has: date, description/counterparty, and an amount +2. Amount is NEGATIVE for money going OUT (debits, payments, withdrawals) +3. Amount is POSITIVE for money coming IN (credits, deposits, refunds) +4. Date format: YYYY-MM-DD +5. Do NOT include: opening balance, closing balance, subtotals, headers, or summary rows +6. Only include actual transactions with a specific date and amount + +Return ONLY this JSON format, no explanation: +[ + {"date": "2021-06-01", "counterparty": "COMPANY NAME", "amount": -25.99}, + {"date": "2021-06-02", "counterparty": "DEPOSIT FROM", "amount": 100.00} +]`; + /** * Convert PDF to PNG images using ImageMagick */ @@ -51,136 +66,320 @@ function convertPdfToImages(pdfPath: string): string[] { } /** - * Query MiniCPM-V with a prompt (thinking disabled for speed) + * Query for JSON extraction */ -async function queryVision(image: string, prompt: string): Promise { - const response = await fetch(`${OLLAMA_URL}/api/generate`, { +async function queryJson(image: string, queryId: string): Promise { + console.log(` [${queryId}] Sending request to ${MODEL}...`); + const startTime = Date.now(); + + const response = await fetch(`${OLLAMA_URL}/api/chat`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ model: MODEL, - prompt: `/no_think\n${prompt}`, - images: [image], + messages: [{ + role: 'user', + content: JSON_PROMPT, + images: [image], + }], stream: false, options: { - num_predict: 500, + num_predict: 4000, temperature: 0.1, }, }), }); + const elapsed = ((Date.now() - startTime) / 1000).toFixed(1); + if (!response.ok) { + console.log(` [${queryId}] ERROR: ${response.status} (${elapsed}s)`); throw new Error(`Ollama API error: ${response.status}`); } const data = await response.json(); - return (data.response || '').trim(); + const content = (data.message?.content || '').trim(); + console.log(` [${queryId}] Response received (${elapsed}s, ${content.length} chars)`); + return content; } /** - * Count transactions on a page + * Sanitize JSON string - fix common issues from vision model output */ -async function countTransactions(image: string, pageNum: number): Promise { - const response = await queryVision(image, - `Count the transaction rows in this bank statement table. -Each transaction has a date, description, and amount (debit or credit). -Do not count headers or totals. -How many transaction rows are there? Answer with just the number.` - ); +function sanitizeJson(jsonStr: string): string { + let s = jsonStr; - console.log(` [Page ${pageNum}] Count response: "${response}"`); - const match = response.match(/(\d+)/); - const count = match ? parseInt(match[1], 10) : 0; - console.log(` [Page ${pageNum}] Parsed count: ${count}`); - return count; + // Fix +number (e.g., +93.80 -> 93.80) - JSON doesn't allow + prefix + // Handle various whitespace patterns + s = s.replace(/"amount"\s*:\s*\+/g, '"amount": '); + s = s.replace(/:\s*\+(\d)/g, ': $1'); + + // Fix European number format with thousands separator (e.g., 1.000.00 -> 1000.00) + // Pattern: "amount": X.XXX.XX where X.XXX is thousands and .XX is decimal + s = s.replace(/"amount"\s*:\s*(-?)(\d{1,3})\.(\d{3})\.(\d{2})\b/g, '"amount": $1$2$3.$4'); + // Also handle larger numbers like 10.000.00 + s = s.replace(/"amount"\s*:\s*(-?)(\d{1,3})\.(\d{3})\.(\d{3})\.(\d{2})\b/g, '"amount": $1$2$3$4.$5'); + + // Fix trailing commas before ] or } + s = s.replace(/,\s*([}\]])/g, '$1'); + + // Fix unescaped newlines inside strings (replace with space) + s = s.replace(/"([^"\\]*)\n([^"]*)"/g, '"$1 $2"'); + + // Fix unescaped tabs inside strings + s = s.replace(/"([^"\\]*)\t([^"]*)"/g, '"$1 $2"'); + + // Fix unescaped backslashes (but not already escaped ones) + s = s.replace(/\\(?!["\\/bfnrtu])/g, '\\\\'); + + // Fix common issues with counterparty names containing special chars + s = s.replace(/"counterparty":\s*"([^"]*)'([^"]*)"/g, '"counterparty": "$1$2"'); + + // Remove control characters except newlines (which we handle above) + s = s.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g, ' '); + + return s; } /** - * Get a single transaction by index (logs immediately) + * Parse JSON response into transactions */ -async function getTransaction(image: string, index: number, pageNum: number): Promise { - const response = await queryVision(image, - `Look at transaction row #${index} in the bank statement table (row 1 is the first transaction after the header). +function parseJsonResponse(response: string, queryId: string): ITransaction[] { + console.log(` [${queryId}] Parsing response...`); -Extract: -- DATE: in YYYY-MM-DD format -- COUNTERPARTY: the description/name -- AMOUNT: as a number (negative for debits like "- 21,47 €" = -21.47, positive for credits) + // Try to find JSON in markdown code block + const codeBlockMatch = response.match(/```(?:json)?\s*([\s\S]*?)```/); + let jsonStr = codeBlockMatch ? codeBlockMatch[1].trim() : response.trim(); -Format your answer as: DATE|COUNTERPARTY|AMOUNT -Example: 2024-01-15|Amazon|-25.99` - ); - - // Parse the response - const lines = response.split('\n').filter(l => l.includes('|')); - const line = lines[lines.length - 1] || response; - const parts = line.split('|').map(p => p.trim()); - - if (parts.length >= 3) { - // Parse amount - handle various formats - let amountStr = parts[2].replace(/[€$£\s]/g, '').replace('−', '-').replace('–', '-'); - // European format: comma is decimal - if (amountStr.includes(',')) { - amountStr = amountStr.replace(/\./g, '').replace(',', '.'); - } - const amount = parseFloat(amountStr) || 0; - - const tx = { - date: parts[0], - counterparty: parts[1], - amount: amount, - }; - // Log immediately as this transaction completes - console.log(` [P${pageNum} Tx${index.toString().padStart(2, ' ')}] ${tx.date} | ${tx.counterparty.substring(0, 25).padEnd(25)} | ${tx.amount >= 0 ? '+' : ''}${tx.amount.toFixed(2)}`); - return tx; + if (codeBlockMatch) { + console.log(` [${queryId}] Found JSON in code block`); } - // Log raw response on parse failure - console.log(` [P${pageNum} Tx${index.toString().padStart(2, ' ')}] PARSE FAILED: "${response.replace(/\n/g, ' ').substring(0, 60)}..."`); - return null; + // Sanitize JSON (fix +number issue) + jsonStr = sanitizeJson(jsonStr); + + try { + const parsed = JSON.parse(jsonStr); + if (Array.isArray(parsed)) { + const txs = parsed.map(tx => ({ + date: String(tx.date || ''), + counterparty: String(tx.counterparty || tx.description || ''), + amount: parseAmount(tx.amount), + })); + console.log(` [${queryId}] Parsed ${txs.length} transactions (direct)`); + return txs; + } + console.log(` [${queryId}] Parsed JSON is not an array`); + } catch (e) { + const errMsg = (e as Error).message; + console.log(` [${queryId}] Direct parse failed: ${errMsg}`); + + // Log problematic section with context + const posMatch = errMsg.match(/position (\d+)/); + if (posMatch) { + const pos = parseInt(posMatch[1]); + const start = Math.max(0, pos - 40); + const end = Math.min(jsonStr.length, pos + 40); + const context = jsonStr.substring(start, end); + const marker = ' '.repeat(pos - start) + '^'; + console.log(` [${queryId}] Context around error position ${pos}:`); + console.log(` [${queryId}] ...${context}...`); + console.log(` [${queryId}] ${marker}`); + } + + // Try to find JSON array pattern + const arrayMatch = jsonStr.match(/\[[\s\S]*\]/); + if (arrayMatch) { + console.log(` [${queryId}] Found array pattern, trying to parse...`); + const sanitizedArray = sanitizeJson(arrayMatch[0]); + try { + const parsed = JSON.parse(sanitizedArray); + if (Array.isArray(parsed)) { + const txs = parsed.map(tx => ({ + date: String(tx.date || ''), + counterparty: String(tx.counterparty || tx.description || ''), + amount: parseAmount(tx.amount), + })); + console.log(` [${queryId}] Parsed ${txs.length} transactions (array match)`); + return txs; + } + } catch (e2) { + const errMsg2 = (e2 as Error).message; + console.log(` [${queryId}] Array parse failed: ${errMsg2}`); + const posMatch2 = errMsg2.match(/position (\d+)/); + if (posMatch2) { + const pos2 = parseInt(posMatch2[1]); + console.log(` [${queryId}] Context around error: ...${sanitizedArray.substring(Math.max(0, pos2 - 30), pos2 + 30)}...`); + } + + // Try to extract individual objects from the malformed array + console.log(` [${queryId}] Attempting object-by-object extraction...`); + const extracted = extractTransactionsFromMalformedJson(sanitizedArray, queryId); + if (extracted.length > 0) { + console.log(` [${queryId}] Recovered ${extracted.length} transactions via object extraction`); + return extracted; + } + } + } else { + console.log(` [${queryId}] No array pattern found in response`); + console.log(` [${queryId}] Raw response preview: ${response.substring(0, 200)}...`); + } + } + + console.log(` [${queryId}] PARSE FAILED - returning empty array`); + return []; } /** - * Extract transactions from a single page using multi-query approach + * Extract transactions from malformed JSON by parsing objects individually + */ +function extractTransactionsFromMalformedJson(jsonStr: string, queryId: string): ITransaction[] { + const transactions: ITransaction[] = []; + + // Match individual transaction objects + const objectPattern = /\{\s*"date"\s*:\s*"([^"]+)"\s*,\s*"counterparty"\s*:\s*"([^"]+)"\s*,\s*"amount"\s*:\s*([+-]?\d+\.?\d*)\s*\}/g; + let match; + + while ((match = objectPattern.exec(jsonStr)) !== null) { + transactions.push({ + date: match[1], + counterparty: match[2], + amount: parseFloat(match[3]), + }); + } + + // Also try with different field orders (amount before counterparty, etc.) + if (transactions.length === 0) { + const altPattern = /\{\s*"date"\s*:\s*"([^"]+)"[^}]*"amount"\s*:\s*([+-]?\d+\.?\d*)[^}]*\}/g; + while ((match = altPattern.exec(jsonStr)) !== null) { + // Try to extract counterparty from the match + const counterpartyMatch = match[0].match(/"counterparty"\s*:\s*"([^"]+)"/); + const descMatch = match[0].match(/"description"\s*:\s*"([^"]+)"/); + transactions.push({ + date: match[1], + counterparty: counterpartyMatch?.[1] || descMatch?.[1] || 'UNKNOWN', + amount: parseFloat(match[2]), + }); + } + } + + return transactions; +} + +/** + * Parse amount from various formats + */ +function parseAmount(value: unknown): number { + if (typeof value === 'number') return value; + if (typeof value !== 'string') return 0; + + let s = value.replace(/[€$£\s]/g, '').replace('−', '-').replace('–', '-'); + // European format: comma is decimal + if (s.includes(',') && s.indexOf(',') > s.lastIndexOf('.')) { + s = s.replace(/\./g, '').replace(',', '.'); + } else { + s = s.replace(/,/g, ''); + } + return parseFloat(s) || 0; +} + +/** + * Compare two transaction arrays for consensus + */ +function transactionArraysMatch(a: ITransaction[], b: ITransaction[]): boolean { + if (a.length !== b.length) return false; + + for (let i = 0; i < a.length; i++) { + const dateMatch = a[i].date === b[i].date; + const amountMatch = Math.abs(a[i].amount - b[i].amount) < 0.01; + if (!dateMatch || !amountMatch) return false; + } + + return true; +} + +/** + * Compare two transaction arrays and log differences + */ +function compareAndLogDifferences(txs1: ITransaction[], txs2: ITransaction[], pageNum: number): void { + if (txs1.length !== txs2.length) { + console.log(` [Page ${pageNum}] Length mismatch: Q1=${txs1.length}, Q2=${txs2.length}`); + return; + } + + for (let i = 0; i < txs1.length; i++) { + const dateMatch = txs1[i].date === txs2[i].date; + const amountMatch = Math.abs(txs1[i].amount - txs2[i].amount) < 0.01; + + if (!dateMatch || !amountMatch) { + console.log(` [Page ${pageNum}] Tx ${i + 1} differs:`); + console.log(` Q1: ${txs1[i].date} | ${txs1[i].amount}`); + console.log(` Q2: ${txs2[i].date} | ${txs2[i].amount}`); + } + } +} + +/** + * Extract transactions from a single page with consensus */ async function extractTransactionsFromPage(image: string, pageNum: number): Promise { - // Step 1: Count transactions - const count = await countTransactions(image, pageNum); + const MAX_ATTEMPTS = 5; + console.log(`\n ======== Page ${pageNum} ========`); + console.log(` [Page ${pageNum}] Starting JSON extraction...`); - if (count === 0) { - return []; - } + for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) { + console.log(`\n [Page ${pageNum}] --- Attempt ${attempt}/${MAX_ATTEMPTS} ---`); - // Step 2: Query each transaction (in batches to avoid overwhelming) - // Each transaction logs itself as it completes - const transactions: ITransaction[] = []; - const batchSize = 5; + // Extract twice in parallel + const q1Id = `P${pageNum}A${attempt}Q1`; + const q2Id = `P${pageNum}A${attempt}Q2`; - for (let start = 1; start <= count; start += batchSize) { - const end = Math.min(start + batchSize - 1, count); - const indices = Array.from({ length: end - start + 1 }, (_, i) => start + i); + const [response1, response2] = await Promise.all([ + queryJson(image, q1Id), + queryJson(image, q2Id), + ]); - // Query batch in parallel - each logs as it completes - const results = await Promise.all( - indices.map(i => getTransaction(image, i, pageNum)) - ); + const txs1 = parseJsonResponse(response1, q1Id); + const txs2 = parseJsonResponse(response2, q2Id); - for (const tx of results) { - if (tx) { - transactions.push(tx); + console.log(` [Page ${pageNum}] Results: Q1=${txs1.length} txs, Q2=${txs2.length} txs`); + + if (txs1.length > 0 && transactionArraysMatch(txs1, txs2)) { + console.log(` [Page ${pageNum}] ✓ CONSENSUS REACHED: ${txs1.length} transactions`); + console.log(` [Page ${pageNum}] Transactions:`); + for (let i = 0; i < txs1.length; i++) { + const tx = txs1[i]; + console.log(` ${(i + 1).toString().padStart(2)}. ${tx.date} | ${tx.counterparty.substring(0, 30).padEnd(30)} | ${tx.amount >= 0 ? '+' : ''}${tx.amount.toFixed(2)}`); } + return txs1; + } + + console.log(` [Page ${pageNum}] ✗ NO CONSENSUS`); + compareAndLogDifferences(txs1, txs2, pageNum); + + if (attempt < MAX_ATTEMPTS) { + console.log(` [Page ${pageNum}] Retrying...`); } } - console.log(` [Page ${pageNum}] Complete: ${transactions.length}/${count} extracted`); - return transactions; + // Fallback: use last response + console.log(`\n [Page ${pageNum}] === FALLBACK (no consensus after ${MAX_ATTEMPTS} attempts) ===`); + const fallbackId = `P${pageNum}FALLBACK`; + const fallbackResponse = await queryJson(image, fallbackId); + const fallback = parseJsonResponse(fallbackResponse, fallbackId); + console.log(` [Page ${pageNum}] ~ FALLBACK RESULT: ${fallback.length} transactions`); + for (let i = 0; i < fallback.length; i++) { + const tx = fallback[i]; + console.log(` ${(i + 1).toString().padStart(2)}. ${tx.date} | ${tx.counterparty.substring(0, 30).padEnd(30)} | ${tx.amount >= 0 ? '+' : ''}${tx.amount.toFixed(2)}`); + } + return fallback; } /** * Extract all transactions from bank statement */ async function extractTransactions(images: string[]): Promise { - console.log(` [Vision] Processing ${images.length} page(s) with MiniCPM-V (multi-query, deep think)`); + console.log(` [Vision] Processing ${images.length} page(s) with ${MODEL} (JSON consensus)`); const allTransactions: ITransaction[] = []; @@ -199,8 +398,9 @@ async function extractTransactions(images: string[]): Promise { function compareTransactions( extracted: ITransaction[], expected: ITransaction[] -): { matches: number; total: number; errors: string[] } { +): { matches: number; total: number; errors: string[]; variations: string[] } { const errors: string[] = []; + const variations: string[] = []; let matches = 0; for (let i = 0; i < expected.length; i++) { @@ -217,6 +417,12 @@ function compareTransactions( if (dateMatch && amountMatch) { matches++; + // Track counterparty variations (date and amount match but name differs) + if (ext.counterparty !== exp.counterparty) { + variations.push( + `[${i}] "${exp.counterparty}" → "${ext.counterparty}"` + ); + } } else { errors.push( `Mismatch at ${i}: expected ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}` @@ -228,7 +434,7 @@ function compareTransactions( errors.push(`Extra transactions: ${extracted.length - expected.length}`); } - return { matches, total: expected.length, errors }; + return { matches, total: expected.length, errors, variations }; } /** @@ -294,18 +500,24 @@ for (const testCase of testCases) { console.log(` Extracted: ${extracted.length} transactions`); const result = compareTransactions(extracted, expected); - const accuracy = result.total > 0 ? result.matches / result.total : 0; + const perfectMatch = result.matches === result.total && extracted.length === expected.length; - if (accuracy >= 0.95 && extracted.length === expected.length) { + if (perfectMatch) { passedCount++; console.log(` Result: PASS (${result.matches}/${result.total})`); } else { failedCount++; console.log(` Result: FAIL (${result.matches}/${result.total})`); - result.errors.slice(0, 5).forEach((e) => console.log(` - ${e}`)); + result.errors.slice(0, 10).forEach((e) => console.log(` - ${e}`)); } - expect(accuracy).toBeGreaterThan(0.95); + // Log counterparty variations (names that differ but date/amount matched) + if (result.variations.length > 0) { + console.log(` Counterparty variations (${result.variations.length}):`); + result.variations.forEach((v) => console.log(` ${v}`)); + } + + expect(result.matches).toEqual(result.total); expect(extracted.length).toEqual(expected.length); }); } @@ -313,9 +525,9 @@ for (const testCase of testCases) { tap.test('summary', async () => { const total = testCases.length; console.log(`\n======================================================`); - console.log(` Bank Statement Summary (MiniCPM-V)`); + console.log(` Bank Statement Summary (${MODEL})`); console.log(`======================================================`); - console.log(` Method: Multi-query (no_think)`); + console.log(` Method: JSON per-page + consensus`); console.log(` Passed: ${passedCount}/${total}`); console.log(` Failed: ${failedCount}/${total}`); console.log(`======================================================\n`); diff --git a/test/test.bankstatements.ministral3.ts b/test/test.bankstatements.ministral3.ts deleted file mode 100644 index c309773..0000000 --- a/test/test.bankstatements.ministral3.ts +++ /dev/null @@ -1,348 +0,0 @@ -/** - * Bank Statement extraction using Ministral 3 Vision (Direct) - * - * NO OCR pipeline needed - Ministral 3 has built-in vision encoder: - * 1. Convert PDF to images - * 2. Send images directly to Ministral 3 via Ollama - * 3. Extract transactions as structured JSON - */ -import { tap, expect } from '@git.zone/tstest/tapbundle'; -import * as fs from 'fs'; -import * as path from 'path'; -import { execSync } from 'child_process'; -import * as os from 'os'; -import { ensureMinistral3 } from './helpers/docker.js'; - -const OLLAMA_URL = 'http://localhost:11434'; -const VISION_MODEL = 'ministral-3:8b'; - -interface ITransaction { - date: string; - counterparty: string; - amount: number; -} - -/** - * Convert PDF to PNG images using ImageMagick - */ -function convertPdfToImages(pdfPath: string): string[] { - const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-')); - const outputPattern = path.join(tempDir, 'page-%d.png'); - - try { - execSync( - `convert -density 200 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`, - { stdio: 'pipe' } - ); - - const files = fs.readdirSync(tempDir).filter((f) => f.endsWith('.png')).sort(); - const images: string[] = []; - - for (const file of files) { - const imagePath = path.join(tempDir, file); - const imageData = fs.readFileSync(imagePath); - images.push(imageData.toString('base64')); - } - - return images; - } finally { - fs.rmSync(tempDir, { recursive: true, force: true }); - } -} - -/** - * Extract transactions from a single page image using Ministral 3 Vision - */ -async function extractTransactionsFromPage(image: string, pageNum: number): Promise { - console.log(` [Vision] Processing page ${pageNum}`); - - // JSON schema for array of transactions - const transactionSchema = { - type: 'array', - items: { - type: 'object', - properties: { - date: { type: 'string', description: 'Transaction date in YYYY-MM-DD format' }, - counterparty: { type: 'string', description: 'Name of the other party' }, - amount: { type: 'number', description: 'Amount (negative for debits, positive for credits)' }, - }, - required: ['date', 'counterparty', 'amount'], - }, - }; - - const prompt = `Extract ALL bank transactions from this bank statement page. - -For each transaction, extract: -- date: Transaction date in YYYY-MM-DD format -- counterparty: The name/description of the other party (merchant, payee, etc.) -- amount: The amount as a number (NEGATIVE for debits/expenses, POSITIVE for credits/income) - -Return a JSON array of transactions. If no transactions visible, return empty array []. -Example: [{"date":"2021-06-01","counterparty":"AMAZON","amount":-50.00}]`; - - const response = await fetch(`${OLLAMA_URL}/api/chat`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - model: VISION_MODEL, - messages: [ - { - role: 'user', - content: prompt, - images: [image], - }, - ], - format: transactionSchema, - stream: true, - options: { - num_predict: 4096, // Bank statements can have many transactions - temperature: 0.0, - }, - }), - }); - - if (!response.ok) { - throw new Error(`Ollama API error: ${response.status}`); - } - - const reader = response.body?.getReader(); - if (!reader) { - throw new Error('No response body'); - } - - const decoder = new TextDecoder(); - let fullText = ''; - - while (true) { - const { done, value } = await reader.read(); - if (done) break; - - const chunk = decoder.decode(value, { stream: true }); - const lines = chunk.split('\n').filter((l) => l.trim()); - - for (const line of lines) { - try { - const json = JSON.parse(line); - if (json.message?.content) { - fullText += json.message.content; - } - } catch { - // Skip invalid JSON lines - } - } - } - - // Parse JSON response - let jsonStr = fullText.trim(); - - if (jsonStr.startsWith('```json')) jsonStr = jsonStr.slice(7); - else if (jsonStr.startsWith('```')) jsonStr = jsonStr.slice(3); - if (jsonStr.endsWith('```')) jsonStr = jsonStr.slice(0, -3); - jsonStr = jsonStr.trim(); - - // Find array boundaries - const startIdx = jsonStr.indexOf('['); - const endIdx = jsonStr.lastIndexOf(']') + 1; - - if (startIdx < 0 || endIdx <= startIdx) { - console.log(` [Page ${pageNum}] No transactions found`); - return []; - } - - try { - const parsed = JSON.parse(jsonStr.substring(startIdx, endIdx)); - console.log(` [Page ${pageNum}] Found ${parsed.length} transactions`); - return parsed.map((t: { date?: string; counterparty?: string; amount?: number }) => ({ - date: t.date || '', - counterparty: t.counterparty || '', - amount: parseFloat(String(t.amount)) || 0, - })); - } catch (e) { - console.log(` [Page ${pageNum}] Parse error: ${e}`); - return []; - } -} - -/** - * Extract all transactions from all pages - */ -async function extractAllTransactions(images: string[]): Promise { - const allTransactions: ITransaction[] = []; - - for (let i = 0; i < images.length; i++) { - const pageTransactions = await extractTransactionsFromPage(images[i], i + 1); - allTransactions.push(...pageTransactions); - } - - return allTransactions; -} - -/** - * Normalize date to YYYY-MM-DD - */ -function normalizeDate(dateStr: string): string { - if (!dateStr) return ''; - if (/^\d{4}-\d{2}-\d{2}$/.test(dateStr)) return dateStr; - - // Handle DD/MM/YYYY or DD.MM.YYYY - const match = dateStr.match(/^(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})$/); - if (match) { - return `${match[3]}-${match[2].padStart(2, '0')}-${match[1].padStart(2, '0')}`; - } - - return dateStr; -} - -/** - * Compare extracted transactions vs expected - */ -function compareTransactions( - extracted: ITransaction[], - expected: ITransaction[] -): { matchRate: number; matched: number; missed: number; extra: number; errors: string[] } { - const errors: string[] = []; - let matched = 0; - - // Normalize all dates - const normalizedExtracted = extracted.map((t) => ({ - ...t, - date: normalizeDate(t.date), - counterparty: t.counterparty.toUpperCase().trim(), - })); - - const normalizedExpected = expected.map((t) => ({ - ...t, - date: normalizeDate(t.date), - counterparty: t.counterparty.toUpperCase().trim(), - })); - - // Try to match each expected transaction - const matchedIndices = new Set(); - - for (const exp of normalizedExpected) { - let found = false; - - for (let i = 0; i < normalizedExtracted.length; i++) { - if (matchedIndices.has(i)) continue; - - const ext = normalizedExtracted[i]; - - // Match by date + amount (counterparty names can vary) - if (ext.date === exp.date && Math.abs(ext.amount - exp.amount) < 0.02) { - matched++; - matchedIndices.add(i); - found = true; - break; - } - } - - if (!found) { - errors.push(`Missing: ${exp.date} | ${exp.counterparty} | ${exp.amount}`); - } - } - - const missed = expected.length - matched; - const extra = extracted.length - matched; - const matchRate = expected.length > 0 ? (matched / expected.length) * 100 : 0; - - return { matchRate, matched, missed, extra, errors }; -} - -/** - * Find test cases (PDF + JSON pairs in .nogit/) - */ -function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> { - const testDir = path.join(process.cwd(), '.nogit'); - if (!fs.existsSync(testDir)) return []; - - const files = fs.readdirSync(testDir); - const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = []; - - for (const pdf of files.filter((f) => f.endsWith('.pdf'))) { - const baseName = pdf.replace('.pdf', ''); - const jsonFile = `${baseName}.json`; - if (files.includes(jsonFile)) { - // Skip invoice files - only bank statements - if (!baseName.includes('invoice')) { - testCases.push({ - name: baseName, - pdfPath: path.join(testDir, pdf), - jsonPath: path.join(testDir, jsonFile), - }); - } - } - } - - return testCases.sort((a, b) => a.name.localeCompare(b.name)); -} - -// Tests - -tap.test('setup: ensure Ministral 3 is running', async () => { - console.log('\n[Setup] Checking Ministral 3...\n'); - const ok = await ensureMinistral3(); - expect(ok).toBeTrue(); - console.log('\n[Setup] Ready!\n'); -}); - -const testCases = findTestCases(); -console.log(`\nFound ${testCases.length} bank statement test cases (Ministral 3 Vision)\n`); - -let totalMatched = 0; -let totalExpected = 0; -const times: number[] = []; - -for (const testCase of testCases) { - tap.test(`should extract bank statement: ${testCase.name}`, async () => { - const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8')); - console.log(`\n=== ${testCase.name} ===`); - console.log(`Expected: ${expected.length} transactions`); - - const start = Date.now(); - const images = convertPdfToImages(testCase.pdfPath); - console.log(` Pages: ${images.length}`); - - const extracted = await extractAllTransactions(images); - const elapsed = Date.now() - start; - times.push(elapsed); - - console.log(` Extracted: ${extracted.length} transactions`); - - const result = compareTransactions(extracted, expected); - totalMatched += result.matched; - totalExpected += expected.length; - - console.log(` Match rate: ${result.matchRate.toFixed(1)}% (${result.matched}/${expected.length})`); - console.log(` Missed: ${result.missed}, Extra: ${result.extra}`); - console.log(` Time: ${(elapsed / 1000).toFixed(1)}s`); - - if (result.errors.length > 0 && result.errors.length <= 5) { - result.errors.forEach((e) => console.log(` - ${e}`)); - } else if (result.errors.length > 5) { - console.log(` (${result.errors.length} missing transactions)`); - } - - // Consider it a pass if we match at least 70% of transactions - expect(result.matchRate).toBeGreaterThan(70); - }); -} - -tap.test('summary', async () => { - const overallMatchRate = totalExpected > 0 ? (totalMatched / totalExpected) * 100 : 0; - const totalTime = times.reduce((a, b) => a + b, 0) / 1000; - const avgTime = times.length > 0 ? totalTime / times.length : 0; - - console.log(`\n======================================================`); - console.log(` Bank Statement Extraction Summary (Ministral 3)`); - console.log(`======================================================`); - console.log(` Method: Ministral 3 8B Vision (Direct)`); - console.log(` Statements: ${testCases.length}`); - console.log(` Matched: ${totalMatched}/${totalExpected} transactions`); - console.log(` Match rate: ${overallMatchRate.toFixed(1)}%`); - console.log(`------------------------------------------------------`); - console.log(` Total time: ${totalTime.toFixed(1)}s`); - console.log(` Avg per stmt: ${avgTime.toFixed(1)}s`); - console.log(`======================================================\n`); -}); - -export default tap.start(); diff --git a/test/test.bankstatements.paddleocr-vl.ts b/test/test.bankstatements.paddleocr-vl.ts deleted file mode 100644 index 873e998..0000000 --- a/test/test.bankstatements.paddleocr-vl.ts +++ /dev/null @@ -1,346 +0,0 @@ -/** - * Bank statement extraction test using PaddleOCR-VL Full Pipeline - * - * This tests the complete PaddleOCR-VL pipeline for bank statements: - * 1. PP-DocLayoutV2 for layout detection - * 2. PaddleOCR-VL for recognition (tables with proper structure) - * 3. Structured Markdown output with tables - * 4. MiniCPM extracts transactions from structured tables - * - * The structured Markdown has properly formatted tables, - * making it much easier for MiniCPM to extract transaction data. - */ -import { tap, expect } from '@git.zone/tstest/tapbundle'; -import * as fs from 'fs'; -import * as path from 'path'; -import { execSync } from 'child_process'; -import * as os from 'os'; -import { ensurePaddleOcrVlFull, ensureMiniCpm } from './helpers/docker.js'; - -const PADDLEOCR_VL_URL = 'http://localhost:8000'; -const OLLAMA_URL = 'http://localhost:11434'; -const MINICPM_MODEL = 'minicpm-v:latest'; - -interface ITransaction { - date: string; - counterparty: string; - amount: number; -} - -/** - * Convert PDF to PNG images using ImageMagick - */ -function convertPdfToImages(pdfPath: string): string[] { - const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-')); - const outputPattern = path.join(tempDir, 'page-%d.png'); - - try { - execSync( - `convert -density 300 -quality 100 "${pdfPath}" -background white -alpha remove "${outputPattern}"`, - { stdio: 'pipe' } - ); - - const files = fs.readdirSync(tempDir).filter((f: string) => f.endsWith('.png')).sort(); - const images: string[] = []; - - for (const file of files) { - const imagePath = path.join(tempDir, file); - const imageData = fs.readFileSync(imagePath); - images.push(imageData.toString('base64')); - } - - return images; - } finally { - fs.rmSync(tempDir, { recursive: true, force: true }); - } -} - -/** - * Parse document using PaddleOCR-VL Full Pipeline (returns structured Markdown) - */ -async function parseDocument(imageBase64: string): Promise { - const response = await fetch(`${PADDLEOCR_VL_URL}/parse`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - image: imageBase64, - output_format: 'markdown', - }), - }); - - if (!response.ok) { - const text = await response.text(); - throw new Error(`PaddleOCR-VL API error: ${response.status} - ${text}`); - } - - const data = await response.json(); - - if (!data.success) { - throw new Error(`PaddleOCR-VL error: ${data.error}`); - } - - return data.result?.markdown || ''; -} - -/** - * Extract transactions from structured Markdown using MiniCPM - */ -async function extractTransactionsFromMarkdown(markdown: string): Promise { - console.log(` [Extract] Processing ${markdown.length} chars of Markdown`); - - const prompt = `/nothink -Convert this bank statement to a JSON array of transactions. - -Read the Amount values carefully: -- "- 21,47 €" means DEBIT, output as: -21.47 -- "+ 1.000,00 €" means CREDIT, output as: 1000.00 -- European format: comma = decimal point, dot = thousands - -For each transaction output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47} - -Return ONLY the JSON array, no explanation. - -Document: -${markdown}`; - - const payload = { - model: MINICPM_MODEL, - prompt, - stream: true, - options: { - num_predict: 16384, - temperature: 0.1, - }, - }; - - const response = await fetch(`${OLLAMA_URL}/api/generate`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify(payload), - }); - - if (!response.ok) { - throw new Error(`Ollama API error: ${response.status}`); - } - - const reader = response.body?.getReader(); - if (!reader) { - throw new Error('No response body'); - } - - const decoder = new TextDecoder(); - let fullText = ''; - - while (true) { - const { done, value } = await reader.read(); - if (done) break; - - const chunk = decoder.decode(value, { stream: true }); - const lines = chunk.split('\n').filter((l) => l.trim()); - - for (const line of lines) { - try { - const json = JSON.parse(line); - if (json.response) { - fullText += json.response; - } - } catch { - // Skip invalid JSON lines - } - } - } - - // Extract JSON array from response - const startIdx = fullText.indexOf('['); - const endIdx = fullText.lastIndexOf(']') + 1; - - if (startIdx < 0 || endIdx <= startIdx) { - throw new Error(`No JSON array found in response: ${fullText.substring(0, 200)}`); - } - - const jsonStr = fullText.substring(startIdx, endIdx); - return JSON.parse(jsonStr); -} - -/** - * Extract transactions from all pages of a bank statement - */ -async function extractAllTransactions(images: string[]): Promise { - const allTransactions: ITransaction[] = []; - - for (let i = 0; i < images.length; i++) { - console.log(` Processing page ${i + 1}/${images.length}...`); - - // Parse with full pipeline - const markdown = await parseDocument(images[i]); - console.log(` [Parse] Got ${markdown.split('\n').length} lines of Markdown`); - - // Extract transactions - try { - const transactions = await extractTransactionsFromMarkdown(markdown); - console.log(` [Extracted] ${transactions.length} transactions`); - allTransactions.push(...transactions); - } catch (err) { - console.log(` [Error] ${err}`); - } - } - - return allTransactions; -} - -/** - * Compare transactions - find matching transaction in expected list - */ -function findMatchingTransaction( - tx: ITransaction, - expectedList: ITransaction[] -): ITransaction | undefined { - return expectedList.find((exp) => { - const dateMatch = tx.date === exp.date; - const amountMatch = Math.abs(tx.amount - exp.amount) < 0.02; - const counterpartyMatch = - tx.counterparty?.toLowerCase().includes(exp.counterparty?.toLowerCase().slice(0, 10)) || - exp.counterparty?.toLowerCase().includes(tx.counterparty?.toLowerCase().slice(0, 10)); - return dateMatch && amountMatch && counterpartyMatch; - }); -} - -/** - * Calculate extraction accuracy - */ -function calculateAccuracy( - extracted: ITransaction[], - expected: ITransaction[] -): { matched: number; total: number; accuracy: number } { - let matched = 0; - const usedExpected = new Set(); - - for (const tx of extracted) { - for (let i = 0; i < expected.length; i++) { - if (usedExpected.has(i)) continue; - - const exp = expected[i]; - const dateMatch = tx.date === exp.date; - const amountMatch = Math.abs(tx.amount - exp.amount) < 0.02; - - if (dateMatch && amountMatch) { - matched++; - usedExpected.add(i); - break; - } - } - } - - return { - matched, - total: expected.length, - accuracy: expected.length > 0 ? (matched / expected.length) * 100 : 0, - }; -} - -/** - * Find all test cases (PDF + JSON pairs) in .nogit/bankstatements/ - */ -function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> { - const testDir = path.join(process.cwd(), '.nogit/bankstatements'); - if (!fs.existsSync(testDir)) { - return []; - } - - const files = fs.readdirSync(testDir); - const pdfFiles = files.filter((f) => f.endsWith('.pdf')); - const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = []; - - for (const pdf of pdfFiles) { - const baseName = pdf.replace('.pdf', ''); - const jsonFile = `${baseName}.json`; - if (files.includes(jsonFile)) { - testCases.push({ - name: baseName, - pdfPath: path.join(testDir, pdf), - jsonPath: path.join(testDir, jsonFile), - }); - } - } - - testCases.sort((a, b) => a.name.localeCompare(b.name)); - return testCases; -} - -// Tests - -tap.test('setup: ensure Docker containers are running', async () => { - console.log('\n[Setup] Checking Docker containers...\n'); - - // Ensure PaddleOCR-VL Full Pipeline is running - const paddleOk = await ensurePaddleOcrVlFull(); - expect(paddleOk).toBeTrue(); - - // Ensure MiniCPM is running (for field extraction from Markdown) - const minicpmOk = await ensureMiniCpm(); - expect(minicpmOk).toBeTrue(); - - console.log('\n[Setup] All containers ready!\n'); -}); - -// Dynamic test for each PDF/JSON pair -const testCases = findTestCases(); -console.log(`\nFound ${testCases.length} bank statement test cases (PaddleOCR-VL Full Pipeline)\n`); - -const results: Array<{ name: string; accuracy: number; matched: number; total: number }> = []; - -for (const testCase of testCases) { - tap.test(`should extract bank statement: ${testCase.name}`, async () => { - // Load expected data - const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8')); - console.log(`\n=== ${testCase.name} ===`); - console.log(`Expected: ${expected.length} transactions`); - - const startTime = Date.now(); - - // Convert PDF to images - const images = convertPdfToImages(testCase.pdfPath); - console.log(` Pages: ${images.length}`); - - // Extract all transactions - const extracted = await extractAllTransactions(images); - - const endTime = Date.now(); - const elapsedMs = endTime - startTime; - - // Calculate accuracy - const accuracy = calculateAccuracy(extracted, expected); - results.push({ - name: testCase.name, - accuracy: accuracy.accuracy, - matched: accuracy.matched, - total: accuracy.total, - }); - - console.log(` Extracted: ${extracted.length} transactions`); - console.log(` Matched: ${accuracy.matched}/${accuracy.total} (${accuracy.accuracy.toFixed(1)}%)`); - console.log(` Time: ${(elapsedMs / 1000).toFixed(1)}s`); - - // We expect at least 50% accuracy - expect(accuracy.accuracy).toBeGreaterThan(50); - }); -} - -tap.test('summary', async () => { - const totalStatements = results.length; - const avgAccuracy = - results.length > 0 ? results.reduce((a, b) => a + b.accuracy, 0) / results.length : 0; - const totalMatched = results.reduce((a, b) => a + b.matched, 0); - const totalExpected = results.reduce((a, b) => a + b.total, 0); - - console.log(`\n======================================================`); - console.log(` Bank Statement Extraction Summary (PaddleOCR-VL Full)`); - console.log(`======================================================`); - console.log(` Method: PaddleOCR-VL Full Pipeline -> MiniCPM`); - console.log(` Statements: ${totalStatements}`); - console.log(` Transactions: ${totalMatched}/${totalExpected} matched`); - console.log(` Avg accuracy: ${avgAccuracy.toFixed(1)}%`); - console.log(`======================================================\n`); -}); - -export default tap.start(); diff --git a/test/test.invoices.combined.ts b/test/test.invoices.combined.ts deleted file mode 100644 index 9e6bf70..0000000 --- a/test/test.invoices.combined.ts +++ /dev/null @@ -1,455 +0,0 @@ -/** - * Invoice extraction test using MiniCPM-V (visual) + PaddleOCR-VL (OCR augmentation) - * - * This is the combined approach that uses both models for best accuracy: - * - MiniCPM-V for visual understanding - * - PaddleOCR-VL for OCR text to augment prompts - */ -import { tap, expect } from '@git.zone/tstest/tapbundle'; -import * as fs from 'fs'; -import * as path from 'path'; -import { execSync } from 'child_process'; -import * as os from 'os'; -import { ensurePaddleOcrVl, ensureMiniCpm } from './helpers/docker.js'; - -const OLLAMA_URL = 'http://localhost:11434'; -const MODEL = 'minicpm-v:latest'; -const PADDLEOCR_VL_URL = 'http://localhost:8000'; - -interface IInvoice { - invoice_number: string; - invoice_date: string; - vendor_name: string; - currency: string; - net_amount: number; - vat_amount: number; - total_amount: number; -} - -/** - * Extract OCR text from an image using PaddleOCR-VL (OpenAI-compatible API) - */ -async function extractOcrText(imageBase64: string): Promise { - try { - const response = await fetch(`${PADDLEOCR_VL_URL}/v1/chat/completions`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - model: 'paddleocr-vl', - messages: [{ - role: 'user', - content: [ - { type: 'image_url', image_url: { url: `data:image/png;base64,${imageBase64}` } }, - { type: 'text', text: 'OCR:' } - ] - }], - temperature: 0.0, - max_tokens: 4096 - }), - }); - - if (!response.ok) return ''; - - const data = await response.json(); - return data.choices?.[0]?.message?.content || ''; - } catch { - // PaddleOCR-VL unavailable - } - return ''; -} - -/** - * Build prompt with optional OCR text - */ -function buildPrompt(ocrText: string): string { - const base = `/nothink -You are an invoice parser. Extract the following fields from this invoice: - -1. invoice_number: The invoice/receipt number -2. invoice_date: Date in YYYY-MM-DD format -3. vendor_name: Company that issued the invoice -4. currency: EUR, USD, etc. -5. net_amount: Amount before tax (if shown) -6. vat_amount: Tax/VAT amount (if shown, 0 if reverse charge or no tax) -7. total_amount: Final amount due - -Return ONLY valid JSON in this exact format: -{"invoice_number":"XXX","invoice_date":"YYYY-MM-DD","vendor_name":"Company Name","currency":"EUR","net_amount":100.00,"vat_amount":19.00,"total_amount":119.00} - -If a field is not visible, use null for strings or 0 for numbers. -No explanation, just the JSON object.`; - - if (ocrText) { - // Limit OCR text to prevent context overflow - const maxOcrLength = 4000; - const truncatedOcr = ocrText.length > maxOcrLength - ? ocrText.substring(0, maxOcrLength) + '\n... (truncated)' - : ocrText; - - return `${base} - -OCR text extracted from the invoice (use for reference): ---- -${truncatedOcr} ---- - -Cross-reference the image with the OCR text above for accuracy.`; - } - return base; -} - -/** - * Convert PDF to PNG images using ImageMagick - */ -function convertPdfToImages(pdfPath: string): string[] { - const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-')); - const outputPattern = path.join(tempDir, 'page-%d.png'); - - try { - execSync( - `convert -density 200 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`, - { stdio: 'pipe' } - ); - - const files = fs.readdirSync(tempDir).filter((f) => f.endsWith('.png')).sort(); - const images: string[] = []; - - for (const file of files) { - const imagePath = path.join(tempDir, file); - const imageData = fs.readFileSync(imagePath); - images.push(imageData.toString('base64')); - } - - return images; - } finally { - fs.rmSync(tempDir, { recursive: true, force: true }); - } -} - -/** - * Single extraction pass - */ -async function extractOnce(images: string[], passNum: number, ocrText: string = ''): Promise { - const payload = { - model: MODEL, - prompt: buildPrompt(ocrText), - images, - stream: true, - options: { - num_predict: 2048, - temperature: 0.1, - }, - }; - - const response = await fetch(`${OLLAMA_URL}/api/generate`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify(payload), - }); - - if (!response.ok) { - throw new Error(`Ollama API error: ${response.status}`); - } - - const reader = response.body?.getReader(); - if (!reader) { - throw new Error('No response body'); - } - - const decoder = new TextDecoder(); - let fullText = ''; - - while (true) { - const { done, value } = await reader.read(); - if (done) break; - - const chunk = decoder.decode(value, { stream: true }); - const lines = chunk.split('\n').filter((l) => l.trim()); - - for (const line of lines) { - try { - const json = JSON.parse(line); - if (json.response) { - fullText += json.response; - } - } catch { - // Skip invalid JSON lines - } - } - } - - // Extract JSON from response - const startIdx = fullText.indexOf('{'); - const endIdx = fullText.lastIndexOf('}') + 1; - - if (startIdx < 0 || endIdx <= startIdx) { - throw new Error(`No JSON object found in response: ${fullText.substring(0, 200)}`); - } - - const jsonStr = fullText.substring(startIdx, endIdx); - return JSON.parse(jsonStr); -} - -/** - * Create a hash of invoice for comparison (using key fields) - */ -function hashInvoice(invoice: IInvoice): string { - return `${invoice.invoice_number}|${invoice.invoice_date}|${invoice.total_amount.toFixed(2)}`; -} - -/** - * Extract with majority voting - run until 2 passes match - * Optimization: Run Pass 1, OCR, and Pass 2 (after OCR) in parallel - */ -async function extractWithConsensus(images: string[], invoiceName: string, maxPasses: number = 5): Promise { - const results: Array<{ invoice: IInvoice; hash: string }> = []; - const hashCounts: Map = new Map(); - - const addResult = (invoice: IInvoice, passLabel: string): number => { - const hash = hashInvoice(invoice); - results.push({ invoice, hash }); - hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1); - console.log(` [${passLabel}] ${invoice.invoice_number} | ${invoice.invoice_date} | ${invoice.total_amount} ${invoice.currency}`); - return hashCounts.get(hash)!; - }; - - // OPTIMIZATION: Run Pass 1 (no OCR) in parallel with OCR -> Pass 2 (with OCR) - let ocrText = ''; - const pass1Promise = extractOnce(images, 1, '').catch((err) => ({ error: err })); - - // OCR then immediately Pass 2 - const ocrThenPass2Promise = (async () => { - ocrText = await extractOcrText(images[0]); - if (ocrText) { - console.log(` [OCR] Extracted ${ocrText.split('\n').length} text lines`); - } - return extractOnce(images, 2, ocrText).catch((err) => ({ error: err })); - })(); - - // Wait for both to complete - const [pass1Result, pass2Result] = await Promise.all([pass1Promise, ocrThenPass2Promise]); - - // Process Pass 1 result - if ('error' in pass1Result) { - console.log(` [Pass 1] Error: ${(pass1Result as {error: unknown}).error}`); - } else { - const count = addResult(pass1Result as IInvoice, 'Pass 1'); - if (count >= 2) { - console.log(` [Consensus] Reached after parallel passes`); - return pass1Result as IInvoice; - } - } - - // Process Pass 2 result - if ('error' in pass2Result) { - console.log(` [Pass 2+OCR] Error: ${(pass2Result as {error: unknown}).error}`); - } else { - const count = addResult(pass2Result as IInvoice, 'Pass 2+OCR'); - if (count >= 2) { - console.log(` [Consensus] Reached after parallel passes`); - return pass2Result as IInvoice; - } - } - - // Continue with passes 3+ using OCR text if no consensus yet - for (let pass = 3; pass <= maxPasses; pass++) { - try { - const invoice = await extractOnce(images, pass, ocrText); - const count = addResult(invoice, `Pass ${pass}+OCR`); - - if (count >= 2) { - console.log(` [Consensus] Reached after ${pass} passes`); - return invoice; - } - } catch (err) { - console.log(` [Pass ${pass}] Error: ${err}`); - } - } - - // No consensus reached - return the most common result - let bestHash = ''; - let bestCount = 0; - for (const [hash, count] of hashCounts) { - if (count > bestCount) { - bestCount = count; - bestHash = hash; - } - } - - if (!bestHash) { - throw new Error(`No valid results for ${invoiceName}`); - } - - const best = results.find((r) => r.hash === bestHash)!; - console.log(` [No consensus] Using most common result (${bestCount}/${maxPasses} passes)`); - return best.invoice; -} - -/** - * Compare extracted invoice against expected - */ -function compareInvoice( - extracted: IInvoice, - expected: IInvoice -): { match: boolean; errors: string[] } { - const errors: string[] = []; - - // Compare invoice number (normalize by removing spaces and case) - const extNum = extracted.invoice_number?.replace(/\s/g, '').toLowerCase() || ''; - const expNum = expected.invoice_number?.replace(/\s/g, '').toLowerCase() || ''; - if (extNum !== expNum) { - errors.push(`invoice_number: expected "${expected.invoice_number}", got "${extracted.invoice_number}"`); - } - - // Compare date - if (extracted.invoice_date !== expected.invoice_date) { - errors.push(`invoice_date: expected "${expected.invoice_date}", got "${extracted.invoice_date}"`); - } - - // Compare total amount (with tolerance) - if (Math.abs(extracted.total_amount - expected.total_amount) > 0.02) { - errors.push(`total_amount: expected ${expected.total_amount}, got ${extracted.total_amount}`); - } - - // Compare currency - if (extracted.currency?.toUpperCase() !== expected.currency?.toUpperCase()) { - errors.push(`currency: expected "${expected.currency}", got "${extracted.currency}"`); - } - - return { match: errors.length === 0, errors }; -} - -/** - * Find all test cases (PDF + JSON pairs) in .nogit/invoices/ - * Priority invoices (like vodafone) run first for quick feedback - */ -function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> { - const testDir = path.join(process.cwd(), '.nogit/invoices'); - if (!fs.existsSync(testDir)) { - return []; - } - - const files = fs.readdirSync(testDir); - const pdfFiles = files.filter((f) => f.endsWith('.pdf')); - const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = []; - - for (const pdf of pdfFiles) { - const baseName = pdf.replace('.pdf', ''); - const jsonFile = `${baseName}.json`; - if (files.includes(jsonFile)) { - testCases.push({ - name: baseName, - pdfPath: path.join(testDir, pdf), - jsonPath: path.join(testDir, jsonFile), - }); - } - } - - // Sort with priority invoices first, then alphabetically - const priorityPrefixes = ['vodafone']; - testCases.sort((a, b) => { - const aPriority = priorityPrefixes.findIndex((p) => a.name.startsWith(p)); - const bPriority = priorityPrefixes.findIndex((p) => b.name.startsWith(p)); - - // Both have priority - sort by priority order - if (aPriority >= 0 && bPriority >= 0) return aPriority - bPriority; - // Only a has priority - a comes first - if (aPriority >= 0) return -1; - // Only b has priority - b comes first - if (bPriority >= 0) return 1; - // Neither has priority - alphabetical - return a.name.localeCompare(b.name); - }); - - return testCases; -} - -// Tests - -tap.test('setup: ensure Docker containers are running', async () => { - console.log('\n[Setup] Checking Docker containers...\n'); - - // Ensure PaddleOCR-VL is running (auto-detects GPU/CPU) - const paddleOk = await ensurePaddleOcrVl(); - expect(paddleOk).toBeTrue(); - - // Ensure MiniCPM is running - const minicpmOk = await ensureMiniCpm(); - expect(minicpmOk).toBeTrue(); - - console.log('\n[Setup] All containers ready!\n'); -}); - -tap.test('should have MiniCPM-V 4.5 model loaded', async () => { - const response = await fetch(`${OLLAMA_URL}/api/tags`); - const data = await response.json(); - const modelNames = data.models.map((m: { name: string }) => m.name); - expect(modelNames.some((name: string) => name.includes('minicpm-v4.5'))).toBeTrue(); -}); - -// Dynamic test for each PDF/JSON pair -const testCases = findTestCases(); -console.log(`\nFound ${testCases.length} invoice test cases\n`); - -let passedCount = 0; -let failedCount = 0; -const processingTimes: number[] = []; - -for (const testCase of testCases) { - tap.test(`should extract invoice: ${testCase.name}`, async () => { - // Load expected data - const expected: IInvoice = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8')); - console.log(`\n=== ${testCase.name} ===`); - console.log(`Expected: ${expected.invoice_number} | ${expected.invoice_date} | ${expected.total_amount} ${expected.currency}`); - - const startTime = Date.now(); - - // Convert PDF to images - const images = convertPdfToImages(testCase.pdfPath); - console.log(` Pages: ${images.length}`); - - // Extract with consensus voting - const extracted = await extractWithConsensus(images, testCase.name); - - const endTime = Date.now(); - const elapsedMs = endTime - startTime; - processingTimes.push(elapsedMs); - - // Compare results - const result = compareInvoice(extracted, expected); - - if (result.match) { - passedCount++; - console.log(` Result: MATCH (${(elapsedMs / 1000).toFixed(1)}s)`); - } else { - failedCount++; - console.log(` Result: MISMATCH (${(elapsedMs / 1000).toFixed(1)}s)`); - result.errors.forEach((e) => console.log(` - ${e}`)); - } - - // Assert match - expect(result.match).toBeTrue(); - }); -} - -tap.test('summary', async () => { - const totalInvoices = testCases.length; - const accuracy = totalInvoices > 0 ? (passedCount / totalInvoices) * 100 : 0; - const totalTimeMs = processingTimes.reduce((a, b) => a + b, 0); - const avgTimeMs = processingTimes.length > 0 ? totalTimeMs / processingTimes.length : 0; - const avgTimeSec = avgTimeMs / 1000; - const totalTimeSec = totalTimeMs / 1000; - - console.log(`\n========================================`); - console.log(` Invoice Extraction Summary`); - console.log(`========================================`); - console.log(` Passed: ${passedCount}/${totalInvoices}`); - console.log(` Failed: ${failedCount}/${totalInvoices}`); - console.log(` Accuracy: ${accuracy.toFixed(1)}%`); - console.log(`----------------------------------------`); - console.log(` Total time: ${totalTimeSec.toFixed(1)}s`); - console.log(` Avg per inv: ${avgTimeSec.toFixed(1)}s`); - console.log(`========================================\n`); -}); - -export default tap.start(); diff --git a/test/test.invoices.minicpm.ts b/test/test.invoices.minicpm.ts index 0c190d9..fc2af81 100644 --- a/test/test.invoices.minicpm.ts +++ b/test/test.invoices.minicpm.ts @@ -1,8 +1,10 @@ /** - * Invoice extraction test using MiniCPM-V only (visual extraction) + * Invoice extraction test using MiniCPM-V (visual extraction) * - * Multi-query approach with thinking DISABLED for speed. - * Single pass, no consensus voting. + * Consensus approach: + * 1. Pass 1: Fast JSON extraction + * 2. Pass 2: Confirm with thinking enabled + * 3. If mismatch: repeat until consensus or max attempts */ import { tap, expect } from '@git.zone/tstest/tapbundle'; import * as fs from 'fs'; @@ -12,7 +14,7 @@ import * as os from 'os'; import { ensureMiniCpm } from './helpers/docker.js'; const OLLAMA_URL = 'http://localhost:11434'; -const MODEL = 'minicpm-v:latest'; +const MODEL = 'openbmb/minicpm-v4.5:q8_0'; interface IInvoice { invoice_number: string; @@ -33,7 +35,7 @@ function convertPdfToImages(pdfPath: string): string[] { try { execSync( - `convert -density 200 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`, + `convert -density 300 -quality 95 "${pdfPath}" -background white -alpha remove "${outputPattern}"`, { stdio: 'pipe' } ); @@ -52,20 +54,35 @@ function convertPdfToImages(pdfPath: string): string[] { } } +const JSON_PROMPT = `Extract invoice data from this image. Return ONLY a JSON object with these exact fields: +{ + "invoice_number": "the invoice number (not VAT ID, not customer ID)", + "invoice_date": "YYYY-MM-DD format", + "vendor_name": "company that issued the invoice", + "currency": "EUR, USD, or GBP", + "net_amount": 0.00, + "vat_amount": 0.00, + "total_amount": 0.00 +} +Return only the JSON, no explanation.`; + /** - * Query MiniCPM-V for a single field (thinking disabled for speed) + * Query MiniCPM-V for JSON output (fast, no thinking) */ -async function queryField(images: string[], question: string): Promise { - const response = await fetch(`${OLLAMA_URL}/api/generate`, { +async function queryJsonFast(images: string[]): Promise { + const response = await fetch(`${OLLAMA_URL}/api/chat`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ model: MODEL, - prompt: `/no_think\n${question}`, - images: images, + messages: [{ + role: 'user', + content: JSON_PROMPT, + images: images, + }], stream: false, options: { - num_predict: 500, + num_predict: 1000, temperature: 0.1, }, }), @@ -76,263 +93,223 @@ async function queryField(images: string[], question: string): Promise { } const data = await response.json(); - const content = (data.response || '').trim(); - - // Return full content (no thinking to filter) - return content; + return (data.message?.content || '').trim(); } /** - * Extract invoice data using multiple queries with validation + * Query MiniCPM-V for JSON output with thinking enabled (slower, more accurate) */ -async function extractInvoiceFromImages(images: string[]): Promise { - console.log(` [Vision] Processing ${images.length} page(s) with MiniCPM-V (multi-query + validation)`); +async function queryJsonWithThinking(images: string[]): Promise { + const response = await fetch(`${OLLAMA_URL}/api/chat`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + model: MODEL, + messages: [{ + role: 'user', + content: `Think carefully about this invoice image, then ${JSON_PROMPT}`, + images: images, + }], + stream: false, + options: { + num_predict: 2000, + temperature: 0.1, + }, + }), + }); - // Log each result as it comes in - const queryAndLog = async (name: string, question: string): Promise => { - const result = await queryField(images, question); - console.log(` [Query] ${name}: "${result}"`); - return result; - }; + if (!response.ok) { + throw new Error(`Ollama API error: ${response.status}`); + } - // STRATEGY 1: List-then-pick for invoice number (avoids confusion with VAT/customer IDs) - // Also ask for invoice number directly as backup - const [allNumbers, directInvoiceNum] = await Promise.all([ - queryAndLog('All Numbers ', `List ALL document numbers visible on this invoice. -For each number, identify what type it is. -Format: type:number, type:number -Example: >>>invoice:R0014359508, vat:DE123456789, customer:K001234<<<`), - queryAndLog('Invoice # Dir ', `What is the INVOICE NUMBER (Rechnungsnummer)? -NOT the VAT number (starts with DE/IE), NOT customer ID. -Look for "Invoice No.", "Rechnungsnr.", "Invoice #", or "Facture". -For Adobe: starts with IEE or R followed by digits. -Return ONLY the number: >>>IEE2022006460244<<<`), - ]); + const data = await response.json(); + return (data.message?.content || '').trim(); +} - // STRATEGY 2: Query each field with >>> <<< delimiters - const [invoiceDate, invoiceDateAlt, vendor, currency, totalAmount, netAmount, vatAmount] = await Promise.all([ - queryAndLog('Invoice Date ', `Find the INVOICE DATE (when issued, NOT due date). -Look for: "Invoice Date", "Rechnungsdatum", "Date", "Datum" -Return ONLY the date in YYYY-MM-DD format: >>>2024-01-15<<<`), +/** + * Parse amount from string (handles European format) + */ +function parseAmount(s: string | number | undefined): number { + if (s === undefined || s === null) return 0; + if (typeof s === 'number') return s; + const match = s.match(/([\d.,]+)/); + if (!match) return 0; + const numStr = match[1]; + // Handle European format: 1.234,56 → 1234.56 + const normalized = numStr.includes(',') && numStr.indexOf(',') > numStr.lastIndexOf('.') + ? numStr.replace(/\./g, '').replace(',', '.') + : numStr.replace(/,/g, ''); + return parseFloat(normalized) || 0; +} - // STRATEGY 3: Ask same question differently for verification - queryAndLog('Date Alt ', `What date appears next to the invoice number at the top? -Return ONLY YYYY-MM-DD format: >>>2024-01-15<<<`), +/** + * Extract invoice number from potentially verbose response + */ +function extractInvoiceNumber(s: string | undefined): string { + if (!s) return ''; + let clean = s.replace(/\*\*/g, '').replace(/`/g, '').trim(); + const patterns = [ + /\b([A-Z]{2,3}\d{10,})\b/i, // IEE2022006460244 + /\b([A-Z]\d{8,})\b/i, // R0014359508 + /\b(INV[-\s]?\d{4}[-\s]?\d+)\b/i, // INV-2024-001 + /\b(\d{7,})\b/, // 1579087430 + ]; + for (const pattern of patterns) { + const match = clean.match(pattern); + if (match) return match[1]; + } + return clean.replace(/[^A-Z0-9-]/gi, '').trim() || clean; +} - queryAndLog('Vendor ', `What company ISSUED this invoice (seller, not buyer)? -Look at letterhead/logo at top. -Return ONLY the company name: >>>Adobe Inc.<<<`), +/** + * Extract date (YYYY-MM-DD) from response + */ +function extractDate(s: string | undefined): string { + if (!s) return ''; + let clean = s.replace(/\*\*/g, '').replace(/`/g, '').trim(); + const isoMatch = clean.match(/(\d{4}-\d{2}-\d{2})/); + if (isoMatch) return isoMatch[1]; + // Try DD/MM/YYYY or DD.MM.YYYY + const dmyMatch = clean.match(/(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})/); + if (dmyMatch) { + return `${dmyMatch[3]}-${dmyMatch[2].padStart(2, '0')}-${dmyMatch[1].padStart(2, '0')}`; + } + return clean.replace(/[^\d-]/g, '').trim(); +} - queryAndLog('Currency ', `What currency symbol appears next to amounts? € $ or £? -Return the 3-letter code: >>>EUR<<<`), +/** + * Extract currency + */ +function extractCurrency(s: string | undefined): string { + if (!s) return 'EUR'; + const upper = s.toUpperCase(); + if (upper.includes('EUR') || upper.includes('€')) return 'EUR'; + if (upper.includes('USD') || upper.includes('$')) return 'USD'; + if (upper.includes('GBP') || upper.includes('£')) return 'GBP'; + return 'EUR'; +} - queryAndLog('Total Amount ', `What is the FINAL TOTAL amount (including tax) the customer must pay? -Look for "Total", "Grand Total", "Gesamtbetrag" at the bottom. -Return ONLY the number (no symbol): >>>24.99<<<`), +/** + * Extract JSON from response (handles markdown code blocks) + */ +function extractJsonFromResponse(response: string): Record | null { + // Try to find JSON in markdown code block + const codeBlockMatch = response.match(/```(?:json)?\s*([\s\S]*?)```/); + const jsonStr = codeBlockMatch ? codeBlockMatch[1].trim() : response.trim(); - queryAndLog('Net Amount ', `What is the NET/subtotal amount BEFORE tax? -Look for "Net", "Netto", "Subtotal". -Return ONLY the number: >>>20.99<<<`), - - queryAndLog('VAT Amount ', `What is the VAT/tax amount? -Look for "VAT", "MwSt", "USt", "Tax". -Return ONLY the number: >>>4.00<<<`), - ]); - - // Extract value from >>> <<< delimiters, or return original if not found - const extractDelimited = (s: string): string => { - const match = s.match(/>>>([^<]+)<< { - if (!s) return 0; - - // First try delimited format - const delimitedMatch = s.match(/>>>([^<]+)<< numStr.lastIndexOf('.') - ? numStr.replace(/\./g, '').replace(',', '.') - : numStr.replace(/,/g, ''); - return parseFloat(normalized) || 0; + try { + return JSON.parse(jsonStr); + } catch { + // Try to find JSON object pattern + const jsonMatch = jsonStr.match(/\{[\s\S]*\}/); + if (jsonMatch) { + try { + return JSON.parse(jsonMatch[0]); + } catch { + return null; } } - - // Try to find amount patterns in prose: "24.99", "24,99", "€24.99", "24.99 EUR" - const amountPatterns = [ - /(?:€|EUR|USD|GBP)\s*([\d.,]+)/i, // €24.99 or EUR 24.99 - /([\d.,]+)\s*(?:€|EUR|USD|GBP)/i, // 24.99 EUR or 24.99€ - /(?:is|amount|total)[:\s]+([\d.,]+)/i, // "is 24.99" or "amount: 24.99" - /\b(\d{1,3}(?:[.,]\d{2,3})*(?:[.,]\d{2}))\b/, // General number pattern with decimals - ]; - - for (const pattern of amountPatterns) { - const match = s.match(pattern); - if (match) { - const numStr = match[1]; - // European format: 1.234,56 → 1234.56 - const normalized = numStr.includes(',') && numStr.indexOf(',') > numStr.lastIndexOf('.') - ? numStr.replace(/\./g, '').replace(',', '.') - : numStr.replace(/,/g, ''); - const value = parseFloat(normalized); - if (value > 0) return value; - } - } - - return 0; - }; - - // STRATEGY 1: Parse "all numbers" to find invoice number - const extractInvoiceFromList = (allNums: string): string | null => { - const delimited = extractDelimited(allNums); - - // Find ALL "invoice:XXX" matches - const invoiceMatches = delimited.matchAll(/invoice[:\s]*([A-Z0-9-]+)/gi); - const candidates: string[] = []; - for (const match of invoiceMatches) { - const value = match[1]; - // Filter out labels like "USt-IdNr", "INVOICE", short strings - if (value.length > 5 && /\d{4,}/.test(value) && !/^(ust|vat|tax|nr|id|no)/i.test(value)) { - candidates.push(value); - } - } - if (candidates.length > 0) return candidates[0]; - - // Look for "rechnungsnr:XXX" pattern - const rechnungMatch = delimited.match(/rechnung[snr]*[:\s]*([A-Z0-9-]{6,})/i); - if (rechnungMatch && /\d{4,}/.test(rechnungMatch[1])) return rechnungMatch[1]; - - // Look for patterns like IEE2022..., R001... (Adobe invoice number patterns) - const adobeMatch = delimited.match(/\b(IEE\d{10,})\b/i); - if (adobeMatch) return adobeMatch[1]; - const rInvoiceMatch = delimited.match(/\b(R\d{8,})\b/i); - if (rInvoiceMatch) return rInvoiceMatch[1]; - return null; - }; - - // Fallback invoice number extraction - const extractInvoiceNumber = (s: string): string => { - const delimited = extractDelimited(s); - if (delimited !== s.trim()) return delimited; - - let clean = s.replace(/\*\*/g, '').replace(/`/g, ''); - const patterns = [ - /\b([A-Z]{2,3}\d{10,})\b/i, - /\b([A-Z]\d{8,})\b/i, - /\b(INV[-\s]?\d{4}[-\s]?\d+)\b/i, - /\b(\d{7,})\b/, - ]; - for (const pattern of patterns) { - const match = clean.match(pattern); - if (match) return match[1]; - } - return clean.replace(/[^A-Z0-9-]/gi, '').trim() || clean.trim(); - }; - - // Extract date with fallback - const extractDate = (s: string): string => { - const delimited = extractDelimited(s); - if (/^\d{4}-\d{2}-\d{2}$/.test(delimited)) return delimited; - - let clean = s.replace(/\*\*/g, '').replace(/`/g, ''); - const isoMatch = clean.match(/(\d{4}-\d{2}-\d{2})/); - if (isoMatch) return isoMatch[1]; - const dmmyMatch = clean.match(/(\d{1,2})[-\/]([A-Z]{3})[-\/](\d{4})/i); - if (dmmyMatch) { - const monthMap: Record = { - JAN: '01', FEB: '02', MAR: '03', APR: '04', MAY: '05', JUN: '06', - JUL: '07', AUG: '08', SEP: '09', OCT: '10', NOV: '11', DEC: '12', - }; - return `${dmmyMatch[3]}-${monthMap[dmmyMatch[2].toUpperCase()] || '01'}-${dmmyMatch[1].padStart(2, '0')}`; - } - const dmyMatch = clean.match(/(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})/); - if (dmyMatch) { - return `${dmyMatch[3]}-${dmyMatch[2].padStart(2, '0')}-${dmyMatch[1].padStart(2, '0')}`; - } - return ''; - }; - - // Extract currency - const extractCurrency = (s: string): string => { - const delimited = extractDelimited(s); - if (['EUR', 'USD', 'GBP'].includes(delimited.toUpperCase())) return delimited.toUpperCase(); - const upper = s.toUpperCase(); - if (upper.includes('EUR') || upper.includes('€')) return 'EUR'; - if (upper.includes('USD') || upper.includes('$')) return 'USD'; - if (upper.includes('GBP') || upper.includes('£')) return 'GBP'; - return 'EUR'; - }; - - // Extract vendor - const extractVendor = (s: string): string => { - const delimited = extractDelimited(s); - if (delimited !== s.trim()) return delimited; - let clean = s.replace(/\*\*/g, '').replace(/`/g, '').trim(); - if (clean.length < 50) return clean.replace(/[."]+$/, '').trim(); - const companyMatch = clean.match(/([A-Z][A-Za-z0-9\s&]+(?:Ltd|Limited|GmbH|Inc|BV|AG|SE|LLC|Co|Corp)[.]?)/i); - if (companyMatch) return companyMatch[1].trim(); - return clean; - }; - - // STRATEGY 1: Get invoice number - try multiple approaches - // 1. From list with type labels - // 2. From direct query - // 3. From pattern matching - const fromList = extractInvoiceFromList(allNumbers); - const fromDirect = extractInvoiceNumber(directInvoiceNum); - const fromFallback = extractInvoiceNumber(allNumbers); - - // Prefer direct query if it has digits, otherwise use list - const invoiceNumber = (fromDirect && /\d{6,}/.test(fromDirect)) ? fromDirect : - (fromList && /\d{4,}/.test(fromList)) ? fromList : - fromDirect || fromList || fromFallback; - console.log(` [Parsed] Invoice Number: "${invoiceNumber}" (list: ${fromList}, direct: ${fromDirect})`); - - // STRATEGY 3: Compare two date responses, pick the valid one - const date1 = extractDate(invoiceDate); - const date2 = extractDate(invoiceDateAlt); - const finalDate = date1 || date2; - if (date1 && date2 && date1 !== date2) { - console.log(` [Validate] Date mismatch: "${date1}" vs "${date2}" - using first`); } +} - // Parse amounts - let total = parseAmount(totalAmount); - let net = parseAmount(netAmount); - let vat = parseAmount(vatAmount); - - // STRATEGY 4: Cross-field validation for amounts - // If amounts seem wrong (e.g., 1690 instead of 1.69), try to fix - if (total > 10000 && net < 100) { - console.log(` [Validate] Total ${total} seems too high vs net ${net}, dividing by 100`); - total = total / 100; - } - if (net > 10000 && total < 100) { - console.log(` [Validate] Net ${net} seems too high vs total ${total}, dividing by 100`); - net = net / 100; - } - - // Check if Net + VAT ≈ Total - if (net > 0 && vat >= 0 && total > 0) { - const calculated = net + vat; - if (Math.abs(calculated - total) > 1) { - console.log(` [Validate] Math check: ${net} + ${vat} = ${calculated} ≠ ${total}`); - } - } +/** + * Parse JSON response into IInvoice + */ +function parseJsonToInvoice(response: string): IInvoice | null { + const parsed = extractJsonFromResponse(response); + if (!parsed) return null; return { - invoice_number: invoiceNumber, - invoice_date: finalDate, - vendor_name: extractVendor(vendor), - currency: extractCurrency(currency), - net_amount: net, - vat_amount: vat, - total_amount: total, + invoice_number: extractInvoiceNumber(String(parsed.invoice_number || '')), + invoice_date: extractDate(String(parsed.invoice_date || '')), + vendor_name: String(parsed.vendor_name || '').replace(/\*\*/g, '').replace(/`/g, '').trim(), + currency: extractCurrency(String(parsed.currency || '')), + net_amount: parseAmount(parsed.net_amount as string | number), + vat_amount: parseAmount(parsed.vat_amount as string | number), + total_amount: parseAmount(parsed.total_amount as string | number), + }; +} + +/** + * Compare two invoices for consensus (key fields must match) + */ +function invoicesMatch(a: IInvoice, b: IInvoice): boolean { + const numMatch = a.invoice_number.toLowerCase() === b.invoice_number.toLowerCase(); + const dateMatch = a.invoice_date === b.invoice_date; + const totalMatch = Math.abs(a.total_amount - b.total_amount) < 0.02; + return numMatch && dateMatch && totalMatch; +} + +/** + * Extract invoice data using consensus approach: + * 1. Pass 1: Fast JSON extraction + * 2. Pass 2: Confirm with thinking enabled + * 3. If mismatch: repeat until consensus or max 5 attempts + */ +async function extractInvoiceFromImages(images: string[]): Promise { + console.log(` [Vision] Processing ${images.length} page(s) with ${MODEL} (consensus)`); + + const MAX_ATTEMPTS = 5; + let attempt = 0; + + while (attempt < MAX_ATTEMPTS) { + attempt++; + console.log(` [Attempt ${attempt}/${MAX_ATTEMPTS}]`); + + // PASS 1: Fast JSON extraction + console.log(` [Pass 1] Fast extraction...`); + const fastResponse = await queryJsonFast(images); + const fastInvoice = parseJsonToInvoice(fastResponse); + + if (!fastInvoice) { + console.log(` [Pass 1] JSON parsing failed, retrying...`); + continue; + } + console.log(` [Pass 1] Result: ${fastInvoice.invoice_number} | ${fastInvoice.invoice_date} | ${fastInvoice.total_amount} ${fastInvoice.currency}`); + + // PASS 2: Confirm with thinking + console.log(` [Pass 2] Thinking confirmation...`); + const thinkResponse = await queryJsonWithThinking(images); + const thinkInvoice = parseJsonToInvoice(thinkResponse); + + if (!thinkInvoice) { + console.log(` [Pass 2] JSON parsing failed, retrying...`); + continue; + } + console.log(` [Pass 2] Result: ${thinkInvoice.invoice_number} | ${thinkInvoice.invoice_date} | ${thinkInvoice.total_amount} ${thinkInvoice.currency}`); + + // Check consensus + if (invoicesMatch(fastInvoice, thinkInvoice)) { + console.log(` [Consensus] MATCH - using result`); + return thinkInvoice; // Prefer thinking result + } + + console.log(` [Consensus] MISMATCH - repeating...`); + console.log(` Fast: ${fastInvoice.invoice_number} | ${fastInvoice.invoice_date} | ${fastInvoice.total_amount}`); + console.log(` Think: ${thinkInvoice.invoice_number} | ${thinkInvoice.invoice_date} | ${thinkInvoice.total_amount}`); + } + + // Max attempts reached - do one final thinking pass and use that + console.log(` [Final] Max attempts reached, using final thinking pass`); + const finalResponse = await queryJsonWithThinking(images); + const finalInvoice = parseJsonToInvoice(finalResponse); + + if (finalInvoice) { + console.log(` [Final] Result: ${finalInvoice.invoice_number} | ${finalInvoice.invoice_date} | ${finalInvoice.total_amount} ${finalInvoice.currency}`); + return finalInvoice; + } + + // Return empty invoice if all else fails + console.log(` [Final] All parsing failed, returning empty`); + return { + invoice_number: '', + invoice_date: '', + vendor_name: '', + currency: 'EUR', + net_amount: 0, + vat_amount: 0, + total_amount: 0, }; } @@ -485,9 +462,9 @@ tap.test('summary', async () => { const avgTimeSec = processingTimes.length > 0 ? totalTimeMs / processingTimes.length / 1000 : 0; console.log(`\n========================================`); - console.log(` Invoice Extraction Summary (MiniCPM)`); + console.log(` Invoice Extraction Summary (${MODEL})`); console.log(`========================================`); - console.log(` Method: Multi-query (no_think)`); + console.log(` Method: Consensus (fast + thinking)`); console.log(` Passed: ${passedCount}/${totalInvoices}`); console.log(` Failed: ${failedCount}/${totalInvoices}`); console.log(` Accuracy: ${accuracy.toFixed(1)}%`); diff --git a/test/test.invoices.ministral3.ts b/test/test.invoices.ministral3.ts deleted file mode 100644 index cf677f9..0000000 --- a/test/test.invoices.ministral3.ts +++ /dev/null @@ -1,334 +0,0 @@ -/** - * Invoice extraction using Ministral 3 Vision (Direct) - * - * NO PaddleOCR needed - Ministral 3 has built-in vision encoder: - * 1. Convert PDF to images - * 2. Send images directly to Ministral 3 via Ollama - * 3. Extract structured JSON with native schema support - * - * This is the simplest possible pipeline. - */ -import { tap, expect } from '@git.zone/tstest/tapbundle'; -import * as fs from 'fs'; -import * as path from 'path'; -import { execSync } from 'child_process'; -import * as os from 'os'; -import { ensureMinistral3 } from './helpers/docker.js'; - -const OLLAMA_URL = 'http://localhost:11434'; -const VISION_MODEL = 'ministral-3:8b'; - -interface IInvoice { - invoice_number: string; - invoice_date: string; - vendor_name: string; - currency: string; - net_amount: number; - vat_amount: number; - total_amount: number; -} - -/** - * Convert PDF to PNG images using ImageMagick - */ -function convertPdfToImages(pdfPath: string): string[] { - const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-')); - const outputPattern = path.join(tempDir, 'page-%d.png'); - - try { - // High quality conversion: 300 DPI, max quality, sharpen for better OCR - execSync( - `convert -density 300 -quality 100 "${pdfPath}" -background white -alpha remove -sharpen 0x1 "${outputPattern}"`, - { stdio: 'pipe' } - ); - - const files = fs.readdirSync(tempDir).filter((f) => f.endsWith('.png')).sort(); - const images: string[] = []; - - for (const file of files) { - const imagePath = path.join(tempDir, file); - const imageData = fs.readFileSync(imagePath); - images.push(imageData.toString('base64')); - } - - return images; - } finally { - fs.rmSync(tempDir, { recursive: true, force: true }); - } -} - -/** - * Extract invoice data directly from images using Ministral 3 Vision - */ -async function extractInvoiceFromImages(images: string[]): Promise { - console.log(` [Vision] Processing ${images.length} page(s) with Ministral 3`); - - // JSON schema for structured output - const invoiceSchema = { - type: 'object', - properties: { - invoice_number: { type: 'string' }, - invoice_date: { type: 'string' }, - vendor_name: { type: 'string' }, - currency: { type: 'string' }, - net_amount: { type: 'number' }, - vat_amount: { type: 'number' }, - total_amount: { type: 'number' }, - }, - required: ['invoice_number', 'invoice_date', 'vendor_name', 'currency', 'net_amount', 'vat_amount', 'total_amount'], - }; - - const prompt = `You are an expert invoice data extraction system. Carefully analyze this invoice document and extract the following fields with high precision. - -INVOICE NUMBER: -- Look for labels: "Invoice No", "Invoice #", "Invoice Number", "Rechnung Nr", "Rechnungsnummer", "Document No", "Bill No", "Reference" -- Usually alphanumeric, often starts with letters (e.g., R0014359508, INV-2024-001) -- Located near the top of the invoice - -INVOICE DATE: -- Look for labels: "Invoice Date", "Date", "Datum", "Rechnungsdatum", "Issue Date", "Bill Date" -- Convert ANY date format to YYYY-MM-DD (e.g., 14/10/2021 → 2021-10-14, Oct 14, 2021 → 2021-10-14) -- Usually near the invoice number - -VENDOR NAME: -- The company ISSUING the invoice (not the recipient) -- Found in letterhead, logo area, or header - typically the largest/most prominent company name -- Examples: "Hetzner Online GmbH", "Adobe Inc", "DigitalOcean LLC" - -CURRENCY: -- Detect from symbols: € = EUR, $ = USD, £ = GBP -- Or from text: "EUR", "USD", "GBP" -- Default to EUR if unclear - -AMOUNTS (Critical - read carefully!): -- total_amount: The FINAL amount due/payable - look for "Total", "Grand Total", "Amount Due", "Balance Due", "Gesamtbetrag", "Endbetrag" -- net_amount: Subtotal BEFORE tax - look for "Subtotal", "Net", "Netto", "excl. VAT" -- vat_amount: Tax amount - look for "VAT", "Tax", "MwSt", "USt", "19%", "20%" -- For multi-page invoices: the FINAL totals are usually on the LAST page - -Return ONLY valid JSON with the extracted values.`; - - const response = await fetch(`${OLLAMA_URL}/api/chat`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - model: VISION_MODEL, - messages: [ - { - role: 'user', - content: prompt, - images: images, // Send all page images - }, - ], - format: invoiceSchema, - stream: true, - options: { - num_predict: 1024, - temperature: 0.0, - }, - }), - }); - - if (!response.ok) { - throw new Error(`Ollama API error: ${response.status}`); - } - - const reader = response.body?.getReader(); - if (!reader) { - throw new Error('No response body'); - } - - const decoder = new TextDecoder(); - let fullText = ''; - - while (true) { - const { done, value } = await reader.read(); - if (done) break; - - const chunk = decoder.decode(value, { stream: true }); - const lines = chunk.split('\n').filter((l) => l.trim()); - - for (const line of lines) { - try { - const json = JSON.parse(line); - if (json.message?.content) { - fullText += json.message.content; - } - } catch { - // Skip invalid JSON lines - } - } - } - - // Parse JSON response - let jsonStr = fullText.trim(); - - if (jsonStr.startsWith('```json')) jsonStr = jsonStr.slice(7); - else if (jsonStr.startsWith('```')) jsonStr = jsonStr.slice(3); - if (jsonStr.endsWith('```')) jsonStr = jsonStr.slice(0, -3); - jsonStr = jsonStr.trim(); - - const startIdx = jsonStr.indexOf('{'); - const endIdx = jsonStr.lastIndexOf('}') + 1; - - if (startIdx < 0 || endIdx <= startIdx) { - throw new Error(`No JSON found: ${fullText.substring(0, 200)}`); - } - - const parsed = JSON.parse(jsonStr.substring(startIdx, endIdx)); - - return { - invoice_number: parsed.invoice_number || null, - invoice_date: parsed.invoice_date || null, - vendor_name: parsed.vendor_name || null, - currency: parsed.currency || 'EUR', - net_amount: parseFloat(parsed.net_amount) || 0, - vat_amount: parseFloat(parsed.vat_amount) || 0, - total_amount: parseFloat(parsed.total_amount) || 0, - }; -} - - -/** - * Normalize date to YYYY-MM-DD - */ -function normalizeDate(dateStr: string | null): string { - if (!dateStr) return ''; - if (/^\d{4}-\d{2}-\d{2}$/.test(dateStr)) return dateStr; - - const monthMap: Record = { - JAN: '01', FEB: '02', MAR: '03', APR: '04', MAY: '05', JUN: '06', - JUL: '07', AUG: '08', SEP: '09', OCT: '10', NOV: '11', DEC: '12', - }; - - let match = dateStr.match(/^(\d{1,2})-([A-Z]{3})-(\d{4})$/i); - if (match) { - return `${match[3]}-${monthMap[match[2].toUpperCase()] || '01'}-${match[1].padStart(2, '0')}`; - } - - match = dateStr.match(/^(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})$/); - if (match) { - return `${match[3]}-${match[2].padStart(2, '0')}-${match[1].padStart(2, '0')}`; - } - - return dateStr; -} - -/** - * Compare extracted vs expected - */ -function compareInvoice(extracted: IInvoice, expected: IInvoice): { match: boolean; errors: string[] } { - const errors: string[] = []; - - const extNum = extracted.invoice_number?.replace(/\s/g, '').toLowerCase() || ''; - const expNum = expected.invoice_number?.replace(/\s/g, '').toLowerCase() || ''; - if (extNum !== expNum) { - errors.push(`invoice_number: expected "${expected.invoice_number}", got "${extracted.invoice_number}"`); - } - - if (normalizeDate(extracted.invoice_date) !== normalizeDate(expected.invoice_date)) { - errors.push(`invoice_date: expected "${expected.invoice_date}", got "${extracted.invoice_date}"`); - } - - if (Math.abs(extracted.total_amount - expected.total_amount) > 0.02) { - errors.push(`total_amount: expected ${expected.total_amount}, got ${extracted.total_amount}`); - } - - if (extracted.currency?.toUpperCase() !== expected.currency?.toUpperCase()) { - errors.push(`currency: expected "${expected.currency}", got "${extracted.currency}"`); - } - - return { match: errors.length === 0, errors }; -} - -/** - * Find test cases - */ -function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> { - const testDir = path.join(process.cwd(), '.nogit/invoices'); - if (!fs.existsSync(testDir)) return []; - - const files = fs.readdirSync(testDir); - const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = []; - - for (const pdf of files.filter((f) => f.endsWith('.pdf'))) { - const baseName = pdf.replace('.pdf', ''); - const jsonFile = `${baseName}.json`; - if (files.includes(jsonFile)) { - testCases.push({ - name: baseName, - pdfPath: path.join(testDir, pdf), - jsonPath: path.join(testDir, jsonFile), - }); - } - } - - return testCases.sort((a, b) => a.name.localeCompare(b.name)); -} - -// Tests - -tap.test('setup: ensure Ministral 3 is running', async () => { - console.log('\n[Setup] Checking Ministral 3...\n'); - const ok = await ensureMinistral3(); - expect(ok).toBeTrue(); - console.log('\n[Setup] Ready!\n'); -}); - -const testCases = findTestCases(); -console.log(`\nFound ${testCases.length} invoice test cases (Ministral 3 Vision Direct)\n`); - -let passedCount = 0; -let failedCount = 0; -const times: number[] = []; - -for (const testCase of testCases) { - tap.test(`should extract invoice: ${testCase.name}`, async () => { - const expected: IInvoice = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8')); - console.log(`\n=== ${testCase.name} ===`); - console.log(`Expected: ${expected.invoice_number} | ${expected.invoice_date} | ${expected.total_amount} ${expected.currency}`); - - const start = Date.now(); - const images = convertPdfToImages(testCase.pdfPath); - console.log(` Pages: ${images.length}`); - - const extracted = await extractInvoiceFromImages(images); - console.log(` Extracted: ${extracted.invoice_number} | ${extracted.invoice_date} | ${extracted.total_amount} ${extracted.currency}`); - const elapsed = Date.now() - start; - times.push(elapsed); - - const result = compareInvoice(extracted, expected); - - if (result.match) { - passedCount++; - console.log(` Result: MATCH (${(elapsed / 1000).toFixed(1)}s)`); - } else { - failedCount++; - console.log(` Result: MISMATCH (${(elapsed / 1000).toFixed(1)}s)`); - result.errors.forEach((e) => console.log(` - ${e}`)); - } - - expect(result.match).toBeTrue(); - }); -} - -tap.test('summary', async () => { - const total = testCases.length; - const accuracy = total > 0 ? (passedCount / total) * 100 : 0; - const totalTime = times.reduce((a, b) => a + b, 0) / 1000; - const avgTime = times.length > 0 ? totalTime / times.length : 0; - - console.log(`\n======================================================`); - console.log(` Invoice Extraction Summary (Ministral 3 Vision)`); - console.log(`======================================================`); - console.log(` Method: Ministral 3 8B Vision (Direct)`); - console.log(` Passed: ${passedCount}/${total}`); - console.log(` Failed: ${failedCount}/${total}`); - console.log(` Accuracy: ${accuracy.toFixed(1)}%`); - console.log(`------------------------------------------------------`); - console.log(` Total time: ${totalTime.toFixed(1)}s`); - console.log(` Avg per inv: ${avgTime.toFixed(1)}s`); - console.log(`======================================================\n`); -}); - -export default tap.start(); diff --git a/test/test.invoices.paddleocr-vl.ts b/test/test.invoices.paddleocr-vl.ts deleted file mode 100644 index 00e8ebd..0000000 --- a/test/test.invoices.paddleocr-vl.ts +++ /dev/null @@ -1,490 +0,0 @@ -/** - * Invoice extraction test using PaddleOCR-VL Full Pipeline - * - * This tests the complete PaddleOCR-VL pipeline: - * 1. PP-DocLayoutV2 for layout detection - * 2. PaddleOCR-VL for recognition - * 3. Structured HTML output (semantic tags with proper tables) - * 4. Qwen2.5 extracts invoice fields from structured HTML - * - * HTML output is used instead of Markdown because: - * - tags are unambiguous (no parser variations) - * - LLMs are heavily trained on web/HTML data - * - Semantic tags (header, footer, section) provide clear structure - */ -import { tap, expect } from '@git.zone/tstest/tapbundle'; -import * as fs from 'fs'; -import * as path from 'path'; -import { execSync } from 'child_process'; -import * as os from 'os'; -import { ensurePaddleOcrVlFull, ensureQwen25 } from './helpers/docker.js'; - -const PADDLEOCR_VL_URL = 'http://localhost:8000'; -const OLLAMA_URL = 'http://localhost:11434'; -// Use Qwen2.5 for text-only JSON extraction (not MiniCPM which is vision-focused) -const TEXT_MODEL = 'qwen2.5:7b'; - -interface IInvoice { - invoice_number: string; - invoice_date: string; - vendor_name: string; - currency: string; - net_amount: number; - vat_amount: number; - total_amount: number; -} - -/** - * Convert PDF to PNG images using ImageMagick - */ -function convertPdfToImages(pdfPath: string): string[] { - const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-')); - const outputPattern = path.join(tempDir, 'page-%d.png'); - - try { - execSync( - `convert -density 200 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`, - { stdio: 'pipe' } - ); - - const files = fs.readdirSync(tempDir).filter((f) => f.endsWith('.png')).sort(); - const images: string[] = []; - - for (const file of files) { - const imagePath = path.join(tempDir, file); - const imageData = fs.readFileSync(imagePath); - images.push(imageData.toString('base64')); - } - - return images; - } finally { - fs.rmSync(tempDir, { recursive: true, force: true }); - } -} - -/** - * Parse document using PaddleOCR-VL Full Pipeline (returns structured HTML) - */ -async function parseDocument(imageBase64: string): Promise { - const response = await fetch(`${PADDLEOCR_VL_URL}/parse`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - image: imageBase64, - output_format: 'html', - }), - }); - - if (!response.ok) { - const text = await response.text(); - throw new Error(`PaddleOCR-VL API error: ${response.status} - ${text}`); - } - - const data = await response.json(); - - if (!data.success) { - throw new Error(`PaddleOCR-VL error: ${data.error}`); - } - - return data.result?.html || ''; -} - -/** - * Extract invoice fields using simple direct prompt - * The OCR output has clearly labeled fields - just ask the LLM to read them - */ -async function extractInvoiceFromHtml(html: string): Promise { - // OCR output is already good - just truncate if too long - const truncated = html.length > 32000 ? html.slice(0, 32000) : html; - console.log(` [Extract] ${truncated.length} chars of HTML`); - - // JSON schema for structured output - const invoiceSchema = { - type: 'object', - properties: { - invoice_number: { type: 'string' }, - invoice_date: { type: 'string' }, - vendor_name: { type: 'string' }, - currency: { type: 'string' }, - net_amount: { type: 'number' }, - vat_amount: { type: 'number' }, - total_amount: { type: 'number' }, - }, - required: ['invoice_number', 'invoice_date', 'vendor_name', 'currency', 'net_amount', 'vat_amount', 'total_amount'], - }; - - // Simple, direct prompt - the OCR output already has labeled fields - const systemPrompt = `You read invoice HTML and extract labeled fields. Return JSON only.`; - - const userPrompt = `Extract from this invoice HTML: -- invoice_number: Find "Invoice no.", "Invoice #", "Invoice", "Rechnung", "Document No" and extract the value -- invoice_date: Find "Invoice date", "Date", "Datum" and convert to YYYY-MM-DD format -- vendor_name: The company name issuing the invoice (in header/letterhead) -- currency: EUR, USD, or GBP (look for € $ £ symbols or text) -- total_amount: Find "Total", "Grand Total", "Amount Due", "Gesamtbetrag" - the FINAL total amount -- net_amount: Amount before VAT/tax (Subtotal, Net) -- vat_amount: VAT/tax amount - -HTML: -${truncated} - -Return ONLY valid JSON: {"invoice_number":"...", "invoice_date":"YYYY-MM-DD", "vendor_name":"...", "currency":"EUR", "net_amount":0, "vat_amount":0, "total_amount":0}`; - - const response = await fetch(`${OLLAMA_URL}/api/chat`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - model: TEXT_MODEL, - messages: [ - { role: 'system', content: systemPrompt }, - { role: 'user', content: userPrompt }, - ], - format: invoiceSchema, - stream: true, - options: { num_predict: 512, temperature: 0.0 }, - }), - }); - - if (!response.ok) { - throw new Error(`Ollama API error: ${response.status}`); - } - - const reader = response.body?.getReader(); - if (!reader) { - throw new Error('No response body'); - } - - const decoder = new TextDecoder(); - let fullText = ''; - - while (true) { - const { done, value } = await reader.read(); - if (done) break; - - const chunk = decoder.decode(value, { stream: true }); - const lines = chunk.split('\n').filter((l) => l.trim()); - - for (const line of lines) { - try { - const json = JSON.parse(line); - if (json.message?.content) { - fullText += json.message.content; - } else if (json.response) { - fullText += json.response; - } - } catch { - // Skip invalid JSON lines - } - } - } - - // Extract JSON from response - let jsonStr = fullText.trim(); - - // Remove markdown code block if present - if (jsonStr.startsWith('```json')) { - jsonStr = jsonStr.slice(7); - } else if (jsonStr.startsWith('```')) { - jsonStr = jsonStr.slice(3); - } - if (jsonStr.endsWith('```')) { - jsonStr = jsonStr.slice(0, -3); - } - jsonStr = jsonStr.trim(); - - // Find JSON object boundaries - const startIdx = jsonStr.indexOf('{'); - const endIdx = jsonStr.lastIndexOf('}') + 1; - - if (startIdx < 0 || endIdx <= startIdx) { - throw new Error(`No JSON object found in response: ${fullText.substring(0, 200)}`); - } - - jsonStr = jsonStr.substring(startIdx, endIdx); - - let parsed; - try { - parsed = JSON.parse(jsonStr); - } catch (e) { - throw new Error(`Invalid JSON: ${jsonStr.substring(0, 200)}`); - } - - // Normalize response to expected format - return { - invoice_number: parsed.invoice_number || null, - invoice_date: parsed.invoice_date || null, - vendor_name: parsed.vendor_name || null, - currency: parsed.currency || 'EUR', - net_amount: parseFloat(parsed.net_amount) || 0, - vat_amount: parseFloat(parsed.vat_amount) || 0, - total_amount: parseFloat(parsed.total_amount) || 0, - }; -} - -/** - * Single extraction pass: Parse with PaddleOCR-VL Full, extract with Qwen2.5 (text-only) - * Processes ALL pages and concatenates HTML for multi-page invoice support - */ -async function extractOnce(images: string[], passNum: number): Promise { - // Parse ALL pages and concatenate HTML with page markers - const htmlParts: string[] = []; - - for (let i = 0; i < images.length; i++) { - const pageHtml = await parseDocument(images[i]); - // Add page marker for context - htmlParts.push(`\n${pageHtml}`); - } - - const fullHtml = htmlParts.join('\n\n'); - console.log(` [Parse] Got ${fullHtml.split('\n').length} lines from ${images.length} page(s)`); - - // Extract invoice fields from HTML using text-only model (no images) - return extractInvoiceFromHtml(fullHtml); -} - -/** - * Create a hash of invoice for comparison (using key fields) - */ -function hashInvoice(invoice: IInvoice): string { - // Ensure total_amount is a number - const amount = typeof invoice.total_amount === 'number' - ? invoice.total_amount.toFixed(2) - : String(invoice.total_amount || 0); - return `${invoice.invoice_number}|${invoice.invoice_date}|${amount}`; -} - -/** - * Extract with consensus voting - */ -async function extractWithConsensus(images: string[], invoiceName: string, maxPasses: number = 5): Promise { - const results: Array<{ invoice: IInvoice; hash: string }> = []; - const hashCounts: Map = new Map(); - - const addResult = (invoice: IInvoice, passLabel: string): number => { - const hash = hashInvoice(invoice); - results.push({ invoice, hash }); - hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1); - console.log(` [${passLabel}] ${invoice.invoice_number} | ${invoice.invoice_date} | ${invoice.total_amount} ${invoice.currency}`); - return hashCounts.get(hash)!; - }; - - for (let pass = 1; pass <= maxPasses; pass++) { - try { - const invoice = await extractOnce(images, pass); - const count = addResult(invoice, `Pass ${pass}`); - - if (count >= 2) { - console.log(` [Consensus] Reached after ${pass} passes`); - return invoice; - } - } catch (err) { - console.log(` [Pass ${pass}] Error: ${err}`); - } - } - - // No consensus reached - return the most common result - let bestHash = ''; - let bestCount = 0; - for (const [hash, count] of hashCounts) { - if (count > bestCount) { - bestCount = count; - bestHash = hash; - } - } - - if (!bestHash) { - throw new Error(`No valid results for ${invoiceName}`); - } - - const best = results.find((r) => r.hash === bestHash)!; - console.log(` [No consensus] Using most common result (${bestCount}/${maxPasses} passes)`); - return best.invoice; -} - -/** - * Normalize date to YYYY-MM-DD format - */ -function normalizeDate(dateStr: string | null): string { - if (!dateStr) return ''; - - // Already in correct format - if (/^\d{4}-\d{2}-\d{2}$/.test(dateStr)) { - return dateStr; - } - - // Handle DD-MMM-YYYY format (e.g., "28-JUN-2022") - const monthMap: Record = { - JAN: '01', FEB: '02', MAR: '03', APR: '04', MAY: '05', JUN: '06', - JUL: '07', AUG: '08', SEP: '09', OCT: '10', NOV: '11', DEC: '12', - }; - - const match = dateStr.match(/^(\d{1,2})-([A-Z]{3})-(\d{4})$/i); - if (match) { - const day = match[1].padStart(2, '0'); - const month = monthMap[match[2].toUpperCase()] || '01'; - const year = match[3]; - return `${year}-${month}-${day}`; - } - - // Handle DD/MM/YYYY or DD.MM.YYYY - const match2 = dateStr.match(/^(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})$/); - if (match2) { - const day = match2[1].padStart(2, '0'); - const month = match2[2].padStart(2, '0'); - const year = match2[3]; - return `${year}-${month}-${day}`; - } - - return dateStr; -} - -/** - * Compare extracted invoice against expected - */ -function compareInvoice( - extracted: IInvoice, - expected: IInvoice -): { match: boolean; errors: string[] } { - const errors: string[] = []; - - // Compare invoice number (normalize by removing spaces and case) - const extNum = extracted.invoice_number?.replace(/\s/g, '').toLowerCase() || ''; - const expNum = expected.invoice_number?.replace(/\s/g, '').toLowerCase() || ''; - if (extNum !== expNum) { - errors.push(`invoice_number: expected "${expected.invoice_number}", got "${extracted.invoice_number}"`); - } - - // Compare date (normalize format first) - const extDate = normalizeDate(extracted.invoice_date); - const expDate = normalizeDate(expected.invoice_date); - if (extDate !== expDate) { - errors.push(`invoice_date: expected "${expected.invoice_date}", got "${extracted.invoice_date}"`); - } - - // Compare total amount (with tolerance) - if (Math.abs(extracted.total_amount - expected.total_amount) > 0.02) { - errors.push(`total_amount: expected ${expected.total_amount}, got ${extracted.total_amount}`); - } - - // Compare currency - if (extracted.currency?.toUpperCase() !== expected.currency?.toUpperCase()) { - errors.push(`currency: expected "${expected.currency}", got "${extracted.currency}"`); - } - - return { match: errors.length === 0, errors }; -} - -/** - * Find all test cases (PDF + JSON pairs) in .nogit/invoices/ - */ -function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> { - const testDir = path.join(process.cwd(), '.nogit/invoices'); - if (!fs.existsSync(testDir)) { - return []; - } - - const files = fs.readdirSync(testDir); - const pdfFiles = files.filter((f) => f.endsWith('.pdf')); - const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = []; - - for (const pdf of pdfFiles) { - const baseName = pdf.replace('.pdf', ''); - const jsonFile = `${baseName}.json`; - if (files.includes(jsonFile)) { - testCases.push({ - name: baseName, - pdfPath: path.join(testDir, pdf), - jsonPath: path.join(testDir, jsonFile), - }); - } - } - - // Sort alphabetically - testCases.sort((a, b) => a.name.localeCompare(b.name)); - - return testCases; -} - -// Tests - -tap.test('setup: ensure Docker containers are running', async () => { - console.log('\n[Setup] Checking Docker containers...\n'); - - // Ensure PaddleOCR-VL Full Pipeline is running - const paddleOk = await ensurePaddleOcrVlFull(); - expect(paddleOk).toBeTrue(); - - // Ensure Qwen2.5 is available (for text-only JSON extraction) - const qwenOk = await ensureQwen25(); - expect(qwenOk).toBeTrue(); - - console.log('\n[Setup] All containers ready!\n'); -}); - -// Dynamic test for each PDF/JSON pair -const testCases = findTestCases(); -console.log(`\nFound ${testCases.length} invoice test cases (PaddleOCR-VL Full Pipeline)\n`); - -let passedCount = 0; -let failedCount = 0; -const processingTimes: number[] = []; - -for (const testCase of testCases) { - tap.test(`should extract invoice: ${testCase.name}`, async () => { - // Load expected data - const expected: IInvoice = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8')); - console.log(`\n=== ${testCase.name} ===`); - console.log(`Expected: ${expected.invoice_number} | ${expected.invoice_date} | ${expected.total_amount} ${expected.currency}`); - - const startTime = Date.now(); - - // Convert PDF to images - const images = convertPdfToImages(testCase.pdfPath); - console.log(` Pages: ${images.length}`); - - // Extract with consensus voting (PaddleOCR-VL Full -> MiniCPM) - const extracted = await extractWithConsensus(images, testCase.name); - - const endTime = Date.now(); - const elapsedMs = endTime - startTime; - processingTimes.push(elapsedMs); - - // Compare results - const result = compareInvoice(extracted, expected); - - if (result.match) { - passedCount++; - console.log(` Result: MATCH (${(elapsedMs / 1000).toFixed(1)}s)`); - } else { - failedCount++; - console.log(` Result: MISMATCH (${(elapsedMs / 1000).toFixed(1)}s)`); - result.errors.forEach((e) => console.log(` - ${e}`)); - } - - // Assert match - expect(result.match).toBeTrue(); - }); -} - -tap.test('summary', async () => { - const totalInvoices = testCases.length; - const accuracy = totalInvoices > 0 ? (passedCount / totalInvoices) * 100 : 0; - const totalTimeMs = processingTimes.reduce((a, b) => a + b, 0); - const avgTimeMs = processingTimes.length > 0 ? totalTimeMs / processingTimes.length : 0; - const avgTimeSec = avgTimeMs / 1000; - const totalTimeSec = totalTimeMs / 1000; - - console.log(`\n======================================================`); - console.log(` Invoice Extraction Summary (PaddleOCR-VL Full)`); - console.log(`======================================================`); - console.log(` Method: PaddleOCR-VL Full Pipeline (HTML) -> Qwen2.5 (text-only)`); - console.log(` Passed: ${passedCount}/${totalInvoices}`); - console.log(` Failed: ${failedCount}/${totalInvoices}`); - console.log(` Accuracy: ${accuracy.toFixed(1)}%`); - console.log(`------------------------------------------------------`); - console.log(` Total time: ${totalTimeSec.toFixed(1)}s`); - console.log(` Avg per inv: ${avgTimeSec.toFixed(1)}s`); - console.log(`======================================================\n`); -}); - -export default tap.start();