diff --git a/Dockerfile_paddleocr_vl_full b/Dockerfile_paddleocr_vl_full
new file mode 100644
index 0000000..81f613f
--- /dev/null
+++ b/Dockerfile_paddleocr_vl_full
@@ -0,0 +1,90 @@
+# PaddleOCR-VL Full Pipeline (PP-DocLayoutV2 + PaddleOCR-VL + Structured Output)
+# Self-contained GPU image with complete document parsing pipeline
+FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
+
+LABEL maintainer="Task Venture Capital GmbH "
+LABEL description="PaddleOCR-VL Full Pipeline - Layout Detection + VL Recognition + JSON/Markdown Output"
+LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
+
+# Environment configuration
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
+ENV HF_HOME=/root/.cache/huggingface
+ENV PADDLEOCR_HOME=/root/.paddleocr
+ENV SERVER_PORT=8000
+ENV SERVER_HOST=0.0.0.0
+ENV VLM_PORT=8080
+
+# Set working directory
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3.11 \
+    python3.11-venv \
+    python3.11-dev \
+    python3-pip \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    libgomp1 \
+    libsm6 \
+    libxext6 \
+    libxrender1 \
+    curl \
+    git \
+    wget \
+    && rm -rf /var/lib/apt/lists/* \
+    && update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
+
+# Create and activate virtual environment
+RUN python -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# Upgrade pip
+RUN pip install --no-cache-dir --upgrade pip setuptools wheel
+
+# Install PaddlePaddle GPU (CUDA 12.x)
+RUN pip install --no-cache-dir \
+    paddlepaddle-gpu==3.2.1 \
+    --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
+
+# Install PaddleOCR with doc-parser (includes PP-DocLayoutV2)
+RUN pip install --no-cache-dir \
+    "paddleocr[doc-parser]" \
+    safetensors
+
+# Install PyTorch with CUDA support
+RUN pip install --no-cache-dir \
+    torch==2.5.1 \
+    torchvision \
+    --index-url https://download.pytorch.org/whl/cu124
+
+# Install transformers for PaddleOCR-VL inference (no vLLM - use local inference)
+# PaddleOCR-VL requires transformers>=4.55.0 for use_kernel_forward_from_hub
+RUN pip install --no-cache-dir \
+    "transformers>=4.55.0" \
+    accelerate \
+    hf-kernels
+
+# Install our API server dependencies
+RUN pip install --no-cache-dir \
+    fastapi \
+    "uvicorn[standard]" \
+    python-multipart \
+    httpx \
+    pillow
+
+# Copy server files
+COPY image_support_files/paddleocr_vl_full_server.py /app/server.py
+COPY image_support_files/paddleocr_vl_full_entrypoint.sh /usr/local/bin/entrypoint.sh
+RUN chmod +x /usr/local/bin/entrypoint.sh
+
+# Expose the API port (the transformers backend runs in-process; no separate VLM server)
+EXPOSE 8000
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=600s --retries=3 \
+    CMD curl -f http://localhost:8000/health || exit 1
+
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
diff --git a/changelog.md b/changelog.md
index f990640..2c9229c 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,5 +1,15 @@
 # Changelog
 
+## 2026-01-17 - 1.6.0 - feat(paddleocr-vl)
+add PaddleOCR-VL full pipeline Docker image and API server, plus integration tests and docker helpers
+
+- Add Dockerfile_paddleocr_vl_full and entrypoint script to build a GPU-enabled image with PP-DocLayoutV2 + PaddleOCR-VL and a FastAPI server
+- Introduce image_support_files/paddleocr_vl_full_server.py implementing the full pipeline API (/parse, OpenAI-compatible /v1/chat/completions) and a /formats endpoint
+- Improve image 
handling: decode_image supports data URLs, HTTP(S), raw base64 and file paths; add optimize_image_resolution to auto-scale images into the recommended 1080-2048px range +- Add test helpers (test/helpers/docker.ts) to build/start/health-check Docker images and new ensurePaddleOcrVlFull workflow +- Add comprehensive integration tests for bank statements and invoices (MiniCPM and PaddleOCR-VL variants) and update tests to ensure required containers are running before tests +- Switch MiniCPM model references to 'minicpm-v:latest' and increase health/timeout expectations for the full pipeline + ## 2026-01-17 - 1.5.0 - feat(paddleocr-vl) add PaddleOCR-VL GPU Dockerfile, pin vllm, update CPU image deps, and improve entrypoint and tests diff --git a/image_support_files/paddleocr_vl_full_entrypoint.sh b/image_support_files/paddleocr_vl_full_entrypoint.sh new file mode 100644 index 0000000..1a75ed0 --- /dev/null +++ b/image_support_files/paddleocr_vl_full_entrypoint.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +echo "Starting PaddleOCR-VL Full Pipeline Server (Transformers backend)..." + +# Environment +SERVER_PORT=${SERVER_PORT:-8000} +SERVER_HOST=${SERVER_HOST:-0.0.0.0} + +# Start our API server directly (no vLLM - uses local transformers inference) +echo "Starting API server on port $SERVER_PORT..." +exec python /app/server.py diff --git a/image_support_files/paddleocr_vl_full_server.py b/image_support_files/paddleocr_vl_full_server.py new file mode 100644 index 0000000..484547e --- /dev/null +++ b/image_support_files/paddleocr_vl_full_server.py @@ -0,0 +1,443 @@ +#!/usr/bin/env python3 +""" +PaddleOCR-VL Full Pipeline API Server (Transformers backend) + +Provides REST API for document parsing using: +- PP-DocLayoutV2 for layout detection +- PaddleOCR-VL (transformers) for recognition +- Structured JSON/Markdown output +""" + +import os +import io +import base64 +import logging +import tempfile +import time +import json +from typing import Optional, List, Union +from pathlib import Path + +from fastapi import FastAPI, HTTPException, UploadFile, File, Form +from fastapi.responses import JSONResponse +from pydantic import BaseModel +from PIL import Image +import torch + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +# Environment configuration +SERVER_HOST = os.environ.get('SERVER_HOST', '0.0.0.0') +SERVER_PORT = int(os.environ.get('SERVER_PORT', '8000')) +MODEL_NAME = "PaddlePaddle/PaddleOCR-VL" + +# Device configuration +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" +logger.info(f"Using device: {DEVICE}") + +# Task prompts +TASK_PROMPTS = { + "ocr": "OCR:", + "table": "Table Recognition:", + "formula": "Formula Recognition:", + "chart": "Chart Recognition:", +} + +# Initialize FastAPI app +app = FastAPI( + title="PaddleOCR-VL Full Pipeline Server", + description="Document parsing with PP-DocLayoutV2 + PaddleOCR-VL (transformers)", + version="1.0.0" +) + +# Global model instances +vl_model = None +vl_processor = None +layout_model = None + + +def load_vl_model(): + """Load the PaddleOCR-VL model for element recognition""" + global vl_model, vl_processor + + if vl_model is not None: + return + + logger.info(f"Loading PaddleOCR-VL model: {MODEL_NAME}") + from transformers import AutoModelForCausalLM, AutoProcessor + + vl_processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True) + + if DEVICE == "cuda": + vl_model = 
AutoModelForCausalLM.from_pretrained( + MODEL_NAME, + trust_remote_code=True, + torch_dtype=torch.bfloat16, + ).to(DEVICE).eval() + else: + vl_model = AutoModelForCausalLM.from_pretrained( + MODEL_NAME, + trust_remote_code=True, + torch_dtype=torch.float32, + low_cpu_mem_usage=True, + ).eval() + + logger.info("PaddleOCR-VL model loaded successfully") + + +def load_layout_model(): + """Load the LayoutDetection model for layout detection""" + global layout_model + + if layout_model is not None: + return + + try: + logger.info("Loading LayoutDetection model (PP-DocLayout_plus-L)...") + from paddleocr import LayoutDetection + + layout_model = LayoutDetection() + logger.info("LayoutDetection model loaded successfully") + except Exception as e: + logger.warning(f"Could not load LayoutDetection: {e}") + logger.info("Falling back to VL-only mode (no layout detection)") + + +def recognize_element(image: Image.Image, task: str = "ocr") -> str: + """Recognize a single element using PaddleOCR-VL""" + load_vl_model() + + prompt = TASK_PROMPTS.get(task, TASK_PROMPTS["ocr"]) + + messages = [ + { + "role": "user", + "content": [ + {"type": "image", "image": image}, + {"type": "text", "text": prompt}, + ] + } + ] + + inputs = vl_processor.apply_chat_template( + messages, + tokenize=True, + add_generation_prompt=True, + return_dict=True, + return_tensors="pt" + ) + + if DEVICE == "cuda": + inputs = {k: v.to(DEVICE) for k, v in inputs.items()} + + with torch.inference_mode(): + outputs = vl_model.generate( + **inputs, + max_new_tokens=4096, + do_sample=False, + use_cache=True + ) + + response = vl_processor.batch_decode(outputs, skip_special_tokens=True)[0] + + # Extract only the assistant's response content + # The response format is: "User: \nAssistant: " + # We want to extract just the content after "Assistant:" + if "Assistant:" in response: + parts = response.split("Assistant:") + if len(parts) > 1: + response = parts[-1].strip() + elif "assistant:" in response.lower(): + # Case-insensitive fallback + import re + match = re.split(r'[Aa]ssistant:', response) + if len(match) > 1: + response = match[-1].strip() + + return response + + +def detect_layout(image: Image.Image) -> List[dict]: + """Detect layout regions in the image""" + load_layout_model() + + if layout_model is None: + # No layout model - return a single region covering the whole image + return [{ + "type": "text", + "bbox": [0, 0, image.width, image.height], + "score": 1.0 + }] + + # Save image to temp file + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: + image.save(tmp.name, "PNG") + tmp_path = tmp.name + + try: + results = layout_model.predict(tmp_path) + regions = [] + + for res in results: + # LayoutDetection returns boxes in 'boxes' key + for box in res.get("boxes", []): + coord = box.get("coordinate", [0, 0, image.width, image.height]) + # Convert numpy floats to regular floats + bbox = [float(c) for c in coord] + regions.append({ + "type": box.get("label", "text"), + "bbox": bbox, + "score": float(box.get("score", 1.0)) + }) + + # Sort regions by vertical position (top to bottom) + regions.sort(key=lambda r: r["bbox"][1]) + + return regions if regions else [{ + "type": "text", + "bbox": [0, 0, image.width, image.height], + "score": 1.0 + }] + + finally: + os.unlink(tmp_path) + + +def process_document(image: Image.Image) -> dict: + """Process a document through the full pipeline""" + logger.info(f"Processing document: {image.size}") + + # Step 1: Detect layout + regions = detect_layout(image) + 
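+    # Each region is a dict of the form {"type": label, "bbox": [x1, y1, x2, y2], "score": float},
+    # sorted top-to-bottom by detect_layout.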
logger.info(f"Detected {len(regions)} layout regions") + + # Step 2: Recognize each region + blocks = [] + for i, region in enumerate(regions): + region_type = region["type"].lower() + bbox = region["bbox"] + + # Crop region from image + x1, y1, x2, y2 = [int(c) for c in bbox] + region_image = image.crop((x1, y1, x2, y2)) + + # Determine task based on region type + if "table" in region_type: + task = "table" + elif "formula" in region_type or "math" in region_type: + task = "formula" + elif "chart" in region_type or "figure" in region_type: + task = "chart" + else: + task = "ocr" + + # Recognize the region + try: + content = recognize_element(region_image, task) + blocks.append({ + "index": i, + "type": region_type, + "bbox": bbox, + "content": content, + "task": task + }) + logger.info(f" Region {i} ({region_type}): {len(content)} chars") + except Exception as e: + logger.error(f" Region {i} error: {e}") + blocks.append({ + "index": i, + "type": region_type, + "bbox": bbox, + "content": "", + "error": str(e) + }) + + return {"blocks": blocks, "image_size": list(image.size)} + + +def result_to_markdown(result: dict) -> str: + """Convert result to Markdown format""" + lines = [] + + for block in result.get("blocks", []): + block_type = block.get("type", "text") + content = block.get("content", "") + + if "table" in block_type.lower(): + lines.append(f"\n{content}\n") + elif "formula" in block_type.lower(): + lines.append(f"\n$$\n{content}\n$$\n") + else: + lines.append(content) + + return "\n\n".join(lines) + + +# Request/Response models +class ParseRequest(BaseModel): + image: str # base64 encoded image + output_format: Optional[str] = "json" + + +class ParseResponse(BaseModel): + success: bool + format: str + result: Union[dict, str] + processing_time: float + error: Optional[str] = None + + +def decode_image(image_source: str) -> Image.Image: + """Decode image from base64 or data URL""" + if image_source.startswith("data:"): + header, data = image_source.split(",", 1) + image_data = base64.b64decode(data) + else: + image_data = base64.b64decode(image_source) + + return Image.open(io.BytesIO(image_data)).convert("RGB") + + +@app.on_event("startup") +async def startup_event(): + """Pre-load models on startup""" + logger.info("Starting PaddleOCR-VL Full Pipeline Server...") + try: + load_vl_model() + load_layout_model() + logger.info("Models loaded successfully") + except Exception as e: + logger.error(f"Failed to pre-load models: {e}") + + +@app.get("/health") +async def health_check(): + """Health check endpoint""" + return { + "status": "healthy" if vl_model is not None else "loading", + "service": "PaddleOCR-VL Full Pipeline (Transformers)", + "device": DEVICE, + "vl_model_loaded": vl_model is not None, + "layout_model_loaded": layout_model is not None + } + + +@app.get("/formats") +async def supported_formats(): + """List supported output formats""" + return { + "output_formats": ["json", "markdown"], + "image_formats": ["PNG", "JPEG", "WebP", "BMP", "GIF", "TIFF"], + "capabilities": [ + "Layout detection (PP-DocLayoutV2)", + "Text recognition (OCR)", + "Table recognition", + "Formula recognition (LaTeX)", + "Chart recognition", + "Multi-language support (109 languages)" + ] + } + + +@app.post("/parse", response_model=ParseResponse) +async def parse_document_endpoint(request: ParseRequest): + """Parse a document image and return structured output""" + try: + start_time = time.time() + + image = decode_image(request.image) + result = process_document(image) + + if request.output_format 
== "markdown": + markdown = result_to_markdown(result) + output = {"markdown": markdown} + else: + output = result + + elapsed = time.time() - start_time + logger.info(f"Processing complete in {elapsed:.2f}s") + + return ParseResponse( + success=True, + format=request.output_format, + result=output, + processing_time=elapsed + ) + + except Exception as e: + logger.error(f"Error processing document: {e}", exc_info=True) + return ParseResponse( + success=False, + format=request.output_format, + result={}, + processing_time=0, + error=str(e) + ) + + +@app.post("/v1/chat/completions") +async def chat_completions(request: dict): + """OpenAI-compatible chat completions endpoint""" + try: + messages = request.get("messages", []) + output_format = request.get("output_format", "json") + + # Find user message with image + image = None + for msg in reversed(messages): + if msg.get("role") == "user": + content = msg.get("content", []) + if isinstance(content, list): + for item in content: + if item.get("type") == "image_url": + url = item.get("image_url", {}).get("url", "") + image = decode_image(url) + break + break + + if image is None: + raise HTTPException(status_code=400, detail="No image provided") + + start_time = time.time() + result = process_document(image) + + if output_format == "markdown": + content = result_to_markdown(result) + else: + content = json.dumps(result, ensure_ascii=False, indent=2) + + elapsed = time.time() - start_time + + return { + "id": f"chatcmpl-{int(time.time()*1000)}", + "object": "chat.completion", + "created": int(time.time()), + "model": "paddleocr-vl-full", + "choices": [{ + "index": 0, + "message": {"role": "assistant", "content": content}, + "finish_reason": "stop" + }], + "usage": { + "prompt_tokens": 100, + "completion_tokens": len(content) // 4, + "total_tokens": 100 + len(content) // 4 + }, + "processing_time": elapsed + } + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error in chat completions: {e}", exc_info=True) + raise HTTPException(status_code=500, detail=str(e)) + + +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT) diff --git a/image_support_files/paddleocr_vl_server.py b/image_support_files/paddleocr_vl_server.py index 13ba044..8a64b2b 100644 --- a/image_support_files/paddleocr_vl_server.py +++ b/image_support_files/paddleocr_vl_server.py @@ -136,27 +136,82 @@ def load_model(): logger.info("PaddleOCR-VL model loaded successfully") -def decode_image(image_source: str) -> Image.Image: - """Decode image from URL or base64""" +def optimize_image_resolution(image: Image.Image, max_size: int = 2048, min_size: int = 1080) -> Image.Image: + """ + Optimize image resolution for PaddleOCR-VL. + + Best results are achieved with images in the 1080p-2K range. 
+ - Images larger than max_size are scaled down + - Very small images are scaled up to min_size + """ + width, height = image.size + max_dim = max(width, height) + min_dim = min(width, height) + + # Scale down if too large (4K+ images often miss text) + if max_dim > max_size: + scale = max_size / max_dim + new_width = int(width * scale) + new_height = int(height * scale) + logger.info(f"Scaling down image from {width}x{height} to {new_width}x{new_height}") + image = image.resize((new_width, new_height), Image.Resampling.LANCZOS) + # Scale up if too small + elif max_dim < min_size and min_dim < min_size: + scale = min_size / max_dim + new_width = int(width * scale) + new_height = int(height * scale) + logger.info(f"Scaling up image from {width}x{height} to {new_width}x{new_height}") + image = image.resize((new_width, new_height), Image.Resampling.LANCZOS) + else: + logger.info(f"Image size {width}x{height} is optimal, no scaling needed") + + return image + + +def decode_image(image_source: str, optimize: bool = True) -> Image.Image: + """ + Decode image from various sources. + + Supported formats: + - Base64 data URL: data:image/png;base64,... or data:image/jpeg;base64,... + - HTTP/HTTPS URL: https://example.com/image.png + - Raw base64 string + - Local file path + + Supported image types: PNG, JPEG, WebP, BMP, GIF, TIFF + """ + image = None + if image_source.startswith("data:"): - # Base64 encoded image + # Base64 encoded image with MIME type header + # Supports: data:image/png;base64,... data:image/jpeg;base64,... etc. header, data = image_source.split(",", 1) image_data = base64.b64decode(data) - return Image.open(io.BytesIO(image_data)).convert("RGB") + image = Image.open(io.BytesIO(image_data)).convert("RGB") + logger.debug(f"Decoded base64 image with header: {header}") elif image_source.startswith("http://") or image_source.startswith("https://"): # URL - fetch image import httpx response = httpx.get(image_source, timeout=30.0) response.raise_for_status() - return Image.open(io.BytesIO(response.content)).convert("RGB") + image = Image.open(io.BytesIO(response.content)).convert("RGB") + logger.debug(f"Fetched image from URL: {image_source[:50]}...") else: # Assume it's a file path or raw base64 try: image_data = base64.b64decode(image_source) - return Image.open(io.BytesIO(image_data)).convert("RGB") + image = Image.open(io.BytesIO(image_data)).convert("RGB") + logger.debug("Decoded raw base64 image") except: # Try as file path - return Image.open(image_source).convert("RGB") + image = Image.open(image_source).convert("RGB") + logger.debug(f"Loaded image from file: {image_source}") + + # Optimize resolution for best OCR results + if optimize: + image = optimize_image_resolution(image) + + return image def extract_image_and_text(content: Union[str, List[ContentItem]]) -> tuple: @@ -242,6 +297,45 @@ async def health_check(): ) +@app.get("/formats") +async def supported_formats(): + """List supported image formats and input methods""" + return { + "image_formats": { + "supported": ["PNG", "JPEG", "WebP", "BMP", "GIF", "TIFF"], + "recommended": ["PNG", "JPEG"], + "mime_types": [ + "image/png", + "image/jpeg", + "image/webp", + "image/bmp", + "image/gif", + "image/tiff" + ] + }, + "input_methods": { + "base64_data_url": { + "description": "Base64 encoded image with MIME type header", + "example": "data:image/png;base64,iVBORw0KGgo..." 
+ }, + "http_url": { + "description": "Direct HTTP/HTTPS URL to image", + "example": "https://example.com/image.png" + }, + "raw_base64": { + "description": "Raw base64 string without header", + "example": "iVBORw0KGgo..." + } + }, + "resolution": { + "optimal_range": "1080p to 2K (1080-2048 pixels on longest side)", + "auto_scaling": True, + "note": "Images are automatically scaled to optimal range. 4K+ images are scaled down for better accuracy." + }, + "task_prompts": TASK_PROMPTS + } + + @app.get("/v1/models") async def list_models(): """List available models (OpenAI-compatible)""" diff --git a/test/helpers/docker.ts b/test/helpers/docker.ts new file mode 100644 index 0000000..fc67fb2 --- /dev/null +++ b/test/helpers/docker.ts @@ -0,0 +1,297 @@ +import { execSync } from 'child_process'; + +// Project container names (only manage these) +const PROJECT_CONTAINERS = [ + 'paddleocr-vl-test', + 'paddleocr-vl-gpu-test', + 'paddleocr-vl-cpu-test', + 'paddleocr-vl-full-test', + 'minicpm-test', +]; + +// Image configurations +export interface IImageConfig { + name: string; + dockerfile: string; + buildContext: string; + containerName: string; + ports: string[]; + volumes?: string[]; + gpus?: boolean; + healthEndpoint?: string; + healthTimeout?: number; +} + +export const IMAGES = { + paddleocrVlGpu: { + name: 'paddleocr-vl-gpu', + dockerfile: 'Dockerfile_paddleocr_vl_gpu', + buildContext: '.', + containerName: 'paddleocr-vl-test', + ports: ['8000:8000'], + volumes: ['ht-huggingface-cache:/root/.cache/huggingface'], + gpus: true, + healthEndpoint: 'http://localhost:8000/health', + healthTimeout: 300000, // 5 minutes for model loading + } as IImageConfig, + + paddleocrVlCpu: { + name: 'paddleocr-vl-cpu', + dockerfile: 'Dockerfile_paddleocr_vl_cpu', + buildContext: '.', + containerName: 'paddleocr-vl-test', + ports: ['8000:8000'], + volumes: ['ht-huggingface-cache:/root/.cache/huggingface'], + gpus: false, + healthEndpoint: 'http://localhost:8000/health', + healthTimeout: 300000, + } as IImageConfig, + + minicpm: { + name: 'minicpm45v', + dockerfile: 'Dockerfile_minicpm45v', + buildContext: '.', + containerName: 'minicpm-test', + ports: ['11434:11434'], + volumes: ['ht-ollama-models:/root/.ollama'], + gpus: true, + healthEndpoint: 'http://localhost:11434/api/tags', + healthTimeout: 120000, + } as IImageConfig, + + // Full PaddleOCR-VL pipeline with PP-DocLayoutV2 + structured JSON output + paddleocrVlFull: { + name: 'paddleocr-vl-full', + dockerfile: 'Dockerfile_paddleocr_vl_full', + buildContext: '.', + containerName: 'paddleocr-vl-full-test', + ports: ['8000:8000'], + volumes: [ + 'ht-huggingface-cache:/root/.cache/huggingface', + 'ht-paddleocr-cache:/root/.paddleocr', + ], + gpus: true, + healthEndpoint: 'http://localhost:8000/health', + healthTimeout: 600000, // 10 minutes for model loading (vLLM + PP-DocLayoutV2) + } as IImageConfig, +}; + +/** + * Execute a shell command and return output + */ +function exec(command: string, silent = false): string { + try { + return execSync(command, { + encoding: 'utf-8', + stdio: silent ? 
+    });
+  } catch (err: unknown) {
+    if (silent) return '';
+    throw err;
+  }
+}
+
+/**
+ * Check if a Docker image exists locally
+ */
+export function imageExists(imageName: string): boolean {
+  const result = exec(`docker images -q ${imageName}`, true);
+  return result.trim().length > 0;
+}
+
+/**
+ * Check if a container is running
+ */
+export function isContainerRunning(containerName: string): boolean {
+  const result = exec(`docker ps --filter "name=^${containerName}$" --format "{{.Names}}"`, true);
+  return result.trim() === containerName;
+}
+
+/**
+ * Check if a container exists (running or stopped)
+ */
+export function containerExists(containerName: string): boolean {
+  const result = exec(`docker ps -a --filter "name=^${containerName}$" --format "{{.Names}}"`, true);
+  return result.trim() === containerName;
+}
+
+/**
+ * Stop and remove a container
+ */
+export function removeContainer(containerName: string): void {
+  if (containerExists(containerName)) {
+    console.log(`[Docker] Removing container: ${containerName}`);
+    exec(`docker rm -f ${containerName}`, true);
+  }
+}
+
+/**
+ * Stop all project containers that conflict with the required one
+ */
+export function stopConflictingContainers(requiredContainer: string, requiredPort: string): void {
+  // Stop project containers using the same port
+  for (const container of PROJECT_CONTAINERS) {
+    if (container === requiredContainer) continue;
+
+    if (isContainerRunning(container)) {
+      // Check if this container uses the same port
+      const ports = exec(`docker port ${container} 2>/dev/null || true`, true);
+      if (ports.includes(requiredPort.split(':')[0])) {
+        console.log(`[Docker] Stopping conflicting container: ${container}`);
+        exec(`docker stop ${container}`, true);
+      }
+    }
+  }
+}
+
+/**
+ * Build a Docker image
+ */
+export function buildImage(config: IImageConfig): void {
+  console.log(`[Docker] Building image: ${config.name}`);
+  const cmd = `docker build --load -f ${config.dockerfile} -t ${config.name} ${config.buildContext}`;
+  exec(cmd);
+}
+
+/**
+ * Start a container from an image
+ */
+export function startContainer(config: IImageConfig): void {
+  // Remove existing container if it exists
+  removeContainer(config.containerName);
+
+  console.log(`[Docker] Starting container: ${config.containerName}`);
+
+  const portArgs = config.ports.map((p) => `-p ${p}`).join(' ');
+  const volumeArgs = config.volumes?.map((v) => `-v ${v}`).join(' ') || '';
+  const gpuArgs = config.gpus ? '--gpus all' : '';
+
+  const cmd = `docker run -d --name ${config.containerName} ${gpuArgs} ${portArgs} ${volumeArgs} ${config.name}`;
+  exec(cmd);
+}
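+
+// Typical direct usage (sketch; the ensure* helpers below wrap exactly this flow):
+//   buildImage(IMAGES.minicpm);
+//   startContainer(IMAGES.minicpm);
+//   await waitForHealth(IMAGES.minicpm.healthEndpoint!, IMAGES.minicpm.healthTimeout);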
+
+/**
+ * Wait for a container to become healthy
+ */
+export async function waitForHealth(
+  endpoint: string,
+  timeoutMs: number = 120000,
+  intervalMs: number = 5000
+): Promise<boolean> {
+  const startTime = Date.now();
+  console.log(`[Docker] Waiting for health: ${endpoint}`);
+
+  while (Date.now() - startTime < timeoutMs) {
+    try {
+      const response = await fetch(endpoint, {
+        method: 'GET',
+        signal: AbortSignal.timeout(5000),
+      });
+      if (response.ok) {
+        console.log(`[Docker] Service healthy!`);
+        return true;
+      }
+    } catch {
+      // Service not ready yet
+    }
+
+    const elapsed = Math.round((Date.now() - startTime) / 1000);
+    console.log(`[Docker] Waiting... (${elapsed}s)`);
+    await new Promise((resolve) => setTimeout(resolve, intervalMs));
+  }
+
+  console.log(`[Docker] Health check timeout after ${timeoutMs / 1000}s`);
+  return false;
+}
+
+/**
+ * Ensure a service is running and healthy
+ * - Builds image if missing
+ * - Stops conflicting project containers
+ * - Starts container if not running
+ * - Waits for health check
+ */
+export async function ensureService(config: IImageConfig): Promise<boolean> {
+  console.log(`\n[Docker] Ensuring service: ${config.name}`);
+
+  // Build image if it doesn't exist
+  if (!imageExists(config.name)) {
+    console.log(`[Docker] Image not found, building...`);
+    buildImage(config);
+  }
+
+  // Stop conflicting containers on the same port
+  const mainPort = config.ports[0];
+  stopConflictingContainers(config.containerName, mainPort);
+
+  // Start container if not running
+  if (!isContainerRunning(config.containerName)) {
+    startContainer(config);
+  } else {
+    console.log(`[Docker] Container already running: ${config.containerName}`);
+  }
+
+  // Wait for health
+  if (config.healthEndpoint) {
+    return waitForHealth(config.healthEndpoint, config.healthTimeout);
+  }
+
+  return true;
+}
+
+/**
+ * Ensure PaddleOCR-VL GPU service is running
+ */
+export async function ensurePaddleOcrVlGpu(): Promise<boolean> {
+  return ensureService(IMAGES.paddleocrVlGpu);
+}
+
+/**
+ * Ensure PaddleOCR-VL CPU service is running
+ */
+export async function ensurePaddleOcrVlCpu(): Promise<boolean> {
+  return ensureService(IMAGES.paddleocrVlCpu);
+}
+
+/**
+ * Ensure MiniCPM service is running
+ */
+export async function ensureMiniCpm(): Promise<boolean> {
+  return ensureService(IMAGES.minicpm);
+}
+
+/**
+ * Check if GPU is available
+ */
+export function isGpuAvailable(): boolean {
+  try {
+    const result = exec('nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null', true);
+    return result.trim().length > 0;
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Ensure PaddleOCR-VL service (auto-detect GPU/CPU)
+ */
+export async function ensurePaddleOcrVl(): Promise<boolean> {
+  if (isGpuAvailable()) {
+    console.log('[Docker] GPU detected, using GPU image');
+    return ensurePaddleOcrVlGpu();
+  } else {
+    console.log('[Docker] No GPU detected, using CPU image');
+    return ensurePaddleOcrVlCpu();
+  }
+}
+
+/**
+ * Ensure PaddleOCR-VL Full Pipeline service (PP-DocLayoutV2 + structured output)
+ * This is the recommended service for production use - outputs structured JSON/Markdown
+ */
+export async function ensurePaddleOcrVlFull(): Promise<boolean> {
+  if (!isGpuAvailable()) {
+    console.log('[Docker] WARNING: Full pipeline requires GPU, but none detected');
+  }
+  return ensureService(IMAGES.paddleocrVlFull);
+}
diff --git a/test/test.bankstatements.combined.ts b/test/test.bankstatements.combined.ts
index 5025ce2..4a238e6 100644
--- a/test/test.bankstatements.combined.ts
+++ b/test/test.bankstatements.combined.ts
@@ -1,15 +1,23 @@
+/**
+ * Bank statement extraction test using MiniCPM-V (visual) + PaddleOCR-VL (table recognition)
+ *
+ * This is the combined/dual-VLM approach that uses both models for consensus:
+ * - MiniCPM-V for visual extraction
+ * - PaddleOCR-VL for table recognition
+ */
 import { tap, expect } from '@git.zone/tstest/tapbundle';
 import * as fs from 'fs';
 import * as path from 'path';
 import { execSync } from 'child_process';
 import * as os from 'os';
+import { ensurePaddleOcrVl, ensureMiniCpm } from './helpers/docker.js';
 
 // Service URLs
 const OLLAMA_URL = 'http://localhost:11434';
 const PADDLEOCR_VL_URL = 'http://localhost:8000';
 
 // Models
-const MINICPM_MODEL = 
'openbmb/minicpm-v4.5:q8_0'; +const MINICPM_MODEL = 'minicpm-v:latest'; const PADDLEOCR_VL_MODEL = 'paddleocr-vl'; // Prompt for MiniCPM-V visual extraction @@ -477,11 +485,18 @@ function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: strin // Tests -tap.test('should connect to Ollama API', async () => { - const response = await fetch(`${OLLAMA_URL}/api/tags`); - expect(response.ok).toBeTrue(); - const data = await response.json(); - expect(data.models).toBeArray(); +tap.test('setup: ensure Docker containers are running', async () => { + console.log('\n[Setup] Checking Docker containers...\n'); + + // Ensure PaddleOCR-VL is running (auto-detects GPU/CPU) + const paddleOk = await ensurePaddleOcrVl(); + expect(paddleOk).toBeTrue(); + + // Ensure MiniCPM is running + const minicpmOk = await ensureMiniCpm(); + expect(minicpmOk).toBeTrue(); + + console.log('\n[Setup] All containers ready!\n'); }); tap.test('should have MiniCPM-V 4.5 model loaded', async () => { @@ -494,8 +509,7 @@ tap.test('should have MiniCPM-V 4.5 model loaded', async () => { tap.test('should check PaddleOCR-VL availability', async () => { const available = await isPaddleOCRVLAvailable(); console.log(`PaddleOCR-VL available: ${available}`); - // This test passes regardless - PaddleOCR-VL is optional - expect(true).toBeTrue(); + expect(available).toBeTrue(); }); // Dynamic test for each PDF/JSON pair diff --git a/test/test.bankstatements.minicpm.ts b/test/test.bankstatements.minicpm.ts new file mode 100644 index 0000000..547dcec --- /dev/null +++ b/test/test.bankstatements.minicpm.ts @@ -0,0 +1,334 @@ +/** + * Bank statement extraction test using MiniCPM-V only (visual extraction) + * + * This tests MiniCPM-V's ability to extract bank transactions directly from images + * without any OCR augmentation. + */ +import { tap, expect } from '@git.zone/tstest/tapbundle'; +import * as fs from 'fs'; +import * as path from 'path'; +import { execSync } from 'child_process'; +import * as os from 'os'; +import { ensureMiniCpm } from './helpers/docker.js'; + +// Service URL +const OLLAMA_URL = 'http://localhost:11434'; + +// Model +const MINICPM_MODEL = 'minicpm-v:latest'; + +// Prompt for MiniCPM-V visual extraction +const MINICPM_EXTRACT_PROMPT = `/nothink +You are a bank statement parser. Extract EVERY transaction from the table. + +Read the Amount column carefully: +- "- 21,47 €" means DEBIT, output as: -21.47 +- "+ 1.000,00 €" means CREDIT, output as: 1000.00 +- European format: comma = decimal point + +For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47} + +Do not skip any rows. 
Return ONLY the JSON array, no explanation.`;
+
+interface ITransaction {
+  date: string;
+  counterparty: string;
+  amount: number;
+}
+
+/**
+ * Convert PDF to PNG images using ImageMagick
+ */
+function convertPdfToImages(pdfPath: string): string[] {
+  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
+  const outputPattern = path.join(tempDir, 'page-%d.png');
+
+  try {
+    execSync(
+      `convert -density 300 -quality 100 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
+      { stdio: 'pipe' }
+    );
+
+    const files = fs.readdirSync(tempDir).filter((f: string) => f.endsWith('.png')).sort();
+    const images: string[] = [];
+
+    for (const file of files) {
+      const imagePath = path.join(tempDir, file);
+      const imageData = fs.readFileSync(imagePath);
+      images.push(imageData.toString('base64'));
+    }
+
+    return images;
+  } finally {
+    fs.rmSync(tempDir, { recursive: true, force: true });
+  }
+}
+
+/**
+ * Extract using MiniCPM-V via Ollama
+ */
+async function extractWithMiniCPM(images: string[], passLabel: string): Promise<ITransaction[]> {
+  const payload = {
+    model: MINICPM_MODEL,
+    prompt: MINICPM_EXTRACT_PROMPT,
+    images,
+    stream: true,
+    options: {
+      num_predict: 16384,
+      temperature: 0.1,
+    },
+  };
+
+  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(payload),
+  });
+
+  if (!response.ok) {
+    throw new Error(`Ollama API error: ${response.status}`);
+  }
+
+  const reader = response.body?.getReader();
+  if (!reader) {
+    throw new Error('No response body');
+  }
+
+  const decoder = new TextDecoder();
+  let fullText = '';
+  let lineBuffer = '';
+
+  console.log(`[${passLabel}] Extracting with MiniCPM-V...`);
+
+  while (true) {
+    const { done, value } = await reader.read();
+    if (done) break;
+
+    const chunk = decoder.decode(value, { stream: true });
+    const lines = chunk.split('\n').filter((l) => l.trim());
+
+    for (const line of lines) {
+      try {
+        const json = JSON.parse(line);
+        if (json.response) {
+          fullText += json.response;
+          lineBuffer += json.response;
+
+          if (lineBuffer.includes('\n')) {
+            const parts = lineBuffer.split('\n');
+            for (let i = 0; i < parts.length - 1; i++) {
+              console.log(parts[i]);
+            }
+            lineBuffer = parts[parts.length - 1];
+          }
+        }
+      } catch {
+        // Skip invalid JSON lines
+      }
+    }
+  }
+
+  if (lineBuffer) {
+    console.log(lineBuffer);
+  }
+  console.log('');
+
+  const startIdx = fullText.indexOf('[');
+  const endIdx = fullText.lastIndexOf(']') + 1;
+
+  if (startIdx < 0 || endIdx <= startIdx) {
+    throw new Error('No JSON array found in response');
+  }
+
+  return JSON.parse(fullText.substring(startIdx, endIdx));
+}
+
+/**
+ * Create a hash of transactions for comparison
+ */
+function hashTransactions(transactions: ITransaction[]): string {
+  return transactions
+    .map((t) => `${t.date}|${t.amount.toFixed(2)}`)
+    .sort()
+    .join(';');
+}
+
+/**
+ * Extract with consensus voting using MiniCPM-V only
+ */
+async function extractWithConsensus(
+  images: string[],
+  maxPasses: number = 5
+): Promise<ITransaction[]> {
+  const results: Array<{ transactions: ITransaction[]; hash: string }> = [];
+  const hashCounts: Map<string, number> = new Map();
+
+  const addResult = (transactions: ITransaction[], passLabel: string): number => {
+    const hash = hashTransactions(transactions);
+    results.push({ transactions, hash });
+    hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1);
+    console.log(
+      `[${passLabel}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)`
+    );
+    return hashCounts.get(hash)!;
+  };
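+
+  // Consensus = the same date|amount hash seen in at least two passes;
+  // counterparty text is deliberately left out of the hash so wording drift
+  // between passes does not prevent agreement.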
+
+  console.log('[Setup] Using MiniCPM-V only');
+
+  for (let pass = 1; pass <= maxPasses; pass++) {
+    try {
+      const transactions = await extractWithMiniCPM(images, `Pass ${pass} MiniCPM-V`);
+      const count = addResult(transactions, `Pass ${pass} MiniCPM-V`);
+
+      if (count >= 2) {
+        console.log(`[Consensus] Reached after ${pass} passes`);
+        return transactions;
+      }
+
+      console.log(`[Pass ${pass}] No consensus yet, trying again...`);
+    } catch (err) {
+      console.log(`[Pass ${pass}] Error: ${err}`);
+    }
+  }
+
+  // No consensus reached - return the most common result
+  let bestHash = '';
+  let bestCount = 0;
+  for (const [hash, count] of hashCounts) {
+    if (count > bestCount) {
+      bestCount = count;
+      bestHash = hash;
+    }
+  }
+
+  if (!bestHash) {
+    throw new Error('No valid results obtained');
+  }
+
+  const best = results.find((r) => r.hash === bestHash)!;
+  console.log(`[No consensus] Using most common result (${bestCount}/${maxPasses} passes)`);
+  return best.transactions;
+}
+
+/**
+ * Compare extracted transactions against expected
+ */
+function compareTransactions(
+  extracted: ITransaction[],
+  expected: ITransaction[]
+): { matches: number; total: number; errors: string[] } {
+  const errors: string[] = [];
+  let matches = 0;
+
+  for (let i = 0; i < expected.length; i++) {
+    const exp = expected[i];
+    const ext = extracted[i];
+
+    if (!ext) {
+      errors.push(`Missing transaction ${i}: ${exp.date} ${exp.counterparty}`);
+      continue;
+    }
+
+    const dateMatch = ext.date === exp.date;
+    const amountMatch = Math.abs(ext.amount - exp.amount) < 0.01;
+
+    if (dateMatch && amountMatch) {
+      matches++;
+    } else {
+      errors.push(
+        `Mismatch at ${i}: expected ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`
+      );
+    }
+  }
+
+  if (extracted.length > expected.length) {
+    errors.push(`Extra transactions: ${extracted.length - expected.length}`);
+  }
+
+  return { matches, total: expected.length, errors };
+}
+
+/**
+ * Find all test cases (PDF + JSON pairs) in .nogit/
+ */
+function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
+  const testDir = path.join(process.cwd(), '.nogit');
+  if (!fs.existsSync(testDir)) {
+    return [];
+  }
+
+  const files = fs.readdirSync(testDir);
+  const pdfFiles = files.filter((f: string) => f.endsWith('.pdf'));
+  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];
+
+  for (const pdf of pdfFiles) {
+    const baseName = pdf.replace('.pdf', '');
+    const jsonFile = `${baseName}.json`;
+    if (files.includes(jsonFile)) {
+      testCases.push({
+        name: baseName,
+        pdfPath: path.join(testDir, pdf),
+        jsonPath: path.join(testDir, jsonFile),
+      });
+    }
+  }
+
+  return testCases;
+}
+
+// Tests
+
+tap.test('setup: ensure Docker containers are running', async () => {
+  console.log('\n[Setup] Checking Docker containers...\n');
+
+  // Ensure MiniCPM is running
+  const minicpmOk = await ensureMiniCpm();
+  expect(minicpmOk).toBeTrue();
+
+  console.log('\n[Setup] All containers ready!\n');
+});
+
+tap.test('should have MiniCPM-V model loaded', async () => {
+  const response = await fetch(`${OLLAMA_URL}/api/tags`);
+  const data = await response.json();
+  const modelNames = data.models.map((m: { name: string }) => m.name);
+  expect(modelNames.some((name: string) => name.includes('minicpm-v'))).toBeTrue();
+});
+
+// Dynamic test for each PDF/JSON pair
+const testCases = findTestCases();
+console.log(`\nFound ${testCases.length} bank statement test cases (MiniCPM-V only)\n`);
+
+for (const testCase of 
testCases) { + tap.test(`should extract transactions from ${testCase.name}`, async () => { + // Load expected transactions + const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8')); + console.log(`\n=== ${testCase.name} ===`); + console.log(`Expected: ${expected.length} transactions`); + + // Convert PDF to images + console.log('Converting PDF to images...'); + const images = convertPdfToImages(testCase.pdfPath); + console.log(`Converted: ${images.length} pages\n`); + + // Extract with consensus (MiniCPM-V only) + const extracted = await extractWithConsensus(images); + console.log(`\nFinal: ${extracted.length} transactions`); + + // Compare results + const result = compareTransactions(extracted, expected); + console.log(`Accuracy: ${result.matches}/${result.total}`); + + if (result.errors.length > 0) { + console.log('Errors:'); + result.errors.forEach((e) => console.log(` - ${e}`)); + } + + // Assert high accuracy + const accuracy = result.matches / result.total; + expect(accuracy).toBeGreaterThan(0.95); + expect(extracted.length).toEqual(expected.length); + }); +} + +export default tap.start(); diff --git a/test/test.bankstatements.paddleocr-vl.ts b/test/test.bankstatements.paddleocr-vl.ts new file mode 100644 index 0000000..873e998 --- /dev/null +++ b/test/test.bankstatements.paddleocr-vl.ts @@ -0,0 +1,346 @@ +/** + * Bank statement extraction test using PaddleOCR-VL Full Pipeline + * + * This tests the complete PaddleOCR-VL pipeline for bank statements: + * 1. PP-DocLayoutV2 for layout detection + * 2. PaddleOCR-VL for recognition (tables with proper structure) + * 3. Structured Markdown output with tables + * 4. MiniCPM extracts transactions from structured tables + * + * The structured Markdown has properly formatted tables, + * making it much easier for MiniCPM to extract transaction data. 
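+ *
+ * For example, a recognized table row might come back in the Markdown as
+ * "| 15.01.2024 | REWE Markt | - 21,47 € |" (illustrative), which MiniCPM then
+ * maps to {"date":"2024-01-15","counterparty":"REWE Markt","amount":-21.47}.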
+ */
+import { tap, expect } from '@git.zone/tstest/tapbundle';
+import * as fs from 'fs';
+import * as path from 'path';
+import { execSync } from 'child_process';
+import * as os from 'os';
+import { ensurePaddleOcrVlFull, ensureMiniCpm } from './helpers/docker.js';
+
+const PADDLEOCR_VL_URL = 'http://localhost:8000';
+const OLLAMA_URL = 'http://localhost:11434';
+const MINICPM_MODEL = 'minicpm-v:latest';
+
+interface ITransaction {
+  date: string;
+  counterparty: string;
+  amount: number;
+}
+
+/**
+ * Convert PDF to PNG images using ImageMagick
+ */
+function convertPdfToImages(pdfPath: string): string[] {
+  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
+  const outputPattern = path.join(tempDir, 'page-%d.png');
+
+  try {
+    execSync(
+      `convert -density 300 -quality 100 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
+      { stdio: 'pipe' }
+    );
+
+    const files = fs.readdirSync(tempDir).filter((f: string) => f.endsWith('.png')).sort();
+    const images: string[] = [];
+
+    for (const file of files) {
+      const imagePath = path.join(tempDir, file);
+      const imageData = fs.readFileSync(imagePath);
+      images.push(imageData.toString('base64'));
+    }
+
+    return images;
+  } finally {
+    fs.rmSync(tempDir, { recursive: true, force: true });
+  }
+}
+
+/**
+ * Parse document using PaddleOCR-VL Full Pipeline (returns structured Markdown)
+ */
+async function parseDocument(imageBase64: string): Promise<string> {
+  const response = await fetch(`${PADDLEOCR_VL_URL}/parse`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify({
+      image: imageBase64,
+      output_format: 'markdown',
+    }),
+  });
+
+  if (!response.ok) {
+    const text = await response.text();
+    throw new Error(`PaddleOCR-VL API error: ${response.status} - ${text}`);
+  }
+
+  const data = await response.json();
+
+  if (!data.success) {
+    throw new Error(`PaddleOCR-VL error: ${data.error}`);
+  }
+
+  return data.result?.markdown || '';
+}
+
+/**
+ * Extract transactions from structured Markdown using MiniCPM
+ */
+async function extractTransactionsFromMarkdown(markdown: string): Promise<ITransaction[]> {
+  console.log(`  [Extract] Processing ${markdown.length} chars of Markdown`);
+
+  const prompt = `/nothink
+Convert this bank statement to a JSON array of transactions.
+
+Read the Amount values carefully:
+- "- 21,47 €" means DEBIT, output as: -21.47
+- "+ 1.000,00 €" means CREDIT, output as: 1000.00
+- European format: comma = decimal point, dot = thousands
+
+For each transaction output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
+
+Return ONLY the JSON array, no explanation.
+
+Document:
+${markdown}`;
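+
+  // A typical model reply is a bare JSON array such as
+  // [{"date":"2024-01-15","counterparty":"REWE","amount":-21.47}] (illustrative);
+  // the bracket-slicing below tolerates extra prose around the array.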
+
+  const payload = {
+    model: MINICPM_MODEL,
+    prompt,
+    stream: true,
+    options: {
+      num_predict: 16384,
+      temperature: 0.1,
+    },
+  };
+
+  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(payload),
+  });
+
+  if (!response.ok) {
+    throw new Error(`Ollama API error: ${response.status}`);
+  }
+
+  const reader = response.body?.getReader();
+  if (!reader) {
+    throw new Error('No response body');
+  }
+
+  const decoder = new TextDecoder();
+  let fullText = '';
+
+  while (true) {
+    const { done, value } = await reader.read();
+    if (done) break;
+
+    const chunk = decoder.decode(value, { stream: true });
+    const lines = chunk.split('\n').filter((l) => l.trim());
+
+    for (const line of lines) {
+      try {
+        const json = JSON.parse(line);
+        if (json.response) {
+          fullText += json.response;
+        }
+      } catch {
+        // Skip invalid JSON lines
+      }
+    }
+  }
+
+  // Extract JSON array from response
+  const startIdx = fullText.indexOf('[');
+  const endIdx = fullText.lastIndexOf(']') + 1;
+
+  if (startIdx < 0 || endIdx <= startIdx) {
+    throw new Error(`No JSON array found in response: ${fullText.substring(0, 200)}`);
+  }
+
+  const jsonStr = fullText.substring(startIdx, endIdx);
+  return JSON.parse(jsonStr);
+}
+
+/**
+ * Extract transactions from all pages of a bank statement
+ */
+async function extractAllTransactions(images: string[]): Promise<ITransaction[]> {
+  const allTransactions: ITransaction[] = [];
+
+  for (let i = 0; i < images.length; i++) {
+    console.log(`  Processing page ${i + 1}/${images.length}...`);
+
+    // Parse with full pipeline
+    const markdown = await parseDocument(images[i]);
+    console.log(`  [Parse] Got ${markdown.split('\n').length} lines of Markdown`);
+
+    // Extract transactions
+    try {
+      const transactions = await extractTransactionsFromMarkdown(markdown);
+      console.log(`  [Extracted] ${transactions.length} transactions`);
+      allTransactions.push(...transactions);
+    } catch (err) {
+      console.log(`  [Error] ${err}`);
+    }
+  }
+
+  return allTransactions;
+}
+
+/**
+ * Compare transactions - find matching transaction in expected list
+ */
+function findMatchingTransaction(
+  tx: ITransaction,
+  expectedList: ITransaction[]
+): ITransaction | undefined {
+  return expectedList.find((exp) => {
+    const dateMatch = tx.date === exp.date;
+    const amountMatch = Math.abs(tx.amount - exp.amount) < 0.02;
+    const counterpartyMatch =
+      tx.counterparty?.toLowerCase().includes(exp.counterparty?.toLowerCase().slice(0, 10)) ||
+      exp.counterparty?.toLowerCase().includes(tx.counterparty?.toLowerCase().slice(0, 10));
+    return dateMatch && amountMatch && counterpartyMatch;
+  });
+}
+
+/**
+ * Calculate extraction accuracy
+ */
+function calculateAccuracy(
+  extracted: ITransaction[],
+  expected: ITransaction[]
+): { matched: number; total: number; accuracy: number } {
+  let matched = 0;
+  const usedExpected = new Set<number>();
+
+  for (const tx of extracted) {
+    for (let i = 0; i < expected.length; i++) {
+      if (usedExpected.has(i)) continue;
+
+      const exp = expected[i];
+      const dateMatch = tx.date === exp.date;
+      const amountMatch = Math.abs(tx.amount - exp.amount) < 0.02;
+
+      if (dateMatch && amountMatch) {
+        matched++;
+        usedExpected.add(i);
+        break;
+      }
+    }
+  }
+
+  return {
+    matched,
+    total: expected.length,
+    accuracy: expected.length > 0 ? 
(matched / expected.length) * 100 : 0, + }; +} + +/** + * Find all test cases (PDF + JSON pairs) in .nogit/bankstatements/ + */ +function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> { + const testDir = path.join(process.cwd(), '.nogit/bankstatements'); + if (!fs.existsSync(testDir)) { + return []; + } + + const files = fs.readdirSync(testDir); + const pdfFiles = files.filter((f) => f.endsWith('.pdf')); + const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = []; + + for (const pdf of pdfFiles) { + const baseName = pdf.replace('.pdf', ''); + const jsonFile = `${baseName}.json`; + if (files.includes(jsonFile)) { + testCases.push({ + name: baseName, + pdfPath: path.join(testDir, pdf), + jsonPath: path.join(testDir, jsonFile), + }); + } + } + + testCases.sort((a, b) => a.name.localeCompare(b.name)); + return testCases; +} + +// Tests + +tap.test('setup: ensure Docker containers are running', async () => { + console.log('\n[Setup] Checking Docker containers...\n'); + + // Ensure PaddleOCR-VL Full Pipeline is running + const paddleOk = await ensurePaddleOcrVlFull(); + expect(paddleOk).toBeTrue(); + + // Ensure MiniCPM is running (for field extraction from Markdown) + const minicpmOk = await ensureMiniCpm(); + expect(minicpmOk).toBeTrue(); + + console.log('\n[Setup] All containers ready!\n'); +}); + +// Dynamic test for each PDF/JSON pair +const testCases = findTestCases(); +console.log(`\nFound ${testCases.length} bank statement test cases (PaddleOCR-VL Full Pipeline)\n`); + +const results: Array<{ name: string; accuracy: number; matched: number; total: number }> = []; + +for (const testCase of testCases) { + tap.test(`should extract bank statement: ${testCase.name}`, async () => { + // Load expected data + const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8')); + console.log(`\n=== ${testCase.name} ===`); + console.log(`Expected: ${expected.length} transactions`); + + const startTime = Date.now(); + + // Convert PDF to images + const images = convertPdfToImages(testCase.pdfPath); + console.log(` Pages: ${images.length}`); + + // Extract all transactions + const extracted = await extractAllTransactions(images); + + const endTime = Date.now(); + const elapsedMs = endTime - startTime; + + // Calculate accuracy + const accuracy = calculateAccuracy(extracted, expected); + results.push({ + name: testCase.name, + accuracy: accuracy.accuracy, + matched: accuracy.matched, + total: accuracy.total, + }); + + console.log(` Extracted: ${extracted.length} transactions`); + console.log(` Matched: ${accuracy.matched}/${accuracy.total} (${accuracy.accuracy.toFixed(1)}%)`); + console.log(` Time: ${(elapsedMs / 1000).toFixed(1)}s`); + + // We expect at least 50% accuracy + expect(accuracy.accuracy).toBeGreaterThan(50); + }); +} + +tap.test('summary', async () => { + const totalStatements = results.length; + const avgAccuracy = + results.length > 0 ? 
results.reduce((a, b) => a + b.accuracy, 0) / results.length : 0; + const totalMatched = results.reduce((a, b) => a + b.matched, 0); + const totalExpected = results.reduce((a, b) => a + b.total, 0); + + console.log(`\n======================================================`); + console.log(` Bank Statement Extraction Summary (PaddleOCR-VL Full)`); + console.log(`======================================================`); + console.log(` Method: PaddleOCR-VL Full Pipeline -> MiniCPM`); + console.log(` Statements: ${totalStatements}`); + console.log(` Transactions: ${totalMatched}/${totalExpected} matched`); + console.log(` Avg accuracy: ${avgAccuracy.toFixed(1)}%`); + console.log(`======================================================\n`); +}); + +export default tap.start(); diff --git a/test/test.invoices.combined.ts b/test/test.invoices.combined.ts index 8ef8cdc..9e6bf70 100644 --- a/test/test.invoices.combined.ts +++ b/test/test.invoices.combined.ts @@ -1,11 +1,19 @@ +/** + * Invoice extraction test using MiniCPM-V (visual) + PaddleOCR-VL (OCR augmentation) + * + * This is the combined approach that uses both models for best accuracy: + * - MiniCPM-V for visual understanding + * - PaddleOCR-VL for OCR text to augment prompts + */ import { tap, expect } from '@git.zone/tstest/tapbundle'; import * as fs from 'fs'; import * as path from 'path'; import { execSync } from 'child_process'; import * as os from 'os'; +import { ensurePaddleOcrVl, ensureMiniCpm } from './helpers/docker.js'; const OLLAMA_URL = 'http://localhost:11434'; -const MODEL = 'openbmb/minicpm-v4.5:q8_0'; +const MODEL = 'minicpm-v:latest'; const PADDLEOCR_VL_URL = 'http://localhost:8000'; interface IInvoice { @@ -358,11 +366,18 @@ function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: strin // Tests -tap.test('should connect to Ollama API', async () => { - const response = await fetch(`${OLLAMA_URL}/api/tags`); - expect(response.ok).toBeTrue(); - const data = await response.json(); - expect(data.models).toBeArray(); +tap.test('setup: ensure Docker containers are running', async () => { + console.log('\n[Setup] Checking Docker containers...\n'); + + // Ensure PaddleOCR-VL is running (auto-detects GPU/CPU) + const paddleOk = await ensurePaddleOcrVl(); + expect(paddleOk).toBeTrue(); + + // Ensure MiniCPM is running + const minicpmOk = await ensureMiniCpm(); + expect(minicpmOk).toBeTrue(); + + console.log('\n[Setup] All containers ready!\n'); }); tap.test('should have MiniCPM-V 4.5 model loaded', async () => { diff --git a/test/test.invoices.minicpm.ts b/test/test.invoices.minicpm.ts new file mode 100644 index 0000000..b3875d5 --- /dev/null +++ b/test/test.invoices.minicpm.ts @@ -0,0 +1,345 @@ +/** + * Invoice extraction test using MiniCPM-V only (visual extraction) + * + * This tests MiniCPM-V's ability to extract invoice data directly from images + * without any OCR augmentation. 
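+ *
+ * Each expected fixture in .nogit/invoices/ is an IInvoice JSON object, e.g.
+ * (illustrative): {"invoice_number":"INV-001","invoice_date":"2024-01-15",
+ * "vendor_name":"ACME","currency":"EUR","net_amount":100.00,"vat_amount":19.00,
+ * "total_amount":119.00}.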
+ */
+import { tap, expect } from '@git.zone/tstest/tapbundle';
+import * as fs from 'fs';
+import * as path from 'path';
+import { execSync } from 'child_process';
+import * as os from 'os';
+import { ensureMiniCpm } from './helpers/docker.js';
+
+const OLLAMA_URL = 'http://localhost:11434';
+const MODEL = 'minicpm-v:latest';
+
+interface IInvoice {
+  invoice_number: string;
+  invoice_date: string;
+  vendor_name: string;
+  currency: string;
+  net_amount: number;
+  vat_amount: number;
+  total_amount: number;
+}
+
+/**
+ * Build extraction prompt (MiniCPM-V only, no OCR augmentation)
+ */
+function buildPrompt(): string {
+  return `/nothink
+You are an invoice parser. Extract the following fields from this invoice:
+
+1. invoice_number: The invoice/receipt number
+2. invoice_date: Date in YYYY-MM-DD format
+3. vendor_name: Company that issued the invoice
+4. currency: EUR, USD, etc.
+5. net_amount: Amount before tax (if shown)
+6. vat_amount: Tax/VAT amount (if shown, 0 if reverse charge or no tax)
+7. total_amount: Final amount due
+
+Return ONLY valid JSON in this exact format:
+{"invoice_number":"XXX","invoice_date":"YYYY-MM-DD","vendor_name":"Company Name","currency":"EUR","net_amount":100.00,"vat_amount":19.00,"total_amount":119.00}
+
+If a field is not visible, use null for strings or 0 for numbers.
+No explanation, just the JSON object.`;
+}
+
+/**
+ * Convert PDF to PNG images using ImageMagick
+ */
+function convertPdfToImages(pdfPath: string): string[] {
+  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
+  const outputPattern = path.join(tempDir, 'page-%d.png');
+
+  try {
+    execSync(
+      `convert -density 200 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
+      { stdio: 'pipe' }
+    );
+
+    const files = fs.readdirSync(tempDir).filter((f) => f.endsWith('.png')).sort();
+    const images: string[] = [];
+
+    for (const file of files) {
+      const imagePath = path.join(tempDir, file);
+      const imageData = fs.readFileSync(imagePath);
+      images.push(imageData.toString('base64'));
+    }
+
+    return images;
+  } finally {
+    fs.rmSync(tempDir, { recursive: true, force: true });
+  }
+}
+
+/**
+ * Single extraction pass with MiniCPM-V
+ */
+async function extractOnce(images: string[], passNum: number): Promise<IInvoice> {
+  const payload = {
+    model: MODEL,
+    prompt: buildPrompt(),
+    images,
+    stream: true,
+    options: {
+      num_predict: 2048,
+      temperature: 0.1,
+    },
+  };
+
+  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(payload),
+  });
+
+  if (!response.ok) {
+    throw new Error(`Ollama API error: ${response.status}`);
+  }
+
+  const reader = response.body?.getReader();
+  if (!reader) {
+    throw new Error('No response body');
+  }
+
+  const decoder = new TextDecoder();
+  let fullText = '';
+
+  while (true) {
+    const { done, value } = await reader.read();
+    if (done) break;
+
+    const chunk = decoder.decode(value, { stream: true });
+    const lines = chunk.split('\n').filter((l) => l.trim());
+
+    for (const line of lines) {
+      try {
+        const json = JSON.parse(line);
+        if (json.response) {
+          fullText += json.response;
+        }
+      } catch {
+        // Skip invalid JSON lines
+      }
+    }
+  }
+
+  // Extract JSON from response
+  const startIdx = fullText.indexOf('{');
+  const endIdx = fullText.lastIndexOf('}') + 1;
+
+  if (startIdx < 0 || endIdx <= startIdx) {
+    throw new Error(`No JSON object found in response: ${fullText.substring(0, 200)}`);
+  }
+
+  const jsonStr = fullText.substring(startIdx, endIdx);
+  return JSON.parse(jsonStr);
+}
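+
+// Consensus below compares passes via key-field hashes like
+// "INV-001|2024-01-15|119.00" (illustrative), so variation in fields outside
+// the hash (e.g. vendor_name casing) does not block agreement.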
+
+/**
+ * Create a hash of invoice for comparison (using key fields)
+ */
+function hashInvoice(invoice: IInvoice): string {
+  return `${invoice.invoice_number}|${invoice.invoice_date}|${invoice.total_amount.toFixed(2)}`;
+}
+
+/**
+ * Extract with consensus voting using MiniCPM-V only
+ */
+async function extractWithConsensus(images: string[], invoiceName: string, maxPasses: number = 5): Promise<IInvoice> {
+  const results: Array<{ invoice: IInvoice; hash: string }> = [];
+  const hashCounts: Map<string, number> = new Map();
+
+  const addResult = (invoice: IInvoice, passLabel: string): number => {
+    const hash = hashInvoice(invoice);
+    results.push({ invoice, hash });
+    hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1);
+    console.log(`  [${passLabel}] ${invoice.invoice_number} | ${invoice.invoice_date} | ${invoice.total_amount} ${invoice.currency}`);
+    return hashCounts.get(hash)!;
+  };
+
+  for (let pass = 1; pass <= maxPasses; pass++) {
+    try {
+      const invoice = await extractOnce(images, pass);
+      const count = addResult(invoice, `Pass ${pass}`);
+
+      if (count >= 2) {
+        console.log(`  [Consensus] Reached after ${pass} passes`);
+        return invoice;
+      }
+    } catch (err) {
+      console.log(`  [Pass ${pass}] Error: ${err}`);
+    }
+  }
+
+  // No consensus reached - return the most common result
+  let bestHash = '';
+  let bestCount = 0;
+  for (const [hash, count] of hashCounts) {
+    if (count > bestCount) {
+      bestCount = count;
+      bestHash = hash;
+    }
+  }
+
+  if (!bestHash) {
+    throw new Error(`No valid results for ${invoiceName}`);
+  }
+
+  const best = results.find((r) => r.hash === bestHash)!;
+  console.log(`  [No consensus] Using most common result (${bestCount}/${maxPasses} passes)`);
+  return best.invoice;
+}
+
+/**
+ * Compare extracted invoice against expected
+ */
+function compareInvoice(
+  extracted: IInvoice,
+  expected: IInvoice
+): { match: boolean; errors: string[] } {
+  const errors: string[] = [];
+
+  // Compare invoice number (normalize by removing spaces and case)
+  const extNum = extracted.invoice_number?.replace(/\s/g, '').toLowerCase() || '';
+  const expNum = expected.invoice_number?.replace(/\s/g, '').toLowerCase() || '';
+  if (extNum !== expNum) {
+    errors.push(`invoice_number: expected "${expected.invoice_number}", got "${extracted.invoice_number}"`);
+  }
+
+  // Compare date
+  if (extracted.invoice_date !== expected.invoice_date) {
+    errors.push(`invoice_date: expected "${expected.invoice_date}", got "${extracted.invoice_date}"`);
+  }
+
+  // Compare total amount (with tolerance)
+  if (Math.abs(extracted.total_amount - expected.total_amount) > 0.02) {
+    errors.push(`total_amount: expected ${expected.total_amount}, got ${extracted.total_amount}`);
+  }
+
+  // Compare currency
+  if (extracted.currency?.toUpperCase() !== expected.currency?.toUpperCase()) {
+    errors.push(`currency: expected "${expected.currency}", got "${extracted.currency}"`);
+  }
+
+  return { match: errors.length === 0, errors };
+}
+
+/**
+ * Find all test cases (PDF + JSON pairs) in .nogit/invoices/
+ */
+function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
+  const testDir = path.join(process.cwd(), '.nogit/invoices');
+  if (!fs.existsSync(testDir)) {
+    return [];
+  }
+
+  const files = fs.readdirSync(testDir);
+  const pdfFiles = files.filter((f) => f.endsWith('.pdf'));
+  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];
+
+  for (const pdf of pdfFiles) {
+    const baseName = pdf.replace('.pdf', '');
+    const jsonFile = `${baseName}.json`;
+    if (files.includes(jsonFile)) {
+      testCases.push({
+        name: baseName,
+        pdfPath: path.join(testDir, pdf),
+        jsonPath: path.join(testDir, jsonFile),
+      });
+    }
+  }
+
+  // Sort alphabetically
+  testCases.sort((a, b) => a.name.localeCompare(b.name));
+
+  return testCases;
+}
+
+// Tests
+
+tap.test('setup: ensure Docker containers are running', async () => {
+  console.log('\n[Setup] Checking Docker containers...\n');
+
+  // Ensure MiniCPM is running
+  const minicpmOk = await ensureMiniCpm();
+  expect(minicpmOk).toBeTrue();
+
+  console.log('\n[Setup] All containers ready!\n');
+});
+
+tap.test('should have MiniCPM-V model loaded', async () => {
+  const response = await fetch(`${OLLAMA_URL}/api/tags`);
+  const data = await response.json();
+  const modelNames = data.models.map((m: { name: string }) => m.name);
+  // The model tag is now 'minicpm-v:latest', so match on the base name
+  // rather than the old 'minicpm-v4.5' tag
+  expect(modelNames.some((name: string) => name.includes('minicpm-v'))).toBeTrue();
+});
+
+// Dynamic test for each PDF/JSON pair
+const testCases = findTestCases();
+console.log(`\nFound ${testCases.length} invoice test cases (MiniCPM-V only)\n`);
+
+let passedCount = 0;
+let failedCount = 0;
+const processingTimes: number[] = [];
+
+for (const testCase of testCases) {
+  tap.test(`should extract invoice: ${testCase.name}`, async () => {
+    // Load expected data
+    const expected: IInvoice = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
+    console.log(`\n=== ${testCase.name} ===`);
+    console.log(`Expected: ${expected.invoice_number} | ${expected.invoice_date} | ${expected.total_amount} ${expected.currency}`);
+
+    const startTime = Date.now();
+
+    // Convert PDF to images
+    const images = convertPdfToImages(testCase.pdfPath);
+    console.log(`  Pages: ${images.length}`);
+
+    // Extract with consensus voting (MiniCPM-V only)
+    const extracted = await extractWithConsensus(images, testCase.name);
+
+    const endTime = Date.now();
+    const elapsedMs = endTime - startTime;
+    processingTimes.push(elapsedMs);
+
+    // Compare results
+    const result = compareInvoice(extracted, expected);
+
+    if (result.match) {
+      passedCount++;
+      console.log(`  Result: MATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
+    } else {
+      failedCount++;
+      console.log(`  Result: MISMATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
+      result.errors.forEach((e) => console.log(`    - ${e}`));
+    }
+
+    // Assert match
+    expect(result.match).toBeTrue();
+  });
+}
+
+tap.test('summary', async () => {
+  const totalInvoices = testCases.length;
+  const accuracy = totalInvoices > 0 ? (passedCount / totalInvoices) * 100 : 0;
+  const totalTimeMs = processingTimes.reduce((a, b) => a + b, 0);
+  const avgTimeMs = processingTimes.length > 0 ? totalTimeMs / processingTimes.length : 0;
+  const avgTimeSec = avgTimeMs / 1000;
+  const totalTimeSec = totalTimeMs / 1000;
+
+  console.log(`\n========================================`);
+  console.log(`  Invoice Extraction Summary (MiniCPM)`);
+  console.log(`========================================`);
+  console.log(`  Passed: ${passedCount}/${totalInvoices}`);
+  console.log(`  Failed: ${failedCount}/${totalInvoices}`);
+  console.log(`  Accuracy: ${accuracy.toFixed(1)}%`);
+  console.log(`----------------------------------------`);
+  console.log(`  Total time: ${totalTimeSec.toFixed(1)}s`);
+  console.log(`  Avg per inv: ${avgTimeSec.toFixed(1)}s`);
+  console.log(`========================================\n`);
+});
+
+export default tap.start();
diff --git a/test/test.invoices.paddleocr-vl.ts b/test/test.invoices.paddleocr-vl.ts
new file mode 100644
index 0000000..7c2ff31
--- /dev/null
+++ b/test/test.invoices.paddleocr-vl.ts
@@ -0,0 +1,393 @@
+/**
+ * Invoice extraction test using PaddleOCR-VL Full Pipeline
+ *
+ * This tests the complete PaddleOCR-VL pipeline:
+ * 1. PP-DocLayoutV2 for layout detection
+ * 2. PaddleOCR-VL for recognition
+ * 3. Structured Markdown output
+ * 4. MiniCPM extracts invoice fields from structured Markdown
+ *
+ * The structured Markdown has proper tables and formatting,
+ * making it much easier for MiniCPM to extract invoice data.
+ */
+import { tap, expect } from '@git.zone/tstest/tapbundle';
+import * as fs from 'fs';
+import * as path from 'path';
+import { execSync } from 'child_process';
+import * as os from 'os';
+import { ensurePaddleOcrVlFull, ensureMiniCpm } from './helpers/docker.js';
+
+const PADDLEOCR_VL_URL = 'http://localhost:8000';
+const OLLAMA_URL = 'http://localhost:11434';
+const MINICPM_MODEL = 'minicpm-v:latest';
+
+interface IInvoice {
+  invoice_number: string;
+  invoice_date: string;
+  vendor_name: string;
+  currency: string;
+  net_amount: number;
+  vat_amount: number;
+  total_amount: number;
+}
+
+/**
+ * Convert PDF to PNG images using ImageMagick
+ */
+function convertPdfToImages(pdfPath: string): string[] {
+  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
+  const outputPattern = path.join(tempDir, 'page-%d.png');
+
+  try {
+    execSync(
+      `convert -density 200 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
+      { stdio: 'pipe' }
+    );
+
+    // Numeric-aware sort so page-10.png follows page-9.png
+    const files = fs
+      .readdirSync(tempDir)
+      .filter((f) => f.endsWith('.png'))
+      .sort((a, b) => a.localeCompare(b, undefined, { numeric: true }));
+    const images: string[] = [];
+
+    for (const file of files) {
+      const imagePath = path.join(tempDir, file);
+      const imageData = fs.readFileSync(imagePath);
+      images.push(imageData.toString('base64'));
+    }
+
+    return images;
+  } finally {
+    fs.rmSync(tempDir, { recursive: true, force: true });
+  }
+}
+
+/**
+ * Parse document using PaddleOCR-VL Full Pipeline (returns structured Markdown)
+ */
+async function parseDocument(imageBase64: string): Promise<string> {
+  const response = await fetch(`${PADDLEOCR_VL_URL}/parse`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify({
+      image: imageBase64,
+      output_format: 'markdown',
+    }),
+  });
+
+  if (!response.ok) {
+    const text = await response.text();
+    throw new Error(`PaddleOCR-VL API error: ${response.status} - ${text}`);
+  }
+
+  const data = await response.json();
+
+  if (!data.success) {
+    throw new Error(`PaddleOCR-VL error: ${data.error}`);
+  }
+
+  return data.result?.markdown || '';
+}
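+
+// Shape of a successful /parse response as consumed above (illustrative
+// values; the exact payload is defined by paddleocr_vl_full_server.py):
+//   { "success": true, "result": { "markdown": "# Invoice\n| Item | Qty |..." } }
+// and on failure: { "success": false, "error": "<message>" }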
+
+/**
+ * Extract invoice fields from structured Markdown using MiniCPM with image context
+ */
+async function extractInvoiceFromMarkdown(markdown: string, images: string[]): Promise<IInvoice> {
+  // Truncate if too long
+  const truncated = markdown.length > 8000 ? markdown.slice(0, 8000) : markdown;
+  console.log(`  [Extract] Processing ${truncated.length} chars of Markdown`);
+
+  const prompt = `/nothink
+You are an invoice parser. Extract fields from this invoice image.
+
+Required fields:
+- invoice_number: The invoice/receipt number
+- invoice_date: Date in YYYY-MM-DD format
+- vendor_name: Company that issued the invoice
+- currency: EUR, USD, etc.
+- net_amount: Amount before tax
+- vat_amount: Tax/VAT amount (0 if reverse charge)
+- total_amount: Final amount due
+
+Return ONLY a JSON object like:
+{"invoice_number":"123","invoice_date":"2022-01-28","vendor_name":"Adobe","currency":"EUR","net_amount":24.99,"vat_amount":0,"total_amount":24.99}
+
+Use null for missing strings, 0 for missing numbers. No explanation.
+
+OCR text from the invoice (for reference):
+---
+${truncated}
+---`;
+
+  const payload = {
+    model: MINICPM_MODEL,
+    prompt,
+    images, // Send the actual image to MiniCPM
+    stream: true,
+    options: {
+      num_predict: 2048,
+      temperature: 0.1,
+    },
+  };
+
+  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(payload),
+  });
+
+  if (!response.ok) {
+    throw new Error(`Ollama API error: ${response.status}`);
+  }
+
+  const reader = response.body?.getReader();
+  if (!reader) {
+    throw new Error('No response body');
+  }
+
+  const decoder = new TextDecoder();
+  let fullText = '';
+  let buffer = '';
+
+  while (true) {
+    const { done, value } = await reader.read();
+    if (done) break;
+
+    // Buffer chunks: an NDJSON line may be split across reads, so only
+    // parse complete lines and keep the trailing partial for the next chunk
+    buffer += decoder.decode(value, { stream: true });
+    const lines = buffer.split('\n');
+    buffer = lines.pop() ?? '';
+
+    for (const line of lines) {
+      if (!line.trim()) continue;
+      try {
+        const json = JSON.parse(line);
+        if (json.response) {
+          fullText += json.response;
+        }
+      } catch {
+        // Skip invalid JSON lines
+      }
+    }
+  }
+
+  // Extract JSON from response
+  const startIdx = fullText.indexOf('{');
+  const endIdx = fullText.lastIndexOf('}') + 1;
+
+  if (startIdx < 0 || endIdx <= startIdx) {
+    throw new Error(`No JSON object found in response: ${fullText.substring(0, 200)}`);
+  }
+
+  const jsonStr = fullText.substring(startIdx, endIdx);
+  return JSON.parse(jsonStr);
+}
+
+/**
+ * Single extraction pass: Parse with PaddleOCR-VL Full, extract with MiniCPM
+ */
+async function extractOnce(images: string[], passNum: number): Promise<IInvoice> {
+  // Parse the first page with the full pipeline (invoices are assumed to be
+  // single-page; all page images are still passed to MiniCPM below)
+  const markdown = await parseDocument(images[0]);
+  console.log(`  [Parse] Got ${markdown.split('\n').length} lines of Markdown`);
+
+  // Extract invoice fields from Markdown with image context
+  return extractInvoiceFromMarkdown(markdown, images);
+}
+
+/**
+ * Create a hash of invoice for comparison (using key fields)
+ */
+function hashInvoice(invoice: IInvoice): string {
+  return `${invoice.invoice_number}|${invoice.invoice_date}|${invoice.total_amount.toFixed(2)}`;
+}
+
+/**
+ * Extract with consensus voting
+ */
+async function extractWithConsensus(images: string[], invoiceName: string, maxPasses: number = 5): Promise<IInvoice> {
+  const results: Array<{ invoice: IInvoice; hash: string }> = [];
+  const hashCounts: Map<string, number> = new Map();
+
+  const addResult = (invoice: IInvoice, passLabel: string): number => {
+    const hash = hashInvoice(invoice);
+    results.push({ invoice, hash });
+    hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1);
+    console.log(`  [${passLabel}] ${invoice.invoice_number} | ${invoice.invoice_date} | ${invoice.total_amount} ${invoice.currency}`);
+    return hashCounts.get(hash)!;
+  };
+
+  for (let pass = 1; pass <= maxPasses; pass++) {
+    try {
+      const invoice = await extractOnce(images, pass);
+      const count = addResult(invoice, `Pass ${pass}`);
+
+      if (count >= 2) {
+        console.log(`  [Consensus] Reached after ${pass} passes`);
+        return invoice;
+      }
+    } catch (err) {
+      console.log(`  [Pass ${pass}] Error: ${err}`);
+    }
+  }
+
+  // No consensus reached - return the most common result
+  let bestHash = '';
+  let bestCount = 0;
+  for (const [hash, count] of hashCounts) {
+    if (count > bestCount) {
+      bestCount = count;
+      bestHash = hash;
+    }
+  }
+
+  if (!bestHash) {
+    throw new Error(`No valid results for ${invoiceName}`);
+  }
+
+  const best = results.find((r) => r.hash === bestHash)!;
+  console.log(`  [No consensus] Using most common result (${bestCount}/${maxPasses} passes)`);
+  return best.invoice;
+}
+
+/**
+ * Compare extracted invoice against expected
+ */
+function compareInvoice(
+  extracted: IInvoice,
+  expected: IInvoice
+): { match: boolean; errors: string[] } {
+  const errors: string[] = [];
+
+  // Compare invoice number (normalize by removing spaces and case)
+  const extNum = extracted.invoice_number?.replace(/\s/g, '').toLowerCase() || '';
+  const expNum = expected.invoice_number?.replace(/\s/g, '').toLowerCase() || '';
+  if (extNum !== expNum) {
+    errors.push(`invoice_number: expected "${expected.invoice_number}", got "${extracted.invoice_number}"`);
+  }
+
+  // Compare date
+  if (extracted.invoice_date !== expected.invoice_date) {
+    errors.push(`invoice_date: expected "${expected.invoice_date}", got "${extracted.invoice_date}"`);
+  }
+
+  // Compare total amount (with tolerance)
+  if (Math.abs(extracted.total_amount - expected.total_amount) > 0.02) {
+    errors.push(`total_amount: expected ${expected.total_amount}, got ${extracted.total_amount}`);
+  }
+
+  // Compare currency
+  if (extracted.currency?.toUpperCase() !== expected.currency?.toUpperCase()) {
+    errors.push(`currency: expected "${expected.currency}", got "${extracted.currency}"`);
+  }
+
+  return { match: errors.length === 0, errors };
+}
+
+/**
+ * Find all test cases (PDF + JSON pairs) in .nogit/invoices/
+ */
+function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
+  const testDir = path.join(process.cwd(), '.nogit/invoices');
+  if (!fs.existsSync(testDir)) {
+    return [];
+  }
+
+  const files = fs.readdirSync(testDir);
+  const pdfFiles = files.filter((f) => f.endsWith('.pdf'));
+  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];
+
+  for (const pdf of pdfFiles) {
+    const baseName = pdf.replace('.pdf', '');
+    const jsonFile = `${baseName}.json`;
+    if (files.includes(jsonFile)) {
+      testCases.push({
+        name: baseName,
+        pdfPath: path.join(testDir, pdf),
+        jsonPath: path.join(testDir, jsonFile),
+      });
+    }
+  }
+
+  // Sort alphabetically
+  testCases.sort((a, b) => a.name.localeCompare(b.name));
+
+  return testCases;
+}
+
+// Tests
+
+tap.test('setup: ensure Docker containers are running', async () => {
+  console.log('\n[Setup] Checking Docker containers...\n');
+
+  // Ensure PaddleOCR-VL Full Pipeline is running
+  const paddleOk = await ensurePaddleOcrVlFull();
+  expect(paddleOk).toBeTrue();
+
+  // Ensure MiniCPM is running (for field extraction from Markdown)
+  const minicpmOk = await ensureMiniCpm();
+  expect(minicpmOk).toBeTrue();
+
+  console.log('\n[Setup] All containers ready!\n');
+});
+
+// Dynamic test for each PDF/JSON pair
+const testCases = findTestCases();
+console.log(`\nFound ${testCases.length} invoice test cases (PaddleOCR-VL Full Pipeline)\n`);
+
+let passedCount = 0;
+let failedCount = 0;
+const processingTimes: number[] = [];
+
+for (const testCase of testCases) {
+  tap.test(`should extract invoice: ${testCase.name}`, async () => {
+    // Load expected data
+    const expected: IInvoice = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
+    console.log(`\n=== ${testCase.name} ===`);
+    console.log(`Expected: ${expected.invoice_number} | ${expected.invoice_date} | ${expected.total_amount} ${expected.currency}`);
+
+    const startTime = Date.now();
+
+    // Convert PDF to images
+    const images = convertPdfToImages(testCase.pdfPath);
+    console.log(`  Pages: ${images.length}`);
+
+    // Extract with consensus voting (PaddleOCR-VL Full -> MiniCPM)
+    const extracted = await extractWithConsensus(images, testCase.name);
+
+    const endTime = Date.now();
+    const elapsedMs = endTime - startTime;
+    processingTimes.push(elapsedMs);
+
+    // Compare results
+    const result = compareInvoice(extracted, expected);
+
+    if (result.match) {
+      passedCount++;
+      console.log(`  Result: MATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
+    } else {
+      failedCount++;
+      console.log(`  Result: MISMATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
+      result.errors.forEach((e) => console.log(`    - ${e}`));
+    }
+
+    // Assert match
+    expect(result.match).toBeTrue();
+  });
+}
+
+tap.test('summary', async () => {
+  const totalInvoices = testCases.length;
+  const accuracy = totalInvoices > 0 ? (passedCount / totalInvoices) * 100 : 0;
+  const totalTimeMs = processingTimes.reduce((a, b) => a + b, 0);
+  const avgTimeMs = processingTimes.length > 0 ? totalTimeMs / processingTimes.length : 0;
+  const avgTimeSec = avgTimeMs / 1000;
+  const totalTimeSec = totalTimeMs / 1000;
+
+  console.log(`\n======================================================`);
+  console.log(`  Invoice Extraction Summary (PaddleOCR-VL Full)`);
+  console.log(`======================================================`);
+  console.log(`  Method: PaddleOCR-VL Full Pipeline -> MiniCPM`);
+  console.log(`  Passed: ${passedCount}/${totalInvoices}`);
+  console.log(`  Failed: ${failedCount}/${totalInvoices}`);
+  console.log(`  Accuracy: ${accuracy.toFixed(1)}%`);
+  console.log(`------------------------------------------------------`);
+  console.log(`  Total time: ${totalTimeSec.toFixed(1)}s`);
+  console.log(`  Avg per inv: ${avgTimeSec.toFixed(1)}s`);
+  console.log(`======================================================\n`);
+});
+
+export default tap.start();
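+
+// Example ground-truth file (hypothetical values) expected at
+// .nogit/invoices/<name>.json next to the matching <name>.pdf; the fields
+// mirror the IInvoice interface above:
+// {
+//   "invoice_number": "INV-2024-001",
+//   "invoice_date": "2024-03-15",
+//   "vendor_name": "Example GmbH",
+//   "currency": "EUR",
+//   "net_amount": 100.0,
+//   "vat_amount": 19.0,
+//   "total_amount": 119.0
+// }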