v1.6.0

feat(paddleocr-vl): add PaddleOCR-VL full pipeline Docker image and API server, plus integration tests and docker helpers
2026-01-17 20:22:23 +00:00 · 2026-01-17 20:22:23 +00:00
13 changed files with 2415 additions and 22 deletions
--- a/90
+++ b/90
@@ -0,0 +1,90 @@
 # PaddleOCR-VL Full Pipeline (PP-DocLayoutV2 + PaddleOCR-VL + Structured Output)
 # Self-contained GPU image with complete document parsing pipeline
 FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
 LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
 LABEL description="PaddleOCR-VL Full Pipeline - Layout Detection + VL Recognition + JSON/Markdown Output"
 LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
 # Environment configuration
 ENV DEBIAN_FRONTEND=noninteractive
 ENV PYTHONUNBUFFERED=1
 ENV HF_HOME=/root/.cache/huggingface
 ENV PADDLEOCR_HOME=/root/.paddleocr
 ENV SERVER_PORT=8000
 ENV SERVER_HOST=0.0.0.0
 # NOTE(review): VLM_PORT does not appear to be read by the entrypoint or the
 # API server (both use local transformers inference); kept for compatibility.
 ENV VLM_PORT=8080
 # Set working directory
 WORKDIR /app
 # Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
    python3.11 \
    python3.11-venv \
    python3.11-dev \
    python3-pip \
    libgl1-mesa-glx \
    libglib2.0-0 \
    libgomp1 \
    libsm6 \
    libxext6 \
    libxrender1 \
    curl \
    git \
    wget \
    && rm -rf /var/lib/apt/lists/* \
    && update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
 # Create and activate virtual environment
 RUN python -m venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"
 # Upgrade pip
 RUN pip install --no-cache-dir --upgrade pip setuptools wheel
 # Install PaddlePaddle GPU (CUDA 12.x)
 RUN pip install --no-cache-dir \
    paddlepaddle-gpu==3.2.1 \
    --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
 # Install PaddleOCR with doc-parser (includes PP-DocLayoutV2)
 RUN pip install --no-cache-dir \
    "paddleocr[doc-parser]" \
    safetensors
 # Install PyTorch with CUDA support
 RUN pip install --no-cache-dir \
    torch==2.5.1 \
    torchvision \
    --index-url https://download.pytorch.org/whl/cu124
 # Install transformers for PaddleOCR-VL inference (no vLLM - use local inference)
 # PaddleOCR-VL requires transformers>=4.55.0 for use_kernel_forward_from_hub
 # The specifier MUST be quoted: in shell-form RUN an unquoted ">" is an output
 # redirection, which would install an unconstrained "transformers" and write
 # pip's output to a file named "=4.55.0".
 RUN pip install --no-cache-dir \
    "transformers>=4.55.0" \
    accelerate \
    hf-kernels
 # Install our API server dependencies
 # "uvicorn[standard]" is quoted so the shell never treats [..] as a glob pattern
 RUN pip install --no-cache-dir \
    fastapi \
    "uvicorn[standard]" \
    python-multipart \
    httpx \
    pillow
 # Copy server files
 COPY image_support_files/paddleocr_vl_full_server.py /app/server.py
 COPY image_support_files/paddleocr_vl_full_entrypoint.sh /usr/local/bin/entrypoint.sh
 RUN chmod +x /usr/local/bin/entrypoint.sh
 # Expose the API port (the transformers backend runs in-process, so no
 # separate VLM server port is published)
 EXPOSE 8000
 # Health check
 HEALTHCHECK --interval=30s --timeout=10s --start-period=600s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1
 ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
--- a/changelog.md
+++ b/changelog.md
@@ -1,5 +1,15 @@
 # Changelog
 ## 2026-01-17 - 1.6.0 - feat(paddleocr-vl)
 add PaddleOCR-VL full pipeline Docker image and API server, plus integration tests and docker helpers
 - Add Dockerfile_paddleocr_vl_full and entrypoint script to build a GPU-enabled image with PP-DocLayoutV2 + PaddleOCR-VL and a FastAPI server
 - Introduce image_support_files/paddleocr_vl_full_server.py implementing the full pipeline API (/parse, OpenAI-compatible /v1/chat/completions) and a /formats endpoint
 - Improve image handling: decode_image supports data URLs, HTTP(S), raw base64 and file paths; add optimize_image_resolution to auto-scale images into the recommended 1080-2048px range
 - Add test helpers (test/helpers/docker.ts) to build/start/health-check Docker images and new ensurePaddleOcrVlFull workflow
 - Add comprehensive integration tests for bank statements and invoices (MiniCPM and PaddleOCR-VL variants) and update tests to ensure required containers are running before tests
 - Switch MiniCPM model references to 'minicpm-v:latest' and increase health/timeout expectations for the full pipeline
 ## 2026-01-17 - 1.5.0 - feat(paddleocr-vl)
 add PaddleOCR-VL GPU Dockerfile, pin vllm, update CPU image deps, and improve entrypoint and tests
--- a/image_support_files/paddleocr_vl_full_entrypoint.sh
+++ b/image_support_files/paddleocr_vl_full_entrypoint.sh
@@ -0,0 +1,12 @@
 #!/bin/bash
 set -e
 echo "Starting PaddleOCR-VL Full Pipeline Server (Transformers backend)..."
 # Environment: fall back to the defaults when a variable is unset or empty
 : "${SERVER_PORT:=8000}"
 : "${SERVER_HOST:=0.0.0.0}"
 # Launch the API server in place of this shell (no vLLM - inference runs
 # in-process through transformers)
 echo "Starting API server on port $SERVER_PORT..."
 exec python /app/server.py
--- a/image_support_files/paddleocr_vl_full_server.py
+++ b/image_support_files/paddleocr_vl_full_server.py
@@ -0,0 +1,443 @@
 #!/usr/bin/env python3
 """
 PaddleOCR-VL Full Pipeline API Server (Transformers backend)
 Provides REST API for document parsing using:
 - PP-DocLayoutV2 for layout detection
 - PaddleOCR-VL (transformers) for recognition
 - Structured JSON/Markdown output
 """
 import os
 import io
 import base64
 import logging
 import tempfile
 import time
 import json
 from typing import Optional, List, Union
 from pathlib import Path
 from fastapi import FastAPI, HTTPException, UploadFile, File, Form
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
 from PIL import Image
 import torch
 # Configure logging
 # Root logger at INFO with timestamp, logger name and level on every record.
 logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
 # Environment configuration
 # Bind address and port are read from the environment once, at import time.
 SERVER_HOST = os.environ.get('SERVER_HOST', '0.0.0.0')
 SERVER_PORT = int(os.environ.get('SERVER_PORT', '8000'))
 # Identifier passed to transformers' from_pretrained() in load_vl_model().
 MODEL_NAME = "PaddlePaddle/PaddleOCR-VL"
 # Device configuration
 # Use the GPU whenever torch reports CUDA as available, otherwise the CPU.
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 logger.info(f"Using device: {DEVICE}")
 # Task prompts
 # Maps a task name to the text prompt sent to the model next to the image.
 TASK_PROMPTS = {
    "ocr": "OCR:",
    "table": "Table Recognition:",
    "formula": "Formula Recognition:",
    "chart": "Chart Recognition:",
 }
 # Initialize FastAPI app
 app = FastAPI(
    title="PaddleOCR-VL Full Pipeline Server",
    description="Document parsing with PP-DocLayoutV2 + PaddleOCR-VL (transformers)",
    version="1.0.0"
 )
 # Global model instances
 # All three stay None until load_vl_model() / load_layout_model() populate them.
 vl_model = None
 vl_processor = None
 layout_model = None
 def load_vl_model():
    """Populate the global PaddleOCR-VL processor and model on first use.

    Returns immediately when the model is already loaded. On CUDA the model is
    loaded in bfloat16 and moved to the GPU; otherwise it is loaded in float32
    with ``low_cpu_mem_usage`` enabled. The model is switched to eval mode.
    """
    global vl_model, vl_processor
    if vl_model is not None:
        return
    logger.info(f"Loading PaddleOCR-VL model: {MODEL_NAME}")
    from transformers import AutoModelForCausalLM, AutoProcessor
    vl_processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
    on_gpu = DEVICE == "cuda"
    load_kwargs = {
        "trust_remote_code": True,
        "torch_dtype": torch.bfloat16 if on_gpu else torch.float32,
    }
    if not on_gpu:
        load_kwargs["low_cpu_mem_usage"] = True
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_kwargs)
    if on_gpu:
        model = model.to(DEVICE)
    # Publish the global only once the model is fully prepared
    vl_model = model.eval()
    logger.info("PaddleOCR-VL model loaded successfully")
 def load_layout_model():
    """Populate the global paddleocr ``LayoutDetection`` instance on first use.

    Does nothing when the model is already loaded. Any exception raised while
    importing or constructing the model is logged and swallowed, leaving
    ``layout_model`` as ``None`` so the pipeline can run without layout
    detection.
    """
    global layout_model
    if layout_model is None:
        try:
            logger.info("Loading LayoutDetection model (PP-DocLayout_plus-L)...")
            from paddleocr import LayoutDetection
            layout_model = LayoutDetection()
            logger.info("LayoutDetection model loaded successfully")
        except Exception as load_error:
            logger.warning(f"Could not load LayoutDetection: {load_error}")
            logger.info("Falling back to VL-only mode (no layout detection)")
 def _extract_assistant_reply(decoded: str) -> str:
    """Return the text following the first 'Assistant:' marker, stripped.

    The decoded sequence has the shape "User: <prompt>\\nAssistant: <content>".
    Splitting at the FIRST marker keeps the whole reply even when the
    recognized text itself contains "Assistant:" (splitting at the last marker
    would silently drop everything before it). The input is returned unchanged
    when no marker is present.
    """
    import re
    parts = re.split(r'[Aa]ssistant:', decoded, maxsplit=1)
    if len(parts) > 1:
        return parts[1].strip()
    return decoded
 def recognize_element(image: Image.Image, task: str = "ocr") -> str:
    """Recognize a single element using PaddleOCR-VL.

    Args:
        image: Cropped region (or full page) to recognize.
        task: Key into TASK_PROMPTS; unknown tasks fall back to "ocr".

    Returns:
        The model's reply text with the chat-template prefix removed.
    """
    load_vl_model()
    prompt = TASK_PROMPTS.get(task, TASK_PROMPTS["ocr"])
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ]
        }
    ]
    inputs = vl_processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    )
    if DEVICE == "cuda":
        # Input tensors must live on the same device as the model
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    with torch.inference_mode():
        outputs = vl_model.generate(
            **inputs,
            max_new_tokens=4096,
            do_sample=False,
            use_cache=True
        )
    response = vl_processor.batch_decode(outputs, skip_special_tokens=True)[0]
    # Keep only the assistant's part of the decoded conversation
    return _extract_assistant_reply(response)
 def detect_layout(image: Image.Image) -> List[dict]:
    """Detect layout regions in the image.

    Returns a list of ``{"type", "bbox", "score"}`` dicts sorted top to bottom
    by the bbox's second coordinate. When no layout model is available, or the
    model reports no boxes, a single "text" region covering the whole image is
    returned instead.
    """
    load_layout_model()
    whole_page = {
        "type": "text",
        "bbox": [0, 0, image.width, image.height],
        "score": 1.0
    }
    if layout_model is None:
        # No layout model - treat the whole image as one region
        return [whole_page]
    # Reserve a temp path; the handle is closed before anything writes to it
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
        tmp_path = tmp.name
    try:
        # Saving happens inside the try so the temp file is removed even when
        # the image cannot be written
        image.save(tmp_path, "PNG")
        results = layout_model.predict(tmp_path)
        regions = []
        for res in results:
            # LayoutDetection returns boxes in 'boxes' key
            for box in res.get("boxes", []):
                coord = box.get("coordinate", [0, 0, image.width, image.height])
                regions.append({
                    "type": box.get("label", "text"),
                    # Convert numpy floats to regular floats
                    "bbox": [float(c) for c in coord],
                    "score": float(box.get("score", 1.0))
                })
        # Sort regions by vertical position (top to bottom)
        regions.sort(key=lambda r: r["bbox"][1])
        return regions if regions else [whole_page]
    finally:
        os.unlink(tmp_path)
 def _task_for_region(region_type: str) -> str:
    """Map a lower-cased layout label to the recognition task used for it."""
    if "table" in region_type:
        return "table"
    if "formula" in region_type or "math" in region_type:
        return "formula"
    if "chart" in region_type or "figure" in region_type:
        return "chart"
    return "ocr"
 def process_document(image: Image.Image) -> dict:
    """Process a document through the full pipeline.

    Runs layout detection, then recognizes every region independently. A
    region that fails (bad bbox, empty crop, recognition error) is reported as
    a block with empty ``content`` and an ``error`` message; it never aborts
    the remaining regions.

    Returns:
        ``{"blocks": [...], "image_size": [width, height]}`` where each block
        has ``index``, ``type``, ``bbox``, ``content`` and ``task`` keys.
    """
    logger.info(f"Processing document: {image.size}")
    # Step 1: Detect layout
    regions = detect_layout(image)
    logger.info(f"Detected {len(regions)} layout regions")
    # Step 2: Recognize each region
    blocks = []
    for i, region in enumerate(regions):
        region_type = region["type"].lower()
        bbox = region["bbox"]
        task = _task_for_region(region_type)
        try:
            # Clamp the box to the image so the crop never includes padding
            # from outside the page
            x1, y1, x2, y2 = [int(c) for c in bbox]
            x1, x2 = max(0, x1), min(image.width, x2)
            y1, y2 = max(0, y1), min(image.height, y2)
            if x2 <= x1 or y2 <= y1:
                raise ValueError(f"Region is empty after clamping to the image: {bbox}")
            region_image = image.crop((x1, y1, x2, y2))
            content = recognize_element(region_image, task)
            blocks.append({
                "index": i,
                "type": region_type,
                "bbox": bbox,
                "content": content,
                "task": task
            })
            logger.info(f"  Region {i} ({region_type}): {len(content)} chars")
        except Exception as e:
            logger.error(f"  Region {i} error: {e}")
            blocks.append({
                "index": i,
                "type": region_type,
                "bbox": bbox,
                "content": "",
                # Same keys as successful blocks, plus the error message
                "task": task,
                "error": str(e)
            })
    return {"blocks": blocks, "image_size": list(image.size)}
 def result_to_markdown(result: dict) -> str:
    """Render the blocks of a pipeline result as one Markdown string.

    Table blocks are wrapped in blank lines, formula blocks in ``$$`` fences,
    and every other block is emitted as-is. Blocks are joined with a blank
    line between them.
    """
    def render(block: dict) -> str:
        kind = block.get("type", "text").lower()
        text = block.get("content", "")
        if "table" in kind:
            return f"\n{text}\n"
        if "formula" in kind:
            return f"\n$$\n{text}\n$$\n"
        return text
    return "\n\n".join(render(block) for block in result.get("blocks", []))
 # Request/Response models
 class ParseRequest(BaseModel):
    # Request body of the POST /parse endpoint.
    image: str  # base64 encoded image (raw base64 or a "data:" URL, see decode_image)
    # "markdown" selects Markdown output; any other value yields the JSON result.
    output_format: Optional[str] = "json"
 class ParseResponse(BaseModel):
    # Response body of the POST /parse endpoint.
    success: bool  # False when processing raised; details are then in `error`
    format: str  # echoes the requested output format
    # Pipeline result dict, or {"markdown": ...} for Markdown output; {} on failure
    result: Union[dict, str]
    processing_time: float  # seconds spent processing (0 on failure)
    error: Optional[str] = None  # exception message when success is False
 def decode_image(image_source: str) -> Image.Image:
    """Decode an RGB image from raw base64 or a base64 ``data:`` URL.

    Args:
        image_source: Either ``data:<mime>;base64,<payload>`` or the bare
            base64 payload.

    Returns:
        The decoded image converted to RGB.

    Raises:
        ValueError: If the data URL has no ``,`` separator, the payload is not
            valid base64, or the decoded bytes are not a readable image.
    """
    if image_source.startswith("data:"):
        _header, separator, payload = image_source.partition(",")
        if not separator:
            raise ValueError("Malformed data URL: missing ',' before the base64 payload")
    else:
        payload = image_source
    try:
        image_data = base64.b64decode(payload)
    except ValueError as exc:  # binascii.Error is a ValueError subclass
        raise ValueError(f"Image data is not valid base64: {exc}") from exc
    try:
        return Image.open(io.BytesIO(image_data)).convert("RGB")
    except OSError as exc:  # includes PIL's UnidentifiedImageError
        raise ValueError(f"Decoded data is not a readable image: {exc}") from exc
@app.on_event("startup")
 async def startup_event():
    """Pre-load models on startup"""
    logger.info("Starting PaddleOCR-VL Full Pipeline Server...")
    try:
        load_vl_model()
        load_layout_model()
        logger.info("Models loaded successfully")
    except Exception as e:
        logger.error(f"Failed to pre-load models: {e}")
@app.get("/health")
 async def health_check():
    """Health check endpoint"""
    return {
        "status": "healthy" if vl_model is not None else "loading",
        "service": "PaddleOCR-VL Full Pipeline (Transformers)",
        "device": DEVICE,
        "vl_model_loaded": vl_model is not None,
        "layout_model_loaded": layout_model is not None
    }
@app.get("/formats")
 async def supported_formats():
    """List supported output formats"""
    return {
        "output_formats": ["json", "markdown"],
        "image_formats": ["PNG", "JPEG", "WebP", "BMP", "GIF", "TIFF"],
        "capabilities": [
            "Layout detection (PP-DocLayoutV2)",
            "Text recognition (OCR)",
            "Table recognition",
            "Formula recognition (LaTeX)",
            "Chart recognition",
            "Multi-language support (109 languages)"
        ]
    }
@app.post("/parse", response_model=ParseResponse)
 async def parse_document_endpoint(request: ParseRequest):
    """Parse a document image and return structured output"""
    try:
        start_time = time.time()
        image = decode_image(request.image)
        result = process_document(image)
        if request.output_format == "markdown":
            markdown = result_to_markdown(result)
            output = {"markdown": markdown}
        else:
            output = result
        elapsed = time.time() - start_time
        logger.info(f"Processing complete in {elapsed:.2f}s")
        return ParseResponse(
            success=True,
            format=request.output_format,
            result=output,
            processing_time=elapsed
        )
    except Exception as e:
        logger.error(f"Error processing document: {e}", exc_info=True)
        return ParseResponse(
            success=False,
            format=request.output_format,
            result={},
            processing_time=0,
            error=str(e)
        )
@app.post("/v1/chat/completions")
 async def chat_completions(request: dict):
    """OpenAI-compatible chat completions endpoint"""
    try:
        messages = request.get("messages", [])
        output_format = request.get("output_format", "json")
        # Find user message with image
        image = None
        for msg in reversed(messages):
            if msg.get("role") == "user":
                content = msg.get("content", [])
                if isinstance(content, list):
                    for item in content:
                        if item.get("type") == "image_url":
                            url = item.get("image_url", {}).get("url", "")
                            image = decode_image(url)
                            break
                break
        if image is None:
            raise HTTPException(status_code=400, detail="No image provided")
        start_time = time.time()
        result = process_document(image)
        if output_format == "markdown":
            content = result_to_markdown(result)
        else:
            content = json.dumps(result, ensure_ascii=False, indent=2)
        elapsed = time.time() - start_time
        return {
            "id": f"chatcmpl-{int(time.time()*1000)}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": "paddleocr-vl-full",
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": content},
                "finish_reason": "stop"
            }],
            "usage": {
                "prompt_tokens": 100,
                "completion_tokens": len(content) // 4,
                "total_tokens": 100 + len(content) // 4
            },
            "processing_time": elapsed
        }
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error in chat completions: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
 # Script entry point: serve the app with uvicorn on the configured host/port.
 if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT)
--- a/image_support_files/paddleocr_vl_server.py
+++ b/image_support_files/paddleocr_vl_server.py
@@ -136,27 +136,82 @@ def load_model():
    logger.info("PaddleOCR-VL model loaded successfully")
-def decode_image(image_source: str) -> Image.Image:
+def optimize_image_resolution(image: Image.Image, max_size: int = 2048, min_size: int = 1080) -> Image.Image:
-    """Decode image from URL or base64"""
+    """
    Optimize image resolution for PaddleOCR-VL.
    Best results are achieved with images in the 1080p-2K range.
    - Images larger than max_size are scaled down
    - Very small images are scaled up to min_size
    """
    width, height = image.size
    max_dim = max(width, height)
    min_dim = min(width, height)
    # Scale down if too large (4K+ images often miss text)
    if max_dim > max_size:
        scale = max_size / max_dim
        new_width = int(width * scale)
        new_height = int(height * scale)
        logger.info(f"Scaling down image from {width}x{height} to {new_width}x{new_height}")
        image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
    # Scale up if too small
    elif max_dim < min_size and min_dim < min_size:
        scale = min_size / max_dim
        new_width = int(width * scale)
        new_height = int(height * scale)
        logger.info(f"Scaling up image from {width}x{height} to {new_width}x{new_height}")
        image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
    else:
        logger.info(f"Image size {width}x{height} is optimal, no scaling needed")
    return image
 def decode_image(image_source: str, optimize: bool = True) -> Image.Image:
    """
    Decode image from various sources.
    Supported formats:
    - Base64 data URL: data:image/png;base64,... or data:image/jpeg;base64,...
    - HTTP/HTTPS URL: https://example.com/image.png
    - Raw base64 string
    - Local file path
    Supported image types: PNG, JPEG, WebP, BMP, GIF, TIFF
    """
    image = None
    if image_source.startswith("data:"):
-        # Base64 encoded image
+        # Base64 encoded image with MIME type header
        # Supports: data:image/png;base64,... data:image/jpeg;base64,... etc.
        header, data = image_source.split(",", 1)
        image_data = base64.b64decode(data)
-        return Image.open(io.BytesIO(image_data)).convert("RGB")
+        image = Image.open(io.BytesIO(image_data)).convert("RGB")
        logger.debug(f"Decoded base64 image with header: {header}")
    elif image_source.startswith("http://") or image_source.startswith("https://"):
        # URL - fetch image
        import httpx
        response = httpx.get(image_source, timeout=30.0)
        response.raise_for_status()
-        return Image.open(io.BytesIO(response.content)).convert("RGB")
+        image = Image.open(io.BytesIO(response.content)).convert("RGB")
        logger.debug(f"Fetched image from URL: {image_source[:50]}...")
    else:
        # Assume it's a file path or raw base64
        try:
            image_data = base64.b64decode(image_source)
-            return Image.open(io.BytesIO(image_data)).convert("RGB")
+            image = Image.open(io.BytesIO(image_data)).convert("RGB")
            logger.debug("Decoded raw base64 image")
        except:
            # Try as file path
-            return Image.open(image_source).convert("RGB")
+            image = Image.open(image_source).convert("RGB")
            logger.debug(f"Loaded image from file: {image_source}")
    # Optimize resolution for best OCR results
    if optimize:
        image = optimize_image_resolution(image)
    return image
 def extract_image_and_text(content: Union[str, List[ContentItem]]) -> tuple:
@@ -242,6 +297,45 @@ async def health_check():
    )
@app.get("/formats")
 async def supported_formats():
    """List supported image formats and input methods"""
    return {
        "image_formats": {
            "supported": ["PNG", "JPEG", "WebP", "BMP", "GIF", "TIFF"],
            "recommended": ["PNG", "JPEG"],
            "mime_types": [
                "image/png",
                "image/jpeg",
                "image/webp",
                "image/bmp",
                "image/gif",
                "image/tiff"
            ]
        },
        "input_methods": {
            "base64_data_url": {
                "description": "Base64 encoded image with MIME type header",
                "example": "data:image/png;base64,iVBORw0KGgo..."
            },
            "http_url": {
                "description": "Direct HTTP/HTTPS URL to image",
                "example": "https://example.com/image.png"
            },
            "raw_base64": {
                "description": "Raw base64 string without header",
                "example": "iVBORw0KGgo..."
            }
        },
        "resolution": {
            "optimal_range": "1080p to 2K (1080-2048 pixels on longest side)",
            "auto_scaling": True,
            "note": "Images are automatically scaled to optimal range. 4K+ images are scaled down for better accuracy."
        },
        "task_prompts": TASK_PROMPTS
    }
@app.get("/v1/models")
 async def list_models():
    """List available models (OpenAI-compatible)"""
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
  "name": "@host.today/ht-docker-ai",
-  "version": "1.5.0",
+  "version": "1.6.0",
  "type": "module",
  "private": false,
  "description": "Docker images for AI vision-language models including MiniCPM-V 4.5",
--- a/test/helpers/docker.ts
+++ b/test/helpers/docker.ts
@@ -0,0 +1,297 @@
 import { execSync } from 'child_process';
 // Project container names (only manage these)
 // stopConflictingContainers() iterates this list, so containers that are not
 // named here are never stopped by these helpers.
 const PROJECT_CONTAINERS = [
  'paddleocr-vl-test',
  'paddleocr-vl-gpu-test',
  'paddleocr-vl-cpu-test',
  'paddleocr-vl-full-test',
  'minicpm-test',
 ];
 // Image configurations
 // Describes how to build one Docker image and run/health-check its container.
 export interface IImageConfig {
  // Image tag: passed to `docker build -t` and used as the image for `docker run`
  name: string;
  // Dockerfile path passed to `docker build -f`
  dockerfile: string;
  // Build context directory passed to `docker build`
  buildContext: string;
  // Value of `docker run --name`; also used to detect/remove an existing container
  containerName: string;
  // Port mappings, each passed as `-p <entry>`; the first entry is used for conflict checks
  ports: string[];
  // Optional volume mappings, each passed as `-v <entry>`
  volumes?: string[];
  // When true the container is started with `--gpus all`
  gpus?: boolean;
  // URL polled by waitForHealth(); when omitted no health wait is performed
  healthEndpoint?: string;
  // Maximum time to wait for the health endpoint, in milliseconds
  healthTimeout?: number;
 }
 // Known images and the settings used to build, start and health-check them.
 // NOTE(review): paddleocrVlGpu and paddleocrVlCpu share the container name
 // 'paddleocr-vl-test', so ensureService() reuses whichever one is already
 // running under that name - confirm this is intended.
 export const IMAGES = {
  paddleocrVlGpu: {
    name: 'paddleocr-vl-gpu',
    dockerfile: 'Dockerfile_paddleocr_vl_gpu',
    buildContext: '.',
    containerName: 'paddleocr-vl-test',
    ports: ['8000:8000'],
    volumes: ['ht-huggingface-cache:/root/.cache/huggingface'],
    gpus: true,
    healthEndpoint: 'http://localhost:8000/health',
    healthTimeout: 300000, // 5 minutes for model loading
  } as IImageConfig,
  paddleocrVlCpu: {
    name: 'paddleocr-vl-cpu',
    dockerfile: 'Dockerfile_paddleocr_vl_cpu',
    buildContext: '.',
    containerName: 'paddleocr-vl-test',
    ports: ['8000:8000'],
    volumes: ['ht-huggingface-cache:/root/.cache/huggingface'],
    gpus: false,
    healthEndpoint: 'http://localhost:8000/health',
    healthTimeout: 300000,
  } as IImageConfig,
  minicpm: {
    name: 'minicpm45v',
    dockerfile: 'Dockerfile_minicpm45v',
    buildContext: '.',
    containerName: 'minicpm-test',
    ports: ['11434:11434'],
    volumes: ['ht-ollama-models:/root/.ollama'],
    gpus: true,
    healthEndpoint: 'http://localhost:11434/api/tags',
    healthTimeout: 120000,
  } as IImageConfig,
  // Full PaddleOCR-VL pipeline with PP-DocLayoutV2 + structured JSON output
  paddleocrVlFull: {
    name: 'paddleocr-vl-full',
    dockerfile: 'Dockerfile_paddleocr_vl_full',
    buildContext: '.',
    containerName: 'paddleocr-vl-full-test',
    ports: ['8000:8000'],
    volumes: [
      'ht-huggingface-cache:/root/.cache/huggingface',
      'ht-paddleocr-cache:/root/.paddleocr',
    ],
    gpus: true,
    healthEndpoint: 'http://localhost:8000/health',
    healthTimeout: 600000, // 10 minutes for model loading (PaddleOCR-VL + layout detection)
  } as IImageConfig,
 };
 /**
 * Execute a shell command and return its captured stdout.
 *
 * @param command Shell command line to run.
 * @param silent When true, output is captured and a failing command yields ''
 *   instead of throwing. When false, output is streamed to the terminal and
 *   failures are re-thrown.
 * @returns Captured stdout, or '' when nothing was captured.
 */
 function exec(command: string, silent = false): string {
  try {
    const output = execSync(command, {
      encoding: 'utf-8',
      stdio: silent ? 'pipe' : 'inherit',
    });
    // With stdio 'inherit' nothing is captured and execSync yields null,
    // which would violate the declared string return type
    return output ?? '';
  } catch (err: unknown) {
    if (silent) return '';
    throw err;
  }
 }
 /**
 * Report whether `docker images -q` lists at least one image for the given name
 */
 export function imageExists(imageName: string): boolean {
  const imageIds = exec(`docker images -q ${imageName}`, true).trim();
  return imageIds.length > 0;
 }
 /**
 * Report whether a container with exactly this name is currently running
 */
 export function isContainerRunning(containerName: string): boolean {
  const listed = exec(`docker ps --filter "name=^${containerName}$" --format "{{.Names}}"`, true).trim();
  return listed === containerName;
 }
 /**
 * Report whether a container with exactly this name exists, running or stopped
 */
 export function containerExists(containerName: string): boolean {
  const listed = exec(`docker ps -a --filter "name=^${containerName}$" --format "{{.Names}}"`, true).trim();
  return listed === containerName;
 }
 /**
 * Force-remove a container (stopping it if needed); a no-op when it does not exist
 */
 export function removeContainer(containerName: string): void {
  if (!containerExists(containerName)) return;
  console.log(`[Docker] Removing container: ${containerName}`);
  exec(`docker rm -f ${containerName}`, true);
 }
 /**
 * Stop all project containers that conflict with the required one.
 *
 * A running project container conflicts when it publishes the same host port
 * as `requiredPort`. Containers not listed in PROJECT_CONTAINERS are ignored.
 *
 * @param requiredContainer Name of the container that must keep running.
 * @param requiredPort Port mapping in `host:container` form; only the host
 *   part is compared.
 */
 export function stopConflictingContainers(requiredContainer: string, requiredPort: string): void {
  const hostPort = requiredPort.split(':')[0];
  for (const container of PROJECT_CONTAINERS) {
    if (container === requiredContainer) continue;
    if (!isContainerRunning(container)) continue;
    const ports = exec(`docker port ${container} 2>/dev/null || true`, true);
    // Each line looks like "8000/tcp -> 0.0.0.0:8000". Compare only the
    // published host port (after the last ':') and compare it exactly: a
    // substring test would also match container ports, or "80" inside "8000".
    const publishedPorts = ports
      .split('\n')
      .map((line) => line.trim().match(/:(\d+)$/)?.[1])
      .filter((port): port is string => port !== undefined);
    if (publishedPorts.includes(hostPort)) {
      console.log(`[Docker] Stopping conflicting container: ${container}`);
      exec(`docker stop ${container}`, true);
    }
  }
 }
 /**
 * Build (and load) the Docker image described by the config, streaming build output
 */
 export function buildImage(config: IImageConfig): void {
  const { name, dockerfile, buildContext } = config;
  console.log(`[Docker] Building image: ${name}`);
  exec(`docker build --load -f ${dockerfile} -t ${name} ${buildContext}`);
 }
 /**
 * Start a fresh detached container for the config, replacing any existing
 * container with the same name
 */
 export function startContainer(config: IImageConfig): void {
  const { containerName, ports, volumes, gpus, name } = config;
  // Clear out a previous container so the name is free
  removeContainer(containerName);
  console.log(`[Docker] Starting container: ${containerName}`);
  const gpuArgs = gpus ? '--gpus all' : '';
  const portArgs = ports.map((p) => `-p ${p}`).join(' ');
  const volumeArgs = volumes?.map((v) => `-v ${v}`).join(' ') || '';
  exec(`docker run -d --name ${containerName} ${gpuArgs} ${portArgs} ${volumeArgs} ${name}`);
 }
 /**
 * Poll an HTTP endpoint until it answers with a successful status.
 * Resolves true on the first OK response, false once timeoutMs has elapsed.
 */
 export async function waitForHealth(
  endpoint: string,
  timeoutMs: number = 120000,
  intervalMs: number = 5000
 ): Promise<boolean> {
  const startedAt = Date.now();
  const elapsedMs = (): number => Date.now() - startedAt;
  // One probe: any failure (connection refused, timeout, ...) counts as "not ready"
  const probe = async (): Promise<boolean> => {
    try {
      const response = await fetch(endpoint, {
        method: 'GET',
        signal: AbortSignal.timeout(5000),
      });
      return response.ok;
    } catch {
      return false;
    }
  };
  console.log(`[Docker] Waiting for health: ${endpoint}`);
  while (elapsedMs() < timeoutMs) {
    if (await probe()) {
      console.log(`[Docker] Service healthy!`);
      return true;
    }
    const elapsed = Math.round(elapsedMs() / 1000);
    console.log(`[Docker] Waiting... (${elapsed}s)`);
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
  console.log(`[Docker] Health check timeout after ${timeoutMs / 1000}s`);
  return false;
 }
 /**
 * Bring a service up and wait until it is healthy:
 * 1. build the image when it is missing
 * 2. stop project containers publishing the same first port
 * 3. start the container unless it is already running
 * 4. poll the health endpoint, when one is configured
 * Resolves to the health result (true when no endpoint is configured).
 */
 export async function ensureService(config: IImageConfig): Promise<boolean> {
  const { name, containerName, ports, healthEndpoint, healthTimeout } = config;
  console.log(`\n[Docker] Ensuring service: ${name}`);
  if (!imageExists(name)) {
    console.log(`[Docker] Image not found, building...`);
    buildImage(config);
  }
  // The first port mapping is the one checked for conflicts
  stopConflictingContainers(containerName, ports[0]);
  if (isContainerRunning(containerName)) {
    console.log(`[Docker] Container already running: ${containerName}`);
  } else {
    startContainer(config);
  }
  if (!healthEndpoint) {
    return true;
  }
  return waitForHealth(healthEndpoint, healthTimeout);
 }
 /**
 * Bring up the PaddleOCR-VL GPU image and wait for it to become healthy
 */
 export async function ensurePaddleOcrVlGpu(): Promise<boolean> {
  const config = IMAGES.paddleocrVlGpu;
  return ensureService(config);
 }
 /**
 * Bring up the PaddleOCR-VL CPU image and wait for it to become healthy
 */
 export async function ensurePaddleOcrVlCpu(): Promise<boolean> {
  const config = IMAGES.paddleocrVlCpu;
  return ensureService(config);
 }
 /**
 * Bring up the MiniCPM image and wait for it to become healthy
 */
 export async function ensureMiniCpm(): Promise<boolean> {
  const config = IMAGES.minicpm;
  return ensureService(config);
 }
 /**
 * Report whether nvidia-smi lists at least one GPU
 */
 export function isGpuAvailable(): boolean {
  let gpuNames = '';
  try {
    gpuNames = exec('nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null', true);
  } catch {
    return false;
  }
  return gpuNames.trim().length > 0;
 }
 /**
 * Bring up PaddleOCR-VL, choosing the GPU image when a GPU is detected and
 * the CPU image otherwise
 */
 export async function ensurePaddleOcrVl(): Promise<boolean> {
  const hasGpu = isGpuAvailable();
  console.log(
    hasGpu ? '[Docker] GPU detected, using GPU image' : '[Docker] No GPU detected, using CPU image'
  );
  return hasGpu ? ensurePaddleOcrVlGpu() : ensurePaddleOcrVlCpu();
 }
 /**
 * Bring up the PaddleOCR-VL Full Pipeline service (PP-DocLayoutV2 + structured
 * JSON/Markdown output). Logs a warning when no GPU is detected but still
 * attempts to start the service.
 */
 export async function ensurePaddleOcrVlFull(): Promise<boolean> {
  const gpuMissing = !isGpuAvailable();
  if (gpuMissing) {
    console.log('[Docker] WARNING: Full pipeline requires GPU, but none detected');
  }
  return ensureService(IMAGES.paddleocrVlFull);
 }
--- a/test/test.bankstatements.combined.ts
+++ b/test/test.bankstatements.combined.ts
@@ -1,15 +1,23 @@
 /**
 * Bank statement extraction test using MiniCPM-V (visual) + PaddleOCR-VL (table recognition)
 *
 * This is the combined/dual-VLM approach that uses both models for consensus:
 *   - MiniCPM-V for visual extraction
 *   - PaddleOCR-VL for table recognition
 */
 import { tap, expect } from '@git.zone/tstest/tapbundle';
 import * as fs from 'fs';
 import * as path from 'path';
 import { execSync } from 'child_process';
 import * as os from 'os';
 import { ensurePaddleOcrVl, ensureMiniCpm } from './helpers/docker.js';
 // Service URLs
 const OLLAMA_URL = 'http://localhost:11434';
 const PADDLEOCR_VL_URL = 'http://localhost:8000';
 // Models
-const MINICPM_MODEL = 'openbmb/minicpm-v4.5:q8_0';
+const MINICPM_MODEL = 'minicpm-v:latest';
 const PADDLEOCR_VL_MODEL = 'paddleocr-vl';
 // Prompt for MiniCPM-V visual extraction
@@ -477,11 +485,18 @@ function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: strin
 // Tests
-tap.test('should connect to Ollama API', async () => {
+tap.test('setup: ensure Docker containers are running', async () => {
-  const response = await fetch(`${OLLAMA_URL}/api/tags`);
+  console.log('\n[Setup] Checking Docker containers...\n');
-  expect(response.ok).toBeTrue();
+
-  const data = await response.json();
+  // Ensure PaddleOCR-VL is running (auto-detects GPU/CPU)
-  expect(data.models).toBeArray();
+  const paddleOk = await ensurePaddleOcrVl();
  expect(paddleOk).toBeTrue();
  // Ensure MiniCPM is running
  const minicpmOk = await ensureMiniCpm();
  expect(minicpmOk).toBeTrue();
  console.log('\n[Setup] All containers ready!\n');
 });
 tap.test('should have MiniCPM-V 4.5 model loaded', async () => {
@@ -494,8 +509,7 @@ tap.test('should have MiniCPM-V 4.5 model loaded', async () => {
 tap.test('should check PaddleOCR-VL availability', async () => {
  const available = await isPaddleOCRVLAvailable();
  console.log(`PaddleOCR-VL available: ${available}`);
-  // This test passes regardless - PaddleOCR-VL is optional
+  expect(available).toBeTrue();
  expect(true).toBeTrue();
 });
 // Dynamic test for each PDF/JSON pair
--- a/test/test.bankstatements.minicpm.ts
+++ b/test/test.bankstatements.minicpm.ts
@@ -0,0 +1,334 @@
 /**
 * Bank statement extraction test using MiniCPM-V only (visual extraction)
 *
 * This tests MiniCPM-V's ability to extract bank transactions directly from images
 * without any OCR augmentation.
 */
 import { tap, expect } from '@git.zone/tstest/tapbundle';
 import * as fs from 'fs';
 import * as path from 'path';
 import { execSync } from 'child_process';
 import * as os from 'os';
 import { ensureMiniCpm } from './helpers/docker.js';
 // Service URL
 // Base address of the Ollama HTTP API used by every request in this file.
 const OLLAMA_URL = 'http://localhost:11434';
 // Model
 // Name sent in the `model` field of each generate request.
 const MINICPM_MODEL = 'minicpm-v:latest';
 // Prompt for MiniCPM-V visual extraction
 // Sent verbatim as the request prompt: it spells out the sign and European
 // number conventions and asks for a bare JSON array with one object per row.
 const MINICPM_EXTRACT_PROMPT = `/nothink
 You are a bank statement parser. Extract EVERY transaction from the table.
 Read the Amount column carefully:
 - "- 21,47 €" means DEBIT, output as: -21.47
 - "+ 1.000,00 €" means CREDIT, output as: 1000.00
 - European format: comma = decimal point
 For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
 Do not skip any rows. Return ONLY the JSON array, no explanation.`;
 /**
 * One bank-statement row, as requested from the model and as stored in the
 * expected-result JSON fixtures.
 */
 interface ITransaction {
  // Transaction date; the extraction prompt asks for YYYY-MM-DD.
  date: string;
  // Name of the other party of the transaction.
  counterparty: string;
  // Signed amount; the prompt asks for debits as negative and credits as positive numbers.
  amount: number;
 }
 /**
 * Convert PDF to PNG images using ImageMagick
 *
 * The `convert` CLI renders `page-<n>.png` files into a fresh temporary
 * directory; the directory is always removed again, on success and on error.
 *
 * @param pdfPath Path of the PDF to rasterise (interpolated into a shell command).
 * @returns One base64-encoded PNG per page, ordered by page number.
 * @throws If the `convert` command or any file-system call fails.
 */
 function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');
  // Numeric page index taken from "page-<n>.png"; 0 when the name has no digits.
  const pageIndex = (name: string): number => Number(/(\d+)\.png$/.exec(name)?.[1] ?? 0);
  try {
    execSync(
      `convert -density 300 -quality 100 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
      { stdio: 'pipe' }
    );
    // Order by page number: the default lexicographic sort() would put
    // page-10.png ahead of page-2.png once a document has more than ten pages.
    const files = fs
      .readdirSync(tempDir)
      .filter((f: string) => f.endsWith('.png'))
      .sort((a: string, b: string) => pageIndex(a) - pageIndex(b) || a.localeCompare(b));
    const images: string[] = [];
    for (const file of files) {
      const imagePath = path.join(tempDir, file);
      const imageData = fs.readFileSync(imagePath);
      images.push(imageData.toString('base64'));
    }
    return images;
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
 }
 /**
 * Extract using MiniCPM-V via Ollama
 *
 * Posts the page images together with MINICPM_EXTRACT_PROMPT to the generate
 * endpoint, reads the streamed newline-delimited JSON reply, echoes the model
 * output to the console line by line, and parses the JSON array contained in
 * the accumulated text.
 *
 * @param images Base64-encoded page images.
 * @param passLabel Label used in the progress log line.
 * @returns The parsed content of the JSON array found in the model output.
 * @throws If the HTTP call fails, the body is missing, no `[...]` span is
 *         found, or that span is not valid JSON.
 */
 async function extractWithMiniCPM(images: string[], passLabel: string): Promise<ITransaction[]> {
  const payload = {
    model: MINICPM_MODEL,
    prompt: MINICPM_EXTRACT_PROMPT,
    images,
    stream: true,
    options: {
      num_predict: 16384,
      temperature: 0.1,
    },
  };
  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });
  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }
  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('No response body');
  }
  const decoder = new TextDecoder();
  let fullText = '';
  let lineBuffer = '';
  // Unterminated tail of the stream carried over between reads. Network chunks
  // do not respect line boundaries, so a JSON line may arrive in two pieces.
  let pending = '';
  // Parse one complete NDJSON line and fold its `response` text into the output.
  const consumeLine = (line: string): void => {
    if (!line.trim()) {
      return;
    }
    try {
      const json = JSON.parse(line);
      if (json.response) {
        fullText += json.response;
        lineBuffer += json.response;
        // Echo model output only once a full line of it is available
        if (lineBuffer.includes('\n')) {
          const parts = lineBuffer.split('\n');
          for (let i = 0; i < parts.length - 1; i++) {
            console.log(parts[i]);
          }
          lineBuffer = parts[parts.length - 1];
        }
      }
    } catch {
      // Skip invalid JSON lines
    }
  };
  console.log(`[${passLabel}] Extracting with MiniCPM-V...`);
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    pending += decoder.decode(value, { stream: true });
    const lines = pending.split('\n');
    // The last piece has no terminating newline yet; keep it for the next read
    pending = lines.pop() ?? '';
    lines.forEach((line) => consumeLine(line));
  }
  // Flush bytes still buffered in the decoder and a final line without newline
  pending += decoder.decode();
  consumeLine(pending);
  if (lineBuffer) {
    console.log(lineBuffer);
  }
  console.log('');
  const startIdx = fullText.indexOf('[');
  const endIdx = fullText.lastIndexOf(']') + 1;
  if (startIdx < 0 || endIdx <= startIdx) {
    throw new Error('No JSON array found in response');
  }
  return JSON.parse(fullText.substring(startIdx, endIdx));
 }
 /**
 * Build an order-independent fingerprint of a transaction list.
 *
 * Each transaction contributes "<date>|<amount with two decimals>"; the
 * entries are sorted and joined with ';'. The counterparty is not part of it.
 */
 function hashTransactions(transactions: ITransaction[]): string {
  const keys: string[] = [];
  for (const entry of transactions) {
    keys.push(`${entry.date}|${entry.amount.toFixed(2)}`);
  }
  keys.sort();
  return keys.join(';');
 }
 /**
 * Extract with consensus voting using MiniCPM-V only
 *
 * Runs up to `maxPasses` extraction passes and returns as soon as the
 * fingerprint (see hashTransactions) of a pass has been seen twice. A pass
 * that throws is logged and skipped. If no fingerprint repeats, the most
 * frequent result is returned (the earliest one on a tie).
 *
 * @param images Base64-encoded page images handed to every pass.
 * @param maxPasses Upper bound on extraction attempts.
 * @returns The agreed (or most common) transaction list.
 * @throws If every pass failed, so there is no result to return.
 */
 async function extractWithConsensus(
  images: string[],
  maxPasses: number = 5
 ): Promise<ITransaction[]> {
  const results: Array<{ transactions: ITransaction[]; hash: string }> = [];
  const hashCounts: Map<string, number> = new Map();
  // Record one pass and return how often its fingerprint has been seen so far
  const addResult = (transactions: ITransaction[], passLabel: string): number => {
    const hash = hashTransactions(transactions);
    results.push({ transactions, hash });
    hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1);
    console.log(
      `[${passLabel}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)`
    );
    return hashCounts.get(hash)!;
  };
  console.log('[Setup] Using MiniCPM-V only');
  for (let pass = 1; pass <= maxPasses; pass++) {
    try {
      const transactions = await extractWithMiniCPM(images, `Pass ${pass} MiniCPM-V`);
      const count = addResult(transactions, `Pass ${pass} MiniCPM-V`);
      if (count >= 2) {
        console.log(`[Consensus] Reached after ${pass} passes`);
        return transactions;
      }
      console.log(`[Pass ${pass}] No consensus yet, trying again...`);
    } catch (err) {
      console.log(`[Pass ${pass}] Error: ${err}`);
    }
  }
  // No consensus reached - return the most common result
  let bestHash = '';
  let bestCount = 0;
  for (const [hash, count] of hashCounts) {
    if (count > bestCount) {
      bestCount = count;
      bestHash = hash;
    }
  }
  // Test the count, not the hash: an empty transaction list hashes to '' and
  // is still a valid result, so a falsy hash must not be read as "no results".
  if (bestCount === 0) {
    throw new Error('No valid results obtained');
  }
  const best = results.find((r) => r.hash === bestHash)!;
  console.log(`[No consensus] Using most common result (${bestCount}/${maxPasses} passes)`);
  return best.transactions;
 }
 /**
 * Score an extracted transaction list against the expected one, position by position.
 *
 * Row i matches when the dates are equal and the amounts differ by less than
 * 0.01. Missing rows, mismatching rows and surplus extracted rows are each
 * reported as a human-readable entry in `errors`.
 *
 * @returns Number of matching rows, number of expected rows, and the error list.
 */
 function compareTransactions(
  extracted: ITransaction[],
  expected: ITransaction[]
 ): { matches: number; total: number; errors: string[] } {
  const errors: string[] = [];
  let matches = 0;
  for (const [index, want] of expected.entries()) {
    const got = extracted[index];
    if (!got) {
      errors.push(`Missing transaction ${index}: ${want.date} ${want.counterparty}`);
      continue;
    }
    const sameDate = got.date === want.date;
    const sameAmount = Math.abs(got.amount - want.amount) < 0.01;
    if (!(sameDate && sameAmount)) {
      errors.push(
        `Mismatch at ${index}: expected ${want.date}/${want.amount}, got ${got.date}/${got.amount}`
      );
      continue;
    }
    matches += 1;
  }
  const surplus = extracted.length - expected.length;
  if (surplus > 0) {
    errors.push(`Extra transactions: ${surplus}`);
  }
  return { matches, total: expected.length, errors };
 }
 /**
 * Find all test cases (PDF + JSON pairs) in .nogit/
 *
 * A PDF becomes a test case only when a JSON file with the same base name
 * sits next to it in `<cwd>/.nogit`.
 *
 * @returns One entry per pair with the base name and both absolute paths;
 *          an empty list when the directory does not exist.
 */
 function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const testDir = path.join(process.cwd(), '.nogit');
  if (!fs.existsSync(testDir)) {
    return [];
  }
  const files = fs.readdirSync(testDir);
  const pdfFiles = files.filter((f: string) => f.endsWith('.pdf'));
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];
  for (const pdf of pdfFiles) {
    // Drop only the trailing extension; replace('.pdf', '') would remove the
    // first ".pdf" occurrence, which is wrong for names like "a.pdf.v2.pdf".
    const baseName = pdf.slice(0, -'.pdf'.length);
    const jsonFile = `${baseName}.json`;
    if (files.includes(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }
  return testCases;
 }
 // Tests
 tap.test('setup: ensure Docker containers are running', async () => {
  console.log('\n[Setup] Checking Docker containers...\n');
  // MiniCPM has to be up before any extraction test can run
  expect(await ensureMiniCpm()).toBeTrue();
  console.log('\n[Setup] All containers ready!\n');
 });
 tap.test('should have MiniCPM-V 4.5 model loaded', async () => {
  const response = await fetch(`${OLLAMA_URL}/api/tags`);
  expect(response.ok).toBeTrue();
  const data = await response.json();
  const modelNames: string[] = (data.models ?? []).map((m: { name: string }) => m.name);
  // Check for the model the extraction requests actually use (MINICPM_MODEL)
  // instead of a hard-coded "minicpm-v4.5" substring that the configured
  // name does not contain. An untagged name is compared with ":latest" added.
  const wanted = MINICPM_MODEL.includes(':') ? MINICPM_MODEL : `${MINICPM_MODEL}:latest`;
  expect(modelNames.includes(wanted)).toBeTrue();
 });
 // Register one extraction test per discovered PDF/JSON fixture pair
 const testCases = findTestCases();
 console.log(`\nFound ${testCases.length} bank statement test cases (MiniCPM-V only)\n`);
 testCases.forEach((testCase) => {
  tap.test(`should extract transactions from ${testCase.name}`, async () => {
    // The expected rows live next to the PDF as JSON
    const fixtureJson = fs.readFileSync(testCase.jsonPath, 'utf-8');
    const expected: ITransaction[] = JSON.parse(fixtureJson);
    console.log(`\n=== ${testCase.name} ===`);
    console.log(`Expected: ${expected.length} transactions`);
    // Rasterise the statement
    console.log('Converting PDF to images...');
    const images = convertPdfToImages(testCase.pdfPath);
    console.log(`Converted: ${images.length} pages\n`);
    // Repeat MiniCPM-V passes until two of them agree
    const extracted = await extractWithConsensus(images);
    console.log(`\nFinal: ${extracted.length} transactions`);
    // Score against the fixture
    const { matches, total, errors } = compareTransactions(extracted, expected);
    console.log(`Accuracy: ${matches}/${total}`);
    if (errors.length > 0) {
      console.log('Errors:');
      for (const message of errors) {
        console.log(`  - ${message}`);
      }
    }
    // More than 95% of the expected rows must match and the row count must be exact
    expect(matches / total).toBeGreaterThan(0.95);
    expect(extracted.length).toEqual(expected.length);
  });
 });
 export default tap.start();
--- a/test/test.bankstatements.paddleocr-vl.ts
+++ b/test/test.bankstatements.paddleocr-vl.ts
@@ -0,0 +1,346 @@
 /**
 * Bank statement extraction test using PaddleOCR-VL Full Pipeline
 *
 * This tests the complete PaddleOCR-VL pipeline for bank statements:
 *   1. PP-DocLayoutV2 for layout detection
 *   2. PaddleOCR-VL for recognition (tables with proper structure)
 *   3. Structured Markdown output with tables
 *   4. MiniCPM extracts transactions from structured tables
 *
 * The structured Markdown has properly formatted tables,
 * making it much easier for MiniCPM to extract transaction data.
 */
 import { tap, expect } from '@git.zone/tstest/tapbundle';
 import * as fs from 'fs';
 import * as path from 'path';
 import { execSync } from 'child_process';
 import * as os from 'os';
 import { ensurePaddleOcrVlFull, ensureMiniCpm } from './helpers/docker.js';
 // Base address of the PaddleOCR-VL service; this file posts to its /parse endpoint.
 const PADDLEOCR_VL_URL = 'http://localhost:8000';
 // Base address of the Ollama HTTP API.
 const OLLAMA_URL = 'http://localhost:11434';
 // Model name sent in the `model` field of each Ollama generate request.
 const MINICPM_MODEL = 'minicpm-v:latest';
 /**
 * One bank-statement row, as requested from the model and as stored in the
 * expected-result JSON fixtures.
 */
 interface ITransaction {
  // Transaction date; the extraction prompt asks for YYYY-MM-DD.
  date: string;
  // Name of the other party of the transaction.
  counterparty: string;
  // Signed amount; the prompt asks for debits as negative and credits as positive numbers.
  amount: number;
 }
 /**
 * Convert PDF to PNG images using ImageMagick
 *
 * The `convert` CLI renders `page-<n>.png` files into a fresh temporary
 * directory; the directory is always removed again, on success and on error.
 *
 * @param pdfPath Path of the PDF to rasterise (interpolated into a shell command).
 * @returns One base64-encoded PNG per page, ordered by page number.
 * @throws If the `convert` command or any file-system call fails.
 */
 function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');
  // Numeric page index taken from "page-<n>.png"; 0 when the name has no digits.
  const pageIndex = (name: string): number => Number(/(\d+)\.png$/.exec(name)?.[1] ?? 0);
  try {
    execSync(
      `convert -density 300 -quality 100 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
      { stdio: 'pipe' }
    );
    // Order by page number: the default lexicographic sort() would put
    // page-10.png ahead of page-2.png once a document has more than ten pages.
    const files = fs
      .readdirSync(tempDir)
      .filter((f: string) => f.endsWith('.png'))
      .sort((a: string, b: string) => pageIndex(a) - pageIndex(b) || a.localeCompare(b));
    const images: string[] = [];
    for (const file of files) {
      const imagePath = path.join(tempDir, file);
      const imageData = fs.readFileSync(imagePath);
      images.push(imageData.toString('base64'));
    }
    return images;
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
 }
 /**
 * Send one page image to the PaddleOCR-VL Full Pipeline and return its Markdown.
 *
 * @param imageBase64 Base64-encoded page image.
 * @returns The `result.markdown` field of the reply, or '' when it is absent or empty.
 * @throws If the HTTP status is not OK or the reply's `success` flag is falsy.
 */
 async function parseDocument(imageBase64: string): Promise<string> {
  const requestBody = JSON.stringify({
    image: imageBase64,
    output_format: 'markdown',
  });
  const response = await fetch(`${PADDLEOCR_VL_URL}/parse`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: requestBody,
  });
  if (!response.ok) {
    const text = await response.text();
    throw new Error(`PaddleOCR-VL API error: ${response.status} - ${text}`);
  }
  const data = await response.json();
  if (!data.success) {
    throw new Error(`PaddleOCR-VL error: ${data.error}`);
  }
  const markdown = data.result?.markdown;
  return markdown || '';
 }
 /**
 * Extract transactions from structured Markdown using MiniCPM
 *
 * Embeds the Markdown in an extraction prompt, posts it to the Ollama
 * generate endpoint, collects the streamed newline-delimited JSON reply and
 * parses the JSON array contained in the accumulated text.
 *
 * @param markdown Markdown text of one parsed page.
 * @returns The parsed content of the JSON array found in the model output.
 * @throws If the HTTP call fails, the body is missing, no `[...]` span is
 *         found, or that span is not valid JSON.
 */
 async function extractTransactionsFromMarkdown(markdown: string): Promise<ITransaction[]> {
  console.log(`    [Extract] Processing ${markdown.length} chars of Markdown`);
  const prompt = `/nothink
 Convert this bank statement to a JSON array of transactions.
 Read the Amount values carefully:
 - "- 21,47 €" means DEBIT, output as: -21.47
 - "+ 1.000,00 €" means CREDIT, output as: 1000.00
 - European format: comma = decimal point, dot = thousands
 For each transaction output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
 Return ONLY the JSON array, no explanation.
 Document:
 ${markdown}`;
  const payload = {
    model: MINICPM_MODEL,
    prompt,
    stream: true,
    options: {
      num_predict: 16384,
      temperature: 0.1,
    },
  };
  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });
  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }
  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('No response body');
  }
  const decoder = new TextDecoder();
  let fullText = '';
  // Unterminated tail of the stream carried over between reads. Network chunks
  // do not respect line boundaries, so a JSON line may arrive in two pieces.
  let pending = '';
  // Parse one complete NDJSON line and append its `response` text.
  const consumeLine = (line: string): void => {
    if (!line.trim()) {
      return;
    }
    try {
      const json = JSON.parse(line);
      if (json.response) {
        fullText += json.response;
      }
    } catch {
      // Skip invalid JSON lines
    }
  };
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    pending += decoder.decode(value, { stream: true });
    const lines = pending.split('\n');
    // The last piece has no terminating newline yet; keep it for the next read
    pending = lines.pop() ?? '';
    lines.forEach((line) => consumeLine(line));
  }
  // Flush bytes still buffered in the decoder and a final line without newline
  pending += decoder.decode();
  consumeLine(pending);
  // Extract JSON array from response
  const startIdx = fullText.indexOf('[');
  const endIdx = fullText.lastIndexOf(']') + 1;
  if (startIdx < 0 || endIdx <= startIdx) {
    throw new Error(`No JSON array found in response: ${fullText.substring(0, 200)}`);
  }
  const jsonStr = fullText.substring(startIdx, endIdx);
  return JSON.parse(jsonStr);
 }
 /**
 * Run the parse-then-extract pipeline over every page and concatenate the rows.
 *
 * A page whose transaction extraction throws is logged and contributes no
 * rows; a failure of parseDocument is not caught here and propagates.
 *
 * @param images Base64-encoded page images, processed in order.
 * @returns All transactions of all pages, in page order.
 */
 async function extractAllTransactions(images: string[]): Promise<ITransaction[]> {
  const collected: ITransaction[] = [];
  for (const [pageIndex, image] of images.entries()) {
    console.log(`  Processing page ${pageIndex + 1}/${images.length}...`);
    // Layout detection + recognition -> structured Markdown
    const markdown = await parseDocument(image);
    const lineCount = markdown.split('\n').length;
    console.log(`    [Parse] Got ${lineCount} lines of Markdown`);
    // Markdown -> transaction rows
    try {
      const pageTransactions = await extractTransactionsFromMarkdown(markdown);
      console.log(`    [Extracted] ${pageTransactions.length} transactions`);
      collected.push(...pageTransactions);
    } catch (err) {
      console.log(`    [Error] ${err}`);
    }
  }
  return collected;
 }
 /**
 * Look up the first expected transaction that corresponds to `tx`.
 *
 * A candidate corresponds when the dates are equal, the amounts differ by
 * less than 0.02, and one lower-cased counterparty contains the first ten
 * characters of the other.
 *
 * @returns The matching expected entry, or undefined when none qualifies.
 */
 function findMatchingTransaction(
  tx: ITransaction,
  expectedList: ITransaction[]
 ): ITransaction | undefined {
  for (const candidate of expectedList) {
    const sameDate = tx.date === candidate.date;
    const closeAmount = Math.abs(tx.amount - candidate.amount) < 0.02;
    const txName = tx.counterparty?.toLowerCase();
    const candidateName = candidate.counterparty?.toLowerCase();
    const txContainsCandidate = txName?.includes(candidateName?.slice(0, 10));
    const candidateContainsTx = candidateName?.includes(txName?.slice(0, 10));
    if (sameDate && closeAmount && (txContainsCandidate || candidateContainsTx)) {
      return candidate;
    }
  }
  return undefined;
 }
 /**
 * Measure how many expected transactions were found, ignoring order.
 *
 * Each extracted row claims at most one not-yet-claimed expected row with the
 * same date and an amount within 0.02; the counterparty is not compared.
 *
 * @returns Matched count, expected count, and the percentage matched
 *          (0 when nothing is expected).
 */
 function calculateAccuracy(
  extracted: ITransaction[],
  expected: ITransaction[]
 ): { matched: number; total: number; accuracy: number } {
  // Indices of expected rows that already have a partner
  const claimed = new Set<number>();
  for (const tx of extracted) {
    const hit = expected.findIndex(
      (exp, idx) =>
        !claimed.has(idx) && tx.date === exp.date && Math.abs(tx.amount - exp.amount) < 0.02
    );
    if (hit >= 0) {
      claimed.add(hit);
    }
  }
  const matched = claimed.size;
  const total = expected.length;
  let accuracy = 0;
  if (total > 0) {
    accuracy = (matched / total) * 100;
  }
  return { matched, total, accuracy };
 }
 /**
 * Find all test cases (PDF + JSON pairs) in .nogit/bankstatements/
 *
 * A PDF becomes a test case only when a JSON file with the same base name
 * sits next to it in `<cwd>/.nogit/bankstatements`.
 *
 * @returns One entry per pair with the base name and both absolute paths,
 *          sorted by name; an empty list when the directory does not exist.
 */
 function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const testDir = path.join(process.cwd(), '.nogit/bankstatements');
  if (!fs.existsSync(testDir)) {
    return [];
  }
  const files = fs.readdirSync(testDir);
  const pdfFiles = files.filter((f) => f.endsWith('.pdf'));
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];
  for (const pdf of pdfFiles) {
    // Drop only the trailing extension; replace('.pdf', '') would remove the
    // first ".pdf" occurrence, which is wrong for names like "a.pdf.v2.pdf".
    const baseName = pdf.slice(0, -'.pdf'.length);
    const jsonFile = `${baseName}.json`;
    if (files.includes(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }
  testCases.sort((a, b) => a.name.localeCompare(b.name));
  return testCases;
 }
 // Tests
 tap.test('setup: ensure Docker containers are running', async () => {
  console.log('\n[Setup] Checking Docker containers...\n');
  // The Full Pipeline service turns page images into Markdown
  expect(await ensurePaddleOcrVlFull()).toBeTrue();
  // MiniCPM then extracts the fields from that Markdown
  expect(await ensureMiniCpm()).toBeTrue();
  console.log('\n[Setup] All containers ready!\n');
 });
 // Register one extraction test per discovered PDF/JSON fixture pair
 const testCases = findTestCases();
 console.log(`\nFound ${testCases.length} bank statement test cases (PaddleOCR-VL Full Pipeline)\n`);
 // Per-statement scores, collected for the summary test
 const results: Array<{ name: string; accuracy: number; matched: number; total: number }> = [];
 testCases.forEach((testCase) => {
  tap.test(`should extract bank statement: ${testCase.name}`, async () => {
    // The expected rows live next to the PDF as JSON
    const fixtureJson = fs.readFileSync(testCase.jsonPath, 'utf-8');
    const expected: ITransaction[] = JSON.parse(fixtureJson);
    console.log(`\n=== ${testCase.name} ===`);
    console.log(`Expected: ${expected.length} transactions`);
    const startTime = Date.now();
    // Rasterise the statement
    const images = convertPdfToImages(testCase.pdfPath);
    console.log(`  Pages: ${images.length}`);
    // Parse and extract page by page
    const extracted = await extractAllTransactions(images);
    const elapsedMs = Date.now() - startTime;
    // Score against the fixture and remember the outcome
    const { matched, total, accuracy } = calculateAccuracy(extracted, expected);
    results.push({ name: testCase.name, accuracy, matched, total });
    console.log(`  Extracted: ${extracted.length} transactions`);
    console.log(`  Matched: ${matched}/${total} (${accuracy.toFixed(1)}%)`);
    console.log(`  Time: ${(elapsedMs / 1000).toFixed(1)}s`);
    // The matched share must be strictly above 50 percent
    expect(accuracy).toBeGreaterThan(50);
  });
 });
 tap.test('summary', async () => {
  // Fold the per-statement scores into overall totals
  let accuracySum = 0;
  let totalMatched = 0;
  let totalExpected = 0;
  for (const entry of results) {
    accuracySum += entry.accuracy;
    totalMatched += entry.matched;
    totalExpected += entry.total;
  }
  const totalStatements = results.length;
  const avgAccuracy = totalStatements > 0 ? accuracySum / totalStatements : 0;
  console.log(`\n======================================================`);
  console.log(`  Bank Statement Extraction Summary (PaddleOCR-VL Full)`);
  console.log(`======================================================`);
  console.log(`  Method:      PaddleOCR-VL Full Pipeline -> MiniCPM`);
  console.log(`  Statements:  ${totalStatements}`);
  console.log(`  Transactions: ${totalMatched}/${totalExpected} matched`);
  console.log(`  Avg accuracy: ${avgAccuracy.toFixed(1)}%`);
  console.log(`======================================================\n`);
 });
 export default tap.start();
--- a/test/test.invoices.combined.ts
+++ b/test/test.invoices.combined.ts
@@ -1,11 +1,19 @@
 /**
 * Invoice extraction test using MiniCPM-V (visual) + PaddleOCR-VL (OCR augmentation)
 *
 * This is the combined approach that uses both models for best accuracy:
 *   - MiniCPM-V for visual understanding
 *   - PaddleOCR-VL for OCR text to augment prompts
 */
 import { tap, expect } from '@git.zone/tstest/tapbundle';
 import * as fs from 'fs';
 import * as path from 'path';
 import { execSync } from 'child_process';
 import * as os from 'os';
 import { ensurePaddleOcrVl, ensureMiniCpm } from './helpers/docker.js';
 const OLLAMA_URL = 'http://localhost:11434';
-const MODEL = 'openbmb/minicpm-v4.5:q8_0';
+const MODEL = 'minicpm-v:latest';
 const PADDLEOCR_VL_URL = 'http://localhost:8000';
 interface IInvoice {
@@ -358,11 +366,18 @@ function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: strin
 // Tests
-tap.test('should connect to Ollama API', async () => {
+tap.test('setup: ensure Docker containers are running', async () => {
-  const response = await fetch(`${OLLAMA_URL}/api/tags`);
+  console.log('\n[Setup] Checking Docker containers...\n');
-  expect(response.ok).toBeTrue();
+
-  const data = await response.json();
+  // Ensure PaddleOCR-VL is running (auto-detects GPU/CPU)
-  expect(data.models).toBeArray();
+  const paddleOk = await ensurePaddleOcrVl();
  expect(paddleOk).toBeTrue();
  // Ensure MiniCPM is running
  const minicpmOk = await ensureMiniCpm();
  expect(minicpmOk).toBeTrue();
  console.log('\n[Setup] All containers ready!\n');
 });
 tap.test('should have MiniCPM-V 4.5 model loaded', async () => {
--- a/test/test.invoices.minicpm.ts
+++ b/test/test.invoices.minicpm.ts
@@ -0,0 +1,345 @@
 /**
 * Invoice extraction test using MiniCPM-V only (visual extraction)
 *
 * This tests MiniCPM-V's ability to extract invoice data directly from images
 * without any OCR augmentation.
 */
 import { tap, expect } from '@git.zone/tstest/tapbundle';
 import * as fs from 'fs';
 import * as path from 'path';
 import { execSync } from 'child_process';
 import * as os from 'os';
 import { ensureMiniCpm } from './helpers/docker.js';
 // Base address of the Ollama HTTP API.
 const OLLAMA_URL = 'http://localhost:11434';
 // Model name sent in the `model` field of each Ollama generate request.
 const MODEL = 'minicpm-v:latest';
 /**
 * Invoice fields requested from the model and stored in the expected-result
 * JSON fixtures.
 */
 interface IInvoice {
  // Invoice/receipt number.
  invoice_number: string;
  // Invoice date; the extraction prompt asks for YYYY-MM-DD.
  invoice_date: string;
  // Company that issued the invoice.
  vendor_name: string;
  // Currency code such as EUR or USD.
  currency: string;
  // Amount before tax.
  net_amount: number;
  // Tax/VAT amount.
  vat_amount: number;
  // Final amount due.
  total_amount: number;
 }
 /**
 * Produce the fixed instruction text for the image-only invoice extraction
 * (MiniCPM-V only, no OCR augmentation).
 *
 * @returns The prompt listing the seven fields and the required JSON shape.
 */
 function buildPrompt(): string {
  const prompt = `/nothink
 You are an invoice parser. Extract the following fields from this invoice:
 1. invoice_number: The invoice/receipt number
 2. invoice_date: Date in YYYY-MM-DD format
 3. vendor_name: Company that issued the invoice
 4. currency: EUR, USD, etc.
 5. net_amount: Amount before tax (if shown)
 6. vat_amount: Tax/VAT amount (if shown, 0 if reverse charge or no tax)
 7. total_amount: Final amount due
 Return ONLY valid JSON in this exact format:
 {"invoice_number":"XXX","invoice_date":"YYYY-MM-DD","vendor_name":"Company Name","currency":"EUR","net_amount":100.00,"vat_amount":19.00,"total_amount":119.00}
 If a field is not visible, use null for strings or 0 for numbers.
 No explanation, just the JSON object.`;
  return prompt;
 }
 /**
 * Convert PDF to PNG images using ImageMagick
 *
 * The `convert` CLI renders `page-<n>.png` files into a fresh temporary
 * directory; the directory is always removed again, on success and on error.
 *
 * @param pdfPath Path of the PDF to rasterise (interpolated into a shell command).
 * @returns One base64-encoded PNG per page, ordered by page number.
 * @throws If the `convert` command or any file-system call fails.
 */
 function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');
  // Numeric page index taken from "page-<n>.png"; 0 when the name has no digits.
  const pageIndex = (name: string): number => Number(/(\d+)\.png$/.exec(name)?.[1] ?? 0);
  try {
    execSync(
      `convert -density 200 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
      { stdio: 'pipe' }
    );
    // Order by page number: the default lexicographic sort() would put
    // page-10.png ahead of page-2.png once a document has more than ten pages.
    const files = fs
      .readdirSync(tempDir)
      .filter((f) => f.endsWith('.png'))
      .sort((a, b) => pageIndex(a) - pageIndex(b) || a.localeCompare(b));
    const images: string[] = [];
    for (const file of files) {
      const imagePath = path.join(tempDir, file);
      const imageData = fs.readFileSync(imagePath);
      images.push(imageData.toString('base64'));
    }
    return images;
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
 }
 /**
 * Single extraction pass with MiniCPM-V
 *
 * Posts the page images and the prompt from buildPrompt() to the Ollama
 * generate endpoint, collects the streamed newline-delimited JSON reply and
 * parses the JSON object contained in the accumulated text.
 *
 * @param images Base64-encoded page images.
 * @param passNum Pass number supplied by the caller; not used in the body.
 * @returns The parsed content of the JSON object found in the model output.
 * @throws If the HTTP call fails, the body is missing, no `{...}` span is
 *         found, or that span is not valid JSON.
 */
 async function extractOnce(images: string[], passNum: number): Promise<IInvoice> {
  const payload = {
    model: MODEL,
    prompt: buildPrompt(),
    images,
    stream: true,
    options: {
      num_predict: 2048,
      temperature: 0.1,
    },
  };
  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });
  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }
  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('No response body');
  }
  const decoder = new TextDecoder();
  let fullText = '';
  // Unterminated tail of the stream carried over between reads. Network chunks
  // do not respect line boundaries, so a JSON line may arrive in two pieces.
  let pending = '';
  // Parse one complete NDJSON line and append its `response` text.
  const consumeLine = (line: string): void => {
    if (!line.trim()) {
      return;
    }
    try {
      const json = JSON.parse(line);
      if (json.response) {
        fullText += json.response;
      }
    } catch {
      // Skip invalid JSON lines
    }
  };
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    pending += decoder.decode(value, { stream: true });
    const lines = pending.split('\n');
    // The last piece has no terminating newline yet; keep it for the next read
    pending = lines.pop() ?? '';
    lines.forEach((line) => consumeLine(line));
  }
  // Flush bytes still buffered in the decoder and a final line without newline
  pending += decoder.decode();
  consumeLine(pending);
  // Extract JSON from response
  const startIdx = fullText.indexOf('{');
  const endIdx = fullText.lastIndexOf('}') + 1;
  if (startIdx < 0 || endIdx <= startIdx) {
    throw new Error(`No JSON object found in response: ${fullText.substring(0, 200)}`);
  }
  const jsonStr = fullText.substring(startIdx, endIdx);
  return JSON.parse(jsonStr);
 }
 /**
 * Build the fingerprint used to decide whether two extraction passes agree.
 *
 * Only the invoice number, the invoice date and the total (two decimals)
 * take part, joined with '|'.
 */
 function hashInvoice(invoice: IInvoice): string {
  const { invoice_number: invoiceNumber, invoice_date: invoiceDate } = invoice;
  const totalKey = invoice.total_amount.toFixed(2);
  return `${invoiceNumber}|${invoiceDate}|${totalKey}`;
 }
 /**
 * Repeat MiniCPM-V extraction until two passes produce the same fingerprint.
 *
 * Runs at most `maxPasses` passes. A pass that throws is logged and skipped.
 * When no fingerprint repeats, the most frequent result wins, the earliest
 * one on a tie.
 *
 * @param images Base64-encoded page images handed to every pass.
 * @param invoiceName Name used in the error message when nothing was extracted.
 * @param maxPasses Upper bound on extraction attempts.
 * @returns The agreed (or most common) invoice.
 * @throws If no pass produced a result.
 */
 async function extractWithConsensus(images: string[], invoiceName: string, maxPasses: number = 5): Promise<IInvoice> {
  const results: Array<{ invoice: IInvoice; hash: string }> = [];
  const hashCounts: Map<string, number> = new Map();
  for (let pass = 1; pass <= maxPasses; pass += 1) {
    try {
      const invoice = await extractOnce(images, pass);
      // Tally this pass under its fingerprint
      const hash = hashInvoice(invoice);
      results.push({ invoice, hash });
      const seen = (hashCounts.get(hash) || 0) + 1;
      hashCounts.set(hash, seen);
      console.log(`  [Pass ${pass}] ${invoice.invoice_number} | ${invoice.invoice_date} | ${invoice.total_amount} ${invoice.currency}`);
      if (seen >= 2) {
        console.log(`  [Consensus] Reached after ${pass} passes`);
        return invoice;
      }
    } catch (err) {
      console.log(`  [Pass ${pass}] Error: ${err}`);
    }
  }
  // No fingerprint repeated: rank by frequency; the stable sort keeps the
  // earliest fingerprint first among equals
  const ranked = Array.from(hashCounts.entries()).sort((a, b) => b[1] - a[1]);
  if (ranked.length === 0) {
    throw new Error(`No valid results for ${invoiceName}`);
  }
  const [bestHash, bestCount] = ranked[0];
  const best = results.find((r) => r.hash === bestHash)!;
  console.log(`  [No consensus] Using most common result (${bestCount}/${maxPasses} passes)`);
  return best.invoice;
 }
 /**
 * Check the key fields of an extracted invoice against the expected values.
 *
 * Invoice numbers are compared without whitespace and case, dates exactly,
 * totals with a tolerance of 0.02, and currencies case-insensitively.
 *
 * @returns `match` is true when no field differs; `errors` describes each difference.
 */
 function compareInvoice(
  extracted: IInvoice,
  expected: IInvoice
 ): { match: boolean; errors: string[] } {
  const errors: string[] = [];
  // Strip whitespace and lower-case; a missing number normalises to ''
  const normalise = (value: string): string => value?.replace(/\s/g, '').toLowerCase() || '';
  if (normalise(extracted.invoice_number) !== normalise(expected.invoice_number)) {
    errors.push(`invoice_number: expected "${expected.invoice_number}", got "${extracted.invoice_number}"`);
  }
  if (extracted.invoice_date !== expected.invoice_date) {
    errors.push(`invoice_date: expected "${expected.invoice_date}", got "${extracted.invoice_date}"`);
  }
  const totalDelta = Math.abs(extracted.total_amount - expected.total_amount);
  if (totalDelta > 0.02) {
    errors.push(`total_amount: expected ${expected.total_amount}, got ${extracted.total_amount}`);
  }
  const gotCurrency = extracted.currency?.toUpperCase();
  const wantCurrency = expected.currency?.toUpperCase();
  if (gotCurrency !== wantCurrency) {
    errors.push(`currency: expected "${expected.currency}", got "${extracted.currency}"`);
  }
  return { match: errors.length === 0, errors };
 }
 /**
 * Find all test cases (PDF + JSON pairs) in .nogit/invoices/
 *
 * A PDF becomes a test case only when a JSON file with the same base name
 * sits next to it in `<cwd>/.nogit/invoices`.
 *
 * @returns One entry per pair with the base name and both absolute paths,
 *          sorted by name; an empty list when the directory does not exist.
 */
 function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const testDir = path.join(process.cwd(), '.nogit/invoices');
  if (!fs.existsSync(testDir)) {
    return [];
  }
  const files = fs.readdirSync(testDir);
  const pdfFiles = files.filter((f) => f.endsWith('.pdf'));
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];
  for (const pdf of pdfFiles) {
    // Drop only the trailing extension; replace('.pdf', '') would remove the
    // first ".pdf" occurrence, which is wrong for names like "a.pdf.v2.pdf".
    const baseName = pdf.slice(0, -'.pdf'.length);
    const jsonFile = `${baseName}.json`;
    if (files.includes(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }
  // Sort alphabetically
  testCases.sort((a, b) => a.name.localeCompare(b.name));
  return testCases;
 }
 // Tests
 tap.test('setup: ensure Docker containers are running', async () => {
  console.log('\n[Setup] Checking Docker containers...\n');
  // MiniCPM has to be up before any extraction test can run
  expect(await ensureMiniCpm()).toBeTrue();
  console.log('\n[Setup] All containers ready!\n');
 });
 tap.test('should have MiniCPM-V 4.5 model loaded', async () => {
  const response = await fetch(`${OLLAMA_URL}/api/tags`);
  expect(response.ok).toBeTrue();
  const data = await response.json();
  const modelNames: string[] = (data.models ?? []).map((m: { name: string }) => m.name);
  // Check for the model the extraction requests actually use (MODEL) instead
  // of a hard-coded "minicpm-v4.5" substring that the configured name does
  // not contain. An untagged name is compared with ":latest" added.
  const wanted = MODEL.includes(':') ? MODEL : `${MODEL}:latest`;
  expect(modelNames.includes(wanted)).toBeTrue();
 });
 // Register one extraction test per discovered PDF/JSON fixture pair
 const testCases = findTestCases();
 console.log(`\nFound ${testCases.length} invoice test cases (MiniCPM-V only)\n`);
 // Running tallies, read by the summary test
 let passedCount = 0;
 let failedCount = 0;
 const processingTimes: number[] = [];
 testCases.forEach((testCase) => {
  tap.test(`should extract invoice: ${testCase.name}`, async () => {
    // The expected fields live next to the PDF as JSON
    const fixtureJson = fs.readFileSync(testCase.jsonPath, 'utf-8');
    const expected: IInvoice = JSON.parse(fixtureJson);
    console.log(`\n=== ${testCase.name} ===`);
    console.log(`Expected: ${expected.invoice_number} | ${expected.invoice_date} | ${expected.total_amount} ${expected.currency}`);
    const startTime = Date.now();
    // Rasterise the invoice
    const images = convertPdfToImages(testCase.pdfPath);
    console.log(`  Pages: ${images.length}`);
    // Repeat MiniCPM-V passes until two of them agree
    const extracted = await extractWithConsensus(images, testCase.name);
    const elapsedMs = Date.now() - startTime;
    processingTimes.push(elapsedMs);
    const elapsedLabel = `${(elapsedMs / 1000).toFixed(1)}s`;
    // Score against the fixture
    const { match, errors } = compareInvoice(extracted, expected);
    if (!match) {
      failedCount++;
      console.log(`  Result: MISMATCH (${elapsedLabel})`);
      for (const message of errors) {
        console.log(`    - ${message}`);
      }
    } else {
      passedCount++;
      console.log(`  Result: MATCH (${elapsedLabel})`);
    }
    // Every compared field has to agree
    expect(match).toBeTrue();
  });
 });
 tap.test('summary', async () => {
  const totalInvoices = testCases.length;
  // Share of registered invoices whose test passed
  let accuracy = 0;
  if (totalInvoices > 0) {
    accuracy = (passedCount / totalInvoices) * 100;
  }
  // Timing figures, in milliseconds until printed
  let totalTimeMs = 0;
  for (const elapsed of processingTimes) {
    totalTimeMs += elapsed;
  }
  const measured = processingTimes.length;
  const avgTimeMs = measured > 0 ? totalTimeMs / measured : 0;
  console.log(`\n========================================`);
  console.log(`   Invoice Extraction Summary (MiniCPM)`);
  console.log(`========================================`);
  console.log(`  Passed:    ${passedCount}/${totalInvoices}`);
  console.log(`  Failed:    ${failedCount}/${totalInvoices}`);
  console.log(`  Accuracy:  ${accuracy.toFixed(1)}%`);
  console.log(`----------------------------------------`);
  console.log(`  Total time:   ${(totalTimeMs / 1000).toFixed(1)}s`);
  console.log(`  Avg per inv:  ${(avgTimeMs / 1000).toFixed(1)}s`);
  console.log(`========================================\n`);
 });
 export default tap.start();
--- a/test/test.invoices.paddleocr-vl.ts
+++ b/test/test.invoices.paddleocr-vl.ts
@@ -0,0 +1,393 @@
 /**
 * Invoice extraction test using PaddleOCR-VL Full Pipeline
 *
 * This tests the complete PaddleOCR-VL pipeline:
 *   1. PP-DocLayoutV2 for layout detection
 *   2. PaddleOCR-VL for recognition
 *   3. Structured Markdown output
 *   4. MiniCPM extracts invoice fields from the page images, with the structured Markdown supplied as reference text
 *
 * The structured Markdown has proper tables and formatting,
 * making it much easier for MiniCPM to extract invoice data.
 */
 import { tap, expect } from '@git.zone/tstest/tapbundle';
 import * as fs from 'fs';
 import * as path from 'path';
 import { execSync } from 'child_process';
 import * as os from 'os';
 import { ensurePaddleOcrVlFull, ensureMiniCpm } from './helpers/docker.js';
 // Base URL of the PaddleOCR-VL API server; this file POSTs to its /parse endpoint.
 const PADDLEOCR_VL_URL = 'http://localhost:8000';
 // Base URL of the Ollama server; this file POSTs to its /api/generate endpoint.
 const OLLAMA_URL = 'http://localhost:11434';
 // Model name sent to Ollama for invoice field extraction.
 const MINICPM_MODEL = 'minicpm-v:latest';
 // Invoice fields requested from the model; the expected-result JSON fixtures are parsed as this shape too.
 interface IInvoice {
  // Invoice/receipt number
  invoice_number: string;
  // Invoice date; the extraction prompt asks for YYYY-MM-DD
  invoice_date: string;
  // Company that issued the invoice
  vendor_name: string;
  // Currency code such as EUR or USD
  currency: string;
  // Amount before tax
  net_amount: number;
  // Tax/VAT amount (the prompt asks for 0 on reverse charge)
  vat_amount: number;
  // Final amount due
  total_amount: number;
 }
 /**
 * Convert a PDF into base64-encoded PNG images using ImageMagick.
 *
 * The `convert` CLI renders into a fresh temporary directory using the
 * output pattern `page-%d.png`; the directory is removed again in all cases.
 * The resulting files are returned in numeric page order.
 *
 * @param pdfPath Path of the PDF file to render.
 * @returns One base64 string per generated PNG file, ordered by page number.
 * @throws Whatever `execSync` throws when the `convert` command fails.
 */
 function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');
  // Numeric index embedded in a "page-<n>.png" file name (0 when absent)
  const pageIndex = (fileName: string): number => {
    const found = /(\d+)\.png$/.exec(fileName);
    return found ? Number(found[1]) : 0;
  };
  try {
    execSync(
      `convert -density 200 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
      { stdio: 'pipe' }
    );
    const files = fs
      .readdirSync(tempDir)
      .filter((f) => f.endsWith('.png'))
      // Sort by page number: a plain .sort() is lexicographic and would
      // place "page-10.png" before "page-2.png", scrambling long documents.
      .sort((a, b) => pageIndex(a) - pageIndex(b) || a.localeCompare(b));
    return files.map((file) => fs.readFileSync(path.join(tempDir, file)).toString('base64'));
  } finally {
    // Always clean up the rendered pages, even when conversion failed
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
 }
 /**
 * Send one base64 image to the PaddleOCR-VL `/parse` endpoint and return
 * the Markdown it produced.
 *
 * @param imageBase64 Base64-encoded image forwarded as the `image` field.
 * @returns `result.markdown` from the JSON reply, or '' when it is absent/empty.
 * @throws Error when the HTTP status is not OK or the reply's `success` flag is falsy.
 */
 async function parseDocument(imageBase64: string): Promise<string> {
  const requestBody = JSON.stringify({
    image: imageBase64,
    output_format: 'markdown',
  });
  const reply = await fetch(`${PADDLEOCR_VL_URL}/parse`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: requestBody,
  });
  if (!reply.ok) {
    // Include the raw response body to make server-side failures diagnosable
    throw new Error(`PaddleOCR-VL API error: ${reply.status} - ${await reply.text()}`);
  }
  const parsed = await reply.json();
  if (parsed.success) {
    const markdown = parsed.result?.markdown;
    return markdown ? markdown : '';
  }
  throw new Error(`PaddleOCR-VL error: ${parsed.error}`);
 }
 /**
 * Extract invoice fields using MiniCPM (via Ollama), giving it the page
 * images plus the OCR Markdown as reference text.
 *
 * The Markdown is cut to at most 8000 characters before being embedded in
 * the prompt. The Ollama reply is consumed as a stream of newline-delimited
 * JSON objects whose `response` fragments are concatenated; the text between
 * the first '{' and the last '}' of the result is parsed as the invoice.
 *
 * @param markdown OCR Markdown of the invoice, embedded in the prompt.
 * @param images Base64 images sent in the request's `images` field.
 * @returns The parsed JSON object (not validated against IInvoice).
 * @throws Error on a non-OK HTTP status, a missing response body, or when no
 *         JSON object is found; JSON.parse errors propagate unchanged.
 */
 async function extractInvoiceFromMarkdown(markdown: string, images: string[]): Promise<IInvoice> {
  // Truncate if too long
  const truncated = markdown.length > 8000 ? markdown.slice(0, 8000) : markdown;
  console.log(`    [Extract] Processing ${truncated.length} chars of Markdown`);
  const prompt = `/nothink
 You are an invoice parser. Extract fields from this invoice image.
 Required fields:
 - invoice_number: The invoice/receipt number
 - invoice_date: Date in YYYY-MM-DD format
 - vendor_name: Company that issued the invoice
 - currency: EUR, USD, etc.
 - net_amount: Amount before tax
 - vat_amount: Tax/VAT amount (0 if reverse charge)
 - total_amount: Final amount due
 Return ONLY a JSON object like:
 {"invoice_number":"123","invoice_date":"2022-01-28","vendor_name":"Adobe","currency":"EUR","net_amount":24.99,"vat_amount":0,"total_amount":24.99}
 Use null for missing strings, 0 for missing numbers. No explanation.
 OCR text from the invoice (for reference):
 ---
 ${truncated}
 ---`;
  const payload = {
    model: MINICPM_MODEL,
    prompt,
    images,  // Send the actual image to MiniCPM
    stream: true,
    options: {
      num_predict: 2048,
      temperature: 0.1,
    },
  };
  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });
  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }
  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('No response body');
  }
  const decoder = new TextDecoder();
  let fullText = '';
  // Holds the trailing, not-yet-terminated line between reads. Network chunks
  // do not respect line boundaries, so parsing each chunk on its own would
  // drop every JSON object that happens to straddle two chunks.
  let pending = '';
  const consumeLine = (line: string): void => {
    if (!line.trim()) {
      return;
    }
    try {
      const json = JSON.parse(line);
      if (json.response) {
        fullText += json.response;
      }
    } catch {
      // Skip invalid JSON lines
    }
  };
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    pending += decoder.decode(value, { stream: true });
    const lines = pending.split('\n');
    // The last element is either '' (chunk ended on a newline) or a partial line
    pending = lines.pop() ?? '';
    lines.forEach(consumeLine);
  }
  // Flush bytes still buffered in the decoder and handle a final unterminated line
  pending += decoder.decode();
  consumeLine(pending);
  // Extract JSON from response
  const startIdx = fullText.indexOf('{');
  const endIdx = fullText.lastIndexOf('}') + 1;
  if (startIdx < 0 || endIdx <= startIdx) {
    throw new Error(`No JSON object found in response: ${fullText.substring(0, 200)}`);
  }
  const jsonStr = fullText.substring(startIdx, endIdx);
  return JSON.parse(jsonStr);
 }
 /**
 * Run one extraction pass: OCR the first image with PaddleOCR-VL, then let
 * MiniCPM extract the invoice fields from that Markdown plus all images.
 *
 * @param images Base64 page images; only the first is sent to parseDocument,
 *               the whole array is forwarded to extractInvoiceFromMarkdown.
 * @param passNum Pass number supplied by the caller (not used in the body).
 * @returns The invoice object produced by extractInvoiceFromMarkdown.
 */
 async function extractOnce(images: string[], passNum: number): Promise<IInvoice> {
  const [firstPage] = images;
  const markdown = await parseDocument(firstPage);
  const lineCount = markdown.split('\n').length;
  console.log(`    [Parse] Got ${lineCount} lines of Markdown`);
  return extractInvoiceFromMarkdown(markdown, images);
 }
 /**
 * Build the comparison key used for consensus voting.
 *
 * @param invoice Invoice whose number, date and total form the key.
 * @returns "<invoice_number>|<invoice_date>|<total_amount with 2 decimals>".
 */
 function hashInvoice(invoice: IInvoice): string {
  const { invoice_number, invoice_date, total_amount } = invoice;
  // String() keeps null/undefined visible as "null"/"undefined" in the key
  const number = String(invoice_number);
  const date = String(invoice_date);
  const total = total_amount.toFixed(2);
  return number + '|' + date + '|' + total;
 }
 /**
 * Repeat extraction until two passes agree on the invoice key.
 *
 * Each successful pass is keyed with hashInvoice; as soon as a key has been
 * seen twice that pass's invoice is returned. Failing passes are logged and
 * skipped. If no key repeats within maxPasses, the first result carrying the
 * most frequent key is returned.
 *
 * @param images Base64 page images handed to extractOnce.
 * @param invoiceName Name used in the error message when nothing succeeded.
 * @param maxPasses Maximum number of extraction passes (default 5).
 * @returns The agreed-upon (or most common) invoice.
 * @throws Error when no pass produced a result.
 */
 async function extractWithConsensus(images: string[], invoiceName: string, maxPasses: number = 5): Promise<IInvoice> {
  const collected: Array<{ invoice: IInvoice; hash: string }> = [];
  const tally = new Map<string, number>();
  let pass = 0;
  while (pass < maxPasses) {
    pass += 1;
    try {
      const invoice = await extractOnce(images, pass);
      const hash = hashInvoice(invoice);
      collected.push({ invoice, hash });
      const seen = (tally.get(hash) || 0) + 1;
      tally.set(hash, seen);
      console.log(`  [Pass ${pass}] ${invoice.invoice_number} | ${invoice.invoice_date} | ${invoice.total_amount} ${invoice.currency}`);
      if (seen >= 2) {
        console.log(`  [Consensus] Reached after ${pass} passes`);
        return invoice;
      }
    } catch (err) {
      console.log(`  [Pass ${pass}] Error: ${err}`);
    }
  }
  // No consensus reached - pick the key with the highest count; on ties the
  // earliest-inserted key wins because only a strictly larger count replaces it
  const [bestHash, bestCount] = Array.from(tally.entries()).reduce<[string, number]>(
    (top, entry) => (entry[1] > top[1] ? entry : top),
    ['', 0]
  );
  if (!bestHash) {
    throw new Error(`No valid results for ${invoiceName}`);
  }
  const chosen = collected.find((entry) => entry.hash === bestHash)!;
  console.log(`  [No consensus] Using most common result (${bestCount}/${maxPasses} passes)`);
  return chosen.invoice;
 }
 /**
 * Compare an extracted invoice against the expected fixture.
 *
 * Checks four fields: invoice number (whitespace- and case-insensitive),
 * invoice date (exact), total amount (within 0.02) and currency
 * (case-insensitive). vendor_name, net_amount and vat_amount are not compared.
 *
 * @param extracted Invoice produced by the extraction pipeline.
 * @param expected Invoice loaded from the expected-result JSON.
 * @returns `match` is true when no field differs; `errors` has one message per differing field.
 */
 function compareInvoice(
  extracted: IInvoice,
  expected: IInvoice
 ): { match: boolean; errors: string[] } {
  const errors: string[] = [];
  // Normalize by removing spaces and case. String() also covers a model that
  // returned the number as a JSON number, which has no .replace method.
  const normalizeNumber = (value: unknown): string =>
    String(value ?? '').replace(/\s/g, '').toLowerCase();
  if (normalizeNumber(extracted.invoice_number) !== normalizeNumber(expected.invoice_number)) {
    errors.push(`invoice_number: expected "${expected.invoice_number}", got "${extracted.invoice_number}"`);
  }
  // Compare date
  if (extracted.invoice_date !== expected.invoice_date) {
    errors.push(`invoice_date: expected "${expected.invoice_date}", got "${extracted.invoice_date}"`);
  }
  // Compare total amount (with tolerance). Written as !(diff <= tol) so that a
  // NaN difference (missing or non-numeric total) counts as a mismatch;
  // `diff > tol` is false for NaN and would silently report a match.
  const totalDiff = Math.abs(extracted.total_amount - expected.total_amount);
  if (!(totalDiff <= 0.02)) {
    errors.push(`total_amount: expected ${expected.total_amount}, got ${extracted.total_amount}`);
  }
  // Compare currency
  if (extracted.currency?.toUpperCase() !== expected.currency?.toUpperCase()) {
    errors.push(`currency: expected "${expected.currency}", got "${extracted.currency}"`);
  }
  return { match: errors.length === 0, errors };
 }
 /**
 * Find all test cases (PDF + JSON pairs) in .nogit/invoices/
 *
 * The directory is resolved relative to process.cwd(). A PDF becomes a test
 * case only when a JSON file with the same base name sits next to it.
 *
 * @returns Test cases sorted by name; an empty array when the directory does not exist.
 */
 function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const testDir = path.join(process.cwd(), '.nogit/invoices');
  if (!fs.existsSync(testDir)) {
    return [];
  }
  const pdfSuffix = '.pdf';
  const files = fs.readdirSync(testDir);
  const available = new Set(files);
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];
  for (const pdf of files) {
    if (!pdf.endsWith(pdfSuffix)) {
      continue;
    }
    // Strip only the trailing extension. String.replace('.pdf', '') removes the
    // FIRST occurrence, which mangles names such as "a.pdf.v2.pdf".
    const baseName = pdf.slice(0, -pdfSuffix.length);
    const jsonFile = `${baseName}.json`;
    if (available.has(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }
  // Sort alphabetically
  testCases.sort((a, b) => a.name.localeCompare(b.name));
  return testCases;
 }
 // Tests
 // Setup step: both helpers must resolve to true before the invoice tests run.
 tap.test('setup: ensure Docker containers are running', async () => {
  console.log('\n[Setup] Checking Docker containers...\n');
  // PaddleOCR-VL Full Pipeline first, then MiniCPM (used for field extraction)
  expect(await ensurePaddleOcrVlFull()).toBeTrue();
  expect(await ensureMiniCpm()).toBeTrue();
  console.log('\n[Setup] All containers ready!\n');
 });
 // Dynamic test for each PDF/JSON pair
 const testCases = findTestCases();
 console.log(`\nFound ${testCases.length} invoice test cases (PaddleOCR-VL Full Pipeline)\n`);
 // Counters and per-invoice durations, read by the 'summary' test
 let passedCount = 0;
 let failedCount = 0;
 const processingTimes: number[] = [];
 for (const testCase of testCases) {
  tap.test(`should extract invoice: ${testCase.name}`, async () => {
    const startTime = Date.now();
    let elapsedMs: number;
    let result: { match: boolean; errors: string[] };
    try {
      // Load expected data
      const expected: IInvoice = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
      console.log(`\n=== ${testCase.name} ===`);
      console.log(`Expected: ${expected.invoice_number} | ${expected.invoice_date} | ${expected.total_amount} ${expected.currency}`);
      // Convert PDF to images
      const images = convertPdfToImages(testCase.pdfPath);
      console.log(`  Pages: ${images.length}`);
      // Extract with consensus voting (PaddleOCR-VL Full -> MiniCPM)
      const extracted = await extractWithConsensus(images, testCase.name);
      elapsedMs = Date.now() - startTime;
      // Compare results
      result = compareInvoice(extracted, expected);
    } catch (err) {
      // A crash (bad fixture, conversion or extraction failure) must still be
      // counted, otherwise Passed + Failed in the summary does not add up to
      // the number of invoices and the timing average skips this invoice.
      const failedAfterMs = Date.now() - startTime;
      failedCount++;
      processingTimes.push(failedAfterMs);
      console.log(`  Result: ERROR (${(failedAfterMs / 1000).toFixed(1)}s) - ${err}`);
      throw err;
    }
    processingTimes.push(elapsedMs);
    if (result.match) {
      passedCount++;
      console.log(`  Result: MATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
    } else {
      failedCount++;
      console.log(`  Result: MISMATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
      result.errors.forEach((e) => console.log(`    - ${e}`));
    }
    // Assert match
    expect(result.match).toBeTrue();
  });
 }
 // Closing pseudo-test: prints pass/fail counts, accuracy and timing totals
 // gathered by the per-invoice tests, then starts the tap run.
 tap.test('summary', async () => {
  const invoiceCount = testCases.length;
  const timedRuns = processingTimes.length;
  // Sum the per-invoice durations in recording order
  let summedMs = 0;
  for (const ms of processingTimes) {
    summedMs += ms;
  }
  const meanMs = timedRuns > 0 ? summedMs / timedRuns : 0;
  const accuracyPct = invoiceCount > 0 ? (passedCount / invoiceCount) * 100 : 0;
  const reportLines: string[] = [
    `\n======================================================`,
    `   Invoice Extraction Summary (PaddleOCR-VL Full)`,
    `======================================================`,
    `  Method:    PaddleOCR-VL Full Pipeline -> MiniCPM`,
    `  Passed:    ${passedCount}/${invoiceCount}`,
    `  Failed:    ${failedCount}/${invoiceCount}`,
    `  Accuracy:  ${accuracyPct.toFixed(1)}%`,
    `------------------------------------------------------`,
    `  Total time:   ${(summedMs / 1000).toFixed(1)}s`,
    `  Avg per inv:  ${(meanMs / 1000).toFixed(1)}s`,
    `======================================================\n`,
  ];
  // One console.log call per line, exactly as the report is laid out above
  for (const line of reportLines) {
    console.log(line);
  }
 });
 export default tap.start();
Author	SHA1	Message	Date
Juergen Kunz	311e7a8fd4	v1.6.0 Some checks failed Docker (tags) / security (push) Successful in 32s Details Docker (tags) / test (push) Failing after 40s Details Docker (tags) / release (push) Has been skipped Details Docker (tags) / metadata (push) Has been skipped Details	2026-01-17 20:22:23 +00:00
Juergen Kunz	80e6866442	feat(paddleocr-vl): add PaddleOCR-VL full pipeline Docker image and API server, plus integration tests and docker helpers	2026-01-17 20:22:23 +00:00