v1.5.0

feat(paddleocr-vl): add PaddleOCR-VL GPU Dockerfile, pin vllm, update CPU image deps, and improve entrypoint and tests
update
2026-01-17 16:57:26 +00:00 · 2026-01-17 16:57:26 +00:00 · 2026-01-16 16:21:44 +00:00 · 2026-01-16 14:24:37 +00:00 · 2026-01-16 14:24:37 +00:00
20 changed files with 1547 additions and 1094 deletions
--- a/49
+++ b/49
@@ -1,49 +0,0 @@
 # PaddleOCR GPU Variant
 # OCR processing with NVIDIA GPU support using PaddlePaddle
 FROM paddlepaddle/paddle:2.6.2-gpu-cuda11.7-cudnn8.4-trt8.4
 LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
 LABEL description="PaddleOCR PP-OCRv4 - GPU optimized"
 LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
 # Environment configuration
 ENV OCR_LANGUAGE="en"
 ENV SERVER_PORT="5000"
 ENV SERVER_HOST="0.0.0.0"
 ENV PYTHONUNBUFFERED=1
 # Set working directory
 WORKDIR /app
 # Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1-mesa-glx \
    libglib2.0-0 \
    curl \
    && rm -rf /var/lib/apt/lists/*
 # Install Python dependencies (using stable paddleocr 2.x)
 RUN pip install --no-cache-dir \
    paddleocr==2.8.1 \
    fastapi \
    uvicorn[standard] \
    python-multipart \
    opencv-python-headless \
    pillow
 # Copy server files
 COPY image_support_files/paddleocr_server.py /app/paddleocr_server.py
 COPY image_support_files/paddleocr-entrypoint.sh /usr/local/bin/paddleocr-entrypoint.sh
 RUN chmod +x /usr/local/bin/paddleocr-entrypoint.sh
 # Note: OCR models will be downloaded on first run
 # This ensures compatibility across different GPU architectures
 # Expose API port
 EXPOSE 5000
 # Health check
 HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:5000/health || exit 1
 ENTRYPOINT ["/usr/local/bin/paddleocr-entrypoint.sh"]
--- a/53
+++ b/53
@@ -1,53 +0,0 @@
 # PaddleOCR CPU Variant
 # OCR processing optimized for CPU-only inference
 FROM python:3.10-slim-bookworm
 LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
 LABEL description="PaddleOCR PP-OCRv4 - CPU optimized"
 LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
 # Environment configuration for CPU-only mode
 ENV OCR_LANGUAGE="en"
 ENV SERVER_PORT="5000"
 ENV SERVER_HOST="0.0.0.0"
 ENV PYTHONUNBUFFERED=1
 # Disable GPU usage for CPU-only variant
 ENV CUDA_VISIBLE_DEVICES="-1"
 # Set working directory
 WORKDIR /app
 # Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1-mesa-glx \
    libglib2.0-0 \
    libgomp1 \
    curl \
    && rm -rf /var/lib/apt/lists/*
 # Install Python dependencies (CPU version of PaddlePaddle - using stable 2.x versions)
 RUN pip install --no-cache-dir \
    paddlepaddle==2.6.2 \
    paddleocr==2.8.1 \
    fastapi \
    uvicorn[standard] \
    python-multipart \
    opencv-python-headless \
    pillow
 # Copy server files
 COPY image_support_files/paddleocr_server.py /app/paddleocr_server.py
 COPY image_support_files/paddleocr-entrypoint.sh /usr/local/bin/paddleocr-entrypoint.sh
 RUN chmod +x /usr/local/bin/paddleocr-entrypoint.sh
 # Note: OCR models will be downloaded on first run
 # This avoids build-time segfaults with certain CPU architectures
 # Expose API port
 EXPOSE 5000
 # Health check (longer start-period for CPU variant)
 HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
    CMD curl -f http://localhost:5000/health || exit 1
 ENTRYPOINT ["/usr/local/bin/paddleocr-entrypoint.sh"]
--- a/70
+++ b/70
@@ -0,0 +1,70 @@
 # PaddleOCR-VL GPU Variant
 # Vision-Language Model for document parsing using vLLM
 FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
 LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
 LABEL description="PaddleOCR-VL 0.9B - Vision-Language Model for document parsing"
 LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
 # Environment configuration
 ENV DEBIAN_FRONTEND=noninteractive
 ENV PYTHONUNBUFFERED=1
 ENV HF_HOME=/root/.cache/huggingface
 ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
 # Set working directory
 WORKDIR /app
 # Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
    python3.11 \
    python3.11-venv \
    python3.11-dev \
    python3-pip \
    git \
    curl \
    build-essential \
    && rm -rf /var/lib/apt/lists/* \
    && update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
 # Create and activate virtual environment
 RUN python -m venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"
 # Install PyTorch with CUDA support
 RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir \
    torch==2.5.1 \
    torchvision \
    --index-url https://download.pytorch.org/whl/cu124
 # Install vLLM 0.11.1 (first stable release with PaddleOCR-VL support)
 RUN pip install --no-cache-dir \
    vllm==0.11.1 \
    --extra-index-url https://download.pytorch.org/whl/cu124
 # Install additional dependencies
 RUN pip install --no-cache-dir \
    transformers \
    accelerate \
    safetensors \
    pillow \
    fastapi \
    uvicorn[standard] \
    python-multipart \
    openai \
    httpx
 # Copy entrypoint script
 COPY image_support_files/paddleocr-vl-entrypoint.sh /usr/local/bin/paddleocr-vl-entrypoint.sh
 RUN chmod +x /usr/local/bin/paddleocr-vl-entrypoint.sh
 # Expose vLLM API port
 EXPOSE 8000
 # Health check
 HEALTHCHECK --interval=30s --timeout=10s --start-period=300s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1
 ENTRYPOINT ["/usr/local/bin/paddleocr-vl-entrypoint.sh"]
--- a/57
+++ b/57
@@ -0,0 +1,57 @@
 # PaddleOCR-VL CPU Variant
 # Vision-Language Model for document parsing using transformers (slower, no GPU required)
 FROM python:3.11-slim-bookworm
 LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
 LABEL description="PaddleOCR-VL 0.9B CPU - Vision-Language Model for document parsing"
 LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
 # Environment configuration
 ENV PYTHONUNBUFFERED=1
 ENV HF_HOME=/root/.cache/huggingface
 ENV CUDA_VISIBLE_DEVICES=""
 ENV SERVER_PORT=8000
 ENV SERVER_HOST=0.0.0.0
 # Set working directory
 WORKDIR /app
 # Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1-mesa-glx \
    libglib2.0-0 \
    libgomp1 \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*
 # Install Python dependencies
 RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir \
    torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cpu && \
    pip install --no-cache-dir \
    transformers \
    accelerate \
    safetensors \
    pillow \
    fastapi \
    uvicorn[standard] \
    python-multipart \
    httpx \
    protobuf \
    sentencepiece \
    einops
 # Copy server files
 COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py
 COPY image_support_files/paddleocr-vl-cpu-entrypoint.sh /usr/local/bin/paddleocr-vl-cpu-entrypoint.sh
 RUN chmod +x /usr/local/bin/paddleocr-vl-cpu-entrypoint.sh
 # Expose API port
 EXPOSE 8000
 # Health check (longer start-period for CPU + model download)
 HEALTHCHECK --interval=30s --timeout=10s --start-period=600s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1
 ENTRYPOINT ["/usr/local/bin/paddleocr-vl-cpu-entrypoint.sh"]
--- a/71
+++ b/71
@@ -0,0 +1,71 @@
 # PaddleOCR-VL GPU Variant (Transformers-based, not vLLM)
 # Vision-Language Model for document parsing using transformers with CUDA
 FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04
 LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
 LABEL description="PaddleOCR-VL 0.9B GPU - Vision-Language Model using transformers"
 LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
 # Environment configuration
 ENV DEBIAN_FRONTEND=noninteractive
 ENV PYTHONUNBUFFERED=1
 ENV HF_HOME=/root/.cache/huggingface
 ENV SERVER_PORT=8000
 ENV SERVER_HOST=0.0.0.0
 # Set working directory
 WORKDIR /app
 # Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
    python3.11 \
    python3.11-venv \
    python3.11-dev \
    python3-pip \
    libgl1-mesa-glx \
    libglib2.0-0 \
    libgomp1 \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/* \
    && update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
 # Create and activate virtual environment
 RUN python -m venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"
 # Install PyTorch with CUDA support
 RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir \
    torch==2.5.1 \
    torchvision \
    --index-url https://download.pytorch.org/whl/cu124
 # Install Python dependencies (transformers-based, not vLLM)
 RUN pip install --no-cache-dir \
    transformers \
    accelerate \
    safetensors \
    pillow \
    fastapi \
    uvicorn[standard] \
    python-multipart \
    httpx \
    protobuf \
    sentencepiece \
    einops
 # Copy server files (same as CPU variant - it auto-detects CUDA)
 COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py
 COPY image_support_files/paddleocr-vl-cpu-entrypoint.sh /usr/local/bin/paddleocr-vl-entrypoint.sh
 RUN chmod +x /usr/local/bin/paddleocr-vl-entrypoint.sh
 # Expose API port
 EXPOSE 8000
 # Health check
 HEALTHCHECK --interval=30s --timeout=10s --start-period=300s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1
 ENTRYPOINT ["/usr/local/bin/paddleocr-vl-entrypoint.sh"]
--- a/build-images.sh
+++ b/build-images.sh
@@ -29,19 +29,19 @@ docker build \
    -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v-cpu \
    .
-# Build PaddleOCR GPU variant
+# Build PaddleOCR-VL GPU variant (vLLM)
-echo -e "${GREEN}Building PaddleOCR GPU variant...${NC}"
+echo -e "${GREEN}Building PaddleOCR-VL GPU variant (vLLM)...${NC}"
 docker build \
-    -f Dockerfile_paddleocr \
+    -f Dockerfile_paddleocr_vl \
-    -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr \
+    -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl \
-    -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-gpu \
+    -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-gpu \
    .
-# Build PaddleOCR CPU variant
+# Build PaddleOCR-VL CPU variant
-echo -e "${GREEN}Building PaddleOCR CPU variant...${NC}"
+echo -e "${GREEN}Building PaddleOCR-VL CPU variant...${NC}"
 docker build \
-    -f Dockerfile_paddleocr_cpu \
+    -f Dockerfile_paddleocr_vl_cpu \
-    -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-cpu \
+    -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-cpu \
    .
 echo -e "${GREEN}All images built successfully!${NC}"
@@ -52,7 +52,7 @@ echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v (GPU)"
 echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v-cpu (CPU)"
 echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:latest (GPU)"
 echo ""
-echo "  PaddleOCR:"
+echo "  PaddleOCR-VL (Vision-Language Model):"
-echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr (GPU)"
+echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl (GPU/vLLM)"
-echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-gpu (GPU)"
+echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-gpu (GPU/vLLM)"
-echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-cpu (CPU)"
+echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-cpu (CPU)"
--- a/changelog.md
+++ b/changelog.md
@@ -1,5 +1,24 @@
 # Changelog
 ## 2026-01-17 - 1.5.0 - feat(paddleocr-vl)
 add PaddleOCR-VL GPU Dockerfile, pin vllm, update CPU image deps, and improve entrypoint and tests
 - Add a new GPU Dockerfile for PaddleOCR-VL (transformers-based) with CUDA support, healthcheck, and entrypoint.
 - Pin vllm to 0.11.1 in Dockerfile_paddleocr_vl to use the first stable release with PaddleOCR-VL support.
 - Update CPU image: add torchvision==0.20.1 and extra Python deps (protobuf, sentencepiece, einops) required by the transformers-based server.
 - Rewrite paddleocr-vl-entrypoint.sh to build vllm args array, add MAX_MODEL_LEN and ENFORCE_EAGER env vars, include --limit-mm-per-prompt and optional --enforce-eager, and switch to exec vllm with constructed args.
 - Update tests to use the OpenAI-compatible PaddleOCR-VL chat completions API (/v1/chat/completions) with image+text message payload and model 'paddleocr-vl'.
 - Add @types/node to package.json dependencies and tidy devDependencies ordering.
 ## 2026-01-16 - 1.4.0 - feat(invoices)
 add hybrid OCR + vision invoice/document parsing with PaddleOCR, consensus voting, and prompt/test refactors
 - Add hybrid pipeline documentation and examples (PaddleOCR + MiniCPM-V) and architecture diagram in recipes/document.md
 - Integrate PaddleOCR: new OCR extraction functions and OCR-only prompt flow in test/test.node.ts
 - Add consensus voting and parallel-pass optimization to improve reliability (multiple passes, hashing, and majority voting)
 - Refactor prompts and tests: introduce /nothink token, OCR truncation limits, separate visual and OCR-only prompts, and improved prompt building in test/test.invoices.ts
 - Update image conversion defaults (200 DPI, filename change) and add TypeScript helper functions for extraction and consensus handling
 ## 2026-01-16 - 1.3.0 - feat(paddleocr)
 add PaddleOCR OCR service (Docker images, server, tests, docs) and CI workflows
--- a/image_support_files/paddleocr-entrypoint.sh
+++ b/image_support_files/paddleocr-entrypoint.sh
@@ -1,25 +0,0 @@
 #!/bin/bash
 set -e
 # Configuration from environment
 OCR_LANGUAGE="${OCR_LANGUAGE:-en}"
 SERVER_PORT="${SERVER_PORT:-5000}"
 SERVER_HOST="${SERVER_HOST:-0.0.0.0}"
 echo "Starting PaddleOCR Server..."
 echo "  Language: ${OCR_LANGUAGE}"
 echo "  Host: ${SERVER_HOST}"
 echo "  Port: ${SERVER_PORT}"
 # Check GPU availability
 if [ "${CUDA_VISIBLE_DEVICES}" = "-1" ]; then
    echo "  GPU: Disabled (CPU mode)"
 else
    echo "  GPU: Enabled"
 fi
 # Start the FastAPI server with uvicorn
 exec python -m uvicorn paddleocr_server:app \
    --host "${SERVER_HOST}" \
    --port "${SERVER_PORT}" \
    --workers 1
--- a/image_support_files/paddleocr-vl-cpu-entrypoint.sh
+++ b/image_support_files/paddleocr-vl-cpu-entrypoint.sh
@@ -0,0 +1,19 @@
 #!/bin/bash
 set -e
 echo "==================================="
 echo "PaddleOCR-VL Server (CPU)"
 echo "==================================="
 HOST="${SERVER_HOST:-0.0.0.0}"
 PORT="${SERVER_PORT:-8000}"
 echo "Host: ${HOST}"
 echo "Port: ${PORT}"
 echo "Device: CPU (no GPU)"
 echo ""
 echo "Starting PaddleOCR-VL CPU server..."
 echo "==================================="
 exec python /app/paddleocr_vl_server.py
--- a/image_support_files/paddleocr-vl-entrypoint.sh
+++ b/image_support_files/paddleocr-vl-entrypoint.sh
@@ -0,0 +1,59 @@
 #!/bin/bash
 set -e
 echo "==================================="
 echo "PaddleOCR-VL Server"
 echo "==================================="
 # Configuration
 MODEL_NAME="${MODEL_NAME:-PaddlePaddle/PaddleOCR-VL}"
 HOST="${HOST:-0.0.0.0}"
 PORT="${PORT:-8000}"
 MAX_BATCHED_TOKENS="${MAX_BATCHED_TOKENS:-16384}"
 GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.9}"
 MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}"
 ENFORCE_EAGER="${ENFORCE_EAGER:-false}"
 echo "Model: ${MODEL_NAME}"
 echo "Host: ${HOST}"
 echo "Port: ${PORT}"
 echo "Max batched tokens: ${MAX_BATCHED_TOKENS}"
 echo "GPU memory utilization: ${GPU_MEMORY_UTILIZATION}"
 echo "Max model length: ${MAX_MODEL_LEN}"
 echo "Enforce eager: ${ENFORCE_EAGER}"
 echo ""
 # Check GPU availability
 if command -v nvidia-smi &> /dev/null; then
    echo "GPU Information:"
    nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv
    echo ""
 else
    echo "WARNING: nvidia-smi not found. GPU may not be available."
 fi
 echo "Starting vLLM server..."
 echo "==================================="
 # Build vLLM command
 VLLM_ARGS=(
    serve "${MODEL_NAME}"
    --trust-remote-code
    --host "${HOST}"
    --port "${PORT}"
    --max-num-batched-tokens "${MAX_BATCHED_TOKENS}"
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"
    --max-model-len "${MAX_MODEL_LEN}"
    --no-enable-prefix-caching
    --mm-processor-cache-gb 0
    --served-model-name "paddleocr-vl"
    --limit-mm-per-prompt '{"image": 1}'
 )
 # Add enforce-eager if enabled (disables CUDA graphs, saves memory)
 if [ "${ENFORCE_EAGER}" = "true" ]; then
    VLLM_ARGS+=(--enforce-eager)
 fi
 # Start vLLM server with PaddleOCR-VL
 exec vllm "${VLLM_ARGS[@]}"
--- a/image_support_files/paddleocr_server.py
+++ b/image_support_files/paddleocr_server.py
@@ -1,253 +0,0 @@
 #!/usr/bin/env python3
 """
 PaddleOCR FastAPI Server
 Provides REST API for OCR operations using PaddleOCR
 """
 import os
 import io
 import base64
 import logging
 from typing import Optional, List, Any
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
 import numpy as np
 from PIL import Image
 from paddleocr import PaddleOCR
 # Configure logging
 logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
 # Environment configuration
 OCR_LANGUAGE = os.environ.get('OCR_LANGUAGE', 'en')
 # GPU is controlled via CUDA_VISIBLE_DEVICES environment variable
 USE_GPU = os.environ.get('CUDA_VISIBLE_DEVICES', '') != '-1'
 # Initialize FastAPI app
 app = FastAPI(
    title="PaddleOCR Server",
    description="REST API for OCR operations using PaddleOCR PP-OCRv4",
    version="1.0.0"
 )
 # Global OCR instance
 ocr_instance: Optional[PaddleOCR] = None
 class OCRRequest(BaseModel):
    """Request model for base64 image OCR"""
    image: str
    language: Optional[str] = None
 class BoundingBox(BaseModel):
    """Bounding box for detected text"""
    points: List[List[float]]
 class OCRResult(BaseModel):
    """Single OCR detection result"""
    text: str
    confidence: float
    box: List[List[float]]
 class OCRResponse(BaseModel):
    """OCR response model"""
    success: bool
    results: List[OCRResult]
    error: Optional[str] = None
 class HealthResponse(BaseModel):
    """Health check response"""
    status: str
    model: str
    language: str
    gpu_enabled: bool
 def get_ocr(lang: Optional[str] = None) -> PaddleOCR:
    """Get or initialize the OCR instance"""
    global ocr_instance
    use_lang = lang or OCR_LANGUAGE
    # Return cached instance if same language
    if ocr_instance is not None and lang is None:
        return ocr_instance
    logger.info(f"Initializing PaddleOCR with language={use_lang}, use_gpu={USE_GPU}")
    new_ocr = PaddleOCR(
        use_angle_cls=True,
        lang=use_lang,
        use_gpu=USE_GPU,
        show_log=False
    )
    # Cache the default language instance
    if lang is None:
        ocr_instance = new_ocr
    logger.info("PaddleOCR initialized successfully")
    return new_ocr
 def decode_base64_image(base64_string: str) -> np.ndarray:
    """Decode base64 string to numpy array"""
    # Remove data URL prefix if present
    if ',' in base64_string:
        base64_string = base64_string.split(',')[1]
    image_data = base64.b64decode(base64_string)
    image = Image.open(io.BytesIO(image_data))
    # Convert to RGB if necessary
    if image.mode != 'RGB':
        image = image.convert('RGB')
    return np.array(image)
 def process_ocr_result(result: Any) -> List[OCRResult]:
    """Process PaddleOCR result into structured format"""
    results = []
    if result is None or len(result) == 0:
        return results
    # PaddleOCR returns list of results per image
    # Each result is a list of [box, (text, confidence)]
    for line in result[0] if result[0] else []:
        if line is None:
            continue
        box = line[0]  # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
        text_info = line[1]  # (text, confidence)
        results.append(OCRResult(
            text=text_info[0],
            confidence=float(text_info[1]),
            box=[[float(p[0]), float(p[1])] for p in box]
        ))
    return results
@app.on_event("startup")
 async def startup_event():
    """Pre-warm the OCR model on startup"""
    logger.info("Pre-warming OCR model...")
    try:
        ocr = get_ocr()
        # Create a small test image to warm up the model
        test_image = np.zeros((100, 100, 3), dtype=np.uint8)
        test_image.fill(255)  # White image
        ocr.ocr(test_image, cls=True)
        logger.info("OCR model pre-warmed successfully")
    except Exception as e:
        logger.error(f"Failed to pre-warm OCR model: {e}")
@app.get("/health", response_model=HealthResponse)
 async def health_check():
    """Health check endpoint"""
    try:
        # Ensure OCR is initialized
        get_ocr()
        return HealthResponse(
            status="healthy",
            model="PP-OCRv4",
            language=OCR_LANGUAGE,
            gpu_enabled=USE_GPU
        )
    except Exception as e:
        logger.error(f"Health check failed: {e}")
        raise HTTPException(status_code=503, detail=str(e))
@app.post("/ocr", response_model=OCRResponse)
 async def ocr_base64(request: OCRRequest):
    """
    Perform OCR on a base64-encoded image
    Args:
        request: OCRRequest with base64 image and optional language
    Returns:
        OCRResponse with detected text, confidence scores, and bounding boxes
    """
    try:
        # Decode image
        image = decode_base64_image(request.image)
        # Get OCR instance (use request language if provided)
        if request.language and request.language != OCR_LANGUAGE:
            ocr = get_ocr(request.language)
        else:
            ocr = get_ocr()
        result = ocr.ocr(image, cls=True)
        # Process results
        results = process_ocr_result(result)
        return OCRResponse(success=True, results=results)
    except Exception as e:
        logger.error(f"OCR processing failed: {e}")
        return OCRResponse(success=False, results=[], error=str(e))
@app.post("/ocr/upload", response_model=OCRResponse)
 async def ocr_upload(
    img: UploadFile = File(...),
    language: Optional[str] = Form(None)
 ):
    """
    Perform OCR on an uploaded image file
    Args:
        img: Uploaded image file
        language: Optional language code (default: env OCR_LANGUAGE)
    Returns:
        OCRResponse with detected text, confidence scores, and bounding boxes
    """
    try:
        # Read image
        contents = await img.read()
        image = Image.open(io.BytesIO(contents))
        # Convert to RGB if necessary
        if image.mode != 'RGB':
            image = image.convert('RGB')
        image_array = np.array(image)
        # Get OCR instance
        if language and language != OCR_LANGUAGE:
            ocr = get_ocr(language)
        else:
            ocr = get_ocr()
        result = ocr.ocr(image_array, cls=True)
        # Process results
        results = process_ocr_result(result)
        return OCRResponse(success=True, results=results)
    except Exception as e:
        logger.error(f"OCR processing failed: {e}")
        return OCRResponse(success=False, results=[], error=str(e))
 if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=5000)
--- a/image_support_files/paddleocr_vl_server.py
+++ b/image_support_files/paddleocr_vl_server.py
@@ -0,0 +1,371 @@
 #!/usr/bin/env python3
 """
 PaddleOCR-VL FastAPI Server (CPU variant)
 Provides OpenAI-compatible REST API for document parsing using PaddleOCR-VL
 """
 import os
 import io
 import base64
 import logging
 import time
 from typing import Optional, List, Any, Dict, Union
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
 import torch
 from PIL import Image
 # Configure logging
 logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
 # Environment configuration
 SERVER_HOST = os.environ.get('SERVER_HOST', '0.0.0.0')
 SERVER_PORT = int(os.environ.get('SERVER_PORT', '8000'))
 MODEL_NAME = os.environ.get('MODEL_NAME', 'PaddlePaddle/PaddleOCR-VL')
 # Device configuration
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 logger.info(f"Using device: {DEVICE}")
 # Task prompts for PaddleOCR-VL
 TASK_PROMPTS = {
    "ocr": "OCR:",
    "table": "Table Recognition:",
    "formula": "Formula Recognition:",
    "chart": "Chart Recognition:",
 }
 # Initialize FastAPI app
 app = FastAPI(
    title="PaddleOCR-VL Server",
    description="OpenAI-compatible REST API for document parsing using PaddleOCR-VL",
    version="1.0.0"
 )
 # Global model instances
 model = None
 processor = None
 # Request/Response models (OpenAI-compatible)
 class ImageUrl(BaseModel):
    url: str
 class ContentItem(BaseModel):
    type: str
    text: Optional[str] = None
    image_url: Optional[ImageUrl] = None
 class Message(BaseModel):
    role: str
    content: Union[str, List[ContentItem]]
 class ChatCompletionRequest(BaseModel):
    model: str = "paddleocr-vl"
    messages: List[Message]
    temperature: Optional[float] = 0.0
    max_tokens: Optional[int] = 4096
 class Choice(BaseModel):
    index: int
    message: Message
    finish_reason: str
 class Usage(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
 class ChatCompletionResponse(BaseModel):
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[Choice]
    usage: Usage
 class HealthResponse(BaseModel):
    status: str
    model: str
    device: str
 def load_model():
    """Load the PaddleOCR-VL model and processor"""
    global model, processor
    if model is not None:
        return
    logger.info(f"Loading PaddleOCR-VL model: {MODEL_NAME}")
    from transformers import AutoModelForCausalLM, AutoProcessor
    # Load processor
    processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
    # Load model with appropriate settings for CPU/GPU
    if DEVICE == "cuda":
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
        ).to(DEVICE).eval()
    else:
        # CPU mode - use float32 for compatibility
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
        ).eval()
    logger.info("PaddleOCR-VL model loaded successfully")
 def decode_image(image_source: str) -> Image.Image:
    """Decode image from URL or base64"""
    if image_source.startswith("data:"):
        # Base64 encoded image
        header, data = image_source.split(",", 1)
        image_data = base64.b64decode(data)
        return Image.open(io.BytesIO(image_data)).convert("RGB")
    elif image_source.startswith("http://") or image_source.startswith("https://"):
        # URL - fetch image
        import httpx
        response = httpx.get(image_source, timeout=30.0)
        response.raise_for_status()
        return Image.open(io.BytesIO(response.content)).convert("RGB")
    else:
        # Assume it's a file path or raw base64
        try:
            image_data = base64.b64decode(image_source)
            return Image.open(io.BytesIO(image_data)).convert("RGB")
        except:
            # Try as file path
            return Image.open(image_source).convert("RGB")
 def extract_image_and_text(content: Union[str, List[ContentItem]]) -> tuple:
    """Extract image and text prompt from message content"""
    if isinstance(content, str):
        return None, content
    image = None
    text = ""
    for item in content:
        if item.type == "image_url" and item.image_url:
            image = decode_image(item.image_url.url)
        elif item.type == "text" and item.text:
            text = item.text
    return image, text
 def generate_response(image: Image.Image, prompt: str, max_tokens: int = 4096) -> str:
    """Generate response using PaddleOCR-VL"""
    load_model()
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ]
        }
    ]
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    )
    if DEVICE == "cuda":
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,
            use_cache=True
        )
    response = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    # Extract the assistant's response (after the prompt)
    if "assistant" in response.lower():
        parts = response.split("assistant")
        if len(parts) > 1:
            response = parts[-1].strip()
    return response
@app.on_event("startup")
 async def startup_event():
    """Pre-load the model on startup"""
    logger.info("Pre-loading PaddleOCR-VL model...")
    try:
        load_model()
        logger.info("Model pre-loaded successfully")
    except Exception as e:
        logger.error(f"Failed to pre-load model: {e}")
        # Don't fail startup - model will be loaded on first request
@app.get("/health", response_model=HealthResponse)
 async def health_check():
    """Health check endpoint"""
    return HealthResponse(
        status="healthy" if model is not None else "loading",
        model=MODEL_NAME,
        device=DEVICE
    )
@app.get("/v1/models")
 async def list_models():
    """List available models (OpenAI-compatible)"""
    return {
        "object": "list",
        "data": [
            {
                "id": "paddleocr-vl",
                "object": "model",
                "created": int(time.time()),
                "owned_by": "paddlepaddle"
            }
        ]
    }
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
 async def chat_completions(request: ChatCompletionRequest):
    """
    OpenAI-compatible chat completions endpoint for PaddleOCR-VL
    Supports tasks:
    - "OCR:" - Text recognition
    - "Table Recognition:" - Table extraction
    - "Formula Recognition:" - Formula extraction
    - "Chart Recognition:" - Chart extraction
    """
    try:
        # Get the last user message
        user_message = None
        for msg in reversed(request.messages):
            if msg.role == "user":
                user_message = msg
                break
        if not user_message:
            raise HTTPException(status_code=400, detail="No user message found")
        # Extract image and prompt
        image, prompt = extract_image_and_text(user_message.content)
        if image is None:
            raise HTTPException(status_code=400, detail="No image provided in message")
        # Default to OCR if no specific prompt
        if not prompt or prompt.strip() == "":
            prompt = "OCR:"
        logger.info(f"Processing request with prompt: {prompt[:50]}...")
        # Generate response
        start_time = time.time()
        response_text = generate_response(image, prompt, request.max_tokens or 4096)
        elapsed = time.time() - start_time
        logger.info(f"Generated response in {elapsed:.2f}s ({len(response_text)} chars)")
        # Build OpenAI-compatible response
        return ChatCompletionResponse(
            id=f"chatcmpl-{int(time.time()*1000)}",
            created=int(time.time()),
            model=request.model,
            choices=[
                Choice(
                    index=0,
                    message=Message(role="assistant", content=response_text),
                    finish_reason="stop"
                )
            ],
            usage=Usage(
                prompt_tokens=100,  # Approximate
                completion_tokens=len(response_text) // 4,
                total_tokens=100 + len(response_text) // 4
            )
        )
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error processing request: {e}")
        raise HTTPException(status_code=500, detail=str(e))
 # Legacy endpoint for compatibility with old PaddleOCR API
 class LegacyOCRRequest(BaseModel):
    image: str
    task: Optional[str] = "ocr"
 class LegacyOCRResponse(BaseModel):
    success: bool
    result: str
    task: str
    error: Optional[str] = None
@app.post("/ocr", response_model=LegacyOCRResponse)
 async def legacy_ocr(request: LegacyOCRRequest):
    """
    Legacy OCR endpoint for backwards compatibility
    Tasks: ocr, table, formula, chart
    """
    try:
        image = decode_image(request.image)
        prompt = TASK_PROMPTS.get(request.task, TASK_PROMPTS["ocr"])
        result = generate_response(image, prompt)
        return LegacyOCRResponse(
            success=True,
            result=result,
            task=request.task
        )
    except Exception as e:
        logger.error(f"Legacy OCR error: {e}")
        return LegacyOCRResponse(
            success=False,
            result="",
            task=request.task,
            error=str(e)
        )
 if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT)
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
  "name": "@host.today/ht-docker-ai",
-  "version": "1.3.0",
+  "version": "1.5.0",
  "type": "module",
  "private": false,
  "description": "Docker images for AI vision-language models including MiniCPM-V 4.5",
@@ -13,8 +13,8 @@
    "test": "tstest test/ --verbose"
  },
  "devDependencies": {
-    "@git.zone/tstest": "^1.0.90",
+    "@git.zone/tsrun": "^1.3.3",
-    "@git.zone/tsrun": "^1.3.3"
+    "@git.zone/tstest": "^1.0.90"
  },
  "repository": {
    "type": "git",
@@ -28,5 +28,8 @@
    "minicpm",
    "ollama",
    "multimodal"
-  ]
+  ],
  "dependencies": {
    "@types/node": "^25.0.9"
  }
 }
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -7,6 +7,10 @@ settings:
 importers:
  .:
    dependencies:
      '@types/node':
        specifier: ^25.0.9
        version: 25.0.9
    devDependencies:
      '@git.zone/tsrun':
        specifier: ^1.3.3
--- a/readme.hints.md
+++ b/readme.hints.md
@@ -77,56 +77,73 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
 CPU variant has longer `start-period` (120s) due to slower startup.
-## PaddleOCR
+## PaddleOCR-VL (Recommended)
 ### Overview
-PaddleOCR is a standalone OCR service using PaddlePaddle's PP-OCRv4 model. It provides:
+PaddleOCR-VL is a 0.9B parameter Vision-Language Model specifically optimized for document parsing. It replaces the older PP-Structure approach with native VLM understanding.
- Text detection and recognition
+**Key advantages over PP-Structure:**
- Multi-language support
+- Native table understanding (no HTML parsing needed)
- FastAPI REST API
+- 109 language support
- GPU and CPU variants
+- Better handling of complex multi-row tables
 - Structured Markdown/JSON output
 ### Docker Images
 | Tag | Description |
 |-----|-------------|
-| `paddleocr` | GPU variant (default) |
+| `paddleocr-vl` | GPU variant using vLLM (recommended) |
-| `paddleocr-gpu` | GPU variant (alias) |
+| `paddleocr-vl-cpu` | CPU variant using transformers |
 | `paddleocr-cpu` | CPU-only variant |
-### API Endpoints
+### API Endpoints (OpenAI-compatible)
 | Endpoint | Method | Description |
 |----------|--------|-------------|
 | `/health` | GET | Health check with model info |
-| `/ocr` | POST | OCR with base64 image (JSON body) |
+| `/v1/models` | GET | List available models |
-| `/ocr/upload` | POST | OCR with file upload (multipart form) |
+| `/v1/chat/completions` | POST | OpenAI-compatible chat completions |
 | `/ocr` | POST | Legacy OCR endpoint |
 ### Request/Response Format
-**POST /ocr (JSON)**
+**POST /v1/chat/completions (OpenAI-compatible)**
 ```json
 {
-  "image": "<base64-encoded-image>",
+  "model": "paddleocr-vl",
-  "language": "en"  // optional
+  "messages": [
    {
      "role": "user",
      "content": [
        {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
        {"type": "text", "text": "Table Recognition:"}
      ]
    }
  ],
  "temperature": 0.0,
  "max_tokens": 8192
 }
 ```
-**POST /ocr/upload (multipart)**
+**Task Prompts:**
- `img`: image file
+- `"OCR:"` - Text recognition
- `language`: optional language code
+- `"Table Recognition:"` - Table extraction (returns markdown)
 - `"Formula Recognition:"` - Formula extraction
 - `"Chart Recognition:"` - Chart extraction
 **Response**
 ```json
 {
-  "success": true,
+  "id": "chatcmpl-...",
-  "results": [
+  "object": "chat.completion",
  "choices": [
    {
-      "text": "Invoice #12345",
+      "index": 0,
-      "confidence": 0.98,
+      "message": {
-      "box": [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
+        "role": "assistant",
        "content": "| Date | Description | Amount |\n|---|---|---|\n| 2021-06-01 | GITLAB INC | -119.96 |"
      },
      "finish_reason": "stop"
    }
  ]
 }
@@ -136,19 +153,16 @@ PaddleOCR is a standalone OCR service using PaddlePaddle's PP-OCRv4 model. It pr
 | Variable | Default | Description |
 |----------|---------|-------------|
-| `OCR_LANGUAGE` | `en` | Default language for OCR |
+| `MODEL_NAME` | `PaddlePaddle/PaddleOCR-VL` | Model to load |
-| `SERVER_PORT` | `5000` | Server port |
+| `HOST` | `0.0.0.0` | Server host |
-| `SERVER_HOST` | `0.0.0.0` | Server host |
+| `PORT` | `8000` | Server port |
-| `CUDA_VISIBLE_DEVICES` | (auto) | Set to `-1` for CPU-only |
+| `MAX_BATCHED_TOKENS` | `16384` | vLLM max batch tokens |
 | `GPU_MEMORY_UTILIZATION` | `0.9` | GPU memory usage (0-1) |
 ### Performance
- **GPU**: ~1-3 seconds per page
+- **GPU (vLLM)**: ~2-5 seconds per page
- **CPU**: ~10-30 seconds per page
+- **CPU**: ~30-60 seconds per page
 ### Supported Languages
 Common language codes: `en` (English), `ch` (Chinese), `de` (German), `fr` (French), `es` (Spanish), `ja` (Japanese), `ko` (Korean)
 ---
@@ -193,6 +207,43 @@ npmci docker build
 npmci docker push code.foss.global
 ```
 ## Multi-Pass Extraction Strategy
 The bank statement extraction uses a dual-VLM consensus approach:
 ### Architecture: Dual-VLM Consensus
 | VLM | Model | Purpose |
 |-----|-------|---------|
 | **MiniCPM-V 4.5** | 8B params | Primary visual extraction |
 | **PaddleOCR-VL** | 0.9B params | Table-specialized extraction |
 ### Extraction Strategy
 1. **Pass 1**: MiniCPM-V visual extraction (images → JSON)
 2. **Pass 2**: PaddleOCR-VL table recognition (images → markdown → JSON)
 3. **Consensus**: If Pass 1 == Pass 2 → Done (fast path)
 4. **Pass 3+**: MiniCPM-V visual if no consensus
 ### Why Dual-VLM Works
 - **Different architectures**: Two independent models cross-check each other
 - **Specialized strengths**: PaddleOCR-VL optimized for tables, MiniCPM-V for general vision
 - **No structure loss**: Both VLMs see the original images directly
 - **Fast consensus**: Most documents complete in 2 passes when VLMs agree
 ### Comparison vs Old PP-Structure Approach
 | Approach | Bank Statement Result | Issue |
 |----------|----------------------|-------|
 | MiniCPM-V Visual | 28 transactions ✓ | - |
 | PP-Structure HTML + Visual | 13 transactions ✗ | HTML merged rows incorrectly |
 | PaddleOCR-VL Table | 28 transactions ✓ | Native table understanding |
 **Key insight**: PP-Structure's HTML output loses structure for complex tables. PaddleOCR-VL's native VLM approach maintains table integrity.
 ---
 ## Related Resources
 - [Ollama Documentation](https://ollama.ai/docs)
--- a/recipes/document.md
+++ b/recipes/document.md
@@ -1,129 +1,250 @@
-# Bank Statement Parsing with MiniCPM-V 4.5
+# Document Recognition with Hybrid OCR + Vision AI
-Recipe for extracting transactions from bank statement PDFs using vision-language AI.
+Recipe for extracting structured data from invoices and documents using a hybrid approach:
 PaddleOCR for text extraction + MiniCPM-V 4.5 for intelligent parsing.
-## Model
+## Architecture
- **Model**: MiniCPM-V 4.5 (8B parameters)
+```
- **Ollama Name**: `openbmb/minicpm-v4.5:q8_0`
+┌──────────────┐      ┌──────────────┐      ┌──────────────┐
- **Quantization**: Q8_0 (9.8GB VRAM)
+│  PDF/Image   │ ───> │  PaddleOCR   │ ───> │   Raw Text   │
- **Runtime**: Ollama on GPU
+└──────────────┘      └──────────────┘      └──────┬───────┘
                                                   │
                      ┌──────────────┐             │
                      │  MiniCPM-V   │ <───────────┘
                      │   4.5 VLM    │ <─── Image
                      └──────┬───────┘
                             │
                      ┌──────▼───────┐
                      │ Structured   │
                      │    JSON      │
                      └──────────────┘
 ```
 ## Why Hybrid?
 | Approach | Accuracy | Speed | Best For |
 |----------|----------|-------|----------|
 | VLM Only | 85-90% | Fast | Simple layouts |
 | OCR Only | N/A | Fast | Just text extraction |
 | **Hybrid** | **91%+** | Medium | Complex invoices |
 The hybrid approach provides OCR text as context to the VLM, improving accuracy on:
 - Small text and numbers
 - Low contrast documents
 - Dense tables
 ## Services
 | Service | Port | Purpose |
 |---------|------|---------|
 | PaddleOCR | 5000 | Text extraction |
 | Ollama (MiniCPM-V) | 11434 | Intelligent parsing |
 ## Running the Containers
 **Start both services:**
 ```bash
 # PaddleOCR (CPU is sufficient for OCR)
 docker run -d --name paddleocr -p 5000:5000 \
  code.foss.global/host.today/ht-docker-ai:paddleocr-cpu
 # MiniCPM-V 4.5 (GPU recommended)
 docker run -d --name minicpm --gpus all -p 11434:11434 \
  -v ollama-data:/root/.ollama \
  code.foss.global/host.today/ht-docker-ai:minicpm45v
 ```
 ## Image Conversion
-Convert PDF to PNG at 300 DPI for optimal OCR accuracy.
+Convert PDF to PNG at 200 DPI:
 ```bash
-convert -density 300 -quality 100 input.pdf \
+convert -density 200 -quality 90 input.pdf \
  -background white -alpha remove \
-  output-%d.png
+  page-%d.png
 ```
-**Parameters:**
+## Step 1: Extract OCR Text
 - `-density 300`: 300 DPI resolution (critical for accuracy)
 - `-quality 100`: Maximum quality
 - `-background white -alpha remove`: Remove transparency
 - `output-%d.png`: Outputs page-0.png, page-1.png, etc.
-**Dependencies:**
+```typescript
-```bash
+async function extractOcrText(imageBase64: string): Promise<string> {
-apt-get install imagemagick
+  const response = await fetch('http://localhost:5000/ocr', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ image: imageBase64 }),
  });
  const data = await response.json();
  if (data.success && data.results) {
    return data.results.map((r: { text: string }) => r.text).join('\n');
  }
  return '';
 }
 ```
-## Prompt
+## Step 2: Build Enhanced Prompt
-```
+```typescript
-You are a bank statement parser. Extract EVERY transaction from the table.
+function buildPrompt(ocrText: string): string {
  const base = `You are an invoice parser. Extract the following fields:
-Read the Amount column carefully:
+1. invoice_number: The invoice/receipt number
- "- 21,47 €" means DEBIT, output as: -21.47
+2. invoice_date: Date in YYYY-MM-DD format
- "+ 1.000,00 €" means CREDIT, output as: 1000.00
+3. vendor_name: Company that issued the invoice
- European format: comma = decimal point
+4. currency: EUR, USD, etc.
 5. net_amount: Amount before tax (if shown)
 6. vat_amount: Tax/VAT amount (0 if reverse charge)
 7. total_amount: Final amount due
-For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
+Return ONLY valid JSON:
 {"invoice_number":"XXX","invoice_date":"YYYY-MM-DD","vendor_name":"Company","currency":"EUR","net_amount":100.00,"vat_amount":19.00,"total_amount":119.00}`;
-Do not skip any rows. Return complete JSON array:
+  if (ocrText) {
    return `${base}
 OCR text extracted from the invoice:
 ---
 ${ocrText}
 ---
 Cross-reference the image with the OCR text above for accuracy.`;
  }
  return base;
 }
 ```
-## API Call
+## Step 3: Call Vision-Language Model
-```python
+```typescript
-import base64
+async function extractInvoice(images: string[], ocrText: string): Promise<Invoice> {
-import requests
+  const payload = {
    model: 'openbmb/minicpm-v4.5:q8_0',
    prompt: buildPrompt(ocrText),
    images,  // Base64 encoded
    stream: false,
    options: {
      num_predict: 2048,
      temperature: 0.1,
    },
  };
-# Load images
+  const response = await fetch('http://localhost:11434/api/generate', {
-with open('page-0.png', 'rb') as f:
+    method: 'POST',
-    page0 = base64.b64encode(f.read()).decode('utf-8')
+    headers: { 'Content-Type': 'application/json' },
-with open('page-1.png', 'rb') as f:
+    body: JSON.stringify(payload),
-    page1 = base64.b64encode(f.read()).decode('utf-8')
+  });
-payload = {
+  const result = await response.json();
-    "model": "openbmb/minicpm-v4.5:q8_0",
+  return JSON.parse(result.response);
-    "prompt": prompt,
+}
-    "images": [page0, page1],  # Multiple pages supported
+```
-    "stream": False,
+
-    "options": {
+## Consensus Voting
-        "num_predict": 16384,
+
-        "temperature": 0.1
+For production reliability, run multiple extraction passes and require consensus:
 ```typescript
 async function extractWithConsensus(images: string[], maxPasses: number = 5): Promise<Invoice> {
  const results: Map<string, { invoice: Invoice; count: number }> = new Map();
  // Optimization: Run Pass 1 (no OCR) parallel with OCR + Pass 2
  const [pass1Result, ocrText] = await Promise.all([
    extractInvoice(images, ''),
    extractOcrText(images[0]),
  ]);
  // Add Pass 1 result
  addResult(results, pass1Result);
  // Pass 2 with OCR context
  const pass2Result = await extractInvoice(images, ocrText);
  addResult(results, pass2Result);
  // Check for consensus (2 matching results)
  for (const [hash, data] of results) {
    if (data.count >= 2) {
      return data.invoice;  // Consensus reached!
    }
  }
  // Continue until consensus or max passes
  for (let pass = 3; pass <= maxPasses; pass++) {
    const result = await extractInvoice(images, ocrText);
    addResult(results, result);
    // Check consensus...
  }
  // Return most common result
  return getMostCommon(results);
 }
-response = requests.post(
+function hashInvoice(inv: Invoice): string {
-    'http://localhost:11434/api/generate',
+  return `${inv.invoice_number}|${inv.invoice_date}|${inv.total_amount.toFixed(2)}`;
-    json=payload,
+}
    timeout=600
 )
 result = response.json()['response']
 ```
 ## Output Format
 ```json
-[
+{
-  {"date":"2022-04-01","counterparty":"DIGITALOCEAN.COM","amount":-21.47},
+  "invoice_number": "INV-2024-001234",
-  {"date":"2022-04-01","counterparty":"DIGITALOCEAN.COM","amount":-58.06},
+  "invoice_date": "2024-08-15",
-  {"date":"2022-04-12","counterparty":"LOSSLESS GMBH","amount":1000.00}
+  "vendor_name": "Hetzner Online GmbH",
-]
+  "currency": "EUR",
  "net_amount": 167.52,
  "vat_amount": 31.83,
  "total_amount": 199.35
 }
 ```
 ## Running the Container
 **GPU (recommended):**
 ```bash
 docker run -d --gpus all -p 11434:11434 \
  -v ollama-data:/root/.ollama \
  -e MODEL_NAME="openbmb/minicpm-v4.5:q8_0" \
  ht-docker-ai:minicpm45v
 ```
 **CPU (slower):**
 ```bash
 docker run -d -p 11434:11434 \
  -v ollama-data:/root/.ollama \
  -e MODEL_NAME="openbmb/minicpm-v4.5:q4_0" \
  ht-docker-ai:minicpm45v-cpu
 ```
 ## Hardware Requirements
 | Quantization | VRAM/RAM | Speed |
 |--------------|----------|-------|
 | Q8_0 (GPU)   | 10GB     | Fast  |
 | Q4_0 (CPU)   | 8GB      | Slow  |
 ## Test Results
-| Statement | Pages | Transactions | Accuracy |
+Tested on 46 real invoices from various vendors:
-|-----------|-------|--------------|----------|
+
-| bunq-2022-04 | 2 | 26 | 100% |
+| Metric | Value |
-| bunq-2021-06 | 3 | 28 | 100% |
+|--------|-------|
 | **Accuracy** | 91.3% (42/46) |
 | **Avg Time** | 42.7s per invoice |
 | **Consensus Rate** | 85% in 2 passes |
 ### Per-Vendor Results
 | Vendor | Invoices | Accuracy |
 |--------|----------|----------|
 | Hetzner | 3 | 100% |
 | DigitalOcean | 4 | 100% |
 | Adobe | 3 | 100% |
 | Cloudflare | 1 | 100% |
 | Wasabi | 4 | 100% |
 | Figma | 3 | 100% |
 | Google Cloud | 1 | 100% |
 | MongoDB | 3 | 0% (date parsing) |
 ## Hardware Requirements
 | Component | Minimum | Recommended |
 |-----------|---------|-------------|
 | PaddleOCR (CPU) | 4GB RAM | 8GB RAM |
 | MiniCPM-V (GPU) | 10GB VRAM | 12GB VRAM |
 | MiniCPM-V (CPU) | 16GB RAM | 32GB RAM |
 ## Tips
-1. **DPI matters**: 150 DPI causes missed rows; 300 DPI is optimal
+1. **Use hybrid approach**: OCR text dramatically improves number/date accuracy
-2. **PNG over JPEG**: PNG preserves text clarity better
+2. **Consensus voting**: Run 2-5 passes to catch hallucinations
-3. **Remove alpha**: Some models struggle with transparency
+3. **200 DPI is optimal**: Higher doesn't help, lower loses detail
-4. **Multi-page**: Pass all pages in single request for context
+4. **PNG over JPEG**: Preserves text clarity
 5. **Temperature 0.1**: Low temperature for consistent output
-6. **European format**: Explicitly explain comma=decimal in prompt
+6. **Multi-page support**: Pass all pages in single request for context
 7. **Normalize for comparison**: Ignore case/whitespace when comparing invoice numbers
 ## Common Issues
 | Issue | Cause | Solution |
 |-------|-------|----------|
 | Wrong date | Multiple dates on invoice | Be specific in prompt about which date |
 | Wrong currency | Symbol vs code mismatch | OCR helps disambiguate |
 | Missing digits | Low resolution | Increase density to 300 DPI |
 | Hallucinated data | VLM uncertainty | Use consensus voting |
--- a/test/test.bankstatements.combined.ts
+++ b/test/test.bankstatements.combined.ts
@@ -0,0 +1,535 @@
 import { tap, expect } from '@git.zone/tstest/tapbundle';
 import * as fs from 'fs';
 import * as path from 'path';
 import { execSync } from 'child_process';
 import * as os from 'os';
 // Service URLs
 const OLLAMA_URL = 'http://localhost:11434';
 const PADDLEOCR_VL_URL = 'http://localhost:8000';
 // Models
 const MINICPM_MODEL = 'openbmb/minicpm-v4.5:q8_0';
 const PADDLEOCR_VL_MODEL = 'paddleocr-vl';
 // Prompt for MiniCPM-V visual extraction
 const MINICPM_EXTRACT_PROMPT = `/nothink
 You are a bank statement parser. Extract EVERY transaction from the table.
 Read the Amount column carefully:
 - "- 21,47 €" means DEBIT, output as: -21.47
 - "+ 1.000,00 €" means CREDIT, output as: 1000.00
 - European format: comma = decimal point
 For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
 Do not skip any rows. Return ONLY the JSON array, no explanation.`;
 // Prompt for PaddleOCR-VL table extraction
 const PADDLEOCR_VL_TABLE_PROMPT = `Table Recognition:`;
 // Post-processing prompt to convert PaddleOCR-VL output to JSON
 const PADDLEOCR_VL_CONVERT_PROMPT = `/nothink
 Convert the following bank statement table data to JSON.
 Read the Amount values carefully:
 - "- 21,47 €" means DEBIT, output as: -21.47
 - "+ 1.000,00 €" means CREDIT, output as: 1000.00
 - European format: comma = decimal point
 For each transaction output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
 Return ONLY the JSON array, no explanation.
 Table data:
 ---
 {TABLE_DATA}
 ---`;
 interface ITransaction {
  date: string;
  counterparty: string;
  amount: number;
 }
 /**
 * Convert PDF to PNG images using ImageMagick
 */
 function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');
  try {
    execSync(
      `convert -density 300 -quality 100 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
      { stdio: 'pipe' }
    );
    const files = fs.readdirSync(tempDir).filter((f: string) => f.endsWith('.png')).sort();
    const images: string[] = [];
    for (const file of files) {
      const imagePath = path.join(tempDir, file);
      const imageData = fs.readFileSync(imagePath);
      images.push(imageData.toString('base64'));
    }
    return images;
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
 }
 /**
 * Extract using MiniCPM-V via Ollama
 */
 async function extractWithMiniCPM(images: string[], passLabel: string): Promise<ITransaction[]> {
  const payload = {
    model: MINICPM_MODEL,
    prompt: MINICPM_EXTRACT_PROMPT,
    images,
    stream: true,
    options: {
      num_predict: 16384,
      temperature: 0.1,
    },
  };
  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });
  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }
  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('No response body');
  }
  const decoder = new TextDecoder();
  let fullText = '';
  let lineBuffer = '';
  console.log(`[${passLabel}] Extracting with MiniCPM-V...`);
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    const chunk = decoder.decode(value, { stream: true });
    const lines = chunk.split('\n').filter((l) => l.trim());
    for (const line of lines) {
      try {
        const json = JSON.parse(line);
        if (json.response) {
          fullText += json.response;
          lineBuffer += json.response;
          if (lineBuffer.includes('\n')) {
            const parts = lineBuffer.split('\n');
            for (let i = 0; i < parts.length - 1; i++) {
              console.log(parts[i]);
            }
            lineBuffer = parts[parts.length - 1];
          }
        }
      } catch {
        // Skip invalid JSON lines
      }
    }
  }
  if (lineBuffer) {
    console.log(lineBuffer);
  }
  console.log('');
  const startIdx = fullText.indexOf('[');
  const endIdx = fullText.lastIndexOf(']') + 1;
  if (startIdx < 0 || endIdx <= startIdx) {
    throw new Error('No JSON array found in response');
  }
  return JSON.parse(fullText.substring(startIdx, endIdx));
 }
 /**
 * Extract table using PaddleOCR-VL via OpenAI-compatible API
 */
 async function extractTableWithPaddleOCRVL(imageBase64: string): Promise<string> {
  const payload = {
    model: PADDLEOCR_VL_MODEL,
    messages: [
      {
        role: 'user',
        content: [
          {
            type: 'image_url',
            image_url: { url: `data:image/png;base64,${imageBase64}` },
          },
          {
            type: 'text',
            text: PADDLEOCR_VL_TABLE_PROMPT,
          },
        ],
      },
    ],
    temperature: 0.0,
    max_tokens: 8192,
  };
  const response = await fetch(`${PADDLEOCR_VL_URL}/v1/chat/completions`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });
  if (!response.ok) {
    const text = await response.text();
    throw new Error(`PaddleOCR-VL API error: ${response.status} - ${text}`);
  }
  const data = await response.json();
  return data.choices?.[0]?.message?.content || '';
 }
 /**
 * Convert PaddleOCR-VL table output to transactions using MiniCPM-V
 */
 async function convertTableToTransactions(
  tableData: string,
  passLabel: string
 ): Promise<ITransaction[]> {
  const prompt = PADDLEOCR_VL_CONVERT_PROMPT.replace('{TABLE_DATA}', tableData);
  const payload = {
    model: MINICPM_MODEL,
    prompt,
    stream: true,
    options: {
      num_predict: 16384,
      temperature: 0.1,
    },
  };
  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });
  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }
  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('No response body');
  }
  const decoder = new TextDecoder();
  let fullText = '';
  console.log(`[${passLabel}] Converting table data to JSON...`);
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    const chunk = decoder.decode(value, { stream: true });
    const lines = chunk.split('\n').filter((l) => l.trim());
    for (const line of lines) {
      try {
        const json = JSON.parse(line);
        if (json.response) {
          fullText += json.response;
        }
      } catch {
        // Skip invalid JSON lines
      }
    }
  }
  const startIdx = fullText.indexOf('[');
  const endIdx = fullText.lastIndexOf(']') + 1;
  if (startIdx < 0 || endIdx <= startIdx) {
    throw new Error('No JSON array found in response');
  }
  return JSON.parse(fullText.substring(startIdx, endIdx));
 }
 /**
 * Extract using PaddleOCR-VL (table recognition) + conversion
 */
 async function extractWithPaddleOCRVL(
  images: string[],
  passLabel: string
 ): Promise<ITransaction[]> {
  console.log(`[${passLabel}] Extracting tables with PaddleOCR-VL...`);
  // Extract table data from each page
  const tableDataParts: string[] = [];
  for (let i = 0; i < images.length; i++) {
    console.log(`[${passLabel}] Processing page ${i + 1}/${images.length}...`);
    const tableData = await extractTableWithPaddleOCRVL(images[i]);
    if (tableData.trim()) {
      tableDataParts.push(`--- Page ${i + 1} ---\n${tableData}`);
    }
  }
  const combinedTableData = tableDataParts.join('\n\n');
  console.log(`[${passLabel}] Got ${combinedTableData.length} chars of table data`);
  // Convert to transactions
  return convertTableToTransactions(combinedTableData, passLabel);
 }
 /**
 * Create a hash of transactions for comparison
 */
 function hashTransactions(transactions: ITransaction[]): string {
  return transactions
    .map((t) => `${t.date}|${t.amount.toFixed(2)}`)
    .sort()
    .join(';');
 }
 /**
 * Check if PaddleOCR-VL service is available
 */
 async function isPaddleOCRVLAvailable(): Promise<boolean> {
  try {
    const response = await fetch(`${PADDLEOCR_VL_URL}/health`, {
      method: 'GET',
      signal: AbortSignal.timeout(5000),
    });
    return response.ok;
  } catch {
    return false;
  }
 }
 /**
 * Extract with dual-VLM consensus
 * Strategy:
 *   Pass 1 = MiniCPM-V visual extraction
 *   Pass 2 = PaddleOCR-VL table recognition (if available)
 *   Pass 3+ = MiniCPM-V visual (fallback)
 */
 async function extractWithConsensus(
  images: string[],
  maxPasses: number = 5
 ): Promise<ITransaction[]> {
  const results: Array<{ transactions: ITransaction[]; hash: string }> = [];
  const hashCounts: Map<string, number> = new Map();
  const addResult = (transactions: ITransaction[], passLabel: string): number => {
    const hash = hashTransactions(transactions);
    results.push({ transactions, hash });
    hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1);
    console.log(
      `[${passLabel}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)`
    );
    return hashCounts.get(hash)!;
  };
  // Check if PaddleOCR-VL is available
  const paddleOCRVLAvailable = await isPaddleOCRVLAvailable();
  if (paddleOCRVLAvailable) {
    console.log('[Setup] PaddleOCR-VL service available - using dual-VLM consensus');
  } else {
    console.log('[Setup] PaddleOCR-VL not available - using MiniCPM-V only');
  }
  // Pass 1: MiniCPM-V visual extraction
  try {
    const pass1Result = await extractWithMiniCPM(images, 'Pass 1 MiniCPM-V');
    addResult(pass1Result, 'Pass 1 MiniCPM-V');
  } catch (err) {
    console.log(`[Pass 1] Error: ${err}`);
  }
  // Pass 2: PaddleOCR-VL table recognition (if available)
  if (paddleOCRVLAvailable) {
    try {
      const pass2Result = await extractWithPaddleOCRVL(images, 'Pass 2 PaddleOCR-VL');
      const count = addResult(pass2Result, 'Pass 2 PaddleOCR-VL');
      if (count >= 2) {
        console.log('[Consensus] MiniCPM-V and PaddleOCR-VL extractions match!');
        return pass2Result;
      }
    } catch (err) {
      console.log(`[Pass 2 PaddleOCR-VL] Error: ${err}`);
    }
  }
  // Pass 3+: Continue with MiniCPM-V visual passes
  const startPass = paddleOCRVLAvailable ? 3 : 2;
  for (let pass = startPass; pass <= maxPasses; pass++) {
    try {
      const transactions = await extractWithMiniCPM(images, `Pass ${pass} MiniCPM-V`);
      const count = addResult(transactions, `Pass ${pass} MiniCPM-V`);
      if (count >= 2) {
        console.log(`[Consensus] Reached after ${pass} passes`);
        return transactions;
      }
      console.log(`[Pass ${pass}] No consensus yet, trying again...`);
    } catch (err) {
      console.log(`[Pass ${pass}] Error: ${err}`);
    }
  }
  // No consensus reached - return the most common result
  let bestHash = '';
  let bestCount = 0;
  for (const [hash, count] of hashCounts) {
    if (count > bestCount) {
      bestCount = count;
      bestHash = hash;
    }
  }
  if (!bestHash) {
    throw new Error('No valid results obtained');
  }
  const best = results.find((r) => r.hash === bestHash)!;
  console.log(`[No consensus] Using most common result (${bestCount}/${maxPasses} passes)`);
  return best.transactions;
 }
 /**
 * Compare extracted transactions against expected
 */
 function compareTransactions(
  extracted: ITransaction[],
  expected: ITransaction[]
 ): { matches: number; total: number; errors: string[] } {
  const errors: string[] = [];
  let matches = 0;
  for (let i = 0; i < expected.length; i++) {
    const exp = expected[i];
    const ext = extracted[i];
    if (!ext) {
      errors.push(`Missing transaction ${i}: ${exp.date} ${exp.counterparty}`);
      continue;
    }
    const dateMatch = ext.date === exp.date;
    const amountMatch = Math.abs(ext.amount - exp.amount) < 0.01;
    if (dateMatch && amountMatch) {
      matches++;
    } else {
      errors.push(
        `Mismatch at ${i}: expected ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`
      );
    }
  }
  if (extracted.length > expected.length) {
    errors.push(`Extra transactions: ${extracted.length - expected.length}`);
  }
  return { matches, total: expected.length, errors };
 }
 /**
 * Find all test cases (PDF + JSON pairs) in .nogit/
 */
 function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const testDir = path.join(process.cwd(), '.nogit');
  if (!fs.existsSync(testDir)) {
    return [];
  }
  const files = fs.readdirSync(testDir);
  const pdfFiles = files.filter((f: string) => f.endsWith('.pdf'));
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];
  for (const pdf of pdfFiles) {
    const baseName = pdf.replace('.pdf', '');
    const jsonFile = `${baseName}.json`;
    if (files.includes(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }
  return testCases;
 }
 // Tests
 tap.test('should connect to Ollama API', async () => {
  const response = await fetch(`${OLLAMA_URL}/api/tags`);
  expect(response.ok).toBeTrue();
  const data = await response.json();
  expect(data.models).toBeArray();
 });
 tap.test('should have MiniCPM-V 4.5 model loaded', async () => {
  const response = await fetch(`${OLLAMA_URL}/api/tags`);
  const data = await response.json();
  const modelNames = data.models.map((m: { name: string }) => m.name);
  expect(modelNames.some((name: string) => name.includes('minicpm-v4.5'))).toBeTrue();
 });
 tap.test('should check PaddleOCR-VL availability', async () => {
  const available = await isPaddleOCRVLAvailable();
  console.log(`PaddleOCR-VL available: ${available}`);
  // This test passes regardless - PaddleOCR-VL is optional
  expect(true).toBeTrue();
 });
 // Dynamic test for each PDF/JSON pair
 const testCases = findTestCases();
 for (const testCase of testCases) {
  tap.test(`should extract transactions from ${testCase.name}`, async () => {
    // Load expected transactions
    const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
    console.log(`\n=== ${testCase.name} ===`);
    console.log(`Expected: ${expected.length} transactions`);
    // Convert PDF to images
    console.log('Converting PDF to images...');
    const images = convertPdfToImages(testCase.pdfPath);
    console.log(`Converted: ${images.length} pages\n`);
    // Extract with dual-VLM consensus
    const extracted = await extractWithConsensus(images);
    console.log(`\nFinal: ${extracted.length} transactions`);
    // Compare results
    const result = compareTransactions(extracted, expected);
    console.log(`Accuracy: ${result.matches}/${result.total}`);
    if (result.errors.length > 0) {
      console.log('Errors:');
      result.errors.forEach((e) => console.log(`  - ${e}`));
    }
    // Assert high accuracy
    const accuracy = result.matches / result.total;
    expect(accuracy).toBeGreaterThan(0.95);
    expect(extracted.length).toEqual(expected.length);
  });
 }
 export default tap.start();
--- a/test/test.invoices.combined.ts
+++ b/test/test.invoices.combined.ts
@@ -6,7 +6,7 @@ import * as os from 'os';
 const OLLAMA_URL = 'http://localhost:11434';
 const MODEL = 'openbmb/minicpm-v4.5:q8_0';
-const PADDLEOCR_URL = 'http://localhost:5000';
+const PADDLEOCR_VL_URL = 'http://localhost:8000';
 interface IInvoice {
  invoice_number: string;
@@ -19,24 +19,33 @@ interface IInvoice {
 }
 /**
- * Extract OCR text from an image using PaddleOCR
+ * Extract OCR text from an image using PaddleOCR-VL (OpenAI-compatible API)
 */
 async function extractOcrText(imageBase64: string): Promise<string> {
  try {
-    const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
+    const response = await fetch(`${PADDLEOCR_VL_URL}/v1/chat/completions`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
-      body: JSON.stringify({ image: imageBase64 }),
+      body: JSON.stringify({
        model: 'paddleocr-vl',
        messages: [{
          role: 'user',
          content: [
            { type: 'image_url', image_url: { url: `data:image/png;base64,${imageBase64}` } },
            { type: 'text', text: 'OCR:' }
          ]
        }],
        temperature: 0.0,
        max_tokens: 4096
      }),
    });
    if (!response.ok) return '';
    const data = await response.json();
-    if (data.success && data.results) {
+    return data.choices?.[0]?.message?.content || '';
      return data.results.map((r: { text: string }) => r.text).join('\n');
    }
  } catch {
-    // PaddleOCR unavailable
+    // PaddleOCR-VL unavailable
  }
  return '';
 }
@@ -45,7 +54,8 @@ async function extractOcrText(imageBase64: string): Promise<string> {
 * Build prompt with optional OCR text
 */
 function buildPrompt(ocrText: string): string {
-  const base = `You are an invoice parser. Extract the following fields from this invoice:
+  const base = `/nothink
 You are an invoice parser. Extract the following fields from this invoice:
 1. invoice_number: The invoice/receipt number
 2. invoice_date: Date in YYYY-MM-DD format
@@ -62,11 +72,17 @@ If a field is not visible, use null for strings or 0 for numbers.
 No explanation, just the JSON object.`;
  if (ocrText) {
    // Limit OCR text to prevent context overflow
    const maxOcrLength = 4000;
    const truncatedOcr = ocrText.length > maxOcrLength
      ? ocrText.substring(0, maxOcrLength) + '\n... (truncated)'
      : ocrText;
    return `${base}
-OCR text extracted from the invoice:
+OCR text extracted from the invoice (use for reference):
 ---
-${ocrText}
+${truncatedOcr}
 ---
 Cross-reference the image with the OCR text above for accuracy.`;
--- a/test/test.node.ts
+++ b/test/test.node.ts
@@ -1,305 +0,0 @@
 import { tap, expect } from '@git.zone/tstest/tapbundle';
 import * as fs from 'fs';
 import * as path from 'path';
 import { execSync } from 'child_process';
 import * as os from 'os';
 const OLLAMA_URL = 'http://localhost:11434';
 const MODEL = 'openbmb/minicpm-v4.5:q8_0';
 const EXTRACT_PROMPT = `You are a bank statement parser. Extract EVERY transaction from the table.
 Read the Amount column carefully:
 - "- 21,47 €" means DEBIT, output as: -21.47
 - "+ 1.000,00 €" means CREDIT, output as: 1000.00
 - European format: comma = decimal point
 For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
 Do not skip any rows. Return ONLY the JSON array, no explanation.`;
 interface ITransaction {
  date: string;
  counterparty: string;
  amount: number;
 }
 /**
 * Convert PDF to PNG images using ImageMagick
 */
 function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');
  try {
    execSync(
      `convert -density 300 -quality 100 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
      { stdio: 'pipe' }
    );
    const files = fs.readdirSync(tempDir).filter((f) => f.endsWith('.png')).sort();
    const images: string[] = [];
    for (const file of files) {
      const imagePath = path.join(tempDir, file);
      const imageData = fs.readFileSync(imagePath);
      images.push(imageData.toString('base64'));
    }
    return images;
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
 }
 /**
 * Single extraction pass
 */
 async function extractOnce(images: string[], passNum: number): Promise<ITransaction[]> {
  const payload = {
    model: MODEL,
    prompt: EXTRACT_PROMPT,
    images,
    stream: true,
    options: {
      num_predict: 16384,
      temperature: 0.1,
    },
  };
  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });
  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }
  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('No response body');
  }
  const decoder = new TextDecoder();
  let fullText = '';
  let lineBuffer = '';
  console.log(`[Pass ${passNum}] Extracting...`);
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    const chunk = decoder.decode(value, { stream: true });
    const lines = chunk.split('\n').filter((l) => l.trim());
    for (const line of lines) {
      try {
        const json = JSON.parse(line);
        if (json.response) {
          fullText += json.response;
          lineBuffer += json.response;
          // Print complete lines
          if (lineBuffer.includes('\n')) {
            const parts = lineBuffer.split('\n');
            for (let i = 0; i < parts.length - 1; i++) {
              console.log(parts[i]);
            }
            lineBuffer = parts[parts.length - 1];
          }
        }
      } catch {
        // Skip invalid JSON lines
      }
    }
  }
  if (lineBuffer) {
    console.log(lineBuffer);
  }
  console.log('');
  const startIdx = fullText.indexOf('[');
  const endIdx = fullText.lastIndexOf(']') + 1;
  if (startIdx < 0 || endIdx <= startIdx) {
    throw new Error('No JSON array found in response');
  }
  return JSON.parse(fullText.substring(startIdx, endIdx));
 }
 /**
 * Create a hash of transactions for comparison
 */
 function hashTransactions(transactions: ITransaction[]): string {
  return transactions
    .map((t) => `${t.date}|${t.amount.toFixed(2)}`)
    .sort()
    .join(';');
 }
 /**
 * Extract with majority voting - run until 2 passes match
 */
 async function extractWithConsensus(images: string[], maxPasses: number = 5): Promise<ITransaction[]> {
  const results: Array<{ transactions: ITransaction[]; hash: string }> = [];
  const hashCounts: Map<string, number> = new Map();
  for (let pass = 1; pass <= maxPasses; pass++) {
    const transactions = await extractOnce(images, pass);
    const hash = hashTransactions(transactions);
    results.push({ transactions, hash });
    hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1);
    console.log(`[Pass ${pass}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)`);
    // Check if we have consensus (2+ matching)
    const count = hashCounts.get(hash)!;
    if (count >= 2) {
      console.log(`[Consensus] Reached after ${pass} passes (${count} matching results)`);
      return transactions;
    }
    // After 2 passes, if no match yet, continue
    if (pass >= 2) {
      console.log(`[Pass ${pass}] No consensus yet, trying again...`);
    }
  }
  // No consensus reached - return the most common result
  let bestHash = '';
  let bestCount = 0;
  for (const [hash, count] of hashCounts) {
    if (count > bestCount) {
      bestCount = count;
      bestHash = hash;
    }
  }
  const best = results.find((r) => r.hash === bestHash)!;
  console.log(`[No consensus] Using most common result (${bestCount}/${maxPasses} passes)`);
  return best.transactions;
 }
 /**
 * Compare extracted transactions against expected
 */
 function compareTransactions(
  extracted: ITransaction[],
  expected: ITransaction[]
 ): { matches: number; total: number; errors: string[] } {
  const errors: string[] = [];
  let matches = 0;
  for (let i = 0; i < expected.length; i++) {
    const exp = expected[i];
    const ext = extracted[i];
    if (!ext) {
      errors.push(`Missing transaction ${i}: ${exp.date} ${exp.counterparty}`);
      continue;
    }
    const dateMatch = ext.date === exp.date;
    const amountMatch = Math.abs(ext.amount - exp.amount) < 0.01;
    if (dateMatch && amountMatch) {
      matches++;
    } else {
      errors.push(
        `Mismatch at ${i}: expected ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`
      );
    }
  }
  if (extracted.length > expected.length) {
    errors.push(`Extra transactions: ${extracted.length - expected.length}`);
  }
  return { matches, total: expected.length, errors };
 }
 /**
 * Find all test cases (PDF + JSON pairs) in .nogit/
 */
 function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const testDir = path.join(process.cwd(), '.nogit');
  if (!fs.existsSync(testDir)) {
    return [];
  }
  const files = fs.readdirSync(testDir);
  const pdfFiles = files.filter((f) => f.endsWith('.pdf'));
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];
  for (const pdf of pdfFiles) {
    const baseName = pdf.replace('.pdf', '');
    const jsonFile = `${baseName}.json`;
    if (files.includes(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }
  return testCases;
 }
 // Tests
 tap.test('should connect to Ollama API', async () => {
  const response = await fetch(`${OLLAMA_URL}/api/tags`);
  expect(response.ok).toBeTrue();
  const data = await response.json();
  expect(data.models).toBeArray();
 });
 tap.test('should have MiniCPM-V 4.5 model loaded', async () => {
  const response = await fetch(`${OLLAMA_URL}/api/tags`);
  const data = await response.json();
  const modelNames = data.models.map((m: { name: string }) => m.name);
  expect(modelNames.some((name: string) => name.includes('minicpm-v4.5'))).toBeTrue();
 });
 // Dynamic test for each PDF/JSON pair
 const testCases = findTestCases();
 for (const testCase of testCases) {
  tap.test(`should extract transactions from ${testCase.name}`, async () => {
    // Load expected transactions
    const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
    console.log(`\n=== ${testCase.name} ===`);
    console.log(`Expected: ${expected.length} transactions`);
    // Convert PDF to images
    console.log('Converting PDF to images...');
    const images = convertPdfToImages(testCase.pdfPath);
    console.log(`Converted: ${images.length} pages\n`);
    // Extract with consensus voting
    const extracted = await extractWithConsensus(images);
    console.log(`\nFinal: ${extracted.length} transactions`);
    // Compare results
    const result = compareTransactions(extracted, expected);
    console.log(`Accuracy: ${result.matches}/${result.total}`);
    if (result.errors.length > 0) {
      console.log('Errors:');
      result.errors.forEach((e) => console.log(`  - ${e}`));
    }
    // Assert high accuracy
    const accuracy = result.matches / result.total;
    expect(accuracy).toBeGreaterThan(0.95);
    expect(extracted.length).toEqual(expected.length);
  });
 }
 export default tap.start();
--- a/test/test.paddleocr.ts
+++ b/test/test.paddleocr.ts
@@ -1,258 +0,0 @@
 import { tap, expect } from '@git.zone/tstest/tapbundle';
 import * as fs from 'fs';
 import * as path from 'path';
 import { execSync } from 'child_process';
 import * as os from 'os';
 const PADDLEOCR_URL = 'http://localhost:5000';
 interface IOCRResult {
  text: string;
  confidence: number;
  box: number[][];
 }
 interface IOCRResponse {
  success: boolean;
  results: IOCRResult[];
  error?: string;
 }
 interface IHealthResponse {
  status: string;
  model: string;
  language: string;
  gpu_enabled: boolean;
 }
 /**
 * Convert PDF first page to PNG using ImageMagick
 */
 function convertPdfToImage(pdfPath: string): string {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPath = path.join(tempDir, 'page.png');
  try {
    execSync(
      `convert -density 200 -quality 90 "${pdfPath}[0]" -background white -alpha remove "${outputPath}"`,
      { stdio: 'pipe' }
    );
    const imageData = fs.readFileSync(outputPath);
    return imageData.toString('base64');
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
 }
 /**
 * Create a simple test image with text using ImageMagick
 */
 function createTestImage(text: string): string {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'test-image-'));
  const outputPath = path.join(tempDir, 'test.png');
  try {
    execSync(
      `convert -size 400x100 xc:white -font DejaVu-Sans -pointsize 24 -fill black -gravity center -annotate 0 "${text}" "${outputPath}"`,
      { stdio: 'pipe' }
    );
    const imageData = fs.readFileSync(outputPath);
    return imageData.toString('base64');
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
 }
 // Health check test
 tap.test('should respond to health check', async () => {
  const response = await fetch(`${PADDLEOCR_URL}/health`);
  expect(response.ok).toBeTrue();
  const data: IHealthResponse = await response.json();
  expect(data.status).toEqual('healthy');
  expect(data.model).toEqual('PP-OCRv4');
  expect(data.language).toBeTypeofString();
  expect(data.gpu_enabled).toBeTypeofBoolean();
  console.log(`PaddleOCR Status: ${data.status}`);
  console.log(`  Model: ${data.model}`);
  console.log(`  Language: ${data.language}`);
  console.log(`  GPU Enabled: ${data.gpu_enabled}`);
 });
 // Base64 OCR test
 tap.test('should perform OCR on base64 image', async () => {
  // Create a test image with known text
  const testText = 'Hello World 12345';
  console.log(`Creating test image with text: "${testText}"`);
  const imageBase64 = createTestImage(testText);
  const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ image: imageBase64 }),
  });
  expect(response.ok).toBeTrue();
  const data: IOCRResponse = await response.json();
  expect(data.success).toBeTrue();
  expect(data.results).toBeArray();
  const extractedText = data.results.map((r) => r.text).join(' ');
  console.log(`Extracted text: "${extractedText}"`);
  // Check that we got some text back
  expect(data.results.length).toBeGreaterThan(0);
  // Check that at least some of the expected text was found
  const normalizedExtracted = extractedText.toLowerCase().replace(/\s+/g, '');
  const normalizedExpected = testText.toLowerCase().replace(/\s+/g, '');
  const hasPartialMatch =
    normalizedExtracted.includes('hello') ||
    normalizedExtracted.includes('world') ||
    normalizedExtracted.includes('12345');
  expect(hasPartialMatch).toBeTrue();
 });
 // File upload OCR test
 tap.test('should perform OCR via file upload', async () => {
  const testText = 'Invoice Number 98765';
  console.log(`Creating test image with text: "${testText}"`);
  const imageBase64 = createTestImage(testText);
  const imageBuffer = Buffer.from(imageBase64, 'base64');
  const formData = new FormData();
  const blob = new Blob([imageBuffer], { type: 'image/png' });
  formData.append('img', blob, 'test.png');
  const response = await fetch(`${PADDLEOCR_URL}/ocr/upload`, {
    method: 'POST',
    body: formData,
  });
  expect(response.ok).toBeTrue();
  const data: IOCRResponse = await response.json();
  expect(data.success).toBeTrue();
  expect(data.results).toBeArray();
  const extractedText = data.results.map((r) => r.text).join(' ');
  console.log(`Extracted text: "${extractedText}"`);
  // Check that we got some text back
  expect(data.results.length).toBeGreaterThan(0);
 });
 // OCR result structure test
 tap.test('should return proper OCR result structure', async () => {
  const testText = 'Test 123';
  const imageBase64 = createTestImage(testText);
  const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ image: imageBase64 }),
  });
  const data: IOCRResponse = await response.json();
  if (data.results.length > 0) {
    const result = data.results[0];
    // Check result has required fields
    expect(result.text).toBeTypeofString();
    expect(result.confidence).toBeTypeofNumber();
    expect(result.box).toBeArray();
    // Check bounding box structure (4 points, each with x,y)
    expect(result.box.length).toEqual(4);
    for (const point of result.box) {
      expect(point.length).toEqual(2);
      expect(point[0]).toBeTypeofNumber();
      expect(point[1]).toBeTypeofNumber();
    }
    // Confidence should be between 0 and 1
    expect(result.confidence).toBeGreaterThan(0);
    expect(result.confidence).toBeLessThanOrEqual(1);
    console.log(`Result structure valid:`);
    console.log(`  Text: "${result.text}"`);
    console.log(`  Confidence: ${(result.confidence * 100).toFixed(1)}%`);
    console.log(`  Box: ${JSON.stringify(result.box)}`);
  }
 });
 // Test with actual invoice if available
 const invoiceDir = path.join(process.cwd(), '.nogit/invoices');
 if (fs.existsSync(invoiceDir)) {
  const pdfFiles = fs.readdirSync(invoiceDir).filter((f) => f.endsWith('.pdf'));
  if (pdfFiles.length > 0) {
    const testPdf = pdfFiles[0];
    tap.test(`should extract text from invoice: ${testPdf}`, async () => {
      const pdfPath = path.join(invoiceDir, testPdf);
      console.log(`Converting ${testPdf} to image...`);
      const imageBase64 = convertPdfToImage(pdfPath);
      console.log(`Image size: ${(imageBase64.length / 1024).toFixed(1)} KB`);
      const startTime = Date.now();
      const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ image: imageBase64 }),
      });
      const endTime = Date.now();
      const elapsedMs = endTime - startTime;
      expect(response.ok).toBeTrue();
      const data: IOCRResponse = await response.json();
      expect(data.success).toBeTrue();
      console.log(`OCR completed in ${(elapsedMs / 1000).toFixed(2)}s`);
      console.log(`Found ${data.results.length} text regions`);
      // Print first 10 results
      const preview = data.results.slice(0, 10);
      console.log(`\nFirst ${preview.length} results:`);
      for (const result of preview) {
        console.log(`  [${(result.confidence * 100).toFixed(0)}%] ${result.text}`);
      }
      if (data.results.length > 10) {
        console.log(`  ... and ${data.results.length - 10} more`);
      }
      // Should find text in an invoice
      expect(data.results.length).toBeGreaterThan(5);
    });
  }
 }
 // Error handling test
 tap.test('should handle invalid base64 gracefully', async () => {
  const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ image: 'not-valid-base64!!!' }),
  });
  const data: IOCRResponse = await response.json();
  // Should return success: false with error message
  expect(data.success).toBeFalse();
  expect(data.error).toBeTypeofString();
  console.log(`Error handling works: ${data.error}`);
 });
 export default tap.start();
Author	SHA1	Message	Date
Juergen Kunz	addae20cbd	v1.5.0 Some checks failed Docker (tags) / security (push) Successful in 31s Details Docker (tags) / test (push) Failing after 40s Details Docker (tags) / release (push) Has been skipped Details Docker (tags) / metadata (push) Has been skipped Details	2026-01-17 16:57:26 +00:00
Juergen Kunz	0482c35b69	feat(paddleocr-vl): add PaddleOCR-VL GPU Dockerfile, pin vllm, update CPU image deps, and improve entrypoint and tests	2026-01-17 16:57:26 +00:00
Juergen Kunz	15ac1fcf67	update	2026-01-16 16:21:44 +00:00
Juergen Kunz	3c5cf578a5	v1.4.0 Some checks failed Docker (tags) / security (push) Successful in 28s Details Docker (tags) / test (push) Failing after 54s Details Docker (tags) / release (push) Has been skipped Details Docker (tags) / metadata (push) Has been skipped Details	2026-01-16 14:24:37 +00:00
Juergen Kunz	82358b2d5d	feat(invoices): add hybrid OCR + vision invoice/document parsing with PaddleOCR, consensus voting, and prompt/test refactors	2026-01-16 14:24:37 +00:00