Compare commits

3 Commits

| Author | SHA1 | Date |
|---|---|---|
| | addae20cbd | |
| | 0482c35b69 | |
| | 15ac1fcf67 | |
Dockerfile_paddleocr (deleted, 49 lines)

@@ -1,49 +0,0 @@
-# PaddleOCR GPU Variant
-# OCR processing with NVIDIA GPU support using PaddlePaddle
-FROM paddlepaddle/paddle:2.6.2-gpu-cuda11.7-cudnn8.4-trt8.4
-
-LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
-LABEL description="PaddleOCR PP-OCRv4 - GPU optimized"
-LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
-
-# Environment configuration
-ENV OCR_LANGUAGE="en"
-ENV SERVER_PORT="5000"
-ENV SERVER_HOST="0.0.0.0"
-ENV PYTHONUNBUFFERED=1
-
-# Set working directory
-WORKDIR /app
-
-# Install system dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    libgl1-mesa-glx \
-    libglib2.0-0 \
-    curl \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install Python dependencies (using stable paddleocr 2.x)
-RUN pip install --no-cache-dir \
-    paddleocr==2.8.1 \
-    fastapi \
-    uvicorn[standard] \
-    python-multipart \
-    opencv-python-headless \
-    pillow
-
-# Copy server files
-COPY image_support_files/paddleocr_server.py /app/paddleocr_server.py
-COPY image_support_files/paddleocr-entrypoint.sh /usr/local/bin/paddleocr-entrypoint.sh
-RUN chmod +x /usr/local/bin/paddleocr-entrypoint.sh
-
-# Note: OCR models will be downloaded on first run
-# This ensures compatibility across different GPU architectures
-
-# Expose API port
-EXPOSE 5000
-
-# Health check
-HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
-    CMD curl -f http://localhost:5000/health || exit 1
-
-ENTRYPOINT ["/usr/local/bin/paddleocr-entrypoint.sh"]

Dockerfile_paddleocr_cpu (deleted, 53 lines)

@@ -1,53 +0,0 @@
-# PaddleOCR CPU Variant
-# OCR processing optimized for CPU-only inference
-FROM python:3.10-slim-bookworm
-
-LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
-LABEL description="PaddleOCR PP-OCRv4 - CPU optimized"
-LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
-
-# Environment configuration for CPU-only mode
-ENV OCR_LANGUAGE="en"
-ENV SERVER_PORT="5000"
-ENV SERVER_HOST="0.0.0.0"
-ENV PYTHONUNBUFFERED=1
-# Disable GPU usage for CPU-only variant
-ENV CUDA_VISIBLE_DEVICES="-1"
-
-# Set working directory
-WORKDIR /app
-
-# Install system dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    libgl1-mesa-glx \
-    libglib2.0-0 \
-    libgomp1 \
-    curl \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install Python dependencies (CPU version of PaddlePaddle - using stable 2.x versions)
-RUN pip install --no-cache-dir \
-    paddlepaddle==2.6.2 \
-    paddleocr==2.8.1 \
-    fastapi \
-    uvicorn[standard] \
-    python-multipart \
-    opencv-python-headless \
-    pillow
-
-# Copy server files
-COPY image_support_files/paddleocr_server.py /app/paddleocr_server.py
-COPY image_support_files/paddleocr-entrypoint.sh /usr/local/bin/paddleocr-entrypoint.sh
-RUN chmod +x /usr/local/bin/paddleocr-entrypoint.sh
-
-# Note: OCR models will be downloaded on first run
-# This avoids build-time segfaults with certain CPU architectures
-
-# Expose API port
-EXPOSE 5000
-
-# Health check (longer start-period for CPU variant)
-HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
-    CMD curl -f http://localhost:5000/health || exit 1
-
-ENTRYPOINT ["/usr/local/bin/paddleocr-entrypoint.sh"]

Dockerfile_paddleocr_vl (new file, 70 lines)

@@ -0,0 +1,70 @@
+# PaddleOCR-VL GPU Variant
+# Vision-Language Model for document parsing using vLLM
+FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
+
+LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
+LABEL description="PaddleOCR-VL 0.9B - Vision-Language Model for document parsing"
+LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
+
+# Environment configuration
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
+ENV HF_HOME=/root/.cache/huggingface
+ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
+
+# Set working directory
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3.11 \
+    python3.11-venv \
+    python3.11-dev \
+    python3-pip \
+    git \
+    curl \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/* \
+    && update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
+
+# Create and activate virtual environment
+RUN python -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# Install PyTorch with CUDA support
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir \
+    torch==2.5.1 \
+    torchvision \
+    --index-url https://download.pytorch.org/whl/cu124
+
+# Install vLLM 0.11.1 (first stable release with PaddleOCR-VL support)
+RUN pip install --no-cache-dir \
+    vllm==0.11.1 \
+    --extra-index-url https://download.pytorch.org/whl/cu124
+
+# Install additional dependencies
+RUN pip install --no-cache-dir \
+    transformers \
+    accelerate \
+    safetensors \
+    pillow \
+    fastapi \
+    uvicorn[standard] \
+    python-multipart \
+    openai \
+    httpx
+
+# Copy entrypoint script
+COPY image_support_files/paddleocr-vl-entrypoint.sh /usr/local/bin/paddleocr-vl-entrypoint.sh
+RUN chmod +x /usr/local/bin/paddleocr-vl-entrypoint.sh
+
+# Expose vLLM API port
+EXPOSE 8000
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=300s --retries=3 \
+    CMD curl -f http://localhost:8000/health || exit 1
+
+ENTRYPOINT ["/usr/local/bin/paddleocr-vl-entrypoint.sh"]

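For local verification, a run command along these lines should work once the image is built; the registry path and tag are assumptions taken from the build script and package metadata, not something this diff pins down:

```bash
# Minimal sketch: run the vLLM-based GPU image. Registry path and tag are
# assumed from build_images output / package.json, not pinned by this diff.
docker run --gpus all -p 8000:8000 \
  -v "$HOME/.cache/huggingface:/root/.cache/huggingface" \
  -e MAX_MODEL_LEN=8192 \
  -e GPU_MEMORY_UTILIZATION=0.9 \
  code.foss.global/host.today/ht-docker-ai:paddleocr-vl

# The healthcheck endpoint reports readiness once vLLM has loaded the model
curl -f http://localhost:8000/health
```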
Dockerfile_paddleocr_vl_cpu (new file, 57 lines)

@@ -0,0 +1,57 @@
+# PaddleOCR-VL CPU Variant
+# Vision-Language Model for document parsing using transformers (slower, no GPU required)
+FROM python:3.11-slim-bookworm
+
+LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
+LABEL description="PaddleOCR-VL 0.9B CPU - Vision-Language Model for document parsing"
+LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
+
+# Environment configuration
+ENV PYTHONUNBUFFERED=1
+ENV HF_HOME=/root/.cache/huggingface
+ENV CUDA_VISIBLE_DEVICES=""
+ENV SERVER_PORT=8000
+ENV SERVER_HOST=0.0.0.0
+
+# Set working directory
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    libgomp1 \
+    curl \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir \
+    torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cpu && \
+    pip install --no-cache-dir \
+    transformers \
+    accelerate \
+    safetensors \
+    pillow \
+    fastapi \
+    uvicorn[standard] \
+    python-multipart \
+    httpx \
+    protobuf \
+    sentencepiece \
+    einops
+
+# Copy server files
+COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py
+COPY image_support_files/paddleocr-vl-cpu-entrypoint.sh /usr/local/bin/paddleocr-vl-cpu-entrypoint.sh
+RUN chmod +x /usr/local/bin/paddleocr-vl-cpu-entrypoint.sh
+
+# Expose API port
+EXPOSE 8000
+
+# Health check (longer start-period for CPU + model download)
+HEALTHCHECK --interval=30s --timeout=10s --start-period=600s --retries=3 \
+    CMD curl -f http://localhost:8000/health || exit 1
+
+ENTRYPOINT ["/usr/local/bin/paddleocr-vl-cpu-entrypoint.sh"]

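The CPU variant needs no GPU flags; what matters in practice is keeping the Hugging Face cache mounted, since the model is downloaded on first run (which is why the healthcheck allows a 600s start period). A hedged sketch, with the tag again assumed from the build script:

```bash
# Minimal sketch: run the CPU variant. Mount the HF cache so the first-run
# model download happens only once across container restarts.
docker run -p 8000:8000 \
  -v "$HOME/.cache/huggingface:/root/.cache/huggingface" \
  code.foss.global/host.today/ht-docker-ai:paddleocr-vl-cpu
```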
Dockerfile_paddleocr_vl_gpu (new file, 71 lines)

@@ -0,0 +1,71 @@
+# PaddleOCR-VL GPU Variant (Transformers-based, not vLLM)
+# Vision-Language Model for document parsing using transformers with CUDA
+FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04
+
+LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
+LABEL description="PaddleOCR-VL 0.9B GPU - Vision-Language Model using transformers"
+LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
+
+# Environment configuration
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
+ENV HF_HOME=/root/.cache/huggingface
+ENV SERVER_PORT=8000
+ENV SERVER_HOST=0.0.0.0
+
+# Set working directory
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3.11 \
+    python3.11-venv \
+    python3.11-dev \
+    python3-pip \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    libgomp1 \
+    curl \
+    git \
+    && rm -rf /var/lib/apt/lists/* \
+    && update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
+
+# Create and activate virtual environment
+RUN python -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# Install PyTorch with CUDA support
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir \
+    torch==2.5.1 \
+    torchvision \
+    --index-url https://download.pytorch.org/whl/cu124
+
+# Install Python dependencies (transformers-based, not vLLM)
+RUN pip install --no-cache-dir \
+    transformers \
+    accelerate \
+    safetensors \
+    pillow \
+    fastapi \
+    uvicorn[standard] \
+    python-multipart \
+    httpx \
+    protobuf \
+    sentencepiece \
+    einops
+
+# Copy server files (same as CPU variant - it auto-detects CUDA)
+COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py
+COPY image_support_files/paddleocr-vl-cpu-entrypoint.sh /usr/local/bin/paddleocr-vl-entrypoint.sh
+RUN chmod +x /usr/local/bin/paddleocr-vl-entrypoint.sh
+
+# Expose API port
+EXPOSE 8000
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=300s --retries=3 \
+    CMD curl -f http://localhost:8000/health || exit 1
+
+ENTRYPOINT ["/usr/local/bin/paddleocr-vl-entrypoint.sh"]

@@ -29,19 +29,19 @@ docker build \
   -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v-cpu \
   .
 
-# Build PaddleOCR GPU variant
-echo -e "${GREEN}Building PaddleOCR GPU variant...${NC}"
+# Build PaddleOCR-VL GPU variant (vLLM)
+echo -e "${GREEN}Building PaddleOCR-VL GPU variant (vLLM)...${NC}"
 docker build \
-  -f Dockerfile_paddleocr \
-  -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr \
-  -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-gpu \
+  -f Dockerfile_paddleocr_vl \
+  -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl \
+  -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-gpu \
   .
 
-# Build PaddleOCR CPU variant
-echo -e "${GREEN}Building PaddleOCR CPU variant...${NC}"
+# Build PaddleOCR-VL CPU variant
+echo -e "${GREEN}Building PaddleOCR-VL CPU variant...${NC}"
 docker build \
-  -f Dockerfile_paddleocr_cpu \
-  -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-cpu \
+  -f Dockerfile_paddleocr_vl_cpu \
+  -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-cpu \
   .
 
 echo -e "${GREEN}All images built successfully!${NC}"
@@ -52,7 +52,7 @@ echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v (GPU)"
 echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v-cpu (CPU)"
 echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:latest (GPU)"
 echo ""
-echo "  PaddleOCR:"
-echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr (GPU)"
-echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-gpu (GPU)"
-echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-cpu (CPU)"
+echo "  PaddleOCR-VL (Vision-Language Model):"
+echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl (GPU/vLLM)"
+echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-gpu (GPU/vLLM)"
+echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-cpu (CPU)"

changelog.md (10 changed lines)

@@ -1,5 +1,15 @@
 # Changelog
 
+## 2026-01-17 - 1.5.0 - feat(paddleocr-vl)
+add PaddleOCR-VL GPU Dockerfile, pin vllm, update CPU image deps, and improve entrypoint and tests
+
+- Add a new GPU Dockerfile for PaddleOCR-VL (transformers-based) with CUDA support, healthcheck, and entrypoint.
+- Pin vllm to 0.11.1 in Dockerfile_paddleocr_vl to use the first stable release with PaddleOCR-VL support.
+- Update CPU image: add torchvision==0.20.1 and extra Python deps (protobuf, sentencepiece, einops) required by the transformers-based server.
+- Rewrite paddleocr-vl-entrypoint.sh to build vllm args array, add MAX_MODEL_LEN and ENFORCE_EAGER env vars, include --limit-mm-per-prompt and optional --enforce-eager, and switch to exec vllm with constructed args.
+- Update tests to use the OpenAI-compatible PaddleOCR-VL chat completions API (/v1/chat/completions) with image+text message payload and model 'paddleocr-vl'.
+- Add @types/node to package.json dependencies and tidy devDependencies ordering.
+
 ## 2026-01-16 - 1.4.0 - feat(invoices)
 add hybrid OCR + vision invoice/document parsing with PaddleOCR, consensus voting, and prompt/test refactors
 

image_support_files/paddleocr-entrypoint.sh (deleted, 25 lines)

@@ -1,25 +0,0 @@
-#!/bin/bash
-set -e
-
-# Configuration from environment
-OCR_LANGUAGE="${OCR_LANGUAGE:-en}"
-SERVER_PORT="${SERVER_PORT:-5000}"
-SERVER_HOST="${SERVER_HOST:-0.0.0.0}"
-
-echo "Starting PaddleOCR Server..."
-echo "  Language: ${OCR_LANGUAGE}"
-echo "  Host: ${SERVER_HOST}"
-echo "  Port: ${SERVER_PORT}"
-
-# Check GPU availability
-if [ "${CUDA_VISIBLE_DEVICES}" = "-1" ]; then
-    echo "  GPU: Disabled (CPU mode)"
-else
-    echo "  GPU: Enabled"
-fi
-
-# Start the FastAPI server with uvicorn
-exec python -m uvicorn paddleocr_server:app \
-    --host "${SERVER_HOST}" \
-    --port "${SERVER_PORT}" \
-    --workers 1

image_support_files/paddleocr-vl-cpu-entrypoint.sh (new file, 19 lines)

@@ -0,0 +1,19 @@
+#!/bin/bash
+set -e
+
+echo "==================================="
+echo "PaddleOCR-VL Server (CPU)"
+echo "==================================="
+
+HOST="${SERVER_HOST:-0.0.0.0}"
+PORT="${SERVER_PORT:-8000}"
+
+echo "Host: ${HOST}"
+echo "Port: ${PORT}"
+echo "Device: CPU (no GPU)"
+echo ""
+
+echo "Starting PaddleOCR-VL CPU server..."
+echo "==================================="
+
+exec python /app/paddleocr_vl_server.py

image_support_files/paddleocr-vl-entrypoint.sh (new file, 59 lines)

@@ -0,0 +1,59 @@
+#!/bin/bash
+set -e
+
+echo "==================================="
+echo "PaddleOCR-VL Server"
+echo "==================================="
+
+# Configuration
+MODEL_NAME="${MODEL_NAME:-PaddlePaddle/PaddleOCR-VL}"
+HOST="${HOST:-0.0.0.0}"
+PORT="${PORT:-8000}"
+MAX_BATCHED_TOKENS="${MAX_BATCHED_TOKENS:-16384}"
+GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.9}"
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}"
+ENFORCE_EAGER="${ENFORCE_EAGER:-false}"
+
+echo "Model: ${MODEL_NAME}"
+echo "Host: ${HOST}"
+echo "Port: ${PORT}"
+echo "Max batched tokens: ${MAX_BATCHED_TOKENS}"
+echo "GPU memory utilization: ${GPU_MEMORY_UTILIZATION}"
+echo "Max model length: ${MAX_MODEL_LEN}"
+echo "Enforce eager: ${ENFORCE_EAGER}"
+echo ""
+
+# Check GPU availability
+if command -v nvidia-smi &> /dev/null; then
+    echo "GPU Information:"
+    nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv
+    echo ""
+else
+    echo "WARNING: nvidia-smi not found. GPU may not be available."
+fi
+
+echo "Starting vLLM server..."
+echo "==================================="
+
+# Build vLLM command
+VLLM_ARGS=(
+    serve "${MODEL_NAME}"
+    --trust-remote-code
+    --host "${HOST}"
+    --port "${PORT}"
+    --max-num-batched-tokens "${MAX_BATCHED_TOKENS}"
+    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"
+    --max-model-len "${MAX_MODEL_LEN}"
+    --no-enable-prefix-caching
+    --mm-processor-cache-gb 0
+    --served-model-name "paddleocr-vl"
+    --limit-mm-per-prompt '{"image": 1}'
+)
+
+# Add enforce-eager if enabled (disables CUDA graphs, saves memory)
+if [ "${ENFORCE_EAGER}" = "true" ]; then
+    VLLM_ARGS+=(--enforce-eager)
+fi
+
+# Start vLLM server with PaddleOCR-VL
+exec vllm "${VLLM_ARGS[@]}"

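With the server up, the entrypoint's `--served-model-name "paddleocr-vl"` means requests look like standard OpenAI chat completions. A hedged sketch of a manual request (`page.png` is a placeholder input file):

```bash
# Send one page image to the vLLM OpenAI-compatible endpoint; page.png is a
# placeholder. -w0 is GNU base64 (omit the flag on macOS). The model name
# matches --served-model-name in the entrypoint above.
IMG_B64=$(base64 -w0 page.png)
curl -s http://localhost:8000/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d @- <<EOF
{
  "model": "paddleocr-vl",
  "messages": [{
    "role": "user",
    "content": [
      {"type": "image_url", "image_url": {"url": "data:image/png;base64,${IMG_B64}"}},
      {"type": "text", "text": "Table Recognition:"}
    ]
  }],
  "temperature": 0.0,
  "max_tokens": 8192
}
EOF
```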
image_support_files/paddleocr_server.py (deleted, 253 lines)

@@ -1,253 +0,0 @@
-#!/usr/bin/env python3
-"""
-PaddleOCR FastAPI Server
-Provides REST API for OCR operations using PaddleOCR
-"""
-
-import os
-import io
-import base64
-import logging
-from typing import Optional, List, Any
-
-from fastapi import FastAPI, File, UploadFile, Form, HTTPException
-from fastapi.responses import JSONResponse
-from pydantic import BaseModel
-import numpy as np
-from PIL import Image
-from paddleocr import PaddleOCR
-
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-)
-logger = logging.getLogger(__name__)
-
-# Environment configuration
-OCR_LANGUAGE = os.environ.get('OCR_LANGUAGE', 'en')
-# GPU is controlled via CUDA_VISIBLE_DEVICES environment variable
-USE_GPU = os.environ.get('CUDA_VISIBLE_DEVICES', '') != '-1'
-
-# Initialize FastAPI app
-app = FastAPI(
-    title="PaddleOCR Server",
-    description="REST API for OCR operations using PaddleOCR PP-OCRv4",
-    version="1.0.0"
-)
-
-# Global OCR instance
-ocr_instance: Optional[PaddleOCR] = None
-
-
-class OCRRequest(BaseModel):
-    """Request model for base64 image OCR"""
-    image: str
-    language: Optional[str] = None
-
-
-class BoundingBox(BaseModel):
-    """Bounding box for detected text"""
-    points: List[List[float]]
-
-
-class OCRResult(BaseModel):
-    """Single OCR detection result"""
-    text: str
-    confidence: float
-    box: List[List[float]]
-
-
-class OCRResponse(BaseModel):
-    """OCR response model"""
-    success: bool
-    results: List[OCRResult]
-    error: Optional[str] = None
-
-
-class HealthResponse(BaseModel):
-    """Health check response"""
-    status: str
-    model: str
-    language: str
-    gpu_enabled: bool
-
-
-def get_ocr(lang: Optional[str] = None) -> PaddleOCR:
-    """Get or initialize the OCR instance"""
-    global ocr_instance
-    use_lang = lang or OCR_LANGUAGE
-
-    # Return cached instance if same language
-    if ocr_instance is not None and lang is None:
-        return ocr_instance
-
-    logger.info(f"Initializing PaddleOCR with language={use_lang}, use_gpu={USE_GPU}")
-    new_ocr = PaddleOCR(
-        use_angle_cls=True,
-        lang=use_lang,
-        use_gpu=USE_GPU,
-        show_log=False
-    )
-
-    # Cache the default language instance
-    if lang is None:
-        ocr_instance = new_ocr
-
-    logger.info("PaddleOCR initialized successfully")
-    return new_ocr
-
-
-def decode_base64_image(base64_string: str) -> np.ndarray:
-    """Decode base64 string to numpy array"""
-    # Remove data URL prefix if present
-    if ',' in base64_string:
-        base64_string = base64_string.split(',')[1]
-
-    image_data = base64.b64decode(base64_string)
-    image = Image.open(io.BytesIO(image_data))
-
-    # Convert to RGB if necessary
-    if image.mode != 'RGB':
-        image = image.convert('RGB')
-
-    return np.array(image)
-
-
-def process_ocr_result(result: Any) -> List[OCRResult]:
-    """Process PaddleOCR result into structured format"""
-    results = []
-
-    if result is None or len(result) == 0:
-        return results
-
-    # PaddleOCR returns list of results per image
-    # Each result is a list of [box, (text, confidence)]
-    for line in result[0] if result[0] else []:
-        if line is None:
-            continue
-
-        box = line[0]  # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
-        text_info = line[1]  # (text, confidence)
-
-        results.append(OCRResult(
-            text=text_info[0],
-            confidence=float(text_info[1]),
-            box=[[float(p[0]), float(p[1])] for p in box]
-        ))
-
-    return results
-
-
-@app.on_event("startup")
-async def startup_event():
-    """Pre-warm the OCR model on startup"""
-    logger.info("Pre-warming OCR model...")
-    try:
-        ocr = get_ocr()
-        # Create a small test image to warm up the model
-        test_image = np.zeros((100, 100, 3), dtype=np.uint8)
-        test_image.fill(255)  # White image
-        ocr.ocr(test_image, cls=True)
-        logger.info("OCR model pre-warmed successfully")
-    except Exception as e:
-        logger.error(f"Failed to pre-warm OCR model: {e}")
-
-
-@app.get("/health", response_model=HealthResponse)
-async def health_check():
-    """Health check endpoint"""
-    try:
-        # Ensure OCR is initialized
-        get_ocr()
-        return HealthResponse(
-            status="healthy",
-            model="PP-OCRv4",
-            language=OCR_LANGUAGE,
-            gpu_enabled=USE_GPU
-        )
-    except Exception as e:
-        logger.error(f"Health check failed: {e}")
-        raise HTTPException(status_code=503, detail=str(e))
-
-
-@app.post("/ocr", response_model=OCRResponse)
-async def ocr_base64(request: OCRRequest):
-    """
-    Perform OCR on a base64-encoded image
-
-    Args:
-        request: OCRRequest with base64 image and optional language
-
-    Returns:
-        OCRResponse with detected text, confidence scores, and bounding boxes
-    """
-    try:
-        # Decode image
-        image = decode_base64_image(request.image)
-
-        # Get OCR instance (use request language if provided)
-        if request.language and request.language != OCR_LANGUAGE:
-            ocr = get_ocr(request.language)
-        else:
-            ocr = get_ocr()
-
-        result = ocr.ocr(image, cls=True)
-
-        # Process results
-        results = process_ocr_result(result)
-
-        return OCRResponse(success=True, results=results)
-
-    except Exception as e:
-        logger.error(f"OCR processing failed: {e}")
-        return OCRResponse(success=False, results=[], error=str(e))
-
-
-@app.post("/ocr/upload", response_model=OCRResponse)
-async def ocr_upload(
-    img: UploadFile = File(...),
-    language: Optional[str] = Form(None)
-):
-    """
-    Perform OCR on an uploaded image file
-
-    Args:
-        img: Uploaded image file
-        language: Optional language code (default: env OCR_LANGUAGE)
-
-    Returns:
-        OCRResponse with detected text, confidence scores, and bounding boxes
-    """
-    try:
-        # Read image
-        contents = await img.read()
-        image = Image.open(io.BytesIO(contents))
-
-        # Convert to RGB if necessary
-        if image.mode != 'RGB':
-            image = image.convert('RGB')
-
-        image_array = np.array(image)
-
-        # Get OCR instance
-        if language and language != OCR_LANGUAGE:
-            ocr = get_ocr(language)
-        else:
-            ocr = get_ocr()
-
-        result = ocr.ocr(image_array, cls=True)
-
-        # Process results
-        results = process_ocr_result(result)
-
-        return OCRResponse(success=True, results=results)
-
-    except Exception as e:
-        logger.error(f"OCR processing failed: {e}")
-        return OCRResponse(success=False, results=[], error=str(e))
-
-
-if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=5000)

image_support_files/paddleocr_vl_server.py (new file, 371 lines)

@@ -0,0 +1,371 @@
+#!/usr/bin/env python3
+"""
+PaddleOCR-VL FastAPI Server (CPU variant)
+Provides OpenAI-compatible REST API for document parsing using PaddleOCR-VL
+"""
+
+import os
+import io
+import base64
+import logging
+import time
+from typing import Optional, List, Any, Dict, Union
+
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
+import torch
+from PIL import Image
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+# Environment configuration
+SERVER_HOST = os.environ.get('SERVER_HOST', '0.0.0.0')
+SERVER_PORT = int(os.environ.get('SERVER_PORT', '8000'))
+MODEL_NAME = os.environ.get('MODEL_NAME', 'PaddlePaddle/PaddleOCR-VL')
+
+# Device configuration
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+logger.info(f"Using device: {DEVICE}")
+
+# Task prompts for PaddleOCR-VL
+TASK_PROMPTS = {
+    "ocr": "OCR:",
+    "table": "Table Recognition:",
+    "formula": "Formula Recognition:",
+    "chart": "Chart Recognition:",
+}
+
+# Initialize FastAPI app
+app = FastAPI(
+    title="PaddleOCR-VL Server",
+    description="OpenAI-compatible REST API for document parsing using PaddleOCR-VL",
+    version="1.0.0"
+)
+
+# Global model instances
+model = None
+processor = None
+
+
+# Request/Response models (OpenAI-compatible)
+class ImageUrl(BaseModel):
+    url: str
+
+
+class ContentItem(BaseModel):
+    type: str
+    text: Optional[str] = None
+    image_url: Optional[ImageUrl] = None
+
+
+class Message(BaseModel):
+    role: str
+    content: Union[str, List[ContentItem]]
+
+
+class ChatCompletionRequest(BaseModel):
+    model: str = "paddleocr-vl"
+    messages: List[Message]
+    temperature: Optional[float] = 0.0
+    max_tokens: Optional[int] = 4096
+
+
+class Choice(BaseModel):
+    index: int
+    message: Message
+    finish_reason: str
+
+
+class Usage(BaseModel):
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+
+
+class ChatCompletionResponse(BaseModel):
+    id: str
+    object: str = "chat.completion"
+    created: int
+    model: str
+    choices: List[Choice]
+    usage: Usage
+
+
+class HealthResponse(BaseModel):
+    status: str
+    model: str
+    device: str
+
+
+def load_model():
+    """Load the PaddleOCR-VL model and processor"""
+    global model, processor
+
+    if model is not None:
+        return
+
+    logger.info(f"Loading PaddleOCR-VL model: {MODEL_NAME}")
+
+    from transformers import AutoModelForCausalLM, AutoProcessor
+
+    # Load processor
+    processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
+
+    # Load model with appropriate settings for CPU/GPU
+    if DEVICE == "cuda":
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_NAME,
+            trust_remote_code=True,
+            torch_dtype=torch.bfloat16,
+        ).to(DEVICE).eval()
+    else:
+        # CPU mode - use float32 for compatibility
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_NAME,
+            trust_remote_code=True,
+            torch_dtype=torch.float32,
+            low_cpu_mem_usage=True,
+        ).eval()
+
+    logger.info("PaddleOCR-VL model loaded successfully")
+
+
+def decode_image(image_source: str) -> Image.Image:
+    """Decode image from URL or base64"""
+    if image_source.startswith("data:"):
+        # Base64 encoded image
+        header, data = image_source.split(",", 1)
+        image_data = base64.b64decode(data)
+        return Image.open(io.BytesIO(image_data)).convert("RGB")
+    elif image_source.startswith("http://") or image_source.startswith("https://"):
+        # URL - fetch image
+        import httpx
+        response = httpx.get(image_source, timeout=30.0)
+        response.raise_for_status()
+        return Image.open(io.BytesIO(response.content)).convert("RGB")
+    else:
+        # Assume it's a file path or raw base64
+        try:
+            image_data = base64.b64decode(image_source)
+            return Image.open(io.BytesIO(image_data)).convert("RGB")
+        except:
+            # Try as file path
+            return Image.open(image_source).convert("RGB")
+
+
+def extract_image_and_text(content: Union[str, List[ContentItem]]) -> tuple:
+    """Extract image and text prompt from message content"""
+    if isinstance(content, str):
+        return None, content
+
+    image = None
+    text = ""
+
+    for item in content:
+        if item.type == "image_url" and item.image_url:
+            image = decode_image(item.image_url.url)
+        elif item.type == "text" and item.text:
+            text = item.text
+
+    return image, text
+
+
+def generate_response(image: Image.Image, prompt: str, max_tokens: int = 4096) -> str:
+    """Generate response using PaddleOCR-VL"""
+    load_model()
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": prompt},
+            ]
+        }
+    ]
+
+    inputs = processor.apply_chat_template(
+        messages,
+        tokenize=True,
+        add_generation_prompt=True,
+        return_dict=True,
+        return_tensors="pt"
+    )
+
+    if DEVICE == "cuda":
+        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+
+    with torch.inference_mode():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=max_tokens,
+            do_sample=False,
+            use_cache=True
+        )
+
+    response = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+
+    # Extract the assistant's response (after the prompt)
+    if "assistant" in response.lower():
+        parts = response.split("assistant")
+        if len(parts) > 1:
+            response = parts[-1].strip()
+
+    return response
+
+
+@app.on_event("startup")
+async def startup_event():
+    """Pre-load the model on startup"""
+    logger.info("Pre-loading PaddleOCR-VL model...")
+    try:
+        load_model()
+        logger.info("Model pre-loaded successfully")
+    except Exception as e:
+        logger.error(f"Failed to pre-load model: {e}")
+        # Don't fail startup - model will be loaded on first request
+
+
+@app.get("/health", response_model=HealthResponse)
+async def health_check():
+    """Health check endpoint"""
+    return HealthResponse(
+        status="healthy" if model is not None else "loading",
+        model=MODEL_NAME,
+        device=DEVICE
+    )
+
+
+@app.get("/v1/models")
+async def list_models():
+    """List available models (OpenAI-compatible)"""
+    return {
+        "object": "list",
+        "data": [
+            {
+                "id": "paddleocr-vl",
+                "object": "model",
+                "created": int(time.time()),
+                "owned_by": "paddlepaddle"
+            }
+        ]
+    }
+
+
+@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
+async def chat_completions(request: ChatCompletionRequest):
+    """
+    OpenAI-compatible chat completions endpoint for PaddleOCR-VL
+
+    Supports tasks:
+    - "OCR:" - Text recognition
+    - "Table Recognition:" - Table extraction
+    - "Formula Recognition:" - Formula extraction
+    - "Chart Recognition:" - Chart extraction
+    """
+    try:
+        # Get the last user message
+        user_message = None
+        for msg in reversed(request.messages):
+            if msg.role == "user":
+                user_message = msg
+                break
+
+        if not user_message:
+            raise HTTPException(status_code=400, detail="No user message found")
+
+        # Extract image and prompt
+        image, prompt = extract_image_and_text(user_message.content)
+
+        if image is None:
+            raise HTTPException(status_code=400, detail="No image provided in message")
+
+        # Default to OCR if no specific prompt
+        if not prompt or prompt.strip() == "":
+            prompt = "OCR:"
+
+        logger.info(f"Processing request with prompt: {prompt[:50]}...")
+
+        # Generate response
+        start_time = time.time()
+        response_text = generate_response(image, prompt, request.max_tokens or 4096)
+        elapsed = time.time() - start_time
+
+        logger.info(f"Generated response in {elapsed:.2f}s ({len(response_text)} chars)")
+
+        # Build OpenAI-compatible response
+        return ChatCompletionResponse(
+            id=f"chatcmpl-{int(time.time()*1000)}",
+            created=int(time.time()),
+            model=request.model,
+            choices=[
+                Choice(
+                    index=0,
+                    message=Message(role="assistant", content=response_text),
+                    finish_reason="stop"
+                )
+            ],
+            usage=Usage(
+                prompt_tokens=100,  # Approximate
+                completion_tokens=len(response_text) // 4,
+                total_tokens=100 + len(response_text) // 4
+            )
+        )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error processing request: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+# Legacy endpoint for compatibility with old PaddleOCR API
+class LegacyOCRRequest(BaseModel):
+    image: str
+    task: Optional[str] = "ocr"
+
+
+class LegacyOCRResponse(BaseModel):
+    success: bool
+    result: str
+    task: str
+    error: Optional[str] = None
+
+
+@app.post("/ocr", response_model=LegacyOCRResponse)
+async def legacy_ocr(request: LegacyOCRRequest):
+    """
+    Legacy OCR endpoint for backwards compatibility
+
+    Tasks: ocr, table, formula, chart
+    """
+    try:
+        image = decode_image(request.image)
+        prompt = TASK_PROMPTS.get(request.task, TASK_PROMPTS["ocr"])
+
+        result = generate_response(image, prompt)
+
+        return LegacyOCRResponse(
+            success=True,
+            result=result,
+            task=request.task
+        )
+    except Exception as e:
+        logger.error(f"Legacy OCR error: {e}")
+        return LegacyOCRResponse(
+            success=False,
+            result="",
+            task=request.task,
+            error=str(e)
+        )
+
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT)

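The legacy `/ocr` route above accepts raw base64 plus a task keyword, so a minimal manual check could look like this (`invoice.png` is a placeholder; raw base64 without a `data:` prefix is handled by `decode_image`'s fallback branch):

```bash
# Minimal check of the legacy /ocr route; invoice.png is a placeholder input.
IMG_B64=$(base64 -w0 invoice.png)
curl -s http://localhost:8000/ocr \
  -H 'Content-Type: application/json' \
  -d "{\"image\": \"${IMG_B64}\", \"task\": \"table\"}"
```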
package.json (11 changed lines)

@@ -1,6 +1,6 @@
 {
   "name": "@host.today/ht-docker-ai",
-  "version": "1.4.0",
+  "version": "1.5.0",
   "type": "module",
   "private": false,
   "description": "Docker images for AI vision-language models including MiniCPM-V 4.5",
@@ -13,8 +13,8 @@
     "test": "tstest test/ --verbose"
   },
   "devDependencies": {
-    "@git.zone/tstest": "^1.0.90",
-    "@git.zone/tsrun": "^1.3.3"
+    "@git.zone/tsrun": "^1.3.3",
+    "@git.zone/tstest": "^1.0.90"
   },
   "repository": {
     "type": "git",
@@ -28,5 +28,8 @@
     "minicpm",
     "ollama",
     "multimodal"
-  ]
+  ],
+  "dependencies": {
+    "@types/node": "^25.0.9"
+  }
 }

pnpm-lock.yaml (4 changed lines, generated)

@@ -7,6 +7,10 @@ settings:
 importers:
 
   .:
+    dependencies:
+      '@types/node':
+        specifier: ^25.0.9
+        version: 25.0.9
     devDependencies:
       '@git.zone/tsrun':
         specifier: ^1.3.3

readme.hints.md (117 changed lines)

@@ -77,56 +77,73 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
 CPU variant has longer `start-period` (120s) due to slower startup.
 
-## PaddleOCR
+## PaddleOCR-VL (Recommended)
 
 ### Overview
 
-PaddleOCR is a standalone OCR service using PaddlePaddle's PP-OCRv4 model. It provides:
+PaddleOCR-VL is a 0.9B parameter Vision-Language Model specifically optimized for document parsing. It replaces the older PP-Structure approach with native VLM understanding.
 
-- Text detection and recognition
-- Multi-language support
-- FastAPI REST API
-- GPU and CPU variants
+**Key advantages over PP-Structure:**
+- Native table understanding (no HTML parsing needed)
+- 109 language support
+- Better handling of complex multi-row tables
+- Structured Markdown/JSON output
 
 ### Docker Images
 
 | Tag | Description |
 |-----|-------------|
-| `paddleocr` | GPU variant (default) |
-| `paddleocr-gpu` | GPU variant (alias) |
-| `paddleocr-cpu` | CPU-only variant |
+| `paddleocr-vl` | GPU variant using vLLM (recommended) |
+| `paddleocr-vl-cpu` | CPU variant using transformers |
 
-### API Endpoints
+### API Endpoints (OpenAI-compatible)
 
 | Endpoint | Method | Description |
 |----------|--------|-------------|
 | `/health` | GET | Health check with model info |
-| `/ocr` | POST | OCR with base64 image (JSON body) |
-| `/ocr/upload` | POST | OCR with file upload (multipart form) |
+| `/v1/models` | GET | List available models |
+| `/v1/chat/completions` | POST | OpenAI-compatible chat completions |
+| `/ocr` | POST | Legacy OCR endpoint |
 
 ### Request/Response Format
 
-**POST /ocr (JSON)**
+**POST /v1/chat/completions (OpenAI-compatible)**
 ```json
 {
-  "image": "<base64-encoded-image>",
-  "language": "en"  // optional
+  "model": "paddleocr-vl",
+  "messages": [
+    {
+      "role": "user",
+      "content": [
+        {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
+        {"type": "text", "text": "Table Recognition:"}
+      ]
+    }
+  ],
+  "temperature": 0.0,
+  "max_tokens": 8192
 }
 ```
 
-**POST /ocr/upload (multipart)**
-- `img`: image file
-- `language`: optional language code
+**Task Prompts:**
+- `"OCR:"` - Text recognition
+- `"Table Recognition:"` - Table extraction (returns markdown)
+- `"Formula Recognition:"` - Formula extraction
+- `"Chart Recognition:"` - Chart extraction
 
 **Response**
 ```json
 {
-  "success": true,
-  "results": [
+  "id": "chatcmpl-...",
+  "object": "chat.completion",
+  "choices": [
     {
-      "text": "Invoice #12345",
-      "confidence": 0.98,
-      "box": [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
+      "index": 0,
+      "message": {
+        "role": "assistant",
+        "content": "| Date | Description | Amount |\n|---|---|---|\n| 2021-06-01 | GITLAB INC | -119.96 |"
+      },
+      "finish_reason": "stop"
     }
   ]
 }
 ```
@@ -136,19 +153,16 @@ PaddleOCR is a standalone OCR service using PaddlePaddle's PP-OCRv4 model. It pr
 
 | Variable | Default | Description |
 |----------|---------|-------------|
-| `OCR_LANGUAGE` | `en` | Default language for OCR |
-| `SERVER_PORT` | `5000` | Server port |
-| `SERVER_HOST` | `0.0.0.0` | Server host |
-| `CUDA_VISIBLE_DEVICES` | (auto) | Set to `-1` for CPU-only |
+| `MODEL_NAME` | `PaddlePaddle/PaddleOCR-VL` | Model to load |
+| `HOST` | `0.0.0.0` | Server host |
+| `PORT` | `8000` | Server port |
+| `MAX_BATCHED_TOKENS` | `16384` | vLLM max batch tokens |
+| `GPU_MEMORY_UTILIZATION` | `0.9` | GPU memory usage (0-1) |
 
 ### Performance
 
-- **GPU**: ~1-3 seconds per page
-- **CPU**: ~10-30 seconds per page
+- **GPU (vLLM)**: ~2-5 seconds per page
+- **CPU**: ~30-60 seconds per page
 
-### Supported Languages
-
-Common language codes: `en` (English), `ch` (Chinese), `de` (German), `fr` (French), `es` (Spanish), `ja` (Japanese), `ko` (Korean)
-
 ---
 
@@ -193,6 +207,43 @@ npmci docker build
 npmci docker push code.foss.global
 ```
 
+## Multi-Pass Extraction Strategy
+
+The bank statement extraction uses a dual-VLM consensus approach:
+
+### Architecture: Dual-VLM Consensus
+
+| VLM | Model | Purpose |
+|-----|-------|---------|
+| **MiniCPM-V 4.5** | 8B params | Primary visual extraction |
+| **PaddleOCR-VL** | 0.9B params | Table-specialized extraction |
+
+### Extraction Strategy
+
+1. **Pass 1**: MiniCPM-V visual extraction (images → JSON)
+2. **Pass 2**: PaddleOCR-VL table recognition (images → markdown → JSON)
+3. **Consensus**: If Pass 1 == Pass 2 → Done (fast path)
+4. **Pass 3+**: MiniCPM-V visual if no consensus
+
+### Why Dual-VLM Works
+
+- **Different architectures**: Two independent models cross-check each other
+- **Specialized strengths**: PaddleOCR-VL optimized for tables, MiniCPM-V for general vision
+- **No structure loss**: Both VLMs see the original images directly
+- **Fast consensus**: Most documents complete in 2 passes when VLMs agree
+
+### Comparison vs Old PP-Structure Approach
+
+| Approach | Bank Statement Result | Issue |
+|----------|----------------------|-------|
+| MiniCPM-V Visual | 28 transactions ✓ | - |
+| PP-Structure HTML + Visual | 13 transactions ✗ | HTML merged rows incorrectly |
+| PaddleOCR-VL Table | 28 transactions ✓ | Native table understanding |
+
+**Key insight**: PP-Structure's HTML output loses structure for complex tables. PaddleOCR-VL's native VLM approach maintains table integrity.
+
+---
+
 ## Related Resources
 
 - [Ollama Documentation](https://ollama.ai/docs)

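The readme's multi-pass strategy boils down to "accept when two independent extractions agree". A minimal shell sketch of that consensus step, assuming hypothetical wrapper scripts around the two APIs and `jq` for key-order normalization:

```bash
# Sketch of the consensus step from the strategy above. The two extract_*
# scripts are hypothetical wrappers around the MiniCPM-V and PaddleOCR-VL
# endpoints; jq -S sorts keys so string comparison is meaningful.
pass1=$(./extract_minicpm.sh statement.pdf | jq -S .)
pass2=$(./extract_paddleocr_vl.sh statement.pdf | jq -S .)

if [ "$pass1" = "$pass2" ]; then
  # Fast path: the two independent models agree
  printf '%s\n' "$pass1" > transactions.json
else
  # No consensus: fall back to another MiniCPM-V pass as tie-breaker
  ./extract_minicpm.sh statement.pdf | jq -S . > pass3.json
fi
```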
@@ -4,12 +4,16 @@ import * as path from 'path';
|
|||||||
import { execSync } from 'child_process';
|
import { execSync } from 'child_process';
|
||||||
import * as os from 'os';
|
import * as os from 'os';
|
||||||
|
|
||||||
|
// Service URLs
|
||||||
const OLLAMA_URL = 'http://localhost:11434';
|
const OLLAMA_URL = 'http://localhost:11434';
|
||||||
const MODEL = 'openbmb/minicpm-v4.5:q8_0';
|
const PADDLEOCR_VL_URL = 'http://localhost:8000';
|
||||||
const PADDLEOCR_URL = 'http://localhost:5000';
|
|
||||||
|
|
||||||
// Prompt for visual extraction (with images)
|
// Models
|
||||||
const VISUAL_EXTRACT_PROMPT = `/nothink
|
const MINICPM_MODEL = 'openbmb/minicpm-v4.5:q8_0';
|
||||||
|
const PADDLEOCR_VL_MODEL = 'paddleocr-vl';
|
||||||
|
|
||||||
|
// Prompt for MiniCPM-V visual extraction
|
||||||
|
const MINICPM_EXTRACT_PROMPT = `/nothink
|
||||||
You are a bank statement parser. Extract EVERY transaction from the table.
|
You are a bank statement parser. Extract EVERY transaction from the table.
|
||||||
|
|
||||||
Read the Amount column carefully:
|
Read the Amount column carefully:
|
||||||
@@ -21,9 +25,12 @@ For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
|
|||||||
|
|
||||||
Do not skip any rows. Return ONLY the JSON array, no explanation.`;
|
Do not skip any rows. Return ONLY the JSON array, no explanation.`;
|
||||||
|
|
||||||
// Prompt for OCR-only extraction (no images)
|
// Prompt for PaddleOCR-VL table extraction
|
||||||
const OCR_EXTRACT_PROMPT = `/nothink
|
const PADDLEOCR_VL_TABLE_PROMPT = `Table Recognition:`;
|
||||||
You are a bank statement parser. Extract EVERY transaction from the OCR text below.
|
|
||||||
|
// Post-processing prompt to convert PaddleOCR-VL output to JSON
|
||||||
|
const PADDLEOCR_VL_CONVERT_PROMPT = `/nothink
|
||||||
|
Convert the following bank statement table data to JSON.
|
||||||
|
|
||||||
Read the Amount values carefully:
|
Read the Amount values carefully:
|
||||||
- "- 21,47 €" means DEBIT, output as: -21.47
|
- "- 21,47 €" means DEBIT, output as: -21.47
|
||||||
@@ -32,48 +39,12 @@ Read the Amount values carefully:
|
|||||||
|
|
||||||
For each transaction output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
|
For each transaction output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
|
||||||
|
|
||||||
Do not skip any transactions. Return ONLY the JSON array, no explanation.`;
|
Return ONLY the JSON array, no explanation.
|
||||||
|
|
||||||
/**
|
Table data:
|
||||||
* Build prompt for OCR-only extraction (no images)
|
|
||||||
*/
|
|
||||||
function buildOcrOnlyPrompt(ocrText: string): string {
|
|
||||||
// Limit OCR text to prevent context overflow
|
|
||||||
const maxOcrLength = 12000;
|
|
||||||
const truncatedOcr = ocrText.length > maxOcrLength
|
|
||||||
? ocrText.substring(0, maxOcrLength) + '\n... (truncated)'
|
|
||||||
: ocrText;
|
|
||||||
|
|
||||||
return `${OCR_EXTRACT_PROMPT}
|
|
||||||
|
|
||||||
OCR text from bank statement:
|
|
||||||
---
|
---
|
||||||
${truncatedOcr}
|
{TABLE_DATA}
|
||||||
---`;
|
---`;
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Extract OCR text from an image using PaddleOCR
|
|
||||||
*/
|
|
||||||
async function extractOcrText(imageBase64: string): Promise<string> {
|
|
||||||
try {
|
|
||||||
const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
|
|
||||||
method: 'POST',
|
|
||||||
headers: { 'Content-Type': 'application/json' },
|
|
||||||
body: JSON.stringify({ image: imageBase64 }),
|
|
||||||
});
|
|
||||||
|
|
||||||
if (!response.ok) return '';
|
|
||||||
|
|
||||||
const data = await response.json();
|
|
||||||
if (data.success && data.results) {
|
|
||||||
return data.results.map((r: { text: string }) => r.text).join('\n');
|
|
||||||
}
|
|
||||||
} catch {
|
|
||||||
// PaddleOCR unavailable
|
|
||||||
}
|
|
||||||
return '';
|
|
||||||
}
|
|
||||||
|
|
||||||
interface ITransaction {
|
interface ITransaction {
|
||||||
date: string;
|
date: string;
|
||||||
@@ -94,7 +65,7 @@ function convertPdfToImages(pdfPath: string): string[] {
|
|||||||
{ stdio: 'pipe' }
|
{ stdio: 'pipe' }
|
||||||
);
|
);
|
||||||
|
|
||||||
const files = fs.readdirSync(tempDir).filter((f) => f.endsWith('.png')).sort();
|
const files = fs.readdirSync(tempDir).filter((f: string) => f.endsWith('.png')).sort();
|
||||||
const images: string[] = [];
|
const images: string[] = [];
|
||||||
|
|
||||||
for (const file of files) {
|
for (const file of files) {
|
||||||
@@ -110,12 +81,12 @@ function convertPdfToImages(pdfPath: string): string[] {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Visual extraction pass (with images)
|
* Extract using MiniCPM-V via Ollama
|
||||||
*/
|
*/
|
||||||
async function extractVisual(images: string[], passLabel: string): Promise<ITransaction[]> {
|
async function extractWithMiniCPM(images: string[], passLabel: string): Promise<ITransaction[]> {
|
||||||
const payload = {
|
const payload = {
|
||||||
model: MODEL,
|
model: MINICPM_MODEL,
|
||||||
prompt: VISUAL_EXTRACT_PROMPT,
|
prompt: MINICPM_EXTRACT_PROMPT,
|
||||||
images,
|
images,
|
||||||
stream: true,
|
stream: true,
|
||||||
options: {
|
options: {
|
||||||
@@ -124,31 +95,6 @@ async function extractVisual(images: string[], passLabel: string): Promise<ITransaction[]> {
     },
   };

-  return doExtraction(payload, passLabel);
-}
-
-/**
- * OCR-only extraction pass (no images, just text)
- */
-async function extractFromOcr(ocrText: string, passLabel: string): Promise<ITransaction[]> {
-  const payload = {
-    model: MODEL,
-    prompt: buildOcrOnlyPrompt(ocrText),
-    stream: true,
-    options: {
-      num_predict: 16384,
-      temperature: 0.1,
-    },
-  };
-
-  return doExtraction(payload, passLabel);
-}
-
-/**
- * Common extraction logic
- */
-async function doExtraction(payload: object, passLabel: string): Promise<ITransaction[]> {
-
   const response = await fetch(`${OLLAMA_URL}/api/generate`, {
     method: 'POST',
     headers: { 'Content-Type': 'application/json' },
@@ -168,7 +114,7 @@ async function doExtraction(payload: object, passLabel: string): Promise<ITransaction[]> {
   let fullText = '';
   let lineBuffer = '';

-  console.log(`[${passLabel}] Extracting...`);
+  console.log(`[${passLabel}] Extracting with MiniCPM-V...`);

   while (true) {
     const { done, value } = await reader.read();
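Aside: the read loop above consumes Ollama's streaming format. With stream: true, /api/generate emits newline-delimited JSON objects whose response fields are text deltas, ending with a done: true object. A tiny illustration, with the frame fields abbreviated to the ones this code reads:

// Simulated NDJSON frames as Ollama streams them for a generate request.
const frames = [
  '{"response":"[{\\"date\\":","done":false}',
  '{"response":"\\"2024-01-05\\"}]","done":false}',
  '{"response":"","done":true}',
];

let assembled = '';
for (const line of frames) {
  const json = JSON.parse(line);
  if (json.response) assembled += json.response; // concatenate text deltas
}
console.log(assembled); // [{"date":"2024-01-05"}]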
@@ -184,7 +130,6 @@ async function doExtraction(payload: object, passLabel: string): Promise<ITransaction[]> {
         fullText += json.response;
         lineBuffer += json.response;

-        // Print complete lines
         if (lineBuffer.includes('\n')) {
           const parts = lineBuffer.split('\n');
           for (let i = 0; i < parts.length - 1; i++) {
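Aside: the next hunk returns JSON.parse(fullText.substring(startIdx, endIdx)), where startIdx and endIdx are presumably the same indexOf('[') / lastIndexOf(']') + 1 slice used by convertTableToTransactions below. The slice exists because VLM output rarely comes back as bare JSON; a quick illustration:

// Slicing from the first '[' to the last ']' strips surrounding prose and markdown fences.
const fullText = 'Here are the transactions:\n```json\n[{"date":"2024-01-05","amount":-42.5}]\n```';
const startIdx = fullText.indexOf('[');
const endIdx = fullText.lastIndexOf(']') + 1;
console.log(JSON.parse(fullText.substring(startIdx, endIdx)));
// -> [ { date: '2024-01-05', amount: -42.5 } ]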
@@ -214,6 +159,140 @@ async function doExtraction(payload: object, passLabel: string): Promise<ITransaction[]> {
   return JSON.parse(fullText.substring(startIdx, endIdx));
 }

+/**
+ * Extract table using PaddleOCR-VL via OpenAI-compatible API
+ */
+async function extractTableWithPaddleOCRVL(imageBase64: string): Promise<string> {
+  const payload = {
+    model: PADDLEOCR_VL_MODEL,
+    messages: [
+      {
+        role: 'user',
+        content: [
+          {
+            type: 'image_url',
+            image_url: { url: `data:image/png;base64,${imageBase64}` },
+          },
+          {
+            type: 'text',
+            text: PADDLEOCR_VL_TABLE_PROMPT,
+          },
+        ],
+      },
+    ],
+    temperature: 0.0,
+    max_tokens: 8192,
+  };
+
+  const response = await fetch(`${PADDLEOCR_VL_URL}/v1/chat/completions`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(payload),
+  });
+
+  if (!response.ok) {
+    const text = await response.text();
+    throw new Error(`PaddleOCR-VL API error: ${response.status} - ${text}`);
+  }
+
+  const data = await response.json();
+  return data.choices?.[0]?.message?.content || '';
+}
+
+/**
+ * Convert PaddleOCR-VL table output to transactions using MiniCPM-V
+ */
+async function convertTableToTransactions(
+  tableData: string,
+  passLabel: string
+): Promise<ITransaction[]> {
+  const prompt = PADDLEOCR_VL_CONVERT_PROMPT.replace('{TABLE_DATA}', tableData);
+
+  const payload = {
+    model: MINICPM_MODEL,
+    prompt,
+    stream: true,
+    options: {
+      num_predict: 16384,
+      temperature: 0.1,
+    },
+  };
+
+  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(payload),
+  });
+
+  if (!response.ok) {
+    throw new Error(`Ollama API error: ${response.status}`);
+  }
+
+  const reader = response.body?.getReader();
+  if (!reader) {
+    throw new Error('No response body');
+  }
+
+  const decoder = new TextDecoder();
+  let fullText = '';
+
+  console.log(`[${passLabel}] Converting table data to JSON...`);
+
+  while (true) {
+    const { done, value } = await reader.read();
+    if (done) break;
+
+    const chunk = decoder.decode(value, { stream: true });
+    const lines = chunk.split('\n').filter((l) => l.trim());
+
+    for (const line of lines) {
+      try {
+        const json = JSON.parse(line);
+        if (json.response) {
+          fullText += json.response;
+        }
+      } catch {
+        // Skip invalid JSON lines
+      }
+    }
+  }
+
+  const startIdx = fullText.indexOf('[');
+  const endIdx = fullText.lastIndexOf(']') + 1;
+
+  if (startIdx < 0 || endIdx <= startIdx) {
+    throw new Error('No JSON array found in response');
+  }
+
+  return JSON.parse(fullText.substring(startIdx, endIdx));
+}
+
+/**
+ * Extract using PaddleOCR-VL (table recognition) + conversion
+ */
+async function extractWithPaddleOCRVL(
+  images: string[],
+  passLabel: string
+): Promise<ITransaction[]> {
+  console.log(`[${passLabel}] Extracting tables with PaddleOCR-VL...`);
+
+  // Extract table data from each page
+  const tableDataParts: string[] = [];
+  for (let i = 0; i < images.length; i++) {
+    console.log(`[${passLabel}] Processing page ${i + 1}/${images.length}...`);
+    const tableData = await extractTableWithPaddleOCRVL(images[i]);
+    if (tableData.trim()) {
+      tableDataParts.push(`--- Page ${i + 1} ---\n${tableData}`);
+    }
+  }
+
+  const combinedTableData = tableDataParts.join('\n\n');
+  console.log(`[${passLabel}] Got ${combinedTableData.length} chars of table data`);
+
+  // Convert to transactions
+  return convertTableToTransactions(combinedTableData, passLabel);
+}
+
 /**
  * Create a hash of transactions for comparison
  */
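Aside: the two fetch calls added above use different envelopes. The PaddleOCR-VL side reads data.choices?.[0]?.message?.content (the standard OpenAI chat-completions shape, with the image passed as a data: URL), while Ollama's generate API takes raw base64 strings in its images array and streams response deltas. What the content field actually holds (markdown table, HTML table, plain text) depends on the served model and on PADDLEOCR_VL_TABLE_PROMPT, so the literal below is only illustrative:

// Illustrative chat-completions envelope; only the fields read above are shown.
const exampleResponse = {
  choices: [
    {
      message: {
        role: 'assistant',
        content:
          '| Date | Description | Amount |\n| --- | --- | --- |\n| 2024-01-05 | CARD PAYMENT | -42.50 |',
      },
    },
  ],
};
const tableData = exampleResponse.choices?.[0]?.message?.content || '';
console.log(tableData.split('\n').length); // 3 table rows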
@@ -225,10 +304,31 @@ function hashTransactions(transactions: ITransaction[]): string {
 }

 /**
- * Extract with majority voting - run until 2 passes match
- * Strategy: Pass 1 = Visual (images), Pass 2 = OCR-only (text), Pass 3+ = Visual
+ * Check if PaddleOCR-VL service is available
  */
-async function extractWithConsensus(images: string[], maxPasses: number = 5): Promise<ITransaction[]> {
+async function isPaddleOCRVLAvailable(): Promise<boolean> {
+  try {
+    const response = await fetch(`${PADDLEOCR_VL_URL}/health`, {
+      method: 'GET',
+      signal: AbortSignal.timeout(5000),
+    });
+    return response.ok;
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Extract with dual-VLM consensus
+ * Strategy:
+ * Pass 1 = MiniCPM-V visual extraction
+ * Pass 2 = PaddleOCR-VL table recognition (if available)
+ * Pass 3+ = MiniCPM-V visual (fallback)
+ */
+async function extractWithConsensus(
+  images: string[],
+  maxPasses: number = 5
+): Promise<ITransaction[]> {
   const results: Array<{ transactions: ITransaction[]; hash: string }> = [];
   const hashCounts: Map<string, number> = new Map();

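Aside: hashTransactions itself sits outside every hunk shown here. Any stable serialization works for the vote; a minimal sketch of what it could look like follows, where both this implementation and the abbreviated transaction shape are assumptions, not the code under review:

// Abbreviated shape; the real ITransaction interface has more fields.
type TTransaction = { date: string; description: string; amount: number };

// Stable serialization, sorted so two passes that extracted the same rows
// in a different order still hash identically (an assumed design choice).
function hashTransactionsSketch(transactions: TTransaction[]): string {
  return transactions
    .map((t) => JSON.stringify(t, ['date', 'description', 'amount']))
    .sort()
    .join('|');
}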
@@ -236,59 +336,48 @@ async function extractWithConsensus(images: string[], maxPasses: number = 5): Promise<ITransaction[]> {
     const hash = hashTransactions(transactions);
     results.push({ transactions, hash });
     hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1);
-    console.log(`[${passLabel}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)`);
+    console.log(
+      `[${passLabel}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)`
+    );
     return hashCounts.get(hash)!;
   };

-  // Run Pass 1 (Visual) in parallel with OCR extraction
-  let ocrText = '';
-  const pass1Promise = extractVisual(images, 'Pass 1 Visual').catch((err) => ({ error: err }));
-
-  // Extract OCR from all pages
-  const ocrPromise = (async () => {
-    const ocrTexts: string[] = [];
-    for (let i = 0; i < images.length; i++) {
-      const pageOcr = await extractOcrText(images[i]);
-      if (pageOcr) {
-        ocrTexts.push(`--- Page ${i + 1} ---\n${pageOcr}`);
-      }
-    }
-    ocrText = ocrTexts.join('\n\n');
-    if (ocrText) {
-      console.log(`[OCR] Extracted text from ${ocrTexts.length} page(s)`);
-    }
-    return ocrText;
-  })();
-
-  // Wait for Pass 1 and OCR to complete
-  const [pass1Result] = await Promise.all([pass1Promise, ocrPromise]);
-
-  // Process Pass 1 result
-  if ('error' in pass1Result) {
-    console.log(`[Pass 1] Error: ${(pass1Result as { error: unknown }).error}`);
-  } else {
-    addResult(pass1Result as ITransaction[], 'Pass 1 Visual');
-  }
+  // Check if PaddleOCR-VL is available
+  const paddleOCRVLAvailable = await isPaddleOCRVLAvailable();
+  if (paddleOCRVLAvailable) {
+    console.log('[Setup] PaddleOCR-VL service available - using dual-VLM consensus');
+  } else {
+    console.log('[Setup] PaddleOCR-VL not available - using MiniCPM-V only');
+  }

-  // Pass 2: OCR-only (no images) - faster, different approach
-  if (ocrText) {
-    try {
-      const pass2Result = await extractFromOcr(ocrText, 'Pass 2 OCR-only');
-      const count = addResult(pass2Result, 'Pass 2 OCR-only');
+  // Pass 1: MiniCPM-V visual extraction
+  try {
+    const pass1Result = await extractWithMiniCPM(images, 'Pass 1 MiniCPM-V');
+    addResult(pass1Result, 'Pass 1 MiniCPM-V');
+  } catch (err) {
+    console.log(`[Pass 1] Error: ${err}`);
+  }
+
+  // Pass 2: PaddleOCR-VL table recognition (if available)
+  if (paddleOCRVLAvailable) {
+    try {
+      const pass2Result = await extractWithPaddleOCRVL(images, 'Pass 2 PaddleOCR-VL');
+      const count = addResult(pass2Result, 'Pass 2 PaddleOCR-VL');
       if (count >= 2) {
-        console.log(`[Consensus] Visual and OCR extractions match!`);
+        console.log('[Consensus] MiniCPM-V and PaddleOCR-VL extractions match!');
         return pass2Result;
       }
     } catch (err) {
-      console.log(`[Pass 2 OCR-only] Error: ${err}`);
+      console.log(`[Pass 2 PaddleOCR-VL] Error: ${err}`);
     }
   }

-  // Continue with visual passes 3+ if no consensus yet
-  for (let pass = 3; pass <= maxPasses; pass++) {
+  // Pass 3+: Continue with MiniCPM-V visual passes
+  const startPass = paddleOCRVLAvailable ? 3 : 2;
+  for (let pass = startPass; pass <= maxPasses; pass++) {
     try {
-      const transactions = await extractVisual(images, `Pass ${pass} Visual`);
-      const count = addResult(transactions, `Pass ${pass} Visual`);
+      const transactions = await extractWithMiniCPM(images, `Pass ${pass} MiniCPM-V`);
+      const count = addResult(transactions, `Pass ${pass} MiniCPM-V`);

       if (count >= 2) {
         console.log(`[Consensus] Reached after ${pass} passes`);
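Aside: the control flow above, distilled, is "run extraction passes until any two produce the same hash, then stop". As a generic sketch (not the test's code):

// Run the given passes in order; return the first value whose hash has
// been seen twice, i.e. two independent passes agree.
async function untilConsensus<T>(
  passes: Array<() => Promise<T>>,
  hash: (value: T) => string
): Promise<T | undefined> {
  const seen = new Map<string, number>();
  for (const run of passes) {
    const value = await run();
    const h = hash(value);
    const count = (seen.get(h) || 0) + 1;
    seen.set(h, count);
    if (count >= 2) return value;
  }
  return undefined; // no consensus within the pass budget
}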
@@ -368,7 +457,7 @@ function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
   }

   const files = fs.readdirSync(testDir);
-  const pdfFiles = files.filter((f) => f.endsWith('.pdf'));
+  const pdfFiles = files.filter((f: string) => f.endsWith('.pdf'));
   const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];

   for (const pdf of pdfFiles) {
@@ -402,6 +491,13 @@ tap.test('should have MiniCPM-V 4.5 model loaded', async () => {
   expect(modelNames.some((name: string) => name.includes('minicpm-v4.5'))).toBeTrue();
 });

+tap.test('should check PaddleOCR-VL availability', async () => {
+  const available = await isPaddleOCRVLAvailable();
+  console.log(`PaddleOCR-VL available: ${available}`);
+  // This test passes regardless - PaddleOCR-VL is optional
+  expect(true).toBeTrue();
+});
+
 // Dynamic test for each PDF/JSON pair
 const testCases = findTestCases();
 for (const testCase of testCases) {
@@ -416,7 +512,7 @@ for (const testCase of testCases) {
   const images = convertPdfToImages(testCase.pdfPath);
   console.log(`Converted: ${images.length} pages\n`);

-  // Extract with consensus voting
+  // Extract with dual-VLM consensus
   const extracted = await extractWithConsensus(images);
   console.log(`\nFinal: ${extracted.length} transactions`);

@@ -6,7 +6,7 @@ import * as os from 'os';

 const OLLAMA_URL = 'http://localhost:11434';
 const MODEL = 'openbmb/minicpm-v4.5:q8_0';
-const PADDLEOCR_URL = 'http://localhost:5000';
+const PADDLEOCR_VL_URL = 'http://localhost:8000';

 interface IInvoice {
   invoice_number: string;
@@ -19,24 +19,33 @@ interface IInvoice {
 }

 /**
- * Extract OCR text from an image using PaddleOCR
+ * Extract OCR text from an image using PaddleOCR-VL (OpenAI-compatible API)
  */
 async function extractOcrText(imageBase64: string): Promise<string> {
   try {
-    const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
+    const response = await fetch(`${PADDLEOCR_VL_URL}/v1/chat/completions`, {
       method: 'POST',
       headers: { 'Content-Type': 'application/json' },
-      body: JSON.stringify({ image: imageBase64 }),
+      body: JSON.stringify({
+        model: 'paddleocr-vl',
+        messages: [{
+          role: 'user',
+          content: [
+            { type: 'image_url', image_url: { url: `data:image/png;base64,${imageBase64}` } },
+            { type: 'text', text: 'OCR:' }
+          ]
+        }],
+        temperature: 0.0,
+        max_tokens: 4096
+      }),
     });

     if (!response.ok) return '';

     const data = await response.json();
-    if (data.success && data.results) {
-      return data.results.map((r: { text: string }) => r.text).join('\n');
-    }
+    return data.choices?.[0]?.message?.content || '';
   } catch {
-    // PaddleOCR unavailable
+    // PaddleOCR-VL unavailable
   }
   return '';
 }
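Aside: a usage sketch for the rewritten helper. It takes a base64-encoded page image and returns plain text, with '' doubling as the "service unavailable" signal; the file name here is hypothetical, and the top-level await assumes an ESM test context.

import * as fs from 'fs';

const imageBase64 = fs.readFileSync('invoice-page.png').toString('base64');
const ocrText = await extractOcrText(imageBase64); // '' if PaddleOCR-VL is down
console.log(ocrText ? ocrText.slice(0, 200) : '(no OCR text)');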
@@ -1,258 +0,0 @@
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execSync } from 'child_process';
import * as os from 'os';

const PADDLEOCR_URL = 'http://localhost:5000';

interface IOCRResult {
  text: string;
  confidence: number;
  box: number[][];
}

interface IOCRResponse {
  success: boolean;
  results: IOCRResult[];
  error?: string;
}

interface IHealthResponse {
  status: string;
  model: string;
  language: string;
  gpu_enabled: boolean;
}

/**
 * Convert PDF first page to PNG using ImageMagick
 */
function convertPdfToImage(pdfPath: string): string {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPath = path.join(tempDir, 'page.png');

  try {
    execSync(
      `convert -density 200 -quality 90 "${pdfPath}[0]" -background white -alpha remove "${outputPath}"`,
      { stdio: 'pipe' }
    );

    const imageData = fs.readFileSync(outputPath);
    return imageData.toString('base64');
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}

/**
 * Create a simple test image with text using ImageMagick
 */
function createTestImage(text: string): string {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'test-image-'));
  const outputPath = path.join(tempDir, 'test.png');

  try {
    execSync(
      `convert -size 400x100 xc:white -font DejaVu-Sans -pointsize 24 -fill black -gravity center -annotate 0 "${text}" "${outputPath}"`,
      { stdio: 'pipe' }
    );

    const imageData = fs.readFileSync(outputPath);
    return imageData.toString('base64');
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}

// Health check test
tap.test('should respond to health check', async () => {
  const response = await fetch(`${PADDLEOCR_URL}/health`);
  expect(response.ok).toBeTrue();

  const data: IHealthResponse = await response.json();
  expect(data.status).toEqual('healthy');
  expect(data.model).toEqual('PP-OCRv4');
  expect(data.language).toBeTypeofString();
  expect(data.gpu_enabled).toBeTypeofBoolean();

  console.log(`PaddleOCR Status: ${data.status}`);
  console.log(`  Model: ${data.model}`);
  console.log(`  Language: ${data.language}`);
  console.log(`  GPU Enabled: ${data.gpu_enabled}`);
});

// Base64 OCR test
tap.test('should perform OCR on base64 image', async () => {
  // Create a test image with known text
  const testText = 'Hello World 12345';
  console.log(`Creating test image with text: "${testText}"`);

  const imageBase64 = createTestImage(testText);

  const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ image: imageBase64 }),
  });

  expect(response.ok).toBeTrue();

  const data: IOCRResponse = await response.json();
  expect(data.success).toBeTrue();
  expect(data.results).toBeArray();

  const extractedText = data.results.map((r) => r.text).join(' ');
  console.log(`Extracted text: "${extractedText}"`);

  // Check that we got some text back
  expect(data.results.length).toBeGreaterThan(0);

  // Check that at least some of the expected text was found
  const normalizedExtracted = extractedText.toLowerCase().replace(/\s+/g, '');
  const normalizedExpected = testText.toLowerCase().replace(/\s+/g, '');
  const hasPartialMatch =
    normalizedExtracted.includes('hello') ||
    normalizedExtracted.includes('world') ||
    normalizedExtracted.includes('12345');

  expect(hasPartialMatch).toBeTrue();
});

// File upload OCR test
tap.test('should perform OCR via file upload', async () => {
  const testText = 'Invoice Number 98765';
  console.log(`Creating test image with text: "${testText}"`);

  const imageBase64 = createTestImage(testText);
  const imageBuffer = Buffer.from(imageBase64, 'base64');

  const formData = new FormData();
  const blob = new Blob([imageBuffer], { type: 'image/png' });
  formData.append('img', blob, 'test.png');

  const response = await fetch(`${PADDLEOCR_URL}/ocr/upload`, {
    method: 'POST',
    body: formData,
  });

  expect(response.ok).toBeTrue();

  const data: IOCRResponse = await response.json();
  expect(data.success).toBeTrue();
  expect(data.results).toBeArray();

  const extractedText = data.results.map((r) => r.text).join(' ');
  console.log(`Extracted text: "${extractedText}"`);

  // Check that we got some text back
  expect(data.results.length).toBeGreaterThan(0);
});

// OCR result structure test
tap.test('should return proper OCR result structure', async () => {
  const testText = 'Test 123';
  const imageBase64 = createTestImage(testText);

  const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ image: imageBase64 }),
  });

  const data: IOCRResponse = await response.json();

  if (data.results.length > 0) {
    const result = data.results[0];

    // Check result has required fields
    expect(result.text).toBeTypeofString();
    expect(result.confidence).toBeTypeofNumber();
    expect(result.box).toBeArray();

    // Check bounding box structure (4 points, each with x,y)
    expect(result.box.length).toEqual(4);
    for (const point of result.box) {
      expect(point.length).toEqual(2);
      expect(point[0]).toBeTypeofNumber();
      expect(point[1]).toBeTypeofNumber();
    }

    // Confidence should be between 0 and 1
    expect(result.confidence).toBeGreaterThan(0);
    expect(result.confidence).toBeLessThanOrEqual(1);

    console.log(`Result structure valid:`);
    console.log(`  Text: "${result.text}"`);
    console.log(`  Confidence: ${(result.confidence * 100).toFixed(1)}%`);
    console.log(`  Box: ${JSON.stringify(result.box)}`);
  }
});

// Test with actual invoice if available
const invoiceDir = path.join(process.cwd(), '.nogit/invoices');
if (fs.existsSync(invoiceDir)) {
  const pdfFiles = fs.readdirSync(invoiceDir).filter((f) => f.endsWith('.pdf'));

  if (pdfFiles.length > 0) {
    const testPdf = pdfFiles[0];
    tap.test(`should extract text from invoice: ${testPdf}`, async () => {
      const pdfPath = path.join(invoiceDir, testPdf);
      console.log(`Converting ${testPdf} to image...`);

      const imageBase64 = convertPdfToImage(pdfPath);
      console.log(`Image size: ${(imageBase64.length / 1024).toFixed(1)} KB`);

      const startTime = Date.now();

      const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ image: imageBase64 }),
      });

      const endTime = Date.now();
      const elapsedMs = endTime - startTime;

      expect(response.ok).toBeTrue();

      const data: IOCRResponse = await response.json();
      expect(data.success).toBeTrue();

      console.log(`OCR completed in ${(elapsedMs / 1000).toFixed(2)}s`);
      console.log(`Found ${data.results.length} text regions`);

      // Print first 10 results
      const preview = data.results.slice(0, 10);
      console.log(`\nFirst ${preview.length} results:`);
      for (const result of preview) {
        console.log(`  [${(result.confidence * 100).toFixed(0)}%] ${result.text}`);
      }

      if (data.results.length > 10) {
        console.log(`  ... and ${data.results.length - 10} more`);
      }

      // Should find text in an invoice
      expect(data.results.length).toBeGreaterThan(5);
    });
  }
}

// Error handling test
tap.test('should handle invalid base64 gracefully', async () => {
  const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ image: 'not-valid-base64!!!' }),
  });

  const data: IOCRResponse = await response.json();

  // Should return success: false with error message
  expect(data.success).toBeFalse();
  expect(data.error).toBeTypeofString();
  console.log(`Error handling works: ${data.error}`);
});

export default tap.start();