diff --git a/Dockerfile_paddleocr b/Dockerfile_paddleocr deleted file mode 100644 index fe0e681..0000000 --- a/Dockerfile_paddleocr +++ /dev/null @@ -1,49 +0,0 @@ -# PaddleOCR GPU Variant -# OCR processing with NVIDIA GPU support using PaddlePaddle -FROM paddlepaddle/paddle:2.6.2-gpu-cuda11.7-cudnn8.4-trt8.4 - -LABEL maintainer="Task Venture Capital GmbH " -LABEL description="PaddleOCR PP-OCRv4 - GPU optimized" -LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai" - -# Environment configuration -ENV OCR_LANGUAGE="en" -ENV SERVER_PORT="5000" -ENV SERVER_HOST="0.0.0.0" -ENV PYTHONUNBUFFERED=1 - -# Set working directory -WORKDIR /app - -# Install system dependencies -RUN apt-get update && apt-get install -y --no-install-recommends \ - libgl1-mesa-glx \ - libglib2.0-0 \ - curl \ - && rm -rf /var/lib/apt/lists/* - -# Install Python dependencies (using stable paddleocr 2.x) -RUN pip install --no-cache-dir \ - paddleocr==2.8.1 \ - fastapi \ - uvicorn[standard] \ - python-multipart \ - opencv-python-headless \ - pillow - -# Copy server files -COPY image_support_files/paddleocr_server.py /app/paddleocr_server.py -COPY image_support_files/paddleocr-entrypoint.sh /usr/local/bin/paddleocr-entrypoint.sh -RUN chmod +x /usr/local/bin/paddleocr-entrypoint.sh - -# Note: OCR models will be downloaded on first run -# This ensures compatibility across different GPU architectures - -# Expose API port -EXPOSE 5000 - -# Health check -HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ - CMD curl -f http://localhost:5000/health || exit 1 - -ENTRYPOINT ["/usr/local/bin/paddleocr-entrypoint.sh"] diff --git a/Dockerfile_paddleocr_cpu b/Dockerfile_paddleocr_cpu deleted file mode 100644 index 36386d8..0000000 --- a/Dockerfile_paddleocr_cpu +++ /dev/null @@ -1,53 +0,0 @@ -# PaddleOCR CPU Variant -# OCR processing optimized for CPU-only inference -FROM python:3.10-slim-bookworm - -LABEL maintainer="Task Venture Capital GmbH " -LABEL description="PaddleOCR PP-OCRv4 - CPU optimized" -LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai" - -# Environment configuration for CPU-only mode -ENV OCR_LANGUAGE="en" -ENV SERVER_PORT="5000" -ENV SERVER_HOST="0.0.0.0" -ENV PYTHONUNBUFFERED=1 -# Disable GPU usage for CPU-only variant -ENV CUDA_VISIBLE_DEVICES="-1" - -# Set working directory -WORKDIR /app - -# Install system dependencies -RUN apt-get update && apt-get install -y --no-install-recommends \ - libgl1-mesa-glx \ - libglib2.0-0 \ - libgomp1 \ - curl \ - && rm -rf /var/lib/apt/lists/* - -# Install Python dependencies (CPU version of PaddlePaddle - using stable 2.x versions) -RUN pip install --no-cache-dir \ - paddlepaddle==2.6.2 \ - paddleocr==2.8.1 \ - fastapi \ - uvicorn[standard] \ - python-multipart \ - opencv-python-headless \ - pillow - -# Copy server files -COPY image_support_files/paddleocr_server.py /app/paddleocr_server.py -COPY image_support_files/paddleocr-entrypoint.sh /usr/local/bin/paddleocr-entrypoint.sh -RUN chmod +x /usr/local/bin/paddleocr-entrypoint.sh - -# Note: OCR models will be downloaded on first run -# This avoids build-time segfaults with certain CPU architectures - -# Expose API port -EXPOSE 5000 - -# Health check (longer start-period for CPU variant) -HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \ - CMD curl -f http://localhost:5000/health || exit 1 - -ENTRYPOINT ["/usr/local/bin/paddleocr-entrypoint.sh"] diff --git a/Dockerfile_paddleocr_vl b/Dockerfile_paddleocr_vl new file 
mode 100644 index 0000000..4be04e7 --- /dev/null +++ b/Dockerfile_paddleocr_vl @@ -0,0 +1,72 @@ +# PaddleOCR-VL GPU Variant +# Vision-Language Model for document parsing using vLLM +FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 + +LABEL maintainer="Task Venture Capital GmbH " +LABEL description="PaddleOCR-VL 0.9B - Vision-Language Model for document parsing" +LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai" + +# Environment configuration +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONUNBUFFERED=1 +ENV HF_HOME=/root/.cache/huggingface +ENV VLLM_WORKER_MULTIPROC_METHOD=spawn + +# Set working directory +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3.11 \ + python3.11-venv \ + python3.11-dev \ + python3-pip \ + git \ + curl \ + build-essential \ + && rm -rf /var/lib/apt/lists/* \ + && update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 + +# Create and activate virtual environment +RUN python -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +# Install PyTorch with CUDA support +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir \ + torch==2.5.1 \ + torchvision \ + --index-url https://download.pytorch.org/whl/cu124 + +# Install vLLM (nightly for PaddleOCR-VL support) +RUN pip install --no-cache-dir \ + vllm \ + --pre \ + --extra-index-url https://wheels.vllm.ai/nightly \ + --extra-index-url https://download.pytorch.org/whl/cu124 + +# Install additional dependencies +RUN pip install --no-cache-dir \ + transformers \ + accelerate \ + safetensors \ + pillow \ + fastapi \ + uvicorn[standard] \ + python-multipart \ + openai \ + httpx + +# Copy entrypoint script +COPY image_support_files/paddleocr-vl-entrypoint.sh /usr/local/bin/paddleocr-vl-entrypoint.sh +RUN chmod +x /usr/local/bin/paddleocr-vl-entrypoint.sh + +# Expose vLLM API port +EXPOSE 8000 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=300s --retries=3 \ + CMD curl -f http://localhost:8000/health || exit 1 + +ENTRYPOINT ["/usr/local/bin/paddleocr-vl-entrypoint.sh"] diff --git a/Dockerfile_paddleocr_vl_cpu b/Dockerfile_paddleocr_vl_cpu new file mode 100644 index 0000000..206c615 --- /dev/null +++ b/Dockerfile_paddleocr_vl_cpu @@ -0,0 +1,54 @@ +# PaddleOCR-VL CPU Variant +# Vision-Language Model for document parsing using transformers (slower, no GPU required) +FROM python:3.11-slim-bookworm + +LABEL maintainer="Task Venture Capital GmbH " +LABEL description="PaddleOCR-VL 0.9B CPU - Vision-Language Model for document parsing" +LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai" + +# Environment configuration +ENV PYTHONUNBUFFERED=1 +ENV HF_HOME=/root/.cache/huggingface +ENV CUDA_VISIBLE_DEVICES="" +ENV SERVER_PORT=8000 +ENV SERVER_HOST=0.0.0.0 + +# Set working directory +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + libgl1-mesa-glx \ + libglib2.0-0 \ + libgomp1 \ + curl \ + git \ + && rm -rf /var/lib/apt/lists/* + +# Install Python dependencies +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir \ + torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu && \ + pip install --no-cache-dir \ + transformers \ + accelerate \ + safetensors \ + pillow \ + fastapi \ + uvicorn[standard] \ + python-multipart \ + httpx + +# Copy server files 
+COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py +COPY image_support_files/paddleocr-vl-cpu-entrypoint.sh /usr/local/bin/paddleocr-vl-cpu-entrypoint.sh +RUN chmod +x /usr/local/bin/paddleocr-vl-cpu-entrypoint.sh + +# Expose API port +EXPOSE 8000 + +# Health check (longer start-period for CPU + model download) +HEALTHCHECK --interval=30s --timeout=10s --start-period=600s --retries=3 \ + CMD curl -f http://localhost:8000/health || exit 1 + +ENTRYPOINT ["/usr/local/bin/paddleocr-vl-cpu-entrypoint.sh"] diff --git a/build-images.sh b/build-images.sh index cfb3a7b..44f2d98 100755 --- a/build-images.sh +++ b/build-images.sh @@ -29,19 +29,19 @@ docker build \ -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v-cpu \ . -# Build PaddleOCR GPU variant -echo -e "${GREEN}Building PaddleOCR GPU variant...${NC}" +# Build PaddleOCR-VL GPU variant (vLLM) +echo -e "${GREEN}Building PaddleOCR-VL GPU variant (vLLM)...${NC}" docker build \ - -f Dockerfile_paddleocr \ - -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr \ - -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-gpu \ + -f Dockerfile_paddleocr_vl \ + -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl \ + -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-gpu \ . -# Build PaddleOCR CPU variant -echo -e "${GREEN}Building PaddleOCR CPU variant...${NC}" +# Build PaddleOCR-VL CPU variant +echo -e "${GREEN}Building PaddleOCR-VL CPU variant...${NC}" docker build \ - -f Dockerfile_paddleocr_cpu \ - -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-cpu \ + -f Dockerfile_paddleocr_vl_cpu \ + -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-cpu \ . echo -e "${GREEN}All images built successfully!${NC}" @@ -52,7 +52,7 @@ echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v (GPU)" echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v-cpu (CPU)" echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:latest (GPU)" echo "" -echo " PaddleOCR:" -echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr (GPU)" -echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-gpu (GPU)" -echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-cpu (CPU)" +echo " PaddleOCR-VL (Vision-Language Model):" +echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl (GPU/vLLM)" +echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-gpu (GPU/vLLM)" +echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-cpu (CPU)" diff --git a/image_support_files/paddleocr-entrypoint.sh b/image_support_files/paddleocr-entrypoint.sh deleted file mode 100644 index 3ab8d5b..0000000 --- a/image_support_files/paddleocr-entrypoint.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -set -e - -# Configuration from environment -OCR_LANGUAGE="${OCR_LANGUAGE:-en}" -SERVER_PORT="${SERVER_PORT:-5000}" -SERVER_HOST="${SERVER_HOST:-0.0.0.0}" - -echo "Starting PaddleOCR Server..." 
-echo " Language: ${OCR_LANGUAGE}" -echo " Host: ${SERVER_HOST}" -echo " Port: ${SERVER_PORT}" - -# Check GPU availability -if [ "${CUDA_VISIBLE_DEVICES}" = "-1" ]; then - echo " GPU: Disabled (CPU mode)" -else - echo " GPU: Enabled" -fi - -# Start the FastAPI server with uvicorn -exec python -m uvicorn paddleocr_server:app \ - --host "${SERVER_HOST}" \ - --port "${SERVER_PORT}" \ - --workers 1 diff --git a/image_support_files/paddleocr-vl-cpu-entrypoint.sh b/image_support_files/paddleocr-vl-cpu-entrypoint.sh new file mode 100644 index 0000000..fc23695 --- /dev/null +++ b/image_support_files/paddleocr-vl-cpu-entrypoint.sh @@ -0,0 +1,19 @@ +#!/bin/bash +set -e + +echo "===================================" +echo "PaddleOCR-VL Server (CPU)" +echo "===================================" + +HOST="${SERVER_HOST:-0.0.0.0}" +PORT="${SERVER_PORT:-8000}" + +echo "Host: ${HOST}" +echo "Port: ${PORT}" +echo "Device: CPU (no GPU)" +echo "" + +echo "Starting PaddleOCR-VL CPU server..." +echo "===================================" + +exec python /app/paddleocr_vl_server.py diff --git a/image_support_files/paddleocr-vl-entrypoint.sh b/image_support_files/paddleocr-vl-entrypoint.sh new file mode 100644 index 0000000..1978b9a --- /dev/null +++ b/image_support_files/paddleocr-vl-entrypoint.sh @@ -0,0 +1,43 @@ +#!/bin/bash +set -e + +echo "===================================" +echo "PaddleOCR-VL Server" +echo "===================================" + +# Configuration +MODEL_NAME="${MODEL_NAME:-PaddlePaddle/PaddleOCR-VL}" +HOST="${HOST:-0.0.0.0}" +PORT="${PORT:-8000}" +MAX_BATCHED_TOKENS="${MAX_BATCHED_TOKENS:-16384}" +GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.9}" + +echo "Model: ${MODEL_NAME}" +echo "Host: ${HOST}" +echo "Port: ${PORT}" +echo "Max batched tokens: ${MAX_BATCHED_TOKENS}" +echo "GPU memory utilization: ${GPU_MEMORY_UTILIZATION}" +echo "" + +# Check GPU availability +if command -v nvidia-smi &> /dev/null; then + echo "GPU Information:" + nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv + echo "" +else + echo "WARNING: nvidia-smi not found. GPU may not be available." +fi + +echo "Starting vLLM server..." 
+echo "===================================" + +# Start vLLM server with PaddleOCR-VL +exec vllm serve "${MODEL_NAME}" \ + --trust-remote-code \ + --host "${HOST}" \ + --port "${PORT}" \ + --max-num-batched-tokens "${MAX_BATCHED_TOKENS}" \ + --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}" \ + --no-enable-prefix-caching \ + --mm-processor-cache-gb 0 \ + --served-model-name "paddleocr-vl" diff --git a/image_support_files/paddleocr_server.py b/image_support_files/paddleocr_server.py deleted file mode 100644 index f4650e9..0000000 --- a/image_support_files/paddleocr_server.py +++ /dev/null @@ -1,253 +0,0 @@ -#!/usr/bin/env python3 -""" -PaddleOCR FastAPI Server -Provides REST API for OCR operations using PaddleOCR -""" - -import os -import io -import base64 -import logging -from typing import Optional, List, Any - -from fastapi import FastAPI, File, UploadFile, Form, HTTPException -from fastapi.responses import JSONResponse -from pydantic import BaseModel -import numpy as np -from PIL import Image -from paddleocr import PaddleOCR - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - -# Environment configuration -OCR_LANGUAGE = os.environ.get('OCR_LANGUAGE', 'en') -# GPU is controlled via CUDA_VISIBLE_DEVICES environment variable -USE_GPU = os.environ.get('CUDA_VISIBLE_DEVICES', '') != '-1' - -# Initialize FastAPI app -app = FastAPI( - title="PaddleOCR Server", - description="REST API for OCR operations using PaddleOCR PP-OCRv4", - version="1.0.0" -) - -# Global OCR instance -ocr_instance: Optional[PaddleOCR] = None - - -class OCRRequest(BaseModel): - """Request model for base64 image OCR""" - image: str - language: Optional[str] = None - - -class BoundingBox(BaseModel): - """Bounding box for detected text""" - points: List[List[float]] - - -class OCRResult(BaseModel): - """Single OCR detection result""" - text: str - confidence: float - box: List[List[float]] - - -class OCRResponse(BaseModel): - """OCR response model""" - success: bool - results: List[OCRResult] - error: Optional[str] = None - - -class HealthResponse(BaseModel): - """Health check response""" - status: str - model: str - language: str - gpu_enabled: bool - - -def get_ocr(lang: Optional[str] = None) -> PaddleOCR: - """Get or initialize the OCR instance""" - global ocr_instance - use_lang = lang or OCR_LANGUAGE - - # Return cached instance if same language - if ocr_instance is not None and lang is None: - return ocr_instance - - logger.info(f"Initializing PaddleOCR with language={use_lang}, use_gpu={USE_GPU}") - new_ocr = PaddleOCR( - use_angle_cls=True, - lang=use_lang, - use_gpu=USE_GPU, - show_log=False - ) - - # Cache the default language instance - if lang is None: - ocr_instance = new_ocr - - logger.info("PaddleOCR initialized successfully") - return new_ocr - - -def decode_base64_image(base64_string: str) -> np.ndarray: - """Decode base64 string to numpy array""" - # Remove data URL prefix if present - if ',' in base64_string: - base64_string = base64_string.split(',')[1] - - image_data = base64.b64decode(base64_string) - image = Image.open(io.BytesIO(image_data)) - - # Convert to RGB if necessary - if image.mode != 'RGB': - image = image.convert('RGB') - - return np.array(image) - - -def process_ocr_result(result: Any) -> List[OCRResult]: - """Process PaddleOCR result into structured format""" - results = [] - - if result is None or len(result) == 0: - return results - - # PaddleOCR returns list of 
results per image - # Each result is a list of [box, (text, confidence)] - for line in result[0] if result[0] else []: - if line is None: - continue - - box = line[0] # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] - text_info = line[1] # (text, confidence) - - results.append(OCRResult( - text=text_info[0], - confidence=float(text_info[1]), - box=[[float(p[0]), float(p[1])] for p in box] - )) - - return results - - -@app.on_event("startup") -async def startup_event(): - """Pre-warm the OCR model on startup""" - logger.info("Pre-warming OCR model...") - try: - ocr = get_ocr() - # Create a small test image to warm up the model - test_image = np.zeros((100, 100, 3), dtype=np.uint8) - test_image.fill(255) # White image - ocr.ocr(test_image, cls=True) - logger.info("OCR model pre-warmed successfully") - except Exception as e: - logger.error(f"Failed to pre-warm OCR model: {e}") - - -@app.get("/health", response_model=HealthResponse) -async def health_check(): - """Health check endpoint""" - try: - # Ensure OCR is initialized - get_ocr() - return HealthResponse( - status="healthy", - model="PP-OCRv4", - language=OCR_LANGUAGE, - gpu_enabled=USE_GPU - ) - except Exception as e: - logger.error(f"Health check failed: {e}") - raise HTTPException(status_code=503, detail=str(e)) - - -@app.post("/ocr", response_model=OCRResponse) -async def ocr_base64(request: OCRRequest): - """ - Perform OCR on a base64-encoded image - - Args: - request: OCRRequest with base64 image and optional language - - Returns: - OCRResponse with detected text, confidence scores, and bounding boxes - """ - try: - # Decode image - image = decode_base64_image(request.image) - - # Get OCR instance (use request language if provided) - if request.language and request.language != OCR_LANGUAGE: - ocr = get_ocr(request.language) - else: - ocr = get_ocr() - - result = ocr.ocr(image, cls=True) - - # Process results - results = process_ocr_result(result) - - return OCRResponse(success=True, results=results) - - except Exception as e: - logger.error(f"OCR processing failed: {e}") - return OCRResponse(success=False, results=[], error=str(e)) - - -@app.post("/ocr/upload", response_model=OCRResponse) -async def ocr_upload( - img: UploadFile = File(...), - language: Optional[str] = Form(None) -): - """ - Perform OCR on an uploaded image file - - Args: - img: Uploaded image file - language: Optional language code (default: env OCR_LANGUAGE) - - Returns: - OCRResponse with detected text, confidence scores, and bounding boxes - """ - try: - # Read image - contents = await img.read() - image = Image.open(io.BytesIO(contents)) - - # Convert to RGB if necessary - if image.mode != 'RGB': - image = image.convert('RGB') - - image_array = np.array(image) - - # Get OCR instance - if language and language != OCR_LANGUAGE: - ocr = get_ocr(language) - else: - ocr = get_ocr() - - result = ocr.ocr(image_array, cls=True) - - # Process results - results = process_ocr_result(result) - - return OCRResponse(success=True, results=results) - - except Exception as e: - logger.error(f"OCR processing failed: {e}") - return OCRResponse(success=False, results=[], error=str(e)) - - -if __name__ == "__main__": - import uvicorn - uvicorn.run(app, host="0.0.0.0", port=5000) diff --git a/image_support_files/paddleocr_vl_server.py b/image_support_files/paddleocr_vl_server.py new file mode 100644 index 0000000..13ba044 --- /dev/null +++ b/image_support_files/paddleocr_vl_server.py @@ -0,0 +1,371 @@ +#!/usr/bin/env python3 +""" +PaddleOCR-VL FastAPI Server (CPU variant) +Provides 
OpenAI-compatible REST API for document parsing using PaddleOCR-VL +""" + +import os +import io +import base64 +import logging +import time +from typing import Optional, List, Any, Dict, Union + +from fastapi import FastAPI, HTTPException +from fastapi.responses import JSONResponse +from pydantic import BaseModel +import torch +from PIL import Image + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +# Environment configuration +SERVER_HOST = os.environ.get('SERVER_HOST', '0.0.0.0') +SERVER_PORT = int(os.environ.get('SERVER_PORT', '8000')) +MODEL_NAME = os.environ.get('MODEL_NAME', 'PaddlePaddle/PaddleOCR-VL') + +# Device configuration +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" +logger.info(f"Using device: {DEVICE}") + +# Task prompts for PaddleOCR-VL +TASK_PROMPTS = { + "ocr": "OCR:", + "table": "Table Recognition:", + "formula": "Formula Recognition:", + "chart": "Chart Recognition:", +} + +# Initialize FastAPI app +app = FastAPI( + title="PaddleOCR-VL Server", + description="OpenAI-compatible REST API for document parsing using PaddleOCR-VL", + version="1.0.0" +) + +# Global model instances +model = None +processor = None + + +# Request/Response models (OpenAI-compatible) +class ImageUrl(BaseModel): + url: str + + +class ContentItem(BaseModel): + type: str + text: Optional[str] = None + image_url: Optional[ImageUrl] = None + + +class Message(BaseModel): + role: str + content: Union[str, List[ContentItem]] + + +class ChatCompletionRequest(BaseModel): + model: str = "paddleocr-vl" + messages: List[Message] + temperature: Optional[float] = 0.0 + max_tokens: Optional[int] = 4096 + + +class Choice(BaseModel): + index: int + message: Message + finish_reason: str + + +class Usage(BaseModel): + prompt_tokens: int + completion_tokens: int + total_tokens: int + + +class ChatCompletionResponse(BaseModel): + id: str + object: str = "chat.completion" + created: int + model: str + choices: List[Choice] + usage: Usage + + +class HealthResponse(BaseModel): + status: str + model: str + device: str + + +def load_model(): + """Load the PaddleOCR-VL model and processor""" + global model, processor + + if model is not None: + return + + logger.info(f"Loading PaddleOCR-VL model: {MODEL_NAME}") + + from transformers import AutoModelForCausalLM, AutoProcessor + + # Load processor + processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True) + + # Load model with appropriate settings for CPU/GPU + if DEVICE == "cuda": + model = AutoModelForCausalLM.from_pretrained( + MODEL_NAME, + trust_remote_code=True, + torch_dtype=torch.bfloat16, + ).to(DEVICE).eval() + else: + # CPU mode - use float32 for compatibility + model = AutoModelForCausalLM.from_pretrained( + MODEL_NAME, + trust_remote_code=True, + torch_dtype=torch.float32, + low_cpu_mem_usage=True, + ).eval() + + logger.info("PaddleOCR-VL model loaded successfully") + + +def decode_image(image_source: str) -> Image.Image: + """Decode image from URL or base64""" + if image_source.startswith("data:"): + # Base64 encoded image + header, data = image_source.split(",", 1) + image_data = base64.b64decode(data) + return Image.open(io.BytesIO(image_data)).convert("RGB") + elif image_source.startswith("http://") or image_source.startswith("https://"): + # URL - fetch image + import httpx + response = httpx.get(image_source, timeout=30.0) + response.raise_for_status() + return 
Image.open(io.BytesIO(response.content)).convert("RGB") + else: + # Assume it's a file path or raw base64 + try: + image_data = base64.b64decode(image_source) + return Image.open(io.BytesIO(image_data)).convert("RGB") + except Exception: + # Not valid base64 - try as file path + return Image.open(image_source).convert("RGB") + + +def extract_image_and_text(content: Union[str, List[ContentItem]]) -> tuple: + """Extract image and text prompt from message content""" + if isinstance(content, str): + return None, content + + image = None + text = "" + + for item in content: + if item.type == "image_url" and item.image_url: + image = decode_image(item.image_url.url) + elif item.type == "text" and item.text: + text = item.text + + return image, text + + +def generate_response(image: Image.Image, prompt: str, max_tokens: int = 4096) -> str: + """Generate response using PaddleOCR-VL""" + load_model() + + messages = [ + { + "role": "user", + "content": [ + {"type": "image", "image": image}, + {"type": "text", "text": prompt}, + ] + } + ] + + inputs = processor.apply_chat_template( + messages, + tokenize=True, + add_generation_prompt=True, + return_dict=True, + return_tensors="pt" + ) + + if DEVICE == "cuda": + inputs = {k: v.to(DEVICE) for k, v in inputs.items()} + + with torch.inference_mode(): + outputs = model.generate( + **inputs, + max_new_tokens=max_tokens, + do_sample=False, + use_cache=True + ) + + response = processor.batch_decode(outputs, skip_special_tokens=True)[0] + + # Extract the assistant's response (after the prompt) + if "assistant" in response.lower(): + parts = response.split("assistant") + if len(parts) > 1: + response = parts[-1].strip() + + return response + + +@app.on_event("startup") +async def startup_event(): + """Pre-load the model on startup""" + logger.info("Pre-loading PaddleOCR-VL model...") + try: + load_model() + logger.info("Model pre-loaded successfully") + except Exception as e: + logger.error(f"Failed to pre-load model: {e}") + # Don't fail startup - model will be loaded on first request + + +@app.get("/health", response_model=HealthResponse) +async def health_check(): + """Health check endpoint""" + return HealthResponse( + status="healthy" if model is not None else "loading", + model=MODEL_NAME, + device=DEVICE + ) + + +@app.get("/v1/models") +async def list_models(): + """List available models (OpenAI-compatible)""" + return { + "object": "list", + "data": [ + { + "id": "paddleocr-vl", + "object": "model", + "created": int(time.time()), + "owned_by": "paddlepaddle" + } + ] + } + + +@app.post("/v1/chat/completions", response_model=ChatCompletionResponse) +async def chat_completions(request: ChatCompletionRequest): + """ + OpenAI-compatible chat completions endpoint for PaddleOCR-VL + + Supports tasks: + - "OCR:" - Text recognition + - "Table Recognition:" - Table extraction + - "Formula Recognition:" - Formula extraction + - "Chart Recognition:" - Chart extraction + """ + try: + # Get the last user message + user_message = None + for msg in reversed(request.messages): + if msg.role == "user": + user_message = msg + break + + if not user_message: + raise HTTPException(status_code=400, detail="No user message found") + + # Extract image and prompt + image, prompt = extract_image_and_text(user_message.content) + + if image is None: + raise HTTPException(status_code=400, detail="No image provided in message") + + # Default to OCR if no specific prompt + if not prompt or prompt.strip() == "": + prompt = "OCR:" + + logger.info(f"Processing request with prompt: {prompt[:50]}...") + + # Generate 
response + start_time = time.time() + response_text = generate_response(image, prompt, request.max_tokens or 4096) + elapsed = time.time() - start_time + + logger.info(f"Generated response in {elapsed:.2f}s ({len(response_text)} chars)") + + # Build OpenAI-compatible response + return ChatCompletionResponse( + id=f"chatcmpl-{int(time.time()*1000)}", + created=int(time.time()), + model=request.model, + choices=[ + Choice( + index=0, + message=Message(role="assistant", content=response_text), + finish_reason="stop" + ) + ], + usage=Usage( + prompt_tokens=100, # Approximate + completion_tokens=len(response_text) // 4, + total_tokens=100 + len(response_text) // 4 + ) + ) + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error processing request: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +# Legacy endpoint for compatibility with old PaddleOCR API +class LegacyOCRRequest(BaseModel): + image: str + task: Optional[str] = "ocr" + + +class LegacyOCRResponse(BaseModel): + success: bool + result: str + task: str + error: Optional[str] = None + + +@app.post("/ocr", response_model=LegacyOCRResponse) +async def legacy_ocr(request: LegacyOCRRequest): + """ + Legacy OCR endpoint for backwards compatibility + + Tasks: ocr, table, formula, chart + """ + try: + image = decode_image(request.image) + prompt = TASK_PROMPTS.get(request.task, TASK_PROMPTS["ocr"]) + + result = generate_response(image, prompt) + + return LegacyOCRResponse( + success=True, + result=result, + task=request.task + ) + except Exception as e: + logger.error(f"Legacy OCR error: {e}") + return LegacyOCRResponse( + success=False, + result="", + task=request.task, + error=str(e) + ) + + +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT) diff --git a/readme.hints.md b/readme.hints.md index e7abbc0..0ae6085 100644 --- a/readme.hints.md +++ b/readme.hints.md @@ -77,56 +77,73 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ CPU variant has longer `start-period` (120s) due to slower startup. -## PaddleOCR +## PaddleOCR-VL (Recommended) ### Overview -PaddleOCR is a standalone OCR service using PaddlePaddle's PP-OCRv4 model. It provides: +PaddleOCR-VL is a 0.9B parameter Vision-Language Model specifically optimized for document parsing. It replaces the older PP-Structure approach with native VLM understanding. 
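+
+A minimal Python sketch of calling the service (assumes a running container listening on `localhost:8000` and a local `page.png`; `httpx` is installed in both image variants):
+
+```python
+import base64
+
+import httpx
+
+# Encode one page image and request table recognition via the
+# OpenAI-compatible endpoint exposed by both variants.
+with open("page.png", "rb") as f:
+    image_b64 = base64.b64encode(f.read()).decode()
+
+response = httpx.post(
+    "http://localhost:8000/v1/chat/completions",
+    json={
+        "model": "paddleocr-vl",
+        "messages": [{
+            "role": "user",
+            "content": [
+                {"type": "image_url",
+                 "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
+                {"type": "text", "text": "Table Recognition:"},
+            ],
+        }],
+        "temperature": 0.0,
+        "max_tokens": 8192,
+    },
+    timeout=120.0,
+)
+print(response.json()["choices"][0]["message"]["content"])
+```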
-- Text detection and recognition -- Multi-language support -- FastAPI REST API -- GPU and CPU variants +**Key advantages over PP-Structure:** +- Native table understanding (no HTML parsing needed) +- 109 language support +- Better handling of complex multi-row tables +- Structured Markdown/JSON output ### Docker Images | Tag | Description | |-----|-------------| -| `paddleocr` | GPU variant (default) | -| `paddleocr-gpu` | GPU variant (alias) | -| `paddleocr-cpu` | CPU-only variant | +| `paddleocr-vl` | GPU variant using vLLM (recommended) | +| `paddleocr-vl-cpu` | CPU variant using transformers | -### API Endpoints +### API Endpoints (OpenAI-compatible) | Endpoint | Method | Description | |----------|--------|-------------| | `/health` | GET | Health check with model info | -| `/ocr` | POST | OCR with base64 image (JSON body) | -| `/ocr/upload` | POST | OCR with file upload (multipart form) | +| `/v1/models` | GET | List available models | +| `/v1/chat/completions` | POST | OpenAI-compatible chat completions | +| `/ocr` | POST | Legacy OCR endpoint | ### Request/Response Format -**POST /ocr (JSON)** +**POST /v1/chat/completions (OpenAI-compatible)** ```json { - "image": "", - "language": "en" // optional + "model": "paddleocr-vl", + "messages": [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}, + {"type": "text", "text": "Table Recognition:"} + ] + } + ], + "temperature": 0.0, + "max_tokens": 8192 } ``` -**POST /ocr/upload (multipart)** -- `img`: image file -- `language`: optional language code +**Task Prompts:** +- `"OCR:"` - Text recognition +- `"Table Recognition:"` - Table extraction (returns markdown) +- `"Formula Recognition:"` - Formula extraction +- `"Chart Recognition:"` - Chart extraction **Response** ```json { - "success": true, - "results": [ + "id": "chatcmpl-...", + "object": "chat.completion", + "choices": [ { - "text": "Invoice #12345", - "confidence": 0.98, - "box": [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] + "index": 0, + "message": { + "role": "assistant", + "content": "| Date | Description | Amount |\n|---|---|---|\n| 2021-06-01 | GITLAB INC | -119.96 |" + }, + "finish_reason": "stop" } ] } @@ -136,19 +153,16 @@ PaddleOCR is a standalone OCR service using PaddlePaddle's PP-OCRv4 model. 
It pr | Variable | Default | Description | |----------|---------|-------------| -| `OCR_LANGUAGE` | `en` | Default language for OCR | -| `SERVER_PORT` | `5000` | Server port | -| `SERVER_HOST` | `0.0.0.0` | Server host | -| `CUDA_VISIBLE_DEVICES` | (auto) | Set to `-1` for CPU-only | +| `MODEL_NAME` | `PaddlePaddle/PaddleOCR-VL` | Model to load | +| `HOST` | `0.0.0.0` | Server host | +| `PORT` | `8000` | Server port | +| `MAX_BATCHED_TOKENS` | `16384` | vLLM max batch tokens | +| `GPU_MEMORY_UTILIZATION` | `0.9` | GPU memory usage (0-1) | ### Performance -- **GPU**: ~1-3 seconds per page -- **CPU**: ~10-30 seconds per page - -### Supported Languages - -Common language codes: `en` (English), `ch` (Chinese), `de` (German), `fr` (French), `es` (Spanish), `ja` (Japanese), `ko` (Korean) +- **GPU (vLLM)**: ~2-5 seconds per page +- **CPU**: ~30-60 seconds per page --- @@ -193,6 +207,43 @@ npmci docker build npmci docker push code.foss.global ``` +## Multi-Pass Extraction Strategy + +The bank statement extraction uses a dual-VLM consensus approach: + +### Architecture: Dual-VLM Consensus + +| VLM | Model | Purpose | +|-----|-------|---------| +| **MiniCPM-V 4.5** | 8B params | Primary visual extraction | +| **PaddleOCR-VL** | 0.9B params | Table-specialized extraction | + +### Extraction Strategy + +1. **Pass 1**: MiniCPM-V visual extraction (images → JSON) +2. **Pass 2**: PaddleOCR-VL table recognition (images → markdown → JSON) +3. **Consensus**: If Pass 1 == Pass 2 → Done (fast path) +4. **Pass 3+**: MiniCPM-V visual if no consensus + +### Why Dual-VLM Works + +- **Different architectures**: Two independent models cross-check each other +- **Specialized strengths**: PaddleOCR-VL optimized for tables, MiniCPM-V for general vision +- **No structure loss**: Both VLMs see the original images directly +- **Fast consensus**: Most documents complete in 2 passes when VLMs agree + +### Comparison vs Old PP-Structure Approach + +| Approach | Bank Statement Result | Issue | +|----------|----------------------|-------| +| MiniCPM-V Visual | 28 transactions ✓ | - | +| PP-Structure HTML + Visual | 13 transactions ✗ | HTML merged rows incorrectly | +| PaddleOCR-VL Table | 28 transactions ✓ | Native table understanding | + +**Key insight**: PP-Structure's HTML output loses structure for complex tables. PaddleOCR-VL's native VLM approach maintains table integrity. + +--- + ## Related Resources - [Ollama Documentation](https://ollama.ai/docs) diff --git a/test/test.node.ts b/test/test.node.ts index 00fa868..5025ce2 100644 --- a/test/test.node.ts +++ b/test/test.node.ts @@ -4,12 +4,16 @@ import * as path from 'path'; import { execSync } from 'child_process'; import * as os from 'os'; +// Service URLs const OLLAMA_URL = 'http://localhost:11434'; -const MODEL = 'openbmb/minicpm-v4.5:q8_0'; -const PADDLEOCR_URL = 'http://localhost:5000'; +const PADDLEOCR_VL_URL = 'http://localhost:8000'; -// Prompt for visual extraction (with images) -const VISUAL_EXTRACT_PROMPT = `/nothink +// Models +const MINICPM_MODEL = 'openbmb/minicpm-v4.5:q8_0'; +const PADDLEOCR_VL_MODEL = 'paddleocr-vl'; + +// Prompt for MiniCPM-V visual extraction +const MINICPM_EXTRACT_PROMPT = `/nothink You are a bank statement parser. Extract EVERY transaction from the table. Read the Amount column carefully: @@ -21,9 +25,12 @@ For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47} Do not skip any rows. 
Return ONLY the JSON array, no explanation.`; -// Prompt for OCR-only extraction (no images) -const OCR_EXTRACT_PROMPT = `/nothink -You are a bank statement parser. Extract EVERY transaction from the OCR text below. +// Prompt for PaddleOCR-VL table extraction +const PADDLEOCR_VL_TABLE_PROMPT = `Table Recognition:`; + +// Post-processing prompt to convert PaddleOCR-VL output to JSON +const PADDLEOCR_VL_CONVERT_PROMPT = `/nothink +Convert the following bank statement table data to JSON. Read the Amount values carefully: - "- 21,47 €" means DEBIT, output as: -21.47 @@ -32,48 +39,12 @@ Read the Amount values carefully: For each transaction output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47} -Do not skip any transactions. Return ONLY the JSON array, no explanation.`; +Return ONLY the JSON array, no explanation. -/** - * Build prompt for OCR-only extraction (no images) - */ -function buildOcrOnlyPrompt(ocrText: string): string { - // Limit OCR text to prevent context overflow - const maxOcrLength = 12000; - const truncatedOcr = ocrText.length > maxOcrLength - ? ocrText.substring(0, maxOcrLength) + '\n... (truncated)' - : ocrText; - - return `${OCR_EXTRACT_PROMPT} - -OCR text from bank statement: +Table data: --- -${truncatedOcr} +{TABLE_DATA} ---`; -} - -/** - * Extract OCR text from an image using PaddleOCR - */ -async function extractOcrText(imageBase64: string): Promise { - try { - const response = await fetch(`${PADDLEOCR_URL}/ocr`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ image: imageBase64 }), - }); - - if (!response.ok) return ''; - - const data = await response.json(); - if (data.success && data.results) { - return data.results.map((r: { text: string }) => r.text).join('\n'); - } - } catch { - // PaddleOCR unavailable - } - return ''; -} interface ITransaction { date: string; @@ -94,7 +65,7 @@ function convertPdfToImages(pdfPath: string): string[] { { stdio: 'pipe' } ); - const files = fs.readdirSync(tempDir).filter((f) => f.endsWith('.png')).sort(); + const files = fs.readdirSync(tempDir).filter((f: string) => f.endsWith('.png')).sort(); const images: string[] = []; for (const file of files) { @@ -110,12 +81,12 @@ function convertPdfToImages(pdfPath: string): string[] { } /** - * Visual extraction pass (with images) + * Extract using MiniCPM-V via Ollama */ -async function extractVisual(images: string[], passLabel: string): Promise { +async function extractWithMiniCPM(images: string[], passLabel: string): Promise { const payload = { - model: MODEL, - prompt: VISUAL_EXTRACT_PROMPT, + model: MINICPM_MODEL, + prompt: MINICPM_EXTRACT_PROMPT, images, stream: true, options: { @@ -124,31 +95,6 @@ async function extractVisual(images: string[], passLabel: string): Promise { - const payload = { - model: MODEL, - prompt: buildOcrOnlyPrompt(ocrText), - stream: true, - options: { - num_predict: 16384, - temperature: 0.1, - }, - }; - - return doExtraction(payload, passLabel); -} - -/** - * Common extraction logic - */ -async function doExtraction(payload: object, passLabel: string): Promise { - const response = await fetch(`${OLLAMA_URL}/api/generate`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, @@ -168,7 +114,7 @@ async function doExtraction(payload: object, passLabel: string): Promise { + const payload = { + model: PADDLEOCR_VL_MODEL, + messages: [ + { + role: 'user', + content: [ + { + type: 'image_url', + image_url: { url: `data:image/png;base64,${imageBase64}` }, + }, + { + type: 'text', + text: 
PADDLEOCR_VL_TABLE_PROMPT, + }, + ], + }, + ], + temperature: 0.0, + max_tokens: 8192, + }; + + const response = await fetch(`${PADDLEOCR_VL_URL}/v1/chat/completions`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(payload), + }); + + if (!response.ok) { + const text = await response.text(); + throw new Error(`PaddleOCR-VL API error: ${response.status} - ${text}`); + } + + const data = await response.json(); + return data.choices?.[0]?.message?.content || ''; +} + +/** + * Convert PaddleOCR-VL table output to transactions using MiniCPM-V + */ +async function convertTableToTransactions( + tableData: string, + passLabel: string +): Promise { + const prompt = PADDLEOCR_VL_CONVERT_PROMPT.replace('{TABLE_DATA}', tableData); + + const payload = { + model: MINICPM_MODEL, + prompt, + stream: true, + options: { + num_predict: 16384, + temperature: 0.1, + }, + }; + + const response = await fetch(`${OLLAMA_URL}/api/generate`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(payload), + }); + + if (!response.ok) { + throw new Error(`Ollama API error: ${response.status}`); + } + + const reader = response.body?.getReader(); + if (!reader) { + throw new Error('No response body'); + } + + const decoder = new TextDecoder(); + let fullText = ''; + + console.log(`[${passLabel}] Converting table data to JSON...`); + + while (true) { + const { done, value } = await reader.read(); + if (done) break; + + const chunk = decoder.decode(value, { stream: true }); + const lines = chunk.split('\n').filter((l) => l.trim()); + + for (const line of lines) { + try { + const json = JSON.parse(line); + if (json.response) { + fullText += json.response; + } + } catch { + // Skip invalid JSON lines + } + } + } + + const startIdx = fullText.indexOf('['); + const endIdx = fullText.lastIndexOf(']') + 1; + + if (startIdx < 0 || endIdx <= startIdx) { + throw new Error('No JSON array found in response'); + } + + return JSON.parse(fullText.substring(startIdx, endIdx)); +} + +/** + * Extract using PaddleOCR-VL (table recognition) + conversion + */ +async function extractWithPaddleOCRVL( + images: string[], + passLabel: string +): Promise { + console.log(`[${passLabel}] Extracting tables with PaddleOCR-VL...`); + + // Extract table data from each page + const tableDataParts: string[] = []; + for (let i = 0; i < images.length; i++) { + console.log(`[${passLabel}] Processing page ${i + 1}/${images.length}...`); + const tableData = await extractTableWithPaddleOCRVL(images[i]); + if (tableData.trim()) { + tableDataParts.push(`--- Page ${i + 1} ---\n${tableData}`); + } + } + + const combinedTableData = tableDataParts.join('\n\n'); + console.log(`[${passLabel}] Got ${combinedTableData.length} chars of table data`); + + // Convert to transactions + return convertTableToTransactions(combinedTableData, passLabel); +} + /** * Create a hash of transactions for comparison */ @@ -225,10 +304,31 @@ function hashTransactions(transactions: ITransaction[]): string { } /** - * Extract with majority voting - run until 2 passes match - * Strategy: Pass 1 = Visual (images), Pass 2 = OCR-only (text), Pass 3+ = Visual + * Check if PaddleOCR-VL service is available */ -async function extractWithConsensus(images: string[], maxPasses: number = 5): Promise { +async function isPaddleOCRVLAvailable(): Promise { + try { + const response = await fetch(`${PADDLEOCR_VL_URL}/health`, { + method: 'GET', + signal: AbortSignal.timeout(5000), + }); + return response.ok; + } catch { + 
return false; + } +} + +/** + * Extract with dual-VLM consensus + * Strategy: + * Pass 1 = MiniCPM-V visual extraction + * Pass 2 = PaddleOCR-VL table recognition (if available) + * Pass 3+ = MiniCPM-V visual (fallback) + */ +async function extractWithConsensus( + images: string[], + maxPasses: number = 5 +): Promise { const results: Array<{ transactions: ITransaction[]; hash: string }> = []; const hashCounts: Map = new Map(); @@ -236,59 +336,48 @@ async function extractWithConsensus(images: string[], maxPasses: number = 5): Pr const hash = hashTransactions(transactions); results.push({ transactions, hash }); hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1); - console.log(`[${passLabel}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)`); + console.log( + `[${passLabel}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)` + ); return hashCounts.get(hash)!; }; - // Run Pass 1 (Visual) in parallel with OCR extraction - let ocrText = ''; - const pass1Promise = extractVisual(images, 'Pass 1 Visual').catch((err) => ({ error: err })); - - // Extract OCR from all pages - const ocrPromise = (async () => { - const ocrTexts: string[] = []; - for (let i = 0; i < images.length; i++) { - const pageOcr = await extractOcrText(images[i]); - if (pageOcr) { - ocrTexts.push(`--- Page ${i + 1} ---\n${pageOcr}`); - } - } - ocrText = ocrTexts.join('\n\n'); - if (ocrText) { - console.log(`[OCR] Extracted text from ${ocrTexts.length} page(s)`); - } - return ocrText; - })(); - - // Wait for Pass 1 and OCR to complete - const [pass1Result] = await Promise.all([pass1Promise, ocrPromise]); - - // Process Pass 1 result - if ('error' in pass1Result) { - console.log(`[Pass 1] Error: ${(pass1Result as { error: unknown }).error}`); + // Check if PaddleOCR-VL is available + const paddleOCRVLAvailable = await isPaddleOCRVLAvailable(); + if (paddleOCRVLAvailable) { + console.log('[Setup] PaddleOCR-VL service available - using dual-VLM consensus'); } else { - addResult(pass1Result as ITransaction[], 'Pass 1 Visual'); + console.log('[Setup] PaddleOCR-VL not available - using MiniCPM-V only'); } - // Pass 2: OCR-only (no images) - faster, different approach - if (ocrText) { + // Pass 1: MiniCPM-V visual extraction + try { + const pass1Result = await extractWithMiniCPM(images, 'Pass 1 MiniCPM-V'); + addResult(pass1Result, 'Pass 1 MiniCPM-V'); + } catch (err) { + console.log(`[Pass 1] Error: ${err}`); + } + + // Pass 2: PaddleOCR-VL table recognition (if available) + if (paddleOCRVLAvailable) { try { - const pass2Result = await extractFromOcr(ocrText, 'Pass 2 OCR-only'); - const count = addResult(pass2Result, 'Pass 2 OCR-only'); + const pass2Result = await extractWithPaddleOCRVL(images, 'Pass 2 PaddleOCR-VL'); + const count = addResult(pass2Result, 'Pass 2 PaddleOCR-VL'); if (count >= 2) { - console.log(`[Consensus] Visual and OCR extractions match!`); + console.log('[Consensus] MiniCPM-V and PaddleOCR-VL extractions match!'); return pass2Result; } } catch (err) { - console.log(`[Pass 2 OCR-only] Error: ${err}`); + console.log(`[Pass 2 PaddleOCR-VL] Error: ${err}`); } } - // Continue with visual passes 3+ if no consensus yet - for (let pass = 3; pass <= maxPasses; pass++) { + // Pass 3+: Continue with MiniCPM-V visual passes + const startPass = paddleOCRVLAvailable ? 
3 : 2; + for (let pass = startPass; pass <= maxPasses; pass++) { try { - const transactions = await extractVisual(images, `Pass ${pass} Visual`); - const count = addResult(transactions, `Pass ${pass} Visual`); + const transactions = await extractWithMiniCPM(images, `Pass ${pass} MiniCPM-V`); + const count = addResult(transactions, `Pass ${pass} MiniCPM-V`); if (count >= 2) { console.log(`[Consensus] Reached after ${pass} passes`); @@ -368,7 +457,7 @@ function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: strin } const files = fs.readdirSync(testDir); - const pdfFiles = files.filter((f) => f.endsWith('.pdf')); + const pdfFiles = files.filter((f: string) => f.endsWith('.pdf')); const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = []; for (const pdf of pdfFiles) { @@ -402,6 +491,13 @@ tap.test('should have MiniCPM-V 4.5 model loaded', async () => { expect(modelNames.some((name: string) => name.includes('minicpm-v4.5'))).toBeTrue(); }); +tap.test('should check PaddleOCR-VL availability', async () => { + const available = await isPaddleOCRVLAvailable(); + console.log(`PaddleOCR-VL available: ${available}`); + // This test passes regardless - PaddleOCR-VL is optional + expect(true).toBeTrue(); +}); + // Dynamic test for each PDF/JSON pair const testCases = findTestCases(); for (const testCase of testCases) { @@ -416,7 +512,7 @@ for (const testCase of testCases) { const images = convertPdfToImages(testCase.pdfPath); console.log(`Converted: ${images.length} pages\n`); - // Extract with consensus voting + // Extract with dual-VLM consensus const extracted = await extractWithConsensus(images); console.log(`\nFinal: ${extracted.length} transactions`); diff --git a/test/test.paddleocr.ts b/test/test.paddleocr.ts deleted file mode 100644 index 9fe6fb2..0000000 --- a/test/test.paddleocr.ts +++ /dev/null @@ -1,258 +0,0 @@ -import { tap, expect } from '@git.zone/tstest/tapbundle'; -import * as fs from 'fs'; -import * as path from 'path'; -import { execSync } from 'child_process'; -import * as os from 'os'; - -const PADDLEOCR_URL = 'http://localhost:5000'; - -interface IOCRResult { - text: string; - confidence: number; - box: number[][]; -} - -interface IOCRResponse { - success: boolean; - results: IOCRResult[]; - error?: string; -} - -interface IHealthResponse { - status: string; - model: string; - language: string; - gpu_enabled: boolean; -} - -/** - * Convert PDF first page to PNG using ImageMagick - */ -function convertPdfToImage(pdfPath: string): string { - const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-')); - const outputPath = path.join(tempDir, 'page.png'); - - try { - execSync( - `convert -density 200 -quality 90 "${pdfPath}[0]" -background white -alpha remove "${outputPath}"`, - { stdio: 'pipe' } - ); - - const imageData = fs.readFileSync(outputPath); - return imageData.toString('base64'); - } finally { - fs.rmSync(tempDir, { recursive: true, force: true }); - } -} - -/** - * Create a simple test image with text using ImageMagick - */ -function createTestImage(text: string): string { - const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'test-image-')); - const outputPath = path.join(tempDir, 'test.png'); - - try { - execSync( - `convert -size 400x100 xc:white -font DejaVu-Sans -pointsize 24 -fill black -gravity center -annotate 0 "${text}" "${outputPath}"`, - { stdio: 'pipe' } - ); - - const imageData = fs.readFileSync(outputPath); - return imageData.toString('base64'); - } finally { - fs.rmSync(tempDir, { recursive: true, force: 
true }); - } -} - -// Health check test -tap.test('should respond to health check', async () => { - const response = await fetch(`${PADDLEOCR_URL}/health`); - expect(response.ok).toBeTrue(); - - const data: IHealthResponse = await response.json(); - expect(data.status).toEqual('healthy'); - expect(data.model).toEqual('PP-OCRv4'); - expect(data.language).toBeTypeofString(); - expect(data.gpu_enabled).toBeTypeofBoolean(); - - console.log(`PaddleOCR Status: ${data.status}`); - console.log(` Model: ${data.model}`); - console.log(` Language: ${data.language}`); - console.log(` GPU Enabled: ${data.gpu_enabled}`); -}); - -// Base64 OCR test -tap.test('should perform OCR on base64 image', async () => { - // Create a test image with known text - const testText = 'Hello World 12345'; - console.log(`Creating test image with text: "${testText}"`); - - const imageBase64 = createTestImage(testText); - - const response = await fetch(`${PADDLEOCR_URL}/ocr`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ image: imageBase64 }), - }); - - expect(response.ok).toBeTrue(); - - const data: IOCRResponse = await response.json(); - expect(data.success).toBeTrue(); - expect(data.results).toBeArray(); - - const extractedText = data.results.map((r) => r.text).join(' '); - console.log(`Extracted text: "${extractedText}"`); - - // Check that we got some text back - expect(data.results.length).toBeGreaterThan(0); - - // Check that at least some of the expected text was found - const normalizedExtracted = extractedText.toLowerCase().replace(/\s+/g, ''); - const normalizedExpected = testText.toLowerCase().replace(/\s+/g, ''); - const hasPartialMatch = - normalizedExtracted.includes('hello') || - normalizedExtracted.includes('world') || - normalizedExtracted.includes('12345'); - - expect(hasPartialMatch).toBeTrue(); -}); - -// File upload OCR test -tap.test('should perform OCR via file upload', async () => { - const testText = 'Invoice Number 98765'; - console.log(`Creating test image with text: "${testText}"`); - - const imageBase64 = createTestImage(testText); - const imageBuffer = Buffer.from(imageBase64, 'base64'); - - const formData = new FormData(); - const blob = new Blob([imageBuffer], { type: 'image/png' }); - formData.append('img', blob, 'test.png'); - - const response = await fetch(`${PADDLEOCR_URL}/ocr/upload`, { - method: 'POST', - body: formData, - }); - - expect(response.ok).toBeTrue(); - - const data: IOCRResponse = await response.json(); - expect(data.success).toBeTrue(); - expect(data.results).toBeArray(); - - const extractedText = data.results.map((r) => r.text).join(' '); - console.log(`Extracted text: "${extractedText}"`); - - // Check that we got some text back - expect(data.results.length).toBeGreaterThan(0); -}); - -// OCR result structure test -tap.test('should return proper OCR result structure', async () => { - const testText = 'Test 123'; - const imageBase64 = createTestImage(testText); - - const response = await fetch(`${PADDLEOCR_URL}/ocr`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ image: imageBase64 }), - }); - - const data: IOCRResponse = await response.json(); - - if (data.results.length > 0) { - const result = data.results[0]; - - // Check result has required fields - expect(result.text).toBeTypeofString(); - expect(result.confidence).toBeTypeofNumber(); - expect(result.box).toBeArray(); - - // Check bounding box structure (4 points, each with x,y) - expect(result.box.length).toEqual(4); - 
for (const point of result.box) { - expect(point.length).toEqual(2); - expect(point[0]).toBeTypeofNumber(); - expect(point[1]).toBeTypeofNumber(); - } - - // Confidence should be between 0 and 1 - expect(result.confidence).toBeGreaterThan(0); - expect(result.confidence).toBeLessThanOrEqual(1); - - console.log(`Result structure valid:`); - console.log(` Text: "${result.text}"`); - console.log(` Confidence: ${(result.confidence * 100).toFixed(1)}%`); - console.log(` Box: ${JSON.stringify(result.box)}`); - } -}); - -// Test with actual invoice if available -const invoiceDir = path.join(process.cwd(), '.nogit/invoices'); -if (fs.existsSync(invoiceDir)) { - const pdfFiles = fs.readdirSync(invoiceDir).filter((f) => f.endsWith('.pdf')); - - if (pdfFiles.length > 0) { - const testPdf = pdfFiles[0]; - tap.test(`should extract text from invoice: ${testPdf}`, async () => { - const pdfPath = path.join(invoiceDir, testPdf); - console.log(`Converting ${testPdf} to image...`); - - const imageBase64 = convertPdfToImage(pdfPath); - console.log(`Image size: ${(imageBase64.length / 1024).toFixed(1)} KB`); - - const startTime = Date.now(); - - const response = await fetch(`${PADDLEOCR_URL}/ocr`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ image: imageBase64 }), - }); - - const endTime = Date.now(); - const elapsedMs = endTime - startTime; - - expect(response.ok).toBeTrue(); - - const data: IOCRResponse = await response.json(); - expect(data.success).toBeTrue(); - - console.log(`OCR completed in ${(elapsedMs / 1000).toFixed(2)}s`); - console.log(`Found ${data.results.length} text regions`); - - // Print first 10 results - const preview = data.results.slice(0, 10); - console.log(`\nFirst ${preview.length} results:`); - for (const result of preview) { - console.log(` [${(result.confidence * 100).toFixed(0)}%] ${result.text}`); - } - - if (data.results.length > 10) { - console.log(` ... and ${data.results.length - 10} more`); - } - - // Should find text in an invoice - expect(data.results.length).toBeGreaterThan(5); - }); - } -} - -// Error handling test -tap.test('should handle invalid base64 gracefully', async () => { - const response = await fetch(`${PADDLEOCR_URL}/ocr`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ image: 'not-valid-base64!!!' }), - }); - - const data: IOCRResponse = await response.json(); - - // Should return success: false with error message - expect(data.success).toBeFalse(); - expect(data.error).toBeTypeofString(); - console.log(`Error handling works: ${data.error}`); -}); - -export default tap.start();
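
For callers migrating off the removed PaddleOCR API exercised by the deleted `test.paddleocr.ts`, the new server keeps a legacy `/ocr` route (see `paddleocr_vl_server.py` above). It accepts a base64 image plus a task name and returns a single `result` string rather than per-line boxes. A minimal Python sketch, assuming the CPU container on `localhost:8000` and a hypothetical sample image `statement.png`:

```python
import base64

import httpx

# Legacy-compatible call: one base64 image plus a task name
# ("ocr", "table", "formula", or "chart"). The response carries
# a single recognized string in `result`, not per-line boxes.
with open("statement.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode()

resp = httpx.post(
    "http://localhost:8000/ocr",
    json={"image": image_b64, "task": "table"},
    timeout=300.0,  # CPU inference can take ~30-60s per page
)
data = resp.json()
if data["success"]:
    print(data["result"])
else:
    print(f"OCR failed: {data['error']}")
```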