From 0482c35b69def4688bcd8e72e74d0283339d1fdf Mon Sep 17 00:00:00 2001 From: Juergen Kunz Date: Sat, 17 Jan 2026 16:57:26 +0000 Subject: [PATCH] feat(paddleocr-vl): add PaddleOCR-VL GPU Dockerfile, pin vllm, update CPU image deps, and improve entrypoint and tests --- Dockerfile_paddleocr_vl | 6 +- Dockerfile_paddleocr_vl_cpu | 7 +- Dockerfile_paddleocr_vl_gpu | 71 +++++++++++++++++++ changelog.md | 10 +++ .../paddleocr-vl-entrypoint.sh | 34 ++++++--- package.json | 9 ++- pnpm-lock.yaml | 4 ++ ...ode.ts => test.bankstatements.combined.ts} | 0 ....invoices.ts => test.invoices.combined.ts} | 25 ++++--- 9 files changed, 140 insertions(+), 26 deletions(-) create mode 100644 Dockerfile_paddleocr_vl_gpu rename test/{test.node.ts => test.bankstatements.combined.ts} (100%) rename test/{test.invoices.ts => test.invoices.combined.ts} (95%) diff --git a/Dockerfile_paddleocr_vl b/Dockerfile_paddleocr_vl index 4be04e7..af995ab 100644 --- a/Dockerfile_paddleocr_vl +++ b/Dockerfile_paddleocr_vl @@ -39,11 +39,9 @@ RUN pip install --no-cache-dir --upgrade pip && \ torchvision \ --index-url https://download.pytorch.org/whl/cu124 -# Install vLLM (nightly for PaddleOCR-VL support) +# Install vLLM 0.11.1 (first stable release with PaddleOCR-VL support) RUN pip install --no-cache-dir \ - vllm \ - --pre \ - --extra-index-url https://wheels.vllm.ai/nightly \ + vllm==0.11.1 \ --extra-index-url https://download.pytorch.org/whl/cu124 # Install additional dependencies diff --git a/Dockerfile_paddleocr_vl_cpu b/Dockerfile_paddleocr_vl_cpu index 206c615..c5439ca 100644 --- a/Dockerfile_paddleocr_vl_cpu +++ b/Dockerfile_paddleocr_vl_cpu @@ -28,7 +28,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # Install Python dependencies RUN pip install --no-cache-dir --upgrade pip && \ pip install --no-cache-dir \ - torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu && \ + torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cpu && \ pip install --no-cache-dir \ transformers \ accelerate \ @@ -37,7 +37,10 @@ RUN pip install --no-cache-dir --upgrade pip && \ fastapi \ uvicorn[standard] \ python-multipart \ - httpx + httpx \ + protobuf \ + sentencepiece \ + einops # Copy server files COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py diff --git a/Dockerfile_paddleocr_vl_gpu b/Dockerfile_paddleocr_vl_gpu new file mode 100644 index 0000000..7f9c821 --- /dev/null +++ b/Dockerfile_paddleocr_vl_gpu @@ -0,0 +1,71 @@ +# PaddleOCR-VL GPU Variant (Transformers-based, not vLLM) +# Vision-Language Model for document parsing using transformers with CUDA +FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04 + +LABEL maintainer="Task Venture Capital GmbH " +LABEL description="PaddleOCR-VL 0.9B GPU - Vision-Language Model using transformers" +LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai" + +# Environment configuration +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONUNBUFFERED=1 +ENV HF_HOME=/root/.cache/huggingface +ENV SERVER_PORT=8000 +ENV SERVER_HOST=0.0.0.0 + +# Set working directory +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3.11 \ + python3.11-venv \ + python3.11-dev \ + python3-pip \ + libgl1-mesa-glx \ + libglib2.0-0 \ + libgomp1 \ + curl \ + git \ + && rm -rf /var/lib/apt/lists/* \ + && update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 + 
+# Create and activate virtual environment +RUN python -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +# Install PyTorch with CUDA support +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir \ + torch==2.5.1 \ + torchvision \ + --index-url https://download.pytorch.org/whl/cu124 + +# Install Python dependencies (transformers-based, not vLLM) +RUN pip install --no-cache-dir \ + transformers \ + accelerate \ + safetensors \ + pillow \ + fastapi \ + uvicorn[standard] \ + python-multipart \ + httpx \ + protobuf \ + sentencepiece \ + einops + +# Copy server files (same as CPU variant - it auto-detects CUDA) +COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py +COPY image_support_files/paddleocr-vl-cpu-entrypoint.sh /usr/local/bin/paddleocr-vl-entrypoint.sh +RUN chmod +x /usr/local/bin/paddleocr-vl-entrypoint.sh + +# Expose API port +EXPOSE 8000 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=300s --retries=3 \ + CMD curl -f http://localhost:8000/health || exit 1 + +ENTRYPOINT ["/usr/local/bin/paddleocr-vl-entrypoint.sh"] diff --git a/changelog.md b/changelog.md index e49655a..f990640 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,15 @@ # Changelog +## 2026-01-17 - 1.5.0 - feat(paddleocr-vl) +add PaddleOCR-VL GPU Dockerfile, pin vllm, update CPU image deps, and improve entrypoint and tests + +- Add a new GPU Dockerfile for PaddleOCR-VL (transformers-based) with CUDA support, healthcheck, and entrypoint. +- Pin vllm to 0.11.1 in Dockerfile_paddleocr_vl to use the first stable release with PaddleOCR-VL support. +- Update CPU image: add torchvision==0.20.1 and extra Python deps (protobuf, sentencepiece, einops) required by the transformers-based server. +- Rewrite paddleocr-vl-entrypoint.sh to build vllm args array, add MAX_MODEL_LEN and ENFORCE_EAGER env vars, include --limit-mm-per-prompt and optional --enforce-eager, and switch to exec vllm with constructed args. +- Update tests to use the OpenAI-compatible PaddleOCR-VL chat completions API (/v1/chat/completions) with image+text message payload and model 'paddleocr-vl'. +- Add @types/node to package.json dependencies and tidy devDependencies ordering. + ## 2026-01-16 - 1.4.0 - feat(invoices) add hybrid OCR + vision invoice/document parsing with PaddleOCR, consensus voting, and prompt/test refactors diff --git a/image_support_files/paddleocr-vl-entrypoint.sh b/image_support_files/paddleocr-vl-entrypoint.sh index 1978b9a..0b5e661 100644 --- a/image_support_files/paddleocr-vl-entrypoint.sh +++ b/image_support_files/paddleocr-vl-entrypoint.sh @@ -11,12 +11,16 @@ HOST="${HOST:-0.0.0.0}" PORT="${PORT:-8000}" MAX_BATCHED_TOKENS="${MAX_BATCHED_TOKENS:-16384}" GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.9}" +MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}" +ENFORCE_EAGER="${ENFORCE_EAGER:-false}" echo "Model: ${MODEL_NAME}" echo "Host: ${HOST}" echo "Port: ${PORT}" echo "Max batched tokens: ${MAX_BATCHED_TOKENS}" echo "GPU memory utilization: ${GPU_MEMORY_UTILIZATION}" +echo "Max model length: ${MAX_MODEL_LEN}" +echo "Enforce eager: ${ENFORCE_EAGER}" echo "" # Check GPU availability @@ -31,13 +35,25 @@ fi echo "Starting vLLM server..." 
echo "===================================" -# Start vLLM server with PaddleOCR-VL -exec vllm serve "${MODEL_NAME}" \ - --trust-remote-code \ - --host "${HOST}" \ - --port "${PORT}" \ - --max-num-batched-tokens "${MAX_BATCHED_TOKENS}" \ - --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}" \ - --no-enable-prefix-caching \ - --mm-processor-cache-gb 0 \ +# Build vLLM command +VLLM_ARGS=( + serve "${MODEL_NAME}" + --trust-remote-code + --host "${HOST}" + --port "${PORT}" + --max-num-batched-tokens "${MAX_BATCHED_TOKENS}" + --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}" + --max-model-len "${MAX_MODEL_LEN}" + --no-enable-prefix-caching + --mm-processor-cache-gb 0 --served-model-name "paddleocr-vl" + --limit-mm-per-prompt '{"image": 1}' +) + +# Add enforce-eager if enabled (disables CUDA graphs, saves memory) +if [ "${ENFORCE_EAGER}" = "true" ]; then + VLLM_ARGS+=(--enforce-eager) +fi + +# Start vLLM server with PaddleOCR-VL +exec vllm "${VLLM_ARGS[@]}" diff --git a/package.json b/package.json index 592a1a2..4d292b7 100644 --- a/package.json +++ b/package.json @@ -13,8 +13,8 @@ "test": "tstest test/ --verbose" }, "devDependencies": { - "@git.zone/tstest": "^1.0.90", - "@git.zone/tsrun": "^1.3.3" + "@git.zone/tsrun": "^1.3.3", + "@git.zone/tstest": "^1.0.90" }, "repository": { "type": "git", @@ -28,5 +28,8 @@ "minicpm", "ollama", "multimodal" - ] + ], + "dependencies": { + "@types/node": "^25.0.9" + } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 66f2a8b..2b1eeac 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -7,6 +7,10 @@ settings: importers: .: + dependencies: + '@types/node': + specifier: ^25.0.9 + version: 25.0.9 devDependencies: '@git.zone/tsrun': specifier: ^1.3.3 diff --git a/test/test.node.ts b/test/test.bankstatements.combined.ts similarity index 100% rename from test/test.node.ts rename to test/test.bankstatements.combined.ts diff --git a/test/test.invoices.ts b/test/test.invoices.combined.ts similarity index 95% rename from test/test.invoices.ts rename to test/test.invoices.combined.ts index e57e644..8ef8cdc 100644 --- a/test/test.invoices.ts +++ b/test/test.invoices.combined.ts @@ -6,7 +6,7 @@ import * as os from 'os'; const OLLAMA_URL = 'http://localhost:11434'; const MODEL = 'openbmb/minicpm-v4.5:q8_0'; -const PADDLEOCR_URL = 'http://localhost:5000'; +const PADDLEOCR_VL_URL = 'http://localhost:8000'; interface IInvoice { invoice_number: string; @@ -19,24 +19,33 @@ interface IInvoice { } /** - * Extract OCR text from an image using PaddleOCR + * Extract OCR text from an image using PaddleOCR-VL (OpenAI-compatible API) */ async function extractOcrText(imageBase64: string): Promise { try { - const response = await fetch(`${PADDLEOCR_URL}/ocr`, { + const response = await fetch(`${PADDLEOCR_VL_URL}/v1/chat/completions`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ image: imageBase64 }), + body: JSON.stringify({ + model: 'paddleocr-vl', + messages: [{ + role: 'user', + content: [ + { type: 'image_url', image_url: { url: `data:image/png;base64,${imageBase64}` } }, + { type: 'text', text: 'OCR:' } + ] + }], + temperature: 0.0, + max_tokens: 4096 + }), }); if (!response.ok) return ''; const data = await response.json(); - if (data.success && data.results) { - return data.results.map((r: { text: string }) => r.text).join('\n'); - } + return data.choices?.[0]?.message?.content || ''; } catch { - // PaddleOCR unavailable + // PaddleOCR-VL unavailable } return ''; }