feat(paddleocr-vl): add PaddleOCR-VL GPU Dockerfile, pin vllm, update CPU image deps, and improve entrypoint and tests

This commit is contained in:
2026-01-17 16:57:26 +00:00
parent 15ac1fcf67
commit 0482c35b69
9 changed files with 140 additions and 26 deletions

View File

@@ -39,11 +39,9 @@ RUN pip install --no-cache-dir --upgrade pip && \
torchvision \
--index-url https://download.pytorch.org/whl/cu124
# Install vLLM (nightly for PaddleOCR-VL support)
# Install vLLM 0.11.1 (first stable release with PaddleOCR-VL support)
RUN pip install --no-cache-dir \
vllm \
--pre \
--extra-index-url https://wheels.vllm.ai/nightly \
vllm==0.11.1 \
--extra-index-url https://download.pytorch.org/whl/cu124
# Install additional dependencies

View File

@@ -28,7 +28,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir \
torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu && \
torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cpu && \
pip install --no-cache-dir \
transformers \
accelerate \
@@ -37,7 +37,10 @@ RUN pip install --no-cache-dir --upgrade pip && \
fastapi \
uvicorn[standard] \
python-multipart \
httpx
httpx \
protobuf \
sentencepiece \
einops
# Copy server files
COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py

View File

@@ -0,0 +1,71 @@
# PaddleOCR-VL GPU Variant (Transformers-based, not vLLM)
# Vision-Language Model for document parsing using transformers with CUDA
# NOTE: uses the CUDA *runtime* base image (no compiler toolchain) — all Python
# dependencies below install as prebuilt wheels, so nothing is compiled at build time.
FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04
LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
LABEL description="PaddleOCR-VL 0.9B GPU - Vision-Language Model using transformers"
LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
# Environment configuration
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
# Hugging Face cache location — mount a volume here to persist model downloads across runs
ENV HF_HOME=/root/.cache/huggingface
# NOTE(review): SERVER_PORT/SERVER_HOST are presumably read by paddleocr_vl_server.py —
# confirm; the EXPOSE/HEALTHCHECK below assume port 8000 regardless.
ENV SERVER_PORT=8000
ENV SERVER_HOST=0.0.0.0
# Set working directory
WORKDIR /app
# Install system dependencies:
# - python3.11 (+venv/dev): interpreter for the server
# - libgl1-mesa-glx / libglib2.0-0 / libgomp1: native libs commonly needed by
#   imaging/ML wheels — TODO confirm which are actually required at runtime
# - curl: used by the HEALTHCHECK below
# - git: presumably for pip installs from git or HF hub operations — confirm
# update-alternatives makes `python`/`python3` resolve to 3.11.
RUN apt-get update && apt-get install -y --no-install-recommends \
python3.11 \
python3.11-venv \
python3.11-dev \
python3-pip \
libgl1-mesa-glx \
libglib2.0-0 \
libgomp1 \
curl \
git \
&& rm -rf /var/lib/apt/lists/* \
&& update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
# Create and activate virtual environment (activation is done by prepending to PATH)
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Install PyTorch with CUDA support — cu124 wheel index matches the CUDA 12.4 base image.
# torch is pinned; torchvision is left unpinned so pip resolves the matching build.
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir \
torch==2.5.1 \
torchvision \
--index-url https://download.pytorch.org/whl/cu124
# Install Python dependencies (transformers-based, not vLLM).
# NOTE(review): these are all unpinned, so image builds are not reproducible —
# consider pinning versions (at least transformers) once a known-good set is verified.
RUN pip install --no-cache-dir \
transformers \
accelerate \
safetensors \
pillow \
fastapi \
uvicorn[standard] \
python-multipart \
httpx \
protobuf \
sentencepiece \
einops
# Copy server files (same as CPU variant - it auto-detects CUDA)
COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py
# The CPU entrypoint script is reused; it is renamed to a variant-neutral path in the image.
COPY image_support_files/paddleocr-vl-cpu-entrypoint.sh /usr/local/bin/paddleocr-vl-entrypoint.sh
RUN chmod +x /usr/local/bin/paddleocr-vl-entrypoint.sh
# Expose API port
EXPOSE 8000
# Health check — 300s start-period presumably allows for first-run model
# download/load into HF_HOME; assumes the server exposes GET /health (confirm
# in paddleocr_vl_server.py).
HEALTHCHECK --interval=30s --timeout=10s --start-period=300s --retries=3 \
CMD curl -f http://localhost:8000/health || exit 1
ENTRYPOINT ["/usr/local/bin/paddleocr-vl-entrypoint.sh"]

View File

@@ -1,5 +1,15 @@
# Changelog
## 2026-01-17 - 1.5.0 - feat(paddleocr-vl)
add PaddleOCR-VL GPU Dockerfile, pin vllm, update CPU image deps, and improve entrypoint and tests
- Add a new GPU Dockerfile for PaddleOCR-VL (transformers-based) with CUDA support, healthcheck, and entrypoint.
- Pin vllm to 0.11.1 in Dockerfile_paddleocr_vl to use the first stable release with PaddleOCR-VL support.
- Update CPU image: add torchvision==0.20.1 and extra Python deps (protobuf, sentencepiece, einops) required by the transformers-based server.
- Rewrite paddleocr-vl-entrypoint.sh to build vllm args array, add MAX_MODEL_LEN and ENFORCE_EAGER env vars, include --limit-mm-per-prompt and optional --enforce-eager, and switch to exec vllm with constructed args.
- Update tests to use the OpenAI-compatible PaddleOCR-VL chat completions API (/v1/chat/completions) with image+text message payload and model 'paddleocr-vl'.
- Add @types/node to package.json dependencies and tidy devDependencies ordering.
## 2026-01-16 - 1.4.0 - feat(invoices)
add hybrid OCR + vision invoice/document parsing with PaddleOCR, consensus voting, and prompt/test refactors

View File

@@ -11,12 +11,16 @@ HOST="${HOST:-0.0.0.0}"
PORT="${PORT:-8000}"
MAX_BATCHED_TOKENS="${MAX_BATCHED_TOKENS:-16384}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.9}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}"
ENFORCE_EAGER="${ENFORCE_EAGER:-false}"
echo "Model: ${MODEL_NAME}"
echo "Host: ${HOST}"
echo "Port: ${PORT}"
echo "Max batched tokens: ${MAX_BATCHED_TOKENS}"
echo "GPU memory utilization: ${GPU_MEMORY_UTILIZATION}"
echo "Max model length: ${MAX_MODEL_LEN}"
echo "Enforce eager: ${ENFORCE_EAGER}"
echo ""
# Check GPU availability
@@ -31,13 +35,25 @@ fi
echo "Starting vLLM server..."
echo "==================================="
# Start vLLM server with PaddleOCR-VL
exec vllm serve "${MODEL_NAME}" \
--trust-remote-code \
--host "${HOST}" \
--port "${PORT}" \
--max-num-batched-tokens "${MAX_BATCHED_TOKENS}" \
--gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}" \
--no-enable-prefix-caching \
--mm-processor-cache-gb 0 \
# Build vLLM command
VLLM_ARGS=(
serve "${MODEL_NAME}"
--trust-remote-code
--host "${HOST}"
--port "${PORT}"
--max-num-batched-tokens "${MAX_BATCHED_TOKENS}"
--gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"
--max-model-len "${MAX_MODEL_LEN}"
--no-enable-prefix-caching
--mm-processor-cache-gb 0
--served-model-name "paddleocr-vl"
--limit-mm-per-prompt '{"image": 1}'
)
# Add enforce-eager if enabled (disables CUDA graphs, saves memory)
if [ "${ENFORCE_EAGER}" = "true" ]; then
VLLM_ARGS+=(--enforce-eager)
fi
# Start vLLM server with PaddleOCR-VL
exec vllm "${VLLM_ARGS[@]}"

View File

@@ -13,8 +13,8 @@
"test": "tstest test/ --verbose"
},
"devDependencies": {
"@git.zone/tstest": "^1.0.90",
"@git.zone/tsrun": "^1.3.3"
"@git.zone/tsrun": "^1.3.3",
"@git.zone/tstest": "^1.0.90"
},
"repository": {
"type": "git",
@@ -28,5 +28,8 @@
"minicpm",
"ollama",
"multimodal"
]
],
"dependencies": {
"@types/node": "^25.0.9"
}
}

4 changed lines in pnpm-lock.yaml (generated)
View File

@@ -7,6 +7,10 @@ settings:
importers:
.:
dependencies:
'@types/node':
specifier: ^25.0.9
version: 25.0.9
devDependencies:
'@git.zone/tsrun':
specifier: ^1.3.3

View File

@@ -6,7 +6,7 @@ import * as os from 'os';
const OLLAMA_URL = 'http://localhost:11434';
const MODEL = 'openbmb/minicpm-v4.5:q8_0';
const PADDLEOCR_URL = 'http://localhost:5000';
const PADDLEOCR_VL_URL = 'http://localhost:8000';
interface IInvoice {
invoice_number: string;
@@ -19,24 +19,33 @@ interface IInvoice {
}
/**
* Extract OCR text from an image using PaddleOCR
* Extract OCR text from an image using PaddleOCR-VL (OpenAI-compatible API)
*/
async function extractOcrText(imageBase64: string): Promise<string> {
try {
const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
const response = await fetch(`${PADDLEOCR_VL_URL}/v1/chat/completions`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ image: imageBase64 }),
body: JSON.stringify({
model: 'paddleocr-vl',
messages: [{
role: 'user',
content: [
{ type: 'image_url', image_url: { url: `data:image/png;base64,${imageBase64}` } },
{ type: 'text', text: 'OCR:' }
]
}],
temperature: 0.0,
max_tokens: 4096
}),
});
if (!response.ok) return '';
const data = await response.json();
if (data.success && data.results) {
return data.results.map((r: { text: string }) => r.text).join('\n');
}
return data.choices?.[0]?.message?.content || '';
} catch {
// PaddleOCR unavailable
// PaddleOCR-VL unavailable
}
return '';
}