From 0482c35b69def4688bcd8e72e74d0283339d1fdf Mon Sep 17 00:00:00 2001 From: Juergen Kunz Date: Sat, 17 Jan 2026 16:57:26 +0000 Subject: [PATCH] feat(paddleocr-vl): add PaddleOCR-VL GPU Dockerfile, pin vllm, update CPU image deps, and improve entrypoint and tests --- Dockerfile_paddleocr_vl | 6 +- Dockerfile_paddleocr_vl_cpu | 7 +- Dockerfile_paddleocr_vl_gpu | 71 +++++++++++++++++++ changelog.md | 10 +++ .../paddleocr-vl-entrypoint.sh | 34 ++++++--- package.json | 9 ++- pnpm-lock.yaml | 4 ++ ...ode.ts => test.bankstatements.combined.ts} | 0 ....invoices.ts => test.invoices.combined.ts} | 25 ++++--- 9 files changed, 140 insertions(+), 26 deletions(-) create mode 100644 Dockerfile_paddleocr_vl_gpu rename test/{test.node.ts => test.bankstatements.combined.ts} (100%) rename test/{test.invoices.ts => test.invoices.combined.ts} (95%) diff --git a/Dockerfile_paddleocr_vl b/Dockerfile_paddleocr_vl index 4be04e7..af995ab 100644 --- a/Dockerfile_paddleocr_vl +++ b/Dockerfile_paddleocr_vl @@ -39,11 +39,9 @@ RUN pip install --no-cache-dir --upgrade pip && \ torchvision \ --index-url https://download.pytorch.org/whl/cu124 -# Install vLLM (nightly for PaddleOCR-VL support) +# Install vLLM 0.11.1 (first stable release with PaddleOCR-VL support) RUN pip install --no-cache-dir \ - vllm \ - --pre \ - --extra-index-url https://wheels.vllm.ai/nightly \ + vllm==0.11.1 \ --extra-index-url https://download.pytorch.org/whl/cu124 # Install additional dependencies diff --git a/Dockerfile_paddleocr_vl_cpu b/Dockerfile_paddleocr_vl_cpu index 206c615..c5439ca 100644 --- a/Dockerfile_paddleocr_vl_cpu +++ b/Dockerfile_paddleocr_vl_cpu @@ -28,7 +28,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # Install Python dependencies RUN pip install --no-cache-dir --upgrade pip && \ pip install --no-cache-dir \ - torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu && \ + torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cpu && \ pip install --no-cache-dir \ transformers \ accelerate \ @@ -37,7 +37,10 @@ RUN pip install --no-cache-dir --upgrade pip && \ fastapi \ uvicorn[standard] \ python-multipart \ - httpx + httpx \ + protobuf \ + sentencepiece \ + einops # Copy server files COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py diff --git a/Dockerfile_paddleocr_vl_gpu b/Dockerfile_paddleocr_vl_gpu new file mode 100644 index 0000000..7f9c821 --- /dev/null +++ b/Dockerfile_paddleocr_vl_gpu @@ -0,0 +1,71 @@ +# PaddleOCR-VL GPU Variant (Transformers-based, not vLLM) +# Vision-Language Model for document parsing using transformers with CUDA +FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04 + +LABEL maintainer="Task Venture Capital GmbH " +LABEL description="PaddleOCR-VL 0.9B GPU - Vision-Language Model using transformers" +LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai" + +# Environment configuration +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONUNBUFFERED=1 +ENV HF_HOME=/root/.cache/huggingface +ENV SERVER_PORT=8000 +ENV SERVER_HOST=0.0.0.0 + +# Set working directory +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3.11 \ + python3.11-venv \ + python3.11-dev \ + python3-pip \ + libgl1-mesa-glx \ + libglib2.0-0 \ + libgomp1 \ + curl \ + git \ + && rm -rf /var/lib/apt/lists/* \ + && update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 + 
+# Create and activate virtual environment +RUN python -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +# Install PyTorch with CUDA support +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir \ + torch==2.5.1 \ + torchvision \ + --index-url https://download.pytorch.org/whl/cu124 + +# Install Python dependencies (transformers-based, not vLLM) +RUN pip install --no-cache-dir \ + transformers \ + accelerate \ + safetensors \ + pillow \ + fastapi \ + uvicorn[standard] \ + python-multipart \ + httpx \ + protobuf \ + sentencepiece \ + einops + +# Copy server files (same as CPU variant - it auto-detects CUDA) +COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py +COPY image_support_files/paddleocr-vl-cpu-entrypoint.sh /usr/local/bin/paddleocr-vl-entrypoint.sh +RUN chmod +x /usr/local/bin/paddleocr-vl-entrypoint.sh + +# Expose API port +EXPOSE 8000 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=300s --retries=3 \ + CMD curl -f http://localhost:8000/health || exit 1 + +ENTRYPOINT ["/usr/local/bin/paddleocr-vl-entrypoint.sh"] diff --git a/changelog.md b/changelog.md index e49655a..f990640 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,15 @@ # Changelog +## 2026-01-17 - 1.5.0 - feat(paddleocr-vl) +add PaddleOCR-VL GPU Dockerfile, pin vllm, update CPU image deps, and improve entrypoint and tests + +- Add a new GPU Dockerfile for PaddleOCR-VL (transformers-based) with CUDA support, healthcheck, and entrypoint. +- Pin vllm to 0.11.1 in Dockerfile_paddleocr_vl to use the first stable release with PaddleOCR-VL support. +- Update CPU image: add torchvision==0.20.1 and extra Python deps (protobuf, sentencepiece, einops) required by the transformers-based server. +- Rewrite paddleocr-vl-entrypoint.sh to build vllm args array, add MAX_MODEL_LEN and ENFORCE_EAGER env vars, include --limit-mm-per-prompt and optional --enforce-eager, and switch to exec vllm with constructed args. +- Update tests to use the OpenAI-compatible PaddleOCR-VL chat completions API (/v1/chat/completions) with image+text message payload and model 'paddleocr-vl'. +- Add @types/node to package.json dependencies and tidy devDependencies ordering. + ## 2026-01-16 - 1.4.0 - feat(invoices) add hybrid OCR + vision invoice/document parsing with PaddleOCR, consensus voting, and prompt/test refactors diff --git a/image_support_files/paddleocr-vl-entrypoint.sh b/image_support_files/paddleocr-vl-entrypoint.sh index 1978b9a..0b5e661 100644 --- a/image_support_files/paddleocr-vl-entrypoint.sh +++ b/image_support_files/paddleocr-vl-entrypoint.sh @@ -11,12 +11,16 @@ HOST="${HOST:-0.0.0.0}" PORT="${PORT:-8000}" MAX_BATCHED_TOKENS="${MAX_BATCHED_TOKENS:-16384}" GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.9}" +MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}" +ENFORCE_EAGER="${ENFORCE_EAGER:-false}" echo "Model: ${MODEL_NAME}" echo "Host: ${HOST}" echo "Port: ${PORT}" echo "Max batched tokens: ${MAX_BATCHED_TOKENS}" echo "GPU memory utilization: ${GPU_MEMORY_UTILIZATION}" +echo "Max model length: ${MAX_MODEL_LEN}" +echo "Enforce eager: ${ENFORCE_EAGER}" echo "" # Check GPU availability @@ -31,13 +35,25 @@ fi echo "Starting vLLM server..." 
echo "===================================" -# Start vLLM server with PaddleOCR-VL -exec vllm serve "${MODEL_NAME}" \ - --trust-remote-code \ - --host "${HOST}" \ - --port "${PORT}" \ - --max-num-batched-tokens "${MAX_BATCHED_TOKENS}" \ - --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}" \ - --no-enable-prefix-caching \ - --mm-processor-cache-gb 0 \ +# Build vLLM command +VLLM_ARGS=( + serve "${MODEL_NAME}" + --trust-remote-code + --host "${HOST}" + --port "${PORT}" + --max-num-batched-tokens "${MAX_BATCHED_TOKENS}" + --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}" + --max-model-len "${MAX_MODEL_LEN}" + --no-enable-prefix-caching + --mm-processor-cache-gb 0 --served-model-name "paddleocr-vl" + --limit-mm-per-prompt '{"image": 1}' +) + +# Add enforce-eager if enabled (disables CUDA graphs, saves memory) +if [ "${ENFORCE_EAGER}" = "true" ]; then + VLLM_ARGS+=(--enforce-eager) +fi + +# Start vLLM server with PaddleOCR-VL +exec vllm "${VLLM_ARGS[@]}" diff --git a/package.json b/package.json index 592a1a2..4d292b7 100644 --- a/package.json +++ b/package.json @@ -13,8 +13,8 @@ "test": "tstest test/ --verbose" }, "devDependencies": { - "@git.zone/tstest": "^1.0.90", - "@git.zone/tsrun": "^1.3.3" + "@git.zone/tsrun": "^1.3.3", + "@git.zone/tstest": "^1.0.90" }, "repository": { "type": "git", @@ -28,5 +28,8 @@ "minicpm", "ollama", "multimodal" - ] + ], + "dependencies": { + "@types/node": "^25.0.9" + } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 66f2a8b..2b1eeac 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -7,6 +7,10 @@ settings: importers: .: + dependencies: + '@types/node': + specifier: ^25.0.9 + version: 25.0.9 devDependencies: '@git.zone/tsrun': specifier: ^1.3.3 diff --git a/test/test.node.ts b/test/test.bankstatements.combined.ts similarity index 100% rename from test/test.node.ts rename to test/test.bankstatements.combined.ts diff --git a/test/test.invoices.ts b/test/test.invoices.combined.ts similarity index 95% rename from test/test.invoices.ts rename to test/test.invoices.combined.ts index e57e644..8ef8cdc 100644 --- a/test/test.invoices.ts +++ b/test/test.invoices.combined.ts @@ -6,7 +6,7 @@ import * as os from 'os'; const OLLAMA_URL = 'http://localhost:11434'; const MODEL = 'openbmb/minicpm-v4.5:q8_0'; -const PADDLEOCR_URL = 'http://localhost:5000'; +const PADDLEOCR_VL_URL = 'http://localhost:8000'; interface IInvoice { invoice_number: string; @@ -19,24 +19,33 @@ interface IInvoice { } /** - * Extract OCR text from an image using PaddleOCR + * Extract OCR text from an image using PaddleOCR-VL (OpenAI-compatible API) */ async function extractOcrText(imageBase64: string): Promise { try { - const response = await fetch(`${PADDLEOCR_URL}/ocr`, { + const response = await fetch(`${PADDLEOCR_VL_URL}/v1/chat/completions`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ image: imageBase64 }), + body: JSON.stringify({ + model: 'paddleocr-vl', + messages: [{ + role: 'user', + content: [ + { type: 'image_url', image_url: { url: `data:image/png;base64,${imageBase64}` } }, + { type: 'text', text: 'OCR:' } + ] + }], + temperature: 0.0, + max_tokens: 4096 + }), }); if (!response.ok) return ''; const data = await response.json(); - if (data.success && data.results) { - return data.results.map((r: { text: string }) => r.text).join('\n'); - } + return data.choices?.[0]?.message?.content || ''; } catch { - // PaddleOCR unavailable + // PaddleOCR-VL unavailable } return ''; }