feat(paddleocr-vl): add PaddleOCR-VL GPU Dockerfile, pin vllm, update CPU image deps, and improve entrypoint and tests

This commit is contained in:
2026-01-17 16:57:26 +00:00
parent 15ac1fcf67
commit 0482c35b69
9 changed files with 140 additions and 26 deletions

View File

@@ -39,11 +39,9 @@ RUN pip install --no-cache-dir --upgrade pip && \
torchvision \
--index-url https://download.pytorch.org/whl/cu124
# Install vLLM (nightly for PaddleOCR-VL support)
# Install vLLM 0.11.1 (first stable release with PaddleOCR-VL support)
RUN pip install --no-cache-dir \
vllm \
--pre \
--extra-index-url https://wheels.vllm.ai/nightly \
vllm==0.11.1 \
--extra-index-url https://download.pytorch.org/whl/cu124
# Install additional dependencies

View File

@@ -28,7 +28,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir \
torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu && \
torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cpu && \
pip install --no-cache-dir \
transformers \
accelerate \
@@ -37,7 +37,10 @@ RUN pip install --no-cache-dir --upgrade pip && \
fastapi \
uvicorn[standard] \
python-multipart \
httpx
httpx \
protobuf \
sentencepiece \
einops
# Copy server files
COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py

View File

@@ -0,0 +1,71 @@
# PaddleOCR-VL GPU Variant (Transformers-based, not vLLM)
# Vision-Language Model for document parsing using transformers with CUDA
# NOTE: uses the CUDA *runtime* base image (no compiler toolchain) — all Python
# dependencies below install as prebuilt wheels, so nothing is compiled at build time.
FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04
LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
LABEL description="PaddleOCR-VL 0.9B GPU - Vision-Language Model using transformers"
LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
# Environment configuration
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
# Hugging Face cache location — mount a volume here to persist model downloads across runs
ENV HF_HOME=/root/.cache/huggingface
# NOTE(review): SERVER_PORT/SERVER_HOST are presumably read by paddleocr_vl_server.py —
# confirm; the EXPOSE/HEALTHCHECK below assume port 8000 regardless.
ENV SERVER_PORT=8000
ENV SERVER_HOST=0.0.0.0
# Set working directory
WORKDIR /app
# Install system dependencies:
# - python3.11 (+venv/dev): interpreter for the server
# - libgl1-mesa-glx / libglib2.0-0 / libgomp1: native libs commonly needed by
#   imaging/ML wheels — TODO confirm which are actually required at runtime
# - curl: used by the HEALTHCHECK below
# - git: presumably for pip installs from git or HF hub operations — confirm
# update-alternatives makes `python`/`python3` resolve to 3.11.
RUN apt-get update && apt-get install -y --no-install-recommends \
python3.11 \
python3.11-venv \
python3.11-dev \
python3-pip \
libgl1-mesa-glx \
libglib2.0-0 \
libgomp1 \
curl \
git \
&& rm -rf /var/lib/apt/lists/* \
&& update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
# Create and activate virtual environment (activation is done by prepending to PATH)
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Install PyTorch with CUDA support — cu124 wheel index matches the CUDA 12.4 base image.
# torch is pinned; torchvision is left unpinned so pip resolves the matching build.
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir \
torch==2.5.1 \
torchvision \
--index-url https://download.pytorch.org/whl/cu124
# Install Python dependencies (transformers-based, not vLLM).
# NOTE(review): these are all unpinned, so image builds are not reproducible —
# consider pinning versions (at least transformers) once a known-good set is verified.
RUN pip install --no-cache-dir \
transformers \
accelerate \
safetensors \
pillow \
fastapi \
uvicorn[standard] \
python-multipart \
httpx \
protobuf \
sentencepiece \
einops
# Copy server files (same as CPU variant - it auto-detects CUDA)
COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py
# The CPU entrypoint script is reused; it is renamed to a variant-neutral path in the image.
COPY image_support_files/paddleocr-vl-cpu-entrypoint.sh /usr/local/bin/paddleocr-vl-entrypoint.sh
RUN chmod +x /usr/local/bin/paddleocr-vl-entrypoint.sh
# Expose API port
EXPOSE 8000
# Health check — 300s start-period presumably allows for first-run model
# download/load into HF_HOME; assumes the server exposes GET /health (confirm
# in paddleocr_vl_server.py).
HEALTHCHECK --interval=30s --timeout=10s --start-period=300s --retries=3 \
CMD curl -f http://localhost:8000/health || exit 1
ENTRYPOINT ["/usr/local/bin/paddleocr-vl-entrypoint.sh"]

View File

@@ -1,5 +1,15 @@
# Changelog
## 2026-01-17 - 1.5.0 - feat(paddleocr-vl)
add PaddleOCR-VL GPU Dockerfile, pin vllm, update CPU image deps, and improve entrypoint and tests
- Add a new GPU Dockerfile for PaddleOCR-VL (transformers-based) with CUDA support, healthcheck, and entrypoint.
- Pin vllm to 0.11.1 in Dockerfile_paddleocr_vl to use the first stable release with PaddleOCR-VL support.
- Update CPU image: add torchvision==0.20.1 and extra Python deps (protobuf, sentencepiece, einops) required by the transformers-based server.
- Rewrite paddleocr-vl-entrypoint.sh to build vllm args array, add MAX_MODEL_LEN and ENFORCE_EAGER env vars, include --limit-mm-per-prompt and optional --enforce-eager, and switch to exec vllm with constructed args.
- Update tests to use the OpenAI-compatible PaddleOCR-VL chat completions API (/v1/chat/completions) with image+text message payload and model 'paddleocr-vl'.
- Add @types/node to package.json dependencies and tidy devDependencies ordering.
## 2026-01-16 - 1.4.0 - feat(invoices)
add hybrid OCR + vision invoice/document parsing with PaddleOCR, consensus voting, and prompt/test refactors

View File

@@ -11,12 +11,16 @@ HOST="${HOST:-0.0.0.0}"
PORT="${PORT:-8000}"
MAX_BATCHED_TOKENS="${MAX_BATCHED_TOKENS:-16384}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.9}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}"
ENFORCE_EAGER="${ENFORCE_EAGER:-false}"
echo "Model: ${MODEL_NAME}"
echo "Host: ${HOST}"
echo "Port: ${PORT}"
echo "Max batched tokens: ${MAX_BATCHED_TOKENS}"
echo "GPU memory utilization: ${GPU_MEMORY_UTILIZATION}"
echo "Max model length: ${MAX_MODEL_LEN}"
echo "Enforce eager: ${ENFORCE_EAGER}"
echo ""
# Check GPU availability
@@ -31,13 +35,25 @@ fi
echo "Starting vLLM server..."
echo "==================================="
# Start vLLM server with PaddleOCR-VL
exec vllm serve "${MODEL_NAME}" \
--trust-remote-code \
--host "${HOST}" \
--port "${PORT}" \
--max-num-batched-tokens "${MAX_BATCHED_TOKENS}" \
--gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}" \
--no-enable-prefix-caching \
--mm-processor-cache-gb 0 \
# Build vLLM command
VLLM_ARGS=(
serve "${MODEL_NAME}"
--trust-remote-code
--host "${HOST}"
--port "${PORT}"
--max-num-batched-tokens "${MAX_BATCHED_TOKENS}"
--gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"
--max-model-len "${MAX_MODEL_LEN}"
--no-enable-prefix-caching
--mm-processor-cache-gb 0
--served-model-name "paddleocr-vl"
--limit-mm-per-prompt '{"image": 1}'
)
# Add enforce-eager if enabled (disables CUDA graphs, saves memory)
if [ "${ENFORCE_EAGER}" = "true" ]; then
VLLM_ARGS+=(--enforce-eager)
fi
# Start vLLM server with PaddleOCR-VL
exec vllm "${VLLM_ARGS[@]}"

View File

@@ -13,8 +13,8 @@
"test": "tstest test/ --verbose"
},
"devDependencies": {
"@git.zone/tstest": "^1.0.90",
"@git.zone/tsrun": "^1.3.3"
"@git.zone/tsrun": "^1.3.3",
"@git.zone/tstest": "^1.0.90"
},
"repository": {
"type": "git",
@@ -28,5 +28,8 @@
"minicpm",
"ollama",
"multimodal"
]
],
"dependencies": {
"@types/node": "^25.0.9"
}
}

4 changed lines in pnpm-lock.yaml (generated)
View File

@@ -7,6 +7,10 @@ settings:
importers:
.:
dependencies:
'@types/node':
specifier: ^25.0.9
version: 25.0.9
devDependencies:
'@git.zone/tsrun':
specifier: ^1.3.3

View File

@@ -6,7 +6,7 @@ import * as os from 'os';
const OLLAMA_URL = 'http://localhost:11434';
const MODEL = 'openbmb/minicpm-v4.5:q8_0';
const PADDLEOCR_URL = 'http://localhost:5000';
const PADDLEOCR_VL_URL = 'http://localhost:8000';
interface IInvoice {
invoice_number: string;
@@ -19,24 +19,33 @@ interface IInvoice {
}
/**
* Extract OCR text from an image using PaddleOCR
* Extract OCR text from an image using PaddleOCR-VL (OpenAI-compatible API)
*/
async function extractOcrText(imageBase64: string): Promise<string> {
try {
const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
const response = await fetch(`${PADDLEOCR_VL_URL}/v1/chat/completions`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ image: imageBase64 }),
body: JSON.stringify({
model: 'paddleocr-vl',
messages: [{
role: 'user',
content: [
{ type: 'image_url', image_url: { url: `data:image/png;base64,${imageBase64}` } },
{ type: 'text', text: 'OCR:' }
]
}],
temperature: 0.0,
max_tokens: 4096
}),
});
if (!response.ok) return '';
const data = await response.json();
if (data.success && data.results) {
return data.results.map((r: { text: string }) => r.text).join('\n');
}
return data.choices?.[0]?.message?.content || '';
} catch {
// PaddleOCR unavailable
// PaddleOCR-VL unavailable
}
return '';
}