feat(paddleocr-vl): add PaddleOCR-VL GPU Dockerfile, pin vllm, update CPU image deps, and improve entrypoint and tests

This commit is contained in:
2026-01-17 16:57:26 +00:00
parent 15ac1fcf67
commit 0482c35b69
9 changed files with 140 additions and 26 deletions

View File

@@ -11,12 +11,16 @@ HOST="${HOST:-0.0.0.0}"
# Runtime settings. Each one can be supplied through the environment; the
# value after ":=" is used when the variable is unset or empty.
: "${PORT:=8000}"
: "${MAX_BATCHED_TOKENS:=16384}"
: "${GPU_MEMORY_UTILIZATION:=0.9}"
: "${MAX_MODEL_LEN:=8192}"
: "${ENFORCE_EAGER:=false}"

# Report the effective configuration, one setting per line, then a blank line.
printf '%s\n' \
  "Model: ${MODEL_NAME}" \
  "Host: ${HOST}" \
  "Port: ${PORT}" \
  "Max batched tokens: ${MAX_BATCHED_TOKENS}" \
  "GPU memory utilization: ${GPU_MEMORY_UTILIZATION}" \
  "Max model length: ${MAX_MODEL_LEN}" \
  "Enforce eager: ${ENFORCE_EAGER}" \
  ""
# Check GPU availability
@@ -31,13 +35,25 @@ fi
# Announce startup before handing the process over to vLLM.
echo "Starting vLLM server..."
echo "==================================="
# Start vLLM server with PaddleOCR-VL
# `exec` replaces this shell with the vllm process, so once it succeeds no
# later command in this script runs. Host, port, batched-token limit and GPU
# memory utilization are taken from the variables of the same names.
# NOTE(review): the final argument line ends with a line-continuation
# backslash, so the command extends onto whatever line follows it. This looks
# like the pre-change invocation from a rendered diff -- confirm before relying
# on it.
exec vllm serve "${MODEL_NAME}" \
--trust-remote-code \
--host "${HOST}" \
--port "${PORT}" \
--max-num-batched-tokens "${MAX_BATCHED_TOKENS}" \
--gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}" \
--no-enable-prefix-caching \
--mm-processor-cache-gb 0 \
# Assemble the complete vLLM command line as an array so that every argument
# (including the JSON value below) is passed as exactly one word.
launch_cmd=(vllm serve "${MODEL_NAME}" --trust-remote-code)
launch_cmd+=(--host "${HOST}" --port "${PORT}")
launch_cmd+=(
  --max-num-batched-tokens "${MAX_BATCHED_TOKENS}"
  --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"
  --max-model-len "${MAX_MODEL_LEN}"
)
launch_cmd+=(--no-enable-prefix-caching --mm-processor-cache-gb 0)
launch_cmd+=(--served-model-name "paddleocr-vl")
launch_cmd+=(--limit-mm-per-prompt '{"image": 1}')

# Opt-in eager mode (disables CUDA graphs, saves memory); only the exact
# value "true" turns it on.
case "${ENFORCE_EAGER}" in
  true) launch_cmd+=(--enforce-eager) ;;
esac

# Hand the process over to the vLLM server for PaddleOCR-VL.
exec "${launch_cmd[@]}"