#!/bin/bash
set -e

echo "==================================="
echo "PaddleOCR-VL Server"
echo "==================================="

# Configuration
MODEL_NAME="${MODEL_NAME:-PaddlePaddle/PaddleOCR-VL}"
HOST="${HOST:-0.0.0.0}"
PORT="${PORT:-8000}"
MAX_BATCHED_TOKENS="${MAX_BATCHED_TOKENS:-16384}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.9}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}"
ENFORCE_EAGER="${ENFORCE_EAGER:-false}"

echo "Model: ${MODEL_NAME}"
echo "Host: ${HOST}"
echo "Port: ${PORT}"
echo "Max batched tokens: ${MAX_BATCHED_TOKENS}"
echo "GPU memory utilization: ${GPU_MEMORY_UTILIZATION}"
echo "Max model length: ${MAX_MODEL_LEN}"
echo "Enforce eager: ${ENFORCE_EAGER}"
echo ""

# Check GPU availability
if command -v nvidia-smi &> /dev/null; then
    echo "GPU Information:"
    nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv
    echo ""
else
    echo "WARNING: nvidia-smi not found. GPU may not be available."
fi

echo "Starting vLLM server..."
echo "==================================="

# Build vLLM command
VLLM_ARGS=(
    serve "${MODEL_NAME}"
    --trust-remote-code
    --host "${HOST}"
    --port "${PORT}"
    --max-num-batched-tokens "${MAX_BATCHED_TOKENS}"
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"
    --max-model-len "${MAX_MODEL_LEN}"
    --no-enable-prefix-caching
    --mm-processor-cache-gb 0
    --served-model-name "paddleocr-vl"
    --limit-mm-per-prompt '{"image": 1}'
)

# Add enforce-eager if enabled (disables CUDA graphs, saves memory)
if [ "${ENFORCE_EAGER}" = "true" ]; then
    VLLM_ARGS+=(--enforce-eager)
fi

# Start vLLM server with PaddleOCR-VL
exec vllm "${VLLM_ARGS[@]}"
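
# -----------------------------------------------------------------------------
# Example client call (sketch only; never executed by this script, since exec
# above replaces the shell). Once the server is up, vLLM exposes an
# OpenAI-compatible API, so an OCR request can go to /v1/chat/completions
# using the served model name configured above. The image URL and prompt text
# below are illustrative placeholders, not part of this script:
#
#   curl "http://localhost:8000/v1/chat/completions" \
#     -H "Content-Type: application/json" \
#     -d '{
#           "model": "paddleocr-vl",
#           "messages": [{
#             "role": "user",
#             "content": [
#               {"type": "image_url", "image_url": {"url": "https://example.com/page.png"}},
#               {"type": "text", "text": "OCR:"}
#             ]
#           }]
#         }'
# -----------------------------------------------------------------------------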