#!/bin/bash
set -e

echo "==================================="
echo "PaddleOCR-VL Server"
echo "==================================="

# Configuration (all values overridable via environment variables)
MODEL_NAME="${MODEL_NAME:-PaddlePaddle/PaddleOCR-VL}"
HOST="${HOST:-0.0.0.0}"
PORT="${PORT:-8000}"
MAX_BATCHED_TOKENS="${MAX_BATCHED_TOKENS:-16384}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.9}"

echo "Model: ${MODEL_NAME}"
echo "Host: ${HOST}"
echo "Port: ${PORT}"
echo "Max batched tokens: ${MAX_BATCHED_TOKENS}"
echo "GPU memory utilization: ${GPU_MEMORY_UTILIZATION}"
echo ""

# Check GPU availability
if command -v nvidia-smi &> /dev/null; then
    echo "GPU Information:"
    nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv
    echo ""
else
    echo "WARNING: nvidia-smi not found. GPU may not be available."
fi

echo "Starting vLLM server..."
echo "==================================="

# Start vLLM server with PaddleOCR-VL.
# exec replaces this shell with the server process, so signals
# (e.g. from a container runtime) reach vLLM directly.
exec vllm serve "${MODEL_NAME}" \
    --trust-remote-code \
    --host "${HOST}" \
    --port "${PORT}" \
    --max-num-batched-tokens "${MAX_BATCHED_TOKENS}" \
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}" \
    --no-enable-prefix-caching \
    --mm-processor-cache-gb 0 \
    --served-model-name "paddleocr-vl"
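
# Usage sketch (comments only; nothing below executes, since exec above
# replaces the shell). vLLM exposes an OpenAI-compatible HTTP API, so once
# the server is up you can smoke-test it with:
#
#   curl http://localhost:8000/v1/models
#
# and send a multimodal request against the served model name. The image
# URL and prompt text below are hypothetical placeholders, not values
# defined by this script:
#
#   curl http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{
#           "model": "paddleocr-vl",
#           "messages": [{
#             "role": "user",
#             "content": [
#               {"type": "image_url",
#                "image_url": {"url": "https://example.com/page.png"}},
#               {"type": "text", "text": "OCR:"}
#             ]
#           }]
#         }'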