2026-01-16 16:21:44 +00:00
parent 3c5cf578a5
commit 15ac1fcf67
13 changed files with 873 additions and 805 deletions

View File

@@ -1,25 +0,0 @@
#!/bin/bash
set -e
# Configuration from environment
OCR_LANGUAGE="${OCR_LANGUAGE:-en}"
SERVER_PORT="${SERVER_PORT:-5000}"
SERVER_HOST="${SERVER_HOST:-0.0.0.0}"
echo "Starting PaddleOCR Server..."
echo " Language: ${OCR_LANGUAGE}"
echo " Host: ${SERVER_HOST}"
echo " Port: ${SERVER_PORT}"
# Check GPU availability
if [ "${CUDA_VISIBLE_DEVICES}" = "-1" ]; then
echo " GPU: Disabled (CPU mode)"
else
echo " GPU: Enabled"
fi
# Start the FastAPI server with uvicorn
exec python -m uvicorn paddleocr_server:app \
--host "${SERVER_HOST}" \
--port "${SERVER_PORT}" \
--workers 1

View File

@@ -0,0 +1,19 @@
#!/bin/bash
set -e
echo "==================================="
echo "PaddleOCR-VL Server (CPU)"
echo "==================================="
HOST="${SERVER_HOST:-0.0.0.0}"
PORT="${SERVER_PORT:-8000}"
echo "Host: ${HOST}"
echo "Port: ${PORT}"
echo "Device: CPU (no GPU)"
echo ""
echo "Starting PaddleOCR-VL CPU server..."
echo "==================================="
exec python /app/paddleocr_vl_server.py
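
Once the container is running, a quick way to confirm the server came up is to poll its /health endpoint. A minimal sketch with the requests library, assuming the default port 8000 from the script above; the status field comes from the server code further down:

import requests

# Poll the health endpoint; status is "healthy" once the model has loaded,
# "loading" otherwise (see the HealthResponse handler in the server file).
resp = requests.get("http://localhost:8000/health", timeout=10)
resp.raise_for_status()
print(resp.json())  # e.g. {"status": "healthy", "model": "...", "device": "cpu"}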

View File

@@ -0,0 +1,43 @@
#!/bin/bash
set -e
echo "==================================="
echo "PaddleOCR-VL Server"
echo "==================================="
# Configuration
MODEL_NAME="${MODEL_NAME:-PaddlePaddle/PaddleOCR-VL}"
HOST="${HOST:-0.0.0.0}"
PORT="${PORT:-8000}"
MAX_BATCHED_TOKENS="${MAX_BATCHED_TOKENS:-16384}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.9}"
echo "Model: ${MODEL_NAME}"
echo "Host: ${HOST}"
echo "Port: ${PORT}"
echo "Max batched tokens: ${MAX_BATCHED_TOKENS}"
echo "GPU memory utilization: ${GPU_MEMORY_UTILIZATION}"
echo ""
# Check GPU availability
if command -v nvidia-smi &> /dev/null; then
echo "GPU Information:"
nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv
echo ""
else
echo "WARNING: nvidia-smi not found. GPU may not be available."
fi
echo "Starting vLLM server..."
echo "==================================="
# Start vLLM server with PaddleOCR-VL
exec vllm serve "${MODEL_NAME}" \
--trust-remote-code \
--host "${HOST}" \
--port "${PORT}" \
--max-num-batched-tokens "${MAX_BATCHED_TOKENS}" \
--gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}" \
--no-enable-prefix-caching \
--mm-processor-cache-gb 0 \
--served-model-name "paddleocr-vl"
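
Once this server is up, any OpenAI-compatible client can drive it. A minimal sketch using the requests library; the endpoint path and the "paddleocr-vl" model name come from the flags above, while the image path is a placeholder:

import base64
import requests

# Read a local image and wrap it as a data URL (placeholder path).
with open("page.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode()

payload = {
    "model": "paddleocr-vl",  # matches --served-model-name above
    "messages": [{
        "role": "user",
        "content": [
            {"type": "image_url",
             "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
            {"type": "text", "text": "OCR:"},  # PaddleOCR-VL task prompt
        ],
    }],
}
resp = requests.post("http://localhost:8000/v1/chat/completions",
                     json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])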

View File

@@ -1,253 +0,0 @@
#!/usr/bin/env python3
"""
PaddleOCR FastAPI Server
Provides a REST API for OCR operations using PaddleOCR
"""
import os
import io
import base64
import logging
from typing import Optional, List, Any
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import numpy as np
from PIL import Image
from paddleocr import PaddleOCR
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Environment configuration
OCR_LANGUAGE = os.environ.get('OCR_LANGUAGE', 'en')
# GPU is controlled via CUDA_VISIBLE_DEVICES environment variable
USE_GPU = os.environ.get('CUDA_VISIBLE_DEVICES', '') != '-1'
# Initialize FastAPI app
app = FastAPI(
title="PaddleOCR Server",
description="REST API for OCR operations using PaddleOCR PP-OCRv4",
version="1.0.0"
)
# Global OCR instance
ocr_instance: Optional[PaddleOCR] = None
class OCRRequest(BaseModel):
"""Request model for base64 image OCR"""
image: str
language: Optional[str] = None
class BoundingBox(BaseModel):
"""Bounding box for detected text"""
points: List[List[float]]
class OCRResult(BaseModel):
"""Single OCR detection result"""
text: str
confidence: float
box: List[List[float]]
class OCRResponse(BaseModel):
"""OCR response model"""
success: bool
results: List[OCRResult]
error: Optional[str] = None
class HealthResponse(BaseModel):
"""Health check response"""
status: str
model: str
language: str
gpu_enabled: bool
def get_ocr(lang: Optional[str] = None) -> PaddleOCR:
"""Get or initialize the OCR instance"""
global ocr_instance
use_lang = lang or OCR_LANGUAGE
# Return cached instance if same language
if ocr_instance is not None and lang is None:
return ocr_instance
logger.info(f"Initializing PaddleOCR with language={use_lang}, use_gpu={USE_GPU}")
new_ocr = PaddleOCR(
use_angle_cls=True,
lang=use_lang,
use_gpu=USE_GPU,
show_log=False
)
# Cache the default language instance
if lang is None:
ocr_instance = new_ocr
logger.info("PaddleOCR initialized successfully")
return new_ocr
def decode_base64_image(base64_string: str) -> np.ndarray:
"""Decode base64 string to numpy array"""
# Remove data URL prefix if present
if ',' in base64_string:
base64_string = base64_string.split(',')[1]
image_data = base64.b64decode(base64_string)
image = Image.open(io.BytesIO(image_data))
# Convert to RGB if necessary
if image.mode != 'RGB':
image = image.convert('RGB')
return np.array(image)
def process_ocr_result(result: Any) -> List[OCRResult]:
"""Process PaddleOCR result into structured format"""
results = []
if result is None or len(result) == 0:
return results
# PaddleOCR returns list of results per image
# Each result is a list of [box, (text, confidence)]
for line in result[0] if result[0] else []:
if line is None:
continue
box = line[0] # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
text_info = line[1] # (text, confidence)
results.append(OCRResult(
text=text_info[0],
confidence=float(text_info[1]),
box=[[float(p[0]), float(p[1])] for p in box]
))
return results
@app.on_event("startup")
async def startup_event():
"""Pre-warm the OCR model on startup"""
logger.info("Pre-warming OCR model...")
try:
ocr = get_ocr()
# Create a small test image to warm up the model
test_image = np.zeros((100, 100, 3), dtype=np.uint8)
test_image.fill(255) # White image
ocr.ocr(test_image, cls=True)
logger.info("OCR model pre-warmed successfully")
except Exception as e:
logger.error(f"Failed to pre-warm OCR model: {e}")
@app.get("/health", response_model=HealthResponse)
async def health_check():
"""Health check endpoint"""
try:
# Ensure OCR is initialized
get_ocr()
return HealthResponse(
status="healthy",
model="PP-OCRv4",
language=OCR_LANGUAGE,
gpu_enabled=USE_GPU
)
except Exception as e:
logger.error(f"Health check failed: {e}")
raise HTTPException(status_code=503, detail=str(e))
@app.post("/ocr", response_model=OCRResponse)
async def ocr_base64(request: OCRRequest):
"""
Perform OCR on a base64-encoded image
Args:
request: OCRRequest with base64 image and optional language
Returns:
OCRResponse with detected text, confidence scores, and bounding boxes
"""
try:
# Decode image
image = decode_base64_image(request.image)
# Get OCR instance (use request language if provided)
if request.language and request.language != OCR_LANGUAGE:
ocr = get_ocr(request.language)
else:
ocr = get_ocr()
result = ocr.ocr(image, cls=True)
# Process results
results = process_ocr_result(result)
return OCRResponse(success=True, results=results)
except Exception as e:
logger.error(f"OCR processing failed: {e}")
return OCRResponse(success=False, results=[], error=str(e))
@app.post("/ocr/upload", response_model=OCRResponse)
async def ocr_upload(
img: UploadFile = File(...),
language: Optional[str] = Form(None)
):
"""
Perform OCR on an uploaded image file
Args:
img: Uploaded image file
language: Optional language code (default: env OCR_LANGUAGE)
Returns:
OCRResponse with detected text, confidence scores, and bounding boxes
"""
try:
# Read image
contents = await img.read()
image = Image.open(io.BytesIO(contents))
# Convert to RGB if necessary
if image.mode != 'RGB':
image = image.convert('RGB')
image_array = np.array(image)
# Get OCR instance
if language and language != OCR_LANGUAGE:
ocr = get_ocr(language)
else:
ocr = get_ocr()
result = ocr.ocr(image_array, cls=True)
# Process results
results = process_ocr_result(result)
return OCRResponse(success=True, results=results)
except Exception as e:
logger.error(f"OCR processing failed: {e}")
return OCRResponse(success=False, results=[], error=str(e))
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=5000)
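
For reference, a minimal client for this (now removed) endpoint; a sketch only, assuming the server above on its default port 5000 and a placeholder image path:

import base64
import requests

with open("receipt.png", "rb") as f:  # placeholder path
    image_b64 = base64.b64encode(f.read()).decode()

# The old API took a bare base64 string and returned per-line results
# with text, confidence, and a four-point bounding box.
resp = requests.post("http://localhost:5000/ocr",
                     json={"image": image_b64, "language": "en"},
                     timeout=60)
resp.raise_for_status()
for item in resp.json()["results"]:
    print(f"{item['confidence']:.2f}  {item['text']}")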

View File

@@ -0,0 +1,371 @@
#!/usr/bin/env python3
"""
PaddleOCR-VL FastAPI Server (CPU variant)
Provides an OpenAI-compatible REST API for document parsing using PaddleOCR-VL.
Runs via transformers; defaults to CPU but uses CUDA automatically when available.
"""
import os
import io
import base64
import logging
import time
from typing import Optional, List, Any, Dict, Union
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import torch
from PIL import Image
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Environment configuration
SERVER_HOST = os.environ.get('SERVER_HOST', '0.0.0.0')
SERVER_PORT = int(os.environ.get('SERVER_PORT', '8000'))
MODEL_NAME = os.environ.get('MODEL_NAME', 'PaddlePaddle/PaddleOCR-VL')
# Device configuration
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {DEVICE}")
# Task prompts for PaddleOCR-VL
TASK_PROMPTS = {
"ocr": "OCR:",
"table": "Table Recognition:",
"formula": "Formula Recognition:",
"chart": "Chart Recognition:",
}
# Initialize FastAPI app
app = FastAPI(
title="PaddleOCR-VL Server",
description="OpenAI-compatible REST API for document parsing using PaddleOCR-VL",
version="1.0.0"
)
# Global model instances
model = None
processor = None
# Request/Response models (OpenAI-compatible)
class ImageUrl(BaseModel):
url: str
class ContentItem(BaseModel):
type: str
text: Optional[str] = None
image_url: Optional[ImageUrl] = None
class Message(BaseModel):
role: str
content: Union[str, List[ContentItem]]
class ChatCompletionRequest(BaseModel):
model: str = "paddleocr-vl"
messages: List[Message]
temperature: Optional[float] = 0.0
max_tokens: Optional[int] = 4096
class Choice(BaseModel):
index: int
message: Message
finish_reason: str
class Usage(BaseModel):
prompt_tokens: int
completion_tokens: int
total_tokens: int
class ChatCompletionResponse(BaseModel):
id: str
object: str = "chat.completion"
created: int
model: str
choices: List[Choice]
usage: Usage
class HealthResponse(BaseModel):
status: str
model: str
device: str
def load_model():
"""Load the PaddleOCR-VL model and processor"""
global model, processor
if model is not None:
return
logger.info(f"Loading PaddleOCR-VL model: {MODEL_NAME}")
from transformers import AutoModelForCausalLM, AutoProcessor
# Load processor
processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Load model with appropriate settings for CPU/GPU
if DEVICE == "cuda":
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
trust_remote_code=True,
torch_dtype=torch.bfloat16,
).to(DEVICE).eval()
else:
# CPU mode - use float32 for compatibility
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
trust_remote_code=True,
torch_dtype=torch.float32,
low_cpu_mem_usage=True,
).eval()
logger.info("PaddleOCR-VL model loaded successfully")
def decode_image(image_source: str) -> Image.Image:
"""Decode image from URL or base64"""
if image_source.startswith("data:"):
# Base64 encoded image
header, data = image_source.split(",", 1)
image_data = base64.b64decode(data)
return Image.open(io.BytesIO(image_data)).convert("RGB")
elif image_source.startswith("http://") or image_source.startswith("https://"):
# URL - fetch image
import httpx
response = httpx.get(image_source, timeout=30.0)
response.raise_for_status()
return Image.open(io.BytesIO(response.content)).convert("RGB")
else:
# Assume it's a file path or raw base64
try:
image_data = base64.b64decode(image_source)
return Image.open(io.BytesIO(image_data)).convert("RGB")
    except Exception:
        # Fall back to treating the string as a file path
return Image.open(image_source).convert("RGB")
def extract_image_and_text(content: Union[str, List[ContentItem]]) -> tuple:
"""Extract image and text prompt from message content"""
if isinstance(content, str):
return None, content
image = None
text = ""
for item in content:
if item.type == "image_url" and item.image_url:
image = decode_image(item.image_url.url)
elif item.type == "text" and item.text:
text = item.text
return image, text
def generate_response(image: Image.Image, prompt: str, max_tokens: int = 4096) -> str:
"""Generate response using PaddleOCR-VL"""
load_model()
messages = [
{
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": prompt},
]
}
]
inputs = processor.apply_chat_template(
messages,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
return_tensors="pt"
)
if DEVICE == "cuda":
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
with torch.inference_mode():
outputs = model.generate(
**inputs,
max_new_tokens=max_tokens,
do_sample=False,
use_cache=True
)
response = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    # Crude extraction of the assistant turn: the decoded output still
    # contains the chat template, so keep everything after the last
    # "assistant" marker. Match case-insensitively; splitting on the
    # literal lowercase string could miss a capitalized "Assistant".
    lowered = response.lower()
    marker = "assistant"
    if marker in lowered:
        response = response[lowered.rfind(marker) + len(marker):].strip()
return response
@app.on_event("startup")
async def startup_event():
"""Pre-load the model on startup"""
logger.info("Pre-loading PaddleOCR-VL model...")
try:
load_model()
logger.info("Model pre-loaded successfully")
except Exception as e:
logger.error(f"Failed to pre-load model: {e}")
# Don't fail startup - model will be loaded on first request
@app.get("/health", response_model=HealthResponse)
async def health_check():
"""Health check endpoint"""
return HealthResponse(
status="healthy" if model is not None else "loading",
model=MODEL_NAME,
device=DEVICE
)
@app.get("/v1/models")
async def list_models():
"""List available models (OpenAI-compatible)"""
return {
"object": "list",
"data": [
{
"id": "paddleocr-vl",
"object": "model",
"created": int(time.time()),
"owned_by": "paddlepaddle"
}
]
}
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completions(request: ChatCompletionRequest):
"""
OpenAI-compatible chat completions endpoint for PaddleOCR-VL
Supports tasks:
- "OCR:" - Text recognition
- "Table Recognition:" - Table extraction
- "Formula Recognition:" - Formula extraction
- "Chart Recognition:" - Chart extraction
"""
try:
# Get the last user message
user_message = None
for msg in reversed(request.messages):
if msg.role == "user":
user_message = msg
break
if not user_message:
raise HTTPException(status_code=400, detail="No user message found")
# Extract image and prompt
image, prompt = extract_image_and_text(user_message.content)
if image is None:
raise HTTPException(status_code=400, detail="No image provided in message")
# Default to OCR if no specific prompt
if not prompt or prompt.strip() == "":
prompt = "OCR:"
logger.info(f"Processing request with prompt: {prompt[:50]}...")
# Generate response
start_time = time.time()
response_text = generate_response(image, prompt, request.max_tokens or 4096)
elapsed = time.time() - start_time
logger.info(f"Generated response in {elapsed:.2f}s ({len(response_text)} chars)")
# Build OpenAI-compatible response
return ChatCompletionResponse(
id=f"chatcmpl-{int(time.time()*1000)}",
created=int(time.time()),
model=request.model,
choices=[
Choice(
index=0,
message=Message(role="assistant", content=response_text),
finish_reason="stop"
)
],
            usage=Usage(
                prompt_tokens=100,  # rough placeholder; true prompt length is not surfaced
                completion_tokens=len(response_text) // 4,  # ~4 chars per token heuristic
                total_tokens=100 + len(response_text) // 4
            )
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error processing request: {e}")
raise HTTPException(status_code=500, detail=str(e))
# Legacy endpoint for compatibility with old PaddleOCR API
class LegacyOCRRequest(BaseModel):
image: str
task: Optional[str] = "ocr"
class LegacyOCRResponse(BaseModel):
success: bool
result: str
task: str
error: Optional[str] = None
@app.post("/ocr", response_model=LegacyOCRResponse)
async def legacy_ocr(request: LegacyOCRRequest):
"""
Legacy OCR endpoint for backwards compatibility
Tasks: ocr, table, formula, chart
"""
try:
image = decode_image(request.image)
prompt = TASK_PROMPTS.get(request.task, TASK_PROMPTS["ocr"])
result = generate_response(image, prompt)
return LegacyOCRResponse(
success=True,
result=result,
            task=request.task or "ocr"
)
except Exception as e:
logger.error(f"Legacy OCR error: {e}")
return LegacyOCRResponse(
success=False,
result="",
            task=request.task or "ocr",
error=str(e)
)
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT)
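
The legacy endpoint keeps existing callers working while exposing the new task prompts. A sketch of a task-selecting call; assumes the server on its default port 8000 and a placeholder image path:

import base64
import requests

with open("table.png", "rb") as f:  # placeholder path
    image_b64 = base64.b64encode(f.read()).decode()

# task is one of: ocr, table, formula, chart (mapped via TASK_PROMPTS above)
resp = requests.post("http://localhost:8000/ocr",
                     json={"image": image_b64, "task": "table"},
                     timeout=300)
resp.raise_for_status()
body = resp.json()
print(body["result"] if body["success"] else body["error"])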