#!/usr/bin/env python3
"""
PaddleOCR-VL FastAPI Server (CPU variant)

Provides an OpenAI-compatible REST API for document parsing using PaddleOCR-VL.
"""

import os
import io
import base64
import logging
import time
from typing import Optional, List, Any, Dict, Union

from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import torch
from PIL import Image

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Environment configuration
SERVER_HOST = os.environ.get('SERVER_HOST', '0.0.0.0')
SERVER_PORT = int(os.environ.get('SERVER_PORT', '8000'))
MODEL_NAME = os.environ.get('MODEL_NAME', 'PaddlePaddle/PaddleOCR-VL')

# Device configuration
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {DEVICE}")

# Task prompts for PaddleOCR-VL
TASK_PROMPTS = {
    "ocr": "OCR:",
    "table": "Table Recognition:",
    "formula": "Formula Recognition:",
    "chart": "Chart Recognition:",
}

# Initialize FastAPI app
app = FastAPI(
    title="PaddleOCR-VL Server",
    description="OpenAI-compatible REST API for document parsing using PaddleOCR-VL",
    version="1.0.0"
)

# Global model instances
model = None
processor = None


# Request/Response models (OpenAI-compatible)
class ImageUrl(BaseModel):
    url: str


class ContentItem(BaseModel):
    type: str
    text: Optional[str] = None
    image_url: Optional[ImageUrl] = None


class Message(BaseModel):
    role: str
    content: Union[str, List[ContentItem]]


class ChatCompletionRequest(BaseModel):
    model: str = "paddleocr-vl"
    messages: List[Message]
    temperature: Optional[float] = 0.0
    max_tokens: Optional[int] = 4096


class Choice(BaseModel):
    index: int
    message: Message
    finish_reason: str


class Usage(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int


class ChatCompletionResponse(BaseModel):
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[Choice]
    usage: Usage


class HealthResponse(BaseModel):
    status: str
    model: str
    device: str


def load_model():
    """Load the PaddleOCR-VL model and processor"""
    global model, processor

    if model is not None:
        return

    logger.info(f"Loading PaddleOCR-VL model: {MODEL_NAME}")

    from transformers import AutoModelForCausalLM, AutoProcessor

    # Load processor
    processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)

    # Load model with appropriate settings for CPU/GPU
    if DEVICE == "cuda":
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
        ).to(DEVICE).eval()
    else:
        # CPU mode - use float32 for compatibility
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
        ).eval()

    logger.info("PaddleOCR-VL model loaded successfully")


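# Illustrative examples of the resolution policy implemented by
# optimize_image_resolution below, using its default thresholds
# (max_size=2048, min_size=1080); the input sizes are hypothetical:
#   3840x2160 (4K)  -> scaled down to roughly 2048x1152
#   640x480 (VGA)   -> scaled up to 1080x810
#   1600x1200       -> left unchanged (already in the optimal range)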
def optimize_image_resolution(image: Image.Image, max_size: int = 2048, min_size: int = 1080) -> Image.Image:
    """
    Optimize image resolution for PaddleOCR-VL.

    Best results are achieved with images in the 1080p-2K range.
    - Images larger than max_size are scaled down
    - Very small images are scaled up to min_size
    """
    width, height = image.size
    max_dim = max(width, height)
    min_dim = min(width, height)

    # Scale down if too large (4K+ images often miss text)
    if max_dim > max_size:
        scale = max_size / max_dim
        new_width = int(width * scale)
        new_height = int(height * scale)
        logger.info(f"Scaling down image from {width}x{height} to {new_width}x{new_height}")
        image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
    # Scale up if too small
    elif max_dim < min_size and min_dim < min_size:
        scale = min_size / max_dim
        new_width = int(width * scale)
        new_height = int(height * scale)
        logger.info(f"Scaling up image from {width}x{height} to {new_width}x{new_height}")
        image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
    else:
        logger.info(f"Image size {width}x{height} is optimal, no scaling needed")

    return image


def decode_image(image_source: str, optimize: bool = True) -> Image.Image:
    """
    Decode image from various sources.

    Supported formats:
    - Base64 data URL: data:image/png;base64,... or data:image/jpeg;base64,...
    - HTTP/HTTPS URL: https://example.com/image.png
    - Raw base64 string
    - Local file path

    Supported image types: PNG, JPEG, WebP, BMP, GIF, TIFF
    """
    image = None

    if image_source.startswith("data:"):
        # Base64 encoded image with MIME type header
        # Supports: data:image/png;base64,... data:image/jpeg;base64,... etc.
        header, data = image_source.split(",", 1)
        image_data = base64.b64decode(data)
        image = Image.open(io.BytesIO(image_data)).convert("RGB")
        logger.debug(f"Decoded base64 image with header: {header}")
    elif image_source.startswith("http://") or image_source.startswith("https://"):
        # URL - fetch image
        import httpx
        response = httpx.get(image_source, timeout=30.0)
        response.raise_for_status()
        image = Image.open(io.BytesIO(response.content)).convert("RGB")
        logger.debug(f"Fetched image from URL: {image_source[:50]}...")
    else:
        # Assume it's a file path or raw base64
        try:
            image_data = base64.b64decode(image_source)
            image = Image.open(io.BytesIO(image_data)).convert("RGB")
            logger.debug("Decoded raw base64 image")
        except Exception:
            # Try as file path
            image = Image.open(image_source).convert("RGB")
            logger.debug(f"Loaded image from file: {image_source}")

    # Optimize resolution for best OCR results
    if optimize:
        image = optimize_image_resolution(image)

    return image


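# Illustrative calls to decode_image above; the URLs and paths are hypothetical
# placeholders, not real resources:
#   decode_image("data:image/png;base64,iVBORw0KGgo...")  # base64 data URL
#   decode_image("https://example.com/scan.jpg")           # remote image URL
#   decode_image("iVBORw0KGgo...")                         # raw base64 string
#   decode_image("/data/scans/page_001.png")               # local file path

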
def extract_image_and_text(content: Union[str, List[ContentItem]]) -> tuple:
    """Extract image and text prompt from message content"""
    if isinstance(content, str):
        return None, content

    image = None
    text = ""

    for item in content:
        if item.type == "image_url" and item.image_url:
            image = decode_image(item.image_url.url)
        elif item.type == "text" and item.text:
            text = item.text

    return image, text


def generate_response(image: Image.Image, prompt: str, max_tokens: int = 4096) -> str:
    """Generate response using PaddleOCR-VL"""
    load_model()

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ]
        }
    ]

    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    )

    if DEVICE == "cuda":
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,
            use_cache=True
        )

    response = processor.batch_decode(outputs, skip_special_tokens=True)[0]

    # Extract the assistant's response (after the prompt)
    if "assistant" in response.lower():
        parts = response.split("assistant")
        if len(parts) > 1:
            response = parts[-1].strip()

    return response


@app.on_event("startup")
async def startup_event():
    """Pre-load the model on startup"""
    logger.info("Pre-loading PaddleOCR-VL model...")
    try:
        load_model()
        logger.info("Model pre-loaded successfully")
    except Exception as e:
        logger.error(f"Failed to pre-load model: {e}")
        # Don't fail startup - model will be loaded on first request


@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Health check endpoint"""
    return HealthResponse(
        status="healthy" if model is not None else "loading",
        model=MODEL_NAME,
        device=DEVICE
    )


@app.get("/formats")
async def supported_formats():
    """List supported image formats and input methods"""
    return {
        "image_formats": {
            "supported": ["PNG", "JPEG", "WebP", "BMP", "GIF", "TIFF"],
            "recommended": ["PNG", "JPEG"],
            "mime_types": [
                "image/png", "image/jpeg", "image/webp",
                "image/bmp", "image/gif", "image/tiff"
            ]
        },
        "input_methods": {
            "base64_data_url": {
                "description": "Base64 encoded image with MIME type header",
                "example": "data:image/png;base64,iVBORw0KGgo..."
            },
            "http_url": {
                "description": "Direct HTTP/HTTPS URL to image",
                "example": "https://example.com/image.png"
            },
            "raw_base64": {
                "description": "Raw base64 string without header",
                "example": "iVBORw0KGgo..."
            }
        },
        "resolution": {
            "optimal_range": "1080p to 2K (1080-2048 pixels on longest side)",
            "auto_scaling": True,
            "note": "Images are automatically scaled to optimal range. 4K+ images are scaled down for better accuracy."
        },
        "task_prompts": TASK_PROMPTS
    }


@app.get("/v1/models")
async def list_models():
    """List available models (OpenAI-compatible)"""
    return {
        "object": "list",
        "data": [
            {
                "id": "paddleocr-vl",
                "object": "model",
                "created": int(time.time()),
                "owned_by": "paddlepaddle"
            }
        ]
    }


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completions(request: ChatCompletionRequest):
    """
    OpenAI-compatible chat completions endpoint for PaddleOCR-VL

    Supports tasks:
    - "OCR:" - Text recognition
    - "Table Recognition:" - Table extraction
    - "Formula Recognition:" - Formula extraction
    - "Chart Recognition:" - Chart extraction
    """
    try:
        # Get the last user message
        user_message = None
        for msg in reversed(request.messages):
            if msg.role == "user":
                user_message = msg
                break

        if not user_message:
            raise HTTPException(status_code=400, detail="No user message found")

        # Extract image and prompt
        image, prompt = extract_image_and_text(user_message.content)

        if image is None:
            raise HTTPException(status_code=400, detail="No image provided in message")

        # Default to OCR if no specific prompt
        if not prompt or prompt.strip() == "":
            prompt = "OCR:"

        logger.info(f"Processing request with prompt: {prompt[:50]}...")

        # Generate response
        start_time = time.time()
        response_text = generate_response(image, prompt, request.max_tokens or 4096)
        elapsed = time.time() - start_time

        logger.info(f"Generated response in {elapsed:.2f}s ({len(response_text)} chars)")

        # Build OpenAI-compatible response
        return ChatCompletionResponse(
            id=f"chatcmpl-{int(time.time()*1000)}",
            created=int(time.time()),
            model=request.model,
            choices=[
                Choice(
                    index=0,
                    message=Message(role="assistant", content=response_text),
                    finish_reason="stop"
                )
            ],
            usage=Usage(
                prompt_tokens=100,  # Approximate
                completion_tokens=len(response_text) // 4,
                total_tokens=100 + len(response_text) // 4
            )
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error processing request: {e}")
        raise HTTPException(status_code=500, detail=str(e))


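# Example client request against /v1/chat/completions (a sketch; assumes the
# server is reachable at http://localhost:8000 and that "page.png" is a local
# image file - both are placeholder values):
#
#   import base64
#   import httpx
#
#   with open("page.png", "rb") as f:
#       b64 = base64.b64encode(f.read()).decode()
#   payload = {
#       "model": "paddleocr-vl",
#       "messages": [{
#           "role": "user",
#           "content": [
#               {"type": "text", "text": "Table Recognition:"},
#               {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}},
#           ],
#       }],
#   }
#   r = httpx.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=120.0)
#   print(r.json()["choices"][0]["message"]["content"])
#
# The generous timeout allows for slow CPU-only generation.

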
# Legacy endpoint for compatibility with old PaddleOCR API
class LegacyOCRRequest(BaseModel):
    image: str
    task: Optional[str] = "ocr"


class LegacyOCRResponse(BaseModel):
    success: bool
    result: str
    task: str
    error: Optional[str] = None


@app.post("/ocr", response_model=LegacyOCRResponse)
async def legacy_ocr(request: LegacyOCRRequest):
    """
    Legacy OCR endpoint for backwards compatibility

    Tasks: ocr, table, formula, chart
    """
    try:
        image = decode_image(request.image)
        prompt = TASK_PROMPTS.get(request.task, TASK_PROMPTS["ocr"])
        result = generate_response(image, prompt)

        return LegacyOCRResponse(
            success=True,
            result=result,
            task=request.task
        )
    except Exception as e:
        logger.error(f"Legacy OCR error: {e}")
        return LegacyOCRResponse(
            success=False,
            result="",
            task=request.task,
            error=str(e)
        )


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT)

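# Example client request against the legacy /ocr endpoint (a sketch; assumes the
# server is reachable at http://localhost:8000 and that "receipt.jpg" is a local
# image file - both are placeholder values):
#
#   import base64
#   import httpx
#
#   with open("receipt.jpg", "rb") as f:
#       b64 = base64.b64encode(f.read()).decode()
#   r = httpx.post("http://localhost:8000/ocr", json={"image": b64, "task": "table"}, timeout=120.0)
#   print(r.json()["result"])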