update

2026-01-16 16:21:44 +00:00
parent 3c5cf578a5
commit 15ac1fcf67
13 changed files with 873 additions and 805 deletions
--- a/image_support_files/paddleocr_vl_server.py
+++ b/image_support_files/paddleocr_vl_server.py
@@ -0,0 +1,371 @@
+#!/usr/bin/env python3
+"""
+PaddleOCR-VL FastAPI Server (CPU variant)
+Provides OpenAI-compatible REST API for document parsing using PaddleOCR-VL
+"""
+
+import os
+import io
+import base64
+import logging
+import time
+from typing import Optional, List, Any, Dict, Union
+
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
+import torch
+from PIL import Image
+
+# Logging: timestamped records at INFO level and above
+logging.basicConfig(
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
+# Server settings, each overridable through an environment variable
+SERVER_HOST = os.getenv("SERVER_HOST", "0.0.0.0")
+SERVER_PORT = int(os.getenv("SERVER_PORT", "8000"))
+MODEL_NAME = os.getenv("MODEL_NAME", "PaddlePaddle/PaddleOCR-VL")
+
+# Run on the GPU when torch can see one, otherwise fall back to the CPU
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+logger.info(f"Using device: {DEVICE}")
+
+# Legacy task name -> prompt string handed to PaddleOCR-VL
+TASK_PROMPTS = dict(
+    ocr="OCR:",
+    table="Table Recognition:",
+    formula="Formula Recognition:",
+    chart="Chart Recognition:",
+)
+
+# The FastAPI application object that all routes below attach to
+app = FastAPI(
+    title="PaddleOCR-VL Server",
+    description="OpenAI-compatible REST API for document parsing using PaddleOCR-VL",
+    version="1.0.0",
+)
+
+# Module-wide model/processor handles; filled in by load_model()
+model = None
+processor = None
+
+
+# Request/Response models (OpenAI-compatible)
+# Wrapper object for the "image_url" field of a message content part.
+class ImageUrl(BaseModel):
+    # Image reference; it is passed unchanged to decode_image(), which accepts
+    # a data: URI, an http(s) URL, raw base64, or a file path.
+    url: str
+
+
+# One part of a multi-part message body.
+class ContentItem(BaseModel):
+    # Part discriminator; extract_image_and_text() only acts on "image_url"
+    # and "text" and ignores any other value.
+    type: str
+    # Prompt text, read when type == "text".
+    text: Optional[str] = None
+    # Image reference, read when type == "image_url".
+    image_url: Optional[ImageUrl] = None
+
+
+# A single chat message, used for both incoming requests and the reply.
+class Message(BaseModel):
+    # Speaker role; the chat endpoint looks for "user" and replies as "assistant".
+    role: str
+    # Either a plain string or a list of typed content parts.
+    content: Union[str, List[ContentItem]]
+
+
+# Request body for POST /v1/chat/completions.
+class ChatCompletionRequest(BaseModel):
+    # Model name; echoed back in the response.
+    model: str = "paddleocr-vl"
+    # Conversation; only the last message with role "user" is processed.
+    messages: List[Message]
+    # NOTE(review): accepted for API compatibility, but no code in this file
+    # reads it (generation always runs with do_sample=False).
+    temperature: Optional[float] = 0.0
+    # Upper bound on newly generated tokens; a falsy value falls back to 4096.
+    max_tokens: Optional[int] = 4096
+
+
+# One completion candidate inside a chat completion response.
+class Choice(BaseModel):
+    # Position of this candidate; the chat endpoint always emits index 0.
+    index: int
+    # The assistant message holding the generated text.
+    message: Message
+    # Why generation ended; the chat endpoint always reports "stop".
+    finish_reason: str
+
+
+# Token accounting block of a chat completion response.
+# The chat endpoint fills these with rough estimates, not tokenizer counts.
+class Usage(BaseModel):
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+
+
+# Response body for POST /v1/chat/completions.
+class ChatCompletionResponse(BaseModel):
+    # Response identifier; the endpoint builds it as "chatcmpl-<ms timestamp>".
+    id: str
+    # Fixed object tag for this payload type.
+    object: str = "chat.completion"
+    # Creation time in whole seconds since the epoch.
+    created: int
+    # Model name copied from the request.
+    model: str
+    # Generated candidates; the endpoint returns exactly one.
+    choices: List[Choice]
+    # Estimated token usage.
+    usage: Usage
+
+
+# Response body for GET /health.
+class HealthResponse(BaseModel):
+    # "healthy" once the model is loaded, otherwise "loading".
+    status: str
+    # Configured model identifier (MODEL_NAME).
+    model: str
+    # Device selected at import time: "cuda" or "cpu".
+    device: str
+
+
+def load_model():
+    """Populate the module-level ``model`` and ``processor`` once.
+
+    Returns immediately when a model is already loaded. On CUDA the weights
+    are loaded as bfloat16 and moved to the GPU; on CPU they are loaded as
+    float32 with ``low_cpu_mem_usage`` enabled. The model is put in eval mode
+    in both cases.
+    """
+    global model, processor
+
+    # Already initialised: nothing to do.
+    if model is not None:
+        return
+
+    logger.info(f"Loading PaddleOCR-VL model: {MODEL_NAME}")
+
+    # Imported lazily so that importing this module stays cheap.
+    from transformers import AutoModelForCausalLM, AutoProcessor
+
+    processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
+
+    on_gpu = DEVICE == "cuda"
+
+    # Collect the device-specific loading options in one place.
+    load_kwargs = {"trust_remote_code": True}
+    if on_gpu:
+        load_kwargs["torch_dtype"] = torch.bfloat16
+    else:
+        # float32 is used on CPU for compatibility.
+        load_kwargs["torch_dtype"] = torch.float32
+        load_kwargs["low_cpu_mem_usage"] = True
+
+    loaded = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_kwargs)
+    if on_gpu:
+        loaded = loaded.to(DEVICE)
+
+    # Publish the global only after the model is fully prepared.
+    model = loaded.eval()
+
+    logger.info("PaddleOCR-VL model loaded successfully")
+
+
+def decode_image(image_source: str) -> Image.Image:
+    """Decode an image given as a data URI, HTTP(S) URL, raw base64, or file path.
+
+    Args:
+        image_source: One of
+            * a ``data:`` URI whose payload after the first comma is base64,
+            * an ``http://`` or ``https://`` URL to download,
+            * a bare base64 string,
+            * a path to an image file on the server.
+
+    Returns:
+        The decoded image converted to RGB.
+
+    Raises:
+        ValueError: If a ``data:`` URI contains no comma, or its payload is
+            not valid base64.
+        Exception: Errors from the HTTP fetch or from PIL propagate unchanged.
+    """
+    def _from_bytes(raw: bytes) -> Image.Image:
+        # Decode raw encoded image bytes into an RGB image.
+        return Image.open(io.BytesIO(raw)).convert("RGB")
+
+    if image_source.startswith("data:"):
+        # "data:<mime>;base64,<payload>" - only the payload is needed.
+        _header, separator, payload = image_source.partition(",")
+        if not separator:
+            raise ValueError("Malformed data URI: missing ',' before the payload")
+        return _from_bytes(base64.b64decode(payload))
+
+    if image_source.startswith(("http://", "https://")):
+        import httpx
+        # NOTE(security): this fetches a caller-supplied URL from the server's
+        # network position (SSRF risk). Restrict allowed hosts if this service
+        # is reachable by untrusted clients.
+        response = httpx.get(image_source, timeout=30.0)
+        response.raise_for_status()
+        return _from_bytes(response.content)
+
+    # Neither a data URI nor a URL: try raw base64 first, then a file path.
+    try:
+        return _from_bytes(base64.b64decode(image_source))
+    except Exception:
+        # Deliberately broad (but no longer a bare ``except:``, which also
+        # swallowed KeyboardInterrupt/SystemExit): any decoding failure means
+        # "not base64 image data", so fall back to treating it as a path.
+        # NOTE(security): this opens a caller-supplied path on the server's
+        # filesystem; disable this fallback for untrusted callers.
+        return Image.open(image_source).convert("RGB")
+
+
+def extract_image_and_text(content: Union[str, List[ContentItem]]) -> tuple:
+    """Split message content into an ``(image, text)`` pair.
+
+    A plain string yields ``(None, content)``. For a list of parts, the last
+    "image_url" part with a URL supplies the image and the last non-empty
+    "text" part supplies the prompt; parts of any other type are ignored.
+    Missing pieces come back as ``None`` (image) and ``""`` (text).
+    """
+    # Plain-string content carries a prompt only.
+    if isinstance(content, str):
+        return None, content
+
+    found_image, found_text = None, ""
+    for part in content:
+        if part.type == "image_url" and part.image_url:
+            found_image = decode_image(part.image_url.url)
+            continue
+        if part.type == "text" and part.text:
+            found_text = part.text
+
+    return found_image, found_text
+
+
+def generate_response(image: Image.Image, prompt: str, max_tokens: int = 4096) -> str:
+    """Run PaddleOCR-VL on one image and return the generated text.
+
+    Args:
+        image: The image to analyse.
+        prompt: Task prompt placed after the image in the user turn
+            (for example ``"OCR:"``).
+        max_tokens: Maximum number of new tokens to generate.
+
+    Returns:
+        The model's completion with special tokens removed and surrounding
+        whitespace stripped. The prompt is not included.
+    """
+    load_model()
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": prompt},
+            ]
+        }
+    ]
+
+    inputs = processor.apply_chat_template(
+        messages,
+        tokenize=True,
+        add_generation_prompt=True,
+        return_dict=True,
+        return_tensors="pt"
+    )
+
+    if DEVICE == "cuda":
+        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+
+    # Remember how long the prompt is so it can be cut off the output below.
+    prompt_length = inputs["input_ids"].shape[-1]
+
+    with torch.inference_mode():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=max_tokens,
+            do_sample=False,
+            use_cache=True
+        )
+
+    # The model is loaded as a causal LM, so generate() returns the prompt
+    # tokens followed by the completion. Slice the prompt off by length
+    # instead of splitting the decoded text on the word "assistant": that
+    # approach truncated any recognised text that itself contained the word.
+    completion_ids = outputs[:, prompt_length:]
+    response = processor.batch_decode(completion_ids, skip_special_tokens=True)[0]
+
+    return response.strip()
+
+
+@app.on_event("startup")
+async def startup_event():
+    """Warm up the model when the application starts.
+
+    A loading failure is logged and swallowed so the server still comes up;
+    the next call to load_model() (on the first request) retries.
+    """
+    logger.info("Pre-loading PaddleOCR-VL model...")
+    try:
+        load_model()
+    except Exception as e:
+        # Deliberately non-fatal: startup continues without a model.
+        logger.error(f"Failed to pre-load model: {e}")
+    else:
+        logger.info("Model pre-loaded successfully")
+
+
+@app.get("/health", response_model=HealthResponse)
+async def health_check():
+    """Report whether the model is loaded, plus the model name and device."""
+    model_ready = model is not None
+    status = "healthy" if model_ready else "loading"
+    return HealthResponse(status=status, model=MODEL_NAME, device=DEVICE)
+
+
+@app.get("/v1/models")
+async def list_models():
+    """Return the OpenAI-style model list, which holds the single served model."""
+    # "created" is simply the time of this call.
+    model_entry = {
+        "id": "paddleocr-vl",
+        "object": "model",
+        "created": int(time.time()),
+        "owned_by": "paddlepaddle",
+    }
+    return {"object": "list", "data": [model_entry]}
+
+
+@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
+async def chat_completions(request: ChatCompletionRequest):
+    """
+    OpenAI-compatible chat completions endpoint for PaddleOCR-VL
+
+    Only the last message with role "user" is processed; it must contain an
+    image. An empty prompt defaults to "OCR:".
+
+    Supports tasks:
+    - "OCR:" - Text recognition
+    - "Table Recognition:" - Table extraction
+    - "Formula Recognition:" - Formula extraction
+    - "Chart Recognition:" - Chart extraction
+
+    Raises:
+        HTTPException 400: no user message, no image, or an image that cannot
+            be fetched/decoded.
+        HTTPException 500: any failure during generation.
+    """
+    try:
+        # Get the last user message
+        user_message = next(
+            (msg for msg in reversed(request.messages) if msg.role == "user"),
+            None,
+        )
+        if user_message is None:
+            raise HTTPException(status_code=400, detail="No user message found")
+
+        # Extract image and prompt. A failure here is caused by the image the
+        # client sent, so report it as a client error rather than a 500.
+        try:
+            image, prompt = extract_image_and_text(user_message.content)
+        except Exception as e:
+            logger.warning(f"Rejected request with undecodable image: {e}")
+            raise HTTPException(status_code=400, detail=f"Invalid image: {e}") from e
+
+        if image is None:
+            raise HTTPException(status_code=400, detail="No image provided in message")
+
+        # Default to OCR if no specific prompt
+        if not prompt or not prompt.strip():
+            prompt = "OCR:"
+
+        logger.info(f"Processing request with prompt: {prompt[:50]}...")
+
+        # Generate response
+        # NOTE(review): generate_response() is synchronous, so the event loop
+        # (including /health) is blocked for the duration of the inference.
+        start_time = time.time()
+        response_text = generate_response(image, prompt, request.max_tokens or 4096)
+        elapsed = time.time() - start_time
+
+        logger.info(f"Generated response in {elapsed:.2f}s ({len(response_text)} chars)")
+
+        # Rough usage figures: a fixed prompt estimate and ~4 chars per token.
+        prompt_estimate = 100
+        completion_estimate = len(response_text) // 4
+
+        # Build OpenAI-compatible response
+        return ChatCompletionResponse(
+            id=f"chatcmpl-{int(time.time()*1000)}",
+            created=int(time.time()),
+            model=request.model,
+            choices=[
+                Choice(
+                    index=0,
+                    message=Message(role="assistant", content=response_text),
+                    finish_reason="stop"
+                )
+            ],
+            usage=Usage(
+                prompt_tokens=prompt_estimate,
+                completion_tokens=completion_estimate,
+                total_tokens=prompt_estimate + completion_estimate
+            )
+        )
+
+    except HTTPException:
+        # Already carries the intended status code; pass it through untouched.
+        raise
+    except Exception as e:
+        # logger.exception keeps the traceback, which logger.error dropped.
+        logger.exception(f"Error processing request: {e}")
+        raise HTTPException(status_code=500, detail=str(e)) from e
+
+
+# Legacy endpoint for compatibility with old PaddleOCR API
+# Request body for POST /ocr.
+class LegacyOCRRequest(BaseModel):
+    # Image reference in any form decode_image() accepts
+    # (data: URI, http(s) URL, raw base64, or file path).
+    image: str
+    # Key into TASK_PROMPTS; unknown values fall back to the "ocr" prompt.
+    task: Optional[str] = "ocr"
+
+
+# Response body for POST /ocr.
+class LegacyOCRResponse(BaseModel):
+    # True when generation succeeded, False when an exception was caught.
+    success: bool
+    # Generated text; an empty string on failure.
+    result: str
+    # Task name copied from the request.
+    task: str
+    # Stringified exception on failure, otherwise None.
+    error: Optional[str] = None
+
+
+@app.post("/ocr", response_model=LegacyOCRResponse)
+async def legacy_ocr(request: LegacyOCRRequest):
+    """
+    Legacy OCR endpoint for backwards compatibility
+
+    Tasks: ocr, table, formula, chart
+
+    Errors are never raised to the client: any failure is reported in the
+    body with ``success=False`` and the message in ``error``.
+    """
+    # ``task`` is Optional, so an explicit null can arrive. The response model
+    # requires a string, and building it with None failed validation in both
+    # the success and the error path; normalise it once up front.
+    task = request.task or "ocr"
+
+    try:
+        image = decode_image(request.image)
+        # Unknown task names fall back to the plain OCR prompt.
+        prompt = TASK_PROMPTS.get(task, TASK_PROMPTS["ocr"])
+
+        result = generate_response(image, prompt)
+
+        return LegacyOCRResponse(
+            success=True,
+            result=result,
+            task=task
+        )
+    except Exception as e:
+        # logger.exception keeps the traceback, which logger.error dropped.
+        logger.exception(f"Legacy OCR error: {e}")
+        return LegacyOCRResponse(
+            success=False,
+            result="",
+            task=task,
+            error=str(e)
+        )
+
+
+# Script entry point: serve the app with uvicorn on the configured host/port.
+if __name__ == "__main__":
+    # Imported here so uvicorn is only needed when run as a script.
+    import uvicorn
+    uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT)