2026-01-16 16:21:44 +00:00
parent 3c5cf578a5
commit 15ac1fcf67
13 changed files with 873 additions and 805 deletions

View File

@@ -1,25 +0,0 @@
#!/bin/bash
set -e
# Configuration from environment
OCR_LANGUAGE="${OCR_LANGUAGE:-en}"
SERVER_PORT="${SERVER_PORT:-5000}"
SERVER_HOST="${SERVER_HOST:-0.0.0.0}"
echo "Starting PaddleOCR Server..."
echo " Language: ${OCR_LANGUAGE}"
echo " Host: ${SERVER_HOST}"
echo " Port: ${SERVER_PORT}"
# Check GPU availability
if [ "${CUDA_VISIBLE_DEVICES}" = "-1" ]; then
echo " GPU: Disabled (CPU mode)"
else
echo " GPU: Enabled"
fi
# Start the FastAPI server with uvicorn
exec python -m uvicorn paddleocr_server:app \
--host "${SERVER_HOST}" \
--port "${SERVER_PORT}" \
--workers 1

View File

@@ -0,0 +1,19 @@
#!/bin/bash
set -e
echo "==================================="
echo "PaddleOCR-VL Server (CPU)"
echo "==================================="
HOST="${SERVER_HOST:-0.0.0.0}"
PORT="${SERVER_PORT:-8000}"
echo "Host: ${HOST}"
echo "Port: ${PORT}"
echo "Device: CPU (no GPU)"
echo ""
echo "Starting PaddleOCR-VL CPU server..."
echo "==================================="
exec python /app/paddleocr_vl_server.py
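
Once the container is running, a quick way to confirm the server came up is to poll its /health endpoint. A minimal sketch with the requests library, assuming the default port 8000 from the script above; the status field comes from the server code further down:

import requests

# Poll the health endpoint; status is "healthy" once the model has loaded,
# "loading" otherwise (see the HealthResponse handler in the server file).
resp = requests.get("http://localhost:8000/health", timeout=10)
resp.raise_for_status()
print(resp.json())  # e.g. {"status": "healthy", "model": "...", "device": "cpu"}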

View File

@@ -0,0 +1,43 @@
#!/bin/bash
set -e
echo "==================================="
echo "PaddleOCR-VL Server"
echo "==================================="
# Configuration
MODEL_NAME="${MODEL_NAME:-PaddlePaddle/PaddleOCR-VL}"
HOST="${HOST:-0.0.0.0}"
PORT="${PORT:-8000}"
MAX_BATCHED_TOKENS="${MAX_BATCHED_TOKENS:-16384}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.9}"
echo "Model: ${MODEL_NAME}"
echo "Host: ${HOST}"
echo "Port: ${PORT}"
echo "Max batched tokens: ${MAX_BATCHED_TOKENS}"
echo "GPU memory utilization: ${GPU_MEMORY_UTILIZATION}"
echo ""
# Check GPU availability
if command -v nvidia-smi &> /dev/null; then
echo "GPU Information:"
nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv
echo ""
else
echo "WARNING: nvidia-smi not found. GPU may not be available."
fi
echo "Starting vLLM server..."
echo "==================================="
# Start vLLM server with PaddleOCR-VL
exec vllm serve "${MODEL_NAME}" \
--trust-remote-code \
--host "${HOST}" \
--port "${PORT}" \
--max-num-batched-tokens "${MAX_BATCHED_TOKENS}" \
--gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}" \
--no-enable-prefix-caching \
--mm-processor-cache-gb 0 \
--served-model-name "paddleocr-vl"
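
Once this server is up, any OpenAI-compatible client can drive it. A minimal sketch using the requests library; the endpoint path and the "paddleocr-vl" model name come from the flags above, while the image path is a placeholder:

import base64
import requests

# Read a local image and wrap it as a data URL (placeholder path).
with open("page.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode()

payload = {
    "model": "paddleocr-vl",  # matches --served-model-name above
    "messages": [{
        "role": "user",
        "content": [
            {"type": "image_url",
             "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
            {"type": "text", "text": "OCR:"},  # PaddleOCR-VL task prompt
        ],
    }],
}
resp = requests.post("http://localhost:8000/v1/chat/completions",
                     json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])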

View File

@@ -1,253 +0,0 @@
#!/usr/bin/env python3
"""
PaddleOCR FastAPI Server
Provides a REST API for OCR operations using PaddleOCR
"""
import os
import io
import base64
import logging
from typing import Optional, List, Any
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import numpy as np
from PIL import Image
from paddleocr import PaddleOCR
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Environment configuration
OCR_LANGUAGE = os.environ.get('OCR_LANGUAGE', 'en')
# GPU is controlled via CUDA_VISIBLE_DEVICES environment variable
USE_GPU = os.environ.get('CUDA_VISIBLE_DEVICES', '') != '-1'
# Initialize FastAPI app
app = FastAPI(
title="PaddleOCR Server",
description="REST API for OCR operations using PaddleOCR PP-OCRv4",
version="1.0.0"
)
# Global OCR instance
ocr_instance: Optional[PaddleOCR] = None
class OCRRequest(BaseModel):
"""Request model for base64 image OCR"""
image: str
language: Optional[str] = None
class BoundingBox(BaseModel):
"""Bounding box for detected text"""
points: List[List[float]]
class OCRResult(BaseModel):
"""Single OCR detection result"""
text: str
confidence: float
box: List[List[float]]
class OCRResponse(BaseModel):
"""OCR response model"""
success: bool
results: List[OCRResult]
error: Optional[str] = None
class HealthResponse(BaseModel):
"""Health check response"""
status: str
model: str
language: str
gpu_enabled: bool
def get_ocr(lang: Optional[str] = None) -> PaddleOCR:
"""Get or initialize the OCR instance"""
global ocr_instance
use_lang = lang or OCR_LANGUAGE
# Return cached instance if same language
if ocr_instance is not None and lang is None:
return ocr_instance
logger.info(f"Initializing PaddleOCR with language={use_lang}, use_gpu={USE_GPU}")
new_ocr = PaddleOCR(
use_angle_cls=True,
lang=use_lang,
use_gpu=USE_GPU,
show_log=False
)
# Cache the default language instance
if lang is None:
ocr_instance = new_ocr
logger.info("PaddleOCR initialized successfully")
return new_ocr
def decode_base64_image(base64_string: str) -> np.ndarray:
"""Decode base64 string to numpy array"""
# Remove data URL prefix if present
if ',' in base64_string:
base64_string = base64_string.split(',')[1]
image_data = base64.b64decode(base64_string)
image = Image.open(io.BytesIO(image_data))
# Convert to RGB if necessary
if image.mode != 'RGB':
image = image.convert('RGB')
return np.array(image)
def process_ocr_result(result: Any) -> List[OCRResult]:
"""Process PaddleOCR result into structured format"""
results = []
if result is None or len(result) == 0:
return results
# PaddleOCR returns list of results per image
# Each result is a list of [box, (text, confidence)]
for line in result[0] if result[0] else []:
if line is None:
continue
box = line[0] # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
text_info = line[1] # (text, confidence)
results.append(OCRResult(
text=text_info[0],
confidence=float(text_info[1]),
box=[[float(p[0]), float(p[1])] for p in box]
))
return results
@app.on_event("startup")
async def startup_event():
"""Pre-warm the OCR model on startup"""
logger.info("Pre-warming OCR model...")
try:
ocr = get_ocr()
# Create a small test image to warm up the model
test_image = np.zeros((100, 100, 3), dtype=np.uint8)
test_image.fill(255) # White image
ocr.ocr(test_image, cls=True)
logger.info("OCR model pre-warmed successfully")
except Exception as e:
logger.error(f"Failed to pre-warm OCR model: {e}")
@app.get("/health", response_model=HealthResponse)
async def health_check():
"""Health check endpoint"""
try:
# Ensure OCR is initialized
get_ocr()
return HealthResponse(
status="healthy",
model="PP-OCRv4",
language=OCR_LANGUAGE,
gpu_enabled=USE_GPU
)
except Exception as e:
logger.error(f"Health check failed: {e}")
raise HTTPException(status_code=503, detail=str(e))
@app.post("/ocr", response_model=OCRResponse)
async def ocr_base64(request: OCRRequest):
"""
Perform OCR on a base64-encoded image
Args:
request: OCRRequest with base64 image and optional language
Returns:
OCRResponse with detected text, confidence scores, and bounding boxes
"""
try:
# Decode image
image = decode_base64_image(request.image)
# Get OCR instance (use request language if provided)
if request.language and request.language != OCR_LANGUAGE:
ocr = get_ocr(request.language)
else:
ocr = get_ocr()
result = ocr.ocr(image, cls=True)
# Process results
results = process_ocr_result(result)
return OCRResponse(success=True, results=results)
except Exception as e:
logger.error(f"OCR processing failed: {e}")
return OCRResponse(success=False, results=[], error=str(e))
@app.post("/ocr/upload", response_model=OCRResponse)
async def ocr_upload(
img: UploadFile = File(...),
language: Optional[str] = Form(None)
):
"""
Perform OCR on an uploaded image file
Args:
img: Uploaded image file
language: Optional language code (default: env OCR_LANGUAGE)
Returns:
OCRResponse with detected text, confidence scores, and bounding boxes
"""
try:
# Read image
contents = await img.read()
image = Image.open(io.BytesIO(contents))
# Convert to RGB if necessary
if image.mode != 'RGB':
image = image.convert('RGB')
image_array = np.array(image)
# Get OCR instance
if language and language != OCR_LANGUAGE:
ocr = get_ocr(language)
else:
ocr = get_ocr()
result = ocr.ocr(image_array, cls=True)
# Process results
results = process_ocr_result(result)
return OCRResponse(success=True, results=results)
except Exception as e:
logger.error(f"OCR processing failed: {e}")
return OCRResponse(success=False, results=[], error=str(e))
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=5000)
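
For reference, a minimal client for this (now removed) endpoint; a sketch only, assuming the server above on its default port 5000 and a placeholder image path:

import base64
import requests

with open("receipt.png", "rb") as f:  # placeholder path
    image_b64 = base64.b64encode(f.read()).decode()

# The old API took a bare base64 string and returned per-line results
# with text, confidence, and a four-point bounding box.
resp = requests.post("http://localhost:5000/ocr",
                     json={"image": image_b64, "language": "en"},
                     timeout=60)
resp.raise_for_status()
for item in resp.json()["results"]:
    print(f"{item['confidence']:.2f}  {item['text']}")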

View File

@@ -0,0 +1,371 @@
#!/usr/bin/env python3
"""
PaddleOCR-VL FastAPI Server (CPU variant)
Provides an OpenAI-compatible REST API for document parsing using PaddleOCR-VL.
Runs via transformers; defaults to CPU but uses CUDA automatically when available.
"""
import os
import io
import base64
import logging
import time
from typing import Optional, List, Any, Dict, Union
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import torch
from PIL import Image
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Environment configuration
SERVER_HOST = os.environ.get('SERVER_HOST', '0.0.0.0')
SERVER_PORT = int(os.environ.get('SERVER_PORT', '8000'))
MODEL_NAME = os.environ.get('MODEL_NAME', 'PaddlePaddle/PaddleOCR-VL')
# Device configuration
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {DEVICE}")
# Task prompts for PaddleOCR-VL
TASK_PROMPTS = {
"ocr": "OCR:",
"table": "Table Recognition:",
"formula": "Formula Recognition:",
"chart": "Chart Recognition:",
}
# Initialize FastAPI app
app = FastAPI(
title="PaddleOCR-VL Server",
description="OpenAI-compatible REST API for document parsing using PaddleOCR-VL",
version="1.0.0"
)
# Global model instances
model = None
processor = None
# Request/Response models (OpenAI-compatible)
class ImageUrl(BaseModel):
url: str
class ContentItem(BaseModel):
type: str
text: Optional[str] = None
image_url: Optional[ImageUrl] = None
class Message(BaseModel):
role: str
content: Union[str, List[ContentItem]]
class ChatCompletionRequest(BaseModel):
model: str = "paddleocr-vl"
messages: List[Message]
temperature: Optional[float] = 0.0
max_tokens: Optional[int] = 4096
class Choice(BaseModel):
index: int
message: Message
finish_reason: str
class Usage(BaseModel):
prompt_tokens: int
completion_tokens: int
total_tokens: int
class ChatCompletionResponse(BaseModel):
id: str
object: str = "chat.completion"
created: int
model: str
choices: List[Choice]
usage: Usage
class HealthResponse(BaseModel):
status: str
model: str
device: str
def load_model():
"""Load the PaddleOCR-VL model and processor"""
global model, processor
if model is not None:
return
logger.info(f"Loading PaddleOCR-VL model: {MODEL_NAME}")
from transformers import AutoModelForCausalLM, AutoProcessor
# Load processor
processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Load model with appropriate settings for CPU/GPU
if DEVICE == "cuda":
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
trust_remote_code=True,
torch_dtype=torch.bfloat16,
).to(DEVICE).eval()
else:
# CPU mode - use float32 for compatibility
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
trust_remote_code=True,
torch_dtype=torch.float32,
low_cpu_mem_usage=True,
).eval()
logger.info("PaddleOCR-VL model loaded successfully")
def decode_image(image_source: str) -> Image.Image:
"""Decode image from URL or base64"""
if image_source.startswith("data:"):
# Base64 encoded image
header, data = image_source.split(",", 1)
image_data = base64.b64decode(data)
return Image.open(io.BytesIO(image_data)).convert("RGB")
elif image_source.startswith("http://") or image_source.startswith("https://"):
# URL - fetch image
import httpx
response = httpx.get(image_source, timeout=30.0)
response.raise_for_status()
return Image.open(io.BytesIO(response.content)).convert("RGB")
else:
# Assume it's a file path or raw base64
try:
image_data = base64.b64decode(image_source)
return Image.open(io.BytesIO(image_data)).convert("RGB")
    except Exception:
        # Fall back to treating the string as a file path
return Image.open(image_source).convert("RGB")
def extract_image_and_text(content: Union[str, List[ContentItem]]) -> tuple:
"""Extract image and text prompt from message content"""
if isinstance(content, str):
return None, content
image = None
text = ""
for item in content:
if item.type == "image_url" and item.image_url:
image = decode_image(item.image_url.url)
elif item.type == "text" and item.text:
text = item.text
return image, text
def generate_response(image: Image.Image, prompt: str, max_tokens: int = 4096) -> str:
"""Generate response using PaddleOCR-VL"""
load_model()
messages = [
{
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": prompt},
]
}
]
inputs = processor.apply_chat_template(
messages,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
return_tensors="pt"
)
if DEVICE == "cuda":
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
with torch.inference_mode():
outputs = model.generate(
**inputs,
max_new_tokens=max_tokens,
do_sample=False,
use_cache=True
)
response = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    # Crude extraction of the assistant turn: the decoded output still
    # contains the chat template, so keep everything after the last
    # "assistant" marker. Match case-insensitively; splitting on the
    # literal lowercase string could miss a capitalized "Assistant".
    lowered = response.lower()
    marker = "assistant"
    if marker in lowered:
        response = response[lowered.rfind(marker) + len(marker):].strip()
return response
@app.on_event("startup")
async def startup_event():
"""Pre-load the model on startup"""
logger.info("Pre-loading PaddleOCR-VL model...")
try:
load_model()
logger.info("Model pre-loaded successfully")
except Exception as e:
logger.error(f"Failed to pre-load model: {e}")
# Don't fail startup - model will be loaded on first request
@app.get("/health", response_model=HealthResponse)
async def health_check():
"""Health check endpoint"""
return HealthResponse(
status="healthy" if model is not None else "loading",
model=MODEL_NAME,
device=DEVICE
)
@app.get("/v1/models")
async def list_models():
"""List available models (OpenAI-compatible)"""
return {
"object": "list",
"data": [
{
"id": "paddleocr-vl",
"object": "model",
"created": int(time.time()),
"owned_by": "paddlepaddle"
}
]
}
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completions(request: ChatCompletionRequest):
"""
OpenAI-compatible chat completions endpoint for PaddleOCR-VL
Supports tasks:
- "OCR:" - Text recognition
- "Table Recognition:" - Table extraction
- "Formula Recognition:" - Formula extraction
- "Chart Recognition:" - Chart extraction
"""
try:
# Get the last user message
user_message = None
for msg in reversed(request.messages):
if msg.role == "user":
user_message = msg
break
if not user_message:
raise HTTPException(status_code=400, detail="No user message found")
# Extract image and prompt
image, prompt = extract_image_and_text(user_message.content)
if image is None:
raise HTTPException(status_code=400, detail="No image provided in message")
# Default to OCR if no specific prompt
if not prompt or prompt.strip() == "":
prompt = "OCR:"
logger.info(f"Processing request with prompt: {prompt[:50]}...")
# Generate response
start_time = time.time()
response_text = generate_response(image, prompt, request.max_tokens or 4096)
elapsed = time.time() - start_time
logger.info(f"Generated response in {elapsed:.2f}s ({len(response_text)} chars)")
# Build OpenAI-compatible response
return ChatCompletionResponse(
id=f"chatcmpl-{int(time.time()*1000)}",
created=int(time.time()),
model=request.model,
choices=[
Choice(
index=0,
message=Message(role="assistant", content=response_text),
finish_reason="stop"
)
],
            usage=Usage(
                prompt_tokens=100,  # rough placeholder; true prompt length is not surfaced
                completion_tokens=len(response_text) // 4,  # ~4 chars per token heuristic
                total_tokens=100 + len(response_text) // 4
            )
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error processing request: {e}")
raise HTTPException(status_code=500, detail=str(e))
# Legacy endpoint for compatibility with old PaddleOCR API
class LegacyOCRRequest(BaseModel):
image: str
task: Optional[str] = "ocr"
class LegacyOCRResponse(BaseModel):
success: bool
result: str
task: str
error: Optional[str] = None
@app.post("/ocr", response_model=LegacyOCRResponse)
async def legacy_ocr(request: LegacyOCRRequest):
"""
Legacy OCR endpoint for backwards compatibility
Tasks: ocr, table, formula, chart
"""
try:
image = decode_image(request.image)
prompt = TASK_PROMPTS.get(request.task, TASK_PROMPTS["ocr"])
result = generate_response(image, prompt)
return LegacyOCRResponse(
success=True,
result=result,
            task=request.task or "ocr"
)
except Exception as e:
logger.error(f"Legacy OCR error: {e}")
return LegacyOCRResponse(
success=False,
result="",
            task=request.task or "ocr",
error=str(e)
)
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT)
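
The legacy endpoint keeps existing callers working while exposing the new task prompts. A sketch of a task-selecting call; assumes the server on its default port 8000 and a placeholder image path:

import base64
import requests

with open("table.png", "rb") as f:  # placeholder path
    image_b64 = base64.b64encode(f.read()).decode()

# task is one of: ocr, table, formula, chart (mapped via TASK_PROMPTS above)
resp = requests.post("http://localhost:8000/ocr",
                     json={"image": image_b64, "task": "table"},
                     timeout=300)
resp.raise_for_status()
body = resp.json()
print(body["result"] if body["success"] else body["error"])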