# ht-docker-ai/image_support_files/paddleocr_server.py

#!/usr/bin/env python3
"""
PaddleOCR FastAPI Server
Provides REST API for OCR operations using PaddleOCR
"""

import os
import io
import base64
import logging
from typing import Optional, List, Any

from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import numpy as np
from PIL import Image
from paddleocr import PaddleOCR

# Send INFO-and-above records through the root logger with a timestamped,
# name-tagged line format.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-scoped logger used by every function in this file.
logger = logging.getLogger(__name__)

# Runtime settings, read once from the process environment at import time.
# Language used when a request does not ask for a different one.
OCR_LANGUAGE = os.getenv('OCR_LANGUAGE', 'en')
# GPU use is switched off only when CUDA_VISIBLE_DEVICES is exactly "-1";
# an unset or empty variable leaves it enabled.
USE_GPU = os.getenv('CUDA_VISIBLE_DEVICES', '') != '-1'

# FastAPI application object; the route and startup decorators further down
# in this file register their handlers on it.
app = FastAPI(
    version="1.0.0",
    title="PaddleOCR Server",
    description="REST API for OCR operations using PaddleOCR PP-OCRv4",
)

# Global OCR instance for the default language (OCR_LANGUAGE).
# Starts as None; get_ocr() populates it lazily and hands the same object
# back for later default-language requests.
ocr_instance: Optional[PaddleOCR] = None


class OCRRequest(BaseModel):
    """Request model for base64 image OCR"""
    # Base64-encoded image data. If the string contains a comma (as in a
    # data-URL prefix), decode_base64_image() drops the part before it.
    image: str
    # Optional language code. When omitted, empty, or equal to OCR_LANGUAGE
    # the /ocr endpoint uses the default OCR instance.
    language: Optional[str] = None


class BoundingBox(BaseModel):
    """Bounding box for detected text"""
    # Nested list of floats; presumably [x, y] vertex pairs like
    # OCRResult.box - TODO confirm.
    # NOTE(review): nothing visible in this file references this model;
    # OCRResult stores its box as a plain nested list instead. Verify
    # whether external code depends on it.
    points: List[List[float]]


class OCRResult(BaseModel):
    """Single OCR detection result"""
    # Recognized text of one detected line.
    text: str
    # Score PaddleOCR reported for the text, converted to float.
    confidence: float
    # Outline of the detected region as a list of [x, y] float pairs;
    # process_ocr_result() builds it from the box PaddleOCR returns.
    box: List[List[float]]


class OCRResponse(BaseModel):
    """OCR response model"""
    # True when processing finished; False when the endpoint caught an
    # exception (the HTTP response itself is still returned normally).
    success: bool
    # Detections for the submitted image; the endpoints send an empty list
    # on failure.
    results: List[OCRResult]
    # str() of the caught exception on failure; stays None on success.
    error: Optional[str] = None


class HealthResponse(BaseModel):
    """Health check response"""
    # The /health endpoint sends "healthy"; failures are reported as an
    # HTTP 503 error instead of through this model.
    status: str
    # Model label; /health reports the fixed string "PP-OCRv4".
    model: str
    # Default OCR language (value of OCR_LANGUAGE).
    language: str
    # Value of USE_GPU, i.e. whether GPU use is requested from PaddleOCR.
    gpu_enabled: bool


def get_ocr(lang: Optional[str] = None) -> PaddleOCR:
    """Return a PaddleOCR engine for ``lang``, reusing the cached default one.

    The default-language engine is built once and stored in the module-level
    ``ocr_instance``. A request counts as "default" when ``lang`` is ``None``,
    empty, or equal to ``OCR_LANGUAGE``, so explicitly asking for the default
    language no longer builds a second, uncached engine.

    Args:
        lang: Language code to recognise, or ``None`` for ``OCR_LANGUAGE``.

    Returns:
        The shared default-language engine, or, for any other language, a
        newly constructed engine that is not cached.

    Raises:
        Exception: Anything raised by the ``PaddleOCR`` constructor is
            propagated unchanged.
    """
    global ocr_instance
    use_lang = lang or OCR_LANGUAGE
    # Decide on the effective language, not on whether an argument was passed.
    is_default = use_lang == OCR_LANGUAGE

    # Return cached instance if the default language is requested
    if is_default and ocr_instance is not None:
        return ocr_instance

    logger.info(f"Initializing PaddleOCR with language={use_lang}, use_gpu={USE_GPU}")
    new_ocr = PaddleOCR(
        use_angle_cls=True,
        lang=use_lang,
        use_gpu=USE_GPU,
        show_log=False
    )

    # Only the default-language engine is kept; other languages are built
    # per call so the cache cannot grow with caller-supplied language codes.
    if is_default:
        ocr_instance = new_ocr

    logger.info("PaddleOCR initialized successfully")
    return new_ocr


def decode_base64_image(base64_string: str) -> np.ndarray:
    """Turn a base64 payload (optionally data-URL prefixed) into an RGB array."""
    # A comma means a header such as "data:<mime>;base64," precedes the
    # payload; keep only the segment right after the first comma.
    payload = base64_string.split(',')[1] if ',' in base64_string else base64_string

    raw_bytes = base64.b64decode(payload)
    picture = Image.open(io.BytesIO(raw_bytes))

    # Any non-RGB mode is normalised to three-channel RGB.
    rgb_picture = picture if picture.mode == 'RGB' else picture.convert('RGB')
    return np.array(rgb_picture)


def process_ocr_result(result: Any) -> List[OCRResult]:
    """Convert raw PaddleOCR output into a list of ``OCRResult`` models.

    Only the first entry of ``result`` (the first image) is read. Each
    detection in it is expected to look like ``[box, (text, confidence)]``;
    ``None`` detections are skipped. A ``None``/empty ``result`` or a falsy
    first entry yields an empty list.
    """
    if result is None or len(result) == 0:
        return []

    # Detections for the first image; falsy (None or empty) when nothing
    # was found.
    detections = result[0] or []

    return [
        OCRResult(
            text=detection[1][0],
            confidence=float(detection[1][1]),
            box=[[float(point[0]), float(point[1])] for point in detection[0]],
        )
        for detection in detections
        if detection is not None
    ]


@app.on_event("startup")
async def startup_event():
    """Run one throwaway OCR pass at startup so the model is loaded early.

    Any failure is logged and swallowed, so startup continues regardless.
    """
    logger.info("Pre-warming OCR model...")
    try:
        engine = get_ocr()
        # A blank white 100x100 RGB image is used as the warm-up input.
        blank_page = np.full((100, 100, 3), 255, dtype=np.uint8)
        engine.ocr(blank_page, cls=True)
        logger.info("OCR model pre-warmed successfully")
    except Exception as e:
        logger.error(f"Failed to pre-warm OCR model: {e}")


@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Health check endpoint"""
    try:
        # Fetching the OCR instance is the readiness probe: it builds the
        # default instance when none is cached yet, and any failure is
        # turned into a 503 below.
        get_ocr()
        report = HealthResponse(
            gpu_enabled=USE_GPU,
            language=OCR_LANGUAGE,
            model="PP-OCRv4",
            status="healthy",
        )
    except Exception as e:
        logger.error(f"Health check failed: {e}")
        raise HTTPException(status_code=503, detail=str(e))
    return report


@app.post("/ocr", response_model=OCRResponse)
async def ocr_base64(request: OCRRequest):
    """
    Perform OCR on a base64-encoded image

    Args:
        request: OCRRequest with base64 image and optional language

    Returns:
        OCRResponse with detected text, confidence scores, and bounding boxes
    """
    try:
        pixels = decode_base64_image(request.image)

        # A dedicated engine is requested only for a language that differs
        # from the configured default; everything else uses the default one.
        requested_lang = request.language
        use_default = not requested_lang or requested_lang == OCR_LANGUAGE
        engine = get_ocr() if use_default else get_ocr(requested_lang)

        detections = process_ocr_result(engine.ocr(pixels, cls=True))
        return OCRResponse(success=True, results=detections)

    except Exception as e:
        # Failures are reported inside the response body rather than as an
        # HTTP error status.
        logger.error(f"OCR processing failed: {e}")
        return OCRResponse(success=False, results=[], error=str(e))


@app.post("/ocr/upload", response_model=OCRResponse)
async def ocr_upload(
    img: UploadFile = File(...),
    language: Optional[str] = Form(None)
):
    """
    Perform OCR on an uploaded image file

    Args:
        img: Uploaded image file
        language: Optional language code (default: env OCR_LANGUAGE)

    Returns:
        OCRResponse with detected text, confidence scores, and bounding boxes
    """
    try:
        uploaded_bytes = await img.read()
        picture = Image.open(io.BytesIO(uploaded_bytes))

        # Any non-RGB mode is normalised to three-channel RGB before OCR.
        rgb_picture = picture if picture.mode == 'RGB' else picture.convert('RGB')
        pixels = np.array(rgb_picture)

        # A dedicated engine is requested only for a language that differs
        # from the configured default; everything else uses the default one.
        use_default = not language or language == OCR_LANGUAGE
        engine = get_ocr() if use_default else get_ocr(language)

        detections = process_ocr_result(engine.ocr(pixels, cls=True))
        return OCRResponse(success=True, results=detections)

    except Exception as e:
        # Failures are reported inside the response body rather than as an
        # HTTP error status.
        logger.error(f"OCR processing failed: {e}")
        return OCRResponse(success=False, results=[], error=str(e))


if __name__ == "__main__":
    # Direct-execution entry point: serve the app with uvicorn on all
    # interfaces, port 5000.
    from uvicorn import run as serve
    serve(app, host="0.0.0.0", port=5000)