feat(ocr): add PaddleOCR GPU Docker image and FastAPI OCR server with entrypoint; implement OCR endpoints and consensus extraction testing
258	image_support_files/paddleocr-server.py	Normal file
@@ -0,0 +1,258 @@
#!/usr/bin/env python3
"""
PaddleOCR FastAPI Server
Provides a REST API for OCR operations using PaddleOCR
"""

import os
import io
import base64
import logging
from typing import Optional, List, Any

from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import numpy as np
from PIL import Image
from paddleocr import PaddleOCR

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Environment configuration
OCR_LANGUAGE = os.environ.get('OCR_LANGUAGE', 'en')
USE_GPU = os.environ.get('CUDA_VISIBLE_DEVICES', '') != '-1'

# Initialize FastAPI app
app = FastAPI(
    title="PaddleOCR Server",
    description="REST API for OCR operations using PaddleOCR PP-OCRv4",
    version="1.0.0"
)

# Global OCR instance
ocr_instance: Optional[PaddleOCR] = None


class OCRRequest(BaseModel):
    """Request model for base64 image OCR"""
    image: str
    language: Optional[str] = None


class BoundingBox(BaseModel):
    """Bounding box for detected text"""
    points: List[List[float]]


class OCRResult(BaseModel):
    """Single OCR detection result"""
    text: str
    confidence: float
    box: List[List[float]]


class OCRResponse(BaseModel):
    """OCR response model"""
    success: bool
    results: List[OCRResult]
    error: Optional[str] = None


class HealthResponse(BaseModel):
    """Health check response"""
    status: str
    model: str
    language: str
    gpu_enabled: bool


def get_ocr() -> PaddleOCR:
    """Get or initialize the OCR instance"""
    global ocr_instance
    if ocr_instance is None:
        logger.info(f"Initializing PaddleOCR with language={OCR_LANGUAGE}, use_gpu={USE_GPU}")
        ocr_instance = PaddleOCR(
            use_angle_cls=True,
            lang=OCR_LANGUAGE,
            use_gpu=USE_GPU,
            show_log=False
        )
        logger.info("PaddleOCR initialized successfully")
    return ocr_instance


def decode_base64_image(base64_string: str) -> np.ndarray:
    """Decode base64 string to numpy array"""
    # Remove data URL prefix if present
    if ',' in base64_string:
        base64_string = base64_string.split(',')[1]

    image_data = base64.b64decode(base64_string)
    image = Image.open(io.BytesIO(image_data))

    # Convert to RGB if necessary
    if image.mode != 'RGB':
        image = image.convert('RGB')

    return np.array(image)


def process_ocr_result(result: Any) -> List[OCRResult]:
    """Process PaddleOCR result into structured format"""
    results = []

    if result is None or len(result) == 0:
        return results

    # PaddleOCR returns a list of results per image
    # Each result is a list of [box, (text, confidence)]
    for line in result[0] if result[0] else []:
        if line is None:
            continue

        box = line[0]  # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
        text_info = line[1]  # (text, confidence)

        results.append(OCRResult(
            text=text_info[0],
            confidence=float(text_info[1]),
            box=[[float(p[0]), float(p[1])] for p in box]
        ))

    return results


@app.on_event("startup")
async def startup_event():
    """Pre-warm the OCR model on startup"""
    logger.info("Pre-warming OCR model...")
    try:
        ocr = get_ocr()
        # Create a small test image to warm up the model
        test_image = np.zeros((100, 100, 3), dtype=np.uint8)
        test_image.fill(255)  # White image
        ocr.ocr(test_image, cls=True)
        logger.info("OCR model pre-warmed successfully")
    except Exception as e:
        logger.error(f"Failed to pre-warm OCR model: {e}")


@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Health check endpoint"""
    try:
        # Ensure OCR is initialized
        get_ocr()
        return HealthResponse(
            status="healthy",
            model="PP-OCRv4",
            language=OCR_LANGUAGE,
            gpu_enabled=USE_GPU
        )
    except Exception as e:
        logger.error(f"Health check failed: {e}")
        raise HTTPException(status_code=503, detail=str(e))


@app.post("/ocr", response_model=OCRResponse)
async def ocr_base64(request: OCRRequest):
    """
    Perform OCR on a base64-encoded image

    Args:
        request: OCRRequest with base64 image and optional language

    Returns:
        OCRResponse with detected text, confidence scores, and bounding boxes
    """
    try:
        # Decode image
        image = decode_base64_image(request.image)

        # Get OCR instance (use request language if provided)
        ocr = get_ocr()

        # If a different language is requested, create a new instance
        if request.language and request.language != OCR_LANGUAGE:
            logger.info(f"Creating OCR instance for language: {request.language}")
            temp_ocr = PaddleOCR(
                use_angle_cls=True,
                lang=request.language,
                use_gpu=USE_GPU,
                show_log=False
            )
            result = temp_ocr.ocr(image, cls=True)
        else:
            result = ocr.ocr(image, cls=True)

        # Process results
        results = process_ocr_result(result)

        return OCRResponse(success=True, results=results)

    except Exception as e:
        logger.error(f"OCR processing failed: {e}")
        return OCRResponse(success=False, results=[], error=str(e))


@app.post("/ocr/upload", response_model=OCRResponse)
async def ocr_upload(
    img: UploadFile = File(...),
    language: Optional[str] = Form(None)
):
    """
    Perform OCR on an uploaded image file

    Args:
        img: Uploaded image file
        language: Optional language code (default: env OCR_LANGUAGE)

    Returns:
        OCRResponse with detected text, confidence scores, and bounding boxes
    """
    try:
        # Read image
        contents = await img.read()
        image = Image.open(io.BytesIO(contents))

        # Convert to RGB if necessary
        if image.mode != 'RGB':
            image = image.convert('RGB')

        image_array = np.array(image)

        # Get OCR instance
        ocr = get_ocr()

        # If a different language is requested, create a new instance
        if language and language != OCR_LANGUAGE:
            logger.info(f"Creating OCR instance for language: {language}")
            temp_ocr = PaddleOCR(
                use_angle_cls=True,
                lang=language,
                use_gpu=USE_GPU,
                show_log=False
            )
            result = temp_ocr.ocr(image_array, cls=True)
        else:
            result = ocr.ocr(image_array, cls=True)

        # Process results
        results = process_ocr_result(result)

        return OCRResponse(success=True, results=results)

    except Exception as e:
        logger.error(f"OCR processing failed: {e}")
        return OCRResponse(success=False, results=[], error=str(e))


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=5000)
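For reference, a minimal sketch of how a client might exercise the new /ocr endpoint once the container is running. The host/port, the sample.png file name, and the use of the requests library are illustrative assumptions and not part of this commit:

#!/usr/bin/env python3
# Hypothetical client for the /ocr endpoint added in this commit.
# Assumes the server is reachable at http://localhost:5000 and that a
# local sample.png exists; adjust both as needed.
import base64
import requests

with open("sample.png", "rb") as f:
    payload = {"image": base64.b64encode(f.read()).decode("ascii")}

resp = requests.post("http://localhost:5000/ocr", json=payload, timeout=120)
resp.raise_for_status()
data = resp.json()

if data["success"]:
    # Print each detected line with its confidence score
    for item in data["results"]:
        print(f"{item['confidence']:.2f}  {item['text']}")
else:
    print("OCR failed:", data["error"])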