#!/usr/bin/env python3
"""
PaddleOCR FastAPI Server

Provides REST API for OCR operations using PaddleOCR
"""

import base64
import functools
import io
import logging
import os
from typing import Any, List, Optional

import numpy as np
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from paddleocr import PaddleOCR
from PIL import Image
from pydantic import BaseModel

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Environment configuration
OCR_LANGUAGE = os.environ.get('OCR_LANGUAGE', 'en')
# GPU is considered enabled unless CUDA_VISIBLE_DEVICES is explicitly "-1".
USE_GPU = os.environ.get('CUDA_VISIBLE_DEVICES', '') != '-1'

# Upper bound on cached OCR instances for languages other than OCR_LANGUAGE.
# Bounds memory use when clients request many different languages.
_MAX_EXTRA_LANGUAGE_MODELS = 4

# Initialize FastAPI app
app = FastAPI(
    title="PaddleOCR Server",
    description="REST API for OCR operations using PaddleOCR PP-OCRv4",
    version="1.0.0"
)

# Global OCR instance for the default language (OCR_LANGUAGE)
ocr_instance: Optional[PaddleOCR] = None


class OCRRequest(BaseModel):
    """Request model for base64 image OCR"""
    image: str
    language: Optional[str] = None


class BoundingBox(BaseModel):
    """Bounding box for detected text"""
    points: List[List[float]]


class OCRResult(BaseModel):
    """Single OCR detection result"""
    text: str
    confidence: float
    box: List[List[float]]


class OCRResponse(BaseModel):
    """OCR response model"""
    success: bool
    results: List[OCRResult]
    error: Optional[str] = None


class HealthResponse(BaseModel):
    """Health check response"""
    status: str
    model: str
    language: str
    gpu_enabled: bool


def _create_ocr(language: str) -> PaddleOCR:
    """Build a PaddleOCR instance for ``language`` with the server's settings."""
    return PaddleOCR(
        use_angle_cls=True,
        lang=language,
        use_gpu=USE_GPU,
        show_log=False
    )


@functools.lru_cache(maxsize=_MAX_EXTRA_LANGUAGE_MODELS)
def _get_language_ocr(language: str) -> PaddleOCR:
    """Return a cached OCR instance for a non-default language.

    Instances are reused across requests instead of being rebuilt on every
    call. If construction raises, nothing is cached and the next request
    tries again.
    """
    logger.info("Creating OCR instance for language: %s", language)
    return _create_ocr(language)


def get_ocr(language: Optional[str] = None) -> PaddleOCR:
    """Get or initialize an OCR instance.

    Args:
        language: Optional language code. When omitted, empty, or equal to
            OCR_LANGUAGE, the shared default instance is returned.

    Returns:
        The PaddleOCR instance to use for the requested language.
    """
    global ocr_instance

    if language and language != OCR_LANGUAGE:
        return _get_language_ocr(language)

    if ocr_instance is None:
        logger.info(
            "Initializing PaddleOCR with language=%s, use_gpu=%s",
            OCR_LANGUAGE,
            USE_GPU,
        )
        ocr_instance = _create_ocr(OCR_LANGUAGE)
        logger.info("PaddleOCR initialized successfully")
    return ocr_instance


def _image_bytes_to_array(image_data: bytes) -> np.ndarray:
    """Decode encoded image bytes into an RGB numpy array."""
    # The context manager releases the image once the pixels are copied out.
    with Image.open(io.BytesIO(image_data)) as image:
        # Convert to RGB if necessary
        rgb_image = image if image.mode == 'RGB' else image.convert('RGB')
        return np.array(rgb_image)


def decode_base64_image(base64_string: str) -> np.ndarray:
    """Decode base64 string to numpy array

    Args:
        base64_string: Base64 image data, optionally preceded by a data URL
            prefix (everything up to and including the first comma).

    Returns:
        The decoded image as an RGB numpy array.
    """
    # Remove data URL prefix if present
    if ',' in base64_string:
        base64_string = base64_string.split(',', 1)[1]

    return _image_bytes_to_array(base64.b64decode(base64_string))


def process_ocr_result(result: Any) -> List[OCRResult]:
    """Process PaddleOCR result into structured format

    Args:
        result: Value returned by ``PaddleOCR.ocr``; only the first element
            (the first image's detections) is read.

    Returns:
        One OCRResult per detected line; empty when nothing was detected.
    """
    if result is None or len(result) == 0:
        return []

    # PaddleOCR returns list of results per image
    # Each result is a list of [box, (text, confidence)]
    lines = result[0] or []

    results = []
    for line in lines:
        if line is None:
            continue

        box, text_info = line[0], line[1]
        results.append(OCRResult(
            text=text_info[0],
            confidence=float(text_info[1]),
            box=[[float(point[0]), float(point[1])] for point in box]
        ))
    return results


def _run_ocr(image: np.ndarray, language: Optional[str]) -> List[OCRResult]:
    """Run OCR on ``image`` with the instance for ``language``."""
    raw_result = get_ocr(language).ocr(image, cls=True)
    return process_ocr_result(raw_result)


@app.on_event("startup")
async def startup_event():
    """Pre-warm the OCR model on startup"""
    logger.info("Pre-warming OCR model...")
    try:
        ocr = get_ocr()
        # Create a small white test image to warm up the model
        test_image = np.full((100, 100, 3), 255, dtype=np.uint8)
        ocr.ocr(test_image, cls=True)
        logger.info("OCR model pre-warmed successfully")
    except Exception as e:
        # Best effort: the server still starts; initialization is retried
        # the next time get_ocr() is called.
        logger.error("Failed to pre-warm OCR model: %s", e)


@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Health check endpoint

    Raises:
        HTTPException: 503 when the OCR instance cannot be initialized.
    """
    try:
        # Ensure OCR is initialized
        get_ocr()
    except Exception as e:
        logger.error("Health check failed: %s", e)
        raise HTTPException(status_code=503, detail=str(e))

    return HealthResponse(
        status="healthy",
        model="PP-OCRv4",
        language=OCR_LANGUAGE,
        gpu_enabled=USE_GPU
    )


# NOTE(review): both OCR endpoints call the blocking ``ocr()`` directly from
# an ``async def`` handler, so other requests wait while one image is
# processed. Moving the call to a worker thread would need confirmation that
# PaddleOCR instances can be used safely from multiple threads.
@app.post("/ocr", response_model=OCRResponse)
async def ocr_base64(request: OCRRequest):
    """
    Perform OCR on a base64-encoded image

    Args:
        request: OCRRequest with base64 image and optional language

    Returns:
        OCRResponse with detected text, confidence scores, and bounding boxes.
        On failure, ``success`` is False and ``error`` holds the message.
    """
    try:
        image = decode_base64_image(request.image)
        results = _run_ocr(image, request.language)
    except Exception as e:
        # logger.exception keeps the traceback for debugging.
        logger.exception("OCR processing failed: %s", e)
        return OCRResponse(success=False, results=[], error=str(e))

    return OCRResponse(success=True, results=results)


@app.post("/ocr/upload", response_model=OCRResponse)
async def ocr_upload(
    img: UploadFile = File(...),
    language: Optional[str] = Form(None)
):
    """
    Perform OCR on an uploaded image file

    Args:
        img: Uploaded image file
        language: Optional language code (default: env OCR_LANGUAGE)

    Returns:
        OCRResponse with detected text, confidence scores, and bounding boxes.
        On failure, ``success`` is False and ``error`` holds the message.
    """
    try:
        contents = await img.read()
        image_array = _image_bytes_to_array(contents)
        results = _run_ocr(image_array, language)
    except Exception as e:
        # logger.exception keeps the traceback for debugging.
        logger.exception("OCR processing failed: %s", e)
        return OCRResponse(success=False, results=[], error=str(e))

    return OCRResponse(success=True, results=results)


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=5000)