#!/usr/bin/env python3
"""
PaddleOCR FastAPI Server
Provides REST API for OCR operations using PaddleOCR
"""

import os
import io
import base64
import logging
from collections import OrderedDict
from typing import Optional, List, Any

from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import numpy as np
from PIL import Image
from paddleocr import PaddleOCR

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Environment configuration
OCR_LANGUAGE = os.environ.get('OCR_LANGUAGE', 'en')
# GPU is controlled via CUDA_VISIBLE_DEVICES environment variable:
# it is treated as enabled unless that variable is exactly '-1'.
USE_GPU = os.environ.get('CUDA_VISIBLE_DEVICES', '') != '-1'

# Upper bound on the number of non-default-language OCR instances kept alive.
# Each instance is built once and reused; the least recently used one is
# dropped when the bound is exceeded so memory cannot grow without limit.
MAX_CACHED_LANGUAGES = 4

# Initialize FastAPI app
app = FastAPI(
    title="PaddleOCR Server",
    description="REST API for OCR operations using PaddleOCR PP-OCRv4",
    version="1.0.0"
)

# Global OCR instance for the default language (OCR_LANGUAGE)
ocr_instance: Optional[PaddleOCR] = None

# LRU cache of OCR instances for non-default languages, keyed by language code
_ocr_by_language: "OrderedDict[str, PaddleOCR]" = OrderedDict()


class OCRRequest(BaseModel):
    """Request model for base64 image OCR"""
    image: str
    language: Optional[str] = None


class BoundingBox(BaseModel):
    """Bounding box for detected text"""
    points: List[List[float]]


class OCRResult(BaseModel):
    """Single OCR detection result"""
    text: str
    confidence: float
    box: List[List[float]]


class OCRResponse(BaseModel):
    """OCR response model"""
    success: bool
    results: List[OCRResult]
    error: Optional[str] = None


class HealthResponse(BaseModel):
    """Health check response"""
    status: str
    model: str
    language: str
    gpu_enabled: bool


def get_ocr(lang: Optional[str] = None) -> PaddleOCR:
    """Get or initialize the OCR instance for a language.

    Args:
        lang: Language code. ``None`` or an empty string selects the
            default ``OCR_LANGUAGE``.

    Returns:
        A PaddleOCR instance for the requested language. The default
        language instance is cached in the module-level ``ocr_instance``;
        other languages are cached in a small LRU so repeated requests do
        not construct a new PaddleOCR object every time.

    Raises:
        Whatever the PaddleOCR constructor raises; a failed construction
        is never cached.
    """
    global ocr_instance

    use_lang = lang or OCR_LANGUAGE
    is_default = use_lang == OCR_LANGUAGE

    # Return a cached instance when one already exists for this language
    if is_default:
        if ocr_instance is not None:
            return ocr_instance
    else:
        cached = _ocr_by_language.get(use_lang)
        if cached is not None:
            # Mark as most recently used
            _ocr_by_language.move_to_end(use_lang)
            return cached

    logger.info(
        "Initializing PaddleOCR with language=%s, use_gpu=%s",
        use_lang, USE_GPU
    )
    new_ocr = PaddleOCR(
        use_angle_cls=True,
        lang=use_lang,
        use_gpu=USE_GPU,
        show_log=False
    )

    if is_default:
        ocr_instance = new_ocr
    else:
        _ocr_by_language[use_lang] = new_ocr
        # Evict least recently used instances beyond the bound
        while len(_ocr_by_language) > MAX_CACHED_LANGUAGES:
            _ocr_by_language.popitem(last=False)

    logger.info("PaddleOCR initialized successfully")
    return new_ocr


def _image_bytes_to_array(image_data: bytes) -> np.ndarray:
    """Decode encoded image bytes into an RGB numpy array."""
    # The context manager closes the opened image once the pixel data
    # has been copied into the array.
    with Image.open(io.BytesIO(image_data)) as image:
        # Convert to RGB if necessary
        if image.mode != 'RGB':
            image = image.convert('RGB')
        return np.array(image)


def decode_base64_image(base64_string: str) -> np.ndarray:
    """Decode base64 string to numpy array.

    Args:
        base64_string: Base64-encoded image, optionally prefixed with a
            data URL header (everything up to the first comma is dropped).

    Returns:
        The image as an RGB numpy array.

    Raises:
        Exceptions from ``base64.b64decode`` or ``PIL.Image.open`` when
        the payload is not valid base64 or not a readable image.
    """
    # Remove data URL prefix if present; split only once so the payload
    # is everything after the first comma.
    if ',' in base64_string:
        base64_string = base64_string.split(',', 1)[1]

    return _image_bytes_to_array(base64.b64decode(base64_string))


def process_ocr_result(result: Any) -> List[OCRResult]:
    """Process PaddleOCR result into structured format.

    Args:
        result: Value returned by ``PaddleOCR.ocr``. Only the first
            element (the first image's detections) is read.

    Returns:
        One ``OCRResult`` per non-``None`` detection; an empty list when
        there is nothing to report.
    """
    if result is None or len(result) == 0:
        return []

    # PaddleOCR returns list of results per image
    # Each result is a list of [box, (text, confidence)]
    detections = result[0] or []

    return [
        OCRResult(
            text=line[1][0],
            confidence=float(line[1][1]),
            # line[0] is [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            box=[[float(p[0]), float(p[1])] for p in line[0]]
        )
        for line in detections
        if line is not None
    ]


def _run_ocr(image: np.ndarray, language: Optional[str]) -> List[OCRResult]:
    """Run OCR on an image array with the requested (or default) language."""
    # get_ocr maps None, '' and OCR_LANGUAGE to the default instance
    ocr = get_ocr(language)
    return process_ocr_result(ocr.ocr(image, cls=True))


@app.on_event("startup")
async def startup_event():
    """Pre-warm the OCR model on startup"""
    logger.info("Pre-warming OCR model...")
    try:
        ocr = get_ocr()
        # Create a small white test image to warm up the model
        test_image = np.full((100, 100, 3), 255, dtype=np.uint8)
        ocr.ocr(test_image, cls=True)
        logger.info("OCR model pre-warmed successfully")
    except Exception as e:
        # Best effort: a failed warm-up is logged but does not stop startup
        logger.exception("Failed to pre-warm OCR model: %s", e)


@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Health check endpoint.

    Returns:
        HealthResponse describing the configured model and language.

    Raises:
        HTTPException: 503 when the OCR instance cannot be obtained.
    """
    try:
        # Ensure OCR is initialized
        get_ocr()
        return HealthResponse(
            status="healthy",
            model="PP-OCRv4",
            language=OCR_LANGUAGE,
            gpu_enabled=USE_GPU
        )
    except Exception as e:
        logger.exception("Health check failed: %s", e)
        raise HTTPException(status_code=503, detail=str(e))


@app.post("/ocr", response_model=OCRResponse)
async def ocr_base64(request: OCRRequest):
    """
    Perform OCR on a base64-encoded image

    Args:
        request: OCRRequest with base64 image and optional language

    Returns:
        OCRResponse with detected text, confidence scores, and bounding boxes.
        On failure the response has ``success=False`` and ``error`` set.
    """
    try:
        # Decode image
        image = decode_base64_image(request.image)

        # Run OCR (uses request language if provided)
        results = _run_ocr(image, request.language)

        return OCRResponse(success=True, results=results)
    except Exception as e:
        logger.exception("OCR processing failed: %s", e)
        return OCRResponse(success=False, results=[], error=str(e))


@app.post("/ocr/upload", response_model=OCRResponse)
async def ocr_upload(
    img: UploadFile = File(...),
    language: Optional[str] = Form(None)
):
    """
    Perform OCR on an uploaded image file

    Args:
        img: Uploaded image file
        language: Optional language code (default: env OCR_LANGUAGE)

    Returns:
        OCRResponse with detected text, confidence scores, and bounding boxes.
        On failure the response has ``success=False`` and ``error`` set.
    """
    try:
        # Read image
        contents = await img.read()
        image_array = _image_bytes_to_array(contents)

        # Run OCR
        results = _run_ocr(image_array, language)

        return OCRResponse(success=True, results=results)
    except Exception as e:
        logger.exception("OCR processing failed: %s", e)
        return OCRResponse(success=False, results=[], error=str(e))


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=5000)