feat(ocr): add PaddleOCR GPU Docker image and FastAPI OCR server with entrypoint; implement OCR endpoints and consensus extraction testing
258	image_support_files/paddleocr-server.py	Normal file
@@ -0,0 +1,258 @@
#!/usr/bin/env python3
"""
PaddleOCR FastAPI Server
Provides a REST API for OCR operations using PaddleOCR
"""

import os
import io
import base64
import logging
from typing import Optional, List, Any

from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import numpy as np
from PIL import Image
from paddleocr import PaddleOCR

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Environment configuration
OCR_LANGUAGE = os.environ.get('OCR_LANGUAGE', 'en')
USE_GPU = os.environ.get('CUDA_VISIBLE_DEVICES', '') != '-1'

# Initialize FastAPI app
app = FastAPI(
    title="PaddleOCR Server",
    description="REST API for OCR operations using PaddleOCR PP-OCRv4",
    version="1.0.0"
)

# Global OCR instance
ocr_instance: Optional[PaddleOCR] = None


class OCRRequest(BaseModel):
    """Request model for base64 image OCR"""
    image: str
    language: Optional[str] = None


class BoundingBox(BaseModel):
    """Bounding box for detected text"""
    points: List[List[float]]


class OCRResult(BaseModel):
    """Single OCR detection result"""
    text: str
    confidence: float
    box: List[List[float]]


class OCRResponse(BaseModel):
    """OCR response model"""
    success: bool
    results: List[OCRResult]
    error: Optional[str] = None


class HealthResponse(BaseModel):
    """Health check response"""
    status: str
    model: str
    language: str
    gpu_enabled: bool


def get_ocr() -> PaddleOCR:
    """Get or initialize the OCR instance"""
    global ocr_instance
    if ocr_instance is None:
        logger.info(f"Initializing PaddleOCR with language={OCR_LANGUAGE}, use_gpu={USE_GPU}")
        ocr_instance = PaddleOCR(
            use_angle_cls=True,
            lang=OCR_LANGUAGE,
            use_gpu=USE_GPU,
            show_log=False
        )
        logger.info("PaddleOCR initialized successfully")
    return ocr_instance


def decode_base64_image(base64_string: str) -> np.ndarray:
    """Decode base64 string to numpy array"""
    # Remove data URL prefix if present
    if ',' in base64_string:
        base64_string = base64_string.split(',')[1]

    image_data = base64.b64decode(base64_string)
    image = Image.open(io.BytesIO(image_data))

    # Convert to RGB if necessary
    if image.mode != 'RGB':
        image = image.convert('RGB')

    return np.array(image)


def process_ocr_result(result: Any) -> List[OCRResult]:
    """Process PaddleOCR result into structured format"""
    results = []

    if result is None or len(result) == 0:
        return results

    # PaddleOCR returns a list of results per image
    # Each result is a list of [box, (text, confidence)]
    for line in result[0] if result[0] else []:
        if line is None:
            continue

        box = line[0]  # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
        text_info = line[1]  # (text, confidence)

        results.append(OCRResult(
            text=text_info[0],
            confidence=float(text_info[1]),
            box=[[float(p[0]), float(p[1])] for p in box]
        ))

    return results


@app.on_event("startup")
async def startup_event():
    """Pre-warm the OCR model on startup"""
    logger.info("Pre-warming OCR model...")
    try:
        ocr = get_ocr()
        # Create a small test image to warm up the model
        test_image = np.zeros((100, 100, 3), dtype=np.uint8)
        test_image.fill(255)  # White image
        ocr.ocr(test_image, cls=True)
        logger.info("OCR model pre-warmed successfully")
    except Exception as e:
        logger.error(f"Failed to pre-warm OCR model: {e}")


@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Health check endpoint"""
    try:
        # Ensure OCR is initialized
        get_ocr()
        return HealthResponse(
            status="healthy",
            model="PP-OCRv4",
            language=OCR_LANGUAGE,
            gpu_enabled=USE_GPU
        )
    except Exception as e:
        logger.error(f"Health check failed: {e}")
        raise HTTPException(status_code=503, detail=str(e))


@app.post("/ocr", response_model=OCRResponse)
async def ocr_base64(request: OCRRequest):
    """
    Perform OCR on a base64-encoded image

    Args:
        request: OCRRequest with base64 image and optional language

    Returns:
        OCRResponse with detected text, confidence scores, and bounding boxes
    """
    try:
        # Decode image
        image = decode_base64_image(request.image)

        # Get OCR instance (use request language if provided)
        ocr = get_ocr()

        # If a different language is requested, create a new instance
        if request.language and request.language != OCR_LANGUAGE:
            logger.info(f"Creating OCR instance for language: {request.language}")
            temp_ocr = PaddleOCR(
                use_angle_cls=True,
                lang=request.language,
                use_gpu=USE_GPU,
                show_log=False
            )
            result = temp_ocr.ocr(image, cls=True)
        else:
            result = ocr.ocr(image, cls=True)

        # Process results
        results = process_ocr_result(result)

        return OCRResponse(success=True, results=results)

    except Exception as e:
        logger.error(f"OCR processing failed: {e}")
        return OCRResponse(success=False, results=[], error=str(e))


@app.post("/ocr/upload", response_model=OCRResponse)
async def ocr_upload(
    img: UploadFile = File(...),
    language: Optional[str] = Form(None)
):
    """
    Perform OCR on an uploaded image file

    Args:
        img: Uploaded image file
        language: Optional language code (default: env OCR_LANGUAGE)

    Returns:
        OCRResponse with detected text, confidence scores, and bounding boxes
    """
    try:
        # Read image
        contents = await img.read()
        image = Image.open(io.BytesIO(contents))

        # Convert to RGB if necessary
        if image.mode != 'RGB':
            image = image.convert('RGB')

        image_array = np.array(image)

        # Get OCR instance
        ocr = get_ocr()

        # If a different language is requested, create a new instance
        if language and language != OCR_LANGUAGE:
            logger.info(f"Creating OCR instance for language: {language}")
            temp_ocr = PaddleOCR(
                use_angle_cls=True,
                lang=language,
                use_gpu=USE_GPU,
                show_log=False
            )
            result = temp_ocr.ocr(image_array, cls=True)
        else:
            result = ocr.ocr(image_array, cls=True)

        # Process results
        results = process_ocr_result(result)

        return OCRResponse(success=True, results=results)

    except Exception as e:
        logger.error(f"OCR processing failed: {e}")
        return OCRResponse(success=False, results=[], error=str(e))


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=5000)
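For reference, a minimal sketch of how a client might exercise the new /ocr endpoint once the container is running. The host/port, the sample.png file name, and the use of the requests library are illustrative assumptions and not part of this commit:

#!/usr/bin/env python3
# Hypothetical client for the /ocr endpoint added in this commit.
# Assumes the server is reachable at http://localhost:5000 and that a
# local sample.png exists; adjust both as needed.
import base64
import requests

with open("sample.png", "rb") as f:
    payload = {"image": base64.b64encode(f.read()).decode("ascii")}

resp = requests.post("http://localhost:5000/ocr", json=payload, timeout=120)
resp.raise_for_status()
data = resp.json()

if data["success"]:
    # Print each detected line with its confidence score
    for item in data["results"]:
        print(f"{item['confidence']:.2f}  {item['text']}")
else:
    print("OCR failed:", data["error"])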