feat(paddleocr-vl): add PaddleOCR-VL full pipeline Docker image and API server, plus integration tests and docker helpers
This commit is contained in:
@@ -136,27 +136,82 @@ def load_model():
|
||||
logger.info("PaddleOCR-VL model loaded successfully")
|
||||
|
||||
|
||||
def decode_image(image_source: str) -> Image.Image:
|
||||
"""Decode image from URL or base64"""
|
||||
def optimize_image_resolution(image: Image.Image, max_size: int = 2048, min_size: int = 1080) -> Image.Image:
    """
    Optimize image resolution for PaddleOCR-VL.

    Best results are achieved with images in the 1080p-2K range.
    - Images whose longest side exceeds max_size are scaled down so that
      the longest side becomes max_size
    - Images whose longest side is below min_size are scaled up so that
      the longest side becomes min_size
    - All other images are returned unchanged

    The aspect ratio is preserved; both sides are multiplied by the same
    factor and resampled with LANCZOS.

    Args:
        image: Image to inspect; only ``image.size`` and ``image.resize``
            are used.
        max_size: Upper bound, in pixels, for the longest side.
        min_size: Lower bound, in pixels, for the longest side.

    Returns:
        The resized image, or the original object when no scaling applies.
    """
    width, height = image.size
    max_dim = max(width, height)

    # An empty image has nothing to scale, and would otherwise divide by
    # zero or request a zero-sized resize below.
    if min(width, height) <= 0:
        return image

    if max_dim > max_size:
        # Scale down if too large (4K+ images often miss text)
        scale = max_size / max_dim
        direction = "down"
    elif max_dim < min_size:
        # Scale up if too small
        scale = min_size / max_dim
        direction = "up"
    else:
        logger.info(f"Image size {width}x{height} is optimal, no scaling needed")
        return image

    # Round instead of truncating so the longest side lands exactly on the
    # target, and clamp to 1 px so extreme aspect ratios (e.g. 10000x1)
    # never produce a zero dimension, which resize() rejects.
    new_width = max(1, round(width * scale))
    new_height = max(1, round(height * scale))
    logger.info(f"Scaling {direction} image from {width}x{height} to {new_width}x{new_height}")
    return image.resize((new_width, new_height), Image.Resampling.LANCZOS)
|
||||
|
||||
|
||||
def decode_image(image_source: str, optimize: bool = True) -> Image.Image:
    """
    Decode image from various sources.

    Supported formats:
    - Base64 data URL: data:image/png;base64,... or data:image/jpeg;base64,...
    - HTTP/HTTPS URL: https://example.com/image.png
    - Raw base64 string
    - Local file path

    Supported image types: PNG, JPEG, WebP, BMP, GIF, TIFF

    Args:
        image_source: Data URL, http(s) URL, raw base64 payload, or file path.
        optimize: When True (default), the decoded image is passed through
            optimize_image_resolution() before being returned.

    Returns:
        The decoded image converted to RGB mode.

    Raises:
        ValueError: If a ``data:`` source contains no comma separating the
            header from the payload.
        Exception: Errors from base64 decoding, the HTTP request
            (including non-2xx responses via raise_for_status) or
            Image.open propagate to the caller, except in the raw-base64
            attempt, where any failure falls back to treating the source
            as a file path.
    """
    # NOTE(review): if image_source comes from untrusted clients, the URL
    # branch can reach internal hosts and the file-path fallback can read
    # arbitrary local files -- confirm this is acceptable for the deployment.
    image = None

    if image_source.startswith("data:"):
        # Base64 encoded image with MIME type header
        # Supports: data:image/png;base64,... data:image/jpeg;base64,... etc.
        header, data = image_source.split(",", 1)
        image_data = base64.b64decode(data)
        image = Image.open(io.BytesIO(image_data)).convert("RGB")
        logger.debug(f"Decoded base64 image with header: {header}")
    elif image_source.startswith(("http://", "https://")):
        # URL - fetch image
        import httpx

        response = httpx.get(image_source, timeout=30.0)
        response.raise_for_status()
        image = Image.open(io.BytesIO(response.content)).convert("RGB")
        logger.debug(f"Fetched image from URL: {image_source[:50]}...")
    else:
        # Assume it's a file path or raw base64. Base64 is tried first; a
        # path normally fails either in b64decode or when Image.open cannot
        # identify the decoded bytes.
        try:
            image_data = base64.b64decode(image_source)
            image = Image.open(io.BytesIO(image_data)).convert("RGB")
            logger.debug("Decoded raw base64 image")
        except Exception:
            # Deliberate best-effort fallback: try as file path. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            image = Image.open(image_source).convert("RGB")
            logger.debug(f"Loaded image from file: {image_source}")

    # Optimize resolution for best OCR results
    if optimize:
        image = optimize_image_resolution(image)

    return image
|
||||
|
||||
|
||||
def extract_image_and_text(content: Union[str, List[ContentItem]]) -> tuple:
|
||||
@@ -242,6 +297,45 @@ async def health_check():
|
||||
)
|
||||
|
||||
|
||||
@app.get("/formats")
async def supported_formats():
    """Return a static description of accepted image formats and input methods.

    The payload has four top-level sections: ``image_formats``,
    ``input_methods``, ``resolution`` and ``task_prompts`` (the module-level
    TASK_PROMPTS object, returned as-is).
    """
    format_info = {
        "supported": ["PNG", "JPEG", "WebP", "BMP", "GIF", "TIFF"],
        "recommended": ["PNG", "JPEG"],
        "mime_types": [
            "image/png",
            "image/jpeg",
            "image/webp",
            "image/bmp",
            "image/gif",
            "image/tiff",
        ],
    }

    # One entry per way a caller may hand an image to the API.
    method_info = {
        "base64_data_url": {
            "description": "Base64 encoded image with MIME type header",
            "example": "...",
        },
        "http_url": {
            "description": "Direct HTTP/HTTPS URL to image",
            "example": "https://example.com/image.png",
        },
        "raw_base64": {
            "description": "Raw base64 string without header",
            "example": "iVBORw0KGgo...",
        },
    }

    resolution_info = {
        "optimal_range": "1080p to 2K (1080-2048 pixels on longest side)",
        "auto_scaling": True,
        "note": "Images are automatically scaled to optimal range. 4K+ images are scaled down for better accuracy.",
    }

    payload = {
        "image_formats": format_info,
        "input_methods": method_info,
        "resolution": resolution_info,
        "task_prompts": TASK_PROMPTS,
    }
    return payload
|
||||
|
||||
|
||||
@app.get("/v1/models")
|
||||
async def list_models():
|
||||
"""List available models (OpenAI-compatible)"""
|
||||
|
||||
Reference in New Issue
Block a user