feat(paddleocr-vl): add PaddleOCR-VL full pipeline Docker image and API server, plus integration tests and docker helpers

2026-01-17 20:22:23 +00:00
parent addae20cbd
commit 80e6866442
12 changed files with 2414 additions and 21 deletions
--- a/image_support_files/paddleocr_vl_server.py
+++ b/image_support_files/paddleocr_vl_server.py
@@ -136,27 +136,82 @@ def load_model():
    logger.info("PaddleOCR-VL model loaded successfully")


-def decode_image(image_source: str) -> Image.Image:
-    """Decode image from URL or base64"""
+def optimize_image_resolution(image: Image.Image, max_size: int = 2048, min_size: int = 1080) -> Image.Image:
+    """
+    Optimize image resolution for PaddleOCR-VL.
+
+    Best results are achieved with images in the 1080p-2K range.
+    - Images whose longest side exceeds max_size are scaled down to fit it
+    - Images whose longest side is below min_size are scaled up to reach it
+    - Anything in between is returned unchanged
+
+    The aspect ratio is preserved; each side is kept at 1 pixel or more.
+
+    Args:
+        image: Source PIL image.
+        max_size: Upper bound, in pixels, for the longest side.
+        min_size: Lower bound, in pixels, for the longest side.
+
+    Returns:
+        The resized image, or the input object itself when no scaling applies.
+    """
+    width, height = image.size
+    max_dim = max(width, height)
+
+    # A 0-sized image cannot be rescaled (the ratio below would divide by zero)
+    if max_dim <= 0:
+        return image
+
+    # Scale down if too large (4K+ images often miss text)
+    if max_dim > max_size:
+        scale = max_size / max_dim
+        direction = "down"
+    # Scale up if too small (longest side below min_size implies both sides are)
+    elif max_dim < min_size:
+        scale = min_size / max_dim
+        direction = "up"
+    else:
+        logger.info(f"Image size {width}x{height} is optimal, no scaling needed")
+        return image
+
+    # int() truncates, so a very thin image (e.g. 10000x1) would otherwise get
+    # a 0-pixel side; clamp each side to at least one pixel.
+    new_width = max(1, int(width * scale))
+    new_height = max(1, int(height * scale))
+    logger.info(f"Scaling {direction} image from {width}x{height} to {new_width}x{new_height}")
+    return image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+
+
+def decode_image(image_source: str, optimize: bool = True) -> Image.Image:
+    """
+    Decode image from various sources.
+
+    Supported formats:
+    - Base64 data URL: data:image/png;base64,... or data:image/jpeg;base64,...
+    - HTTP/HTTPS URL: https://example.com/image.png
+    - Raw base64 string
+    - Local file path
+
+    Supported image types: PNG, JPEG, WebP, BMP, GIF, TIFF
+
+    Args:
+        image_source: Data URL, HTTP(S) URL, raw base64 string or file path.
+        optimize: When True, the decoded image is passed through
+            optimize_image_resolution() before being returned.
+
+    Returns:
+        The decoded image converted to RGB.
+
+    Raises:
+        ValueError: If a "data:" source contains no comma separator.
+        httpx.HTTPStatusError: If the URL responds with an error status.
+        OSError: If the final file-path fallback cannot open the source.
+    """
+    # NOTE(review): the URL and file-path branches fetch/read whatever the
+    # caller names. If image_source can come from untrusted API clients this
+    # is an SSRF / local-file-read surface - confirm against the callers.
+    image = None
+
    if image_source.startswith("data:"):
-        # Base64 encoded image
+        # Base64 encoded image with MIME type header
+        # Supports: data:image/png;base64,... data:image/jpeg;base64,... etc.
        header, data = image_source.split(",", 1)
        image_data = base64.b64decode(data)
-        return Image.open(io.BytesIO(image_data)).convert("RGB")
+        image = Image.open(io.BytesIO(image_data)).convert("RGB")
+        logger.debug(f"Decoded base64 image with header: {header}")
    elif image_source.startswith("http://") or image_source.startswith("https://"):
        # URL - fetch image
        import httpx
        response = httpx.get(image_source, timeout=30.0)
        response.raise_for_status()
-        return Image.open(io.BytesIO(response.content)).convert("RGB")
+        image = Image.open(io.BytesIO(response.content)).convert("RGB")
+        logger.debug(f"Fetched image from URL: {image_source[:50]}...")
    else:
        # Assume it's a file path or raw base64
        try:
            image_data = base64.b64decode(image_source)
-            return Image.open(io.BytesIO(image_data)).convert("RGB")
+            image = Image.open(io.BytesIO(image_data)).convert("RGB")
+            logger.debug("Decoded raw base64 image")
-        except:
+        except Exception:
+            # Not decodable as base64 image data. Catch Exception rather than
+            # using a bare except so KeyboardInterrupt/SystemExit propagate.
            # Try as file path
-            return Image.open(image_source).convert("RGB")
+            image = Image.open(image_source).convert("RGB")
+            logger.debug(f"Loaded image from file: {image_source}")
+
+    # Optimize resolution for best OCR results
+    if optimize:
+        image = optimize_image_resolution(image)
+
+    return image


 def extract_image_and_text(content: Union[str, List[ContentItem]]) -> tuple:
@@ -242,6 +297,45 @@ async def health_check():
    )


+@app.get("/formats")
+async def supported_formats():
+    """Describe the accepted image formats, input methods and resolution handling."""
+    # Image container formats and their MIME types
+    image_formats = {
+        "supported": ["PNG", "JPEG", "WebP", "BMP", "GIF", "TIFF"],
+        "recommended": ["PNG", "JPEG"],
+        "mime_types": [
+            "image/png",
+            "image/jpeg",
+            "image/webp",
+            "image/bmp",
+            "image/gif",
+            "image/tiff",
+        ],
+    }
+
+    # Ways a client can hand an image to the server
+    input_methods = {
+        "base64_data_url": {
+            "description": "Base64 encoded image with MIME type header",
+            "example": "data:image/png;base64,iVBORw0KGgo...",
+        },
+        "http_url": {
+            "description": "Direct HTTP/HTTPS URL to image",
+            "example": "https://example.com/image.png",
+        },
+        "raw_base64": {
+            "description": "Raw base64 string without header",
+            "example": "iVBORw0KGgo...",
+        },
+    }
+
+    # Resolution guidance reported to clients
+    resolution = {
+        "optimal_range": "1080p to 2K (1080-2048 pixels on longest side)",
+        "auto_scaling": True,
+        "note": "Images are automatically scaled to optimal range. 4K+ images are scaled down for better accuracy.",
+    }
+
+    return {
+        "image_formats": image_formats,
+        "input_methods": input_methods,
+        "resolution": resolution,
+        "task_prompts": TASK_PROMPTS,
+    }
+
+
+
+
@app.get("/v1/models")
 async def list_models():
    """List available models (OpenAI-compatible)"""