This commit is contained in:
2026-01-16 16:21:44 +00:00
parent 3c5cf578a5
commit 15ac1fcf67
13 changed files with 873 additions and 805 deletions

View File

@@ -1,49 +0,0 @@
# PaddleOCR GPU Variant
# OCR processing with NVIDIA GPU support using PaddlePaddle
FROM paddlepaddle/paddle:2.6.2-gpu-cuda11.7-cudnn8.4-trt8.4

LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
LABEL description="PaddleOCR PP-OCRv4 - GPU optimized"
LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"

# Runtime defaults; each can be overridden with `docker run -e ...`
ENV OCR_LANGUAGE="en"
ENV SERVER_PORT="5000"
ENV SERVER_HOST="0.0.0.0"
ENV PYTHONUNBUFFERED=1

# Set working directory
WORKDIR /app

# Install system dependencies (curl is required by the HEALTHCHECK below)
RUN apt-get update && apt-get install -y --no-install-recommends \
        libgl1-mesa-glx \
        libglib2.0-0 \
        curl \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies (using stable paddleocr 2.x).
# The extras specifier is quoted so the shell never treats "[standard]" as a glob.
RUN pip install --no-cache-dir \
        paddleocr==2.8.1 \
        fastapi \
        "uvicorn[standard]" \
        python-multipart \
        opencv-python-headless \
        pillow

# Copy server files
COPY image_support_files/paddleocr_server.py /app/paddleocr_server.py
COPY image_support_files/paddleocr-entrypoint.sh /usr/local/bin/paddleocr-entrypoint.sh
RUN chmod +x /usr/local/bin/paddleocr-entrypoint.sh

# Note: OCR models will be downloaded on first run
# This ensures compatibility across different GPU architectures

# Expose API port (documents the default; SERVER_PORT may move it at runtime)
EXPOSE 5000

# Health check: shell-form CMD is expanded inside the container, so the probe
# follows SERVER_PORT when it is overridden instead of always hitting 5000.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f "http://localhost:${SERVER_PORT:-5000}/health" || exit 1

ENTRYPOINT ["/usr/local/bin/paddleocr-entrypoint.sh"]

View File

@@ -1,53 +0,0 @@
# PaddleOCR CPU Variant
# OCR processing optimized for CPU-only inference
FROM python:3.10-slim-bookworm

LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
LABEL description="PaddleOCR PP-OCRv4 - CPU optimized"
LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"

# Environment configuration for CPU-only mode
ENV OCR_LANGUAGE="en"
ENV SERVER_PORT="5000"
ENV SERVER_HOST="0.0.0.0"
ENV PYTHONUNBUFFERED=1
# Disable GPU usage for CPU-only variant
ENV CUDA_VISIBLE_DEVICES="-1"

# Set working directory
WORKDIR /app

# Install system dependencies.
# libgl1 replaces libgl1-mesa-glx: the latter was only a transitional package
# and has no installation candidate on Debian bookworm, which breaks the build.
# curl is required by the HEALTHCHECK below.
RUN apt-get update && apt-get install -y --no-install-recommends \
        libgl1 \
        libglib2.0-0 \
        libgomp1 \
        curl \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies (CPU version of PaddlePaddle - using stable 2.x versions).
# The extras specifier is quoted so the shell never treats "[standard]" as a glob.
RUN pip install --no-cache-dir \
        paddlepaddle==2.6.2 \
        paddleocr==2.8.1 \
        fastapi \
        "uvicorn[standard]" \
        python-multipart \
        opencv-python-headless \
        pillow

# Copy server files
COPY image_support_files/paddleocr_server.py /app/paddleocr_server.py
COPY image_support_files/paddleocr-entrypoint.sh /usr/local/bin/paddleocr-entrypoint.sh
RUN chmod +x /usr/local/bin/paddleocr-entrypoint.sh

# Note: OCR models will be downloaded on first run
# This avoids build-time segfaults with certain CPU architectures

# Expose API port (documents the default; SERVER_PORT may move it at runtime)
EXPOSE 5000

# Health check (longer start-period for CPU variant); the probe follows
# SERVER_PORT because shell-form CMD is expanded inside the container.
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
    CMD curl -f "http://localhost:${SERVER_PORT:-5000}/health" || exit 1

ENTRYPOINT ["/usr/local/bin/paddleocr-entrypoint.sh"]

72
Dockerfile_paddleocr_vl Normal file
View File

@@ -0,0 +1,72 @@
# PaddleOCR-VL GPU Variant
# Vision-Language Model for document parsing using vLLM
FROM nvidia/cuda:12.4.0-devel-ubuntu22.04

LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
LABEL description="PaddleOCR-VL 0.9B - Vision-Language Model for document parsing"
LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"

# Build-time only: keeps apt non-interactive during RUN steps without leaving
# DEBIAN_FRONTEND set in the environment of the running container.
ARG DEBIAN_FRONTEND=noninteractive

# Runtime environment configuration
ENV PYTHONUNBUFFERED=1
ENV HF_HOME=/root/.cache/huggingface
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn

# Set working directory
WORKDIR /app

# Install system dependencies (curl is required by the HEALTHCHECK below)
# and make python / python3 resolve to Python 3.11.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3.11 \
        python3.11-venv \
        python3.11-dev \
        python3-pip \
        git \
        curl \
        build-essential \
    && rm -rf /var/lib/apt/lists/* \
    && update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1

# Create the virtual environment and put it first on PATH so every later
# `pip` / `python` call uses it.
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Install PyTorch with CUDA support
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir \
        torch==2.5.1 \
        torchvision \
        --index-url https://download.pytorch.org/whl/cu124

# Install vLLM (nightly for PaddleOCR-VL support)
# NOTE(review): an unpinned nightly may require a different torch than the
# 2.5.1 pinned above and replace it in this layer - confirm the pin is still
# effective, and consider pinning the vLLM build for reproducibility.
RUN pip install --no-cache-dir \
        vllm \
        --pre \
        --extra-index-url https://wheels.vllm.ai/nightly \
        --extra-index-url https://download.pytorch.org/whl/cu124

# Install additional dependencies.
# The extras specifier is quoted so the shell never treats "[standard]" as a glob.
RUN pip install --no-cache-dir \
        transformers \
        accelerate \
        safetensors \
        pillow \
        fastapi \
        "uvicorn[standard]" \
        python-multipart \
        openai \
        httpx

# Copy entrypoint script
COPY image_support_files/paddleocr-vl-entrypoint.sh /usr/local/bin/paddleocr-vl-entrypoint.sh
RUN chmod +x /usr/local/bin/paddleocr-vl-entrypoint.sh

# Expose vLLM API port (documents the default; PORT may move it at runtime)
EXPOSE 8000

# Health check: the entrypoint listens on ${PORT:-8000}, so the probe uses the
# same expression (shell-form CMD is expanded inside the container).
HEALTHCHECK --interval=30s --timeout=10s --start-period=300s --retries=3 \
    CMD curl -f "http://localhost:${PORT:-8000}/health" || exit 1

ENTRYPOINT ["/usr/local/bin/paddleocr-vl-entrypoint.sh"]

View File

@@ -0,0 +1,54 @@
# PaddleOCR-VL CPU Variant
# Vision-Language Model for document parsing using transformers (slower, no GPU required)
FROM python:3.11-slim-bookworm

LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
LABEL description="PaddleOCR-VL 0.9B CPU - Vision-Language Model for document parsing"
LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"

# Environment configuration
ENV PYTHONUNBUFFERED=1
ENV HF_HOME=/root/.cache/huggingface
ENV CUDA_VISIBLE_DEVICES=""
ENV SERVER_PORT=8000
ENV SERVER_HOST=0.0.0.0

# Set working directory
WORKDIR /app

# Install system dependencies.
# libgl1 replaces libgl1-mesa-glx: the latter was only a transitional package
# and has no installation candidate on Debian bookworm, which breaks the build.
# curl is required by the HEALTHCHECK below.
RUN apt-get update && apt-get install -y --no-install-recommends \
        libgl1 \
        libglib2.0-0 \
        libgomp1 \
        curl \
        git \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies: CPU-only torch wheel first, then the serving stack.
# The extras specifier is quoted so the shell never treats "[standard]" as a glob.
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir \
        torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu && \
    pip install --no-cache-dir \
        transformers \
        accelerate \
        safetensors \
        pillow \
        fastapi \
        "uvicorn[standard]" \
        python-multipart \
        httpx

# Copy server files
COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py
COPY image_support_files/paddleocr-vl-cpu-entrypoint.sh /usr/local/bin/paddleocr-vl-cpu-entrypoint.sh
RUN chmod +x /usr/local/bin/paddleocr-vl-cpu-entrypoint.sh

# Expose API port (documents the default; SERVER_PORT may move it at runtime)
EXPOSE 8000

# Health check (longer start-period for CPU + model download); the probe
# follows SERVER_PORT because shell-form CMD is expanded inside the container.
HEALTHCHECK --interval=30s --timeout=10s --start-period=600s --retries=3 \
    CMD curl -f "http://localhost:${SERVER_PORT:-8000}/health" || exit 1

ENTRYPOINT ["/usr/local/bin/paddleocr-vl-cpu-entrypoint.sh"]

View File

@@ -29,19 +29,19 @@ docker build \
-t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v-cpu \ -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v-cpu \
. .
# Build PaddleOCR GPU variant # Build PaddleOCR-VL GPU variant (vLLM)
echo -e "${GREEN}Building PaddleOCR GPU variant...${NC}" echo -e "${GREEN}Building PaddleOCR-VL GPU variant (vLLM)...${NC}"
docker build \ docker build \
-f Dockerfile_paddleocr \ -f Dockerfile_paddleocr_vl \
-t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr \ -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl \
-t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-gpu \ -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-gpu \
. .
# Build PaddleOCR CPU variant # Build PaddleOCR-VL CPU variant
echo -e "${GREEN}Building PaddleOCR CPU variant...${NC}" echo -e "${GREEN}Building PaddleOCR-VL CPU variant...${NC}"
docker build \ docker build \
-f Dockerfile_paddleocr_cpu \ -f Dockerfile_paddleocr_vl_cpu \
-t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-cpu \ -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-cpu \
. .
echo -e "${GREEN}All images built successfully!${NC}" echo -e "${GREEN}All images built successfully!${NC}"
@@ -52,7 +52,7 @@ echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v (GPU)"
echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v-cpu (CPU)" echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v-cpu (CPU)"
echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:latest (GPU)" echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:latest (GPU)"
echo "" echo ""
echo " PaddleOCR:" echo " PaddleOCR-VL (Vision-Language Model):"
echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr (GPU)" echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl (GPU/vLLM)"
echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-gpu (GPU)" echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-gpu (GPU/vLLM)"
echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-cpu (CPU)" echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-cpu (CPU)"

View File

@@ -1,25 +0,0 @@
#!/bin/bash
set -e

# Fill in defaults for any setting the environment does not provide
: "${OCR_LANGUAGE:=en}"
: "${SERVER_PORT:=5000}"
: "${SERVER_HOST:=0.0.0.0}"

# Startup banner
echo "Starting PaddleOCR Server..."
echo " Language: ${OCR_LANGUAGE}"
echo " Host: ${SERVER_HOST}"
echo " Port: ${SERVER_PORT}"

# Report the GPU mode: only the exact value "-1" counts as CPU mode
case "${CUDA_VISIBLE_DEVICES}" in
    -1) echo " GPU: Disabled (CPU mode)" ;;
    *)  echo " GPU: Enabled" ;;
esac

# Replace this shell with uvicorn so the server receives container signals directly
exec python -m uvicorn paddleocr_server:app \
    --host "${SERVER_HOST}" \
    --port "${SERVER_PORT}" \
    --workers 1

View File

@@ -0,0 +1,19 @@
#!/bin/bash
set -e

# Prints the horizontal rule used around the banner
rule() {
    echo "==================================="
}

rule
echo "PaddleOCR-VL Server (CPU)"
rule

# Shown for information only: the Python server reads SERVER_HOST and
# SERVER_PORT from the environment itself, with the same defaults.
HOST="${SERVER_HOST:-0.0.0.0}"
PORT="${SERVER_PORT:-8000}"

echo "Host: ${HOST}"
echo "Port: ${PORT}"
echo "Device: CPU (no GPU)"
echo ""
echo "Starting PaddleOCR-VL CPU server..."
rule

# Replace this shell with the server process
exec python /app/paddleocr_vl_server.py

View File

@@ -0,0 +1,43 @@
#!/bin/bash
set -e

echo "==================================="
echo "PaddleOCR-VL Server"
echo "==================================="

# Configuration (each value may be overridden through the environment)
MODEL_NAME="${MODEL_NAME:-PaddlePaddle/PaddleOCR-VL}"
HOST="${HOST:-0.0.0.0}"
PORT="${PORT:-8000}"
MAX_BATCHED_TOKENS="${MAX_BATCHED_TOKENS:-16384}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.9}"

echo "Model: ${MODEL_NAME}"
echo "Host: ${HOST}"
echo "Port: ${PORT}"
echo "Max batched tokens: ${MAX_BATCHED_TOKENS}"
echo "GPU memory utilization: ${GPU_MEMORY_UTILIZATION}"
echo ""

# Check GPU availability
if command -v nvidia-smi &> /dev/null; then
    echo "GPU Information:"
    # The query is informational only. Under `set -e` a non-zero exit from
    # nvidia-smi would otherwise terminate the script here, before vLLM gets
    # the chance to start and report its own error.
    nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv \
        || echo "WARNING: nvidia-smi failed. GPU may not be available."
    echo ""
else
    echo "WARNING: nvidia-smi not found. GPU may not be available."
fi

echo "Starting vLLM server..."
echo "==================================="

# Start vLLM server with PaddleOCR-VL; exec replaces this shell with vLLM
exec vllm serve "${MODEL_NAME}" \
    --trust-remote-code \
    --host "${HOST}" \
    --port "${PORT}" \
    --max-num-batched-tokens "${MAX_BATCHED_TOKENS}" \
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}" \
    --no-enable-prefix-caching \
    --mm-processor-cache-gb 0 \
    --served-model-name "paddleocr-vl"

View File

@@ -1,253 +0,0 @@
#!/usr/bin/env python3
"""
PaddleOCR FastAPI Server
Provides REST API for OCR operations using PaddleOCR
"""
import os
import io
import base64
import logging
from typing import Optional, List, Any
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import numpy as np
from PIL import Image
from paddleocr import PaddleOCR
# Logging setup shared by the whole module
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Settings read from the environment
OCR_LANGUAGE = os.getenv('OCR_LANGUAGE', 'en')
# Only the exact value "-1" in CUDA_VISIBLE_DEVICES turns GPU use off;
# an unset or any other value leaves it on.
USE_GPU = os.getenv('CUDA_VISIBLE_DEVICES', '') != '-1'

# FastAPI application object that the route decorators below attach to
app = FastAPI(
    version="1.0.0",
    title="PaddleOCR Server",
    description="REST API for OCR operations using PaddleOCR PP-OCRv4",
)

# Default-language OCR engine, created on first use by get_ocr()
ocr_instance: Optional[PaddleOCR] = None
class OCRRequest(BaseModel):
    """JSON body accepted by POST /ocr."""
    # Base64 image payload; a data-URL prefix is tolerated by the decoder
    image: str
    # Per-request language override; None means "use OCR_LANGUAGE"
    language: Optional[str] = None


class BoundingBox(BaseModel):
    """Polygon around a piece of detected text."""
    # List of [x, y] corner points
    points: List[List[float]]


class OCRResult(BaseModel):
    """One recognised text line."""
    text: str
    confidence: float
    # Corner points of the detection as [x, y] pairs
    box: List[List[float]]


class OCRResponse(BaseModel):
    """Envelope returned by both OCR endpoints."""
    success: bool
    results: List[OCRResult]
    # Populated only when success is False
    error: Optional[str] = None


class HealthResponse(BaseModel):
    """Payload returned by GET /health."""
    status: str
    model: str
    language: str
    gpu_enabled: bool
# Engines built for explicitly requested languages, keyed by language code.
# Each entry holds a loaded model; the set of keys is driven by the `language`
# values clients send, so memory grows with the number of distinct languages.
_ocr_by_language: dict = {}


def get_ocr(lang: Optional[str] = None) -> PaddleOCR:
    """Return a PaddleOCR engine for ``lang``, creating it only once.

    Args:
        lang: Language code to use. ``None`` selects the default engine for
            ``OCR_LANGUAGE``, which is stored in the global ``ocr_instance``.

    Returns:
        A PaddleOCR engine. Repeated calls with the same ``lang`` return the
        same object; previously a new engine was constructed on every call
        that named a language, despite the "cached instance" intent.

    Raises:
        Whatever ``PaddleOCR(...)`` raises when construction fails; nothing is
        cached in that case.
    """
    global ocr_instance
    use_lang = lang or OCR_LANGUAGE

    # Fast paths: reuse an engine that already exists
    if lang is None:
        if ocr_instance is not None:
            return ocr_instance
    elif use_lang in _ocr_by_language:
        return _ocr_by_language[use_lang]

    logger.info(f"Initializing PaddleOCR with language={use_lang}, use_gpu={USE_GPU}")
    new_ocr = PaddleOCR(
        use_angle_cls=True,
        lang=use_lang,
        use_gpu=USE_GPU,
        show_log=False
    )

    # Remember the engine so later requests skip model construction
    if lang is None:
        ocr_instance = new_ocr
    else:
        _ocr_by_language[use_lang] = new_ocr

    logger.info("PaddleOCR initialized successfully")
    return new_ocr
def decode_base64_image(base64_string: str) -> np.ndarray:
    """Turn a base64 image string into an RGB pixel array.

    When the string contains a comma, the segment right after the first comma
    is taken as the payload (this drops a data-URL prefix).
    """
    payload = base64_string.split(',')[1] if ',' in base64_string else base64_string
    raw_bytes = base64.b64decode(payload)
    picture = Image.open(io.BytesIO(raw_bytes))
    # Anything that is not already RGB is converted
    rgb_picture = picture if picture.mode == 'RGB' else picture.convert('RGB')
    return np.array(rgb_picture)
def process_ocr_result(result: Any) -> List[OCRResult]:
    """Convert the raw value returned by ``ocr.ocr(...)`` into OCRResult objects.

    Only the first element of ``result`` is read. Each non-None entry in it is
    indexed as ``[box, (text, confidence)]``, where ``box`` is a sequence of
    points whose first two items are used as x and y.
    """
    if result is None or len(result) == 0:
        return []

    # A falsy first element is treated as "nothing detected"
    detections = result[0] if result[0] else []

    return [
        OCRResult(
            text=entry[1][0],
            confidence=float(entry[1][1]),
            box=[[float(point[0]), float(point[1])] for point in entry[0]],
        )
        for entry in detections
        if entry is not None
    ]
@app.on_event("startup")
async def startup_event():
    """Run one OCR pass over a blank image at startup to warm up the model.

    Failures are logged and swallowed, so the application still starts.
    """
    logger.info("Pre-warming OCR model...")
    try:
        # 100x100 all-white RGB image
        blank = np.full((100, 100, 3), 255, dtype=np.uint8)
        get_ocr().ocr(blank, cls=True)
    except Exception as e:
        logger.error(f"Failed to pre-warm OCR model: {e}")
    else:
        logger.info("OCR model pre-warmed successfully")
@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Report service health; responds 503 when the OCR engine cannot be obtained."""
    try:
        # Calling get_ocr() creates the default engine if it does not exist yet
        get_ocr()
    except Exception as e:
        logger.error(f"Health check failed: {e}")
        raise HTTPException(status_code=503, detail=str(e))

    return HealthResponse(
        status="healthy",
        model="PP-OCRv4",
        language=OCR_LANGUAGE,
        gpu_enabled=USE_GPU,
    )
@app.post("/ocr", response_model=OCRResponse)
async def ocr_base64(request: OCRRequest):
    """
    Run OCR on a base64-encoded image.

    Args:
        request: OCRRequest carrying the base64 image and an optional language

    Returns:
        OCRResponse with the recognised text, confidences and boxes. Any
        failure is reported as ``success=False`` with the error message rather
        than as an HTTP error.
    """
    try:
        pixels = decode_base64_image(request.image)

        # A language that differs from the default selects a dedicated engine
        wanted = request.language
        engine = get_ocr(wanted) if wanted and wanted != OCR_LANGUAGE else get_ocr()

        raw = engine.ocr(pixels, cls=True)
        return OCRResponse(success=True, results=process_ocr_result(raw))
    except Exception as e:
        logger.error(f"OCR processing failed: {e}")
        return OCRResponse(success=False, results=[], error=str(e))
@app.post("/ocr/upload", response_model=OCRResponse)
async def ocr_upload(
    img: UploadFile = File(...),
    language: Optional[str] = Form(None)
):
    """
    Run OCR on an uploaded image file.

    Args:
        img: Uploaded image file
        language: Optional language code (default: env OCR_LANGUAGE)

    Returns:
        OCRResponse with the recognised text, confidences and boxes. Any
        failure is reported as ``success=False`` with the error message rather
        than as an HTTP error.
    """
    try:
        uploaded_bytes = await img.read()
        picture = Image.open(io.BytesIO(uploaded_bytes))
        # Anything that is not already RGB is converted
        rgb_picture = picture if picture.mode == 'RGB' else picture.convert('RGB')
        pixels = np.array(rgb_picture)

        # A language that differs from the default selects a dedicated engine
        engine = get_ocr(language) if language and language != OCR_LANGUAGE else get_ocr()

        raw = engine.ocr(pixels, cls=True)
        return OCRResponse(success=True, results=process_ocr_result(raw))
    except Exception as e:
        logger.error(f"OCR processing failed: {e}")
        return OCRResponse(success=False, results=[], error=str(e))
if __name__ == "__main__":
    import uvicorn

    # Honour SERVER_HOST / SERVER_PORT when the module is started directly;
    # the fallbacks are the values that used to be hard-coded here.
    uvicorn.run(
        app,
        host=os.environ.get('SERVER_HOST', '0.0.0.0'),
        port=int(os.environ.get('SERVER_PORT', '5000')),
    )

View File

@@ -0,0 +1,371 @@
#!/usr/bin/env python3
"""
PaddleOCR-VL FastAPI Server (CPU variant)
Provides OpenAI-compatible REST API for document parsing using PaddleOCR-VL
"""
import os
import io
import base64
import logging
import time
from typing import Optional, List, Any, Dict, Union
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import torch
from PIL import Image
# Logging setup shared by the whole module
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Settings read from the environment
SERVER_HOST = os.getenv('SERVER_HOST', '0.0.0.0')
SERVER_PORT = int(os.getenv('SERVER_PORT', '8000'))
MODEL_NAME = os.getenv('MODEL_NAME', 'PaddlePaddle/PaddleOCR-VL')

# Use CUDA whenever torch reports it as available, otherwise the CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {DEVICE}")

# Maps the short task names accepted by the legacy /ocr endpoint to the
# prompt text sent to the model
TASK_PROMPTS = dict(
    ocr="OCR:",
    table="Table Recognition:",
    formula="Formula Recognition:",
    chart="Chart Recognition:",
)

# FastAPI application object that the route decorators below attach to
app = FastAPI(
    version="1.0.0",
    title="PaddleOCR-VL Server",
    description="OpenAI-compatible REST API for document parsing using PaddleOCR-VL",
)

# Populated by load_model(); both stay None until the model has been loaded
model = None
processor = None
# ---- OpenAI-style request / response schemas ----

class ImageUrl(BaseModel):
    """Image reference inside a content item."""
    # Interpreted by decode_image()
    url: str


class ContentItem(BaseModel):
    """One part of a multi-part message; ``type`` selects which field is read."""
    type: str
    text: Optional[str] = None
    image_url: Optional[ImageUrl] = None


class Message(BaseModel):
    """Chat message whose content is plain text or a list of content items."""
    role: str
    content: Union[str, List[ContentItem]]


class ChatCompletionRequest(BaseModel):
    """Body accepted by POST /v1/chat/completions."""
    model: str = "paddleocr-vl"
    messages: List[Message]
    temperature: Optional[float] = 0.0
    max_tokens: Optional[int] = 4096


class Choice(BaseModel):
    """Single completion choice."""
    index: int
    message: Message
    finish_reason: str


class Usage(BaseModel):
    """Token accounting block of a completion response."""
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int


class ChatCompletionResponse(BaseModel):
    """Body returned by POST /v1/chat/completions."""
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[Choice]
    usage: Usage


class HealthResponse(BaseModel):
    """Body returned by GET /health."""
    status: str
    model: str
    device: str
def load_model():
    """Load the processor and model named by MODEL_NAME into the module globals.

    Does nothing when ``model`` is already set. On CUDA the model is loaded in
    bfloat16 and moved to the device; on CPU it is loaded in float32 with
    ``low_cpu_mem_usage=True``. In both cases it is put into eval mode.
    """
    global model, processor

    if model is not None:
        return

    logger.info(f"Loading PaddleOCR-VL model: {MODEL_NAME}")

    # Imported here rather than at module import time
    from transformers import AutoModelForCausalLM, AutoProcessor

    processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)

    on_gpu = DEVICE == "cuda"
    load_kwargs = {
        "trust_remote_code": True,
        "torch_dtype": torch.bfloat16 if on_gpu else torch.float32,
    }
    if not on_gpu:
        # CPU mode - float32 for compatibility, reduced peak memory while loading
        load_kwargs["low_cpu_mem_usage"] = True

    loaded = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_kwargs)
    if on_gpu:
        loaded = loaded.to(DEVICE)

    # The global is assigned only once loading has fully succeeded
    model = loaded.eval()

    logger.info("PaddleOCR-VL model loaded successfully")
def decode_image(image_source: str) -> Image.Image:
    """Decode an image from a data URL, an http(s) URL, raw base64 or a file path.

    Args:
        image_source: One of
            - ``data:...,<base64>`` - everything after the first comma is decoded,
            - ``http://`` / ``https://`` URL - fetched with a 30 second timeout,
            - anything else - tried as raw base64 first, then as a file path.

    Returns:
        The image converted to RGB.

    Raises:
        ValueError: a ``data:`` source without a comma, or invalid base64 in it.
        httpx.HTTPError: the URL fetch failed or returned an error status.
        OSError: the bytes are not a readable image, or the file path cannot
            be opened.
    """
    if image_source.startswith("data:"):
        # Base64 encoded image; the media-type header before the comma is not needed
        _, data = image_source.split(",", 1)
        image_data = base64.b64decode(data)
        return Image.open(io.BytesIO(image_data)).convert("RGB")

    if image_source.startswith("http://") or image_source.startswith("https://"):
        # URL - fetch image
        # NOTE(review): the URL comes straight from the request body, so the
        # server will fetch arbitrary addresses - confirm that is acceptable
        # where this is deployed.
        import httpx
        response = httpx.get(image_source, timeout=30.0)
        response.raise_for_status()
        return Image.open(io.BytesIO(response.content)).convert("RGB")

    # Assume it's raw base64 or a file path
    try:
        image_data = base64.b64decode(image_source)
        return Image.open(io.BytesIO(image_data)).convert("RGB")
    except (ValueError, OSError):
        # Only decoding / image-format failures trigger the fallback
        # (binascii.Error is a ValueError; PIL's unidentified-image error is an
        # OSError). A bare ``except:`` here also swallowed KeyboardInterrupt
        # and unrelated errors.
        # NOTE(review): this opens any path readable by the server process on
        # behalf of the caller - confirm that is intended for untrusted input.
        return Image.open(image_source).convert("RGB")
def extract_image_and_text(content: Union[str, List[ContentItem]]) -> tuple:
    """Split message content into ``(image, text)``.

    Plain-string content yields ``(None, content)``. For a list of items every
    ``image_url`` item is decoded and every non-empty ``text`` item is read;
    when several of a kind are present the last one wins. Missing parts come
    back as ``None`` (image) and ``""`` (text).
    """
    if isinstance(content, str):
        return None, content

    picture, prompt_text = None, ""
    for part in content:
        if part.type == "image_url" and part.image_url:
            picture = decode_image(part.image_url.url)
            continue
        if part.type == "text" and part.text:
            prompt_text = part.text

    return picture, prompt_text
def generate_response(image: Image.Image, prompt: str, max_tokens: int = 4096) -> str:
    """Run PaddleOCR-VL on one image with one task prompt and return its answer.

    Args:
        image: Image to analyse.
        prompt: Task prompt placed after the image in the user message.
        max_tokens: Upper bound on newly generated tokens.

    Returns:
        The generated text only, with surrounding whitespace stripped.

    Raises:
        Whatever ``load_model()``, the processor or ``model.generate`` raise.
    """
    load_model()

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ]
        }
    ]

    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    )

    if DEVICE == "cuda":
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

    # Number of prompt tokens; generate() is expected to return these followed
    # by the new tokens (the usual behaviour for causal LMs in transformers).
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,
            use_cache=True
        )

    # Decode only what the model produced. The previous approach decoded the
    # whole sequence and split on the literal word "assistant", which either
    # left the prompt in the answer (when the template's marker is cased
    # differently) or cut off recognised text that itself contained that word.
    generated = outputs[:, prompt_length:]
    response = processor.batch_decode(generated, skip_special_tokens=True)[0]

    return response.strip()
@app.on_event("startup")
async def startup_event():
    """Try to load the model while the application starts.

    A failure is logged and swallowed so startup continues; load_model() is
    called again by the first request that needs the model.
    """
    logger.info("Pre-loading PaddleOCR-VL model...")
    try:
        load_model()
    except Exception as e:
        logger.error(f"Failed to pre-load model: {e}")
    else:
        logger.info("Model pre-loaded successfully")
@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Report model state: "healthy" once the model is loaded, "loading" otherwise."""
    current_status = "loading" if model is None else "healthy"
    return HealthResponse(status=current_status, model=MODEL_NAME, device=DEVICE)
@app.get("/v1/models")
async def list_models():
    """Return the model list in OpenAI format: always the single "paddleocr-vl" entry."""
    # "created" is the time of this call
    entry = {
        "id": "paddleocr-vl",
        "object": "model",
        "created": int(time.time()),
        "owned_by": "paddlepaddle",
    }
    return {"object": "list", "data": [entry]}
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completions(request: ChatCompletionRequest):
    """
    OpenAI-compatible chat completions endpoint for PaddleOCR-VL.

    Only the most recent message with role "user" is used; it must contain an
    image. Supported task prompts:
    - "OCR:" - Text recognition (used when the prompt is empty)
    - "Table Recognition:" - Table extraction
    - "Formula Recognition:" - Formula extraction
    - "Chart Recognition:" - Chart extraction

    Responds 400 when there is no user message or no image, and 500 for any
    other failure. The usage numbers are rough estimates, not token counts.
    """
    # NOTE(review): generate_response() runs synchronously inside this
    # coroutine, so it looks like other requests (including /health) wait
    # while inference is in progress - confirm this is acceptable.
    try:
        # Most recent user message, or None
        latest_user = next(
            (msg for msg in reversed(request.messages) if msg.role == "user"),
            None,
        )
        if not latest_user:
            raise HTTPException(status_code=400, detail="No user message found")

        image, prompt = extract_image_and_text(latest_user.content)
        if image is None:
            raise HTTPException(status_code=400, detail="No image provided in message")

        # Default to OCR if no specific prompt
        if not prompt or prompt.strip() == "":
            prompt = "OCR:"

        logger.info(f"Processing request with prompt: {prompt[:50]}...")

        started_at = time.time()
        response_text = generate_response(image, prompt, request.max_tokens or 4096)
        elapsed = time.time() - started_at
        logger.info(f"Generated response in {elapsed:.2f}s ({len(response_text)} chars)")

        # Assemble the OpenAI-style envelope
        completion_id = f"chatcmpl-{int(time.time()*1000)}"
        created_at = int(time.time())
        estimated_completion = len(response_text) // 4  # roughly four chars per token
        only_choice = Choice(
            index=0,
            message=Message(role="assistant", content=response_text),
            finish_reason="stop",
        )
        return ChatCompletionResponse(
            id=completion_id,
            created=created_at,
            model=request.model,
            choices=[only_choice],
            usage=Usage(
                prompt_tokens=100,  # Approximate
                completion_tokens=estimated_completion,
                total_tokens=100 + estimated_completion,
            ),
        )
    except HTTPException:
        # Deliberate HTTP errors pass through unchanged
        raise
    except Exception as e:
        logger.error(f"Error processing request: {e}")
        raise HTTPException(status_code=500, detail=str(e))
# ---- Schemas for the legacy /ocr endpoint (old PaddleOCR API compatibility) ----

class LegacyOCRRequest(BaseModel):
    """Body accepted by POST /ocr."""
    # Anything decode_image() accepts
    image: str
    # Key into TASK_PROMPTS
    task: Optional[str] = "ocr"


class LegacyOCRResponse(BaseModel):
    """Body returned by POST /ocr."""
    success: bool
    result: str
    task: str
    # Populated only when success is False
    error: Optional[str] = None
@app.post("/ocr", response_model=LegacyOCRResponse)
async def legacy_ocr(request: LegacyOCRRequest):
    """
    Legacy OCR endpoint for backwards compatibility.

    Tasks: ocr, table, formula, chart. An unknown task uses the OCR prompt.

    Returns:
        LegacyOCRResponse; failures are reported as ``success=False`` with the
        error message rather than as an HTTP error.
    """
    # ``task`` is Optional on the request but a required string on the
    # response. An explicit ``"task": null`` therefore failed response
    # validation - in the error path as well - and surfaced as an unhandled
    # 500. Treat a missing task as the default "ocr".
    task = request.task or "ocr"
    try:
        image = decode_image(request.image)
        prompt = TASK_PROMPTS.get(task, TASK_PROMPTS["ocr"])
        result = generate_response(image, prompt)

        return LegacyOCRResponse(
            success=True,
            result=result,
            task=task
        )
    except Exception as e:
        logger.error(f"Legacy OCR error: {e}")
        return LegacyOCRResponse(
            success=False,
            result="",
            task=task,
            error=str(e)
        )
if __name__ == "__main__":
    # Started directly (as the CPU entrypoint does): serve the app on the
    # host and port taken from SERVER_HOST / SERVER_PORT.
    from uvicorn import run as serve

    serve(app, host=SERVER_HOST, port=SERVER_PORT)

View File

@@ -77,56 +77,73 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CPU variant has longer `start-period` (120s) due to slower startup. CPU variant has longer `start-period` (120s) due to slower startup.
## PaddleOCR ## PaddleOCR-VL (Recommended)
### Overview ### Overview
PaddleOCR is a standalone OCR service using PaddlePaddle's PP-OCRv4 model. It provides: PaddleOCR-VL is a 0.9B parameter Vision-Language Model specifically optimized for document parsing. It replaces the older PP-Structure approach with native VLM understanding.
- Text detection and recognition **Key advantages over PP-Structure:**
- Multi-language support - Native table understanding (no HTML parsing needed)
- FastAPI REST API - 109 language support
- GPU and CPU variants - Better handling of complex multi-row tables
- Structured Markdown/JSON output
### Docker Images ### Docker Images
| Tag | Description | | Tag | Description |
|-----|-------------| |-----|-------------|
| `paddleocr` | GPU variant (default) | | `paddleocr-vl` | GPU variant using vLLM (recommended) |
| `paddleocr-gpu` | GPU variant (alias) | | `paddleocr-vl-cpu` | CPU variant using transformers |
| `paddleocr-cpu` | CPU-only variant |
### API Endpoints ### API Endpoints (OpenAI-compatible)
| Endpoint | Method | Description | | Endpoint | Method | Description |
|----------|--------|-------------| |----------|--------|-------------|
| `/health` | GET | Health check with model info | | `/health` | GET | Health check with model info |
| `/ocr` | POST | OCR with base64 image (JSON body) | | `/v1/models` | GET | List available models |
| `/ocr/upload` | POST | OCR with file upload (multipart form) | | `/v1/chat/completions` | POST | OpenAI-compatible chat completions |
| `/ocr` | POST | Legacy OCR endpoint |
### Request/Response Format ### Request/Response Format
**POST /ocr (JSON)** **POST /v1/chat/completions (OpenAI-compatible)**
```json ```json
{ {
"image": "<base64-encoded-image>", "model": "paddleocr-vl",
"language": "en" // optional "messages": [
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
{"type": "text", "text": "Table Recognition:"}
]
}
],
"temperature": 0.0,
"max_tokens": 8192
} }
``` ```
**POST /ocr/upload (multipart)** **Task Prompts:**
- `img`: image file - `"OCR:"` - Text recognition
- `language`: optional language code - `"Table Recognition:"` - Table extraction (returns markdown)
- `"Formula Recognition:"` - Formula extraction
- `"Chart Recognition:"` - Chart extraction
**Response** **Response**
```json ```json
{ {
"success": true, "id": "chatcmpl-...",
"results": [ "object": "chat.completion",
"choices": [
{ {
"text": "Invoice #12345", "index": 0,
"confidence": 0.98, "message": {
"box": [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] "role": "assistant",
"content": "| Date | Description | Amount |\n|---|---|---|\n| 2021-06-01 | GITLAB INC | -119.96 |"
},
"finish_reason": "stop"
} }
] ]
} }
@@ -136,19 +153,16 @@ PaddleOCR is a standalone OCR service using PaddlePaddle's PP-OCRv4 model. It pr
| Variable | Default | Description | | Variable | Default | Description |
|----------|---------|-------------| |----------|---------|-------------|
| `OCR_LANGUAGE` | `en` | Default language for OCR | | `MODEL_NAME` | `PaddlePaddle/PaddleOCR-VL` | Model to load |
| `SERVER_PORT` | `5000` | Server port | | `HOST` | `0.0.0.0` | Server host |
| `SERVER_HOST` | `0.0.0.0` | Server host | | `PORT` | `8000` | Server port |
| `CUDA_VISIBLE_DEVICES` | (auto) | Set to `-1` for CPU-only | | `MAX_BATCHED_TOKENS` | `16384` | vLLM max batch tokens |
| `GPU_MEMORY_UTILIZATION` | `0.9` | GPU memory usage (0-1) |
### Performance ### Performance
- **GPU**: ~1-3 seconds per page - **GPU (vLLM)**: ~2-5 seconds per page
- **CPU**: ~10-30 seconds per page - **CPU**: ~30-60 seconds per page
### Supported Languages
Common language codes: `en` (English), `ch` (Chinese), `de` (German), `fr` (French), `es` (Spanish), `ja` (Japanese), `ko` (Korean)
--- ---
@@ -193,6 +207,43 @@ npmci docker build
npmci docker push code.foss.global npmci docker push code.foss.global
``` ```
## Multi-Pass Extraction Strategy
The bank statement extraction uses a dual-VLM consensus approach:
### Architecture: Dual-VLM Consensus
| VLM | Model | Purpose |
|-----|-------|---------|
| **MiniCPM-V 4.5** | 8B params | Primary visual extraction |
| **PaddleOCR-VL** | 0.9B params | Table-specialized extraction |
### Extraction Strategy
1. **Pass 1**: MiniCPM-V visual extraction (images → JSON)
2. **Pass 2**: PaddleOCR-VL table recognition (images → markdown → JSON)
3. **Consensus**: If Pass 1 == Pass 2 → Done (fast path)
4. **Pass 3+**: MiniCPM-V visual if no consensus
### Why Dual-VLM Works
- **Different architectures**: Two independent models cross-check each other
- **Specialized strengths**: PaddleOCR-VL optimized for tables, MiniCPM-V for general vision
- **No structure loss**: Both VLMs see the original images directly
- **Fast consensus**: Most documents complete in 2 passes when VLMs agree
### Comparison vs Old PP-Structure Approach
| Approach | Bank Statement Result | Issue |
|----------|----------------------|-------|
| MiniCPM-V Visual | 28 transactions ✓ | - |
| PP-Structure HTML + Visual | 13 transactions ✗ | HTML merged rows incorrectly |
| PaddleOCR-VL Table | 28 transactions ✓ | Native table understanding |
**Key insight**: PP-Structure's HTML output loses structure for complex tables. PaddleOCR-VL's native VLM approach maintains table integrity.
---
## Related Resources ## Related Resources
- [Ollama Documentation](https://ollama.ai/docs) - [Ollama Documentation](https://ollama.ai/docs)

View File

@@ -4,12 +4,16 @@ import * as path from 'path';
import { execSync } from 'child_process'; import { execSync } from 'child_process';
import * as os from 'os'; import * as os from 'os';
// Service URLs
const OLLAMA_URL = 'http://localhost:11434'; const OLLAMA_URL = 'http://localhost:11434';
const MODEL = 'openbmb/minicpm-v4.5:q8_0'; const PADDLEOCR_VL_URL = 'http://localhost:8000';
const PADDLEOCR_URL = 'http://localhost:5000';
// Prompt for visual extraction (with images) // Models
const VISUAL_EXTRACT_PROMPT = `/nothink const MINICPM_MODEL = 'openbmb/minicpm-v4.5:q8_0';
const PADDLEOCR_VL_MODEL = 'paddleocr-vl';
// Prompt for MiniCPM-V visual extraction
const MINICPM_EXTRACT_PROMPT = `/nothink
You are a bank statement parser. Extract EVERY transaction from the table. You are a bank statement parser. Extract EVERY transaction from the table.
Read the Amount column carefully: Read the Amount column carefully:
@@ -21,9 +25,12 @@ For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
Do not skip any rows. Return ONLY the JSON array, no explanation.`; Do not skip any rows. Return ONLY the JSON array, no explanation.`;
// Prompt for OCR-only extraction (no images) // Prompt for PaddleOCR-VL table extraction
const OCR_EXTRACT_PROMPT = `/nothink const PADDLEOCR_VL_TABLE_PROMPT = `Table Recognition:`;
You are a bank statement parser. Extract EVERY transaction from the OCR text below.
// Post-processing prompt to convert PaddleOCR-VL output to JSON
const PADDLEOCR_VL_CONVERT_PROMPT = `/nothink
Convert the following bank statement table data to JSON.
Read the Amount values carefully: Read the Amount values carefully:
- "- 21,47 €" means DEBIT, output as: -21.47 - "- 21,47 €" means DEBIT, output as: -21.47
@@ -32,48 +39,12 @@ Read the Amount values carefully:
For each transaction output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47} For each transaction output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
Do not skip any transactions. Return ONLY the JSON array, no explanation.`; Return ONLY the JSON array, no explanation.
/** Table data:
* Build prompt for OCR-only extraction (no images)
*/
function buildOcrOnlyPrompt(ocrText: string): string {
// Limit OCR text to prevent context overflow
const maxOcrLength = 12000;
const truncatedOcr = ocrText.length > maxOcrLength
? ocrText.substring(0, maxOcrLength) + '\n... (truncated)'
: ocrText;
return `${OCR_EXTRACT_PROMPT}
OCR text from bank statement:
--- ---
${truncatedOcr} {TABLE_DATA}
---`; ---`;
}
/**
* Extract OCR text from an image using PaddleOCR
*/
async function extractOcrText(imageBase64: string): Promise<string> {
try {
const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ image: imageBase64 }),
});
if (!response.ok) return '';
const data = await response.json();
if (data.success && data.results) {
return data.results.map((r: { text: string }) => r.text).join('\n');
}
} catch {
// PaddleOCR unavailable
}
return '';
}
interface ITransaction { interface ITransaction {
date: string; date: string;
@@ -94,7 +65,7 @@ function convertPdfToImages(pdfPath: string): string[] {
{ stdio: 'pipe' } { stdio: 'pipe' }
); );
const files = fs.readdirSync(tempDir).filter((f) => f.endsWith('.png')).sort(); const files = fs.readdirSync(tempDir).filter((f: string) => f.endsWith('.png')).sort();
const images: string[] = []; const images: string[] = [];
for (const file of files) { for (const file of files) {
@@ -110,12 +81,12 @@ function convertPdfToImages(pdfPath: string): string[] {
} }
/** /**
* Visual extraction pass (with images) * Extract using MiniCPM-V via Ollama
*/ */
async function extractVisual(images: string[], passLabel: string): Promise<ITransaction[]> { async function extractWithMiniCPM(images: string[], passLabel: string): Promise<ITransaction[]> {
const payload = { const payload = {
model: MODEL, model: MINICPM_MODEL,
prompt: VISUAL_EXTRACT_PROMPT, prompt: MINICPM_EXTRACT_PROMPT,
images, images,
stream: true, stream: true,
options: { options: {
@@ -124,31 +95,6 @@ async function extractVisual(images: string[], passLabel: string): Promise<ITran
}, },
}; };
return doExtraction(payload, passLabel);
}
/**
* OCR-only extraction pass (no images, just text)
*/
async function extractFromOcr(ocrText: string, passLabel: string): Promise<ITransaction[]> {
const payload = {
model: MODEL,
prompt: buildOcrOnlyPrompt(ocrText),
stream: true,
options: {
num_predict: 16384,
temperature: 0.1,
},
};
return doExtraction(payload, passLabel);
}
/**
* Common extraction logic
*/
async function doExtraction(payload: object, passLabel: string): Promise<ITransaction[]> {
const response = await fetch(`${OLLAMA_URL}/api/generate`, { const response = await fetch(`${OLLAMA_URL}/api/generate`, {
method: 'POST', method: 'POST',
headers: { 'Content-Type': 'application/json' }, headers: { 'Content-Type': 'application/json' },
@@ -168,7 +114,7 @@ async function doExtraction(payload: object, passLabel: string): Promise<ITransa
let fullText = ''; let fullText = '';
let lineBuffer = ''; let lineBuffer = '';
console.log(`[${passLabel}] Extracting...`); console.log(`[${passLabel}] Extracting with MiniCPM-V...`);
while (true) { while (true) {
const { done, value } = await reader.read(); const { done, value } = await reader.read();
@@ -184,7 +130,6 @@ async function doExtraction(payload: object, passLabel: string): Promise<ITransa
fullText += json.response; fullText += json.response;
lineBuffer += json.response; lineBuffer += json.response;
// Print complete lines
if (lineBuffer.includes('\n')) { if (lineBuffer.includes('\n')) {
const parts = lineBuffer.split('\n'); const parts = lineBuffer.split('\n');
for (let i = 0; i < parts.length - 1; i++) { for (let i = 0; i < parts.length - 1; i++) {
@@ -214,6 +159,140 @@ async function doExtraction(payload: object, passLabel: string): Promise<ITransa
return JSON.parse(fullText.substring(startIdx, endIdx)); return JSON.parse(fullText.substring(startIdx, endIdx));
} }
/**
 * Run PaddleOCR-VL table recognition on a single page image.
 *
 * Posts one chat-completion request to the service's OpenAI-compatible
 * endpoint, carrying the image as a PNG data URL together with the table
 * recognition prompt.
 *
 * @param imageBase64 Base64-encoded image data (embedded as `data:image/png;base64,...`).
 * @returns The first choice's message content, or '' when the response has none.
 * @throws Error when the HTTP response is not ok; the message carries status and body text.
 */
async function extractTableWithPaddleOCRVL(imageBase64: string): Promise<string> {
  const imagePart = {
    type: 'image_url',
    image_url: { url: `data:image/png;base64,${imageBase64}` },
  };
  const promptPart = {
    type: 'text',
    text: PADDLEOCR_VL_TABLE_PROMPT,
  };

  const requestBody = JSON.stringify({
    model: PADDLEOCR_VL_MODEL,
    messages: [{ role: 'user', content: [imagePart, promptPart] }],
    temperature: 0.0,
    max_tokens: 8192,
  });

  const response = await fetch(`${PADDLEOCR_VL_URL}/v1/chat/completions`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: requestBody,
  });

  // Happy path first; anything else is reported with the server's own body text
  if (response.ok) {
    const data = await response.json();
    return data.choices?.[0]?.message?.content || '';
  }

  const errorBody = await response.text();
  throw new Error(`PaddleOCR-VL API error: ${response.status} - ${errorBody}`);
}
/**
 * Convert PaddleOCR-VL table output to transactions using MiniCPM-V.
 *
 * Substitutes the table data into the conversion prompt, streams the
 * completion from Ollama's /api/generate endpoint, and parses the outermost
 * JSON array found in the accumulated response text.
 *
 * @param tableData Table text produced by PaddleOCR-VL (inserted verbatim into the prompt).
 * @param passLabel Prefix used for log lines.
 * @returns The parsed transaction array.
 * @throws Error when the HTTP response is not ok, the response has no body,
 *         or no JSON array is present; JSON.parse errors propagate unchanged.
 */
async function convertTableToTransactions(
  tableData: string,
  passLabel: string
): Promise<ITransaction[]> {
  // A replacer function inserts tableData verbatim. A plain string replacement
  // would expand `$&`, `$$`, "$`" and `$'` sequences occurring in the table text.
  const prompt = PADDLEOCR_VL_CONVERT_PROMPT.replace('{TABLE_DATA}', () => tableData);

  const payload = {
    model: MINICPM_MODEL,
    prompt,
    stream: true,
    options: {
      num_predict: 16384,
      temperature: 0.1,
    },
  };

  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }

  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('No response body');
  }

  const decoder = new TextDecoder();
  let fullText = '';
  // Holds the trailing partial line of the previous chunk: a network chunk can
  // end in the middle of a JSON line, which must be completed by the next read.
  let pending = '';

  // Parse one complete newline-delimited JSON line and collect its response text.
  const consumeLine = (line: string): void => {
    if (!line.trim()) return;
    try {
      const json = JSON.parse(line);
      if (json.response) {
        fullText += json.response;
      }
    } catch {
      // Skip invalid JSON lines
    }
  };

  console.log(`[${passLabel}] Converting table data to JSON...`);

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    pending += decoder.decode(value, { stream: true });
    const lines = pending.split('\n');
    // The last element is either '' (chunk ended on a newline) or an incomplete line
    pending = lines.pop() ?? '';
    for (const line of lines) {
      consumeLine(line);
    }
  }

  // Flush bytes still buffered in the decoder and handle a final unterminated line
  pending += decoder.decode();
  consumeLine(pending);

  const startIdx = fullText.indexOf('[');
  const endIdx = fullText.lastIndexOf(']') + 1;

  if (startIdx < 0 || endIdx <= startIdx) {
    throw new Error('No JSON array found in response');
  }

  return JSON.parse(fullText.substring(startIdx, endIdx));
}
/**
 * Extract transactions using PaddleOCR-VL table recognition plus JSON conversion.
 *
 * Each page image is sent to PaddleOCR-VL separately. Non-blank outputs are
 * labelled with their page number, joined with blank lines, and passed to
 * convertTableToTransactions.
 *
 * @param images Page images, one entry per page, forwarded to extractTableWithPaddleOCRVL.
 * @param passLabel Prefix used for log lines.
 * @returns Transactions parsed from the combined table data.
 * @throws Error when no page yields any table data; errors from the
 *         per-page extraction or the conversion step propagate unchanged.
 */
async function extractWithPaddleOCRVL(
  images: string[],
  passLabel: string
): Promise<ITransaction[]> {
  console.log(`[${passLabel}] Extracting tables with PaddleOCR-VL...`);

  // Extract table data from each page
  const tableDataParts: string[] = [];
  for (let i = 0; i < images.length; i++) {
    console.log(`[${passLabel}] Processing page ${i + 1}/${images.length}...`);
    const tableData = await extractTableWithPaddleOCRVL(images[i]);
    if (tableData.trim()) {
      tableDataParts.push(`--- Page ${i + 1} ---\n${tableData}`);
    }
  }

  const combinedTableData = tableDataParts.join('\n\n');
  console.log(`[${passLabel}] Got ${combinedTableData.length} chars of table data`);

  // With nothing recognised there is no table to convert. Fail here instead of
  // prompting the conversion model with an empty table and trusting its output.
  if (tableDataParts.length === 0) {
    throw new Error('PaddleOCR-VL returned no table data');
  }

  // Convert to transactions
  return convertTableToTransactions(combinedTableData, passLabel);
}
/** /**
* Create a hash of transactions for comparison * Create a hash of transactions for comparison
*/ */
@@ -225,10 +304,31 @@ function hashTransactions(transactions: ITransaction[]): string {
} }
/** /**
* Extract with majority voting - run until 2 passes match * Check if PaddleOCR-VL service is available
* Strategy: Pass 1 = Visual (images), Pass 2 = OCR-only (text), Pass 3+ = Visual
*/ */
async function extractWithConsensus(images: string[], maxPasses: number = 5): Promise<ITransaction[]> { async function isPaddleOCRVLAvailable(): Promise<boolean> {
try {
const response = await fetch(`${PADDLEOCR_VL_URL}/health`, {
method: 'GET',
signal: AbortSignal.timeout(5000),
});
return response.ok;
} catch {
return false;
}
}
/**
* Extract with dual-VLM consensus
* Strategy:
* Pass 1 = MiniCPM-V visual extraction
* Pass 2 = PaddleOCR-VL table recognition (if available)
* Pass 3+ = MiniCPM-V visual (fallback)
*/
async function extractWithConsensus(
images: string[],
maxPasses: number = 5
): Promise<ITransaction[]> {
const results: Array<{ transactions: ITransaction[]; hash: string }> = []; const results: Array<{ transactions: ITransaction[]; hash: string }> = [];
const hashCounts: Map<string, number> = new Map(); const hashCounts: Map<string, number> = new Map();
@@ -236,59 +336,48 @@ async function extractWithConsensus(images: string[], maxPasses: number = 5): Pr
const hash = hashTransactions(transactions); const hash = hashTransactions(transactions);
results.push({ transactions, hash }); results.push({ transactions, hash });
hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1); hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1);
console.log(`[${passLabel}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)`); console.log(
`[${passLabel}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)`
);
return hashCounts.get(hash)!; return hashCounts.get(hash)!;
}; };
// Run Pass 1 (Visual) in parallel with OCR extraction // Check if PaddleOCR-VL is available
let ocrText = ''; const paddleOCRVLAvailable = await isPaddleOCRVLAvailable();
const pass1Promise = extractVisual(images, 'Pass 1 Visual').catch((err) => ({ error: err })); if (paddleOCRVLAvailable) {
console.log('[Setup] PaddleOCR-VL service available - using dual-VLM consensus');
// Extract OCR from all pages
const ocrPromise = (async () => {
const ocrTexts: string[] = [];
for (let i = 0; i < images.length; i++) {
const pageOcr = await extractOcrText(images[i]);
if (pageOcr) {
ocrTexts.push(`--- Page ${i + 1} ---\n${pageOcr}`);
}
}
ocrText = ocrTexts.join('\n\n');
if (ocrText) {
console.log(`[OCR] Extracted text from ${ocrTexts.length} page(s)`);
}
return ocrText;
})();
// Wait for Pass 1 and OCR to complete
const [pass1Result] = await Promise.all([pass1Promise, ocrPromise]);
// Process Pass 1 result
if ('error' in pass1Result) {
console.log(`[Pass 1] Error: ${(pass1Result as { error: unknown }).error}`);
} else { } else {
addResult(pass1Result as ITransaction[], 'Pass 1 Visual'); console.log('[Setup] PaddleOCR-VL not available - using MiniCPM-V only');
} }
// Pass 2: OCR-only (no images) - faster, different approach // Pass 1: MiniCPM-V visual extraction
if (ocrText) { try {
const pass1Result = await extractWithMiniCPM(images, 'Pass 1 MiniCPM-V');
addResult(pass1Result, 'Pass 1 MiniCPM-V');
} catch (err) {
console.log(`[Pass 1] Error: ${err}`);
}
// Pass 2: PaddleOCR-VL table recognition (if available)
if (paddleOCRVLAvailable) {
try { try {
const pass2Result = await extractFromOcr(ocrText, 'Pass 2 OCR-only'); const pass2Result = await extractWithPaddleOCRVL(images, 'Pass 2 PaddleOCR-VL');
const count = addResult(pass2Result, 'Pass 2 OCR-only'); const count = addResult(pass2Result, 'Pass 2 PaddleOCR-VL');
if (count >= 2) { if (count >= 2) {
console.log(`[Consensus] Visual and OCR extractions match!`); console.log('[Consensus] MiniCPM-V and PaddleOCR-VL extractions match!');
return pass2Result; return pass2Result;
} }
} catch (err) { } catch (err) {
console.log(`[Pass 2 OCR-only] Error: ${err}`); console.log(`[Pass 2 PaddleOCR-VL] Error: ${err}`);
} }
} }
// Continue with visual passes 3+ if no consensus yet // Pass 3+: Continue with MiniCPM-V visual passes
for (let pass = 3; pass <= maxPasses; pass++) { const startPass = paddleOCRVLAvailable ? 3 : 2;
for (let pass = startPass; pass <= maxPasses; pass++) {
try { try {
const transactions = await extractVisual(images, `Pass ${pass} Visual`); const transactions = await extractWithMiniCPM(images, `Pass ${pass} MiniCPM-V`);
const count = addResult(transactions, `Pass ${pass} Visual`); const count = addResult(transactions, `Pass ${pass} MiniCPM-V`);
if (count >= 2) { if (count >= 2) {
console.log(`[Consensus] Reached after ${pass} passes`); console.log(`[Consensus] Reached after ${pass} passes`);
@@ -368,7 +457,7 @@ function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: strin
} }
const files = fs.readdirSync(testDir); const files = fs.readdirSync(testDir);
const pdfFiles = files.filter((f) => f.endsWith('.pdf')); const pdfFiles = files.filter((f: string) => f.endsWith('.pdf'));
const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = []; const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];
for (const pdf of pdfFiles) { for (const pdf of pdfFiles) {
@@ -402,6 +491,13 @@ tap.test('should have MiniCPM-V 4.5 model loaded', async () => {
expect(modelNames.some((name: string) => name.includes('minicpm-v4.5'))).toBeTrue(); expect(modelNames.some((name: string) => name.includes('minicpm-v4.5'))).toBeTrue();
}); });
// Availability probe: PaddleOCR-VL is optional, so either outcome is accepted,
// but the probe itself must resolve to a boolean instead of throwing.
tap.test('should check PaddleOCR-VL availability', async () => {
  const available = await isPaddleOCRVLAvailable();
  console.log(`PaddleOCR-VL available: ${available}`);
  // Passes whether or not the service is running; asserts the probe's result type
  expect(available).toBeTypeofBoolean();
});
// Dynamic test for each PDF/JSON pair // Dynamic test for each PDF/JSON pair
const testCases = findTestCases(); const testCases = findTestCases();
for (const testCase of testCases) { for (const testCase of testCases) {
@@ -416,7 +512,7 @@ for (const testCase of testCases) {
const images = convertPdfToImages(testCase.pdfPath); const images = convertPdfToImages(testCase.pdfPath);
console.log(`Converted: ${images.length} pages\n`); console.log(`Converted: ${images.length} pages\n`);
// Extract with consensus voting // Extract with dual-VLM consensus
const extracted = await extractWithConsensus(images); const extracted = await extractWithConsensus(images);
console.log(`\nFinal: ${extracted.length} transactions`); console.log(`\nFinal: ${extracted.length} transactions`);

View File

@@ -1,258 +0,0 @@
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execFileSync, execSync } from 'child_process';
import * as os from 'os';
// Base URL of the PaddleOCR HTTP server exercised by every test in this file
const PADDLEOCR_URL = 'http://localhost:5000';
/**
 * One entry of the `results` array returned by the OCR endpoints.
 */
interface IOCRResult {
  // Recognised text of this entry
  text: string;
  // Recognition confidence; the structure test expects a value in (0, 1]
  confidence: number;
  // Bounding box; the structure test expects 4 points of 2 numbers each
  box: number[][];
}
/**
 * JSON body returned by the `/ocr` and `/ocr/upload` endpoints.
 */
interface IOCRResponse {
  // True when OCR succeeded; the error-handling test expects false for invalid input
  success: boolean;
  // Recognised entries
  results: IOCRResult[];
  // Error message; the error-handling test expects a string when success is false
  error?: string;
}
/**
 * JSON body returned by the `/health` endpoint.
 */
interface IHealthResponse {
  // Service state; the health test expects 'healthy'
  status: string;
  // Model identifier; the health test expects 'PP-OCRv4'
  model: string;
  // Configured OCR language
  language: string;
  // Whether the server reports GPU usage
  gpu_enabled: boolean;
}
/**
 * Convert the first page of a PDF to a PNG using ImageMagick.
 *
 * The page is rendered into a fresh temporary directory, which is removed
 * again whether or not the conversion succeeds.
 *
 * @param pdfPath Path of the PDF file; only page index 0 is rendered.
 * @returns The PNG file contents encoded as base64.
 * @throws Error when the `convert` command fails or the output cannot be read.
 */
function convertPdfToImage(pdfPath: string): string {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPath = path.join(tempDir, 'page.png');

  try {
    // Arguments go to `convert` as an argv array, bypassing the shell, so quotes,
    // `$` or backticks in the path can neither break nor inject into the command.
    execFileSync(
      'convert',
      [
        '-density', '200',
        '-quality', '90',
        `${pdfPath}[0]`,
        '-background', 'white',
        '-alpha', 'remove',
        outputPath,
      ],
      { stdio: 'pipe' }
    );

    const imageData = fs.readFileSync(outputPath);
    return imageData.toString('base64');
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}
/**
 * Create a simple 400x100 test image containing the given text using ImageMagick.
 *
 * The image is written into a fresh temporary directory, which is removed
 * again whether or not rendering succeeds.
 *
 * @param text Text drawn centered in black on a white background.
 * @returns The PNG file contents encoded as base64.
 * @throws Error when the `convert` command fails or the output cannot be read.
 */
function createTestImage(text: string): string {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'test-image-'));
  const outputPath = path.join(tempDir, 'test.png');

  try {
    // Arguments go to `convert` as an argv array, bypassing the shell, so quotes,
    // `$` or backticks in `text` reach ImageMagick unchanged instead of being
    // interpreted by /bin/sh.
    // NOTE(review): ImageMagick may still apply its own expansion to the
    // annotate text (e.g. percent escapes) - confirm before passing arbitrary input.
    execFileSync(
      'convert',
      [
        '-size', '400x100',
        'xc:white',
        '-font', 'DejaVu-Sans',
        '-pointsize', '24',
        '-fill', 'black',
        '-gravity', 'center',
        '-annotate', '0', text,
        outputPath,
      ],
      { stdio: 'pipe' }
    );

    const imageData = fs.readFileSync(outputPath);
    return imageData.toString('base64');
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}
// Health endpoint: the server must answer and report a healthy PP-OCRv4 instance
tap.test('should respond to health check', async () => {
  const healthResponse = await fetch(`${PADDLEOCR_URL}/health`);
  expect(healthResponse.ok).toBeTrue();

  const health: IHealthResponse = await healthResponse.json();

  // Fixed values first, then type-only checks for deployment-dependent fields
  expect(health.status).toEqual('healthy');
  expect(health.model).toEqual('PP-OCRv4');
  expect(health.language).toBeTypeofString();
  expect(health.gpu_enabled).toBeTypeofBoolean();

  const report = [
    `PaddleOCR Status: ${health.status}`,
    ` Model: ${health.model}`,
    ` Language: ${health.language}`,
    ` GPU Enabled: ${health.gpu_enabled}`,
  ];
  for (const reportLine of report) {
    console.log(reportLine);
  }
});
// Base64 OCR test: render known text, post it as JSON, and expect at least
// one of the rendered words to be recognised.
tap.test('should perform OCR on base64 image', async () => {
  // Create a test image with known text
  const testText = 'Hello World 12345';
  console.log(`Creating test image with text: "${testText}"`);

  const imageBase64 = createTestImage(testText);

  const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ image: imageBase64 }),
  });

  expect(response.ok).toBeTrue();

  const data: IOCRResponse = await response.json();
  expect(data.success).toBeTrue();
  expect(data.results).toBeArray();

  const extractedText = data.results.map((r) => r.text).join(' ');
  console.log(`Extracted text: "${extractedText}"`);

  // Check that we got some text back
  expect(data.results.length).toBeGreaterThan(0);

  // Check that at least some of the expected text was found. Case and
  // whitespace are normalised so the match does not depend on how the
  // recognised text was segmented.
  const normalizedExtracted = extractedText.toLowerCase().replace(/\s+/g, '');
  const hasPartialMatch =
    normalizedExtracted.includes('hello') ||
    normalizedExtracted.includes('world') ||
    normalizedExtracted.includes('12345');
  expect(hasPartialMatch).toBeTrue();
});
// Multipart upload: the server must also accept a PNG posted as form field "img"
tap.test('should perform OCR via file upload', async () => {
  const testText = 'Invoice Number 98765';
  console.log(`Creating test image with text: "${testText}"`);

  // Decode the generated image back to raw bytes for the multipart body
  const pngBytes = Buffer.from(createTestImage(testText), 'base64');
  const uploadForm = new FormData();
  uploadForm.append('img', new Blob([pngBytes], { type: 'image/png' }), 'test.png');

  const uploadResponse = await fetch(`${PADDLEOCR_URL}/ocr/upload`, {
    method: 'POST',
    body: uploadForm,
  });
  expect(uploadResponse.ok).toBeTrue();

  const ocr: IOCRResponse = await uploadResponse.json();
  expect(ocr.success).toBeTrue();
  expect(ocr.results).toBeArray();

  const recognised: string[] = [];
  for (const region of ocr.results) {
    recognised.push(region.text);
  }
  console.log(`Extracted text: "${recognised.join(' ')}"`);

  // At least one entry must come back
  expect(ocr.results.length).toBeGreaterThan(0);
});
// Result shape: the first returned entry must carry text, confidence and a 4-point box
tap.test('should return proper OCR result structure', async () => {
  const imageBase64 = createTestImage('Test 123');

  const ocrResponse = await fetch(`${PADDLEOCR_URL}/ocr`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ image: imageBase64 }),
  });
  const ocr: IOCRResponse = await ocrResponse.json();

  // Nothing to inspect when no entry was returned
  if (ocr.results.length === 0) {
    return;
  }

  const first = ocr.results[0];

  // Required fields
  expect(first.text).toBeTypeofString();
  expect(first.confidence).toBeTypeofNumber();
  expect(first.box).toBeArray();

  // Bounding box: 4 points, each an [x, y] pair of numbers
  expect(first.box.length).toEqual(4);
  first.box.forEach((corner) => {
    expect(corner.length).toEqual(2);
    expect(corner[0]).toBeTypeofNumber();
    expect(corner[1]).toBeTypeofNumber();
  });

  // Confidence must lie in (0, 1]
  expect(first.confidence).toBeGreaterThan(0);
  expect(first.confidence).toBeLessThanOrEqual(1);

  const confidencePercent = (first.confidence * 100).toFixed(1);
  console.log(`Result structure valid:`);
  console.log(` Text: "${first.text}"`);
  console.log(` Confidence: ${confidencePercent}%`);
  console.log(` Box: ${JSON.stringify(first.box)}`);
});
// Optional real-document test: registered only when .nogit/invoices holds a PDF
const invoiceDir = path.join(process.cwd(), '.nogit/invoices');
const invoicePdfs = fs.existsSync(invoiceDir)
  ? fs.readdirSync(invoiceDir).filter((f) => f.endsWith('.pdf'))
  : [];

if (invoicePdfs.length > 0) {
  // Only the first PDF found is exercised
  const testPdf = invoicePdfs[0];

  tap.test(`should extract text from invoice: ${testPdf}`, async () => {
    console.log(`Converting ${testPdf} to image...`);
    const imageBase64 = convertPdfToImage(path.join(invoiceDir, testPdf));
    console.log(`Image size: ${(imageBase64.length / 1024).toFixed(1)} KB`);

    // Time only the OCR round trip, not the PDF rendering
    const startedAt = Date.now();
    const ocrResponse = await fetch(`${PADDLEOCR_URL}/ocr`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ image: imageBase64 }),
    });
    const elapsedMs = Date.now() - startedAt;

    expect(ocrResponse.ok).toBeTrue();

    const ocr: IOCRResponse = await ocrResponse.json();
    expect(ocr.success).toBeTrue();

    console.log(`OCR completed in ${(elapsedMs / 1000).toFixed(2)}s`);
    console.log(`Found ${ocr.results.length} text regions`);

    // Show at most the first 10 entries
    const preview = ocr.results.slice(0, 10);
    console.log(`\nFirst ${preview.length} results:`);
    preview.forEach((entry) => {
      console.log(` [${(entry.confidence * 100).toFixed(0)}%] ${entry.text}`);
    });

    const remaining = ocr.results.length - 10;
    if (ocr.results.length > 10) {
      console.log(` ... and ${remaining} more`);
    }

    // An invoice is expected to yield more than 5 entries
    expect(ocr.results.length).toBeGreaterThan(5);
  });
}
// Invalid input: the server must answer with success=false and an error string
tap.test('should handle invalid base64 gracefully', async () => {
  const invalidRequest = {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ image: 'not-valid-base64!!!' }),
  };

  const errorResponse = await fetch(`${PADDLEOCR_URL}/ocr`, invalidRequest);
  const result: IOCRResponse = await errorResponse.json();

  // The failure is reported in the JSON body; the HTTP status is not asserted
  expect(result.success).toBeFalse();
  expect(result.error).toBeTypeofString();

  console.log(`Error handling works: ${result.error}`);
});
export default tap.start();