5 Commits

Author SHA1 Message Date
addae20cbd v1.5.0
Some checks failed
Docker (tags) / security (push) Successful in 31s
Docker (tags) / test (push) Failing after 40s
Docker (tags) / release (push) Has been skipped
Docker (tags) / metadata (push) Has been skipped
2026-01-17 16:57:26 +00:00
0482c35b69 feat(paddleocr-vl): add PaddleOCR-VL GPU Dockerfile, pin vllm, update CPU image deps, and improve entrypoint and tests 2026-01-17 16:57:26 +00:00
15ac1fcf67 update 2026-01-16 16:21:44 +00:00
3c5cf578a5 v1.4.0
Some checks failed
Docker (tags) / security (push) Successful in 28s
Docker (tags) / test (push) Failing after 54s
Docker (tags) / release (push) Has been skipped
Docker (tags) / metadata (push) Has been skipped
2026-01-16 14:24:37 +00:00
82358b2d5d feat(invoices): add hybrid OCR + vision invoice/document parsing with PaddleOCR, consensus voting, and prompt/test refactors 2026-01-16 14:24:37 +00:00
20 changed files with 1547 additions and 1094 deletions

View File

@@ -1,49 +0,0 @@
# PaddleOCR GPU Variant
# OCR processing with NVIDIA GPU support using PaddlePaddle
FROM paddlepaddle/paddle:2.6.2-gpu-cuda11.7-cudnn8.4-trt8.4
LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
LABEL description="PaddleOCR PP-OCRv4 - GPU optimized"
LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
# Environment configuration
ENV OCR_LANGUAGE="en"
ENV SERVER_PORT="5000"
ENV SERVER_HOST="0.0.0.0"
ENV PYTHONUNBUFFERED=1
# Set working directory
WORKDIR /app
# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
libgl1-mesa-glx \
libglib2.0-0 \
curl \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies (using stable paddleocr 2.x)
RUN pip install --no-cache-dir \
paddleocr==2.8.1 \
fastapi \
uvicorn[standard] \
python-multipart \
opencv-python-headless \
pillow
# Copy server files
COPY image_support_files/paddleocr_server.py /app/paddleocr_server.py
COPY image_support_files/paddleocr-entrypoint.sh /usr/local/bin/paddleocr-entrypoint.sh
RUN chmod +x /usr/local/bin/paddleocr-entrypoint.sh
# Note: OCR models will be downloaded on first run
# This ensures compatibility across different GPU architectures
# Expose API port
EXPOSE 5000
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CMD curl -f http://localhost:5000/health || exit 1
ENTRYPOINT ["/usr/local/bin/paddleocr-entrypoint.sh"]

View File

@@ -1,53 +0,0 @@
# PaddleOCR CPU Variant
# OCR processing optimized for CPU-only inference
FROM python:3.10-slim-bookworm
LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
LABEL description="PaddleOCR PP-OCRv4 - CPU optimized"
LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
# Environment configuration for CPU-only mode
ENV OCR_LANGUAGE="en"
ENV SERVER_PORT="5000"
ENV SERVER_HOST="0.0.0.0"
ENV PYTHONUNBUFFERED=1
# Disable GPU usage for CPU-only variant
ENV CUDA_VISIBLE_DEVICES="-1"
# Set working directory
WORKDIR /app
# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
libgl1-mesa-glx \
libglib2.0-0 \
libgomp1 \
curl \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies (CPU version of PaddlePaddle - using stable 2.x versions)
RUN pip install --no-cache-dir \
paddlepaddle==2.6.2 \
paddleocr==2.8.1 \
fastapi \
uvicorn[standard] \
python-multipart \
opencv-python-headless \
pillow
# Copy server files
COPY image_support_files/paddleocr_server.py /app/paddleocr_server.py
COPY image_support_files/paddleocr-entrypoint.sh /usr/local/bin/paddleocr-entrypoint.sh
RUN chmod +x /usr/local/bin/paddleocr-entrypoint.sh
# Note: OCR models will be downloaded on first run
# This avoids build-time segfaults with certain CPU architectures
# Expose API port
EXPOSE 5000
# Health check (longer start-period for CPU variant)
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
CMD curl -f http://localhost:5000/health || exit 1
ENTRYPOINT ["/usr/local/bin/paddleocr-entrypoint.sh"]

70
Dockerfile_paddleocr_vl Normal file
View File

@@ -0,0 +1,70 @@
# PaddleOCR-VL GPU Variant
# Vision-Language Model for document parsing using vLLM
FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
LABEL description="PaddleOCR-VL 0.9B - Vision-Language Model for document parsing"
LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
# Environment configuration
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV HF_HOME=/root/.cache/huggingface
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
# Set working directory
WORKDIR /app
# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
python3.11 \
python3.11-venv \
python3.11-dev \
python3-pip \
git \
curl \
build-essential \
&& rm -rf /var/lib/apt/lists/* \
&& update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
# Create and activate virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Install PyTorch with CUDA support
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir \
torch==2.5.1 \
torchvision \
--index-url https://download.pytorch.org/whl/cu124
# Install vLLM 0.11.1 (first stable release with PaddleOCR-VL support)
RUN pip install --no-cache-dir \
vllm==0.11.1 \
--extra-index-url https://download.pytorch.org/whl/cu124
# Install additional dependencies
RUN pip install --no-cache-dir \
transformers \
accelerate \
safetensors \
pillow \
fastapi \
uvicorn[standard] \
python-multipart \
openai \
httpx
# Copy entrypoint script
COPY image_support_files/paddleocr-vl-entrypoint.sh /usr/local/bin/paddleocr-vl-entrypoint.sh
RUN chmod +x /usr/local/bin/paddleocr-vl-entrypoint.sh
# Expose vLLM API port
EXPOSE 8000
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=300s --retries=3 \
CMD curl -f http://localhost:8000/health || exit 1
ENTRYPOINT ["/usr/local/bin/paddleocr-vl-entrypoint.sh"]

View File

@@ -0,0 +1,57 @@
# PaddleOCR-VL CPU Variant
# Vision-Language Model for document parsing using transformers (slower, no GPU required)
FROM python:3.11-slim-bookworm
LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
LABEL description="PaddleOCR-VL 0.9B CPU - Vision-Language Model for document parsing"
LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
# Environment configuration
ENV PYTHONUNBUFFERED=1
ENV HF_HOME=/root/.cache/huggingface
ENV CUDA_VISIBLE_DEVICES=""
ENV SERVER_PORT=8000
ENV SERVER_HOST=0.0.0.0
# Set working directory
WORKDIR /app
# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
libgl1-mesa-glx \
libglib2.0-0 \
libgomp1 \
curl \
git \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir \
torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cpu && \
pip install --no-cache-dir \
transformers \
accelerate \
safetensors \
pillow \
fastapi \
uvicorn[standard] \
python-multipart \
httpx \
protobuf \
sentencepiece \
einops
# Copy server files
COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py
COPY image_support_files/paddleocr-vl-cpu-entrypoint.sh /usr/local/bin/paddleocr-vl-cpu-entrypoint.sh
RUN chmod +x /usr/local/bin/paddleocr-vl-cpu-entrypoint.sh
# Expose API port
EXPOSE 8000
# Health check (longer start-period for CPU + model download)
HEALTHCHECK --interval=30s --timeout=10s --start-period=600s --retries=3 \
CMD curl -f http://localhost:8000/health || exit 1
ENTRYPOINT ["/usr/local/bin/paddleocr-vl-cpu-entrypoint.sh"]

View File

@@ -0,0 +1,71 @@
# PaddleOCR-VL GPU Variant (Transformers-based, not vLLM)
# Vision-Language Model for document parsing using transformers with CUDA
FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04
LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
LABEL description="PaddleOCR-VL 0.9B GPU - Vision-Language Model using transformers"
LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
# Environment configuration
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV HF_HOME=/root/.cache/huggingface
ENV SERVER_PORT=8000
ENV SERVER_HOST=0.0.0.0
# Set working directory
WORKDIR /app
# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
python3.11 \
python3.11-venv \
python3.11-dev \
python3-pip \
libgl1-mesa-glx \
libglib2.0-0 \
libgomp1 \
curl \
git \
&& rm -rf /var/lib/apt/lists/* \
&& update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
# Create and activate virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Install PyTorch with CUDA support
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir \
torch==2.5.1 \
torchvision \
--index-url https://download.pytorch.org/whl/cu124
# Install Python dependencies (transformers-based, not vLLM)
RUN pip install --no-cache-dir \
transformers \
accelerate \
safetensors \
pillow \
fastapi \
uvicorn[standard] \
python-multipart \
httpx \
protobuf \
sentencepiece \
einops
# Copy server files (same as CPU variant - it auto-detects CUDA)
COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py
COPY image_support_files/paddleocr-vl-cpu-entrypoint.sh /usr/local/bin/paddleocr-vl-entrypoint.sh
RUN chmod +x /usr/local/bin/paddleocr-vl-entrypoint.sh
# Expose API port
EXPOSE 8000
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=300s --retries=3 \
CMD curl -f http://localhost:8000/health || exit 1
ENTRYPOINT ["/usr/local/bin/paddleocr-vl-entrypoint.sh"]

View File

@@ -29,19 +29,19 @@ docker build \
-t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v-cpu \ -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v-cpu \
. .
# Build PaddleOCR GPU variant # Build PaddleOCR-VL GPU variant (vLLM)
echo -e "${GREEN}Building PaddleOCR GPU variant...${NC}" echo -e "${GREEN}Building PaddleOCR-VL GPU variant (vLLM)...${NC}"
docker build \ docker build \
-f Dockerfile_paddleocr \ -f Dockerfile_paddleocr_vl \
-t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr \ -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl \
-t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-gpu \ -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-gpu \
. .
# Build PaddleOCR CPU variant # Build PaddleOCR-VL CPU variant
echo -e "${GREEN}Building PaddleOCR CPU variant...${NC}" echo -e "${GREEN}Building PaddleOCR-VL CPU variant...${NC}"
docker build \ docker build \
-f Dockerfile_paddleocr_cpu \ -f Dockerfile_paddleocr_vl_cpu \
-t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-cpu \ -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-cpu \
. .
echo -e "${GREEN}All images built successfully!${NC}" echo -e "${GREEN}All images built successfully!${NC}"
@@ -52,7 +52,7 @@ echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v (GPU)"
echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v-cpu (CPU)" echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v-cpu (CPU)"
echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:latest (GPU)" echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:latest (GPU)"
echo "" echo ""
echo " PaddleOCR:" echo " PaddleOCR-VL (Vision-Language Model):"
echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr (GPU)" echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl (GPU/vLLM)"
echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-gpu (GPU)" echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-gpu (GPU/vLLM)"
echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-cpu (CPU)" echo " - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-cpu (CPU)"

View File

@@ -1,5 +1,24 @@
# Changelog # Changelog
## 2026-01-17 - 1.5.0 - feat(paddleocr-vl)
add PaddleOCR-VL GPU Dockerfile, pin vllm, update CPU image deps, and improve entrypoint and tests
- Add a new GPU Dockerfile for PaddleOCR-VL (transformers-based) with CUDA support, healthcheck, and entrypoint.
- Pin vllm to 0.11.1 in Dockerfile_paddleocr_vl to use the first stable release with PaddleOCR-VL support.
- Update CPU image: add torchvision==0.20.1 and extra Python deps (protobuf, sentencepiece, einops) required by the transformers-based server.
- Rewrite paddleocr-vl-entrypoint.sh to build vllm args array, add MAX_MODEL_LEN and ENFORCE_EAGER env vars, include --limit-mm-per-prompt and optional --enforce-eager, and switch to exec vllm with constructed args.
- Update tests to use the OpenAI-compatible PaddleOCR-VL chat completions API (/v1/chat/completions) with image+text message payload and model 'paddleocr-vl'.
- Add @types/node to package.json dependencies and tidy devDependencies ordering.
## 2026-01-16 - 1.4.0 - feat(invoices)
add hybrid OCR + vision invoice/document parsing with PaddleOCR, consensus voting, and prompt/test refactors
- Add hybrid pipeline documentation and examples (PaddleOCR + MiniCPM-V) and architecture diagram in recipes/document.md
- Integrate PaddleOCR: new OCR extraction functions and OCR-only prompt flow in test/test.node.ts
- Add consensus voting and parallel-pass optimization to improve reliability (multiple passes, hashing, and majority voting)
- Refactor prompts and tests: introduce /nothink token, OCR truncation limits, separate visual and OCR-only prompts, and improved prompt building in test/test.invoices.ts
- Update image conversion defaults (200 DPI, filename change) and add TypeScript helper functions for extraction and consensus handling
## 2026-01-16 - 1.3.0 - feat(paddleocr) ## 2026-01-16 - 1.3.0 - feat(paddleocr)
add PaddleOCR OCR service (Docker images, server, tests, docs) and CI workflows add PaddleOCR OCR service (Docker images, server, tests, docs) and CI workflows

View File

@@ -1,25 +0,0 @@
#!/bin/bash
set -e
# Configuration from environment
OCR_LANGUAGE="${OCR_LANGUAGE:-en}"
SERVER_PORT="${SERVER_PORT:-5000}"
SERVER_HOST="${SERVER_HOST:-0.0.0.0}"
echo "Starting PaddleOCR Server..."
echo " Language: ${OCR_LANGUAGE}"
echo " Host: ${SERVER_HOST}"
echo " Port: ${SERVER_PORT}"
# Check GPU availability
if [ "${CUDA_VISIBLE_DEVICES}" = "-1" ]; then
echo " GPU: Disabled (CPU mode)"
else
echo " GPU: Enabled"
fi
# Start the FastAPI server with uvicorn
exec python -m uvicorn paddleocr_server:app \
--host "${SERVER_HOST}" \
--port "${SERVER_PORT}" \
--workers 1

View File

@@ -0,0 +1,19 @@
#!/bin/bash
set -e
echo "==================================="
echo "PaddleOCR-VL Server (CPU)"
echo "==================================="
HOST="${SERVER_HOST:-0.0.0.0}"
PORT="${SERVER_PORT:-8000}"
echo "Host: ${HOST}"
echo "Port: ${PORT}"
echo "Device: CPU (no GPU)"
echo ""
echo "Starting PaddleOCR-VL CPU server..."
echo "==================================="
exec python /app/paddleocr_vl_server.py

View File

@@ -0,0 +1,59 @@
#!/bin/bash
set -e
echo "==================================="
echo "PaddleOCR-VL Server"
echo "==================================="
# Configuration
MODEL_NAME="${MODEL_NAME:-PaddlePaddle/PaddleOCR-VL}"
HOST="${HOST:-0.0.0.0}"
PORT="${PORT:-8000}"
MAX_BATCHED_TOKENS="${MAX_BATCHED_TOKENS:-16384}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.9}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}"
ENFORCE_EAGER="${ENFORCE_EAGER:-false}"
echo "Model: ${MODEL_NAME}"
echo "Host: ${HOST}"
echo "Port: ${PORT}"
echo "Max batched tokens: ${MAX_BATCHED_TOKENS}"
echo "GPU memory utilization: ${GPU_MEMORY_UTILIZATION}"
echo "Max model length: ${MAX_MODEL_LEN}"
echo "Enforce eager: ${ENFORCE_EAGER}"
echo ""
# Check GPU availability
if command -v nvidia-smi &> /dev/null; then
echo "GPU Information:"
nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv
echo ""
else
echo "WARNING: nvidia-smi not found. GPU may not be available."
fi
echo "Starting vLLM server..."
echo "==================================="
# Build vLLM command
VLLM_ARGS=(
serve "${MODEL_NAME}"
--trust-remote-code
--host "${HOST}"
--port "${PORT}"
--max-num-batched-tokens "${MAX_BATCHED_TOKENS}"
--gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"
--max-model-len "${MAX_MODEL_LEN}"
--no-enable-prefix-caching
--mm-processor-cache-gb 0
--served-model-name "paddleocr-vl"
--limit-mm-per-prompt '{"image": 1}'
)
# Add enforce-eager if enabled (disables CUDA graphs, saves memory)
if [ "${ENFORCE_EAGER}" = "true" ]; then
VLLM_ARGS+=(--enforce-eager)
fi
# Start vLLM server with PaddleOCR-VL
exec vllm "${VLLM_ARGS[@]}"

View File

@@ -1,253 +0,0 @@
#!/usr/bin/env python3
"""
PaddleOCR FastAPI Server
Provides REST API for OCR operations using PaddleOCR
"""
import os
import io
import base64
import logging
from typing import Optional, List, Any
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import numpy as np
from PIL import Image
from paddleocr import PaddleOCR
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Environment configuration
OCR_LANGUAGE = os.environ.get('OCR_LANGUAGE', 'en')
# GPU is controlled via CUDA_VISIBLE_DEVICES environment variable
USE_GPU = os.environ.get('CUDA_VISIBLE_DEVICES', '') != '-1'
# Initialize FastAPI app
app = FastAPI(
title="PaddleOCR Server",
description="REST API for OCR operations using PaddleOCR PP-OCRv4",
version="1.0.0"
)
# Global OCR instance
ocr_instance: Optional[PaddleOCR] = None
class OCRRequest(BaseModel):
"""Request model for base64 image OCR"""
image: str
language: Optional[str] = None
class BoundingBox(BaseModel):
"""Bounding box for detected text"""
points: List[List[float]]
class OCRResult(BaseModel):
"""Single OCR detection result"""
text: str
confidence: float
box: List[List[float]]
class OCRResponse(BaseModel):
"""OCR response model"""
success: bool
results: List[OCRResult]
error: Optional[str] = None
class HealthResponse(BaseModel):
"""Health check response"""
status: str
model: str
language: str
gpu_enabled: bool
def get_ocr(lang: Optional[str] = None) -> PaddleOCR:
"""Get or initialize the OCR instance"""
global ocr_instance
use_lang = lang or OCR_LANGUAGE
# Return cached instance if same language
if ocr_instance is not None and lang is None:
return ocr_instance
logger.info(f"Initializing PaddleOCR with language={use_lang}, use_gpu={USE_GPU}")
new_ocr = PaddleOCR(
use_angle_cls=True,
lang=use_lang,
use_gpu=USE_GPU,
show_log=False
)
# Cache the default language instance
if lang is None:
ocr_instance = new_ocr
logger.info("PaddleOCR initialized successfully")
return new_ocr
def decode_base64_image(base64_string: str) -> np.ndarray:
"""Decode base64 string to numpy array"""
# Remove data URL prefix if present
if ',' in base64_string:
base64_string = base64_string.split(',')[1]
image_data = base64.b64decode(base64_string)
image = Image.open(io.BytesIO(image_data))
# Convert to RGB if necessary
if image.mode != 'RGB':
image = image.convert('RGB')
return np.array(image)
def process_ocr_result(result: Any) -> List[OCRResult]:
"""Process PaddleOCR result into structured format"""
results = []
if result is None or len(result) == 0:
return results
# PaddleOCR returns list of results per image
# Each result is a list of [box, (text, confidence)]
for line in result[0] if result[0] else []:
if line is None:
continue
box = line[0] # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
text_info = line[1] # (text, confidence)
results.append(OCRResult(
text=text_info[0],
confidence=float(text_info[1]),
box=[[float(p[0]), float(p[1])] for p in box]
))
return results
@app.on_event("startup")
async def startup_event():
"""Pre-warm the OCR model on startup"""
logger.info("Pre-warming OCR model...")
try:
ocr = get_ocr()
# Create a small test image to warm up the model
test_image = np.zeros((100, 100, 3), dtype=np.uint8)
test_image.fill(255) # White image
ocr.ocr(test_image, cls=True)
logger.info("OCR model pre-warmed successfully")
except Exception as e:
logger.error(f"Failed to pre-warm OCR model: {e}")
@app.get("/health", response_model=HealthResponse)
async def health_check():
"""Health check endpoint"""
try:
# Ensure OCR is initialized
get_ocr()
return HealthResponse(
status="healthy",
model="PP-OCRv4",
language=OCR_LANGUAGE,
gpu_enabled=USE_GPU
)
except Exception as e:
logger.error(f"Health check failed: {e}")
raise HTTPException(status_code=503, detail=str(e))
@app.post("/ocr", response_model=OCRResponse)
async def ocr_base64(request: OCRRequest):
"""
Perform OCR on a base64-encoded image
Args:
request: OCRRequest with base64 image and optional language
Returns:
OCRResponse with detected text, confidence scores, and bounding boxes
"""
try:
# Decode image
image = decode_base64_image(request.image)
# Get OCR instance (use request language if provided)
if request.language and request.language != OCR_LANGUAGE:
ocr = get_ocr(request.language)
else:
ocr = get_ocr()
result = ocr.ocr(image, cls=True)
# Process results
results = process_ocr_result(result)
return OCRResponse(success=True, results=results)
except Exception as e:
logger.error(f"OCR processing failed: {e}")
return OCRResponse(success=False, results=[], error=str(e))
@app.post("/ocr/upload", response_model=OCRResponse)
async def ocr_upload(
img: UploadFile = File(...),
language: Optional[str] = Form(None)
):
"""
Perform OCR on an uploaded image file
Args:
img: Uploaded image file
language: Optional language code (default: env OCR_LANGUAGE)
Returns:
OCRResponse with detected text, confidence scores, and bounding boxes
"""
try:
# Read image
contents = await img.read()
image = Image.open(io.BytesIO(contents))
# Convert to RGB if necessary
if image.mode != 'RGB':
image = image.convert('RGB')
image_array = np.array(image)
# Get OCR instance
if language and language != OCR_LANGUAGE:
ocr = get_ocr(language)
else:
ocr = get_ocr()
result = ocr.ocr(image_array, cls=True)
# Process results
results = process_ocr_result(result)
return OCRResponse(success=True, results=results)
except Exception as e:
logger.error(f"OCR processing failed: {e}")
return OCRResponse(success=False, results=[], error=str(e))
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=5000)

View File

@@ -0,0 +1,371 @@
#!/usr/bin/env python3
"""
PaddleOCR-VL FastAPI Server (CPU variant)
Provides OpenAI-compatible REST API for document parsing using PaddleOCR-VL
"""
import os
import io
import base64
import logging
import time
from typing import Optional, List, Any, Dict, Union
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import torch
from PIL import Image
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Environment configuration
SERVER_HOST = os.environ.get('SERVER_HOST', '0.0.0.0')
SERVER_PORT = int(os.environ.get('SERVER_PORT', '8000'))
MODEL_NAME = os.environ.get('MODEL_NAME', 'PaddlePaddle/PaddleOCR-VL')
# Device configuration
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {DEVICE}")
# Task prompts for PaddleOCR-VL
TASK_PROMPTS = {
"ocr": "OCR:",
"table": "Table Recognition:",
"formula": "Formula Recognition:",
"chart": "Chart Recognition:",
}
# Initialize FastAPI app
app = FastAPI(
title="PaddleOCR-VL Server",
description="OpenAI-compatible REST API for document parsing using PaddleOCR-VL",
version="1.0.0"
)
# Global model instances
model = None
processor = None
# Request/Response models (OpenAI-compatible)
class ImageUrl(BaseModel):
url: str
class ContentItem(BaseModel):
type: str
text: Optional[str] = None
image_url: Optional[ImageUrl] = None
class Message(BaseModel):
role: str
content: Union[str, List[ContentItem]]
class ChatCompletionRequest(BaseModel):
model: str = "paddleocr-vl"
messages: List[Message]
temperature: Optional[float] = 0.0
max_tokens: Optional[int] = 4096
class Choice(BaseModel):
index: int
message: Message
finish_reason: str
class Usage(BaseModel):
prompt_tokens: int
completion_tokens: int
total_tokens: int
class ChatCompletionResponse(BaseModel):
id: str
object: str = "chat.completion"
created: int
model: str
choices: List[Choice]
usage: Usage
class HealthResponse(BaseModel):
status: str
model: str
device: str
def load_model():
"""Load the PaddleOCR-VL model and processor"""
global model, processor
if model is not None:
return
logger.info(f"Loading PaddleOCR-VL model: {MODEL_NAME}")
from transformers import AutoModelForCausalLM, AutoProcessor
# Load processor
processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Load model with appropriate settings for CPU/GPU
if DEVICE == "cuda":
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
trust_remote_code=True,
torch_dtype=torch.bfloat16,
).to(DEVICE).eval()
else:
# CPU mode - use float32 for compatibility
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
trust_remote_code=True,
torch_dtype=torch.float32,
low_cpu_mem_usage=True,
).eval()
logger.info("PaddleOCR-VL model loaded successfully")
def decode_image(image_source: str) -> Image.Image:
"""Decode image from URL or base64"""
if image_source.startswith("data:"):
# Base64 encoded image
header, data = image_source.split(",", 1)
image_data = base64.b64decode(data)
return Image.open(io.BytesIO(image_data)).convert("RGB")
elif image_source.startswith("http://") or image_source.startswith("https://"):
# URL - fetch image
import httpx
response = httpx.get(image_source, timeout=30.0)
response.raise_for_status()
return Image.open(io.BytesIO(response.content)).convert("RGB")
else:
# Assume it's a file path or raw base64
try:
image_data = base64.b64decode(image_source)
return Image.open(io.BytesIO(image_data)).convert("RGB")
except:
# Try as file path
return Image.open(image_source).convert("RGB")
def extract_image_and_text(content: Union[str, List[ContentItem]]) -> tuple:
"""Extract image and text prompt from message content"""
if isinstance(content, str):
return None, content
image = None
text = ""
for item in content:
if item.type == "image_url" and item.image_url:
image = decode_image(item.image_url.url)
elif item.type == "text" and item.text:
text = item.text
return image, text
def generate_response(image: Image.Image, prompt: str, max_tokens: int = 4096) -> str:
"""Generate response using PaddleOCR-VL"""
load_model()
messages = [
{
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": prompt},
]
}
]
inputs = processor.apply_chat_template(
messages,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
return_tensors="pt"
)
if DEVICE == "cuda":
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
with torch.inference_mode():
outputs = model.generate(
**inputs,
max_new_tokens=max_tokens,
do_sample=False,
use_cache=True
)
response = processor.batch_decode(outputs, skip_special_tokens=True)[0]
# Extract the assistant's response (after the prompt)
if "assistant" in response.lower():
parts = response.split("assistant")
if len(parts) > 1:
response = parts[-1].strip()
return response
@app.on_event("startup")
async def startup_event():
"""Pre-load the model on startup"""
logger.info("Pre-loading PaddleOCR-VL model...")
try:
load_model()
logger.info("Model pre-loaded successfully")
except Exception as e:
logger.error(f"Failed to pre-load model: {e}")
# Don't fail startup - model will be loaded on first request
@app.get("/health", response_model=HealthResponse)
async def health_check():
"""Health check endpoint"""
return HealthResponse(
status="healthy" if model is not None else "loading",
model=MODEL_NAME,
device=DEVICE
)
@app.get("/v1/models")
async def list_models():
"""List available models (OpenAI-compatible)"""
return {
"object": "list",
"data": [
{
"id": "paddleocr-vl",
"object": "model",
"created": int(time.time()),
"owned_by": "paddlepaddle"
}
]
}
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completions(request: ChatCompletionRequest):
"""
OpenAI-compatible chat completions endpoint for PaddleOCR-VL
Supports tasks:
- "OCR:" - Text recognition
- "Table Recognition:" - Table extraction
- "Formula Recognition:" - Formula extraction
- "Chart Recognition:" - Chart extraction
"""
try:
# Get the last user message
user_message = None
for msg in reversed(request.messages):
if msg.role == "user":
user_message = msg
break
if not user_message:
raise HTTPException(status_code=400, detail="No user message found")
# Extract image and prompt
image, prompt = extract_image_and_text(user_message.content)
if image is None:
raise HTTPException(status_code=400, detail="No image provided in message")
# Default to OCR if no specific prompt
if not prompt or prompt.strip() == "":
prompt = "OCR:"
logger.info(f"Processing request with prompt: {prompt[:50]}...")
# Generate response
start_time = time.time()
response_text = generate_response(image, prompt, request.max_tokens or 4096)
elapsed = time.time() - start_time
logger.info(f"Generated response in {elapsed:.2f}s ({len(response_text)} chars)")
# Build OpenAI-compatible response
return ChatCompletionResponse(
id=f"chatcmpl-{int(time.time()*1000)}",
created=int(time.time()),
model=request.model,
choices=[
Choice(
index=0,
message=Message(role="assistant", content=response_text),
finish_reason="stop"
)
],
usage=Usage(
prompt_tokens=100, # Approximate
completion_tokens=len(response_text) // 4,
total_tokens=100 + len(response_text) // 4
)
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error processing request: {e}")
raise HTTPException(status_code=500, detail=str(e))
# Legacy endpoint for compatibility with old PaddleOCR API
class LegacyOCRRequest(BaseModel):
image: str
task: Optional[str] = "ocr"
class LegacyOCRResponse(BaseModel):
success: bool
result: str
task: str
error: Optional[str] = None
@app.post("/ocr", response_model=LegacyOCRResponse)
async def legacy_ocr(request: LegacyOCRRequest):
"""
Legacy OCR endpoint for backwards compatibility
Tasks: ocr, table, formula, chart
"""
try:
image = decode_image(request.image)
prompt = TASK_PROMPTS.get(request.task, TASK_PROMPTS["ocr"])
result = generate_response(image, prompt)
return LegacyOCRResponse(
success=True,
result=result,
task=request.task
)
except Exception as e:
logger.error(f"Legacy OCR error: {e}")
return LegacyOCRResponse(
success=False,
result="",
task=request.task,
error=str(e)
)
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT)

View File

@@ -1,6 +1,6 @@
{ {
"name": "@host.today/ht-docker-ai", "name": "@host.today/ht-docker-ai",
"version": "1.3.0", "version": "1.5.0",
"type": "module", "type": "module",
"private": false, "private": false,
"description": "Docker images for AI vision-language models including MiniCPM-V 4.5", "description": "Docker images for AI vision-language models including MiniCPM-V 4.5",
@@ -13,8 +13,8 @@
"test": "tstest test/ --verbose" "test": "tstest test/ --verbose"
}, },
"devDependencies": { "devDependencies": {
"@git.zone/tstest": "^1.0.90", "@git.zone/tsrun": "^1.3.3",
"@git.zone/tsrun": "^1.3.3" "@git.zone/tstest": "^1.0.90"
}, },
"repository": { "repository": {
"type": "git", "type": "git",
@@ -28,5 +28,8 @@
"minicpm", "minicpm",
"ollama", "ollama",
"multimodal" "multimodal"
] ],
"dependencies": {
"@types/node": "^25.0.9"
}
} }

4
pnpm-lock.yaml generated
View File

@@ -7,6 +7,10 @@ settings:
importers: importers:
.: .:
dependencies:
'@types/node':
specifier: ^25.0.9
version: 25.0.9
devDependencies: devDependencies:
'@git.zone/tsrun': '@git.zone/tsrun':
specifier: ^1.3.3 specifier: ^1.3.3

View File

@@ -77,56 +77,73 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CPU variant has longer `start-period` (120s) due to slower startup. CPU variant has longer `start-period` (120s) due to slower startup.
## PaddleOCR ## PaddleOCR-VL (Recommended)
### Overview ### Overview
PaddleOCR is a standalone OCR service using PaddlePaddle's PP-OCRv4 model. It provides: PaddleOCR-VL is a 0.9B parameter Vision-Language Model specifically optimized for document parsing. It replaces the older PP-Structure approach with native VLM understanding.
- Text detection and recognition **Key advantages over PP-Structure:**
- Multi-language support - Native table understanding (no HTML parsing needed)
- FastAPI REST API - 109 language support
- GPU and CPU variants - Better handling of complex multi-row tables
- Structured Markdown/JSON output
### Docker Images ### Docker Images
| Tag | Description | | Tag | Description |
|-----|-------------| |-----|-------------|
| `paddleocr` | GPU variant (default) | | `paddleocr-vl` | GPU variant using vLLM (recommended) |
| `paddleocr-gpu` | GPU variant (alias) | | `paddleocr-vl-cpu` | CPU variant using transformers |
| `paddleocr-cpu` | CPU-only variant |
### API Endpoints ### API Endpoints (OpenAI-compatible)
| Endpoint | Method | Description | | Endpoint | Method | Description |
|----------|--------|-------------| |----------|--------|-------------|
| `/health` | GET | Health check with model info | | `/health` | GET | Health check with model info |
| `/ocr` | POST | OCR with base64 image (JSON body) | | `/v1/models` | GET | List available models |
| `/ocr/upload` | POST | OCR with file upload (multipart form) | | `/v1/chat/completions` | POST | OpenAI-compatible chat completions |
| `/ocr` | POST | Legacy OCR endpoint |
### Request/Response Format ### Request/Response Format
**POST /ocr (JSON)** **POST /v1/chat/completions (OpenAI-compatible)**
```json ```json
{ {
"image": "<base64-encoded-image>", "model": "paddleocr-vl",
"language": "en" // optional "messages": [
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
{"type": "text", "text": "Table Recognition:"}
]
}
],
"temperature": 0.0,
"max_tokens": 8192
} }
``` ```
**POST /ocr/upload (multipart)** **Task Prompts:**
- `img`: image file - `"OCR:"` - Text recognition
- `language`: optional language code - `"Table Recognition:"` - Table extraction (returns markdown)
- `"Formula Recognition:"` - Formula extraction
- `"Chart Recognition:"` - Chart extraction
**Response** **Response**
```json ```json
{ {
"success": true, "id": "chatcmpl-...",
"results": [ "object": "chat.completion",
"choices": [
{ {
"text": "Invoice #12345", "index": 0,
"confidence": 0.98, "message": {
"box": [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] "role": "assistant",
"content": "| Date | Description | Amount |\n|---|---|---|\n| 2021-06-01 | GITLAB INC | -119.96 |"
},
"finish_reason": "stop"
} }
] ]
} }
@@ -136,19 +153,16 @@ PaddleOCR is a standalone OCR service using PaddlePaddle's PP-OCRv4 model. It pr
| Variable | Default | Description | | Variable | Default | Description |
|----------|---------|-------------| |----------|---------|-------------|
| `OCR_LANGUAGE` | `en` | Default language for OCR | | `MODEL_NAME` | `PaddlePaddle/PaddleOCR-VL` | Model to load |
| `SERVER_PORT` | `5000` | Server port | | `HOST` | `0.0.0.0` | Server host |
| `SERVER_HOST` | `0.0.0.0` | Server host | | `PORT` | `8000` | Server port |
| `CUDA_VISIBLE_DEVICES` | (auto) | Set to `-1` for CPU-only | | `MAX_BATCHED_TOKENS` | `16384` | vLLM max batch tokens |
| `GPU_MEMORY_UTILIZATION` | `0.9` | GPU memory usage (0-1) |
### Performance ### Performance
- **GPU**: ~1-3 seconds per page - **GPU (vLLM)**: ~2-5 seconds per page
- **CPU**: ~10-30 seconds per page - **CPU**: ~30-60 seconds per page
### Supported Languages
Common language codes: `en` (English), `ch` (Chinese), `de` (German), `fr` (French), `es` (Spanish), `ja` (Japanese), `ko` (Korean)
--- ---
@@ -193,6 +207,43 @@ npmci docker build
npmci docker push code.foss.global npmci docker push code.foss.global
``` ```
## Multi-Pass Extraction Strategy
The bank statement extraction uses a dual-VLM consensus approach:
### Architecture: Dual-VLM Consensus
| VLM | Model | Purpose |
|-----|-------|---------|
| **MiniCPM-V 4.5** | 8B params | Primary visual extraction |
| **PaddleOCR-VL** | 0.9B params | Table-specialized extraction |
### Extraction Strategy
1. **Pass 1**: MiniCPM-V visual extraction (images → JSON)
2. **Pass 2**: PaddleOCR-VL table recognition (images → markdown → JSON)
3. **Consensus**: If Pass 1 == Pass 2 → Done (fast path)
4. **Pass 3+**: MiniCPM-V visual if no consensus
### Why Dual-VLM Works
- **Different architectures**: Two independent models cross-check each other
- **Specialized strengths**: PaddleOCR-VL optimized for tables, MiniCPM-V for general vision
- **No structure loss**: Both VLMs see the original images directly
- **Fast consensus**: Most documents complete in 2 passes when VLMs agree
### Comparison vs Old PP-Structure Approach
| Approach | Bank Statement Result | Issue |
|----------|----------------------|-------|
| MiniCPM-V Visual | 28 transactions ✓ | - |
| PP-Structure HTML + Visual | 13 transactions ✗ | HTML merged rows incorrectly |
| PaddleOCR-VL Table | 28 transactions ✓ | Native table understanding |
**Key insight**: PP-Structure's HTML output loses structure for complex tables. PaddleOCR-VL's native VLM approach maintains table integrity.
---
## Related Resources ## Related Resources
- [Ollama Documentation](https://ollama.ai/docs) - [Ollama Documentation](https://ollama.ai/docs)

View File

@@ -1,129 +1,250 @@
# Bank Statement Parsing with MiniCPM-V 4.5 # Document Recognition with Hybrid OCR + Vision AI
Recipe for extracting transactions from bank statement PDFs using vision-language AI. Recipe for extracting structured data from invoices and documents using a hybrid approach:
PaddleOCR for text extraction + MiniCPM-V 4.5 for intelligent parsing.
## Model ## Architecture
- **Model**: MiniCPM-V 4.5 (8B parameters) ```
- **Ollama Name**: `openbmb/minicpm-v4.5:q8_0` ┌──────────────┐ ┌──────────────┐ ┌──────────────┐
- **Quantization**: Q8_0 (9.8GB VRAM) │ PDF/Image │ ───> │ PaddleOCR │ ───> │ Raw Text │
- **Runtime**: Ollama on GPU └──────────────┘ └──────────────┘ └──────┬───────┘
┌──────────────┐ │
│ MiniCPM-V │ <───────────┘
│ 4.5 VLM │ <─── Image
└──────┬───────┘
┌──────▼───────┐
│ Structured │
│ JSON │
└──────────────┘
```
## Why Hybrid?
| Approach | Accuracy | Speed | Best For |
|----------|----------|-------|----------|
| VLM Only | 85-90% | Fast | Simple layouts |
| OCR Only | N/A | Fast | Just text extraction |
| **Hybrid** | **91%+** | Medium | Complex invoices |
The hybrid approach provides OCR text as context to the VLM, improving accuracy on:
- Small text and numbers
- Low contrast documents
- Dense tables
## Services
| Service | Port | Purpose |
|---------|------|---------|
| PaddleOCR | 5000 | Text extraction |
| Ollama (MiniCPM-V) | 11434 | Intelligent parsing |
## Running the Containers
**Start both services:**
```bash
# PaddleOCR (CPU is sufficient for OCR)
docker run -d --name paddleocr -p 5000:5000 \
code.foss.global/host.today/ht-docker-ai:paddleocr-cpu
# MiniCPM-V 4.5 (GPU recommended)
docker run -d --name minicpm --gpus all -p 11434:11434 \
-v ollama-data:/root/.ollama \
code.foss.global/host.today/ht-docker-ai:minicpm45v
```
## Image Conversion ## Image Conversion
Convert PDF to PNG at 300 DPI for optimal OCR accuracy. Convert PDF to PNG at 200 DPI:
```bash ```bash
convert -density 300 -quality 100 input.pdf \ convert -density 200 -quality 90 input.pdf \
-background white -alpha remove \ -background white -alpha remove \
output-%d.png page-%d.png
``` ```
**Parameters:** ## Step 1: Extract OCR Text
- `-density 300`: 300 DPI resolution (critical for accuracy)
- `-quality 100`: Maximum quality
- `-background white -alpha remove`: Remove transparency
- `output-%d.png`: Outputs page-0.png, page-1.png, etc.
**Dependencies:** ```typescript
```bash async function extractOcrText(imageBase64: string): Promise<string> {
apt-get install imagemagick const response = await fetch('http://localhost:5000/ocr', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ image: imageBase64 }),
});
const data = await response.json();
if (data.success && data.results) {
return data.results.map((r: { text: string }) => r.text).join('\n');
}
return '';
}
``` ```
## Prompt ## Step 2: Build Enhanced Prompt
``` ```typescript
You are a bank statement parser. Extract EVERY transaction from the table. function buildPrompt(ocrText: string): string {
const base = `You are an invoice parser. Extract the following fields:
Read the Amount column carefully: 1. invoice_number: The invoice/receipt number
- "- 21,47 €" means DEBIT, output as: -21.47 2. invoice_date: Date in YYYY-MM-DD format
- "+ 1.000,00 €" means CREDIT, output as: 1000.00 3. vendor_name: Company that issued the invoice
- European format: comma = decimal point 4. currency: EUR, USD, etc.
5. net_amount: Amount before tax (if shown)
6. vat_amount: Tax/VAT amount (0 if reverse charge)
7. total_amount: Final amount due
For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47} Return ONLY valid JSON:
{"invoice_number":"XXX","invoice_date":"YYYY-MM-DD","vendor_name":"Company","currency":"EUR","net_amount":100.00,"vat_amount":19.00,"total_amount":119.00}`;
Do not skip any rows. Return complete JSON array: if (ocrText) {
return `${base}
OCR text extracted from the invoice:
---
${ocrText}
---
Cross-reference the image with the OCR text above for accuracy.`;
}
return base;
}
``` ```
## API Call ## Step 3: Call Vision-Language Model
```python ```typescript
import base64 async function extractInvoice(images: string[], ocrText: string): Promise<Invoice> {
import requests const payload = {
model: 'openbmb/minicpm-v4.5:q8_0',
prompt: buildPrompt(ocrText),
images, // Base64 encoded
stream: false,
options: {
num_predict: 2048,
temperature: 0.1,
},
};
# Load images const response = await fetch('http://localhost:11434/api/generate', {
with open('page-0.png', 'rb') as f: method: 'POST',
page0 = base64.b64encode(f.read()).decode('utf-8') headers: { 'Content-Type': 'application/json' },
with open('page-1.png', 'rb') as f: body: JSON.stringify(payload),
page1 = base64.b64encode(f.read()).decode('utf-8') });
payload = { const result = await response.json();
"model": "openbmb/minicpm-v4.5:q8_0", return JSON.parse(result.response);
"prompt": prompt, }
"images": [page0, page1], # Multiple pages supported ```
"stream": False,
"options": { ## Consensus Voting
"num_predict": 16384,
"temperature": 0.1 For production reliability, run multiple extraction passes and require consensus:
```typescript
async function extractWithConsensus(images: string[], maxPasses: number = 5): Promise<Invoice> {
const results: Map<string, { invoice: Invoice; count: number }> = new Map();
// Optimization: Run Pass 1 (no OCR) parallel with OCR + Pass 2
const [pass1Result, ocrText] = await Promise.all([
extractInvoice(images, ''),
extractOcrText(images[0]),
]);
// Add Pass 1 result
addResult(results, pass1Result);
// Pass 2 with OCR context
const pass2Result = await extractInvoice(images, ocrText);
addResult(results, pass2Result);
// Check for consensus (2 matching results)
for (const [hash, data] of results) {
if (data.count >= 2) {
return data.invoice; // Consensus reached!
} }
}
// Continue until consensus or max passes
for (let pass = 3; pass <= maxPasses; pass++) {
const result = await extractInvoice(images, ocrText);
addResult(results, result);
// Check consensus...
}
// Return most common result
return getMostCommon(results);
} }
response = requests.post( function hashInvoice(inv: Invoice): string {
'http://localhost:11434/api/generate', return `${inv.invoice_number}|${inv.invoice_date}|${inv.total_amount.toFixed(2)}`;
json=payload, }
timeout=600
)
result = response.json()['response']
``` ```
## Output Format ## Output Format
```json ```json
[ {
{"date":"2022-04-01","counterparty":"DIGITALOCEAN.COM","amount":-21.47}, "invoice_number": "INV-2024-001234",
{"date":"2022-04-01","counterparty":"DIGITALOCEAN.COM","amount":-58.06}, "invoice_date": "2024-08-15",
{"date":"2022-04-12","counterparty":"LOSSLESS GMBH","amount":1000.00} "vendor_name": "Hetzner Online GmbH",
] "currency": "EUR",
"net_amount": 167.52,
"vat_amount": 31.83,
"total_amount": 199.35
}
``` ```
## Running the Container
**GPU (recommended):**
```bash
docker run -d --gpus all -p 11434:11434 \
-v ollama-data:/root/.ollama \
-e MODEL_NAME="openbmb/minicpm-v4.5:q8_0" \
ht-docker-ai:minicpm45v
```
**CPU (slower):**
```bash
docker run -d -p 11434:11434 \
-v ollama-data:/root/.ollama \
-e MODEL_NAME="openbmb/minicpm-v4.5:q4_0" \
ht-docker-ai:minicpm45v-cpu
```
## Hardware Requirements
| Quantization | VRAM/RAM | Speed |
|--------------|----------|-------|
| Q8_0 (GPU) | 10GB | Fast |
| Q4_0 (CPU) | 8GB | Slow |
## Test Results ## Test Results
| Statement | Pages | Transactions | Accuracy | Tested on 46 real invoices from various vendors:
|-----------|-------|--------------|----------|
| bunq-2022-04 | 2 | 26 | 100% | | Metric | Value |
| bunq-2021-06 | 3 | 28 | 100% | |--------|-------|
| **Accuracy** | 91.3% (42/46) |
| **Avg Time** | 42.7s per invoice |
| **Consensus Rate** | 85% in 2 passes |
### Per-Vendor Results
| Vendor | Invoices | Accuracy |
|--------|----------|----------|
| Hetzner | 3 | 100% |
| DigitalOcean | 4 | 100% |
| Adobe | 3 | 100% |
| Cloudflare | 1 | 100% |
| Wasabi | 4 | 100% |
| Figma | 3 | 100% |
| Google Cloud | 1 | 100% |
| MongoDB | 3 | 0% (date parsing) |
## Hardware Requirements
| Component | Minimum | Recommended |
|-----------|---------|-------------|
| PaddleOCR (CPU) | 4GB RAM | 8GB RAM |
| MiniCPM-V (GPU) | 10GB VRAM | 12GB VRAM |
| MiniCPM-V (CPU) | 16GB RAM | 32GB RAM |
## Tips ## Tips
1. **DPI matters**: 150 DPI causes missed rows; 300 DPI is optimal 1. **Use hybrid approach**: OCR text dramatically improves number/date accuracy
2. **PNG over JPEG**: PNG preserves text clarity better 2. **Consensus voting**: Run 2-5 passes to catch hallucinations
3. **Remove alpha**: Some models struggle with transparency 3. **200 DPI is optimal**: Higher doesn't help, lower loses detail
4. **Multi-page**: Pass all pages in single request for context 4. **PNG over JPEG**: Preserves text clarity
5. **Temperature 0.1**: Low temperature for consistent output 5. **Temperature 0.1**: Low temperature for consistent output
6. **European format**: Explicitly explain comma=decimal in prompt 6. **Multi-page support**: Pass all pages in single request for context
7. **Normalize for comparison**: Ignore case/whitespace when comparing invoice numbers
## Common Issues
| Issue | Cause | Solution |
|-------|-------|----------|
| Wrong date | Multiple dates on invoice | Be specific in prompt about which date |
| Wrong currency | Symbol vs code mismatch | OCR helps disambiguate |
| Missing digits | Low resolution | Increase density to 300 DPI |
| Hallucinated data | VLM uncertainty | Use consensus voting |

View File

@@ -0,0 +1,535 @@
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execSync } from 'child_process';
import * as os from 'os';
// Service URLs
const OLLAMA_URL = 'http://localhost:11434';
const PADDLEOCR_VL_URL = 'http://localhost:8000';
// Models
const MINICPM_MODEL = 'openbmb/minicpm-v4.5:q8_0';
const PADDLEOCR_VL_MODEL = 'paddleocr-vl';
// Prompt for MiniCPM-V visual extraction
const MINICPM_EXTRACT_PROMPT = `/nothink
You are a bank statement parser. Extract EVERY transaction from the table.
Read the Amount column carefully:
- "- 21,47 €" means DEBIT, output as: -21.47
- "+ 1.000,00 €" means CREDIT, output as: 1000.00
- European format: comma = decimal point
For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
Do not skip any rows. Return ONLY the JSON array, no explanation.`;
// Prompt for PaddleOCR-VL table extraction
const PADDLEOCR_VL_TABLE_PROMPT = `Table Recognition:`;
// Post-processing prompt to convert PaddleOCR-VL output to JSON
const PADDLEOCR_VL_CONVERT_PROMPT = `/nothink
Convert the following bank statement table data to JSON.
Read the Amount values carefully:
- "- 21,47 €" means DEBIT, output as: -21.47
- "+ 1.000,00 €" means CREDIT, output as: 1000.00
- European format: comma = decimal point
For each transaction output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
Return ONLY the JSON array, no explanation.
Table data:
---
{TABLE_DATA}
---`;
interface ITransaction {
date: string;
counterparty: string;
amount: number;
}
/**
* Convert PDF to PNG images using ImageMagick
*/
function convertPdfToImages(pdfPath: string): string[] {
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
const outputPattern = path.join(tempDir, 'page-%d.png');
try {
execSync(
`convert -density 300 -quality 100 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
{ stdio: 'pipe' }
);
const files = fs.readdirSync(tempDir).filter((f: string) => f.endsWith('.png')).sort();
const images: string[] = [];
for (const file of files) {
const imagePath = path.join(tempDir, file);
const imageData = fs.readFileSync(imagePath);
images.push(imageData.toString('base64'));
}
return images;
} finally {
fs.rmSync(tempDir, { recursive: true, force: true });
}
}
/**
* Extract using MiniCPM-V via Ollama
*/
async function extractWithMiniCPM(images: string[], passLabel: string): Promise<ITransaction[]> {
const payload = {
model: MINICPM_MODEL,
prompt: MINICPM_EXTRACT_PROMPT,
images,
stream: true,
options: {
num_predict: 16384,
temperature: 0.1,
},
};
const response = await fetch(`${OLLAMA_URL}/api/generate`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(payload),
});
if (!response.ok) {
throw new Error(`Ollama API error: ${response.status}`);
}
const reader = response.body?.getReader();
if (!reader) {
throw new Error('No response body');
}
const decoder = new TextDecoder();
let fullText = '';
let lineBuffer = '';
console.log(`[${passLabel}] Extracting with MiniCPM-V...`);
while (true) {
const { done, value } = await reader.read();
if (done) break;
const chunk = decoder.decode(value, { stream: true });
const lines = chunk.split('\n').filter((l) => l.trim());
for (const line of lines) {
try {
const json = JSON.parse(line);
if (json.response) {
fullText += json.response;
lineBuffer += json.response;
if (lineBuffer.includes('\n')) {
const parts = lineBuffer.split('\n');
for (let i = 0; i < parts.length - 1; i++) {
console.log(parts[i]);
}
lineBuffer = parts[parts.length - 1];
}
}
} catch {
// Skip invalid JSON lines
}
}
}
if (lineBuffer) {
console.log(lineBuffer);
}
console.log('');
const startIdx = fullText.indexOf('[');
const endIdx = fullText.lastIndexOf(']') + 1;
if (startIdx < 0 || endIdx <= startIdx) {
throw new Error('No JSON array found in response');
}
return JSON.parse(fullText.substring(startIdx, endIdx));
}
/**
* Extract table using PaddleOCR-VL via OpenAI-compatible API
*/
async function extractTableWithPaddleOCRVL(imageBase64: string): Promise<string> {
const payload = {
model: PADDLEOCR_VL_MODEL,
messages: [
{
role: 'user',
content: [
{
type: 'image_url',
image_url: { url: `data:image/png;base64,${imageBase64}` },
},
{
type: 'text',
text: PADDLEOCR_VL_TABLE_PROMPT,
},
],
},
],
temperature: 0.0,
max_tokens: 8192,
};
const response = await fetch(`${PADDLEOCR_VL_URL}/v1/chat/completions`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(payload),
});
if (!response.ok) {
const text = await response.text();
throw new Error(`PaddleOCR-VL API error: ${response.status} - ${text}`);
}
const data = await response.json();
return data.choices?.[0]?.message?.content || '';
}
/**
* Convert PaddleOCR-VL table output to transactions using MiniCPM-V
*/
async function convertTableToTransactions(
tableData: string,
passLabel: string
): Promise<ITransaction[]> {
const prompt = PADDLEOCR_VL_CONVERT_PROMPT.replace('{TABLE_DATA}', tableData);
const payload = {
model: MINICPM_MODEL,
prompt,
stream: true,
options: {
num_predict: 16384,
temperature: 0.1,
},
};
const response = await fetch(`${OLLAMA_URL}/api/generate`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(payload),
});
if (!response.ok) {
throw new Error(`Ollama API error: ${response.status}`);
}
const reader = response.body?.getReader();
if (!reader) {
throw new Error('No response body');
}
const decoder = new TextDecoder();
let fullText = '';
console.log(`[${passLabel}] Converting table data to JSON...`);
while (true) {
const { done, value } = await reader.read();
if (done) break;
const chunk = decoder.decode(value, { stream: true });
const lines = chunk.split('\n').filter((l) => l.trim());
for (const line of lines) {
try {
const json = JSON.parse(line);
if (json.response) {
fullText += json.response;
}
} catch {
// Skip invalid JSON lines
}
}
}
const startIdx = fullText.indexOf('[');
const endIdx = fullText.lastIndexOf(']') + 1;
if (startIdx < 0 || endIdx <= startIdx) {
throw new Error('No JSON array found in response');
}
return JSON.parse(fullText.substring(startIdx, endIdx));
}
/**
* Extract using PaddleOCR-VL (table recognition) + conversion
*/
async function extractWithPaddleOCRVL(
images: string[],
passLabel: string
): Promise<ITransaction[]> {
console.log(`[${passLabel}] Extracting tables with PaddleOCR-VL...`);
// Extract table data from each page
const tableDataParts: string[] = [];
for (let i = 0; i < images.length; i++) {
console.log(`[${passLabel}] Processing page ${i + 1}/${images.length}...`);
const tableData = await extractTableWithPaddleOCRVL(images[i]);
if (tableData.trim()) {
tableDataParts.push(`--- Page ${i + 1} ---\n${tableData}`);
}
}
const combinedTableData = tableDataParts.join('\n\n');
console.log(`[${passLabel}] Got ${combinedTableData.length} chars of table data`);
// Convert to transactions
return convertTableToTransactions(combinedTableData, passLabel);
}
/**
* Create a hash of transactions for comparison
*/
function hashTransactions(transactions: ITransaction[]): string {
return transactions
.map((t) => `${t.date}|${t.amount.toFixed(2)}`)
.sort()
.join(';');
}
/**
* Check if PaddleOCR-VL service is available
*/
async function isPaddleOCRVLAvailable(): Promise<boolean> {
try {
const response = await fetch(`${PADDLEOCR_VL_URL}/health`, {
method: 'GET',
signal: AbortSignal.timeout(5000),
});
return response.ok;
} catch {
return false;
}
}
/**
* Extract with dual-VLM consensus
* Strategy:
* Pass 1 = MiniCPM-V visual extraction
* Pass 2 = PaddleOCR-VL table recognition (if available)
* Pass 3+ = MiniCPM-V visual (fallback)
*/
async function extractWithConsensus(
images: string[],
maxPasses: number = 5
): Promise<ITransaction[]> {
const results: Array<{ transactions: ITransaction[]; hash: string }> = [];
const hashCounts: Map<string, number> = new Map();
const addResult = (transactions: ITransaction[], passLabel: string): number => {
const hash = hashTransactions(transactions);
results.push({ transactions, hash });
hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1);
console.log(
`[${passLabel}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)`
);
return hashCounts.get(hash)!;
};
// Check if PaddleOCR-VL is available
const paddleOCRVLAvailable = await isPaddleOCRVLAvailable();
if (paddleOCRVLAvailable) {
console.log('[Setup] PaddleOCR-VL service available - using dual-VLM consensus');
} else {
console.log('[Setup] PaddleOCR-VL not available - using MiniCPM-V only');
}
// Pass 1: MiniCPM-V visual extraction
try {
const pass1Result = await extractWithMiniCPM(images, 'Pass 1 MiniCPM-V');
addResult(pass1Result, 'Pass 1 MiniCPM-V');
} catch (err) {
console.log(`[Pass 1] Error: ${err}`);
}
// Pass 2: PaddleOCR-VL table recognition (if available)
if (paddleOCRVLAvailable) {
try {
const pass2Result = await extractWithPaddleOCRVL(images, 'Pass 2 PaddleOCR-VL');
const count = addResult(pass2Result, 'Pass 2 PaddleOCR-VL');
if (count >= 2) {
console.log('[Consensus] MiniCPM-V and PaddleOCR-VL extractions match!');
return pass2Result;
}
} catch (err) {
console.log(`[Pass 2 PaddleOCR-VL] Error: ${err}`);
}
}
// Pass 3+: Continue with MiniCPM-V visual passes
const startPass = paddleOCRVLAvailable ? 3 : 2;
for (let pass = startPass; pass <= maxPasses; pass++) {
try {
const transactions = await extractWithMiniCPM(images, `Pass ${pass} MiniCPM-V`);
const count = addResult(transactions, `Pass ${pass} MiniCPM-V`);
if (count >= 2) {
console.log(`[Consensus] Reached after ${pass} passes`);
return transactions;
}
console.log(`[Pass ${pass}] No consensus yet, trying again...`);
} catch (err) {
console.log(`[Pass ${pass}] Error: ${err}`);
}
}
// No consensus reached - return the most common result
let bestHash = '';
let bestCount = 0;
for (const [hash, count] of hashCounts) {
if (count > bestCount) {
bestCount = count;
bestHash = hash;
}
}
if (!bestHash) {
throw new Error('No valid results obtained');
}
const best = results.find((r) => r.hash === bestHash)!;
console.log(`[No consensus] Using most common result (${bestCount}/${maxPasses} passes)`);
return best.transactions;
}
/**
* Compare extracted transactions against expected
*/
function compareTransactions(
extracted: ITransaction[],
expected: ITransaction[]
): { matches: number; total: number; errors: string[] } {
const errors: string[] = [];
let matches = 0;
for (let i = 0; i < expected.length; i++) {
const exp = expected[i];
const ext = extracted[i];
if (!ext) {
errors.push(`Missing transaction ${i}: ${exp.date} ${exp.counterparty}`);
continue;
}
const dateMatch = ext.date === exp.date;
const amountMatch = Math.abs(ext.amount - exp.amount) < 0.01;
if (dateMatch && amountMatch) {
matches++;
} else {
errors.push(
`Mismatch at ${i}: expected ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`
);
}
}
if (extracted.length > expected.length) {
errors.push(`Extra transactions: ${extracted.length - expected.length}`);
}
return { matches, total: expected.length, errors };
}
/**
* Find all test cases (PDF + JSON pairs) in .nogit/
*/
function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
const testDir = path.join(process.cwd(), '.nogit');
if (!fs.existsSync(testDir)) {
return [];
}
const files = fs.readdirSync(testDir);
const pdfFiles = files.filter((f: string) => f.endsWith('.pdf'));
const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];
for (const pdf of pdfFiles) {
const baseName = pdf.replace('.pdf', '');
const jsonFile = `${baseName}.json`;
if (files.includes(jsonFile)) {
testCases.push({
name: baseName,
pdfPath: path.join(testDir, pdf),
jsonPath: path.join(testDir, jsonFile),
});
}
}
return testCases;
}
// Tests
tap.test('should connect to Ollama API', async () => {
const response = await fetch(`${OLLAMA_URL}/api/tags`);
expect(response.ok).toBeTrue();
const data = await response.json();
expect(data.models).toBeArray();
});
tap.test('should have MiniCPM-V 4.5 model loaded', async () => {
const response = await fetch(`${OLLAMA_URL}/api/tags`);
const data = await response.json();
const modelNames = data.models.map((m: { name: string }) => m.name);
expect(modelNames.some((name: string) => name.includes('minicpm-v4.5'))).toBeTrue();
});
tap.test('should check PaddleOCR-VL availability', async () => {
const available = await isPaddleOCRVLAvailable();
console.log(`PaddleOCR-VL available: ${available}`);
// This test passes regardless - PaddleOCR-VL is optional
expect(true).toBeTrue();
});
// Dynamic test for each PDF/JSON pair
const testCases = findTestCases();
for (const testCase of testCases) {
tap.test(`should extract transactions from ${testCase.name}`, async () => {
// Load expected transactions
const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
console.log(`\n=== ${testCase.name} ===`);
console.log(`Expected: ${expected.length} transactions`);
// Convert PDF to images
console.log('Converting PDF to images...');
const images = convertPdfToImages(testCase.pdfPath);
console.log(`Converted: ${images.length} pages\n`);
// Extract with dual-VLM consensus
const extracted = await extractWithConsensus(images);
console.log(`\nFinal: ${extracted.length} transactions`);
// Compare results
const result = compareTransactions(extracted, expected);
console.log(`Accuracy: ${result.matches}/${result.total}`);
if (result.errors.length > 0) {
console.log('Errors:');
result.errors.forEach((e) => console.log(` - ${e}`));
}
// Assert high accuracy
const accuracy = result.matches / result.total;
expect(accuracy).toBeGreaterThan(0.95);
expect(extracted.length).toEqual(expected.length);
});
}
export default tap.start();

View File

@@ -6,7 +6,7 @@ import * as os from 'os';
const OLLAMA_URL = 'http://localhost:11434'; const OLLAMA_URL = 'http://localhost:11434';
const MODEL = 'openbmb/minicpm-v4.5:q8_0'; const MODEL = 'openbmb/minicpm-v4.5:q8_0';
const PADDLEOCR_URL = 'http://localhost:5000'; const PADDLEOCR_VL_URL = 'http://localhost:8000';
interface IInvoice { interface IInvoice {
invoice_number: string; invoice_number: string;
@@ -19,24 +19,33 @@ interface IInvoice {
} }
/** /**
* Extract OCR text from an image using PaddleOCR * Extract OCR text from an image using PaddleOCR-VL (OpenAI-compatible API)
*/ */
async function extractOcrText(imageBase64: string): Promise<string> { async function extractOcrText(imageBase64: string): Promise<string> {
try { try {
const response = await fetch(`${PADDLEOCR_URL}/ocr`, { const response = await fetch(`${PADDLEOCR_VL_URL}/v1/chat/completions`, {
method: 'POST', method: 'POST',
headers: { 'Content-Type': 'application/json' }, headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ image: imageBase64 }), body: JSON.stringify({
model: 'paddleocr-vl',
messages: [{
role: 'user',
content: [
{ type: 'image_url', image_url: { url: `data:image/png;base64,${imageBase64}` } },
{ type: 'text', text: 'OCR:' }
]
}],
temperature: 0.0,
max_tokens: 4096
}),
}); });
if (!response.ok) return ''; if (!response.ok) return '';
const data = await response.json(); const data = await response.json();
if (data.success && data.results) { return data.choices?.[0]?.message?.content || '';
return data.results.map((r: { text: string }) => r.text).join('\n');
}
} catch { } catch {
// PaddleOCR unavailable // PaddleOCR-VL unavailable
} }
return ''; return '';
} }
@@ -45,7 +54,8 @@ async function extractOcrText(imageBase64: string): Promise<string> {
* Build prompt with optional OCR text * Build prompt with optional OCR text
*/ */
function buildPrompt(ocrText: string): string { function buildPrompt(ocrText: string): string {
const base = `You are an invoice parser. Extract the following fields from this invoice: const base = `/nothink
You are an invoice parser. Extract the following fields from this invoice:
1. invoice_number: The invoice/receipt number 1. invoice_number: The invoice/receipt number
2. invoice_date: Date in YYYY-MM-DD format 2. invoice_date: Date in YYYY-MM-DD format
@@ -62,11 +72,17 @@ If a field is not visible, use null for strings or 0 for numbers.
No explanation, just the JSON object.`; No explanation, just the JSON object.`;
if (ocrText) { if (ocrText) {
// Limit OCR text to prevent context overflow
const maxOcrLength = 4000;
const truncatedOcr = ocrText.length > maxOcrLength
? ocrText.substring(0, maxOcrLength) + '\n... (truncated)'
: ocrText;
return `${base} return `${base}
OCR text extracted from the invoice: OCR text extracted from the invoice (use for reference):
--- ---
${ocrText} ${truncatedOcr}
--- ---
Cross-reference the image with the OCR text above for accuracy.`; Cross-reference the image with the OCR text above for accuracy.`;

View File

@@ -1,305 +0,0 @@
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execSync } from 'child_process';
import * as os from 'os';
const OLLAMA_URL = 'http://localhost:11434';
const MODEL = 'openbmb/minicpm-v4.5:q8_0';
const EXTRACT_PROMPT = `You are a bank statement parser. Extract EVERY transaction from the table.
Read the Amount column carefully:
- "- 21,47 €" means DEBIT, output as: -21.47
- "+ 1.000,00 €" means CREDIT, output as: 1000.00
- European format: comma = decimal point
For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
Do not skip any rows. Return ONLY the JSON array, no explanation.`;
interface ITransaction {
date: string;
counterparty: string;
amount: number;
}
/**
* Convert PDF to PNG images using ImageMagick
*/
function convertPdfToImages(pdfPath: string): string[] {
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
const outputPattern = path.join(tempDir, 'page-%d.png');
try {
execSync(
`convert -density 300 -quality 100 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
{ stdio: 'pipe' }
);
const files = fs.readdirSync(tempDir).filter((f) => f.endsWith('.png')).sort();
const images: string[] = [];
for (const file of files) {
const imagePath = path.join(tempDir, file);
const imageData = fs.readFileSync(imagePath);
images.push(imageData.toString('base64'));
}
return images;
} finally {
fs.rmSync(tempDir, { recursive: true, force: true });
}
}
/**
* Single extraction pass
*/
async function extractOnce(images: string[], passNum: number): Promise<ITransaction[]> {
const payload = {
model: MODEL,
prompt: EXTRACT_PROMPT,
images,
stream: true,
options: {
num_predict: 16384,
temperature: 0.1,
},
};
const response = await fetch(`${OLLAMA_URL}/api/generate`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(payload),
});
if (!response.ok) {
throw new Error(`Ollama API error: ${response.status}`);
}
const reader = response.body?.getReader();
if (!reader) {
throw new Error('No response body');
}
const decoder = new TextDecoder();
let fullText = '';
let lineBuffer = '';
console.log(`[Pass ${passNum}] Extracting...`);
while (true) {
const { done, value } = await reader.read();
if (done) break;
const chunk = decoder.decode(value, { stream: true });
const lines = chunk.split('\n').filter((l) => l.trim());
for (const line of lines) {
try {
const json = JSON.parse(line);
if (json.response) {
fullText += json.response;
lineBuffer += json.response;
// Print complete lines
if (lineBuffer.includes('\n')) {
const parts = lineBuffer.split('\n');
for (let i = 0; i < parts.length - 1; i++) {
console.log(parts[i]);
}
lineBuffer = parts[parts.length - 1];
}
}
} catch {
// Skip invalid JSON lines
}
}
}
if (lineBuffer) {
console.log(lineBuffer);
}
console.log('');
const startIdx = fullText.indexOf('[');
const endIdx = fullText.lastIndexOf(']') + 1;
if (startIdx < 0 || endIdx <= startIdx) {
throw new Error('No JSON array found in response');
}
return JSON.parse(fullText.substring(startIdx, endIdx));
}
/**
* Create a hash of transactions for comparison
*/
function hashTransactions(transactions: ITransaction[]): string {
return transactions
.map((t) => `${t.date}|${t.amount.toFixed(2)}`)
.sort()
.join(';');
}
/**
* Extract with majority voting - run until 2 passes match
*/
async function extractWithConsensus(images: string[], maxPasses: number = 5): Promise<ITransaction[]> {
const results: Array<{ transactions: ITransaction[]; hash: string }> = [];
const hashCounts: Map<string, number> = new Map();
for (let pass = 1; pass <= maxPasses; pass++) {
const transactions = await extractOnce(images, pass);
const hash = hashTransactions(transactions);
results.push({ transactions, hash });
hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1);
console.log(`[Pass ${pass}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)`);
// Check if we have consensus (2+ matching)
const count = hashCounts.get(hash)!;
if (count >= 2) {
console.log(`[Consensus] Reached after ${pass} passes (${count} matching results)`);
return transactions;
}
// After 2 passes, if no match yet, continue
if (pass >= 2) {
console.log(`[Pass ${pass}] No consensus yet, trying again...`);
}
}
// No consensus reached - return the most common result
let bestHash = '';
let bestCount = 0;
for (const [hash, count] of hashCounts) {
if (count > bestCount) {
bestCount = count;
bestHash = hash;
}
}
const best = results.find((r) => r.hash === bestHash)!;
console.log(`[No consensus] Using most common result (${bestCount}/${maxPasses} passes)`);
return best.transactions;
}
/**
* Compare extracted transactions against expected
*/
function compareTransactions(
extracted: ITransaction[],
expected: ITransaction[]
): { matches: number; total: number; errors: string[] } {
const errors: string[] = [];
let matches = 0;
for (let i = 0; i < expected.length; i++) {
const exp = expected[i];
const ext = extracted[i];
if (!ext) {
errors.push(`Missing transaction ${i}: ${exp.date} ${exp.counterparty}`);
continue;
}
const dateMatch = ext.date === exp.date;
const amountMatch = Math.abs(ext.amount - exp.amount) < 0.01;
if (dateMatch && amountMatch) {
matches++;
} else {
errors.push(
`Mismatch at ${i}: expected ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`
);
}
}
if (extracted.length > expected.length) {
errors.push(`Extra transactions: ${extracted.length - expected.length}`);
}
return { matches, total: expected.length, errors };
}
/**
* Find all test cases (PDF + JSON pairs) in .nogit/
*/
function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
const testDir = path.join(process.cwd(), '.nogit');
if (!fs.existsSync(testDir)) {
return [];
}
const files = fs.readdirSync(testDir);
const pdfFiles = files.filter((f) => f.endsWith('.pdf'));
const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];
for (const pdf of pdfFiles) {
const baseName = pdf.replace('.pdf', '');
const jsonFile = `${baseName}.json`;
if (files.includes(jsonFile)) {
testCases.push({
name: baseName,
pdfPath: path.join(testDir, pdf),
jsonPath: path.join(testDir, jsonFile),
});
}
}
return testCases;
}
// Tests
tap.test('should connect to Ollama API', async () => {
const response = await fetch(`${OLLAMA_URL}/api/tags`);
expect(response.ok).toBeTrue();
const data = await response.json();
expect(data.models).toBeArray();
});
tap.test('should have MiniCPM-V 4.5 model loaded', async () => {
const response = await fetch(`${OLLAMA_URL}/api/tags`);
const data = await response.json();
const modelNames = data.models.map((m: { name: string }) => m.name);
expect(modelNames.some((name: string) => name.includes('minicpm-v4.5'))).toBeTrue();
});
// Dynamic test for each PDF/JSON pair
const testCases = findTestCases();
for (const testCase of testCases) {
tap.test(`should extract transactions from ${testCase.name}`, async () => {
// Load expected transactions
const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
console.log(`\n=== ${testCase.name} ===`);
console.log(`Expected: ${expected.length} transactions`);
// Convert PDF to images
console.log('Converting PDF to images...');
const images = convertPdfToImages(testCase.pdfPath);
console.log(`Converted: ${images.length} pages\n`);
// Extract with consensus voting
const extracted = await extractWithConsensus(images);
console.log(`\nFinal: ${extracted.length} transactions`);
// Compare results
const result = compareTransactions(extracted, expected);
console.log(`Accuracy: ${result.matches}/${result.total}`);
if (result.errors.length > 0) {
console.log('Errors:');
result.errors.forEach((e) => console.log(` - ${e}`));
}
// Assert high accuracy
const accuracy = result.matches / result.total;
expect(accuracy).toBeGreaterThan(0.95);
expect(extracted.length).toEqual(expected.length);
});
}
export default tap.start();

View File

@@ -1,258 +0,0 @@
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execSync } from 'child_process';
import * as os from 'os';
const PADDLEOCR_URL = 'http://localhost:5000';
interface IOCRResult {
text: string;
confidence: number;
box: number[][];
}
interface IOCRResponse {
success: boolean;
results: IOCRResult[];
error?: string;
}
interface IHealthResponse {
status: string;
model: string;
language: string;
gpu_enabled: boolean;
}
/**
* Convert PDF first page to PNG using ImageMagick
*/
function convertPdfToImage(pdfPath: string): string {
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
const outputPath = path.join(tempDir, 'page.png');
try {
execSync(
`convert -density 200 -quality 90 "${pdfPath}[0]" -background white -alpha remove "${outputPath}"`,
{ stdio: 'pipe' }
);
const imageData = fs.readFileSync(outputPath);
return imageData.toString('base64');
} finally {
fs.rmSync(tempDir, { recursive: true, force: true });
}
}
/**
* Create a simple test image with text using ImageMagick
*/
function createTestImage(text: string): string {
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'test-image-'));
const outputPath = path.join(tempDir, 'test.png');
try {
execSync(
`convert -size 400x100 xc:white -font DejaVu-Sans -pointsize 24 -fill black -gravity center -annotate 0 "${text}" "${outputPath}"`,
{ stdio: 'pipe' }
);
const imageData = fs.readFileSync(outputPath);
return imageData.toString('base64');
} finally {
fs.rmSync(tempDir, { recursive: true, force: true });
}
}
// Health check test
tap.test('should respond to health check', async () => {
const response = await fetch(`${PADDLEOCR_URL}/health`);
expect(response.ok).toBeTrue();
const data: IHealthResponse = await response.json();
expect(data.status).toEqual('healthy');
expect(data.model).toEqual('PP-OCRv4');
expect(data.language).toBeTypeofString();
expect(data.gpu_enabled).toBeTypeofBoolean();
console.log(`PaddleOCR Status: ${data.status}`);
console.log(` Model: ${data.model}`);
console.log(` Language: ${data.language}`);
console.log(` GPU Enabled: ${data.gpu_enabled}`);
});
// Base64 OCR test
tap.test('should perform OCR on base64 image', async () => {
// Create a test image with known text
const testText = 'Hello World 12345';
console.log(`Creating test image with text: "${testText}"`);
const imageBase64 = createTestImage(testText);
const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ image: imageBase64 }),
});
expect(response.ok).toBeTrue();
const data: IOCRResponse = await response.json();
expect(data.success).toBeTrue();
expect(data.results).toBeArray();
const extractedText = data.results.map((r) => r.text).join(' ');
console.log(`Extracted text: "${extractedText}"`);
// Check that we got some text back
expect(data.results.length).toBeGreaterThan(0);
// Check that at least some of the expected text was found
const normalizedExtracted = extractedText.toLowerCase().replace(/\s+/g, '');
const normalizedExpected = testText.toLowerCase().replace(/\s+/g, '');
const hasPartialMatch =
normalizedExtracted.includes('hello') ||
normalizedExtracted.includes('world') ||
normalizedExtracted.includes('12345');
expect(hasPartialMatch).toBeTrue();
});
// File upload OCR test
tap.test('should perform OCR via file upload', async () => {
const testText = 'Invoice Number 98765';
console.log(`Creating test image with text: "${testText}"`);
const imageBase64 = createTestImage(testText);
const imageBuffer = Buffer.from(imageBase64, 'base64');
const formData = new FormData();
const blob = new Blob([imageBuffer], { type: 'image/png' });
formData.append('img', blob, 'test.png');
const response = await fetch(`${PADDLEOCR_URL}/ocr/upload`, {
method: 'POST',
body: formData,
});
expect(response.ok).toBeTrue();
const data: IOCRResponse = await response.json();
expect(data.success).toBeTrue();
expect(data.results).toBeArray();
const extractedText = data.results.map((r) => r.text).join(' ');
console.log(`Extracted text: "${extractedText}"`);
// Check that we got some text back
expect(data.results.length).toBeGreaterThan(0);
});
// OCR result structure test
tap.test('should return proper OCR result structure', async () => {
const testText = 'Test 123';
const imageBase64 = createTestImage(testText);
const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ image: imageBase64 }),
});
const data: IOCRResponse = await response.json();
if (data.results.length > 0) {
const result = data.results[0];
// Check result has required fields
expect(result.text).toBeTypeofString();
expect(result.confidence).toBeTypeofNumber();
expect(result.box).toBeArray();
// Check bounding box structure (4 points, each with x,y)
expect(result.box.length).toEqual(4);
for (const point of result.box) {
expect(point.length).toEqual(2);
expect(point[0]).toBeTypeofNumber();
expect(point[1]).toBeTypeofNumber();
}
// Confidence should be between 0 and 1
expect(result.confidence).toBeGreaterThan(0);
expect(result.confidence).toBeLessThanOrEqual(1);
console.log(`Result structure valid:`);
console.log(` Text: "${result.text}"`);
console.log(` Confidence: ${(result.confidence * 100).toFixed(1)}%`);
console.log(` Box: ${JSON.stringify(result.box)}`);
}
});
// Test with actual invoice if available
const invoiceDir = path.join(process.cwd(), '.nogit/invoices');
if (fs.existsSync(invoiceDir)) {
const pdfFiles = fs.readdirSync(invoiceDir).filter((f) => f.endsWith('.pdf'));
if (pdfFiles.length > 0) {
const testPdf = pdfFiles[0];
tap.test(`should extract text from invoice: ${testPdf}`, async () => {
const pdfPath = path.join(invoiceDir, testPdf);
console.log(`Converting ${testPdf} to image...`);
const imageBase64 = convertPdfToImage(pdfPath);
console.log(`Image size: ${(imageBase64.length / 1024).toFixed(1)} KB`);
const startTime = Date.now();
const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ image: imageBase64 }),
});
const endTime = Date.now();
const elapsedMs = endTime - startTime;
expect(response.ok).toBeTrue();
const data: IOCRResponse = await response.json();
expect(data.success).toBeTrue();
console.log(`OCR completed in ${(elapsedMs / 1000).toFixed(2)}s`);
console.log(`Found ${data.results.length} text regions`);
// Print first 10 results
const preview = data.results.slice(0, 10);
console.log(`\nFirst ${preview.length} results:`);
for (const result of preview) {
console.log(` [${(result.confidence * 100).toFixed(0)}%] ${result.text}`);
}
if (data.results.length > 10) {
console.log(` ... and ${data.results.length - 10} more`);
}
// Should find text in an invoice
expect(data.results.length).toBeGreaterThan(5);
});
}
}
// Error handling test
tap.test('should handle invalid base64 gracefully', async () => {
const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ image: 'not-valid-base64!!!' }),
});
const data: IOCRResponse = await response.json();
// Should return success: false with error message
expect(data.success).toBeFalse();
expect(data.error).toBeTypeofString();
console.log(`Error handling works: ${data.error}`);
});
export default tap.start();