4 Commits

SHA1        Message                                                                                           Date
acded2a165  v1.3.0                                                                                            2026-01-16 13:23:01 +00:00
            Some checks failed:
              Docker (tags) / security (push): successful in 30s
              Docker (tags) / test (push): failing after 41s
              Docker (tags) / release (push): skipped
              Docker (tags) / metadata (push): skipped
bec379e9ca  feat(paddleocr): add PaddleOCR OCR service (Docker images, server, tests, docs) and CI workflows  2026-01-16 13:23:01 +00:00
67c38eeb67  v1.2.0                                                                                            2026-01-16 10:23:32 +00:00
ae4bb26931  feat(paddleocr): add PaddleOCR support: Docker images, FastAPI server, entrypoint and tests       2026-01-16 10:23:32 +00:00
14 changed files with 737 additions and 84 deletions

View File

@@ -0,0 +1,67 @@
name: Docker (no tags)

on:
  push:
    tags-ignore:
      - '**'

env:
  IMAGE: code.foss.global/host.today/ht-docker-node:npmci
  NPMCI_COMPUTED_REPOURL: https://${{gitea.repository_owner}}:${{secrets.GITEA_TOKEN}}@gitea.lossless.digital/${{gitea.repository}}.git
  NPMCI_LOGIN_DOCKER_DOCKERREGISTRY: ${{ secrets.NPMCI_LOGIN_DOCKER_DOCKERREGISTRY }}

jobs:
  security:
    runs-on: ubuntu-latest
    container:
      image: ${{ env.IMAGE }}
    continue-on-error: true
    steps:
      - uses: actions/checkout@v3

      - name: Prepare
        run: |
          pnpm install -g pnpm
          pnpm install -g @ship.zone/npmci
          npmci npm prepare

      - name: Audit production dependencies
        run: |
          npmci command npm config set registry https://registry.npmjs.org
          npmci command pnpm audit --audit-level=high --prod
        continue-on-error: true

      - name: Audit development dependencies
        run: |
          npmci command npm config set registry https://registry.npmjs.org
          npmci command pnpm audit --audit-level=high --dev
        continue-on-error: true

  test:
    needs: security
    runs-on: ubuntu-latest
    container:
      image: ${{ env.IMAGE }}
    steps:
      - uses: actions/checkout@v3

      - name: Prepare
        run: |
          pnpm install -g pnpm
          pnpm install -g @ship.zone/npmci
          npmci npm prepare

      - name: Test stable
        run: |
          npmci node install stable
          npmci npm install
          npmci npm test
        continue-on-error: true

      - name: Test build
        run: |
          npmci node install stable
          npmci npm install
          npmci command npm run build

View File

@@ -0,0 +1,101 @@
name: Docker (tags)

on:
  push:
    tags:
      - '*'

env:
  IMAGE: code.foss.global/host.today/ht-docker-node:npmci
  NPMCI_COMPUTED_REPOURL: https://${{gitea.repository_owner}}:${{secrets.GITEA_TOKEN}}@gitea.lossless.digital/${{gitea.repository}}.git
  NPMCI_LOGIN_DOCKER_DOCKERREGISTRY: ${{ secrets.NPMCI_LOGIN_DOCKER_DOCKERREGISTRY }}

jobs:
  security:
    runs-on: ubuntu-latest
    container:
      image: ${{ env.IMAGE }}
    continue-on-error: true
    steps:
      - uses: actions/checkout@v3

      - name: Prepare
        run: |
          pnpm install -g pnpm
          pnpm install -g @ship.zone/npmci
          npmci npm prepare

      - name: Audit production dependencies
        run: |
          npmci command npm config set registry https://registry.npmjs.org
          npmci command pnpm audit --audit-level=high --prod
        continue-on-error: true

      - name: Audit development dependencies
        run: |
          npmci command npm config set registry https://registry.npmjs.org
          npmci command pnpm audit --audit-level=high --dev
        continue-on-error: true

  test:
    needs: security
    runs-on: ubuntu-latest
    container:
      image: ${{ env.IMAGE }}
    steps:
      - uses: actions/checkout@v3

      - name: Prepare
        run: |
          pnpm install -g pnpm
          pnpm install -g @ship.zone/npmci
          npmci npm prepare

      - name: Test stable
        run: |
          npmci node install stable
          npmci npm install
          npmci npm test
        continue-on-error: true

      - name: Test build
        run: |
          npmci node install stable
          npmci npm install
          npmci command npm run build

  release:
    needs: test
    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
    runs-on: ubuntu-latest
    container:
      image: code.foss.global/host.today/ht-docker-dbase:npmci
    steps:
      - uses: actions/checkout@v3

      - name: Prepare
        run: |
          pnpm install -g pnpm
          pnpm install -g @ship.zone/npmci

      - name: Release
        run: |
          npmci docker login
          npmci docker build
          npmci docker push code.foss.global

  metadata:
    needs: test
    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
    runs-on: ubuntu-latest
    container:
      image: ${{ env.IMAGE }}
    steps:
      - uses: actions/checkout@v3

      - name: Trigger
        run: npmci trigger

View File

@@ -1,6 +1,6 @@
 # PaddleOCR GPU Variant
 # OCR processing with NVIDIA GPU support using PaddlePaddle
-FROM paddlepaddle/paddle:3.0.0-gpu-cuda11.8-cudnn8.9-trt8.6
+FROM paddlepaddle/paddle:2.6.2-gpu-cuda11.7-cudnn8.4-trt8.4

 LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
 LABEL description="PaddleOCR PP-OCRv4 - GPU optimized"
@@ -22,9 +22,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     curl \
     && rm -rf /var/lib/apt/lists/*

-# Install Python dependencies
+# Install Python dependencies (using stable paddleocr 2.x)
 RUN pip install --no-cache-dir \
-    paddleocr \
+    paddleocr==2.8.1 \
     fastapi \
     uvicorn[standard] \
     python-multipart \
@@ -32,14 +32,12 @@ RUN pip install --no-cache-dir \
     pillow

 # Copy server files
-COPY image_support_files/paddleocr-server.py /app/paddleocr-server.py
+COPY image_support_files/paddleocr_server.py /app/paddleocr_server.py
 COPY image_support_files/paddleocr-entrypoint.sh /usr/local/bin/paddleocr-entrypoint.sh
 RUN chmod +x /usr/local/bin/paddleocr-entrypoint.sh

-# Pre-download OCR models during build (PP-OCRv4)
-RUN python -c "from paddleocr import PaddleOCR; \
-    ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False, show_log=True); \
-    print('English model downloaded')"
+# Note: OCR models will be downloaded on first run
+# This ensures compatibility across different GPU architectures

 # Expose API port
 EXPOSE 5000

View File

@@ -1,6 +1,6 @@
 # PaddleOCR CPU Variant
 # OCR processing optimized for CPU-only inference
-FROM python:3.10-slim
+FROM python:3.10-slim-bookworm

 LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
 LABEL description="PaddleOCR PP-OCRv4 - CPU optimized"
@@ -21,13 +21,14 @@ WORKDIR /app
 RUN apt-get update && apt-get install -y --no-install-recommends \
     libgl1-mesa-glx \
     libglib2.0-0 \
+    libgomp1 \
     curl \
     && rm -rf /var/lib/apt/lists/*

-# Install Python dependencies (CPU version of PaddlePaddle)
+# Install Python dependencies (CPU version of PaddlePaddle - using stable 2.x versions)
 RUN pip install --no-cache-dir \
-    paddlepaddle \
-    paddleocr \
+    paddlepaddle==2.6.2 \
+    paddleocr==2.8.1 \
     fastapi \
     uvicorn[standard] \
     python-multipart \
@@ -35,14 +36,12 @@ RUN pip install --no-cache-dir \
     pillow

 # Copy server files
-COPY image_support_files/paddleocr-server.py /app/paddleocr-server.py
+COPY image_support_files/paddleocr_server.py /app/paddleocr_server.py
 COPY image_support_files/paddleocr-entrypoint.sh /usr/local/bin/paddleocr-entrypoint.sh
 RUN chmod +x /usr/local/bin/paddleocr-entrypoint.sh

-# Pre-download OCR models during build (PP-OCRv4)
-RUN python -c "from paddleocr import PaddleOCR; \
-    ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False, show_log=True); \
-    print('English model downloaded')"
+# Note: OCR models will be downloaded on first run
+# This avoids build-time segfaults with certain CPU architectures

 # Expose API port
 EXPOSE 5000

View File

@@ -29,9 +29,30 @@ docker build \
  -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v-cpu \
  .

# Build PaddleOCR GPU variant
echo -e "${GREEN}Building PaddleOCR GPU variant...${NC}"
docker build \
  -f Dockerfile_paddleocr \
  -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr \
  -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-gpu \
  .

# Build PaddleOCR CPU variant
echo -e "${GREEN}Building PaddleOCR CPU variant...${NC}"
docker build \
  -f Dockerfile_paddleocr_cpu \
  -t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-cpu \
  .

echo -e "${GREEN}All images built successfully!${NC}"
echo ""
echo "Available images:"
echo "  MiniCPM-V 4.5:"
echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v (GPU)"
echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v-cpu (CPU)"
echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:latest (GPU)"
echo ""
echo "  PaddleOCR:"
echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr (GPU)"
echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-gpu (GPU)"
echo "  - ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-cpu (CPU)"

View File

@@ -1,5 +1,23 @@
# Changelog
## 2026-01-16 - 1.3.0 - feat(paddleocr)
add PaddleOCR OCR service (Docker images, server, tests, docs) and CI workflows
- Add GPU and CPU PaddleOCR Dockerfiles; pin paddlepaddle/paddle and paddleocr to stable 2.x and install libgomp1 for CPU builds
- Avoid pre-downloading OCR models at build-time to prevent build-time segfaults; models are downloaded on first run
- Refactor PaddleOCR FastAPI server: respect CUDA_VISIBLE_DEVICES, support per-request language, cache default language instance and create temporary instances for other languages
- Add comprehensive tests (test.paddleocr.ts) and improve invoice extraction tests (parallelize passes, JSON OCR API usage, prioritize certain test cases)
- Add Gitea CI workflows for tag and non-tag Docker runs and release pipeline (docker build/push, metadata trigger)
- Update documentation (readme.hints.md) with PaddleOCR usage and add docker registry entry to npmextra.json
## 2026-01-16 - 1.2.0 - feat(paddleocr)
add PaddleOCR support: Docker images, FastAPI server, entrypoint and tests
- Add PaddleOCR FastAPI server implementation at image_support_files/paddleocr_server.py
- Remove old image_support_files/paddleocr-server.py and update entrypoint to import paddleocr_server:app
- Extend build-images.sh to build paddleocr (GPU) and paddleocr-cpu images and list them
- Extend test-images.sh to add paddleocr health/OCR tests, new test_paddleocr_image function, port config, and cleanup; rename test_image -> test_minicpm_image
## 2026-01-16 - 1.1.0 - feat(ocr)
add PaddleOCR GPU Docker image and FastAPI OCR server with entrypoint; implement OCR endpoints and consensus extraction testing

View File

@@ -19,7 +19,7 @@ else
 fi

 # Start the FastAPI server with uvicorn
-exec python -m uvicorn paddleocr-server:app \
+exec python -m uvicorn paddleocr_server:app \
     --host "${SERVER_HOST}" \
     --port "${SERVER_PORT}" \
     --workers 1

View File

@@ -26,6 +26,7 @@ logger = logging.getLogger(__name__)
 # Environment configuration
 OCR_LANGUAGE = os.environ.get('OCR_LANGUAGE', 'en')
+# GPU is controlled via CUDA_VISIBLE_DEVICES environment variable
 USE_GPU = os.environ.get('CUDA_VISIBLE_DEVICES', '') != '-1'

 # Initialize FastAPI app
@@ -72,19 +73,29 @@ class HealthResponse(BaseModel):
     gpu_enabled: bool


-def get_ocr() -> PaddleOCR:
+def get_ocr(lang: Optional[str] = None) -> PaddleOCR:
     """Get or initialize the OCR instance"""
     global ocr_instance
-    if ocr_instance is None:
-        logger.info(f"Initializing PaddleOCR with language={OCR_LANGUAGE}, use_gpu={USE_GPU}")
-        ocr_instance = PaddleOCR(
-            use_angle_cls=True,
-            lang=OCR_LANGUAGE,
-            use_gpu=USE_GPU,
-            show_log=False
-        )
-        logger.info("PaddleOCR initialized successfully")
-    return ocr_instance
+    use_lang = lang or OCR_LANGUAGE
+
+    # Return cached instance if same language
+    if ocr_instance is not None and lang is None:
+        return ocr_instance
+
+    logger.info(f"Initializing PaddleOCR with language={use_lang}, use_gpu={USE_GPU}")
+    new_ocr = PaddleOCR(
+        use_angle_cls=True,
+        lang=use_lang,
+        use_gpu=USE_GPU,
+        show_log=False
+    )
+
+    # Cache the default language instance
+    if lang is None:
+        ocr_instance = new_ocr
+        logger.info("PaddleOCR initialized successfully")
+
+    return new_ocr


 def decode_base64_image(base64_string: str) -> np.ndarray:
@@ -176,20 +187,12 @@ async def ocr_base64(request: OCRRequest):
         image = decode_base64_image(request.image)

         # Get OCR instance (use request language if provided)
-        ocr = get_ocr()
-
-        # If a different language is requested, create a new instance
         if request.language and request.language != OCR_LANGUAGE:
-            logger.info(f"Creating OCR instance for language: {request.language}")
-            temp_ocr = PaddleOCR(
-                use_angle_cls=True,
-                lang=request.language,
-                use_gpu=USE_GPU,
-                show_log=False
-            )
-            result = temp_ocr.ocr(image, cls=True)
+            ocr = get_ocr(request.language)
         else:
-            result = ocr.ocr(image, cls=True)
+            ocr = get_ocr()
+
+        result = ocr.ocr(image, cls=True)

         # Process results
         results = process_ocr_result(result)
@@ -228,20 +231,12 @@ async def ocr_upload(
         image_array = np.array(image)

         # Get OCR instance
-        ocr = get_ocr()
-
-        # If a different language is requested, create a new instance
         if language and language != OCR_LANGUAGE:
-            logger.info(f"Creating OCR instance for language: {language}")
-            temp_ocr = PaddleOCR(
-                use_angle_cls=True,
-                lang=language,
-                use_gpu=USE_GPU,
-                show_log=False
-            )
-            result = temp_ocr.ocr(image_array, cls=True)
+            ocr = get_ocr(language)
         else:
-            result = ocr.ocr(image_array, cls=True)
+            ocr = get_ocr()
+
+        result = ocr.ocr(image_array, cls=True)

         # Process results
         results = process_ocr_result(result)

View File

@@ -1,7 +1,10 @@
 {
   "npmci": {
     "npmGlobalTools": [],
-    "npmAccessLevel": "public"
+    "npmAccessLevel": "public",
+    "dockerRegistries": [
+      "code.foss.global"
+    ]
   },
   "gitzone": {
     "projectType": "docker",

View File

@@ -1,6 +1,6 @@
 {
   "name": "@host.today/ht-docker-ai",
-  "version": "1.1.0",
+  "version": "1.3.0",
   "type": "module",
   "private": false,
   "description": "Docker images for AI vision-language models including MiniCPM-V 4.5",

View File

@@ -77,6 +77,81 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CPU variant has longer `start-period` (120s) due to slower startup.
## PaddleOCR
### Overview
PaddleOCR is a standalone OCR service using PaddlePaddle's PP-OCRv4 model. It provides:
- Text detection and recognition
- Multi-language support
- FastAPI REST API
- GPU and CPU variants
### Docker Images
| Tag | Description |
|-----|-------------|
| `paddleocr` | GPU variant (default) |
| `paddleocr-gpu` | GPU variant (alias) |
| `paddleocr-cpu` | CPU-only variant |
### API Endpoints
| Endpoint | Method | Description |
|----------|--------|-------------|
| `/health` | GET | Health check with model info |
| `/ocr` | POST | OCR with base64 image (JSON body) |
| `/ocr/upload` | POST | OCR with file upload (multipart form) |
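
A quick way to check that a running container actually serves this API is to probe `/health`. The sketch below is illustrative only: it uses TypeScript with the built-in `fetch` of Node 18+ and assumes the container is published on `localhost:5000` (the default `SERVER_PORT`), as in `test-images.sh`.

```typescript
// Minimal health probe (assumes the paddleocr container is mapped to localhost:5000).
const res = await fetch('http://localhost:5000/health');
if (!res.ok) throw new Error(`health check failed: ${res.status}`);

// Fields per the server's HealthResponse model: status, model, language, gpu_enabled.
const health = await res.json();
console.log(`status=${health.status} model=${health.model} gpu=${health.gpu_enabled}`);
```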
### Request/Response Format
**POST /ocr (JSON)**
```json
{
"image": "<base64-encoded-image>",
"language": "en" // optional
}
```
**POST /ocr/upload (multipart)**
- `img`: image file
- `language`: optional language code
**Response**
```json
{
"success": true,
"results": [
{
"text": "Invoice #12345",
"confidence": 0.98,
"box": [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
}
]
}
```
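
For reference, a minimal client for the base64 endpoint could look like the sketch below. It mirrors how `test.paddleocr.ts` calls the service; the file path `invoice.png` is purely an example, and the default port is assumed.

```typescript
import * as fs from 'fs';

// Send a local PNG to POST /ocr as a base64 payload and print the recognized lines.
async function ocrFile(filePath: string): Promise<void> {
  const imageBase64 = fs.readFileSync(filePath).toString('base64');

  const response = await fetch('http://localhost:5000/ocr', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ image: imageBase64 }),
  });

  const data = await response.json();
  if (!data.success) {
    console.error(`OCR failed: ${data.error}`);
    return;
  }
  for (const r of data.results) {
    console.log(`[${(r.confidence * 100).toFixed(0)}%] ${r.text}`);
  }
}

ocrFile('invoice.png');
```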
### Environment Variables
| Variable | Default | Description |
|----------|---------|-------------|
| `OCR_LANGUAGE` | `en` | Default language for OCR |
| `SERVER_PORT` | `5000` | Server port |
| `SERVER_HOST` | `0.0.0.0` | Server host |
| `CUDA_VISIBLE_DEVICES` | (auto) | Set to `-1` for CPU-only |
### Performance
- **GPU**: ~1-3 seconds per page
- **CPU**: ~10-30 seconds per page
### Supported Languages
Common language codes: `en` (English), `ch` (Chinese), `de` (German), `fr` (French), `es` (Spanish), `ja` (Japanese), `ko` (Korean)
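
Per-request language overrides go through the optional `language` field. A sketch of the multipart upload variant follows, again assuming `localhost:5000` and Node 18+ (where `FormData` and `Blob` are globals); the file name `scan.png` is illustrative.

```typescript
import * as fs from 'fs';

// Upload an image to POST /ocr/upload and request German recognition for this call only.
async function ocrUpload(filePath: string, language = 'de'): Promise<void> {
  const imageBuffer = fs.readFileSync(filePath);

  const formData = new FormData();
  formData.append('img', new Blob([imageBuffer], { type: 'image/png' }), 'scan.png');
  formData.append('language', language); // optional; falls back to OCR_LANGUAGE when omitted

  const response = await fetch('http://localhost:5000/ocr/upload', { method: 'POST', body: formData });
  const data = await response.json();
  console.log(`${data.results.length} text regions recognized (${language})`);
}

ocrUpload('scan.png');
```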
---
## Adding New Models

To add a new model variant:

View File

@@ -5,7 +5,8 @@ set -e
REGISTRY="code.foss.global" REGISTRY="code.foss.global"
NAMESPACE="host.today" NAMESPACE="host.today"
IMAGE_NAME="ht-docker-ai" IMAGE_NAME="ht-docker-ai"
TEST_PORT=11434 MINICPM_PORT=11434
PADDLEOCR_PORT=5000
# Colors for output # Colors for output
GREEN='\033[0;32m' GREEN='\033[0;32m'
@@ -17,11 +18,13 @@ cleanup() {
echo -e "${BLUE}Cleaning up test containers...${NC}" echo -e "${BLUE}Cleaning up test containers...${NC}"
docker rm -f test-minicpm-gpu 2>/dev/null || true docker rm -f test-minicpm-gpu 2>/dev/null || true
docker rm -f test-minicpm-cpu 2>/dev/null || true docker rm -f test-minicpm-cpu 2>/dev/null || true
docker rm -f test-paddleocr-gpu 2>/dev/null || true
docker rm -f test-paddleocr-cpu 2>/dev/null || true
} }
trap cleanup EXIT trap cleanup EXIT
test_image() { test_minicpm_image() {
local tag=$1 local tag=$1
local container_name=$2 local container_name=$2
local extra_args=$3 local extra_args=$3
@@ -31,7 +34,7 @@ test_image() {
     # Start container
     docker run -d \
         --name ${container_name} \
-        -p ${TEST_PORT}:11434 \
+        -p ${MINICPM_PORT}:11434 \
         ${extra_args} \
         ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:${tag}
@@ -41,7 +44,7 @@ test_image() {
     # Test API endpoint
     echo "Testing API endpoint..."
-    if curl -s -f http://localhost:${TEST_PORT}/api/tags > /dev/null; then
+    if curl -s -f http://localhost:${MINICPM_PORT}/api/tags > /dev/null; then
         echo -e "${GREEN}API endpoint responding!${NC}"
     else
         echo -e "${RED}API endpoint not responding!${NC}"
@@ -56,17 +59,85 @@ test_image() {
echo "" echo ""
} }
test_paddleocr_image() {
local tag=$1
local container_name=$2
local extra_args=$3
echo -e "${BLUE}Testing ${tag}...${NC}"
# Start container
docker run -d \
--name ${container_name} \
-p ${PADDLEOCR_PORT}:5000 \
${extra_args} \
${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:${tag}
# Wait for startup (PaddleOCR takes longer to initialize)
echo "Waiting for container to start..."
sleep 30
# Test health endpoint
echo "Testing health endpoint..."
if curl -s -f http://localhost:${PADDLEOCR_PORT}/health > /dev/null; then
echo -e "${GREEN}Health endpoint responding!${NC}"
else
echo -e "${RED}Health endpoint not responding!${NC}"
docker logs ${container_name}
return 1
fi
# Test OCR endpoint with a minimal base64 image (1x1 white pixel PNG)
echo "Testing OCR endpoint..."
local test_image="iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="
local response=$(curl -s -X POST http://localhost:${PADDLEOCR_PORT}/ocr \
-H "Content-Type: application/json" \
-d "{\"image\": \"${test_image}\"}")
if echo "$response" | grep -q '"success"'; then
echo -e "${GREEN}OCR endpoint responding!${NC}"
else
echo -e "${RED}OCR endpoint not responding correctly!${NC}"
echo "Response: $response"
docker logs ${container_name}
return 1
fi
# Cleanup this container
docker rm -f ${container_name}
echo -e "${GREEN}${tag} test passed!${NC}"
echo ""
}
echo -e "${BLUE}=== Testing ht-docker-ai images ===${NC}" echo -e "${BLUE}=== Testing ht-docker-ai images ===${NC}"
echo "" echo ""
# Test CPU variant (doesn't require GPU) echo -e "${BLUE}--- MiniCPM-V Tests ---${NC}"
test_image "minicpm45v-cpu" "test-minicpm-cpu" "" echo ""
# Test GPU variant only if NVIDIA runtime is available # Test MiniCPM CPU variant (doesn't require GPU)
test_minicpm_image "minicpm45v-cpu" "test-minicpm-cpu" ""
# Test MiniCPM GPU variant only if NVIDIA runtime is available
if docker info 2>/dev/null | grep -q "nvidia"; then if docker info 2>/dev/null | grep -q "nvidia"; then
test_image "minicpm45v" "test-minicpm-gpu" "--gpus all" test_minicpm_image "minicpm45v" "test-minicpm-gpu" "--gpus all"
else else
echo -e "${BLUE}Skipping GPU test (NVIDIA runtime not available)${NC}" echo -e "${BLUE}Skipping MiniCPM GPU test (NVIDIA runtime not available)${NC}"
fi
echo ""
echo -e "${BLUE}--- PaddleOCR Tests ---${NC}"
echo ""
# Test PaddleOCR CPU variant (doesn't require GPU)
test_paddleocr_image "paddleocr-cpu" "test-paddleocr-cpu" ""
# Test PaddleOCR GPU variant only if NVIDIA runtime is available
if docker info 2>/dev/null | grep -q "nvidia"; then
test_paddleocr_image "paddleocr" "test-paddleocr-gpu" "--gpus all"
else
echo -e "${BLUE}Skipping PaddleOCR GPU test (NVIDIA runtime not available)${NC}"
fi fi
echo -e "${GREEN}=== All tests passed! ===${NC}" echo -e "${GREEN}=== All tests passed! ===${NC}"

View File

@@ -22,16 +22,11 @@ interface IInvoice {
  * Extract OCR text from an image using PaddleOCR
  */
 async function extractOcrText(imageBase64: string): Promise<string> {
-  const formData = new FormData();
-  const imageBuffer = Buffer.from(imageBase64, 'base64');
-  const blob = new Blob([imageBuffer], { type: 'image/png' });
-  formData.append('img', blob, 'image.png');
-  formData.append('outtype', 'json');
-
   try {
     const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
       method: 'POST',
-      body: formData,
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ image: imageBase64 }),
     });

     if (!response.ok) return '';
@@ -180,29 +175,64 @@ function hashInvoice(invoice: IInvoice): string {
 /**
  * Extract with majority voting - run until 2 passes match
+ * Optimization: Run Pass 1, OCR, and Pass 2 (after OCR) in parallel
  */
 async function extractWithConsensus(images: string[], invoiceName: string, maxPasses: number = 5): Promise<IInvoice> {
   const results: Array<{ invoice: IInvoice; hash: string }> = [];
   const hashCounts: Map<string, number> = new Map();

-  // Extract OCR text from first page
-  const ocrText = await extractOcrText(images[0]);
-  if (ocrText) {
-    console.log(`  [OCR] Extracted ${ocrText.split('\n').length} text lines`);
-  }
-
-  for (let pass = 1; pass <= maxPasses; pass++) {
+  const addResult = (invoice: IInvoice, passLabel: string): number => {
+    const hash = hashInvoice(invoice);
+    results.push({ invoice, hash });
+    hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1);
+    console.log(`  [${passLabel}] ${invoice.invoice_number} | ${invoice.invoice_date} | ${invoice.total_amount} ${invoice.currency}`);
+    return hashCounts.get(hash)!;
+  };
+
+  // OPTIMIZATION: Run Pass 1 (no OCR) in parallel with OCR -> Pass 2 (with OCR)
+  let ocrText = '';
+  const pass1Promise = extractOnce(images, 1, '').catch((err) => ({ error: err }));
+
+  // OCR then immediately Pass 2
+  const ocrThenPass2Promise = (async () => {
+    ocrText = await extractOcrText(images[0]);
+    if (ocrText) {
+      console.log(`  [OCR] Extracted ${ocrText.split('\n').length} text lines`);
+    }
+    return extractOnce(images, 2, ocrText).catch((err) => ({ error: err }));
+  })();
+
+  // Wait for both to complete
+  const [pass1Result, pass2Result] = await Promise.all([pass1Promise, ocrThenPass2Promise]);
+
+  // Process Pass 1 result
+  if ('error' in pass1Result) {
+    console.log(`  [Pass 1] Error: ${(pass1Result as {error: unknown}).error}`);
+  } else {
+    const count = addResult(pass1Result as IInvoice, 'Pass 1');
+    if (count >= 2) {
+      console.log(`  [Consensus] Reached after parallel passes`);
+      return pass1Result as IInvoice;
+    }
+  }
+
+  // Process Pass 2 result
+  if ('error' in pass2Result) {
+    console.log(`  [Pass 2+OCR] Error: ${(pass2Result as {error: unknown}).error}`);
+  } else {
+    const count = addResult(pass2Result as IInvoice, 'Pass 2+OCR');
+    if (count >= 2) {
+      console.log(`  [Consensus] Reached after parallel passes`);
+      return pass2Result as IInvoice;
+    }
+  }
+
+  // Continue with passes 3+ using OCR text if no consensus yet
+  for (let pass = 3; pass <= maxPasses; pass++) {
     try {
       const invoice = await extractOnce(images, pass, ocrText);
-      const hash = hashInvoice(invoice);
-      results.push({ invoice, hash });
-      hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1);
-      console.log(`  [Pass ${pass}] ${invoice.invoice_number} | ${invoice.invoice_date} | ${invoice.total_amount} ${invoice.currency}`);
-
-      // Check if we have consensus (2+ matching)
-      const count = hashCounts.get(hash)!;
+      const count = addResult(invoice, `Pass ${pass}+OCR`);
       if (count >= 2) {
         console.log(`  [Consensus] Reached after ${pass} passes`);
         return invoice;
@@ -267,6 +297,7 @@ function compareInvoice(
 /**
  * Find all test cases (PDF + JSON pairs) in .nogit/invoices/
+ * Priority invoices (like vodafone) run first for quick feedback
  */
 function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
   const testDir = path.join(process.cwd(), '.nogit/invoices');
@@ -290,6 +321,22 @@ function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: strin
    }
  }
// Sort with priority invoices first, then alphabetically
const priorityPrefixes = ['vodafone'];
testCases.sort((a, b) => {
const aPriority = priorityPrefixes.findIndex((p) => a.name.startsWith(p));
const bPriority = priorityPrefixes.findIndex((p) => b.name.startsWith(p));
// Both have priority - sort by priority order
if (aPriority >= 0 && bPriority >= 0) return aPriority - bPriority;
// Only a has priority - a comes first
if (aPriority >= 0) return -1;
// Only b has priority - b comes first
if (bPriority >= 0) return 1;
// Neither has priority - alphabetical
return a.name.localeCompare(b.name);
});
  return testCases;
}

test/test.paddleocr.ts (new file, 258 lines)
View File

@@ -0,0 +1,258 @@
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execSync } from 'child_process';
import * as os from 'os';
const PADDLEOCR_URL = 'http://localhost:5000';
interface IOCRResult {
text: string;
confidence: number;
box: number[][];
}
interface IOCRResponse {
success: boolean;
results: IOCRResult[];
error?: string;
}
interface IHealthResponse {
status: string;
model: string;
language: string;
gpu_enabled: boolean;
}
/**
* Convert PDF first page to PNG using ImageMagick
*/
function convertPdfToImage(pdfPath: string): string {
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
const outputPath = path.join(tempDir, 'page.png');
try {
execSync(
`convert -density 200 -quality 90 "${pdfPath}[0]" -background white -alpha remove "${outputPath}"`,
{ stdio: 'pipe' }
);
const imageData = fs.readFileSync(outputPath);
return imageData.toString('base64');
} finally {
fs.rmSync(tempDir, { recursive: true, force: true });
}
}
/**
* Create a simple test image with text using ImageMagick
*/
function createTestImage(text: string): string {
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'test-image-'));
const outputPath = path.join(tempDir, 'test.png');
try {
execSync(
`convert -size 400x100 xc:white -font DejaVu-Sans -pointsize 24 -fill black -gravity center -annotate 0 "${text}" "${outputPath}"`,
{ stdio: 'pipe' }
);
const imageData = fs.readFileSync(outputPath);
return imageData.toString('base64');
} finally {
fs.rmSync(tempDir, { recursive: true, force: true });
}
}
// Health check test
tap.test('should respond to health check', async () => {
const response = await fetch(`${PADDLEOCR_URL}/health`);
expect(response.ok).toBeTrue();
const data: IHealthResponse = await response.json();
expect(data.status).toEqual('healthy');
expect(data.model).toEqual('PP-OCRv4');
expect(data.language).toBeTypeofString();
expect(data.gpu_enabled).toBeTypeofBoolean();
console.log(`PaddleOCR Status: ${data.status}`);
console.log(` Model: ${data.model}`);
console.log(` Language: ${data.language}`);
console.log(` GPU Enabled: ${data.gpu_enabled}`);
});
// Base64 OCR test
tap.test('should perform OCR on base64 image', async () => {
// Create a test image with known text
const testText = 'Hello World 12345';
console.log(`Creating test image with text: "${testText}"`);
const imageBase64 = createTestImage(testText);
const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ image: imageBase64 }),
});
expect(response.ok).toBeTrue();
const data: IOCRResponse = await response.json();
expect(data.success).toBeTrue();
expect(data.results).toBeArray();
const extractedText = data.results.map((r) => r.text).join(' ');
console.log(`Extracted text: "${extractedText}"`);
// Check that we got some text back
expect(data.results.length).toBeGreaterThan(0);
// Check that at least some of the expected text was found
const normalizedExtracted = extractedText.toLowerCase().replace(/\s+/g, '');
const normalizedExpected = testText.toLowerCase().replace(/\s+/g, '');
const hasPartialMatch =
normalizedExtracted.includes('hello') ||
normalizedExtracted.includes('world') ||
normalizedExtracted.includes('12345');
expect(hasPartialMatch).toBeTrue();
});
// File upload OCR test
tap.test('should perform OCR via file upload', async () => {
const testText = 'Invoice Number 98765';
console.log(`Creating test image with text: "${testText}"`);
const imageBase64 = createTestImage(testText);
const imageBuffer = Buffer.from(imageBase64, 'base64');
const formData = new FormData();
const blob = new Blob([imageBuffer], { type: 'image/png' });
formData.append('img', blob, 'test.png');
const response = await fetch(`${PADDLEOCR_URL}/ocr/upload`, {
method: 'POST',
body: formData,
});
expect(response.ok).toBeTrue();
const data: IOCRResponse = await response.json();
expect(data.success).toBeTrue();
expect(data.results).toBeArray();
const extractedText = data.results.map((r) => r.text).join(' ');
console.log(`Extracted text: "${extractedText}"`);
// Check that we got some text back
expect(data.results.length).toBeGreaterThan(0);
});
// OCR result structure test
tap.test('should return proper OCR result structure', async () => {
const testText = 'Test 123';
const imageBase64 = createTestImage(testText);
const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ image: imageBase64 }),
});
const data: IOCRResponse = await response.json();
if (data.results.length > 0) {
const result = data.results[0];
// Check result has required fields
expect(result.text).toBeTypeofString();
expect(result.confidence).toBeTypeofNumber();
expect(result.box).toBeArray();
// Check bounding box structure (4 points, each with x,y)
expect(result.box.length).toEqual(4);
for (const point of result.box) {
expect(point.length).toEqual(2);
expect(point[0]).toBeTypeofNumber();
expect(point[1]).toBeTypeofNumber();
}
// Confidence should be between 0 and 1
expect(result.confidence).toBeGreaterThan(0);
expect(result.confidence).toBeLessThanOrEqual(1);
console.log(`Result structure valid:`);
console.log(` Text: "${result.text}"`);
console.log(` Confidence: ${(result.confidence * 100).toFixed(1)}%`);
console.log(` Box: ${JSON.stringify(result.box)}`);
}
});
// Test with actual invoice if available
const invoiceDir = path.join(process.cwd(), '.nogit/invoices');
if (fs.existsSync(invoiceDir)) {
const pdfFiles = fs.readdirSync(invoiceDir).filter((f) => f.endsWith('.pdf'));
if (pdfFiles.length > 0) {
const testPdf = pdfFiles[0];
tap.test(`should extract text from invoice: ${testPdf}`, async () => {
const pdfPath = path.join(invoiceDir, testPdf);
console.log(`Converting ${testPdf} to image...`);
const imageBase64 = convertPdfToImage(pdfPath);
console.log(`Image size: ${(imageBase64.length / 1024).toFixed(1)} KB`);
const startTime = Date.now();
const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ image: imageBase64 }),
});
const endTime = Date.now();
const elapsedMs = endTime - startTime;
expect(response.ok).toBeTrue();
const data: IOCRResponse = await response.json();
expect(data.success).toBeTrue();
console.log(`OCR completed in ${(elapsedMs / 1000).toFixed(2)}s`);
console.log(`Found ${data.results.length} text regions`);
// Print first 10 results
const preview = data.results.slice(0, 10);
console.log(`\nFirst ${preview.length} results:`);
for (const result of preview) {
console.log(` [${(result.confidence * 100).toFixed(0)}%] ${result.text}`);
}
if (data.results.length > 10) {
console.log(` ... and ${data.results.length - 10} more`);
}
// Should find text in an invoice
expect(data.results.length).toBeGreaterThan(5);
});
}
}
// Error handling test
tap.test('should handle invalid base64 gracefully', async () => {
const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ image: 'not-valid-base64!!!' }),
});
const data: IOCRResponse = await response.json();
// Should return success: false with error message
expect(data.success).toBeFalse();
expect(data.error).toBeTypeofString();
console.log(`Error handling works: ${data.error}`);
});
export default tap.start();