feat(paddleocr): add PaddleOCR OCR service (Docker images, server, tests, docs) and CI workflows

2026-01-16 13:23:01 +00:00
parent 67c38eeb67
commit bec379e9ca
10 changed files with 624 additions and 71 deletions
@@ -1,6 +1,6 @@
 # PaddleOCR CPU Variant
 # OCR processing optimized for CPU-only inference
-FROM python:3.10-slim
+FROM python:3.10-slim-bookworm

 LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
 LABEL description="PaddleOCR PP-OCRv4 - CPU optimized"
@@ -21,13 +21,14 @@ WORKDIR /app
 RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1-mesa-glx \
    libglib2.0-0 \
+    libgomp1 \
    curl \
    && rm -rf /var/lib/apt/lists/*

-# Install Python dependencies (CPU version of PaddlePaddle)
+# Install Python dependencies (CPU build of PaddlePaddle; paddlepaddle and paddleocr are pinned to stable 2.x releases)
 RUN pip install --no-cache-dir \
-    paddlepaddle \
-    paddleocr \
+    paddlepaddle==2.6.2 \
+    paddleocr==2.8.1 \
    fastapi \
    uvicorn[standard] \
    python-multipart \
@@ -39,10 +40,8 @@ COPY image_support_files/paddleocr_server.py /app/paddleocr_server.py
 COPY image_support_files/paddleocr-entrypoint.sh /usr/local/bin/paddleocr-entrypoint.sh
 RUN chmod +x /usr/local/bin/paddleocr-entrypoint.sh

-# Pre-download OCR models during build (PP-OCRv4)
-RUN python -c "from paddleocr import PaddleOCR; \
-    ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False, show_log=True); \
-    print('English model downloaded')"
+# Note: OCR models are no longer pre-downloaded at build time; they are downloaded on first run.
+# This avoids build-time segfaults on certain CPU architectures.

 # Expose API port
 EXPOSE 5000