From bec379e9ca6e6fe86df8d6ee62ee8527e2bade02 Mon Sep 17 00:00:00 2001 From: Juergen Kunz Date: Fri, 16 Jan 2026 13:23:01 +0000 Subject: [PATCH] feat(paddleocr): add PaddleOCR OCR service (Docker images, server, tests, docs) and CI workflows --- .gitea/workflows/docker_nottags.yaml | 67 ++++++ .gitea/workflows/docker_tags.yaml | 101 ++++++++++ Dockerfile_paddleocr | 12 +- Dockerfile_paddleocr_cpu | 15 +- changelog.md | 10 + image_support_files/paddleocr_server.py | 65 +++--- npmextra.json | 5 +- readme.hints.md | 75 +++++++ test/test.invoices.ts | 87 ++++++-- test/test.paddleocr.ts | 258 ++++++++++++++++++++++++ 10 files changed, 624 insertions(+), 71 deletions(-) create mode 100644 .gitea/workflows/docker_nottags.yaml create mode 100644 .gitea/workflows/docker_tags.yaml create mode 100644 test/test.paddleocr.ts diff --git a/.gitea/workflows/docker_nottags.yaml b/.gitea/workflows/docker_nottags.yaml new file mode 100644 index 0000000..1564514 --- /dev/null +++ b/.gitea/workflows/docker_nottags.yaml @@ -0,0 +1,67 @@ +name: Docker (no tags) + +on: + push: + tags-ignore: + - '**' + +env: + IMAGE: code.foss.global/host.today/ht-docker-node:npmci + NPMCI_COMPUTED_REPOURL: https://${{gitea.repository_owner}}:${{secrets.GITEA_TOKEN}}@gitea.lossless.digital/${{gitea.repository}}.git + NPMCI_LOGIN_DOCKER_DOCKERREGISTRY: ${{ secrets.NPMCI_LOGIN_DOCKER_DOCKERREGISTRY }} + +jobs: + security: + runs-on: ubuntu-latest + container: + image: ${{ env.IMAGE }} + continue-on-error: true + + steps: + - uses: actions/checkout@v3 + + - name: Prepare + run: | + pnpm install -g pnpm + pnpm install -g @ship.zone/npmci + npmci npm prepare + + - name: Audit production dependencies + run: | + npmci command npm config set registry https://registry.npmjs.org + npmci command pnpm audit --audit-level=high --prod + continue-on-error: true + + - name: Audit development dependencies + run: | + npmci command npm config set registry https://registry.npmjs.org + npmci command pnpm audit --audit-level=high --dev + continue-on-error: true + + test: + needs: security + runs-on: ubuntu-latest + container: + image: ${{ env.IMAGE }} + + steps: + - uses: actions/checkout@v3 + + - name: Prepare + run: | + pnpm install -g pnpm + pnpm install -g @ship.zone/npmci + npmci npm prepare + + - name: Test stable + run: | + npmci node install stable + npmci npm install + npmci npm test + continue-on-error: true + + - name: Test build + run: | + npmci node install stable + npmci npm install + npmci command npm run build diff --git a/.gitea/workflows/docker_tags.yaml b/.gitea/workflows/docker_tags.yaml new file mode 100644 index 0000000..f490c65 --- /dev/null +++ b/.gitea/workflows/docker_tags.yaml @@ -0,0 +1,101 @@ +name: Docker (tags) + +on: + push: + tags: + - '*' + +env: + IMAGE: code.foss.global/host.today/ht-docker-node:npmci + NPMCI_COMPUTED_REPOURL: https://${{gitea.repository_owner}}:${{secrets.GITEA_TOKEN}}@gitea.lossless.digital/${{gitea.repository}}.git + NPMCI_LOGIN_DOCKER_DOCKERREGISTRY: ${{ secrets.NPMCI_LOGIN_DOCKER_DOCKERREGISTRY }} + +jobs: + security: + runs-on: ubuntu-latest + container: + image: ${{ env.IMAGE }} + continue-on-error: true + + steps: + - uses: actions/checkout@v3 + + - name: Prepare + run: | + pnpm install -g pnpm + pnpm install -g @ship.zone/npmci + npmci npm prepare + + - name: Audit production dependencies + run: | + npmci command npm config set registry https://registry.npmjs.org + npmci command pnpm audit --audit-level=high --prod + continue-on-error: true + + - name: Audit development dependencies + run: 
| + npmci command npm config set registry https://registry.npmjs.org + npmci command pnpm audit --audit-level=high --dev + continue-on-error: true + + test: + needs: security + runs-on: ubuntu-latest + container: + image: ${{ env.IMAGE }} + + steps: + - uses: actions/checkout@v3 + + - name: Prepare + run: | + pnpm install -g pnpm + pnpm install -g @ship.zone/npmci + npmci npm prepare + + - name: Test stable + run: | + npmci node install stable + npmci npm install + npmci npm test + continue-on-error: true + + - name: Test build + run: | + npmci node install stable + npmci npm install + npmci command npm run build + + release: + needs: test + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') + runs-on: ubuntu-latest + container: + image: code.foss.global/host.today/ht-docker-dbase:npmci + + steps: + - uses: actions/checkout@v3 + + - name: Prepare + run: | + pnpm install -g pnpm + pnpm install -g @ship.zone/npmci + + - name: Release + run: | + npmci docker login + npmci docker build + npmci docker push code.foss.global + + metadata: + needs: test + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') + runs-on: ubuntu-latest + container: + image: ${{ env.IMAGE }} + + steps: + - uses: actions/checkout@v3 + + - name: Trigger + run: npmci trigger diff --git a/Dockerfile_paddleocr b/Dockerfile_paddleocr index 95b5001..fe0e681 100644 --- a/Dockerfile_paddleocr +++ b/Dockerfile_paddleocr @@ -1,6 +1,6 @@ # PaddleOCR GPU Variant # OCR processing with NVIDIA GPU support using PaddlePaddle -FROM paddlepaddle/paddle:3.0.0-gpu-cuda11.8-cudnn8.9-trt8.6 +FROM paddlepaddle/paddle:2.6.2-gpu-cuda11.7-cudnn8.4-trt8.4 LABEL maintainer="Task Venture Capital GmbH " LABEL description="PaddleOCR PP-OCRv4 - GPU optimized" @@ -22,9 +22,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ curl \ && rm -rf /var/lib/apt/lists/* -# Install Python dependencies +# Install Python dependencies (using stable paddleocr 2.x) RUN pip install --no-cache-dir \ - paddleocr \ + paddleocr==2.8.1 \ fastapi \ uvicorn[standard] \ python-multipart \ @@ -36,10 +36,8 @@ COPY image_support_files/paddleocr_server.py /app/paddleocr_server.py COPY image_support_files/paddleocr-entrypoint.sh /usr/local/bin/paddleocr-entrypoint.sh RUN chmod +x /usr/local/bin/paddleocr-entrypoint.sh -# Pre-download OCR models during build (PP-OCRv4) -RUN python -c "from paddleocr import PaddleOCR; \ - ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False, show_log=True); \ - print('English model downloaded')" +# Note: OCR models will be downloaded on first run +# This ensures compatibility across different GPU architectures # Expose API port EXPOSE 5000 diff --git a/Dockerfile_paddleocr_cpu b/Dockerfile_paddleocr_cpu index 8281ec1..36386d8 100644 --- a/Dockerfile_paddleocr_cpu +++ b/Dockerfile_paddleocr_cpu @@ -1,6 +1,6 @@ # PaddleOCR CPU Variant # OCR processing optimized for CPU-only inference -FROM python:3.10-slim +FROM python:3.10-slim-bookworm LABEL maintainer="Task Venture Capital GmbH " LABEL description="PaddleOCR PP-OCRv4 - CPU optimized" @@ -21,13 +21,14 @@ WORKDIR /app RUN apt-get update && apt-get install -y --no-install-recommends \ libgl1-mesa-glx \ libglib2.0-0 \ + libgomp1 \ curl \ && rm -rf /var/lib/apt/lists/* -# Install Python dependencies (CPU version of PaddlePaddle) +# Install Python dependencies (CPU version of PaddlePaddle - using stable 2.x versions) RUN pip install --no-cache-dir \ - paddlepaddle \ - paddleocr \ + paddlepaddle==2.6.2 \ + paddleocr==2.8.1 \ fastapi \ 
uvicorn[standard] \ python-multipart \ @@ -39,10 +40,8 @@ COPY image_support_files/paddleocr_server.py /app/paddleocr_server.py COPY image_support_files/paddleocr-entrypoint.sh /usr/local/bin/paddleocr-entrypoint.sh RUN chmod +x /usr/local/bin/paddleocr-entrypoint.sh -# Pre-download OCR models during build (PP-OCRv4) -RUN python -c "from paddleocr import PaddleOCR; \ - ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False, show_log=True); \ - print('English model downloaded')" +# Note: OCR models will be downloaded on first run +# This avoids build-time segfaults with certain CPU architectures # Expose API port EXPOSE 5000 diff --git a/changelog.md b/changelog.md index 068cb84..0ddab7a 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,15 @@ # Changelog +## 2026-01-16 - 1.3.0 - feat(paddleocr) +add PaddleOCR OCR service (Docker images, server, tests, docs) and CI workflows + +- Add GPU and CPU PaddleOCR Dockerfiles; pin paddlepaddle/paddle and paddleocr to stable 2.x and install libgomp1 for CPU builds +- Avoid pre-downloading OCR models at build-time to prevent build-time segfaults; models are downloaded on first run +- Refactor PaddleOCR FastAPI server: respect CUDA_VISIBLE_DEVICES, support per-request language, cache default language instance and create temporary instances for other languages +- Add comprehensive tests (test.paddleocr.ts) and improve invoice extraction tests (parallelize passes, JSON OCR API usage, prioritize certain test cases) +- Add Gitea CI workflows for tag and non-tag Docker runs and release pipeline (docker build/push, metadata trigger) +- Update documentation (readme.hints.md) with PaddleOCR usage and add docker registry entry to npmextra.json + ## 2026-01-16 - 1.2.0 - feat(paddleocr) add PaddleOCR support: Docker images, FastAPI server, entrypoint and tests diff --git a/image_support_files/paddleocr_server.py b/image_support_files/paddleocr_server.py index be23a2a..f4650e9 100644 --- a/image_support_files/paddleocr_server.py +++ b/image_support_files/paddleocr_server.py @@ -26,6 +26,7 @@ logger = logging.getLogger(__name__) # Environment configuration OCR_LANGUAGE = os.environ.get('OCR_LANGUAGE', 'en') +# GPU is controlled via CUDA_VISIBLE_DEVICES environment variable USE_GPU = os.environ.get('CUDA_VISIBLE_DEVICES', '') != '-1' # Initialize FastAPI app @@ -72,19 +73,29 @@ class HealthResponse(BaseModel): gpu_enabled: bool -def get_ocr() -> PaddleOCR: +def get_ocr(lang: Optional[str] = None) -> PaddleOCR: """Get or initialize the OCR instance""" global ocr_instance - if ocr_instance is None: - logger.info(f"Initializing PaddleOCR with language={OCR_LANGUAGE}, use_gpu={USE_GPU}") - ocr_instance = PaddleOCR( - use_angle_cls=True, - lang=OCR_LANGUAGE, - use_gpu=USE_GPU, - show_log=False - ) - logger.info("PaddleOCR initialized successfully") - return ocr_instance + use_lang = lang or OCR_LANGUAGE + + # Return cached instance if same language + if ocr_instance is not None and lang is None: + return ocr_instance + + logger.info(f"Initializing PaddleOCR with language={use_lang}, use_gpu={USE_GPU}") + new_ocr = PaddleOCR( + use_angle_cls=True, + lang=use_lang, + use_gpu=USE_GPU, + show_log=False + ) + + # Cache the default language instance + if lang is None: + ocr_instance = new_ocr + + logger.info("PaddleOCR initialized successfully") + return new_ocr def decode_base64_image(base64_string: str) -> np.ndarray: @@ -176,20 +187,12 @@ async def ocr_base64(request: OCRRequest): image = decode_base64_image(request.image) # Get OCR instance (use request language 
if provided) - ocr = get_ocr() - - # If a different language is requested, create a new instance if request.language and request.language != OCR_LANGUAGE: - logger.info(f"Creating OCR instance for language: {request.language}") - temp_ocr = PaddleOCR( - use_angle_cls=True, - lang=request.language, - use_gpu=USE_GPU, - show_log=False - ) - result = temp_ocr.ocr(image, cls=True) + ocr = get_ocr(request.language) else: - result = ocr.ocr(image, cls=True) + ocr = get_ocr() + + result = ocr.ocr(image, cls=True) # Process results results = process_ocr_result(result) @@ -228,20 +231,12 @@ async def ocr_upload( image_array = np.array(image) # Get OCR instance - ocr = get_ocr() - - # If a different language is requested, create a new instance if language and language != OCR_LANGUAGE: - logger.info(f"Creating OCR instance for language: {language}") - temp_ocr = PaddleOCR( - use_angle_cls=True, - lang=language, - use_gpu=USE_GPU, - show_log=False - ) - result = temp_ocr.ocr(image_array, cls=True) + ocr = get_ocr(language) else: - result = ocr.ocr(image_array, cls=True) + ocr = get_ocr() + + result = ocr.ocr(image_array, cls=True) # Process results results = process_ocr_result(result) diff --git a/npmextra.json b/npmextra.json index c49f9f0..622ff98 100644 --- a/npmextra.json +++ b/npmextra.json @@ -1,7 +1,10 @@ { "npmci": { "npmGlobalTools": [], - "npmAccessLevel": "public" + "npmAccessLevel": "public", + "dockerRegistries": [ + "code.foss.global" + ] }, "gitzone": { "projectType": "docker", diff --git a/readme.hints.md b/readme.hints.md index 29a9824..e7abbc0 100644 --- a/readme.hints.md +++ b/readme.hints.md @@ -77,6 +77,81 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ CPU variant has longer `start-period` (120s) due to slower startup. +## PaddleOCR + +### Overview + +PaddleOCR is a standalone OCR service using PaddlePaddle's PP-OCRv4 model. 
It provides:
+
+- Text detection and recognition
+- Multi-language support
+- FastAPI REST API
+- GPU and CPU variants
+
+### Docker Images
+
+| Tag | Description |
+|-----|-------------|
+| `paddleocr` | GPU variant (default) |
+| `paddleocr-gpu` | GPU variant (alias) |
+| `paddleocr-cpu` | CPU-only variant |
+
+### API Endpoints
+
+| Endpoint | Method | Description |
+|----------|--------|-------------|
+| `/health` | GET | Health check with model info |
+| `/ocr` | POST | OCR with base64 image (JSON body) |
+| `/ocr/upload` | POST | OCR with file upload (multipart form) |
+
+### Request/Response Format
+
+**POST /ocr (JSON)**
+```json
+{
+  "image": "<base64-encoded image data>",
+  "language": "en" // optional
+}
+```
+
+**POST /ocr/upload (multipart)**
+- `img`: image file
+- `language`: optional language code
+
+**Response**
+```json
+{
+  "success": true,
+  "results": [
+    {
+      "text": "Invoice #12345",
+      "confidence": 0.98,
+      "box": [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
+    }
+  ]
+}
+```
+
+### Environment Variables
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `OCR_LANGUAGE` | `en` | Default language for OCR |
+| `SERVER_PORT` | `5000` | Server port |
+| `SERVER_HOST` | `0.0.0.0` | Server host |
+| `CUDA_VISIBLE_DEVICES` | (auto) | Set to `-1` for CPU-only |
+
+### Performance
+
+- **GPU**: ~1-3 seconds per page
+- **CPU**: ~10-30 seconds per page
+
+### Supported Languages
+
+Common language codes: `en` (English), `ch` (Chinese), `de` (German), `fr` (French), `es` (Spanish), `ja` (Japanese), `ko` (Korean)
+
+---
+
 ## Adding New Models
 
 To add a new model variant:
diff --git a/test/test.invoices.ts b/test/test.invoices.ts
index db5d60e..a3fda84 100644
--- a/test/test.invoices.ts
+++ b/test/test.invoices.ts
@@ -22,16 +22,11 @@ interface IInvoice {
 /**
  * Extract OCR text from an image using PaddleOCR
  */
 async function extractOcrText(imageBase64: string): Promise<string> {
-  const formData = new FormData();
-  const imageBuffer = Buffer.from(imageBase64, 'base64');
-  const blob = new Blob([imageBuffer], { type: 'image/png' });
-  formData.append('img', blob, 'image.png');
-  formData.append('outtype', 'json');
-
   try {
     const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
       method: 'POST',
-      body: formData,
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ image: imageBase64 }),
     });
 
     if (!response.ok) return '';
@@ -180,29 +175,64 @@ function hashInvoice(invoice: IInvoice): string {
 /**
  * Extract with majority voting - run until 2 passes match
+ * Optimization: Run Pass 1, OCR, and Pass 2 (after OCR) in parallel
  */
 async function extractWithConsensus(images: string[], invoiceName: string, maxPasses: number = 5): Promise<IInvoice> {
   const results: Array<{ invoice: IInvoice; hash: string }> = [];
   const hashCounts: Map<string, number> = new Map();
 
-  // Extract OCR text from first page
-  const ocrText = await extractOcrText(images[0]);
-  if (ocrText) {
-    console.log(` [OCR] Extracted ${ocrText.split('\n').length} text lines`);
+  const addResult = (invoice: IInvoice, passLabel: string): number => {
+    const hash = hashInvoice(invoice);
+    results.push({ invoice, hash });
+    hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1);
+    console.log(` [${passLabel}] ${invoice.invoice_number} | ${invoice.invoice_date} | ${invoice.total_amount} ${invoice.currency}`);
+    return hashCounts.get(hash)!;
+  };
+
+  // OPTIMIZATION: Run Pass 1 (no OCR) in parallel with OCR -> Pass 2 (with OCR)
+  let ocrText = '';
+  const pass1Promise = extractOnce(images, 1, '').catch((err) => ({ error: err }));
+
+  // OCR then immediately Pass 
2 + const ocrThenPass2Promise = (async () => { + ocrText = await extractOcrText(images[0]); + if (ocrText) { + console.log(` [OCR] Extracted ${ocrText.split('\n').length} text lines`); + } + return extractOnce(images, 2, ocrText).catch((err) => ({ error: err })); + })(); + + // Wait for both to complete + const [pass1Result, pass2Result] = await Promise.all([pass1Promise, ocrThenPass2Promise]); + + // Process Pass 1 result + if ('error' in pass1Result) { + console.log(` [Pass 1] Error: ${(pass1Result as {error: unknown}).error}`); + } else { + const count = addResult(pass1Result as IInvoice, 'Pass 1'); + if (count >= 2) { + console.log(` [Consensus] Reached after parallel passes`); + return pass1Result as IInvoice; + } } - for (let pass = 1; pass <= maxPasses; pass++) { + // Process Pass 2 result + if ('error' in pass2Result) { + console.log(` [Pass 2+OCR] Error: ${(pass2Result as {error: unknown}).error}`); + } else { + const count = addResult(pass2Result as IInvoice, 'Pass 2+OCR'); + if (count >= 2) { + console.log(` [Consensus] Reached after parallel passes`); + return pass2Result as IInvoice; + } + } + + // Continue with passes 3+ using OCR text if no consensus yet + for (let pass = 3; pass <= maxPasses; pass++) { try { const invoice = await extractOnce(images, pass, ocrText); - const hash = hashInvoice(invoice); + const count = addResult(invoice, `Pass ${pass}+OCR`); - results.push({ invoice, hash }); - hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1); - - console.log(` [Pass ${pass}] ${invoice.invoice_number} | ${invoice.invoice_date} | ${invoice.total_amount} ${invoice.currency}`); - - // Check if we have consensus (2+ matching) - const count = hashCounts.get(hash)!; if (count >= 2) { console.log(` [Consensus] Reached after ${pass} passes`); return invoice; @@ -267,6 +297,7 @@ function compareInvoice( /** * Find all test cases (PDF + JSON pairs) in .nogit/invoices/ + * Priority invoices (like vodafone) run first for quick feedback */ function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> { const testDir = path.join(process.cwd(), '.nogit/invoices'); @@ -290,6 +321,22 @@ function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: strin } } + // Sort with priority invoices first, then alphabetically + const priorityPrefixes = ['vodafone']; + testCases.sort((a, b) => { + const aPriority = priorityPrefixes.findIndex((p) => a.name.startsWith(p)); + const bPriority = priorityPrefixes.findIndex((p) => b.name.startsWith(p)); + + // Both have priority - sort by priority order + if (aPriority >= 0 && bPriority >= 0) return aPriority - bPriority; + // Only a has priority - a comes first + if (aPriority >= 0) return -1; + // Only b has priority - b comes first + if (bPriority >= 0) return 1; + // Neither has priority - alphabetical + return a.name.localeCompare(b.name); + }); + return testCases; } diff --git a/test/test.paddleocr.ts b/test/test.paddleocr.ts new file mode 100644 index 0000000..9fe6fb2 --- /dev/null +++ b/test/test.paddleocr.ts @@ -0,0 +1,258 @@ +import { tap, expect } from '@git.zone/tstest/tapbundle'; +import * as fs from 'fs'; +import * as path from 'path'; +import { execSync } from 'child_process'; +import * as os from 'os'; + +const PADDLEOCR_URL = 'http://localhost:5000'; + +interface IOCRResult { + text: string; + confidence: number; + box: number[][]; +} + +interface IOCRResponse { + success: boolean; + results: IOCRResult[]; + error?: string; +} + +interface IHealthResponse { + status: string; + model: string; + language: 
string; + gpu_enabled: boolean; +} + +/** + * Convert PDF first page to PNG using ImageMagick + */ +function convertPdfToImage(pdfPath: string): string { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-')); + const outputPath = path.join(tempDir, 'page.png'); + + try { + execSync( + `convert -density 200 -quality 90 "${pdfPath}[0]" -background white -alpha remove "${outputPath}"`, + { stdio: 'pipe' } + ); + + const imageData = fs.readFileSync(outputPath); + return imageData.toString('base64'); + } finally { + fs.rmSync(tempDir, { recursive: true, force: true }); + } +} + +/** + * Create a simple test image with text using ImageMagick + */ +function createTestImage(text: string): string { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'test-image-')); + const outputPath = path.join(tempDir, 'test.png'); + + try { + execSync( + `convert -size 400x100 xc:white -font DejaVu-Sans -pointsize 24 -fill black -gravity center -annotate 0 "${text}" "${outputPath}"`, + { stdio: 'pipe' } + ); + + const imageData = fs.readFileSync(outputPath); + return imageData.toString('base64'); + } finally { + fs.rmSync(tempDir, { recursive: true, force: true }); + } +} + +// Health check test +tap.test('should respond to health check', async () => { + const response = await fetch(`${PADDLEOCR_URL}/health`); + expect(response.ok).toBeTrue(); + + const data: IHealthResponse = await response.json(); + expect(data.status).toEqual('healthy'); + expect(data.model).toEqual('PP-OCRv4'); + expect(data.language).toBeTypeofString(); + expect(data.gpu_enabled).toBeTypeofBoolean(); + + console.log(`PaddleOCR Status: ${data.status}`); + console.log(` Model: ${data.model}`); + console.log(` Language: ${data.language}`); + console.log(` GPU Enabled: ${data.gpu_enabled}`); +}); + +// Base64 OCR test +tap.test('should perform OCR on base64 image', async () => { + // Create a test image with known text + const testText = 'Hello World 12345'; + console.log(`Creating test image with text: "${testText}"`); + + const imageBase64 = createTestImage(testText); + + const response = await fetch(`${PADDLEOCR_URL}/ocr`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ image: imageBase64 }), + }); + + expect(response.ok).toBeTrue(); + + const data: IOCRResponse = await response.json(); + expect(data.success).toBeTrue(); + expect(data.results).toBeArray(); + + const extractedText = data.results.map((r) => r.text).join(' '); + console.log(`Extracted text: "${extractedText}"`); + + // Check that we got some text back + expect(data.results.length).toBeGreaterThan(0); + + // Check that at least some of the expected text was found + const normalizedExtracted = extractedText.toLowerCase().replace(/\s+/g, ''); + const normalizedExpected = testText.toLowerCase().replace(/\s+/g, ''); + const hasPartialMatch = + normalizedExtracted.includes('hello') || + normalizedExtracted.includes('world') || + normalizedExtracted.includes('12345'); + + expect(hasPartialMatch).toBeTrue(); +}); + +// File upload OCR test +tap.test('should perform OCR via file upload', async () => { + const testText = 'Invoice Number 98765'; + console.log(`Creating test image with text: "${testText}"`); + + const imageBase64 = createTestImage(testText); + const imageBuffer = Buffer.from(imageBase64, 'base64'); + + const formData = new FormData(); + const blob = new Blob([imageBuffer], { type: 'image/png' }); + formData.append('img', blob, 'test.png'); + + const response = await fetch(`${PADDLEOCR_URL}/ocr/upload`, 
{ + method: 'POST', + body: formData, + }); + + expect(response.ok).toBeTrue(); + + const data: IOCRResponse = await response.json(); + expect(data.success).toBeTrue(); + expect(data.results).toBeArray(); + + const extractedText = data.results.map((r) => r.text).join(' '); + console.log(`Extracted text: "${extractedText}"`); + + // Check that we got some text back + expect(data.results.length).toBeGreaterThan(0); +}); + +// OCR result structure test +tap.test('should return proper OCR result structure', async () => { + const testText = 'Test 123'; + const imageBase64 = createTestImage(testText); + + const response = await fetch(`${PADDLEOCR_URL}/ocr`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ image: imageBase64 }), + }); + + const data: IOCRResponse = await response.json(); + + if (data.results.length > 0) { + const result = data.results[0]; + + // Check result has required fields + expect(result.text).toBeTypeofString(); + expect(result.confidence).toBeTypeofNumber(); + expect(result.box).toBeArray(); + + // Check bounding box structure (4 points, each with x,y) + expect(result.box.length).toEqual(4); + for (const point of result.box) { + expect(point.length).toEqual(2); + expect(point[0]).toBeTypeofNumber(); + expect(point[1]).toBeTypeofNumber(); + } + + // Confidence should be between 0 and 1 + expect(result.confidence).toBeGreaterThan(0); + expect(result.confidence).toBeLessThanOrEqual(1); + + console.log(`Result structure valid:`); + console.log(` Text: "${result.text}"`); + console.log(` Confidence: ${(result.confidence * 100).toFixed(1)}%`); + console.log(` Box: ${JSON.stringify(result.box)}`); + } +}); + +// Test with actual invoice if available +const invoiceDir = path.join(process.cwd(), '.nogit/invoices'); +if (fs.existsSync(invoiceDir)) { + const pdfFiles = fs.readdirSync(invoiceDir).filter((f) => f.endsWith('.pdf')); + + if (pdfFiles.length > 0) { + const testPdf = pdfFiles[0]; + tap.test(`should extract text from invoice: ${testPdf}`, async () => { + const pdfPath = path.join(invoiceDir, testPdf); + console.log(`Converting ${testPdf} to image...`); + + const imageBase64 = convertPdfToImage(pdfPath); + console.log(`Image size: ${(imageBase64.length / 1024).toFixed(1)} KB`); + + const startTime = Date.now(); + + const response = await fetch(`${PADDLEOCR_URL}/ocr`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ image: imageBase64 }), + }); + + const endTime = Date.now(); + const elapsedMs = endTime - startTime; + + expect(response.ok).toBeTrue(); + + const data: IOCRResponse = await response.json(); + expect(data.success).toBeTrue(); + + console.log(`OCR completed in ${(elapsedMs / 1000).toFixed(2)}s`); + console.log(`Found ${data.results.length} text regions`); + + // Print first 10 results + const preview = data.results.slice(0, 10); + console.log(`\nFirst ${preview.length} results:`); + for (const result of preview) { + console.log(` [${(result.confidence * 100).toFixed(0)}%] ${result.text}`); + } + + if (data.results.length > 10) { + console.log(` ... and ${data.results.length - 10} more`); + } + + // Should find text in an invoice + expect(data.results.length).toBeGreaterThan(5); + }); + } +} + +// Error handling test +tap.test('should handle invalid base64 gracefully', async () => { + const response = await fetch(`${PADDLEOCR_URL}/ocr`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ image: 'not-valid-base64!!!' 
}),
+  });
+
+  const data: IOCRResponse = await response.json();
+
+  // Should return success: false with error message
+  expect(data.success).toBeFalse();
+  expect(data.error).toBeTypeofString();
+  console.log(`Error handling works: ${data.error}`);
+});
+
+export default tap.start();
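
For quick reference, a minimal TypeScript client for the `/ocr` endpoint documented in readme.hints.md above could look like the sketch below. It assumes the service is reachable at `http://localhost:5000` and reuses the request body (`image`, optional `language`) and response shape (`success`, `results`, `error`) described there, mirrored by the `IOCRResult`/`IOCRResponse` interfaces in test/test.paddleocr.ts; treat it as an illustrative sketch rather than code shipped by this patch.

```typescript
// Illustrative sketch: send a base64-encoded image to the PaddleOCR service
// and join the recognized text regions into plain text.
// Assumes the container is running locally on port 5000 (SERVER_PORT default).

interface IOCRResult {
  text: string;
  confidence: number;
  box: number[][];
}

interface IOCRResponse {
  success: boolean;
  results: IOCRResult[];
  error?: string;
}

async function recognizeText(imageBase64: string, language?: string): Promise<string> {
  const response = await fetch('http://localhost:5000/ocr', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ image: imageBase64, language }),
  });
  if (!response.ok) {
    throw new Error(`OCR request failed with HTTP ${response.status}`);
  }
  const data: IOCRResponse = await response.json();
  if (!data.success) {
    throw new Error(data.error ?? 'OCR failed');
  }
  // One entry per detected text region; concatenate them line by line.
  return data.results.map((result) => result.text).join('\n');
}
```

Calling `recognizeText(imageBase64, 'de')` would request German recognition, while omitting the second argument falls back to the container's `OCR_LANGUAGE` default.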