Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 0d8a1ebac2 | | |
| | 5a311dca2d | | |
| | ab288380f1 | | |
| | 30c73b24c1 | | |
@@ -14,7 +14,7 @@ ENV OLLAMA_ORIGINS="*"
|
|||||||
ENV CUDA_VISIBLE_DEVICES=""
|
ENV CUDA_VISIBLE_DEVICES=""
|
||||||
|
|
||||||
# Copy and setup entrypoint
|
# Copy and setup entrypoint
|
||||||
COPY image_support_files/docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
|
COPY image_support_files/minicpm45v_entrypoint.sh /usr/local/bin/docker-entrypoint.sh
|
||||||
RUN chmod +x /usr/local/bin/docker-entrypoint.sh
|
RUN chmod +x /usr/local/bin/docker-entrypoint.sh
|
||||||
|
|
||||||
# Expose Ollama API port
|
# Expose Ollama API port
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ ENV OLLAMA_HOST="0.0.0.0"
|
|||||||
ENV OLLAMA_ORIGINS="*"
|
ENV OLLAMA_ORIGINS="*"
|
||||||
|
|
||||||
# Copy and setup entrypoint
|
# Copy and setup entrypoint
|
||||||
COPY image_support_files/docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
|
COPY image_support_files/minicpm45v_entrypoint.sh /usr/local/bin/docker-entrypoint.sh
|
||||||
RUN chmod +x /usr/local/bin/docker-entrypoint.sh
|
RUN chmod +x /usr/local/bin/docker-entrypoint.sh
|
||||||
|
|
||||||
# Expose Ollama API port
|
# Expose Ollama API port
|
||||||
@@ -1,70 +0,0 @@
|
|||||||
# PaddleOCR-VL GPU Variant
|
|
||||||
# Vision-Language Model for document parsing using vLLM
|
|
||||||
FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
|
|
||||||
|
|
||||||
LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
|
|
||||||
LABEL description="PaddleOCR-VL 0.9B - Vision-Language Model for document parsing"
|
|
||||||
LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
|
|
||||||
|
|
||||||
# Environment configuration
|
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
|
||||||
ENV PYTHONUNBUFFERED=1
|
|
||||||
ENV HF_HOME=/root/.cache/huggingface
|
|
||||||
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
|
|
||||||
|
|
||||||
# Set working directory
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
# Install system dependencies
|
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
||||||
python3.11 \
|
|
||||||
python3.11-venv \
|
|
||||||
python3.11-dev \
|
|
||||||
python3-pip \
|
|
||||||
git \
|
|
||||||
curl \
|
|
||||||
build-essential \
|
|
||||||
&& rm -rf /var/lib/apt/lists/* \
|
|
||||||
&& update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 \
|
|
||||||
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
|
|
||||||
|
|
||||||
# Create and activate virtual environment
|
|
||||||
RUN python -m venv /opt/venv
|
|
||||||
ENV PATH="/opt/venv/bin:$PATH"
|
|
||||||
|
|
||||||
# Install PyTorch with CUDA support
|
|
||||||
RUN pip install --no-cache-dir --upgrade pip && \
|
|
||||||
pip install --no-cache-dir \
|
|
||||||
torch==2.5.1 \
|
|
||||||
torchvision \
|
|
||||||
--index-url https://download.pytorch.org/whl/cu124
|
|
||||||
|
|
||||||
# Install vLLM 0.11.1 (first stable release with PaddleOCR-VL support)
|
|
||||||
RUN pip install --no-cache-dir \
|
|
||||||
vllm==0.11.1 \
|
|
||||||
--extra-index-url https://download.pytorch.org/whl/cu124
|
|
||||||
|
|
||||||
# Install additional dependencies
|
|
||||||
RUN pip install --no-cache-dir \
|
|
||||||
transformers \
|
|
||||||
accelerate \
|
|
||||||
safetensors \
|
|
||||||
pillow \
|
|
||||||
fastapi \
|
|
||||||
uvicorn[standard] \
|
|
||||||
python-multipart \
|
|
||||||
openai \
|
|
||||||
httpx
|
|
||||||
|
|
||||||
# Copy entrypoint script
|
|
||||||
COPY image_support_files/paddleocr-vl-entrypoint.sh /usr/local/bin/paddleocr-vl-entrypoint.sh
|
|
||||||
RUN chmod +x /usr/local/bin/paddleocr-vl-entrypoint.sh
|
|
||||||
|
|
||||||
# Expose vLLM API port
|
|
||||||
EXPOSE 8000
|
|
||||||
|
|
||||||
# Health check
|
|
||||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=300s --retries=3 \
|
|
||||||
CMD curl -f http://localhost:8000/health || exit 1
|
|
||||||
|
|
||||||
ENTRYPOINT ["/usr/local/bin/paddleocr-vl-entrypoint.sh"]
|
|
||||||
@@ -44,7 +44,7 @@ RUN pip install --no-cache-dir --upgrade pip && \
|
|||||||
|
|
||||||
# Copy server files
|
# Copy server files
|
||||||
COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py
|
COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py
|
||||||
COPY image_support_files/paddleocr-vl-cpu-entrypoint.sh /usr/local/bin/paddleocr-vl-cpu-entrypoint.sh
|
COPY image_support_files/paddleocr_vl_entrypoint.sh /usr/local/bin/paddleocr-vl-cpu-entrypoint.sh
|
||||||
RUN chmod +x /usr/local/bin/paddleocr-vl-cpu-entrypoint.sh
|
RUN chmod +x /usr/local/bin/paddleocr-vl-cpu-entrypoint.sh
|
||||||
|
|
||||||
# Expose API port
|
# Expose API port
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ RUN pip install --no-cache-dir \
|
|||||||
|
|
||||||
# Copy server files (same as CPU variant - it auto-detects CUDA)
|
# Copy server files (same as CPU variant - it auto-detects CUDA)
|
||||||
COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py
|
COPY image_support_files/paddleocr_vl_server.py /app/paddleocr_vl_server.py
|
||||||
COPY image_support_files/paddleocr-vl-cpu-entrypoint.sh /usr/local/bin/paddleocr-vl-entrypoint.sh
|
COPY image_support_files/paddleocr_vl_entrypoint.sh /usr/local/bin/paddleocr-vl-entrypoint.sh
|
||||||
RUN chmod +x /usr/local/bin/paddleocr-vl-entrypoint.sh
|
RUN chmod +x /usr/local/bin/paddleocr-vl-entrypoint.sh
|
||||||
|
|
||||||
# Expose API port
|
# Expose API port
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ echo -e "${BLUE}Building ht-docker-ai images...${NC}"
|
|||||||
# Build GPU variant
|
# Build GPU variant
|
||||||
echo -e "${GREEN}Building MiniCPM-V 4.5 GPU variant...${NC}"
|
echo -e "${GREEN}Building MiniCPM-V 4.5 GPU variant...${NC}"
|
||||||
docker build \
|
docker build \
|
||||||
-f Dockerfile_minicpm45v \
|
-f Dockerfile_minicpm45v_gpu \
|
||||||
-t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v \
|
-t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v \
|
||||||
-t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v-gpu \
|
-t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v-gpu \
|
||||||
-t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:latest \
|
-t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:latest \
|
||||||
@@ -29,10 +29,10 @@ docker build \
|
|||||||
-t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v-cpu \
|
-t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:minicpm45v-cpu \
|
||||||
.
|
.
|
||||||
|
|
||||||
# Build PaddleOCR-VL GPU variant (vLLM)
|
# Build PaddleOCR-VL GPU variant
|
||||||
echo -e "${GREEN}Building PaddleOCR-VL GPU variant (vLLM)...${NC}"
|
echo -e "${GREEN}Building PaddleOCR-VL GPU variant...${NC}"
|
||||||
docker build \
|
docker build \
|
||||||
-f Dockerfile_paddleocr_vl \
|
-f Dockerfile_paddleocr_vl_gpu \
|
||||||
-t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl \
|
-t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl \
|
||||||
-t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-gpu \
|
-t ${REGISTRY}/${NAMESPACE}/${IMAGE_NAME}:paddleocr-vl-gpu \
|
||||||
.
|
.
|
||||||
|
|||||||
18
changelog.md
18
changelog.md
@@ -1,5 +1,23 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## 2026-01-17 - 1.7.1 - fix(docker)
|
||||||
|
standardize Dockerfile and entrypoint filenames; add GPU-specific Dockerfiles and update build and test references
|
||||||
|
|
||||||
|
- Added Dockerfile_minicpm45v_gpu and image_support_files/minicpm45v_entrypoint.sh; removed the old Dockerfile_minicpm45v and docker-entrypoint.sh
|
||||||
|
- Renamed and simplified PaddleOCR entrypoint to image_support_files/paddleocr_vl_entrypoint.sh and updated CPU/GPU Dockerfile references
|
||||||
|
- Updated build-images.sh to use *_gpu Dockerfiles and clarified PaddleOCR GPU build log
|
||||||
|
- Updated test/helpers/docker.ts to point to Dockerfile_minicpm45v_gpu so tests build the GPU variant
|
||||||
|
|
||||||
|
## 2026-01-17 - 1.7.0 - feat(tests)
|
||||||
|
use Qwen2.5 (Ollama) for invoice extraction tests and add helpers for model management; normalize dates and coerce numeric fields
|
||||||
|
|
||||||
|
- Added ensureOllamaModel and ensureQwen25 test helpers to pull/check Ollama models via localhost:11434
|
||||||
|
- Updated invoices test to use qwen2.5:7b instead of MiniCPM and removed image payload from the text-only extraction step
|
||||||
|
- Increased Markdown truncate limit from 8000 to 12000 and reduced model num_predict from 2048 to 512
|
||||||
|
- Rewrote extraction prompt to require strict JSON output and added post-processing to parse/convert numeric fields
|
||||||
|
- Added normalizeDate and improved compareInvoice to normalize dates and handle numeric formatting/tolerance
|
||||||
|
- Updated test setup to ensure Qwen2.5 is available and adjusted logging/messages to reflect the Qwen2.5-based workflow
|
||||||
|
|
||||||
## 2026-01-17 - 1.6.0 - feat(paddleocr-vl)
|
## 2026-01-17 - 1.6.0 - feat(paddleocr-vl)
|
||||||
add PaddleOCR-VL full pipeline Docker image and API server, plus integration tests and docker helpers
|
add PaddleOCR-VL full pipeline Docker image and API server, plus integration tests and docker helpers
|
||||||
|
|
||||||
|
|||||||
@@ -1,59 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
set -e
|
|
||||||
|
|
||||||
echo "==================================="
|
|
||||||
echo "PaddleOCR-VL Server"
|
|
||||||
echo "==================================="
|
|
||||||
|
|
||||||
# Configuration
|
|
||||||
MODEL_NAME="${MODEL_NAME:-PaddlePaddle/PaddleOCR-VL}"
|
|
||||||
HOST="${HOST:-0.0.0.0}"
|
|
||||||
PORT="${PORT:-8000}"
|
|
||||||
MAX_BATCHED_TOKENS="${MAX_BATCHED_TOKENS:-16384}"
|
|
||||||
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.9}"
|
|
||||||
MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}"
|
|
||||||
ENFORCE_EAGER="${ENFORCE_EAGER:-false}"
|
|
||||||
|
|
||||||
echo "Model: ${MODEL_NAME}"
|
|
||||||
echo "Host: ${HOST}"
|
|
||||||
echo "Port: ${PORT}"
|
|
||||||
echo "Max batched tokens: ${MAX_BATCHED_TOKENS}"
|
|
||||||
echo "GPU memory utilization: ${GPU_MEMORY_UTILIZATION}"
|
|
||||||
echo "Max model length: ${MAX_MODEL_LEN}"
|
|
||||||
echo "Enforce eager: ${ENFORCE_EAGER}"
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
# Check GPU availability
|
|
||||||
if command -v nvidia-smi &> /dev/null; then
|
|
||||||
echo "GPU Information:"
|
|
||||||
nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv
|
|
||||||
echo ""
|
|
||||||
else
|
|
||||||
echo "WARNING: nvidia-smi not found. GPU may not be available."
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Starting vLLM server..."
|
|
||||||
echo "==================================="
|
|
||||||
|
|
||||||
# Build vLLM command
|
|
||||||
VLLM_ARGS=(
|
|
||||||
serve "${MODEL_NAME}"
|
|
||||||
--trust-remote-code
|
|
||||||
--host "${HOST}"
|
|
||||||
--port "${PORT}"
|
|
||||||
--max-num-batched-tokens "${MAX_BATCHED_TOKENS}"
|
|
||||||
--gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"
|
|
||||||
--max-model-len "${MAX_MODEL_LEN}"
|
|
||||||
--no-enable-prefix-caching
|
|
||||||
--mm-processor-cache-gb 0
|
|
||||||
--served-model-name "paddleocr-vl"
|
|
||||||
--limit-mm-per-prompt '{"image": 1}'
|
|
||||||
)
|
|
||||||
|
|
||||||
# Add enforce-eager if enabled (disables CUDA graphs, saves memory)
|
|
||||||
if [ "${ENFORCE_EAGER}" = "true" ]; then
|
|
||||||
VLLM_ARGS+=(--enforce-eager)
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Start vLLM server with PaddleOCR-VL
|
|
||||||
exec vllm "${VLLM_ARGS[@]}"
|
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@host.today/ht-docker-ai",
|
"name": "@host.today/ht-docker-ai",
|
||||||
"version": "1.6.0",
|
"version": "1.7.1",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"private": false,
|
"private": false,
|
||||||
"description": "Docker images for AI vision-language models including MiniCPM-V 4.5",
|
"description": "Docker images for AI vision-language models including MiniCPM-V 4.5",
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ export const IMAGES = {
|
|||||||
|
|
||||||
minicpm: {
|
minicpm: {
|
||||||
name: 'minicpm45v',
|
name: 'minicpm45v',
|
||||||
dockerfile: 'Dockerfile_minicpm45v',
|
dockerfile: 'Dockerfile_minicpm45v_gpu',
|
||||||
buildContext: '.',
|
buildContext: '.',
|
||||||
containerName: 'minicpm-test',
|
containerName: 'minicpm-test',
|
||||||
ports: ['11434:11434'],
|
ports: ['11434:11434'],
|
||||||
@@ -295,3 +295,66 @@ export async function ensurePaddleOcrVlFull(): Promise<boolean> {
|
|||||||
}
|
}
|
||||||
return ensureService(IMAGES.paddleocrVlFull);
|
return ensureService(IMAGES.paddleocrVlFull);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Ensure an Ollama model is pulled and available.
 *
 * Talks to the Ollama HTTP API on localhost:11434 (served by the MiniCPM
 * container): lists the installed models via `/api/tags` and, when the
 * requested one is not listed, requests a pull via `/api/pull`.
 *
 * @param modelName Model reference such as `qwen2.5:7b`. A reference without
 *   a tag is compared as `<name>:latest`.
 * @returns `true` when the model is already listed or the pull request
 *   returned an OK response; `false` when the tag listing request throws,
 *   or when the pull request throws or returns a non-OK status.
 */
export async function ensureOllamaModel(modelName: string): Promise<boolean> {
  const OLLAMA_URL = 'http://localhost:11434';

  // Compare on the full "name:tag" reference. Matching on the base name only
  // (the previous behaviour) treated e.g. "qwen2.5:0.5b" or
  // "qwen2.5-coder:latest" as if "qwen2.5:7b" were installed, so the model the
  // caller actually needs was never pulled.
  const wantedName = modelName.includes(':') ? modelName : `${modelName}:latest`;

  console.log(`\n[Ollama] Ensuring model: ${modelName}`);

  // Check if model exists
  try {
    const response = await fetch(`${OLLAMA_URL}/api/tags`);
    if (response.ok) {
      const data = await response.json();
      const models: Array<{ name: string }> = data.models || [];
      const exists = models.some(
        (m) => m.name === modelName || m.name === wantedName
      );

      if (exists) {
        console.log(`[Ollama] Model already available: ${modelName}`);
        return true;
      }
    }
    // A non-OK listing response falls through to the pull attempt below.
  } catch {
    console.log(`[Ollama] Cannot check models, Ollama may not be running`);
    return false;
  }

  // Pull the model
  console.log(`[Ollama] Pulling model: ${modelName} (this may take a while)...`);
  try {
    const response = await fetch(`${OLLAMA_URL}/api/pull`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      // stream: false asks for a single response instead of progress events
      body: JSON.stringify({ name: modelName, stream: false }),
    });

    if (response.ok) {
      console.log(`[Ollama] Model pulled successfully: ${modelName}`);
      return true;
    } else {
      console.log(`[Ollama] Failed to pull model: ${response.status}`);
      return false;
    }
  } catch (err) {
    console.log(`[Ollama] Error pulling model: ${err}`);
    return false;
  }
}
|
||||||
|
|
||||||
|
/**
 * Make the Qwen2.5 7B model available for text-only JSON extraction.
 *
 * Brings up the Ollama service (the MiniCPM container) first and only then
 * asks for the `qwen2.5:7b` model; resolves to `false` as soon as the
 * service step reports failure.
 */
export async function ensureQwen25(): Promise<boolean> {
  // The model can only be pulled once the Ollama service is reachable
  const serviceReady = await ensureMiniCpm();

  if (serviceReady) {
    // Service is up: make sure the Qwen2.5 model itself is present
    return ensureOllamaModel('qwen2.5:7b');
  }

  return false;
}
|
||||||
|
|||||||
@@ -15,11 +15,12 @@ import * as fs from 'fs';
|
|||||||
import * as path from 'path';
|
import * as path from 'path';
|
||||||
import { execSync } from 'child_process';
|
import { execSync } from 'child_process';
|
||||||
import * as os from 'os';
|
import * as os from 'os';
|
||||||
import { ensurePaddleOcrVlFull, ensureMiniCpm } from './helpers/docker.js';
|
import { ensurePaddleOcrVlFull, ensureQwen25 } from './helpers/docker.js';
|
||||||
|
|
||||||
const PADDLEOCR_VL_URL = 'http://localhost:8000';
|
const PADDLEOCR_VL_URL = 'http://localhost:8000';
|
||||||
const OLLAMA_URL = 'http://localhost:11434';
|
const OLLAMA_URL = 'http://localhost:11434';
|
||||||
const MINICPM_MODEL = 'minicpm-v:latest';
|
// Use Qwen2.5 for text-only JSON extraction (not MiniCPM which is vision-focused)
|
||||||
|
const TEXT_MODEL = 'qwen2.5:7b';
|
||||||
|
|
||||||
interface IInvoice {
|
interface IInvoice {
|
||||||
invoice_number: string;
|
invoice_number: string;
|
||||||
@@ -87,42 +88,45 @@ async function parseDocument(imageBase64: string): Promise<string> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract invoice fields from structured Markdown using MiniCPM with image context
|
* Extract invoice fields from structured Markdown using Qwen2.5 (text-only model)
|
||||||
*/
|
*/
|
||||||
async function extractInvoiceFromMarkdown(markdown: string, images: string[]): Promise<IInvoice> {
|
async function extractInvoiceFromMarkdown(markdown: string): Promise<IInvoice> {
|
||||||
// Truncate if too long
|
// Truncate if too long
|
||||||
const truncated = markdown.length > 8000 ? markdown.slice(0, 8000) : markdown;
|
const truncated = markdown.length > 12000 ? markdown.slice(0, 12000) : markdown;
|
||||||
console.log(` [Extract] Processing ${truncated.length} chars of Markdown`);
|
console.log(` [Extract] Processing ${truncated.length} chars of Markdown`);
|
||||||
|
|
||||||
const prompt = `/nothink
|
const prompt = `You are an invoice data extractor. Extract the following fields from this OCR text and return ONLY a valid JSON object.
|
||||||
You are an invoice parser. Extract fields from this invoice image.
|
|
||||||
|
|
||||||
Required fields:
|
Required fields:
|
||||||
- invoice_number: The invoice/receipt number
|
- invoice_number: The invoice/receipt/document number
|
||||||
- invoice_date: Date in YYYY-MM-DD format
|
- invoice_date: Date in YYYY-MM-DD format (convert from any format)
|
||||||
- vendor_name: Company that issued the invoice
|
- vendor_name: Company that issued the invoice
|
||||||
- currency: EUR, USD, etc.
|
- currency: EUR, USD, GBP, etc.
|
||||||
- net_amount: Amount before tax
|
- net_amount: Amount before tax (number)
|
||||||
- vat_amount: Tax/VAT amount (0 if reverse charge)
|
- vat_amount: Tax/VAT amount (number, use 0 if reverse charge or not shown)
|
||||||
- total_amount: Final amount due
|
- total_amount: Final total amount (number)
|
||||||
|
|
||||||
Return ONLY a JSON object like:
|
Example output format:
|
||||||
{"invoice_number":"123","invoice_date":"2022-01-28","vendor_name":"Adobe","currency":"EUR","net_amount":24.99,"vat_amount":0,"total_amount":24.99}
|
{"invoice_number":"INV-123","invoice_date":"2022-01-28","vendor_name":"Adobe","currency":"EUR","net_amount":24.99,"vat_amount":0,"total_amount":24.99}
|
||||||
|
|
||||||
Use null for missing strings, 0 for missing numbers. No explanation.
|
Rules:
|
||||||
|
- Return ONLY the JSON object, no explanation or markdown
|
||||||
|
- Use null for missing string fields
|
||||||
|
- Use 0 for missing numeric fields
|
||||||
|
- Convert dates to YYYY-MM-DD format (e.g., "28-JAN-2022" becomes "2022-01-28")
|
||||||
|
- Extract numbers without currency symbols
|
||||||
|
|
||||||
OCR text from the invoice (for reference):
|
OCR Text:
|
||||||
---
|
|
||||||
${truncated}
|
${truncated}
|
||||||
---`;
|
|
||||||
|
JSON:`;
|
||||||
|
|
||||||
const payload = {
|
const payload = {
|
||||||
model: MINICPM_MODEL,
|
model: TEXT_MODEL,
|
||||||
prompt,
|
prompt,
|
||||||
images, // Send the actual image to MiniCPM
|
|
||||||
stream: true,
|
stream: true,
|
||||||
options: {
|
options: {
|
||||||
num_predict: 2048,
|
num_predict: 512,
|
||||||
temperature: 0.1,
|
temperature: 0.1,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
@@ -173,26 +177,41 @@ ${truncated}
|
|||||||
}
|
}
|
||||||
|
|
||||||
const jsonStr = fullText.substring(startIdx, endIdx);
|
const jsonStr = fullText.substring(startIdx, endIdx);
|
||||||
return JSON.parse(jsonStr);
|
const parsed = JSON.parse(jsonStr);
|
||||||
|
|
||||||
|
// Ensure numeric fields are actually numbers
|
||||||
|
return {
|
||||||
|
invoice_number: parsed.invoice_number || null,
|
||||||
|
invoice_date: parsed.invoice_date || null,
|
||||||
|
vendor_name: parsed.vendor_name || null,
|
||||||
|
currency: parsed.currency || 'EUR',
|
||||||
|
net_amount: parseFloat(parsed.net_amount) || 0,
|
||||||
|
vat_amount: parseFloat(parsed.vat_amount) || 0,
|
||||||
|
total_amount: parseFloat(parsed.total_amount) || 0,
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Single extraction pass: Parse with PaddleOCR-VL Full, extract with MiniCPM
|
* Single extraction pass: Parse with PaddleOCR-VL Full, extract with Qwen2.5 (text-only)
|
||||||
*/
|
*/
|
||||||
async function extractOnce(images: string[], passNum: number): Promise<IInvoice> {
|
async function extractOnce(images: string[], passNum: number): Promise<IInvoice> {
|
||||||
// Parse document with full pipeline
|
// Parse document with full pipeline (PaddleOCR-VL)
|
||||||
const markdown = await parseDocument(images[0]);
|
const markdown = await parseDocument(images[0]);
|
||||||
console.log(` [Parse] Got ${markdown.split('\n').length} lines of Markdown`);
|
console.log(` [Parse] Got ${markdown.split('\n').length} lines of Markdown`);
|
||||||
|
|
||||||
// Extract invoice fields from Markdown with image context
|
// Extract invoice fields from Markdown using text-only model (no images)
|
||||||
return extractInvoiceFromMarkdown(markdown, images);
|
return extractInvoiceFromMarkdown(markdown);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a hash of invoice for comparison (using key fields)
|
* Create a hash of invoice for comparison (using key fields)
|
||||||
*/
|
*/
|
||||||
function hashInvoice(invoice: IInvoice): string {
|
function hashInvoice(invoice: IInvoice): string {
|
||||||
return `${invoice.invoice_number}|${invoice.invoice_date}|${invoice.total_amount.toFixed(2)}`;
|
// Ensure total_amount is a number
|
||||||
|
const amount = typeof invoice.total_amount === 'number'
|
||||||
|
? invoice.total_amount.toFixed(2)
|
||||||
|
: String(invoice.total_amount || 0);
|
||||||
|
return `${invoice.invoice_number}|${invoice.invoice_date}|${amount}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -243,6 +262,43 @@ async function extractWithConsensus(images: string[], invoiceName: string, maxPa
|
|||||||
return best.invoice;
|
return best.invoice;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Normalize a date string to YYYY-MM-DD format.
 *
 * Recognised inputs:
 *  - `YYYY-MM-DD` (returned as-is)
 *  - `DD-MMM-YYYY` with an English three-letter month, case-insensitive
 *    (e.g. "28-JUN-2022")
 *  - `DD/MM/YYYY` or `DD.MM.YYYY`
 *
 * @param dateStr Date text to normalize, or `null`.
 * @returns The normalized date; an empty string for `null`/empty input; the
 *   input unchanged when the format (or the month abbreviation) is not
 *   recognised.
 */
function normalizeDate(dateStr: string | null): string {
  if (!dateStr) return '';

  // Already in correct format
  if (/^\d{4}-\d{2}-\d{2}$/.test(dateStr)) {
    return dateStr;
  }

  // Handle DD-MMM-YYYY format (e.g., "28-JUN-2022")
  const monthMap: Record<string, string> = {
    JAN: '01', FEB: '02', MAR: '03', APR: '04', MAY: '05', JUN: '06',
    JUL: '07', AUG: '08', SEP: '09', OCT: '10', NOV: '11', DEC: '12',
  };

  const match = dateStr.match(/^(\d{1,2})-([A-Z]{3})-(\d{4})$/i);
  if (match) {
    const month = monthMap[match[2].toUpperCase()];
    // An unknown abbreviation must not be coerced to January: that would
    // fabricate a valid-looking date and could make a comparison pass by
    // accident. Leave the text untouched, like any other unrecognised format.
    if (!month) {
      return dateStr;
    }
    const day = match[1].padStart(2, '0');
    const year = match[3];
    return `${year}-${month}-${day}`;
  }

  // Handle DD/MM/YYYY or DD.MM.YYYY
  const match2 = dateStr.match(/^(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})$/);
  if (match2) {
    const day = match2[1].padStart(2, '0');
    const month = match2[2].padStart(2, '0');
    const year = match2[3];
    return `${year}-${month}-${day}`;
  }

  return dateStr;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compare extracted invoice against expected
|
* Compare extracted invoice against expected
|
||||||
*/
|
*/
|
||||||
@@ -259,8 +315,10 @@ function compareInvoice(
|
|||||||
errors.push(`invoice_number: expected "${expected.invoice_number}", got "${extracted.invoice_number}"`);
|
errors.push(`invoice_number: expected "${expected.invoice_number}", got "${extracted.invoice_number}"`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compare date
|
// Compare date (normalize format first)
|
||||||
if (extracted.invoice_date !== expected.invoice_date) {
|
const extDate = normalizeDate(extracted.invoice_date);
|
||||||
|
const expDate = normalizeDate(expected.invoice_date);
|
||||||
|
if (extDate !== expDate) {
|
||||||
errors.push(`invoice_date: expected "${expected.invoice_date}", got "${extracted.invoice_date}"`);
|
errors.push(`invoice_date: expected "${expected.invoice_date}", got "${extracted.invoice_date}"`);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -317,9 +375,9 @@ tap.test('setup: ensure Docker containers are running', async () => {
|
|||||||
const paddleOk = await ensurePaddleOcrVlFull();
|
const paddleOk = await ensurePaddleOcrVlFull();
|
||||||
expect(paddleOk).toBeTrue();
|
expect(paddleOk).toBeTrue();
|
||||||
|
|
||||||
// Ensure MiniCPM is running (for field extraction from Markdown)
|
// Ensure Qwen2.5 is available (for text-only JSON extraction)
|
||||||
const minicpmOk = await ensureMiniCpm();
|
const qwenOk = await ensureQwen25();
|
||||||
expect(minicpmOk).toBeTrue();
|
expect(qwenOk).toBeTrue();
|
||||||
|
|
||||||
console.log('\n[Setup] All containers ready!\n');
|
console.log('\n[Setup] All containers ready!\n');
|
||||||
});
|
});
|
||||||
@@ -380,7 +438,7 @@ tap.test('summary', async () => {
|
|||||||
console.log(`\n======================================================`);
|
console.log(`\n======================================================`);
|
||||||
console.log(` Invoice Extraction Summary (PaddleOCR-VL Full)`);
|
console.log(` Invoice Extraction Summary (PaddleOCR-VL Full)`);
|
||||||
console.log(`======================================================`);
|
console.log(`======================================================`);
|
||||||
console.log(` Method: PaddleOCR-VL Full Pipeline -> MiniCPM`);
|
console.log(` Method: PaddleOCR-VL Full Pipeline -> Qwen2.5 (text-only)`);
|
||||||
console.log(` Passed: ${passedCount}/${totalInvoices}`);
|
console.log(` Passed: ${passedCount}/${totalInvoices}`);
|
||||||
console.log(` Failed: ${failedCount}/${totalInvoices}`);
|
console.log(` Failed: ${failedCount}/${totalInvoices}`);
|
||||||
console.log(` Accuracy: ${accuracy.toFixed(1)}%`);
|
console.log(` Accuracy: ${accuracy.toFixed(1)}%`);
|
||||||
|
|||||||
Reference in New Issue
Block a user