5 Commits

Author / SHA1 / Message / Date

6dbd06073b  v1.13.2  2026-01-18 23:00:24 +00:00
    Some checks failed: Docker (tags) / security (push) successful in 31s; test (push) failing after 40s; release (push) skipped; metadata (push) skipped
ae28a64902  fix(tests): stabilize OCR extraction tests and manage GPU containers  2026-01-18 23:00:24 +00:00
09ea7440e8  update  2026-01-18 15:54:16 +00:00
177e87d3b8  v1.13.1  2026-01-18 13:58:26 +00:00
    Some checks failed: Docker (tags) / security (push) successful in 24s; test (push) failing after 40s; release (push) skipped; metadata (push) skipped
17ea7717eb  fix(image_support_files): remove PaddleOCR-VL server scripts from image_support_files  2026-01-18 13:58:26 +00:00
9 changed files with 1372 additions and 1180 deletions

Dockerfile_nanonets_ocr (new file, 33 lines)

@@ -0,0 +1,33 @@
# Nanonets-OCR-s Vision Language Model
# Based on Qwen2.5-VL-3B, fine-tuned for document OCR
# ~8-10GB VRAM, outputs structured markdown with semantic tags
#
# Build: docker build -f Dockerfile_nanonets_ocr -t nanonets-ocr .
# Run: docker run --gpus all -p 8000:8000 -v ht-huggingface-cache:/root/.cache/huggingface nanonets-ocr
FROM vllm/vllm-openai:latest
LABEL maintainer="Task Venture Capital GmbH <hello@task.vc>"
LABEL description="Nanonets-OCR-s - Document OCR optimized Vision Language Model"
LABEL org.opencontainers.image.source="https://code.foss.global/host.today/ht-docker-ai"
# Environment configuration
ENV MODEL_NAME="nanonets/Nanonets-OCR-s"
ENV HOST="0.0.0.0"
ENV PORT="8000"
ENV MAX_MODEL_LEN="8192"
ENV GPU_MEMORY_UTILIZATION="0.9"
# Expose OpenAI-compatible API port
EXPOSE 8000
# Health check - vLLM exposes /health endpoint
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=5 \
CMD curl -f http://localhost:8000/health || exit 1
# Start vLLM server with Nanonets-OCR-s model
CMD ["--model", "nanonets/Nanonets-OCR-s", \
"--trust-remote-code", \
"--max-model-len", "8192", \
"--host", "0.0.0.0", \
"--port", "8000"]

changelog.md

@@ -1,5 +1,21 @@
# Changelog
## 2026-01-18 - 1.13.2 - fix(tests)
stabilize OCR extraction tests and manage GPU containers
- Add stopAllGpuContainers() and call it before starting GPU images to free GPU memory.
- Remove PaddleOCR-VL image configs and associated ensure helpers from docker test helper to simplify images list.
- Split invoice/bankstatement tests into two sequential stages: Stage 1 runs Nanonets OCR to produce markdown files, Stage 2 stops Nanonets and runs model extraction from saved markdown (avoids GPU contention).
- Introduce temporary markdown directory handling and cleanup; add stopNanonets() and container running checks in tests.
- Switch bank statement extraction model from qwen3:8b to gpt-oss:20b; add request timeout and improved logging/console output across tests.
- Refactor extractWithConsensus and extraction functions to accept document identifiers, improve error messages and JSON extraction robustness.
## 2026-01-18 - 1.13.1 - fix(image_support_files)
remove PaddleOCR-VL server scripts from image_support_files
- Deleted files: image_support_files/paddleocr_vl_full_server.py (approx. 636 lines) and image_support_files/paddleocr_vl_server.py (approx. 465 lines)
- Cleanup/removal of legacy PaddleOCR-VL FastAPI server implementations — may affect users who relied on these local scripts
## 2026-01-18 - 1.13.0 - feat(tests)
revamp tests and remove legacy Dockerfiles: adopt JSON/consensus workflows, switch MiniCPM model, and delete deprecated Docker/test variants

image_support_files/paddleocr_vl_full_server.py (deleted)

@@ -1,636 +0,0 @@
#!/usr/bin/env python3
"""
PaddleOCR-VL Full Pipeline API Server (Transformers backend)
Provides REST API for document parsing using:
- PP-DocLayoutV2 for layout detection
- PaddleOCR-VL (transformers) for recognition
- Structured JSON/Markdown output
"""
import os
import io
import re
import base64
import logging
import tempfile
import time
import json
from typing import Optional, List, Union
from pathlib import Path
from fastapi import FastAPI, HTTPException, UploadFile, File, Form
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from PIL import Image
import torch
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Environment configuration
SERVER_HOST = os.environ.get('SERVER_HOST', '0.0.0.0')
SERVER_PORT = int(os.environ.get('SERVER_PORT', '8000'))
MODEL_NAME = "PaddlePaddle/PaddleOCR-VL"
# Device configuration
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {DEVICE}")
# Task prompts
TASK_PROMPTS = {
"ocr": "OCR:",
"table": "Table Recognition:",
"formula": "Formula Recognition:",
"chart": "Chart Recognition:",
}
# Initialize FastAPI app
app = FastAPI(
title="PaddleOCR-VL Full Pipeline Server",
description="Document parsing with PP-DocLayoutV2 + PaddleOCR-VL (transformers)",
version="1.0.0"
)
# Global model instances
vl_model = None
vl_processor = None
layout_model = None
def load_vl_model():
"""Load the PaddleOCR-VL model for element recognition"""
global vl_model, vl_processor
if vl_model is not None:
return
logger.info(f"Loading PaddleOCR-VL model: {MODEL_NAME}")
from transformers import AutoModelForCausalLM, AutoProcessor
vl_processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
if DEVICE == "cuda":
vl_model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
trust_remote_code=True,
torch_dtype=torch.bfloat16,
).to(DEVICE).eval()
else:
vl_model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
trust_remote_code=True,
torch_dtype=torch.float32,
low_cpu_mem_usage=True,
).eval()
logger.info("PaddleOCR-VL model loaded successfully")
def load_layout_model():
"""Load the LayoutDetection model for layout detection"""
global layout_model
if layout_model is not None:
return
try:
logger.info("Loading LayoutDetection model (PP-DocLayout_plus-L)...")
from paddleocr import LayoutDetection
layout_model = LayoutDetection()
logger.info("LayoutDetection model loaded successfully")
except Exception as e:
logger.warning(f"Could not load LayoutDetection: {e}")
logger.info("Falling back to VL-only mode (no layout detection)")
def recognize_element(image: Image.Image, task: str = "ocr") -> str:
"""Recognize a single element using PaddleOCR-VL"""
load_vl_model()
prompt = TASK_PROMPTS.get(task, TASK_PROMPTS["ocr"])
messages = [
{
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": prompt},
]
}
]
inputs = vl_processor.apply_chat_template(
messages,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
return_tensors="pt"
)
if DEVICE == "cuda":
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
with torch.inference_mode():
outputs = vl_model.generate(
**inputs,
max_new_tokens=4096,
do_sample=False,
use_cache=True
)
response = vl_processor.batch_decode(outputs, skip_special_tokens=True)[0]
# Extract only the assistant's response content
# The response format is: "User: <prompt>\nAssistant: <content>"
# We want to extract just the content after "Assistant:"
if "Assistant:" in response:
parts = response.split("Assistant:")
if len(parts) > 1:
response = parts[-1].strip()
elif "assistant:" in response.lower():
# Case-insensitive fallback
import re
match = re.split(r'[Aa]ssistant:', response)
if len(match) > 1:
response = match[-1].strip()
return response
def detect_layout(image: Image.Image) -> List[dict]:
"""Detect layout regions in the image"""
load_layout_model()
if layout_model is None:
# No layout model - return a single region covering the whole image
return [{
"type": "text",
"bbox": [0, 0, image.width, image.height],
"score": 1.0
}]
# Save image to temp file
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
image.save(tmp.name, "PNG")
tmp_path = tmp.name
try:
results = layout_model.predict(tmp_path)
regions = []
for res in results:
# LayoutDetection returns boxes in 'boxes' key
for box in res.get("boxes", []):
coord = box.get("coordinate", [0, 0, image.width, image.height])
# Convert numpy floats to regular floats
bbox = [float(c) for c in coord]
regions.append({
"type": box.get("label", "text"),
"bbox": bbox,
"score": float(box.get("score", 1.0))
})
# Sort regions by vertical position (top to bottom)
regions.sort(key=lambda r: r["bbox"][1])
return regions if regions else [{
"type": "text",
"bbox": [0, 0, image.width, image.height],
"score": 1.0
}]
finally:
os.unlink(tmp_path)
def process_document(image: Image.Image) -> dict:
"""Process a document through the full pipeline"""
logger.info(f"Processing document: {image.size}")
# Step 1: Detect layout
regions = detect_layout(image)
logger.info(f"Detected {len(regions)} layout regions")
# Step 2: Recognize each region
blocks = []
for i, region in enumerate(regions):
region_type = region["type"].lower()
bbox = region["bbox"]
# Crop region from image
x1, y1, x2, y2 = [int(c) for c in bbox]
region_image = image.crop((x1, y1, x2, y2))
# Determine task based on region type
if "table" in region_type:
task = "table"
elif "formula" in region_type or "math" in region_type:
task = "formula"
elif "chart" in region_type or "figure" in region_type:
task = "chart"
else:
task = "ocr"
# Recognize the region
try:
content = recognize_element(region_image, task)
blocks.append({
"index": i,
"type": region_type,
"bbox": bbox,
"content": content,
"task": task
})
logger.info(f" Region {i} ({region_type}): {len(content)} chars")
except Exception as e:
logger.error(f" Region {i} error: {e}")
blocks.append({
"index": i,
"type": region_type,
"bbox": bbox,
"content": "",
"error": str(e)
})
return {"blocks": blocks, "image_size": list(image.size)}
def result_to_markdown(result: dict) -> str:
"""Convert result to Markdown format with structural hints for LLM processing.
Adds positional and type-based formatting to help downstream LLMs
understand document structure:
- Tables are marked with **[TABLE]** prefix
- Header zone content (top 15%) is bolded
- Footer zone content (bottom 15%) is separated with horizontal rule
- Titles are formatted as # headers
- Figures/charts are marked with *[Figure: ...]*
"""
lines = []
image_height = result.get("image_size", [0, 1000])[1]
for block in result.get("blocks", []):
block_type = block.get("type", "text").lower()
content = block.get("content", "").strip()
bbox = block.get("bbox", [])
if not content:
continue
# Determine position zone (top 15%, middle, bottom 15%)
y_pos = bbox[1] if bbox and len(bbox) > 1 else 0
y_end = bbox[3] if bbox and len(bbox) > 3 else y_pos
is_header_zone = y_pos < image_height * 0.15
is_footer_zone = y_end > image_height * 0.85
# Format based on type and position
if "table" in block_type:
lines.append(f"\n**[TABLE]**\n{content}\n")
elif "title" in block_type:
lines.append(f"# {content}")
elif "formula" in block_type or "math" in block_type:
lines.append(f"\n$$\n{content}\n$$\n")
elif "figure" in block_type or "chart" in block_type:
lines.append(f"*[Figure: {content}]*")
elif is_header_zone:
lines.append(f"**{content}**")
elif is_footer_zone:
lines.append(f"---\n{content}")
else:
lines.append(content)
return "\n\n".join(lines)
def parse_markdown_table(content: str) -> str:
"""Convert table content to HTML table.
Handles:
- PaddleOCR-VL format: <fcel>cell<lcel>cell<nl> (detected by <fcel> tags)
- Pipe-delimited tables: | Header | Header |
- Separator rows: |---|---|
- Returns HTML <table> structure
"""
content_stripped = content.strip()
# Check for PaddleOCR-VL table format (<fcel>, <lcel>, <ecel>, <nl>)
if '<fcel>' in content_stripped or '<nl>' in content_stripped:
return parse_paddleocr_table(content_stripped)
lines = content_stripped.split('\n')
if not lines:
return f'<pre>{content}</pre>'
# Check if it looks like a markdown table
if not any('|' in line for line in lines):
return f'<pre>{content}</pre>'
html_rows = []
is_header = True
for line in lines:
line = line.strip()
if not line or '|' not in line:
continue
# Skip separator rows (|---|---|)
if re.match(r'^[\|\s\-:]+$', line):
is_header = False
continue
# Parse cells
cells = [c.strip() for c in line.split('|')]
cells = [c for c in cells if c] # Drop empty strings produced by leading/trailing pipes
if is_header:
row = '<tr>' + ''.join(f'<th>{c}</th>' for c in cells) + '</tr>'
html_rows.append(f'<thead>{row}</thead>')
is_header = False
else:
row = '<tr>' + ''.join(f'<td>{c}</td>' for c in cells) + '</tr>'
html_rows.append(row)
if html_rows:
# Wrap body rows in tbody
header = html_rows[0] if '<thead>' in html_rows[0] else ''
body_rows = [r for r in html_rows if '<thead>' not in r]
body = f'<tbody>{"".join(body_rows)}</tbody>' if body_rows else ''
return f'<table>{header}{body}</table>'
return f'<pre>{content}</pre>'
def parse_paddleocr_table(content: str) -> str:
"""Convert PaddleOCR-VL table format to HTML table.
PaddleOCR-VL uses:
- <fcel> = first cell in a row
- <lcel> = subsequent cells
- <ecel> = empty cell
- <nl> = row separator (newline)
Example input:
<fcel>Header1<lcel>Header2<nl><fcel>Value1<lcel>Value2<nl>
"""
# Split into rows by <nl>
rows_raw = re.split(r'<nl>', content)
html_rows = []
is_first_row = True
for row_content in rows_raw:
row_content = row_content.strip()
if not row_content:
continue
# Extract cells: split by <fcel>, <lcel>, or <ecel>
# Each cell is the text between these markers
cells = []
# Pattern to match cell markers and capture content
# Content is everything between markers
parts = re.split(r'<fcel>|<lcel>|<ecel>', row_content)
for part in parts:
part = part.strip()
if part:
cells.append(part)
if not cells:
continue
# First row is header
if is_first_row:
row_html = '<tr>' + ''.join(f'<th>{c}</th>' for c in cells) + '</tr>'
html_rows.append(f'<thead>{row_html}</thead>')
is_first_row = False
else:
row_html = '<tr>' + ''.join(f'<td>{c}</td>' for c in cells) + '</tr>'
html_rows.append(row_html)
if html_rows:
header = html_rows[0] if '<thead>' in html_rows[0] else ''
body_rows = [r for r in html_rows if '<thead>' not in r]
body = f'<tbody>{"".join(body_rows)}</tbody>' if body_rows else ''
return f'<table>{header}{body}</table>'
return f'<pre>{content}</pre>'
def result_to_html(result: dict) -> str:
"""Convert result to semantic HTML for optimal LLM processing.
Uses semantic HTML5 tags with position metadata as data-* attributes.
Markdown tables are converted to proper HTML <table> tags for
unambiguous parsing by downstream LLMs.
"""
parts = []
image_height = result.get("image_size", [0, 1000])[1]
parts.append('<!DOCTYPE html><html><body>')
for block in result.get("blocks", []):
block_type = block.get("type", "text").lower()
content = block.get("content", "").strip()
bbox = block.get("bbox", [])
if not content:
continue
# Position metadata
y_pos = bbox[1] / image_height if bbox and len(bbox) > 1 else 0
data_attrs = f'data-type="{block_type}" data-y="{y_pos:.2f}"'
# Format based on type
if "table" in block_type:
table_html = parse_markdown_table(content)
parts.append(f'<section {data_attrs} class="table-region">{table_html}</section>')
elif "title" in block_type:
parts.append(f'<h1 {data_attrs}>{content}</h1>')
elif "formula" in block_type or "math" in block_type:
parts.append(f'<div {data_attrs} class="formula"><code>{content}</code></div>')
elif "figure" in block_type or "chart" in block_type:
parts.append(f'<figure {data_attrs}><figcaption>{content}</figcaption></figure>')
elif y_pos < 0.15:
parts.append(f'<header {data_attrs}><strong>{content}</strong></header>')
elif y_pos > 0.85:
parts.append(f'<footer {data_attrs}>{content}</footer>')
else:
parts.append(f'<p {data_attrs}>{content}</p>')
parts.append('</body></html>')
return '\n'.join(parts)
# Request/Response models
class ParseRequest(BaseModel):
image: str # base64 encoded image
output_format: Optional[str] = "json"
class ParseResponse(BaseModel):
success: bool
format: str
result: Union[dict, str]
processing_time: float
error: Optional[str] = None
def decode_image(image_source: str) -> Image.Image:
"""Decode image from base64 or data URL"""
if image_source.startswith("data:"):
header, data = image_source.split(",", 1)
image_data = base64.b64decode(data)
else:
image_data = base64.b64decode(image_source)
return Image.open(io.BytesIO(image_data)).convert("RGB")
@app.on_event("startup")
async def startup_event():
"""Pre-load models on startup"""
logger.info("Starting PaddleOCR-VL Full Pipeline Server...")
try:
load_vl_model()
load_layout_model()
logger.info("Models loaded successfully")
except Exception as e:
logger.error(f"Failed to pre-load models: {e}")
@app.get("/health")
async def health_check():
"""Health check endpoint"""
return {
"status": "healthy" if vl_model is not None else "loading",
"service": "PaddleOCR-VL Full Pipeline (Transformers)",
"device": DEVICE,
"vl_model_loaded": vl_model is not None,
"layout_model_loaded": layout_model is not None
}
@app.get("/formats")
async def supported_formats():
"""List supported output formats"""
return {
"output_formats": ["json", "markdown", "html"],
"image_formats": ["PNG", "JPEG", "WebP", "BMP", "GIF", "TIFF"],
"capabilities": [
"Layout detection (PP-DocLayoutV2)",
"Text recognition (OCR)",
"Table recognition",
"Formula recognition (LaTeX)",
"Chart recognition",
"Multi-language support (109 languages)"
]
}
@app.post("/parse", response_model=ParseResponse)
async def parse_document_endpoint(request: ParseRequest):
"""Parse a document image and return structured output"""
try:
start_time = time.time()
image = decode_image(request.image)
result = process_document(image)
if request.output_format == "markdown":
markdown = result_to_markdown(result)
output = {"markdown": markdown}
elif request.output_format == "html":
html = result_to_html(result)
output = {"html": html}
else:
output = result
elapsed = time.time() - start_time
logger.info(f"Processing complete in {elapsed:.2f}s")
return ParseResponse(
success=True,
format=request.output_format,
result=output,
processing_time=elapsed
)
except Exception as e:
logger.error(f"Error processing document: {e}", exc_info=True)
return ParseResponse(
success=False,
format=request.output_format,
result={},
processing_time=0,
error=str(e)
)
@app.post("/v1/chat/completions")
async def chat_completions(request: dict):
"""OpenAI-compatible chat completions endpoint"""
try:
messages = request.get("messages", [])
output_format = request.get("output_format", "json")
# Find user message with image
image = None
for msg in reversed(messages):
if msg.get("role") == "user":
content = msg.get("content", [])
if isinstance(content, list):
for item in content:
if item.get("type") == "image_url":
url = item.get("image_url", {}).get("url", "")
image = decode_image(url)
break
break
if image is None:
raise HTTPException(status_code=400, detail="No image provided")
start_time = time.time()
result = process_document(image)
if output_format == "markdown":
content = result_to_markdown(result)
elif output_format == "html":
content = result_to_html(result)
else:
content = json.dumps(result, ensure_ascii=False, indent=2)
elapsed = time.time() - start_time
return {
"id": f"chatcmpl-{int(time.time()*1000)}",
"object": "chat.completion",
"created": int(time.time()),
"model": "paddleocr-vl-full",
"choices": [{
"index": 0,
"message": {"role": "assistant", "content": content},
"finish_reason": "stop"
}],
"usage": {
"prompt_tokens": 100,
"completion_tokens": len(content) // 4,
"total_tokens": 100 + len(content) // 4
},
"processing_time": elapsed
}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error in chat completions: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT)

image_support_files/paddleocr_vl_server.py (deleted)

@@ -1,465 +0,0 @@
#!/usr/bin/env python3
"""
PaddleOCR-VL FastAPI Server (CPU variant)
Provides OpenAI-compatible REST API for document parsing using PaddleOCR-VL
"""
import os
import io
import base64
import logging
import re
import time
from typing import Optional, List, Any, Dict, Union
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import torch
from PIL import Image
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Environment configuration
SERVER_HOST = os.environ.get('SERVER_HOST', '0.0.0.0')
SERVER_PORT = int(os.environ.get('SERVER_PORT', '8000'))
MODEL_NAME = os.environ.get('MODEL_NAME', 'PaddlePaddle/PaddleOCR-VL')
# Device configuration
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {DEVICE}")
# Task prompts for PaddleOCR-VL
TASK_PROMPTS = {
"ocr": "OCR:",
"table": "Table Recognition:",
"formula": "Formula Recognition:",
"chart": "Chart Recognition:",
}
# Initialize FastAPI app
app = FastAPI(
title="PaddleOCR-VL Server",
description="OpenAI-compatible REST API for document parsing using PaddleOCR-VL",
version="1.0.0"
)
# Global model instances
model = None
processor = None
# Request/Response models (OpenAI-compatible)
class ImageUrl(BaseModel):
url: str
class ContentItem(BaseModel):
type: str
text: Optional[str] = None
image_url: Optional[ImageUrl] = None
class Message(BaseModel):
role: str
content: Union[str, List[ContentItem]]
class ChatCompletionRequest(BaseModel):
model: str = "paddleocr-vl"
messages: List[Message]
temperature: Optional[float] = 0.0
max_tokens: Optional[int] = 4096
class Choice(BaseModel):
index: int
message: Message
finish_reason: str
class Usage(BaseModel):
prompt_tokens: int
completion_tokens: int
total_tokens: int
class ChatCompletionResponse(BaseModel):
id: str
object: str = "chat.completion"
created: int
model: str
choices: List[Choice]
usage: Usage
class HealthResponse(BaseModel):
status: str
model: str
device: str
def load_model():
"""Load the PaddleOCR-VL model and processor"""
global model, processor
if model is not None:
return
logger.info(f"Loading PaddleOCR-VL model: {MODEL_NAME}")
from transformers import AutoModelForCausalLM, AutoProcessor
# Load processor
processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Load model with appropriate settings for CPU/GPU
if DEVICE == "cuda":
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
trust_remote_code=True,
torch_dtype=torch.bfloat16,
).to(DEVICE).eval()
else:
# CPU mode - use float32 for compatibility
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
trust_remote_code=True,
torch_dtype=torch.float32,
low_cpu_mem_usage=True,
).eval()
logger.info("PaddleOCR-VL model loaded successfully")
def optimize_image_resolution(image: Image.Image, max_size: int = 2048, min_size: int = 1080) -> Image.Image:
"""
Optimize image resolution for PaddleOCR-VL.
Best results are achieved with images in the 1080p-2K range.
- Images larger than max_size are scaled down
- Very small images are scaled up to min_size
"""
width, height = image.size
max_dim = max(width, height)
min_dim = min(width, height)
# Scale down if too large (4K+ images often miss text)
if max_dim > max_size:
scale = max_size / max_dim
new_width = int(width * scale)
new_height = int(height * scale)
logger.info(f"Scaling down image from {width}x{height} to {new_width}x{new_height}")
image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
# Scale up if too small
elif max_dim < min_size and min_dim < min_size:
scale = min_size / max_dim
new_width = int(width * scale)
new_height = int(height * scale)
logger.info(f"Scaling up image from {width}x{height} to {new_width}x{new_height}")
image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
else:
logger.info(f"Image size {width}x{height} is optimal, no scaling needed")
return image
def decode_image(image_source: str, optimize: bool = True) -> Image.Image:
"""
Decode image from various sources.
Supported formats:
- Base64 data URL: data:image/png;base64,... or data:image/jpeg;base64,...
- HTTP/HTTPS URL: https://example.com/image.png
- Raw base64 string
- Local file path
Supported image types: PNG, JPEG, WebP, BMP, GIF, TIFF
"""
image = None
if image_source.startswith("data:"):
# Base64 encoded image with MIME type header
# Supports: data:image/png;base64,... data:image/jpeg;base64,... etc.
header, data = image_source.split(",", 1)
image_data = base64.b64decode(data)
image = Image.open(io.BytesIO(image_data)).convert("RGB")
logger.debug(f"Decoded base64 image with header: {header}")
elif image_source.startswith("http://") or image_source.startswith("https://"):
# URL - fetch image
import httpx
response = httpx.get(image_source, timeout=30.0)
response.raise_for_status()
image = Image.open(io.BytesIO(response.content)).convert("RGB")
logger.debug(f"Fetched image from URL: {image_source[:50]}...")
else:
# Assume it's a file path or raw base64
try:
image_data = base64.b64decode(image_source)
image = Image.open(io.BytesIO(image_data)).convert("RGB")
logger.debug("Decoded raw base64 image")
except Exception:
# Try as file path
image = Image.open(image_source).convert("RGB")
logger.debug(f"Loaded image from file: {image_source}")
# Optimize resolution for best OCR results
if optimize:
image = optimize_image_resolution(image)
return image
def extract_image_and_text(content: Union[str, List[ContentItem]]) -> tuple:
"""Extract image and text prompt from message content"""
if isinstance(content, str):
return None, content
image = None
text = ""
for item in content:
if item.type == "image_url" and item.image_url:
image = decode_image(item.image_url.url)
elif item.type == "text" and item.text:
text = item.text
return image, text
def generate_response(image: Image.Image, prompt: str, max_tokens: int = 4096) -> str:
"""Generate response using PaddleOCR-VL"""
load_model()
messages = [
{
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": prompt},
]
}
]
inputs = processor.apply_chat_template(
messages,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
return_tensors="pt"
)
if DEVICE == "cuda":
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
with torch.inference_mode():
outputs = model.generate(
**inputs,
max_new_tokens=max_tokens,
do_sample=False,
use_cache=True
)
response = processor.batch_decode(outputs, skip_special_tokens=True)[0]
# Extract the assistant's response (after the prompt), matching case-insensitively
if "assistant" in response.lower():
parts = re.split(r'assistant', response, flags=re.IGNORECASE)
if len(parts) > 1:
response = parts[-1].strip()
return response
@app.on_event("startup")
async def startup_event():
"""Pre-load the model on startup"""
logger.info("Pre-loading PaddleOCR-VL model...")
try:
load_model()
logger.info("Model pre-loaded successfully")
except Exception as e:
logger.error(f"Failed to pre-load model: {e}")
# Don't fail startup - model will be loaded on first request
@app.get("/health", response_model=HealthResponse)
async def health_check():
"""Health check endpoint"""
return HealthResponse(
status="healthy" if model is not None else "loading",
model=MODEL_NAME,
device=DEVICE
)
@app.get("/formats")
async def supported_formats():
"""List supported image formats and input methods"""
return {
"image_formats": {
"supported": ["PNG", "JPEG", "WebP", "BMP", "GIF", "TIFF"],
"recommended": ["PNG", "JPEG"],
"mime_types": [
"image/png",
"image/jpeg",
"image/webp",
"image/bmp",
"image/gif",
"image/tiff"
]
},
"input_methods": {
"base64_data_url": {
"description": "Base64 encoded image with MIME type header",
"example": "data:image/png;base64,iVBORw0KGgo..."
},
"http_url": {
"description": "Direct HTTP/HTTPS URL to image",
"example": "https://example.com/image.png"
},
"raw_base64": {
"description": "Raw base64 string without header",
"example": "iVBORw0KGgo..."
}
},
"resolution": {
"optimal_range": "1080p to 2K (1080-2048 pixels on longest side)",
"auto_scaling": True,
"note": "Images are automatically scaled to optimal range. 4K+ images are scaled down for better accuracy."
},
"task_prompts": TASK_PROMPTS
}
@app.get("/v1/models")
async def list_models():
"""List available models (OpenAI-compatible)"""
return {
"object": "list",
"data": [
{
"id": "paddleocr-vl",
"object": "model",
"created": int(time.time()),
"owned_by": "paddlepaddle"
}
]
}
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completions(request: ChatCompletionRequest):
"""
OpenAI-compatible chat completions endpoint for PaddleOCR-VL
Supports tasks:
- "OCR:" - Text recognition
- "Table Recognition:" - Table extraction
- "Formula Recognition:" - Formula extraction
- "Chart Recognition:" - Chart extraction
"""
try:
# Get the last user message
user_message = None
for msg in reversed(request.messages):
if msg.role == "user":
user_message = msg
break
if not user_message:
raise HTTPException(status_code=400, detail="No user message found")
# Extract image and prompt
image, prompt = extract_image_and_text(user_message.content)
if image is None:
raise HTTPException(status_code=400, detail="No image provided in message")
# Default to OCR if no specific prompt
if not prompt or prompt.strip() == "":
prompt = "OCR:"
logger.info(f"Processing request with prompt: {prompt[:50]}...")
# Generate response
start_time = time.time()
response_text = generate_response(image, prompt, request.max_tokens or 4096)
elapsed = time.time() - start_time
logger.info(f"Generated response in {elapsed:.2f}s ({len(response_text)} chars)")
# Build OpenAI-compatible response
return ChatCompletionResponse(
id=f"chatcmpl-{int(time.time()*1000)}",
created=int(time.time()),
model=request.model,
choices=[
Choice(
index=0,
message=Message(role="assistant", content=response_text),
finish_reason="stop"
)
],
usage=Usage(
prompt_tokens=100, # Approximate
completion_tokens=len(response_text) // 4,
total_tokens=100 + len(response_text) // 4
)
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error processing request: {e}")
raise HTTPException(status_code=500, detail=str(e))
# Legacy endpoint for compatibility with old PaddleOCR API
class LegacyOCRRequest(BaseModel):
image: str
task: Optional[str] = "ocr"
class LegacyOCRResponse(BaseModel):
success: bool
result: str
task: str
error: Optional[str] = None
@app.post("/ocr", response_model=LegacyOCRResponse)
async def legacy_ocr(request: LegacyOCRRequest):
"""
Legacy OCR endpoint for backwards compatibility
Tasks: ocr, table, formula, chart
"""
try:
image = decode_image(request.image)
prompt = TASK_PROMPTS.get(request.task, TASK_PROMPTS["ocr"])
result = generate_response(image, prompt)
return LegacyOCRResponse(
success=True,
result=result,
task=request.task
)
except Exception as e:
logger.error(f"Legacy OCR error: {e}")
return LegacyOCRResponse(
success=False,
result="",
task=request.task,
error=str(e)
)
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT)

package.json

@@ -1,6 +1,6 @@
{
"name": "@host.today/ht-docker-ai",
"version": "1.13.0",
"version": "1.13.2",
"type": "module",
"private": false,
"description": "Docker images for AI vision-language models including MiniCPM-V 4.5",

readme.md

@@ -244,8 +244,97 @@ The bank statement extraction uses a dual-VLM consensus approach:
---
## Nanonets-OCR-s
### Overview
Nanonets-OCR-s is a Qwen2.5-VL-3B model fine-tuned specifically for document OCR tasks. It outputs structured markdown with semantic tags.
**Key features:**
- Based on Qwen2.5-VL-3B (~4B parameters)
- Fine-tuned for document OCR
- Outputs markdown with semantic HTML tags
- ~8-10GB VRAM (fits comfortably in 16GB)
### Docker Images
| Tag | Description |
|-----|-------------|
| `nanonets-ocr` | GPU variant using vLLM (OpenAI-compatible API) |
### API Endpoints (OpenAI-compatible via vLLM)
| Endpoint | Method | Description |
|----------|--------|-------------|
| `/health` | GET | Health check |
| `/v1/models` | GET | List available models |
| `/v1/chat/completions` | POST | OpenAI-compatible chat completions |
### Request/Response Format
**POST /v1/chat/completions (OpenAI-compatible)**
```json
{
"model": "nanonets/Nanonets-OCR-s",
"messages": [
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
{"type": "text", "text": "Extract the text from the above document..."}
]
}
],
"temperature": 0.0,
"max_tokens": 4096
}
```
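
For reference, the same request sent from Node 18+ (built-in fetch); a minimal sketch, where `page-1.png` is a hypothetical input file and the prompt is abbreviated. The `Bearer dummy` token mirrors what this repo's tests send:

```typescript
// Sketch: call the Nanonets-OCR-s endpoint with a base64-encoded page image.
import { readFileSync } from 'fs';

const imageB64 = readFileSync('page-1.png').toString('base64'); // placeholder path

const res = await fetch('http://localhost:8000/v1/chat/completions', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json', 'Authorization': 'Bearer dummy' },
  body: JSON.stringify({
    model: 'nanonets/Nanonets-OCR-s',
    messages: [{
      role: 'user',
      content: [
        { type: 'image_url', image_url: { url: `data:image/png;base64,${imageB64}` } },
        { type: 'text', text: 'Extract the text from the above document as if you were reading it naturally.' },
      ],
    }],
    temperature: 0.0,
    max_tokens: 4096,
  }),
});

const data = await res.json();
console.log(data.choices[0].message.content); // markdown with semantic tags
```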
### Nanonets OCR Prompt
The model is designed to work with a specific prompt format:
```
Extract the text from the above document as if you were reading it naturally.
Return the tables in html format.
Return the equations in LaTeX representation.
If there is an image in the document and image caption is not present, add a small description inside <img></img> tag.
Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>.
Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number>.
```
### Performance
- **GPU (vLLM)**: ~3-8 seconds per page
- **VRAM usage**: ~8-10GB
### Two-Stage Pipeline (Nanonets + Ollama LLM)
The Nanonets tests use a two-stage pipeline:
1. **Stage 1**: Nanonets-OCR-s converts images to markdown (via vLLM on port 8000)
2. **Stage 2**: An Ollama-hosted LLM (Qwen3 8B for invoices, GPT-OSS 20B for bank statements) extracts structured JSON from the saved markdown (port 11434)
**GPU Limitation**: Both vLLM and Ollama require significant GPU memory. On a single GPU system:
- Running both simultaneously causes memory contention
- For single GPU: Run services sequentially (stop Nanonets before starting the extraction LLM)
- For multi-GPU: Assign each service to a different GPU
**Sequential Execution**:
```bash
# Step 1: Run Nanonets OCR (converts to markdown)
docker start nanonets-test
# ... perform OCR ...
docker stop nanonets-test
# Step 2: Run Qwen3 extraction (from markdown)
docker start minicpm-test
# ... extract JSON ...
```
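
In this repo's tests, the Docker helpers added in this release encode the same flow. A minimal sketch (using the helper exports from the `test/helpers/docker.ts` change below; the elided steps are whatever OCR/extraction work your test does):

```typescript
// Sketch: sequential two-stage run using the project's test helpers.
// ensureNanonetsOcr() itself stops other GPU containers before starting.
import { ensureNanonetsOcr, ensureMiniCpm, stopAllGpuContainers } from './helpers/docker.js';

// Stage 1: OCR with Nanonets (vLLM on :8000)
await ensureNanonetsOcr();
// ... convert pages to markdown and save them ...

// Free GPU memory before Stage 2
stopAllGpuContainers();

// Stage 2: JSON extraction via Ollama (:11434)
await ensureMiniCpm();
// ... run the extraction model over the saved markdown ...
```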
---
## Related Resources
- [Ollama Documentation](https://ollama.ai/docs)
- [MiniCPM-V GitHub](https://github.com/OpenBMB/MiniCPM-V)
- [Ollama API Reference](https://github.com/ollama/ollama/blob/main/docs/api.md)
- [Nanonets-OCR-s on HuggingFace](https://huggingface.co/nanonets/Nanonets-OCR-s)

test/helpers/docker.ts

@@ -2,11 +2,8 @@ import { execSync } from 'child_process';
// Project container names (only manage these)
const PROJECT_CONTAINERS = [
'paddleocr-vl-test',
'paddleocr-vl-gpu-test',
'paddleocr-vl-cpu-test',
'paddleocr-vl-full-test',
'minicpm-test',
'nanonets-test',
];
// Image configurations
@@ -23,30 +20,6 @@ export interface IImageConfig {
}
export const IMAGES = {
paddleocrVlGpu: {
name: 'paddleocr-vl-gpu',
dockerfile: 'Dockerfile_paddleocr_vl_gpu',
buildContext: '.',
containerName: 'paddleocr-vl-test',
ports: ['8000:8000'],
volumes: ['ht-huggingface-cache:/root/.cache/huggingface'],
gpus: true,
healthEndpoint: 'http://localhost:8000/health',
healthTimeout: 300000, // 5 minutes for model loading
} as IImageConfig,
paddleocrVlCpu: {
name: 'paddleocr-vl-cpu',
dockerfile: 'Dockerfile_paddleocr_vl_cpu',
buildContext: '.',
containerName: 'paddleocr-vl-test',
ports: ['8000:8000'],
volumes: ['ht-huggingface-cache:/root/.cache/huggingface'],
gpus: false,
healthEndpoint: 'http://localhost:8000/health',
healthTimeout: 300000,
} as IImageConfig,
minicpm: {
name: 'minicpm45v',
dockerfile: 'Dockerfile_minicpm45v_gpu',
@@ -59,20 +32,17 @@ export const IMAGES = {
healthTimeout: 120000,
} as IImageConfig,
// Full PaddleOCR-VL pipeline with PP-DocLayoutV2 + structured JSON output
paddleocrVlFull: {
name: 'paddleocr-vl-full',
dockerfile: 'Dockerfile_paddleocr_vl_full',
// Nanonets-OCR-s - Document OCR optimized VLM (Qwen2.5-VL-3B fine-tuned)
nanonetsOcr: {
name: 'nanonets-ocr',
dockerfile: 'Dockerfile_nanonets_ocr',
buildContext: '.',
containerName: 'paddleocr-vl-full-test',
containerName: 'nanonets-test',
ports: ['8000:8000'],
volumes: [
'ht-huggingface-cache:/root/.cache/huggingface',
'ht-paddleocr-cache:/root/.paddleocr',
],
volumes: ['ht-huggingface-cache:/root/.cache/huggingface'],
gpus: true,
healthEndpoint: 'http://localhost:8000/health',
healthTimeout: 600000, // 10 minutes for model loading (vLLM + PP-DocLayoutV2)
healthTimeout: 300000, // 5 minutes for model loading
} as IImageConfig,
};
@@ -126,7 +96,7 @@ export function removeContainer(containerName: string): void {
}
/**
* Stop all project containers that conflict with the required one
* Stop all project containers that conflict with the required one (port-based)
*/
export function stopConflictingContainers(requiredContainer: string, requiredPort: string): void {
// Stop project containers using the same port
@@ -144,6 +114,24 @@ export function stopConflictingContainers(requiredContainer: string, requiredPor
}
}
/**
* Stop all GPU-consuming project containers (for GPU memory management)
* This ensures GPU memory is freed before starting a new GPU service
*/
export function stopAllGpuContainers(exceptContainer?: string): void {
for (const container of PROJECT_CONTAINERS) {
if (container === exceptContainer) continue;
if (isContainerRunning(container)) {
console.log(`[Docker] Stopping GPU container: ${container}`);
exec(`docker stop ${container}`, true);
}
}
// Brief pause to allow GPU memory to be released
execSync('sleep 2');
}
/**
* Build a Docker image
*/
@@ -220,6 +208,11 @@ export async function ensureService(config: IImageConfig): Promise<boolean> {
buildImage(config);
}
// For GPU services, stop ALL other GPU containers to free GPU memory
if (config.gpus) {
stopAllGpuContainers(config.containerName);
}
// Stop conflicting containers on the same port
const mainPort = config.ports[0];
stopConflictingContainers(config.containerName, mainPort);
@@ -240,21 +233,7 @@ export async function ensureService(config: IImageConfig): Promise<boolean> {
}
/**
* Ensure PaddleOCR-VL GPU service is running
*/
export async function ensurePaddleOcrVlGpu(): Promise<boolean> {
return ensureService(IMAGES.paddleocrVlGpu);
}
/**
* Ensure PaddleOCR-VL CPU service is running
*/
export async function ensurePaddleOcrVlCpu(): Promise<boolean> {
return ensureService(IMAGES.paddleocrVlCpu);
}
/**
* Ensure MiniCPM service is running
* Ensure MiniCPM service is running (Ollama with GPU)
*/
export async function ensureMiniCpm(): Promise<boolean> {
return ensureService(IMAGES.minicpm);
@@ -272,30 +251,6 @@ export function isGpuAvailable(): boolean {
}
}
/**
* Ensure PaddleOCR-VL service (auto-detect GPU/CPU)
*/
export async function ensurePaddleOcrVl(): Promise<boolean> {
if (isGpuAvailable()) {
console.log('[Docker] GPU detected, using GPU image');
return ensurePaddleOcrVlGpu();
} else {
console.log('[Docker] No GPU detected, using CPU image');
return ensurePaddleOcrVlCpu();
}
}
/**
* Ensure PaddleOCR-VL Full Pipeline service (PP-DocLayoutV2 + structured output)
* This is the recommended service for production use - outputs structured JSON/Markdown
*/
export async function ensurePaddleOcrVlFull(): Promise<boolean> {
if (!isGpuAvailable()) {
console.log('[Docker] WARNING: Full pipeline requires GPU, but none detected');
}
return ensureService(IMAGES.paddleocrVlFull);
}
/**
* Ensure an Ollama model is pulled and available
* Uses the MiniCPM container (which runs Ollama) to pull the model
@@ -383,3 +338,14 @@ export async function ensureQwen3Vl(): Promise<boolean> {
// Then ensure Qwen3-VL 8B is pulled
return ensureOllamaModel('qwen3-vl:8b');
}
/**
* Ensure Nanonets-OCR-s service is running (via vLLM)
* Document OCR optimized VLM based on Qwen2.5-VL-3B
*/
export async function ensureNanonetsOcr(): Promise<boolean> {
if (!isGpuAvailable()) {
console.log('[Docker] WARNING: Nanonets-OCR-s requires GPU, but none detected');
}
return ensureService(IMAGES.nanonetsOcr);
}


@@ -0,0 +1,585 @@
/**
* Bank statement extraction using Nanonets-OCR-s + GPT-OSS 20B (sequential two-stage pipeline)
*
* Stage 1: Nanonets-OCR-s converts ALL document pages to markdown (stop after completion)
* Stage 2: GPT-OSS 20B extracts structured JSON from saved markdown (after Nanonets stops)
*
* This approach avoids GPU contention by running services sequentially.
*/
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execSync } from 'child_process';
import * as os from 'os';
import { ensureNanonetsOcr, ensureMiniCpm, removeContainer, isContainerRunning } from './helpers/docker.js';
const NANONETS_URL = 'http://localhost:8000/v1';
const NANONETS_MODEL = 'nanonets/Nanonets-OCR-s';
const OLLAMA_URL = 'http://localhost:11434';
const EXTRACTION_MODEL = 'gpt-oss:20b';
// Temp directory for storing markdown between stages
const TEMP_MD_DIR = path.join(os.tmpdir(), 'nanonets-markdown');
interface ITransaction {
date: string;
counterparty: string;
amount: number;
}
interface ITestCase {
name: string;
pdfPath: string;
jsonPath: string;
markdownPath?: string;
images?: string[];
}
// Nanonets-specific prompt for document OCR to markdown
const NANONETS_OCR_PROMPT = `Extract the text from the above document as if you were reading it naturally.
Return the tables in html format.
Return the equations in LaTeX representation.
If there is an image in the document and image caption is not present, add a small description inside <img></img> tag.
Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>.
Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number>.`;
// JSON extraction prompt for GPT-OSS 20B
const JSON_EXTRACTION_PROMPT = `Extract ALL transactions from this bank statement as JSON array. Each transaction: {"date": "YYYY-MM-DD", "counterparty": "NAME", "amount": -25.99}. Amount negative for debits, positive for credits. Only include actual transactions, not balances. Return ONLY JSON array, no explanation.
STATEMENT:
`;
/**
* Convert PDF to PNG images using ImageMagick
*/
function convertPdfToImages(pdfPath: string): string[] {
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
const outputPattern = path.join(tempDir, 'page-%d.png');
try {
execSync(
`convert -density 150 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
{ stdio: 'pipe' }
);
// Sort numerically by page index; a plain sort() would order page-10 before page-2
const files = fs.readdirSync(tempDir)
.filter((f: string) => f.endsWith('.png'))
.sort((a, b) => parseInt(a.replace(/\D/g, ''), 10) - parseInt(b.replace(/\D/g, ''), 10));
const images: string[] = [];
for (const file of files) {
const imagePath = path.join(tempDir, file);
const imageData = fs.readFileSync(imagePath);
images.push(imageData.toString('base64'));
}
return images;
} finally {
fs.rmSync(tempDir, { recursive: true, force: true });
}
}
/**
* Convert a single page to markdown using Nanonets-OCR-s
*/
async function convertPageToMarkdown(image: string, pageNum: number): Promise<string> {
const startTime = Date.now();
const response = await fetch(`${NANONETS_URL}/chat/completions`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': 'Bearer dummy',
},
body: JSON.stringify({
model: NANONETS_MODEL,
messages: [{
role: 'user',
content: [
{ type: 'image_url', image_url: { url: `data:image/png;base64,${image}` }},
{ type: 'text', text: NANONETS_OCR_PROMPT },
],
}],
max_tokens: 4096,
temperature: 0.0,
}),
});
const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
if (!response.ok) {
const errorText = await response.text();
throw new Error(`Nanonets API error: ${response.status} - ${errorText}`);
}
const data = await response.json();
const content = (data.choices?.[0]?.message?.content || '').trim();
console.log(` Page ${pageNum}: ${content.length} chars (${elapsed}s)`);
return content;
}
/**
* Convert all pages of a document to markdown
*/
async function convertDocumentToMarkdown(images: string[], docName: string): Promise<string> {
console.log(` [${docName}] Converting ${images.length} page(s)...`);
const markdownPages: string[] = [];
for (let i = 0; i < images.length; i++) {
const markdown = await convertPageToMarkdown(images[i], i + 1);
markdownPages.push(`--- PAGE ${i + 1} ---\n${markdown}`);
}
const fullMarkdown = markdownPages.join('\n\n');
console.log(` [${docName}] Complete: ${fullMarkdown.length} chars total`);
return fullMarkdown;
}
/**
* Stop Nanonets container
*/
function stopNanonets(): void {
console.log(' [Docker] Stopping Nanonets container...');
try {
execSync('docker stop nanonets-test 2>/dev/null || true', { stdio: 'pipe' });
// Wait for GPU memory to be released
execSync('sleep 5', { stdio: 'pipe' });
console.log(' [Docker] Nanonets stopped');
} catch {
console.log(' [Docker] Nanonets was not running');
}
}
/**
* Ensure GPT-OSS 20B model is available and warmed up
*/
async function ensureExtractionModel(): Promise<boolean> {
try {
const response = await fetch(`${OLLAMA_URL}/api/tags`);
if (response.ok) {
const data = await response.json();
const models = data.models || [];
if (models.some((m: { name: string }) => m.name === EXTRACTION_MODEL)) {
console.log(` [Ollama] Model available: ${EXTRACTION_MODEL}`);
// Warmup: send a simple request to ensure model is loaded
console.log(` [Ollama] Warming up model...`);
const warmupResponse = await fetch(`${OLLAMA_URL}/api/chat`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
model: EXTRACTION_MODEL,
messages: [{ role: 'user', content: 'Return: [{"test": 1}]' }],
stream: false,
}),
signal: AbortSignal.timeout(120000),
});
if (warmupResponse.ok) {
const warmupData = await warmupResponse.json();
console.log(` [Ollama] Warmup complete (${warmupData.message?.content?.length || 0} chars)`);
}
return true;
}
}
} catch {
return false;
}
console.log(` [Ollama] Pulling ${EXTRACTION_MODEL}...`);
const pullResponse = await fetch(`${OLLAMA_URL}/api/pull`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ name: EXTRACTION_MODEL, stream: false }),
});
return pullResponse.ok;
}
/**
* Extract transactions from markdown using GPT-OSS 20B (streaming)
*/
async function extractTransactionsFromMarkdown(markdown: string, queryId: string): Promise<ITransaction[]> {
console.log(` [${queryId}] Sending to ${EXTRACTION_MODEL}...`);
console.log(` [${queryId}] Markdown length: ${markdown.length}`);
const startTime = Date.now();
const fullPrompt = JSON_EXTRACTION_PROMPT + markdown;
console.log(` [${queryId}] Prompt preview: ${fullPrompt.substring(0, 200)}...`);
const response = await fetch(`${OLLAMA_URL}/api/chat`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
model: EXTRACTION_MODEL,
messages: [{
role: 'user',
content: fullPrompt,
}],
stream: true,
}),
signal: AbortSignal.timeout(600000), // 10 minute timeout
});
if (!response.ok) {
const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
console.log(` [${queryId}] ERROR: ${response.status} (${elapsed}s)`);
throw new Error(`Ollama API error: ${response.status}`);
}
// Stream the response and log to console
let content = '';
const reader = response.body!.getReader();
const decoder = new TextDecoder();
process.stdout.write(` [${queryId}] `);
let buffer = '';
while (true) {
const { done, value } = await reader.read();
if (done) break;
buffer += decoder.decode(value, { stream: true });
// Each complete line is a JSON object; keep a partial trailing line for the next chunk
const lines = buffer.split('\n');
buffer = lines.pop() ?? '';
for (const line of lines.filter(l => l.trim())) {
try {
const json = JSON.parse(line);
const token = json.message?.content || '';
if (token) {
process.stdout.write(token);
content += token;
}
} catch {
// Ignore malformed lines
}
}
}
const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
console.log(`\n [${queryId}] Done: ${content.length} chars (${elapsed}s)`);
return parseJsonResponse(content, queryId);
}
/**
* Sanitize JSON string
*/
function sanitizeJson(jsonStr: string): string {
let s = jsonStr;
s = s.replace(/"amount"\s*:\s*\+/g, '"amount": '); // drop explicit + sign on amounts
s = s.replace(/:\s*\+(\d)/g, ': $1'); // drop + sign on any other numeric value
s = s.replace(/"amount"\s*:\s*(-?)(\d{1,3})\.(\d{3})\.(\d{2})\b/g, '"amount": $1$2$3.$4'); // collapse stray thousands dots: 1.234.56 -> 1234.56
s = s.replace(/,\s*([}\]])/g, '$1'); // remove trailing commas before } or ]
s = s.replace(/"([^"\\]*)\n([^"]*)"/g, '"$1 $2"'); // replace raw newlines inside strings
s = s.replace(/"([^"\\]*)\t([^"]*)"/g, '"$1 $2"'); // replace raw tabs inside strings
s = s.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g, ' '); // strip remaining control characters
return s;
}
/**
* Parse amount from various formats
*/
function parseAmount(value: unknown): number {
if (typeof value === 'number') return value;
if (typeof value !== 'string') return 0;
let s = value.replace(/[€$£\s]/g, '').replace('−', '-').replace('–', '-'); // normalize Unicode minus (U+2212) and en dash to ASCII hyphen
if (s.includes(',') && s.indexOf(',') > s.lastIndexOf('.')) {
s = s.replace(/\./g, '').replace(',', '.');
} else {
s = s.replace(/,/g, '');
}
return parseFloat(s) || 0;
}
/**
* Parse JSON response into transactions
*/
function parseJsonResponse(response: string, queryId: string): ITransaction[] {
// Remove thinking tags if present
let cleanResponse = response.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
// Debug: show what we're working with
console.log(` [${queryId}] Response preview: ${cleanResponse.substring(0, 300)}...`);
const codeBlockMatch = cleanResponse.match(/```(?:json)?\s*([\s\S]*?)```/);
let jsonStr = codeBlockMatch ? codeBlockMatch[1].trim() : cleanResponse;
jsonStr = sanitizeJson(jsonStr);
try {
const parsed = JSON.parse(jsonStr);
if (Array.isArray(parsed)) {
const txs = parsed.map(tx => ({
date: String(tx.date || ''),
counterparty: String(tx.counterparty || tx.description || ''),
amount: parseAmount(tx.amount),
}));
console.log(` [${queryId}] Parsed ${txs.length} transactions`);
return txs;
}
} catch (e) {
// Try to find a JSON array in the text
const arrayMatch = jsonStr.match(/\[[\s\S]*\]/);
if (arrayMatch) {
console.log(` [${queryId}] Array match found: ${arrayMatch[0].length} chars`);
try {
const parsed = JSON.parse(sanitizeJson(arrayMatch[0]));
if (Array.isArray(parsed)) {
const txs = parsed.map(tx => ({
date: String(tx.date || ''),
counterparty: String(tx.counterparty || tx.description || ''),
amount: parseAmount(tx.amount),
}));
console.log(` [${queryId}] Parsed ${txs.length} transactions (array match)`);
return txs;
}
} catch (innerErr) {
console.log(` [${queryId}] Array parse error: ${(innerErr as Error).message}`);
}
} else {
console.log(` [${queryId}] No JSON array found in response`);
}
}
console.log(` [${queryId}] PARSE FAILED`);
return [];
}
/**
* Extract transactions (single pass)
*/
async function extractTransactions(markdown: string, docName: string): Promise<ITransaction[]> {
console.log(` [${docName}] Extracting...`);
const txs = await extractTransactionsFromMarkdown(markdown, docName);
console.log(` [${docName}] Extracted ${txs.length} transactions`);
return txs;
}
/**
* Compare transactions
*/
function compareTransactions(
extracted: ITransaction[],
expected: ITransaction[]
): { matches: number; total: number; errors: string[] } {
const errors: string[] = [];
let matches = 0;
for (let i = 0; i < expected.length; i++) {
const exp = expected[i];
const ext = extracted[i];
if (!ext) {
errors.push(`Missing tx ${i}: ${exp.date} ${exp.counterparty}`);
continue;
}
const dateMatch = ext.date === exp.date;
const amountMatch = Math.abs(ext.amount - exp.amount) < 0.01;
if (dateMatch && amountMatch) {
matches++;
} else {
errors.push(`Mismatch ${i}: exp ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`);
}
}
if (extracted.length > expected.length) {
errors.push(`Extra transactions: ${extracted.length - expected.length}`);
}
return { matches, total: expected.length, errors };
}
/**
* Find all test cases
*/
function findTestCases(): ITestCase[] {
const testDir = path.join(process.cwd(), '.nogit');
if (!fs.existsSync(testDir)) return [];
const files = fs.readdirSync(testDir);
const testCases: ITestCase[] = [];
for (const pdf of files.filter((f: string) => f.endsWith('.pdf'))) {
const baseName = pdf.replace('.pdf', '');
const jsonFile = `${baseName}.json`;
if (files.includes(jsonFile)) {
testCases.push({
name: baseName,
pdfPath: path.join(testDir, pdf),
jsonPath: path.join(testDir, jsonFile),
});
}
}
return testCases.sort((a, b) => a.name.localeCompare(b.name));
}
// ============ TESTS ============
const testCases = findTestCases();
console.log(`\nFound ${testCases.length} bank statement test cases\n`);
// Ensure temp directory exists
if (!fs.existsSync(TEMP_MD_DIR)) {
fs.mkdirSync(TEMP_MD_DIR, { recursive: true });
}
// -------- STAGE 1: OCR with Nanonets --------
// Check if all markdown files already exist
function allMarkdownFilesExist(): boolean {
for (const tc of testCases) {
const mdPath = path.join(TEMP_MD_DIR, `${tc.name}.md`);
if (!fs.existsSync(mdPath)) {
return false;
}
}
return true;
}
// Track whether we need to run Stage 1
let stage1Needed = !allMarkdownFilesExist();
tap.test('Stage 1: Setup Nanonets', async () => {
console.log('\n========== STAGE 1: Nanonets OCR ==========\n');
if (!stage1Needed) {
console.log(' [SKIP] All markdown files already exist, skipping Nanonets setup');
return;
}
const ok = await ensureNanonetsOcr();
expect(ok).toBeTrue();
});
tap.test('Stage 1: Convert all documents to markdown', async () => {
if (!stage1Needed) {
console.log(' [SKIP] Using existing markdown files from previous run\n');
// Load existing markdown paths
for (const tc of testCases) {
tc.markdownPath = path.join(TEMP_MD_DIR, `${tc.name}.md`);
console.log(` Loaded: ${tc.markdownPath}`);
}
return;
}
console.log('\n Converting all PDFs to markdown with Nanonets-OCR-s...\n');
for (const tc of testCases) {
console.log(`\n === ${tc.name} ===`);
// Convert PDF to images
const images = convertPdfToImages(tc.pdfPath);
console.log(` Pages: ${images.length}`);
// Convert to markdown
const markdown = await convertDocumentToMarkdown(images, tc.name);
// Save markdown to temp file
const mdPath = path.join(TEMP_MD_DIR, `${tc.name}.md`);
fs.writeFileSync(mdPath, markdown);
tc.markdownPath = mdPath;
console.log(` Saved: ${mdPath}`);
}
console.log('\n Stage 1 complete: All documents converted to markdown\n');
});
tap.test('Stage 1: Stop Nanonets', async () => {
if (!stage1Needed) {
console.log(' [SKIP] Nanonets was not started');
return;
}
stopNanonets();
// Verify it's stopped
await new Promise(resolve => setTimeout(resolve, 3000));
expect(isContainerRunning('nanonets-test')).toBeFalse();
});
// -------- STAGE 2: Extraction with GPT-OSS 20B --------
tap.test('Stage 2: Setup Ollama + GPT-OSS 20B', async () => {
console.log('\n========== STAGE 2: GPT-OSS 20B Extraction ==========\n');
const ollamaOk = await ensureMiniCpm();
expect(ollamaOk).toBeTrue();
const extractionOk = await ensureExtractionModel();
expect(extractionOk).toBeTrue();
});
let passedCount = 0;
let failedCount = 0;
for (const tc of testCases) {
tap.test(`Stage 2: Extract ${tc.name}`, async () => {
const expected: ITransaction[] = JSON.parse(fs.readFileSync(tc.jsonPath, 'utf-8'));
console.log(`\n === ${tc.name} ===`);
console.log(` Expected: ${expected.length} transactions`);
// Load saved markdown
const mdPath = path.join(TEMP_MD_DIR, `${tc.name}.md`);
if (!fs.existsSync(mdPath)) {
throw new Error(`Markdown not found: ${mdPath}. Run Stage 1 first.`);
}
const markdown = fs.readFileSync(mdPath, 'utf-8');
console.log(` Markdown: ${markdown.length} chars`);
// Extract transactions (single pass)
const extracted = await extractTransactions(markdown, tc.name);
// Log results
console.log(` Extracted: ${extracted.length} transactions`);
for (let i = 0; i < Math.min(extracted.length, 5); i++) {
const tx = extracted[i];
console.log(` ${i + 1}. ${tx.date} | ${tx.counterparty.substring(0, 25).padEnd(25)} | ${tx.amount >= 0 ? '+' : ''}${tx.amount.toFixed(2)}`);
}
if (extracted.length > 5) {
console.log(` ... and ${extracted.length - 5} more`);
}
// Compare
const result = compareTransactions(extracted, expected);
const pass = result.matches === result.total && extracted.length === expected.length;
if (pass) {
passedCount++;
console.log(` Result: PASS (${result.matches}/${result.total})`);
} else {
failedCount++;
console.log(` Result: FAIL (${result.matches}/${result.total})`);
result.errors.slice(0, 5).forEach(e => console.log(` - ${e}`));
}
expect(result.matches).toEqual(result.total);
expect(extracted.length).toEqual(expected.length);
});
}
tap.test('Summary', async () => {
console.log(`\n======================================================`);
console.log(` Bank Statement Summary (Nanonets + GPT-OSS 20B Sequential)`);
console.log(`======================================================`);
console.log(` Stage 1: Nanonets-OCR-s (document -> markdown)`);
console.log(` Stage 2: GPT-OSS 20B (markdown -> JSON)`);
console.log(` Passed: ${passedCount}/${testCases.length}`);
console.log(` Failed: ${failedCount}/${testCases.length}`);
console.log(`======================================================\n`);
// Only cleanup temp files if ALL tests passed
if (failedCount === 0 && passedCount === testCases.length) {
try {
fs.rmSync(TEMP_MD_DIR, { recursive: true, force: true });
console.log(` Cleaned up temp directory: ${TEMP_MD_DIR}\n`);
} catch {
// Ignore
}
} else {
console.log(` Keeping temp directory for debugging: ${TEMP_MD_DIR}\n`);
}
});
export default tap.start();


@@ -0,0 +1,604 @@
/**
* Invoice extraction using Nanonets-OCR-s + Qwen3 (sequential two-stage pipeline)
*
* Stage 1: Nanonets-OCR-s converts ALL document pages to markdown (stop after completion)
* Stage 2: Qwen3 extracts structured JSON from saved markdown (after Nanonets stops)
*
* This approach avoids GPU contention by running services sequentially.
*/
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execSync } from 'child_process';
import * as os from 'os';
import { ensureNanonetsOcr, ensureMiniCpm, isContainerRunning } from './helpers/docker.js';
const NANONETS_URL = 'http://localhost:8000/v1';
const NANONETS_MODEL = 'nanonets/Nanonets-OCR-s';
const OLLAMA_URL = 'http://localhost:11434';
const QWEN_MODEL = 'qwen3:8b';
// Temp directory for storing markdown between stages
const TEMP_MD_DIR = path.join(os.tmpdir(), 'nanonets-invoices-markdown');
interface IInvoice {
invoice_number: string;
invoice_date: string;
vendor_name: string;
currency: string;
net_amount: number;
vat_amount: number;
total_amount: number;
}
interface ITestCase {
name: string;
pdfPath: string;
jsonPath: string;
markdownPath?: string;
}
// Nanonets-specific prompt for document OCR to markdown
const NANONETS_OCR_PROMPT = `Extract the text from the above document as if you were reading it naturally.
Return the tables in html format.
Return the equations in LaTeX representation.
If there is an image in the document and image caption is not present, add a small description inside <img></img> tag.
Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>.
Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number>.`;
// JSON extraction prompt for Qwen3
const JSON_EXTRACTION_PROMPT = `You are an invoice data extractor. Below is an invoice document converted to text/markdown. Extract the key invoice fields as JSON.
IMPORTANT RULES:
1. invoice_number: The unique invoice/document number (NOT VAT ID, NOT customer ID)
2. invoice_date: Format as YYYY-MM-DD
3. vendor_name: The company that issued the invoice
4. currency: EUR, USD, or GBP
5. net_amount: Amount before tax
6. vat_amount: Tax/VAT amount
7. total_amount: Final total (gross amount)
Return ONLY this JSON format, no explanation:
{
"invoice_number": "INV-2024-001",
"invoice_date": "2024-01-15",
"vendor_name": "Company Name",
"currency": "EUR",
"net_amount": 100.00,
"vat_amount": 19.00,
"total_amount": 119.00
}
INVOICE TEXT:
`;
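// The invoice markdown is appended after "INVOICE TEXT:" at call time
// (see extractInvoiceFromMarkdown below).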
/**
 * Convert PDF pages to base64-encoded PNG images (requires ImageMagick's `convert` on PATH)
*/
function convertPdfToImages(pdfPath: string): string[] {
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
const outputPattern = path.join(tempDir, 'page-%d.png');
try {
execSync(
`convert -density 150 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
{ stdio: 'pipe' }
);
const files = fs.readdirSync(tempDir).filter((f) => f.endsWith('.png')).sort();
const images: string[] = [];
for (const file of files) {
const imagePath = path.join(tempDir, file);
const imageData = fs.readFileSync(imagePath);
images.push(imageData.toString('base64'));
}
return images;
} finally {
fs.rmSync(tempDir, { recursive: true, force: true });
}
}
/**
* Convert a single page to markdown using Nanonets-OCR-s
*/
async function convertPageToMarkdown(image: string, pageNum: number): Promise<string> {
const startTime = Date.now();
const response = await fetch(`${NANONETS_URL}/chat/completions`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': 'Bearer dummy',
},
body: JSON.stringify({
model: NANONETS_MODEL,
messages: [{
role: 'user',
content: [
{ type: 'image_url', image_url: { url: `data:image/png;base64,${image}` }},
{ type: 'text', text: NANONETS_OCR_PROMPT },
],
}],
max_tokens: 4096,
temperature: 0.0,
}),
});
const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
if (!response.ok) {
const errorText = await response.text();
throw new Error(`Nanonets API error: ${response.status} - ${errorText}`);
}
const data = await response.json();
const content = (data.choices?.[0]?.message?.content || '').trim();
console.log(` Page ${pageNum}: ${content.length} chars (${elapsed}s)`);
return content;
}
/**
* Convert all pages of a document to markdown
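 *
 * Output shape for a two-page document (pages are joined by a blank line):
 *   --- PAGE 1 ---
 *   <page 1 markdown>
 *
 *   --- PAGE 2 ---
 *   <page 2 markdown>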
*/
async function convertDocumentToMarkdown(images: string[], docName: string): Promise<string> {
console.log(` [${docName}] Converting ${images.length} page(s)...`);
const markdownPages: string[] = [];
for (let i = 0; i < images.length; i++) {
const markdown = await convertPageToMarkdown(images[i], i + 1);
markdownPages.push(`--- PAGE ${i + 1} ---\n${markdown}`);
}
const fullMarkdown = markdownPages.join('\n\n');
console.log(` [${docName}] Complete: ${fullMarkdown.length} chars total`);
return fullMarkdown;
}
/**
 * Stop the Nanonets container (frees GPU memory before Stage 2)
*/
function stopNanonets(): void {
console.log(' [Docker] Stopping Nanonets container...');
try {
execSync('docker stop nanonets-test 2>/dev/null || true', { stdio: 'pipe' });
execSync('sleep 5', { stdio: 'pipe' });
console.log(' [Docker] Nanonets stopped');
} catch {
console.log(' [Docker] Nanonets was not running');
}
}
/**
 * Ensure the Qwen3 model is available in Ollama (pulls it if missing)
*/
async function ensureQwen3(): Promise<boolean> {
try {
const response = await fetch(`${OLLAMA_URL}/api/tags`);
if (response.ok) {
const data = await response.json();
const models = data.models || [];
if (models.some((m: { name: string }) => m.name === QWEN_MODEL)) {
console.log(` [Ollama] Model available: ${QWEN_MODEL}`);
return true;
}
}
} catch {
return false;
}
console.log(` [Ollama] Pulling ${QWEN_MODEL}...`);
const pullResponse = await fetch(`${OLLAMA_URL}/api/pull`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ name: QWEN_MODEL, stream: false }),
});
return pullResponse.ok;
}
/**
 * Parse an amount from a string or number (handles European decimal-comma format)
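 *
 * Examples (per the heuristic below):
 *   parseAmount('1.234,56 EUR') -> 1234.56  (comma after last dot = decimal comma)
 *   parseAmount('1,234.56')     -> 1234.56  (comma before dot = thousands separator)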
*/
function parseAmount(s: string | number | undefined): number {
if (s === undefined || s === null) return 0;
if (typeof s === 'number') return s;
const match = s.match(/([\d.,]+)/);
if (!match) return 0;
const numStr = match[1];
const normalized = numStr.includes(',') && numStr.indexOf(',') > numStr.lastIndexOf('.')
? numStr.replace(/\./g, '').replace(',', '.')
: numStr.replace(/,/g, '');
return parseFloat(normalized) || 0;
}
/**
* Extract invoice number from potentially verbose response
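 *
 * Examples (per the patterns below):
 *   extractInvoiceNumber('**RE2024001234**')     -> 'RE2024001234'  (2-3 letters + 10+ digits)
 *   extractInvoiceNumber('Invoice no. 12345678') -> '12345678'      (bare 7+ digit fallback)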
*/
function extractInvoiceNumber(s: string | undefined): string {
if (!s) return '';
  const clean = s.replace(/\*\*/g, '').replace(/`/g, '').trim();
const patterns = [
/\b([A-Z]{2,3}\d{10,})\b/i,
/\b([A-Z]\d{8,})\b/i,
/\b(INV[-\s]?\d{4}[-\s]?\d+)\b/i,
/\b(\d{7,})\b/,
];
for (const pattern of patterns) {
const match = clean.match(pattern);
if (match) return match[1];
}
return clean.replace(/[^A-Z0-9-]/gi, '').trim() || clean;
}
/**
* Extract date (YYYY-MM-DD) from response
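 *
 * Examples:
 *   extractDate('**2024-01-15**') -> '2024-01-15'
 *   extractDate('15.01.2024')     -> '2024-01-15'  (DD.MM.YYYY reordered to ISO)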
*/
function extractDate(s: string | undefined): string {
if (!s) return '';
  const clean = s.replace(/\*\*/g, '').replace(/`/g, '').trim();
const isoMatch = clean.match(/(\d{4}-\d{2}-\d{2})/);
if (isoMatch) return isoMatch[1];
const dmyMatch = clean.match(/(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})/);
if (dmyMatch) {
return `${dmyMatch[3]}-${dmyMatch[2].padStart(2, '0')}-${dmyMatch[1].padStart(2, '0')}`;
}
return clean.replace(/[^\d-]/g, '').trim();
}
/**
* Extract currency
*/
function extractCurrency(s: string | undefined): string {
if (!s) return 'EUR';
const upper = s.toUpperCase();
if (upper.includes('EUR') || upper.includes('€')) return 'EUR';
if (upper.includes('USD') || upper.includes('$')) return 'USD';
if (upper.includes('GBP') || upper.includes('£')) return 'GBP';
return 'EUR';
}
/**
 * Extract a JSON object from a model response (strips <think> blocks and code fences)
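 *
 * Handles typical model output shapes, e.g.:
 *   '<think>...</think>```json\n{"total_amount": 119.0}\n```'
 *   'Here is the result: {"total_amount": 119.0}'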
*/
function extractJsonFromResponse(response: string): Record<string, unknown> | null {
  const cleanResponse = response.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
const codeBlockMatch = cleanResponse.match(/```(?:json)?\s*([\s\S]*?)```/);
const jsonStr = codeBlockMatch ? codeBlockMatch[1].trim() : cleanResponse;
try {
return JSON.parse(jsonStr);
} catch {
const jsonMatch = jsonStr.match(/\{[\s\S]*\}/);
if (jsonMatch) {
try {
return JSON.parse(jsonMatch[0]);
} catch {
return null;
}
}
return null;
}
}
/**
* Parse JSON response into IInvoice
*/
function parseJsonToInvoice(response: string): IInvoice | null {
const parsed = extractJsonFromResponse(response);
if (!parsed) return null;
return {
invoice_number: extractInvoiceNumber(String(parsed.invoice_number || '')),
invoice_date: extractDate(String(parsed.invoice_date || '')),
vendor_name: String(parsed.vendor_name || '').replace(/\*\*/g, '').replace(/`/g, '').trim(),
currency: extractCurrency(String(parsed.currency || '')),
net_amount: parseAmount(parsed.net_amount as string | number),
vat_amount: parseAmount(parsed.vat_amount as string | number),
total_amount: parseAmount(parsed.total_amount as string | number),
};
}
/**
* Extract invoice from markdown using Qwen3
*/
async function extractInvoiceFromMarkdown(markdown: string, queryId: string): Promise<IInvoice | null> {
console.log(` [${queryId}] Sending to ${QWEN_MODEL}...`);
const startTime = Date.now();
const response = await fetch(`${OLLAMA_URL}/api/chat`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
    signal: AbortSignal.timeout(600000), // 10-minute timeout for large documents
body: JSON.stringify({
model: QWEN_MODEL,
messages: [{
role: 'user',
content: JSON_EXTRACTION_PROMPT + markdown,
}],
stream: false,
options: {
num_predict: 2000,
temperature: 0.1,
},
}),
});
const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
if (!response.ok) {
console.log(` [${queryId}] ERROR: ${response.status} (${elapsed}s)`);
throw new Error(`Ollama API error: ${response.status}`);
}
const data = await response.json();
const content = (data.message?.content || '').trim();
console.log(` [${queryId}] Response: ${content.length} chars (${elapsed}s)`);
return parseJsonToInvoice(content);
}
/**
* Compare two invoices for consensus
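 *
 * Totals are compared with a 0.02 tolerance to absorb cent-level rounding
 * differences between the two passes.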
*/
function invoicesMatch(a: IInvoice, b: IInvoice): boolean {
const numMatch = a.invoice_number.toLowerCase() === b.invoice_number.toLowerCase();
const dateMatch = a.invoice_date === b.invoice_date;
const totalMatch = Math.abs(a.total_amount - b.total_amount) < 0.02;
return numMatch && dateMatch && totalMatch;
}
/**
 * Extract with consensus: two independent passes must agree on number, date,
 * and total within up to 3 attempts; otherwise fall back to a single pass.
*/
async function extractWithConsensus(markdown: string, docName: string): Promise<IInvoice> {
const MAX_ATTEMPTS = 3;
for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
console.log(` [${docName}] Attempt ${attempt}/${MAX_ATTEMPTS}`);
const inv1 = await extractInvoiceFromMarkdown(markdown, `${docName}-A${attempt}Q1`);
const inv2 = await extractInvoiceFromMarkdown(markdown, `${docName}-A${attempt}Q2`);
if (!inv1 || !inv2) {
console.log(` [${docName}] Parsing failed, retrying...`);
continue;
}
console.log(` [${docName}] Q1: ${inv1.invoice_number} | ${inv1.invoice_date} | ${inv1.total_amount}`);
console.log(` [${docName}] Q2: ${inv2.invoice_number} | ${inv2.invoice_date} | ${inv2.total_amount}`);
if (invoicesMatch(inv1, inv2)) {
console.log(` [${docName}] CONSENSUS`);
return inv2;
}
console.log(` [${docName}] No consensus`);
}
  // Fallback: accept a single extraction once consensus attempts are exhausted
const fallback = await extractInvoiceFromMarkdown(markdown, `${docName}-FALLBACK`);
if (fallback) {
console.log(` [${docName}] FALLBACK: ${fallback.invoice_number} | ${fallback.invoice_date} | ${fallback.total_amount}`);
return fallback;
}
return {
invoice_number: '',
invoice_date: '',
vendor_name: '',
currency: 'EUR',
net_amount: 0,
vat_amount: 0,
total_amount: 0,
};
}
/**
* Normalize date to YYYY-MM-DD
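 *
 * Examples:
 *   normalizeDate('15-JAN-2024') -> '2024-01-15'
 *   normalizeDate('15/01/2024')  -> '2024-01-15'
 *   normalizeDate('2024-01-15')  -> '2024-01-15'  (already ISO; returned unchanged)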
*/
function normalizeDate(dateStr: string | null): string {
if (!dateStr) return '';
if (/^\d{4}-\d{2}-\d{2}$/.test(dateStr)) return dateStr;
const monthMap: Record<string, string> = {
JAN: '01', FEB: '02', MAR: '03', APR: '04', MAY: '05', JUN: '06',
JUL: '07', AUG: '08', SEP: '09', OCT: '10', NOV: '11', DEC: '12',
};
let match = dateStr.match(/^(\d{1,2})-([A-Z]{3})-(\d{4})$/i);
if (match) {
return `${match[3]}-${monthMap[match[2].toUpperCase()] || '01'}-${match[1].padStart(2, '0')}`;
}
match = dateStr.match(/^(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})$/);
if (match) {
return `${match[3]}-${match[2].padStart(2, '0')}-${match[1].padStart(2, '0')}`;
}
return dateStr;
}
/**
* Compare extracted invoice against expected
*/
function compareInvoice(
extracted: IInvoice,
expected: IInvoice
): { match: boolean; errors: string[] } {
const errors: string[] = [];
const extNum = extracted.invoice_number?.replace(/\s/g, '').toLowerCase() || '';
const expNum = expected.invoice_number?.replace(/\s/g, '').toLowerCase() || '';
if (extNum !== expNum) {
errors.push(`invoice_number: exp "${expected.invoice_number}", got "${extracted.invoice_number}"`);
}
if (normalizeDate(extracted.invoice_date) !== normalizeDate(expected.invoice_date)) {
errors.push(`invoice_date: exp "${expected.invoice_date}", got "${extracted.invoice_date}"`);
}
if (Math.abs(extracted.total_amount - expected.total_amount) > 0.02) {
errors.push(`total_amount: exp ${expected.total_amount}, got ${extracted.total_amount}`);
}
if (extracted.currency?.toUpperCase() !== expected.currency?.toUpperCase()) {
errors.push(`currency: exp "${expected.currency}", got "${extracted.currency}"`);
}
return { match: errors.length === 0, errors };
}
/**
 * Find all test cases: paired .pdf/.json fixtures in .nogit/invoices
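 *
 * Expected layout (hypothetical names for illustration):
 *   .nogit/invoices/acme-2024-001.pdf   (source document)
 *   .nogit/invoices/acme-2024-001.json  (expected IInvoice values)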
*/
function findTestCases(): ITestCase[] {
const testDir = path.join(process.cwd(), '.nogit/invoices');
if (!fs.existsSync(testDir)) return [];
const files = fs.readdirSync(testDir);
const testCases: ITestCase[] = [];
for (const pdf of files.filter((f) => f.endsWith('.pdf'))) {
const baseName = pdf.replace('.pdf', '');
const jsonFile = `${baseName}.json`;
if (files.includes(jsonFile)) {
testCases.push({
name: baseName,
pdfPath: path.join(testDir, pdf),
jsonPath: path.join(testDir, jsonFile),
});
}
}
return testCases.sort((a, b) => a.name.localeCompare(b.name));
}
// ============ TESTS ============
const testCases = findTestCases();
console.log(`\nFound ${testCases.length} invoice test cases\n`);
// Ensure temp directory exists
if (!fs.existsSync(TEMP_MD_DIR)) {
fs.mkdirSync(TEMP_MD_DIR, { recursive: true });
}
// -------- STAGE 1: OCR with Nanonets --------
tap.test('Stage 1: Setup Nanonets', async () => {
console.log('\n========== STAGE 1: Nanonets OCR ==========\n');
const ok = await ensureNanonetsOcr();
expect(ok).toBeTrue();
});
tap.test('Stage 1: Convert all invoices to markdown', async () => {
console.log('\n Converting all invoice PDFs to markdown with Nanonets-OCR-s...\n');
for (const tc of testCases) {
console.log(`\n === ${tc.name} ===`);
const images = convertPdfToImages(tc.pdfPath);
console.log(` Pages: ${images.length}`);
const markdown = await convertDocumentToMarkdown(images, tc.name);
const mdPath = path.join(TEMP_MD_DIR, `${tc.name}.md`);
fs.writeFileSync(mdPath, markdown);
tc.markdownPath = mdPath;
console.log(` Saved: ${mdPath}`);
}
console.log('\n Stage 1 complete: All invoices converted to markdown\n');
});
tap.test('Stage 1: Stop Nanonets', async () => {
stopNanonets();
await new Promise(resolve => setTimeout(resolve, 3000));
expect(isContainerRunning('nanonets-test')).toBeFalse();
});
// -------- STAGE 2: Extraction with Qwen3 --------
tap.test('Stage 2: Setup Ollama + Qwen3', async () => {
console.log('\n========== STAGE 2: Qwen3 Extraction ==========\n');
const ollamaOk = await ensureMiniCpm();
expect(ollamaOk).toBeTrue();
const qwenOk = await ensureQwen3();
expect(qwenOk).toBeTrue();
});
let passedCount = 0;
let failedCount = 0;
const processingTimes: number[] = [];
for (const tc of testCases) {
tap.test(`Stage 2: Extract ${tc.name}`, async () => {
const expected: IInvoice = JSON.parse(fs.readFileSync(tc.jsonPath, 'utf-8'));
console.log(`\n === ${tc.name} ===`);
console.log(` Expected: ${expected.invoice_number} | ${expected.invoice_date} | ${expected.total_amount} ${expected.currency}`);
const startTime = Date.now();
const mdPath = path.join(TEMP_MD_DIR, `${tc.name}.md`);
if (!fs.existsSync(mdPath)) {
throw new Error(`Markdown not found: ${mdPath}. Run Stage 1 first.`);
}
const markdown = fs.readFileSync(mdPath, 'utf-8');
console.log(` Markdown: ${markdown.length} chars`);
const extracted = await extractWithConsensus(markdown, tc.name);
const elapsedMs = Date.now() - startTime;
processingTimes.push(elapsedMs);
console.log(` Extracted: ${extracted.invoice_number} | ${extracted.invoice_date} | ${extracted.total_amount} ${extracted.currency}`);
const result = compareInvoice(extracted, expected);
if (result.match) {
passedCount++;
console.log(` Result: MATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
} else {
failedCount++;
console.log(` Result: MISMATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
result.errors.forEach(e => console.log(` - ${e}`));
}
expect(result.match).toBeTrue();
});
}
tap.test('Summary', async () => {
const totalInvoices = testCases.length;
const accuracy = totalInvoices > 0 ? (passedCount / totalInvoices) * 100 : 0;
const totalTimeMs = processingTimes.reduce((a, b) => a + b, 0);
const avgTimeSec = processingTimes.length > 0 ? totalTimeMs / processingTimes.length / 1000 : 0;
console.log(`\n========================================`);
console.log(` Invoice Summary (Nanonets + Qwen3)`);
console.log(`========================================`);
console.log(` Stage 1: Nanonets-OCR-s (doc -> md)`);
console.log(` Stage 2: Qwen3 8B (md -> JSON)`);
console.log(` Passed: ${passedCount}/${totalInvoices}`);
console.log(` Failed: ${failedCount}/${totalInvoices}`);
console.log(` Accuracy: ${accuracy.toFixed(1)}%`);
console.log(`----------------------------------------`);
console.log(` Total time: ${(totalTimeMs / 1000).toFixed(1)}s`);
console.log(` Avg per inv: ${avgTimeSec.toFixed(1)}s`);
console.log(`========================================\n`);
  // Clean up temp files
try {
fs.rmSync(TEMP_MD_DIR, { recursive: true, force: true });
console.log(` Cleaned up temp directory: ${TEMP_MD_DIR}\n`);
} catch {
// Ignore
}
});
export default tap.start();