feat(vision): add Qwen3-VL vision model support with Dockerfile and tests; improve invoice OCR conversion and prompts; simplify extraction flow by removing consensus voting

2026-01-18 03:35:05 +00:00
parent d237ad19f4
commit 3780105c6f
6 changed files with 435 additions and 70 deletions

Dockerfile_qwen3vl (new file, 26 lines added)

@@ -0,0 +1,26 @@
# Qwen3-VL-30B-A3B Vision Language Model
# Q4_K_M quantization (~20GB model)
#
# Most powerful Qwen vision model:
# - 256K context (expandable to 1M)
# - Visual agent capabilities
# - Code generation from images
#
# Build: docker build -f Dockerfile_qwen3vl -t qwen3vl .
# Run:   docker run --gpus all -p 11434:11434 -v ht-ollama-models:/root/.ollama qwen3vl

FROM ollama/ollama:latest

# Pre-pull the model during build (optional - can also pull at runtime)
# This makes the image larger but faster to start
# RUN ollama serve & sleep 5 && ollama pull qwen3-vl:30b-a3b && pkill ollama

# Expose Ollama API port
EXPOSE 11434

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:11434/api/tags || exit 1

# Start Ollama server (base image's entrypoint is the ollama binary)
CMD ["serve"]