feat(vision): add Qwen3-VL vision model support with Dockerfile and tests; improve invoice OCR conversion and prompts; simplify extraction flow by removing consensus voting

2026-01-18 03:35:05 +00:00
parent d237ad19f4
commit 3780105c6f
6 changed files with 435 additions and 70 deletions
--- a/test/test.invoices.paddleocr-vl.ts
+++ b/test/test.invoices.paddleocr-vl.ts
@@ -89,25 +89,13 @@ async function parseDocument(imageBase64: string): Promise<string> {
  return data.result?.html || '';
 }

-/**
- * Sanitize HTML to remove OCR artifacts that confuse the LLM
- * Minimal cleaning - only remove truly problematic patterns
- */
-function sanitizeHtml(html: string): string {
-  // Remove excessively repeated characters (OCR glitches)
-  let sanitized = html.replace(/(\d)\1{20,}/g, '$1...');
-  // Remove extremely long strings (corrupted data)
-  sanitized = sanitized.replace(/\b[A-Za-z0-9]{50,}\b/g, '[OCR_ARTIFACT]');
-  return sanitized;
-}
-
 /**
 * Extract invoice fields using simple direct prompt
 * The OCR output has clearly labeled fields - just ask the LLM to read them
 */
 async function extractInvoiceFromHtml(html: string): Promise<IInvoice> {
-  const sanitized = sanitizeHtml(html);
-  const truncated = sanitized.length > 32000 ? sanitized.slice(0, 32000) : sanitized;
+  // No sanitization pass: use the OCR HTML as-is, only capping it at 32000 characters
+  const truncated = html.length > 32000 ? html.slice(0, 32000) : html;
  console.log(`    [Extract] ${truncated.length} chars of HTML`);

  // JSON schema for structured output