feat(vision): add Qwen3-VL vision model support with Dockerfile and tests; improve invoice OCR conversion and prompts; simplify extraction flow by removing consensus voting

This commit is contained in:
2026-01-18 03:35:05 +00:00
parent d237ad19f4
commit 3780105c6f
6 changed files with 435 additions and 70 deletions

View File

@@ -89,25 +89,13 @@ async function parseDocument(imageBase64: string): Promise<string> {
return data.result?.html || '';
}
/**
 * Strip OCR noise from HTML before it is handed to the LLM.
 * Deliberately conservative: only two patterns are rewritten, everything
 * else passes through untouched.
 *
 * @param html - Raw HTML produced by the OCR step.
 * @returns The HTML with long repeated-digit runs collapsed and oversized
 *          alphanumeric tokens replaced by a placeholder.
 */
function sanitizeHtml(html: string): string {
// The same digit repeated 21 or more times in a row: keep one copy plus an ellipsis.
const repeatedDigitRun = /(\d)\1{20,}/g;
// A standalone run of 50 or more letters/digits is treated as corrupted data.
const oversizedToken = /\b[A-Za-z0-9]{50,}\b/g;
return html
.replace(repeatedDigitRun, '$1...')
.replace(oversizedToken, '[OCR_ARTIFACT]');
}
/**
* Extract invoice fields using simple direct prompt
* The OCR output has clearly labeled fields - just ask the LLM to read them
*/
async function extractInvoiceFromHtml(html: string): Promise<IInvoice> {
const sanitized = sanitizeHtml(html);
const truncated = sanitized.length > 32000 ? sanitized.slice(0, 32000) : sanitized;
// OCR output is already good - just truncate if too long
const truncated = html.length > 32000 ? html.slice(0, 32000) : html;
console.log(` [Extract] ${truncated.length} chars of HTML`);
// JSON schema for structured output