feat(vision): add Qwen3-VL vision model support with Dockerfile and tests; improve invoice OCR conversion and prompts; simplify extraction flow by removing consensus voting
This commit is contained in:
@@ -89,25 +89,13 @@ async function parseDocument(imageBase64: string): Promise<string> {
|
||||
return data.result?.html || '';
|
||||
}
|
||||
|
||||
/**
 * Sanitize HTML to remove OCR artifacts that confuse the LLM.
 *
 * Minimal cleaning - only two patterns are rewritten, in this order:
 *  1. A single digit repeated 21 or more times in a row is collapsed to that
 *     digit followed by `...` (only digits are matched, not arbitrary
 *     characters).
 *  2. An unbroken run of 50 or more ASCII letters/digits delimited by word
 *     boundaries is replaced with the literal marker `[OCR_ARTIFACT]`.
 *
 * Because the digit collapse runs first, a very long run of one digit ends up
 * as `d...` rather than `[OCR_ARTIFACT]`.
 *
 * @param html - HTML string to clean.
 * @returns The cleaned HTML; input containing neither pattern is returned
 *   unchanged.
 */
function sanitizeHtml(html: string): string {
  // Remove excessively repeated digits (OCR glitches):
  // one captured digit plus at least 20 further copies of that same digit.
  let sanitized = html.replace(/(\d)\1{20,}/g, '$1...');

  // Remove extremely long alphanumeric strings (corrupted data).
  sanitized = sanitized.replace(/\b[A-Za-z0-9]{50,}\b/g, '[OCR_ARTIFACT]');

  return sanitized;
}
|
||||
|
||||
/**
|
||||
* Extract invoice fields using simple direct prompt
|
||||
* The OCR output has clearly labeled fields - just ask the LLM to read them
|
||||
*/
|
||||
async function extractInvoiceFromHtml(html: string): Promise<IInvoice> {
|
||||
const sanitized = sanitizeHtml(html);
|
||||
const truncated = sanitized.length > 32000 ? sanitized.slice(0, 32000) : sanitized;
|
||||
// OCR output is already good - just truncate if too long
|
||||
const truncated = html.length > 32000 ? html.slice(0, 32000) : html;
|
||||
console.log(` [Extract] ${truncated.length} chars of HTML`);
|
||||
|
||||
// JSON schema for structured output
|
||||
|
||||
Reference in New Issue
Block a user