feat(vision): process pages separately and make Qwen3-VL vision extraction more robust

Add per-page parsing and safer JSON handling, reduce token usage, and move invoice extraction to a multi-query approach.

2026-01-18 04:50:57 +00:00
parent 63d72a52c9
commit e76768da55
3 changed files with 96 additions and 68 deletions
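For orientation, a minimal sketch of how the reworked extraction is intended to be driven. The call site is not part of this diff: processStatement is an illustrative name, while convertPdfToImages, extractTransactions, and ITransaction are taken from the file changed below, and the base64 assumption simply follows from what the Ollama images field expects.

// Hypothetical driver (not in this commit), assuming the functions shown in the diff below.
// convertPdfToImages() is assumed to return one base64-encoded image per PDF page, as the
// Ollama `images` field expects; extractTransactions() then queries Qwen3-VL once per page
// and concatenates the per-page results.
async function processStatement(pdfPath: string): Promise<ITransaction[]> {
  const pages = convertPdfToImages(pdfPath);   // one image string per statement page
  return extractTransactions(pages);           // sequential per-page extraction
}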


@@ -53,23 +53,14 @@ function convertPdfToImages(pdfPath: string): string[] {
 }
 /**
- * Extract transactions using Qwen3-VL vision
+ * Extract transactions from a single page
+ * Processes one page at a time to minimize thinking tokens
  */
-async function extractTransactions(images: string[]): Promise<ITransaction[]> {
-  console.log(` [Vision] Processing ${images.length} page(s) with Qwen3-VL`);
+async function extractTransactionsFromPage(image: string, pageNum: number): Promise<ITransaction[]> {
   const prompt = `/no_think
-Extract ALL transactions from this bank statement.
-Amount format:
-- "- 21,47 €" = DEBIT = -21.47
-- "+ 1.000,00 €" = CREDIT = 1000.00
-- European format: comma is decimal separator
-For each transaction: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
-Return ONLY a JSON array, no explanation:
-[{"date":"...","counterparty":"...","amount":0},...]`;
+Extract transactions from this bank statement page.
+Amount: "- 21,47 €" = -21.47, "+ 1.000,00 €" = 1000.00 (European format)
+Return JSON array only: [{"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47},...]`;
   const response = await fetch(`${OLLAMA_URL}/api/chat`, {
     method: 'POST',
@@ -79,26 +70,28 @@ Return ONLY a JSON array, no explanation:
       messages: [{
         role: 'user',
         content: prompt,
-        images: images,
+        images: [image],
       }],
       stream: false,
       think: false,
       options: {
-        num_predict: 8000,
+        num_predict: 4000,
         temperature: 0.1,
       },
     }),
   });
   if (!response.ok) {
-    const err = await response.text();
-    throw new Error(`Ollama API error: ${response.status} - ${err}`);
+    throw new Error(`Ollama API error: ${response.status}`);
   }
   const data = await response.json();
   let content = data.message?.content || '';
-  console.log(` [Vision] Got ${content.length} chars`);
+  if (!content) {
+    console.log(` [Page ${pageNum}] Empty response`);
+    return [];
+  }
   // Parse JSON array
   if (content.startsWith('```json')) content = content.slice(7);
@@ -110,10 +103,37 @@ Return ONLY a JSON array, no explanation:
   const endIdx = content.lastIndexOf(']') + 1;
   if (startIdx < 0 || endIdx <= startIdx) {
-    throw new Error(`No JSON array found: ${content.substring(0, 300)}`);
+    console.log(` [Page ${pageNum}] No JSON array found`);
+    return [];
   }
-  return JSON.parse(content.substring(startIdx, endIdx));
+  try {
+    const transactions = JSON.parse(content.substring(startIdx, endIdx));
+    console.log(` [Page ${pageNum}] Found ${transactions.length} transactions`);
+    return transactions;
+  } catch {
+    console.log(` [Page ${pageNum}] JSON parse error`);
+    return [];
+  }
 }
+/**
+ * Extract transactions using Qwen3-VL vision
+ * Processes each page separately to avoid thinking token exhaustion
+ */
+async function extractTransactions(images: string[]): Promise<ITransaction[]> {
+  console.log(` [Vision] Processing ${images.length} page(s) with Qwen3-VL`);
+  const allTransactions: ITransaction[] = [];
+  // Process pages sequentially to avoid overwhelming the model
+  for (let i = 0; i < images.length; i++) {
+    const pageTransactions = await extractTransactionsFromPage(images[i], i + 1);
+    allTransactions.push(...pageTransactions);
+  }
+  console.log(` [Vision] Total: ${allTransactions.length} transactions`);
+  return allTransactions;
+}
 /**