feat(invoices): add hybrid OCR + vision invoice/document parsing with PaddleOCR, consensus voting, and prompt/test refactors
This commit is contained in:
@@ -6,8 +6,11 @@ import * as os from 'os';
|
||||
|
||||
const OLLAMA_URL = 'http://localhost:11434';
|
||||
const MODEL = 'openbmb/minicpm-v4.5:q8_0';
|
||||
const PADDLEOCR_URL = 'http://localhost:5000';
|
||||
|
||||
const EXTRACT_PROMPT = `You are a bank statement parser. Extract EVERY transaction from the table.
|
||||
// Prompt for visual extraction (with images)
|
||||
const VISUAL_EXTRACT_PROMPT = `/nothink
|
||||
You are a bank statement parser. Extract EVERY transaction from the table.
|
||||
|
||||
Read the Amount column carefully:
|
||||
- "- 21,47 €" means DEBIT, output as: -21.47
|
||||
@@ -18,6 +21,60 @@ For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
|
||||
|
||||
Do not skip any rows. Return ONLY the JSON array, no explanation.`;
|
||||
|
||||
// Prompt for OCR-only extraction (no images)
|
||||
const OCR_EXTRACT_PROMPT = `/nothink
|
||||
You are a bank statement parser. Extract EVERY transaction from the OCR text below.
|
||||
|
||||
Read the Amount values carefully:
|
||||
- "- 21,47 €" means DEBIT, output as: -21.47
|
||||
- "+ 1.000,00 €" means CREDIT, output as: 1000.00
|
||||
- European format: comma = decimal point
|
||||
|
||||
For each transaction output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
|
||||
|
||||
Do not skip any transactions. Return ONLY the JSON array, no explanation.`;
|
||||
|
||||
/**
 * Build the prompt for OCR-only extraction (no images).
 *
 * The OCR text is appended to OCR_EXTRACT_PROMPT between `---` delimiter lines.
 * Text longer than `maxOcrLength` characters is shortened to limit prompt size
 * and marked with a trailing "... (truncated)" line. The cut is moved back to
 * the last line break inside the limit so that a row is not split in the middle
 * of a date or amount, which could otherwise be read as a different value.
 *
 * @param ocrText - OCR text of the bank statement.
 * @param maxOcrLength - Maximum number of OCR characters to embed; defaults to 12000.
 * @returns The full prompt string (instructions followed by the delimited OCR text).
 */
function buildOcrOnlyPrompt(ocrText: string, maxOcrLength: number = 12000): string {
  let truncatedOcr = ocrText;

  if (ocrText.length > maxOcrLength) {
    let kept = ocrText.substring(0, maxOcrLength);

    // Unless the limit falls exactly on a line boundary, drop the partial last line.
    if (ocrText.charAt(maxOcrLength) !== '\n') {
      const lastBreak = kept.lastIndexOf('\n');
      // With no usable line break inside the limit, fall back to the plain hard cut.
      if (lastBreak > 0) {
        kept = kept.substring(0, lastBreak);
      }
    }

    truncatedOcr = kept + '\n... (truncated)';
  }

  return `${OCR_EXTRACT_PROMPT}

OCR text from bank statement:
---
${truncatedOcr}
---`;
}
|
||||
|
||||
/**
 * Extract OCR text from an image using PaddleOCR.
 *
 * POSTs the base64 image as JSON (`{ image }`) to `${PADDLEOCR_URL}/ocr` and
 * joins the `text` of every entry in the response's `results` with newlines.
 *
 * This is deliberately best-effort: a non-2xx status, an unreachable service or
 * an unexpected payload all yield an empty string. The reason is logged so that
 * a missing OCR result can be told apart from a page that simply has no text.
 *
 * @param imageBase64 - Base64-encoded image to recognise.
 * @returns The recognised text, one result per line, or '' when nothing could be extracted.
 */
async function extractOcrText(imageBase64: string): Promise<string> {
  try {
    const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ image: imageBase64 }),
    });

    if (!response.ok) {
      console.log(`[OCR] PaddleOCR request failed with HTTP ${response.status}`);
      return '';
    }

    // The payload comes from an external service, so check its shape before using it.
    const data = (await response.json()) as { success?: unknown; results?: unknown } | null;
    if (data?.success && Array.isArray(data.results)) {
      return data.results
        .map((entry: unknown) => (entry as { text?: unknown } | null)?.text)
        // Skip entries without a string `text` instead of emitting "undefined" lines.
        .filter((text): text is string => typeof text === 'string')
        .join('\n');
    }
  } catch (err) {
    // PaddleOCR unavailable (or returned a body that is not valid JSON)
    const reason = err instanceof Error ? err.message : String(err);
    console.log(`[OCR] PaddleOCR unavailable: ${reason}`);
  }
  return '';
}
|
||||
|
||||
interface ITransaction {
|
||||
date: string;
|
||||
counterparty: string;
|
||||
@@ -53,12 +110,12 @@ function convertPdfToImages(pdfPath: string): string[] {
|
||||
}
|
||||
|
||||
/**
|
||||
* Single extraction pass
|
||||
* Visual extraction pass (with images)
|
||||
*/
|
||||
async function extractOnce(images: string[], passNum: number): Promise<ITransaction[]> {
|
||||
async function extractVisual(images: string[], passLabel: string): Promise<ITransaction[]> {
|
||||
const payload = {
|
||||
model: MODEL,
|
||||
prompt: EXTRACT_PROMPT,
|
||||
prompt: VISUAL_EXTRACT_PROMPT,
|
||||
images,
|
||||
stream: true,
|
||||
options: {
|
||||
@@ -67,6 +124,31 @@ async function extractOnce(images: string[], passNum: number): Promise<ITransact
|
||||
},
|
||||
};
|
||||
|
||||
return doExtraction(payload, passLabel);
|
||||
}
|
||||
|
||||
/**
 * OCR-only extraction pass: no images are sent, only text.
 *
 * Wraps the OCR text in the OCR-only prompt via buildOcrOnlyPrompt and hands the
 * resulting streaming request payload to doExtraction.
 *
 * @param ocrText - OCR text of the statement to extract transactions from.
 * @param passLabel - Label forwarded to doExtraction to identify this pass.
 * @returns The transactions produced by doExtraction for this pass.
 */
async function extractFromOcr(ocrText: string, passLabel: string): Promise<ITransaction[]> {
  const prompt = buildOcrOnlyPrompt(ocrText);
  const generationOptions = {
    num_predict: 16384,
    temperature: 0.1,
  };

  // Same request shape as the visual pass, minus the `images` field.
  return doExtraction(
    { model: MODEL, prompt, stream: true, options: generationOptions },
    passLabel,
  );
}
|
||||
|
||||
/**
|
||||
* Common extraction logic
|
||||
*/
|
||||
async function doExtraction(payload: object, passLabel: string): Promise<ITransaction[]> {
|
||||
|
||||
const response = await fetch(`${OLLAMA_URL}/api/generate`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
@@ -86,7 +168,7 @@ async function extractOnce(images: string[], passNum: number): Promise<ITransact
|
||||
let fullText = '';
|
||||
let lineBuffer = '';
|
||||
|
||||
console.log(`[Pass ${passNum}] Extracting...`);
|
||||
console.log(`[${passLabel}] Extracting...`);
|
||||
|
||||
while (true) {
|
||||
const { done, value } = await reader.read();
|
||||
@@ -144,30 +226,78 @@ function hashTransactions(transactions: ITransaction[]): string {
|
||||
|
||||
/**
|
||||
* Extract with majority voting - run until 2 passes match
|
||||
* Strategy: Pass 1 = Visual (images), Pass 2 = OCR-only (text), Pass 3+ = Visual
|
||||
*/
|
||||
async function extractWithConsensus(images: string[], maxPasses: number = 5): Promise<ITransaction[]> {
|
||||
const results: Array<{ transactions: ITransaction[]; hash: string }> = [];
|
||||
const hashCounts: Map<string, number> = new Map();
|
||||
|
||||
for (let pass = 1; pass <= maxPasses; pass++) {
|
||||
const transactions = await extractOnce(images, pass);
|
||||
const addResult = (transactions: ITransaction[], passLabel: string): number => {
|
||||
const hash = hashTransactions(transactions);
|
||||
|
||||
results.push({ transactions, hash });
|
||||
hashCounts.set(hash, (hashCounts.get(hash) || 0) + 1);
|
||||
console.log(`[${passLabel}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)`);
|
||||
return hashCounts.get(hash)!;
|
||||
};
|
||||
|
||||
console.log(`[Pass ${pass}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)`);
|
||||
// Run Pass 1 (Visual) in parallel with OCR extraction
|
||||
let ocrText = '';
|
||||
const pass1Promise = extractVisual(images, 'Pass 1 Visual').catch((err) => ({ error: err }));
|
||||
|
||||
// Check if we have consensus (2+ matching)
|
||||
const count = hashCounts.get(hash)!;
|
||||
if (count >= 2) {
|
||||
console.log(`[Consensus] Reached after ${pass} passes (${count} matching results)`);
|
||||
return transactions;
|
||||
// Extract OCR from all pages
|
||||
const ocrPromise = (async () => {
|
||||
const ocrTexts: string[] = [];
|
||||
for (let i = 0; i < images.length; i++) {
|
||||
const pageOcr = await extractOcrText(images[i]);
|
||||
if (pageOcr) {
|
||||
ocrTexts.push(`--- Page ${i + 1} ---\n${pageOcr}`);
|
||||
}
|
||||
}
|
||||
ocrText = ocrTexts.join('\n\n');
|
||||
if (ocrText) {
|
||||
console.log(`[OCR] Extracted text from ${ocrTexts.length} page(s)`);
|
||||
}
|
||||
return ocrText;
|
||||
})();
|
||||
|
||||
// Wait for Pass 1 and OCR to complete
|
||||
const [pass1Result] = await Promise.all([pass1Promise, ocrPromise]);
|
||||
|
||||
// Process Pass 1 result
|
||||
if ('error' in pass1Result) {
|
||||
console.log(`[Pass 1] Error: ${(pass1Result as { error: unknown }).error}`);
|
||||
} else {
|
||||
addResult(pass1Result as ITransaction[], 'Pass 1 Visual');
|
||||
}
|
||||
|
||||
// Pass 2: OCR-only (no images) - faster, different approach
|
||||
if (ocrText) {
|
||||
try {
|
||||
const pass2Result = await extractFromOcr(ocrText, 'Pass 2 OCR-only');
|
||||
const count = addResult(pass2Result, 'Pass 2 OCR-only');
|
||||
if (count >= 2) {
|
||||
console.log(`[Consensus] Visual and OCR extractions match!`);
|
||||
return pass2Result;
|
||||
}
|
||||
} catch (err) {
|
||||
console.log(`[Pass 2 OCR-only] Error: ${err}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Continue with visual passes 3+ if no consensus yet
|
||||
for (let pass = 3; pass <= maxPasses; pass++) {
|
||||
try {
|
||||
const transactions = await extractVisual(images, `Pass ${pass} Visual`);
|
||||
const count = addResult(transactions, `Pass ${pass} Visual`);
|
||||
|
||||
if (count >= 2) {
|
||||
console.log(`[Consensus] Reached after ${pass} passes`);
|
||||
return transactions;
|
||||
}
|
||||
|
||||
// After 2 passes, if no match yet, continue
|
||||
if (pass >= 2) {
|
||||
console.log(`[Pass ${pass}] No consensus yet, trying again...`);
|
||||
} catch (err) {
|
||||
console.log(`[Pass ${pass}] Error: ${err}`);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -181,6 +311,10 @@ async function extractWithConsensus(images: string[], maxPasses: number = 5): Pr
|
||||
}
|
||||
}
|
||||
|
||||
if (!bestHash) {
|
||||
throw new Error('No valid results obtained');
|
||||
}
|
||||
|
||||
const best = results.find((r) => r.hash === bestHash)!;
|
||||
console.log(`[No consensus] Using most common result (${bestCount}/${maxPasses} passes)`);
|
||||
return best.transactions;
|
||||
|
||||
Reference in New Issue
Block a user