// Listing metadata from the original file viewer: 440 lines, 12 KiB, TypeScript
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execFileSync, execSync } from 'child_process';
import * as os from 'os';

/** Base URL of the Ollama HTTP API (`/api/tags`, `/api/generate`). */
const OLLAMA_URL = 'http://localhost:11434';

/** Ollama model tag sent as `model` with every generate request. */
const MODEL = 'openbmb/minicpm-v4.5:q8_0';

/** Base URL of the PaddleOCR HTTP service (`/ocr`). */
const PADDLEOCR_URL = 'http://localhost:5000';

/**
 * Prompt for visual extraction (with images).
 *
 * Asks the model for a bare JSON array of `{date, counterparty, amount}` objects,
 * with European-formatted amounts converted to signed decimal numbers.
 * NOTE(review): the leading `/nothink` line is presumably a model directive that
 * suppresses reasoning output - confirm against the model documentation.
 */
const VISUAL_EXTRACT_PROMPT = `/nothink
You are a bank statement parser. Extract EVERY transaction from the table.

Read the Amount column carefully:
- "- 21,47 €" means DEBIT, output as: -21.47
- "+ 1.000,00 €" means CREDIT, output as: 1000.00
- European format: comma = decimal point

For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}

Do not skip any rows. Return ONLY the JSON array, no explanation.`;

/**
 * Prompt for OCR-only extraction (no images).
 *
 * Same output contract as the visual prompt, but worded for OCR text that is
 * appended below it by `buildOcrOnlyPrompt`.
 * NOTE(review): the leading `/nothink` line is presumably a model directive that
 * suppresses reasoning output - confirm against the model documentation.
 */
const OCR_EXTRACT_PROMPT = `/nothink
You are a bank statement parser. Extract EVERY transaction from the OCR text below.

Read the Amount values carefully:
- "- 21,47 €" means DEBIT, output as: -21.47
- "+ 1.000,00 €" means CREDIT, output as: 1000.00
- European format: comma = decimal point

For each transaction output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}

Do not skip any transactions. Return ONLY the JSON array, no explanation.`;

/**
 * Build the prompt for the OCR-only extraction pass (no images).
 *
 * @param ocrText - OCR text of the bank statement.
 * @param maxOcrLength - Maximum number of characters of `ocrText` to embed.
 *   Longer input is cut at this length and marked with `... (truncated)`.
 * @returns `OCR_EXTRACT_PROMPT`, a blank line, and the (possibly truncated)
 *   OCR text framed by `---` delimiter lines.
 */
function buildOcrOnlyPrompt(ocrText: string, maxOcrLength: number = 12000): string {
  // Limit OCR text to prevent context overflow
  const truncatedOcr =
    ocrText.length > maxOcrLength
      ? ocrText.substring(0, maxOcrLength) + '\n... (truncated)'
      : ocrText;

  return `${OCR_EXTRACT_PROMPT}

OCR text from bank statement:
---
${truncatedOcr}
---`;
}

/**
 * Extract OCR text from an image using PaddleOCR.
 *
 * Posts the image to `${PADDLEOCR_URL}/ocr` and joins the `text` of every
 * returned result with newlines. This is deliberately best-effort: any
 * failure (service unreachable, non-2xx status, unexpected response shape)
 * yields an empty string instead of an exception.
 *
 * @param imageBase64 - Base64-encoded image, sent as the `image` field.
 * @returns The recognised text lines joined by `\n`, or `''` when nothing
 *   could be extracted.
 */
async function extractOcrText(imageBase64: string): Promise<string> {
  try {
    const response = await fetch(`${PADDLEOCR_URL}/ocr`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ image: imageBase64 }),
    });

    if (!response.ok) return '';

    // The response is external input: narrow it instead of trusting its shape.
    const data: unknown = await response.json();
    if (typeof data !== 'object' || data === null) return '';

    const { success, results } = data as { success?: unknown; results?: unknown };
    if (!success || !Array.isArray(results)) return '';

    return results
      .map((entry: unknown) =>
        typeof entry === 'object' && entry !== null ? (entry as { text?: unknown }).text : undefined,
      )
      // Drop entries without a string text so they do not show up as "undefined".
      .filter((text): text is string => typeof text === 'string')
      .join('\n');
  } catch {
    // PaddleOCR unavailable
  }
  return '';
}

/**
 * One bank-statement transaction, in the shape requested from the model and
 * loaded from the expected-results JSON files.
 */
interface ITransaction {
  /** Transaction date; the prompts request the `YYYY-MM-DD` format. */
  date: string;
  /** Name of the counterparty as printed on the statement. */
  counterparty: string;
  /** Signed amount; the prompts request negative numbers for debits. */
  amount: number;
}

/**
 * Order ImageMagick page files (`page-0.png`, `page-1.png`, ...) by their
 * numeric page index, so that `page-10.png` sorts after `page-2.png`.
 * Names without a trailing number sort last; ties fall back to string order.
 */
function comparePageFiles(a: string, b: string): number {
  const pageIndex = (name: string): number => {
    const match = /(\d+)\.png$/.exec(name);
    return match ? Number(match[1]) : Number.MAX_SAFE_INTEGER;
  };

  const diff = pageIndex(a) - pageIndex(b);
  if (diff !== 0) return diff;
  return a < b ? -1 : a > b ? 1 : 0;
}

/**
 * Convert PDF to PNG images using ImageMagick.
 *
 * Runs `convert` at 300 DPI with transparency flattened onto a white
 * background, writing one `page-<n>.png` per page into a fresh temporary
 * directory. The directory is removed again before returning, even when the
 * conversion fails.
 *
 * @param pdfPath - Path of the PDF file to render.
 * @returns Base64-encoded PNG data, one entry per page, in page order.
 * @throws If the `convert` command cannot be run or exits with an error.
 */
function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');

  try {
    // Argument array instead of a shell string: paths containing quotes, `$`
    // or backticks are passed through verbatim and cannot alter the command.
    execFileSync(
      'convert',
      ['-density', '300', '-quality', '100', pdfPath, '-background', 'white', '-alpha', 'remove', outputPattern],
      { stdio: 'pipe' }
    );

    return fs
      .readdirSync(tempDir)
      .filter((f) => f.endsWith('.png'))
      // Numeric ordering: a plain sort() would yield page-0, page-1, page-10, page-2, ...
      .sort(comparePageFiles)
      .map((file) => fs.readFileSync(path.join(tempDir, file)).toString('base64'));
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}

/**
 * Visual extraction pass (with images).
 *
 * Sends `VISUAL_EXTRACT_PROMPT` together with the page images to the model
 * as a streaming request and delegates the response handling to `doExtraction`.
 *
 * @param images - Base64-encoded page images passed as the request's `images`.
 * @param passLabel - Label used as the log prefix for this pass.
 * @returns The transactions parsed from the model output.
 */
async function extractVisual(images: string[], passLabel: string): Promise<ITransaction[]> {
  const payload = {
    model: MODEL,
    prompt: VISUAL_EXTRACT_PROMPT,
    images,
    stream: true,
    options: {
      num_predict: 16384,
      temperature: 0.1,
    },
  };

  return doExtraction(payload, passLabel);
}

/**
 * OCR-only extraction pass (no images, just text).
 *
 * Sends the prompt built by `buildOcrOnlyPrompt` to the model as a streaming
 * request and delegates the response handling to `doExtraction`.
 *
 * @param ocrText - OCR text of the statement to embed in the prompt.
 * @param passLabel - Label used as the log prefix for this pass.
 * @returns The transactions parsed from the model output.
 */
async function extractFromOcr(ocrText: string, passLabel: string): Promise<ITransaction[]> {
  const payload = {
    model: MODEL,
    prompt: buildOcrOnlyPrompt(ocrText),
    stream: true,
    options: {
      num_predict: 16384,
      temperature: 0.1,
    },
  };

  return doExtraction(payload, passLabel);
}

/**
 * Common extraction logic.
 *
 * Posts `payload` to `${OLLAMA_URL}/api/generate`, reads the streamed
 * response as newline-delimited JSON objects, concatenates their `response`
 * fields, echoes the generated text to the console line by line, and finally
 * parses the outermost `[...]` span of the generated text as JSON.
 *
 * @param payload - Request body for the generate endpoint.
 * @param passLabel - Label used as the log prefix for this pass.
 * @returns The parsed JSON array. Its elements are not validated against
 *   `ITransaction`.
 * @throws Error on a non-2xx status, a missing response body, when the text
 *   contains no `[...]` span, or when that span is not valid JSON.
 */
async function doExtraction(payload: object, passLabel: string): Promise<ITransaction[]> {
  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }

  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('No response body');
  }

  const decoder = new TextDecoder();
  let fullText = '';
  // Generated text not yet echoed because its line is still incomplete.
  let lineBuffer = '';
  // Tail of the byte stream that does not yet form a complete NDJSON line.
  let pendingJson = '';

  /** Parse one NDJSON line and append its `response` token to the output. */
  const consumeLine = (line: string): void => {
    if (!line.trim()) return;

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Skip invalid JSON lines
      return;
    }
    if (typeof parsed !== 'object' || parsed === null) return;

    const token = (parsed as { response?: unknown }).response;
    if (typeof token !== 'string' || token === '') return;

    fullText += token;
    lineBuffer += token;

    // Print complete lines
    if (lineBuffer.includes('\n')) {
      const parts = lineBuffer.split('\n');
      for (let i = 0; i < parts.length - 1; i++) {
        console.log(parts[i]);
      }
      lineBuffer = parts[parts.length - 1] ?? '';
    }
  };

  console.log(`[${passLabel}] Extracting...`);

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    // A network chunk can end in the middle of a JSON line. Keep the
    // unfinished tail and prepend it to the next chunk instead of parsing
    // (and losing) a fragment.
    pendingJson += decoder.decode(value, { stream: true });
    const lines = pendingJson.split('\n');
    pendingJson = lines.pop() ?? '';

    for (const line of lines) {
      consumeLine(line);
    }
  }

  // Flush bytes still buffered in the decoder and a final line that was not
  // newline-terminated.
  pendingJson += decoder.decode();
  consumeLine(pendingJson);

  if (lineBuffer) {
    console.log(lineBuffer);
  }
  console.log('');

  const startIdx = fullText.indexOf('[');
  const endIdx = fullText.lastIndexOf(']') + 1;

  if (startIdx < 0 || endIdx <= startIdx) {
    throw new Error('No JSON array found in response');
  }

  return JSON.parse(fullText.substring(startIdx, endIdx));
}

/**
 * Create a hash of transactions for comparison.
 *
 * Each transaction contributes `date|amount` with the amount fixed to two
 * decimals; the entries are sorted and joined with `;`, so the result does
 * not depend on transaction order. The counterparty is not part of the hash.
 *
 * @param transactions - Transactions to fingerprint.
 * @returns The fingerprint string; `''` for an empty list.
 */
function hashTransactions(transactions: ITransaction[]): string {
  return transactions
    .map((t) => `${t.date}|${t.amount.toFixed(2)}`)
    .sort()
    .join(';');
}

/**
 * Extract with majority voting - run until 2 passes match.
 * Strategy: Pass 1 = Visual (images), Pass 2 = OCR-only (text), Pass 3+ = Visual.
 *
 * Pass 1 runs concurrently with the per-page OCR. Pass 2 is skipped when OCR
 * produced no text. Two passes "match" when `hashTransactions` gives the same
 * fingerprint for both. A failing pass is logged and skipped.
 *
 * @param images - Base64-encoded page images of the statement.
 * @param maxPasses - Highest pass number to attempt (passes 1 and 2 always run).
 * @returns The first result confirmed by a second pass, or, without
 *   consensus, the result whose fingerprint occurred most often.
 * @throws Error when no pass produced a usable result.
 */
async function extractWithConsensus(images: string[], maxPasses: number = 5): Promise<ITransaction[]> {
  const results: Array<{ transactions: ITransaction[]; hash: string }> = [];
  const hashCounts = new Map<string, number>();

  /** Record one pass result and return how often its fingerprint has been seen. */
  const addResult = (transactions: ITransaction[], passLabel: string): number => {
    const hash = hashTransactions(transactions);
    const count = (hashCounts.get(hash) ?? 0) + 1;
    results.push({ transactions, hash });
    hashCounts.set(hash, count);
    console.log(`[${passLabel}] Got ${transactions.length} transactions (hash: ${hash.substring(0, 20)}...)`);
    return count;
  };

  // Run Pass 1 (Visual) in parallel with OCR extraction
  const pass1Promise: Promise<ITransaction[] | { error: unknown }> = extractVisual(
    images,
    'Pass 1 Visual'
  ).catch((err: unknown) => ({ error: err }));

  // Extract OCR from all pages
  const ocrPromise = (async (): Promise<string> => {
    const ocrTexts: string[] = [];
    for (let i = 0; i < images.length; i++) {
      const pageOcr = await extractOcrText(images[i]);
      if (pageOcr) {
        ocrTexts.push(`--- Page ${i + 1} ---\n${pageOcr}`);
      }
    }
    const joined = ocrTexts.join('\n\n');
    if (joined) {
      console.log(`[OCR] Extracted text from ${ocrTexts.length} page(s)`);
    }
    return joined;
  })();

  // Wait for Pass 1 and OCR to complete
  const [pass1Result, ocrText] = await Promise.all([pass1Promise, ocrPromise]);

  // Process Pass 1 result
  if (Array.isArray(pass1Result)) {
    try {
      // Hashing a malformed result can throw; treat that like any other
      // failed pass instead of aborting the whole vote.
      addResult(pass1Result, 'Pass 1 Visual');
    } catch (err) {
      console.log(`[Pass 1] Error: ${err}`);
    }
  } else {
    console.log(`[Pass 1] Error: ${pass1Result.error}`);
  }

  // Pass 2: OCR-only (no images) - faster, different approach
  if (ocrText) {
    try {
      const pass2Result = await extractFromOcr(ocrText, 'Pass 2 OCR-only');
      const count = addResult(pass2Result, 'Pass 2 OCR-only');
      if (count >= 2) {
        console.log(`[Consensus] Visual and OCR extractions match!`);
        return pass2Result;
      }
    } catch (err) {
      console.log(`[Pass 2 OCR-only] Error: ${err}`);
    }
  }

  // Continue with visual passes 3+ if no consensus yet
  for (let pass = 3; pass <= maxPasses; pass++) {
    try {
      const transactions = await extractVisual(images, `Pass ${pass} Visual`);
      const count = addResult(transactions, `Pass ${pass} Visual`);

      if (count >= 2) {
        console.log(`[Consensus] Reached after ${pass} passes`);
        return transactions;
      }

      console.log(`[Pass ${pass}] No consensus yet, trying again...`);
    } catch (err) {
      console.log(`[Pass ${pass}] Error: ${err}`);
    }
  }

  // No consensus reached - return the most common result
  let bestHash = '';
  let bestCount = 0;
  for (const [hash, count] of hashCounts) {
    if (count > bestCount) {
      bestCount = count;
      bestHash = hash;
    }
  }

  const best = bestHash ? results.find((r) => r.hash === bestHash) : undefined;
  if (!best) {
    throw new Error('No valid results obtained');
  }

  console.log(`[No consensus] Using most common result (${bestCount}/${maxPasses} passes)`);
  return best.transactions;
}

/**
 * Compare extracted transactions against expected.
 *
 * The comparison is positional: entry `i` of `extracted` is checked against
 * entry `i` of `expected`. Two entries match when the dates are equal and
 * the amounts differ by less than 0.01; the counterparty is not compared.
 *
 * @param extracted - Transactions produced by the extraction.
 * @param expected - Reference transactions.
 * @returns `matches` (number of matching positions), `total` (length of
 *   `expected`) and one `errors` message per mismatch, per missing entry,
 *   and one for surplus extracted entries.
 */
function compareTransactions(
  extracted: ITransaction[],
  expected: ITransaction[]
): { matches: number; total: number; errors: string[] } {
  const errors: string[] = [];
  let matches = 0;

  for (const [i, exp] of expected.entries()) {
    const ext = extracted[i];

    if (!ext) {
      errors.push(`Missing transaction ${i}: ${exp.date} ${exp.counterparty}`);
      continue;
    }

    const dateMatch = ext.date === exp.date;
    // Tolerance absorbs floating-point noise below one cent.
    const amountMatch = Math.abs(ext.amount - exp.amount) < 0.01;

    if (dateMatch && amountMatch) {
      matches++;
    } else {
      errors.push(
        `Mismatch at ${i}: expected ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`
      );
    }
  }

  if (extracted.length > expected.length) {
    errors.push(`Extra transactions: ${extracted.length - expected.length}`);
  }

  return { matches, total: expected.length, errors };
}

/**
 * Find all test cases (PDF + JSON pairs) in .nogit/.
 *
 * A test case is a `<name>.pdf` file with a `<name>.json` file next to it in
 * the `.nogit` directory of the current working directory. PDFs without a
 * JSON partner are ignored.
 *
 * @returns One entry per pair, sorted by file name, or an empty array when
 *   the directory does not exist.
 */
function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const testDir = path.join(process.cwd(), '.nogit');
  if (!fs.existsSync(testDir)) {
    return [];
  }

  const pdfExtension = '.pdf';
  const files = fs.readdirSync(testDir);
  const available = new Set(files);
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];

  // Sorted so the generated tests run in a stable order on every filesystem.
  for (const pdf of files.filter((f) => f.endsWith(pdfExtension)).sort()) {
    // Strip only the trailing extension; String.replace would remove the
    // first ".pdf" it finds, which is wrong for names like "a.pdf.v2.pdf".
    const baseName = pdf.slice(0, -pdfExtension.length);
    const jsonFile = `${baseName}.json`;
    if (available.has(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }

  return testCases;
}

// Tests

// Smoke test: the Ollama API answers on /api/tags and reports its models as an array.
tap.test('should connect to Ollama API', async () => {
  const response = await fetch(`${OLLAMA_URL}/api/tags`);
  expect(response.ok).toBeTrue();
  const data = await response.json();
  expect(data.models).toBeArray();
});

// Precondition: some model listed by /api/tags has "minicpm-v4.5" in its name.
tap.test('should have MiniCPM-V 4.5 model loaded', async () => {
  const response = await fetch(`${OLLAMA_URL}/api/tags`);
  // Fail with a clear assertion instead of a TypeError on an error response.
  expect(response.ok).toBeTrue();
  const data = await response.json();
  const modelNames = data.models.map((m: { name: string }) => m.name);
  expect(modelNames.some((name: string) => name.includes('minicpm-v4.5'))).toBeTrue();
});

// Dynamic test for each PDF/JSON pair
const testCases = findTestCases();
if (testCases.length === 0) {
  // Make it visible that the extraction tests did not run at all.
  console.log('No PDF/JSON test cases found in .nogit/ - skipping extraction tests');
}
for (const testCase of testCases) {
  tap.test(`should extract transactions from ${testCase.name}`, async () => {
    // Load expected transactions
    const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
    console.log(`\n=== ${testCase.name} ===`);
    console.log(`Expected: ${expected.length} transactions`);

    // Convert PDF to images
    console.log('Converting PDF to images...');
    const images = convertPdfToImages(testCase.pdfPath);
    console.log(`Converted: ${images.length} pages\n`);

    // Extract with consensus voting
    const extracted = await extractWithConsensus(images);
    console.log(`\nFinal: ${extracted.length} transactions`);

    // Compare results
    const result = compareTransactions(extracted, expected);
    console.log(`Accuracy: ${result.matches}/${result.total}`);

    if (result.errors.length > 0) {
      console.log('Errors:');
      result.errors.forEach((e) => console.log(`  - ${e}`));
    }

    // Assert high accuracy. An empty expected list would give 0/0 = NaN, so
    // it counts as fully accurate; the length check below still rejects
    // unexpected extractions.
    const accuracy = result.total === 0 ? 1 : result.matches / result.total;
    expect(accuracy).toBeGreaterThan(0.95);
    expect(extracted.length).toEqual(expected.length);
  });
}

// Start the tap runner with the tests registered above and export its result.
export default tap.start();