update
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
{
|
||||
"name": "@host.today/ht-docker-ai",
|
||||
"version": "1.0.0",
|
||||
"type": "module",
|
||||
"private": false,
|
||||
"description": "Docker images for AI vision-language models including MiniCPM-V 4.5",
|
||||
"main": "dist_ts/index.js",
|
||||
@@ -9,7 +10,11 @@
|
||||
"license": "MIT",
|
||||
"scripts": {
|
||||
"build": "./build-images.sh",
|
||||
"test": "./test-images.sh"
|
||||
"test": "tstest test/ --verbose"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@git.zone/tstest": "^1.0.90",
|
||||
"@git.zone/tsrun": "^1.3.3"
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
|
||||
8414
pnpm-lock.yaml
generated
Normal file
8414
pnpm-lock.yaml
generated
Normal file
File diff suppressed because it is too large
Load Diff
129
recipes/document.md
Normal file
129
recipes/document.md
Normal file
@@ -0,0 +1,129 @@
|
||||
# Bank Statement Parsing with MiniCPM-V 4.5
|
||||
|
||||
Recipe for extracting transactions from bank statement PDFs using vision-language AI.
|
||||
|
||||
## Model
|
||||
|
||||
- **Model**: MiniCPM-V 4.5 (8B parameters)
|
||||
- **Ollama Name**: `openbmb/minicpm-v4.5:q8_0`
|
||||
- **Quantization**: Q8_0 (9.8GB VRAM)
|
||||
- **Runtime**: Ollama on GPU
|
||||
|
||||
## Image Conversion
|
||||
|
||||
Convert PDF to PNG at 300 DPI for optimal OCR accuracy.
|
||||
|
||||
```bash
|
||||
convert -density 300 -quality 100 input.pdf \
|
||||
-background white -alpha remove \
|
||||
output-%d.png
|
||||
```
|
||||
|
||||
**Parameters:**
|
||||
- `-density 300`: 300 DPI resolution (critical for accuracy)
|
||||
- `-quality 100`: Maximum quality
|
||||
- `-background white -alpha remove`: Remove transparency
|
||||
- `output-%d.png`: Writes one zero-indexed file per page (output-0.png, output-1.png, etc.)
|
||||
|
||||
**Dependencies:**
|
||||
```bash
|
||||
apt-get install imagemagick
|
||||
```
|
||||
|
||||
## Prompt
|
||||
|
||||
```
|
||||
You are a bank statement parser. Extract EVERY transaction from the table.
|
||||
|
||||
Read the Amount column carefully:
|
||||
- "- 21,47 €" means DEBIT, output as: -21.47
|
||||
- "+ 1.000,00 €" means CREDIT, output as: 1000.00
|
||||
- European format: comma = decimal point
|
||||
|
||||
For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
|
||||
|
||||
Do not skip any rows. Return complete JSON array:
|
||||
```
|
||||
|
||||
## API Call
|
||||
|
||||
```python
|
||||
import base64
|
||||
import requests
|
||||
|
||||
# Load images
|
||||
with open('page-0.png', 'rb') as f:
|
||||
page0 = base64.b64encode(f.read()).decode('utf-8')
|
||||
with open('page-1.png', 'rb') as f:
|
||||
page1 = base64.b64encode(f.read()).decode('utf-8')
|
||||
|
||||
payload = {
|
||||
"model": "openbmb/minicpm-v4.5:q8_0",
|
||||
"prompt": prompt,
|
||||
"images": [page0, page1], # Multiple pages supported
|
||||
"stream": False,
|
||||
"options": {
|
||||
"num_predict": 16384,
|
||||
"temperature": 0.1
|
||||
}
|
||||
}
|
||||
|
||||
response = requests.post(
|
||||
'http://localhost:11434/api/generate',
|
||||
json=payload,
|
||||
timeout=600
|
||||
)
|
||||
|
||||
result = response.json()['response']
|
||||
```
|
||||
|
||||
## Output Format
|
||||
|
||||
```json
|
||||
[
|
||||
{"date":"2022-04-01","counterparty":"DIGITALOCEAN.COM","amount":-21.47},
|
||||
{"date":"2022-04-01","counterparty":"DIGITALOCEAN.COM","amount":-58.06},
|
||||
{"date":"2022-04-12","counterparty":"LOSSLESS GMBH","amount":1000.00}
|
||||
]
|
||||
```
|
||||
|
||||
## Running the Container
|
||||
|
||||
**GPU (recommended):**
|
||||
```bash
|
||||
docker run -d --gpus all -p 11434:11434 \
|
||||
-v ollama-data:/root/.ollama \
|
||||
-e MODEL_NAME="openbmb/minicpm-v4.5:q8_0" \
|
||||
ht-docker-ai:minicpm45v
|
||||
```
|
||||
|
||||
**CPU (slower):**
|
||||
```bash
|
||||
docker run -d -p 11434:11434 \
|
||||
-v ollama-data:/root/.ollama \
|
||||
-e MODEL_NAME="openbmb/minicpm-v4.5:q4_0" \
|
||||
ht-docker-ai:minicpm45v-cpu
|
||||
```
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
| Quantization | VRAM/RAM | Speed |
|
||||
|--------------|----------|-------|
|
||||
| Q8_0 (GPU) | 10GB | Fast |
|
||||
| Q4_0 (CPU) | 8GB | Slow |
|
||||
|
||||
## Test Results
|
||||
|
||||
| Statement | Pages | Transactions | Accuracy |
|
||||
|-----------|-------|--------------|----------|
|
||||
| bunq-2022-04 | 2 | 26 | 100% |
|
||||
| bunq-2021-06 | 3 | 28 | 100% |
|
||||
|
||||
## Tips
|
||||
|
||||
1. **DPI matters**: 150 DPI causes missed rows; 300 DPI is optimal
|
||||
2. **PNG over JPEG**: PNG preserves text clarity better
|
||||
3. **Remove alpha**: Some models struggle with transparency
|
||||
4. **Multi-page**: Pass all pages in a single request so the model has the full context
|
||||
5. **Temperature 0.1**: Low temperature for consistent output
|
||||
6. **European format**: Explicitly explain comma=decimal in prompt
|
||||
253
test/test.node.ts
Normal file
253
test/test.node.ts
Normal file
@@ -0,0 +1,253 @@
|
||||
import { tap, expect } from '@git.zone/tstest/tapbundle';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import { execSync } from 'child_process';
|
||||
import * as os from 'os';
|
||||
|
||||
/** Base URL of the Ollama HTTP API that the tests talk to. */
const OLLAMA_URL = 'http://localhost:11434';

/** Model tag sent with every generate request. */
const MODEL = 'openbmb/minicpm-v4.5:q8_0';

/**
 * Instruction prompt sent together with the statement page images.
 * It asks for one JSON object per table row and spells out how the
 * European amount notation maps to signed decimal numbers.
 */
const BANK_STATEMENT_PROMPT = [
  'You are a bank statement parser. Extract EVERY transaction from the table.',
  '',
  'Read the Amount column carefully:',
  '- "- 21,47 €" means DEBIT, output as: -21.47',
  '- "+ 1.000,00 €" means CREDIT, output as: 1000.00',
  '- European format: comma = decimal point',
  '',
  'For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}',
  '',
  'Do not skip any rows. Return complete JSON array:',
].join('\n');
|
||||
|
||||
/**
 * One bank-statement row, in the shape the prompt asks the model to emit
 * and the expected-result JSON fixtures are parsed into.
 */
interface ITransaction {
  /** Booking date; the prompt requests the form YYYY-MM-DD. */
  date: string;
  /** Name of the other party of the transaction. */
  counterparty: string;
  /** Signed amount; the prompt requests negative values for debits. */
  amount: number;
}
|
||||
|
||||
/**
 * Convert a PDF into base64-encoded PNG page images using ImageMagick.
 *
 * Every page is rendered with `convert` at density 300 onto a white
 * background into a freshly created temp directory. The page files are read
 * back in page order and the temp directory is removed afterwards, also when
 * the conversion throws.
 *
 * @param pdfPath Path of the PDF file to convert.
 * @returns One base64 string per rendered page, ordered by page number.
 * @throws Whatever `execSync` throws when the `convert` command fails, or the
 *   `fs` error when a rendered page cannot be read.
 */
function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');

  // Numeric page index taken from "page-<n>.png"; names without digits sort first.
  const pageIndex = (file: string): number => Number(/(\d+)\.png$/.exec(file)?.[1] ?? 0);

  try {
    execSync(
      `convert -density 300 -quality 100 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
      { stdio: 'pipe' }
    );

    return (
      fs
        .readdirSync(tempDir)
        .filter((f) => f.endsWith('.png'))
        // A plain lexicographic sort would place page-10.png before page-2.png
        // and hand the pages to the model out of order.
        .sort((a, b) => pageIndex(a) - pageIndex(b))
        .map((file) => fs.readFileSync(path.join(tempDir, file)).toString('base64'))
    );
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}
|
||||
|
||||
/**
 * Extract transactions from page images using the Ollama generate endpoint
 * with streaming enabled.
 *
 * Posts the bank-statement prompt and the images to
 * `${OLLAMA_URL}/api/generate`, reads the newline-delimited JSON stream,
 * echoes the generated text to the console line by line and finally parses
 * the span from the first `[` to the last `]` of the generated text as JSON.
 *
 * @param images Base64-encoded page images, in page order.
 * @returns The parsed JSON array from the model output (not schema-validated).
 * @throws Error when the HTTP status is not ok, when the response has no body,
 *   when no `[...]` span is found, or when that span is not valid JSON.
 */
async function extractTransactionsStreaming(images: string[]): Promise<ITransaction[]> {
  const payload = {
    model: MODEL,
    prompt: BANK_STATEMENT_PROMPT,
    images,
    stream: true,
    options: {
      num_predict: 16384,
      temperature: 0.1,
    },
  };

  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }

  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('No response body');
  }

  const decoder = new TextDecoder();
  let fullText = '';
  let lineBuffer = '';
  // Raw stream text received so far that is not yet terminated by a newline.
  let pending = '';

  // Handle one complete NDJSON line of the stream.
  const consumeLine = (line: string): void => {
    if (!line.trim()) return;

    let json: { response?: string };
    try {
      json = JSON.parse(line);
    } catch {
      // Skip invalid JSON lines
      return;
    }
    if (!json.response) return;

    fullText += json.response;
    lineBuffer += json.response;

    // Print complete lines (buffer until newline for cleaner display)
    if (lineBuffer.includes('\n')) {
      const parts = lineBuffer.split('\n');
      lineBuffer = parts.pop() ?? '';
      for (const part of parts) {
        console.log(part);
      }
    }
  };

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    // Network chunks do not respect line boundaries: keep the unterminated
    // tail for the next read instead of parsing (and losing) half a JSON line.
    pending += decoder.decode(value, { stream: true });
    const lines = pending.split('\n');
    pending = lines.pop() ?? '';
    for (const line of lines) {
      consumeLine(line);
    }
  }

  // Flush bytes still held by the decoder and a last line without trailing newline.
  pending += decoder.decode();
  consumeLine(pending);

  // Print any remaining buffer
  if (lineBuffer) {
    console.log(lineBuffer);
  }
  console.log('');

  // Parse JSON from response
  const startIdx = fullText.indexOf('[');
  const endIdx = fullText.lastIndexOf(']') + 1;

  if (startIdx < 0 || endIdx <= startIdx) {
    throw new Error('No JSON array found in response');
  }

  return JSON.parse(fullText.substring(startIdx, endIdx));
}
|
||||
|
||||
/**
 * Compare extracted transactions against the expected ones, position by position.
 *
 * A pair matches when the dates are identical and the amounts are equal once
 * rounded to whole cents. The counterparty is only used in error messages and
 * is not compared.
 *
 * @param extracted Transactions produced by the model, in output order.
 * @param expected Reference transactions, in statement order.
 * @returns `matches` (number of matching positions), `total` (number of
 *   expected transactions) and `errors` (one message per missing or
 *   mismatching position, plus one entry when there are surplus extractions).
 */
function compareTransactions(
  extracted: ITransaction[],
  expected: ITransaction[]
): { matches: number; total: number; errors: string[] } {
  const errors: string[] = [];
  let matches = 0;

  // Compare money in integer cents. A float tolerance such as
  // Math.abs(a - b) < 0.01 accepts real one-cent errors, e.g.
  // 1000.02 - 1000.01 evaluates to 0.00999...
  const toCents = (amount: number): number => Math.round(Number(amount) * 100);

  for (const [i, exp] of expected.entries()) {
    const ext = extracted[i];

    if (!ext) {
      errors.push(`Missing transaction ${i}: ${exp.date} ${exp.counterparty}`);
      continue;
    }

    const dateMatch = ext.date === exp.date;
    const amountMatch = toCents(ext.amount) === toCents(exp.amount);

    if (dateMatch && amountMatch) {
      matches++;
    } else {
      errors.push(
        `Mismatch at ${i}: expected ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`
      );
    }
  }

  if (extracted.length > expected.length) {
    errors.push(`Extra transactions: ${extracted.length - expected.length}`);
  }

  return { matches, total: expected.length, errors };
}
|
||||
|
||||
/**
 * Find all test cases (PDF + JSON pairs) in the `.nogit/` directory of the
 * current working directory.
 *
 * A PDF only becomes a test case when a JSON file with the same base name
 * sits next to it. Results are ordered by file name so runs are reproducible.
 *
 * @returns One entry per pair with the shared base name and both file paths;
 *   an empty array when `.nogit/` does not exist.
 */
function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const testDir = path.join(process.cwd(), '.nogit');
  if (!fs.existsSync(testDir)) {
    return [];
  }

  const pdfSuffix = '.pdf';
  const files = fs.readdirSync(testDir);
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];

  // readdirSync gives no ordering guarantee, so sort explicitly.
  for (const pdf of files.filter((f) => f.endsWith(pdfSuffix)).sort()) {
    // Strip only the trailing extension; replace('.pdf', '') would remove the
    // first occurrence and mangle names such as "x.pdf.old.pdf".
    const baseName = pdf.slice(0, -pdfSuffix.length);
    const jsonFile = `${baseName}.json`;
    if (files.includes(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }

  return testCases;
}
|
||||
|
||||
// Tests
|
||||
|
||||
// Precondition: the Ollama API is reachable and lists its models.
tap.test('should connect to Ollama API', async () => {
  const response = await fetch(`${OLLAMA_URL}/api/tags`);
  expect(response.ok).toBeTrue();
  const data = await response.json();
  expect(data.models).toBeArray();
});

// Precondition: the exact model tag used by the extraction requests is available.
tap.test('should have MiniCPM-V 4.5 model loaded', async () => {
  const response = await fetch(`${OLLAMA_URL}/api/tags`);
  expect(response.ok).toBeTrue();
  const data = await response.json();
  const modelNames: string[] = data.models.map((m: { name: string }) => m.name);
  // Check the exact tag: another quantization of the same model would also
  // contain 'minicpm-v4.5', but generate requests are sent with MODEL.
  expect(modelNames.includes(MODEL)).toBeTrue();
});
|
||||
|
||||
// Dynamic test for each PDF/JSON pair found in .nogit/
const testCases = findTestCases();
if (testCases.length === 0) {
  // Make it visible that no extraction test was registered.
  console.log('No PDF/JSON test cases found in .nogit/ - skipping extraction tests');
}

for (const testCase of testCases) {
  tap.test(`should extract transactions from ${testCase.name}`, async () => {
    // Load expected transactions
    const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
    console.log(`\n=== ${testCase.name} ===`);
    console.log(`Expected: ${expected.length} transactions`);

    // Convert PDF to images
    console.log('Converting PDF to images...');
    const images = convertPdfToImages(testCase.pdfPath);
    console.log(`Converted: ${images.length} pages`);

    // Extract transactions with streaming output
    console.log('Extracting transactions (streaming)...\n');
    const extracted = await extractTransactionsStreaming(images);
    console.log(`Extracted: ${extracted.length} transactions`);

    // Compare results
    const result = compareTransactions(extracted, expected);
    console.log(`Matches: ${result.matches}/${result.total}`);

    if (result.errors.length > 0) {
      console.log('Errors:');
      result.errors.forEach((e) => console.log(`  - ${e}`));
    }

    // Assert high accuracy. With an empty expected list matches/total would be
    // NaN and fail the comparison; the length check below still catches any
    // surplus extraction in that case.
    const accuracy = result.total === 0 ? 1 : result.matches / result.total;
    expect(accuracy).toBeGreaterThan(0.95);
    expect(extracted.length).toEqual(expected.length);
  });
}

export default tap.start();
|
||||
13
tsconfig.json
Normal file
13
tsconfig.json
Normal file
@@ -0,0 +1,13 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"target": "ES2022",
|
||||
"module": "NodeNext",
|
||||
"moduleResolution": "NodeNext",
|
||||
"esModuleInterop": true,
|
||||
"strict": true,
|
||||
"skipLibCheck": true,
|
||||
"outDir": "./dist_ts",
|
||||
"declaration": true
|
||||
},
|
||||
"include": ["ts/**/*", "test/**/*"]
|
||||
}
|
||||
Reference in New Issue
Block a user