This commit is contained in:
2026-01-16 03:58:39 +00:00
parent 6e464cb7e7
commit 3dc1881d8b
6 changed files with 8815 additions and 1 deletions

View File

@@ -1,6 +1,7 @@
{
"name": "@host.today/ht-docker-ai",
"version": "1.0.0",
"type": "module",
"private": false,
"description": "Docker images for AI vision-language models including MiniCPM-V 4.5",
"main": "dist_ts/index.js",
@@ -9,7 +10,11 @@
"license": "MIT",
"scripts": {
"build": "./build-images.sh",
"test": "./test-images.sh"
"test": "tstest test/ --verbose"
},
"devDependencies": {
"@git.zone/tstest": "^1.0.90",
"@git.zone/tsrun": "^1.3.3"
},
"repository": {
"type": "git",

8414
pnpm-lock.yaml generated Normal file

File diff suppressed because it is too large Load Diff

129
recipes/document.md Normal file
View File

@@ -0,0 +1,129 @@
# Bank Statement Parsing with MiniCPM-V 4.5
Recipe for extracting transactions from bank statement PDFs using vision-language AI.
## Model
- **Model**: MiniCPM-V 4.5 (8B parameters)
- **Ollama Name**: `openbmb/minicpm-v4.5:q8_0`
- **Quantization**: Q8_0 (9.8GB VRAM)
- **Runtime**: Ollama on GPU
## Image Conversion
Convert PDF to PNG at 300 DPI for optimal OCR accuracy.
```bash
convert -density 300 -quality 100 input.pdf \
-background white -alpha remove \
page-%d.png
```
**Parameters:**
- `-density 300`: 300 DPI resolution (critical for accuracy)
- `-quality 100`: Maximum quality
- `-background white -alpha remove`: Remove transparency
- `page-%d.png`: Outputs page-0.png, page-1.png, etc.
**Dependencies:**
```bash
apt-get install imagemagick
```
## Prompt
```
You are a bank statement parser. Extract EVERY transaction from the table.
Read the Amount column carefully:
- "- 21,47 €" means DEBIT, output as: -21.47
- "+ 1.000,00 €" means CREDIT, output as: 1000.00
- European format: comma = decimal point
For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
Do not skip any rows. Return complete JSON array:
```
## API Call
```python
import base64
import requests
# Load images
with open('page-0.png', 'rb') as f:
page0 = base64.b64encode(f.read()).decode('utf-8')
with open('page-1.png', 'rb') as f:
page1 = base64.b64encode(f.read()).decode('utf-8')
payload = {
"model": "openbmb/minicpm-v4.5:q8_0",
"prompt": prompt,  # the prompt text from the "Prompt" section above
"images": [page0, page1], # Multiple pages supported
"stream": False,
"options": {
"num_predict": 16384,
"temperature": 0.1
}
}
response = requests.post(
'http://localhost:11434/api/generate',
json=payload,
timeout=600
)
result = response.json()['response']
```
## Output Format
```json
[
{"date":"2022-04-01","counterparty":"DIGITALOCEAN.COM","amount":-21.47},
{"date":"2022-04-01","counterparty":"DIGITALOCEAN.COM","amount":-58.06},
{"date":"2022-04-12","counterparty":"LOSSLESS GMBH","amount":1000.00}
]
```
## Running the Container
**GPU (recommended):**
```bash
docker run -d --gpus all -p 11434:11434 \
-v ollama-data:/root/.ollama \
-e MODEL_NAME="openbmb/minicpm-v4.5:q8_0" \
ht-docker-ai:minicpm45v
```
**CPU (slower):**
```bash
docker run -d -p 11434:11434 \
-v ollama-data:/root/.ollama \
-e MODEL_NAME="openbmb/minicpm-v4.5:q4_0" \
ht-docker-ai:minicpm45v-cpu
```
## Hardware Requirements
| Quantization | VRAM/RAM | Speed |
|--------------|----------|-------|
| Q8_0 (GPU) | 10GB | Fast |
| Q4_0 (CPU) | 8GB | Slow |
## Test Results
| Statement | Pages | Transactions | Accuracy |
|-----------|-------|--------------|----------|
| bunq-2022-04 | 2 | 26 | 100% |
| bunq-2021-06 | 3 | 28 | 100% |
## Tips
1. **DPI matters**: 150 DPI causes missed rows; 300 DPI is optimal
2. **PNG over JPEG**: PNG preserves text clarity better
3. **Remove alpha**: Some models struggle with transparency
4. **Multi-page**: Pass all pages in a single request for context
5. **Temperature 0.1**: Low temperature for consistent output
6. **European format**: Explicitly explain comma=decimal in prompt

253
test/test.node.ts Normal file
View File

@@ -0,0 +1,253 @@
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execFileSync, execSync } from 'child_process';
import * as os from 'os';
// Base URL of the Ollama HTTP API that every request in this file is sent to.
const OLLAMA_URL = 'http://localhost:11434';
// Model tag passed as `model` in the generate request payload.
const MODEL = 'openbmb/minicpm-v4.5:q8_0';
// Instruction text sent as `prompt` alongside the page images. It asks for one
// JSON object per table row (date, counterparty, signed amount) and explains
// how European-formatted amounts map to signed decimal numbers.
const BANK_STATEMENT_PROMPT = `You are a bank statement parser. Extract EVERY transaction from the table.
Read the Amount column carefully:
- "- 21,47 €" means DEBIT, output as: -21.47
- "+ 1.000,00 €" means CREDIT, output as: 1000.00
- European format: comma = decimal point
For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
Do not skip any rows. Return complete JSON array:`;
/**
 * One bank statement transaction, in the shape the prompt asks the model to
 * emit and the expected-results JSON files are parsed as.
 */
interface ITransaction {
/** Transaction date; the prompt requests the `YYYY-MM-DD` format. */
date: string;
/** Counterparty name as read from the statement row. */
counterparty: string;
/** Signed amount; the prompt asks for debits as negative and credits as positive values. */
amount: number;
}
/**
 * Convert a PDF into PNG page images using ImageMagick's `convert` command.
 *
 * Each page is rendered at 300 DPI with the alpha channel removed onto a white
 * background, written into a fresh temporary directory, read back and
 * base64-encoded. The temporary directory is always removed before returning,
 * including when the conversion fails.
 *
 * @param pdfPath - Path of the PDF file to convert.
 * @returns Base64-encoded PNG data, one entry per page, in numeric page order.
 * @throws If `convert` cannot be run or exits with an error, or if the
 *   generated files cannot be read.
 */
function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');

  // Numeric page index taken from the generated file name ("page-12.png" -> 12).
  // Names without a trailing number sort last.
  const pageIndex = (fileName: string): number => {
    const match = /(\d+)\.png$/.exec(fileName);
    return match ? Number(match[1]) : Number.MAX_SAFE_INTEGER;
  };

  try {
    // Arguments are passed as an array (no shell involved), so a path that
    // contains spaces, quotes, `$` or backticks is handed to ImageMagick verbatim.
    execFileSync(
      'convert',
      [
        '-density', '300',
        '-quality', '100',
        pdfPath,
        '-background', 'white',
        '-alpha', 'remove',
        outputPattern,
      ],
      { stdio: 'pipe' }
    );

    return fs
      .readdirSync(tempDir)
      .filter((fileName) => fileName.endsWith('.png'))
      // Sort by page number: a plain string sort would put "page-10.png"
      // before "page-2.png" and scramble documents with more than ten pages.
      .sort((a, b) => pageIndex(a) - pageIndex(b) || a.localeCompare(b))
      .map((fileName) => fs.readFileSync(path.join(tempDir, fileName)).toString('base64'));
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}
/**
 * Extract transactions from page images using Ollama's streaming generate API.
 *
 * All images are sent together with the bank statement prompt in one request.
 * The generated text is echoed to the console line by line as it arrives, and
 * the outermost JSON array found in the accumulated text is parsed and returned.
 *
 * @param images - Base64-encoded page images.
 * @returns The parsed JSON array from the model output (not schema-validated).
 * @throws If the HTTP status is not OK, the response has no body, no JSON
 *   array is present in the output, or the array text is not valid JSON.
 */
async function extractTransactionsStreaming(images: string[]): Promise<ITransaction[]> {
  const payload = {
    model: MODEL,
    prompt: BANK_STATEMENT_PROMPT,
    images,
    stream: true,
    options: {
      num_predict: 16384,
      temperature: 0.1,
    },
  };

  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });
  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }

  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('No response body');
  }

  const decoder = new TextDecoder();
  let fullText = '';
  let displayBuffer = ''; // generated text not yet printed (waiting for a newline)
  let pendingLine = ''; // trailing, still incomplete NDJSON line of the HTTP stream

  // Handle one complete NDJSON line of the HTTP stream.
  const consumeLine = (line: string): void => {
    if (!line.trim()) return;

    let token: unknown;
    try {
      token = JSON.parse(line).response;
    } catch {
      // Skip invalid JSON lines
      return;
    }
    if (typeof token !== 'string' || token === '') return;

    fullText += token;
    displayBuffer += token;

    // Print complete lines, keep the unfinished tail for later
    if (displayBuffer.includes('\n')) {
      const parts = displayBuffer.split('\n');
      displayBuffer = parts.pop() ?? '';
      for (const part of parts) {
        console.log(part);
      }
    }
  };

  // Network chunks do not align with NDJSON line boundaries: a JSON line may be
  // split across two reads. Only lines terminated by '\n' are parsed; the
  // unterminated tail is carried over and prefixed to the next chunk.
  for (;;) {
    const { done, value } = await reader.read();
    if (done) break;

    pendingLine += decoder.decode(value, { stream: true });
    const lines = pendingLine.split('\n');
    pendingLine = lines.pop() ?? '';
    for (const line of lines) {
      consumeLine(line);
    }
  }

  // Flush bytes still held by the decoder and a final line without trailing newline
  pendingLine += decoder.decode();
  consumeLine(pendingLine);

  // Print any remaining buffer
  if (displayBuffer) {
    console.log(displayBuffer);
  }
  console.log('');

  // Parse JSON from response: everything from the first '[' to the last ']'
  const startIdx = fullText.indexOf('[');
  const endIdx = fullText.lastIndexOf(']') + 1;
  if (startIdx < 0 || endIdx <= startIdx) {
    throw new Error('No JSON array found in response');
  }
  return JSON.parse(fullText.substring(startIdx, endIdx));
}
/**
 * Compare extracted transactions against the expected ones, position by position.
 *
 * An expected entry counts as a match when the extracted entry at the same
 * index has an identical date and an amount that differs by less than 0.01.
 * Counterparty names are not compared.
 *
 * @param extracted - Transactions produced by the model.
 * @param expected - Reference transactions.
 * @returns The number of matches, the number of expected transactions, and a
 *   human-readable message for every missing, mismatching or surplus entry.
 */
function compareTransactions(
  extracted: ITransaction[],
  expected: ITransaction[]
): { matches: number; total: number; errors: string[] } {
  const AMOUNT_TOLERANCE = 0.01;
  const problems: string[] = [];
  let matched = 0;

  for (const [idx, want] of expected.entries()) {
    const got = extracted[idx];

    // Nothing was extracted at this position
    if (!got) {
      problems.push(`Missing transaction ${idx}: ${want.date} ${want.counterparty}`);
      continue;
    }

    const sameDate = got.date === want.date;
    const sameAmount = Math.abs(got.amount - want.amount) < AMOUNT_TOLERANCE;
    if (!(sameDate && sameAmount)) {
      problems.push(
        `Mismatch at ${idx}: expected ${want.date}/${want.amount}, got ${got.date}/${got.amount}`
      );
      continue;
    }

    matched += 1;
  }

  // Entries beyond the expected count are reported once, as a total
  const surplus = extracted.length - expected.length;
  if (surplus > 0) {
    problems.push(`Extra transactions: ${surplus}`);
  }

  return { matches: matched, total: expected.length, errors: problems };
}
/**
 * Find all test cases (PDF + JSON pairs) in `.nogit/` under the current
 * working directory.
 *
 * A test case is a `<name>.pdf` file with a sibling `<name>.json` file in the
 * same directory. PDFs without a matching JSON file are ignored.
 *
 * @returns One entry per pair, sorted by PDF file name, holding the shared
 *   base name and the full paths of both files. Empty when `.nogit/` does not
 *   exist.
 */
function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const PDF_SUFFIX = '.pdf';
  const testDir = path.join(process.cwd(), '.nogit');
  if (!fs.existsSync(testDir)) {
    return [];
  }

  const files = fs.readdirSync(testDir);
  const available = new Set(files);
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];

  // Sorted so tests are registered in the same order on every platform
  const pdfFiles = files.filter((f) => f.endsWith(PDF_SUFFIX)).sort();
  for (const pdf of pdfFiles) {
    // Strip only the trailing extension; a '.pdf' elsewhere in the name
    // (e.g. "a.pdf.old.pdf") must stay part of the base name.
    const baseName = pdf.slice(0, -PDF_SUFFIX.length);
    const jsonFile = `${baseName}.json`;
    if (available.has(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }
  return testCases;
}
// Tests

// The tags endpoint must answer successfully and return a list of models.
tap.test('should connect to Ollama API', async () => {
  const tagsResponse = await fetch(`${OLLAMA_URL}/api/tags`);
  expect(tagsResponse.ok).toBeTrue();

  const tagsBody = await tagsResponse.json();
  expect(tagsBody.models).toBeArray();
});

// At least one listed model name must contain the MiniCPM-V 4.5 identifier.
tap.test('should have MiniCPM-V 4.5 model loaded', async () => {
  const tagsResponse = await fetch(`${OLLAMA_URL}/api/tags`);
  const tagsBody = await tagsResponse.json();

  const listedNames: string[] = tagsBody.models.map((model: { name: string }) => model.name);
  const hasMiniCpm = listedNames.some((listedName: string) => listedName.includes('minicpm-v4.5'));
  expect(hasMiniCpm).toBeTrue();
});
// Dynamic test for each PDF/JSON pair: one tap test is registered per pair
// discovered at module load time.
const testCases = findTestCases();

testCases.forEach((testCase) => {
  tap.test(`should extract transactions from ${testCase.name}`, async () => {
    // Reference transactions come from the JSON file next to the PDF
    const expectedJson = fs.readFileSync(testCase.jsonPath, 'utf-8');
    const expected: ITransaction[] = JSON.parse(expectedJson);
    console.log(`\n=== ${testCase.name} ===`);
    console.log(`Expected: ${expected.length} transactions`);

    // Render the PDF pages as base64 PNG images
    console.log('Converting PDF to images...');
    const pageImages = convertPdfToImages(testCase.pdfPath);
    console.log(`Converted: ${pageImages.length} pages`);

    // Run the model; its output is echoed while it streams
    console.log('Extracting transactions (streaming)...\n');
    const extracted = await extractTransactionsStreaming(pageImages);
    console.log(`Extracted: ${extracted.length} transactions`);

    // Position-by-position comparison against the reference
    const { matches, total, errors } = compareTransactions(extracted, expected);
    console.log(`Matches: ${matches}/${total}`);
    if (errors.length > 0) {
      console.log('Errors:');
      for (const message of errors) {
        console.log(`  - ${message}`);
      }
    }

    // Require more than 95% matching rows and the exact transaction count
    const accuracy = matches / total;
    expect(accuracy).toBeGreaterThan(0.95);
    expect(extracted.length).toEqual(expected.length);
  });
});

export default tap.start();

13
tsconfig.json Normal file
View File

@@ -0,0 +1,13 @@
{
"compilerOptions": {
"target": "ES2022",
"module": "NodeNext",
"moduleResolution": "NodeNext",
"esModuleInterop": true,
"strict": true,
"skipLibCheck": true,
"outDir": "./dist_ts",
"declaration": true
},
"include": ["ts/**/*", "test/**/*"]
}