update

package.json
@@ -1,6 +1,7 @@
 {
   "name": "@host.today/ht-docker-ai",
   "version": "1.0.0",
+  "type": "module",
   "private": false,
   "description": "Docker images for AI vision-language models including MiniCPM-V 4.5",
   "main": "dist_ts/index.js",
@@ -9,7 +10,11 @@
   "license": "MIT",
   "scripts": {
     "build": "./build-images.sh",
-    "test": "./test-images.sh"
+    "test": "tstest test/ --verbose"
   },
+  "devDependencies": {
+    "@git.zone/tstest": "^1.0.90",
+    "@git.zone/tsrun": "^1.3.3"
+  },
   "repository": {
     "type": "git",

pnpm-lock.yaml — generated, new file (8414 lines). File diff suppressed because it is too large.

recipes/document.md — new file (129 lines)
@@ -0,0 +1,129 @@
# Bank Statement Parsing with MiniCPM-V 4.5

Recipe for extracting transactions from bank statement PDFs using vision-language AI.

## Model

- **Model**: MiniCPM-V 4.5 (8B parameters)
- **Ollama Name**: `openbmb/minicpm-v4.5:q8_0`
- **Quantization**: Q8_0 (9.8GB VRAM)
- **Runtime**: Ollama on GPU
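
If the container starts without the weights already present, they can be pulled through the Ollama HTTP API before the first request. A minimal sketch, assuming the standard Ollama endpoint on port 11434; the `model`/`stream` fields follow the public Ollama pull API, so adjust if your Ollama version expects `name` instead:

```python
import requests

# Ask the Ollama server to download the MiniCPM-V 4.5 weights (a no-op if already present).
resp = requests.post(
    'http://localhost:11434/api/pull',
    json={'model': 'openbmb/minicpm-v4.5:q8_0', 'stream': False},
    timeout=3600,  # the ~10 GB download can take a while
)
resp.raise_for_status()
print(resp.json().get('status'))  # expected: "success"
```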

## Image Conversion

Convert the PDF to PNG at 300 DPI for optimal OCR accuracy.

```bash
convert -density 300 -quality 100 input.pdf \
  -background white -alpha remove \
  page-%d.png
```

**Parameters:**
- `-density 300`: 300 DPI resolution (critical for accuracy)
- `-quality 100`: Maximum quality
- `-background white -alpha remove`: Remove transparency
- `page-%d.png`: Outputs page-0.png, page-1.png, etc.

**Dependencies:**
```bash
apt-get install imagemagick
```
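
The same conversion can be scripted when statements arrive in batches. A minimal Python sketch, assuming ImageMagick's `convert` is on the PATH; the `pdf_to_images` helper name is illustrative, not part of the repository:

```python
import base64
import subprocess
import tempfile
from pathlib import Path

def pdf_to_images(pdf_path: str) -> list[str]:
    """Convert a PDF to 300 DPI PNGs and return one base64 string per page."""
    with tempfile.TemporaryDirectory() as tmp:
        subprocess.run(
            ['convert', '-density', '300', '-quality', '100', pdf_path,
             '-background', 'white', '-alpha', 'remove', f'{tmp}/page-%d.png'],
            check=True,
        )
        # Sort numerically so page-10 does not come before page-2.
        pages = sorted(Path(tmp).glob('page-*.png'),
                       key=lambda p: int(p.stem.split('-')[1]))
        return [base64.b64encode(p.read_bytes()).decode('utf-8') for p in pages]
```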

## Prompt

```
You are a bank statement parser. Extract EVERY transaction from the table.

Read the Amount column carefully:
- "- 21,47 €" means DEBIT, output as: -21.47
- "+ 1.000,00 €" means CREDIT, output as: 1000.00
- European format: comma = decimal point

For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}

Do not skip any rows. Return complete JSON array:
```

## API Call

```python
import base64
import requests

# Load images
with open('page-0.png', 'rb') as f:
    page0 = base64.b64encode(f.read()).decode('utf-8')
with open('page-1.png', 'rb') as f:
    page1 = base64.b64encode(f.read()).decode('utf-8')

payload = {
    "model": "openbmb/minicpm-v4.5:q8_0",
    "prompt": prompt,  # the parser prompt from the "Prompt" section above
    "images": [page0, page1],  # Multiple pages supported
    "stream": False,
    "options": {
        "num_predict": 16384,
        "temperature": 0.1
    }
}

response = requests.post(
    'http://localhost:11434/api/generate',
    json=payload,
    timeout=600
)

result = response.json()['response']
```
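
The model occasionally wraps the array in extra prose, so the response text should be reduced to the bracket-delimited JSON before parsing — the test suite below does exactly this. A minimal sketch continuing from `result`:

```python
import json

# Keep only the span from the first '[' to the last ']'; anything around it is model chatter.
start = result.index('[')
end = result.rindex(']') + 1
transactions = json.loads(result[start:end])

for tx in transactions:
    print(tx['date'], tx['counterparty'], tx['amount'])
```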

## Output Format

```json
[
  {"date":"2022-04-01","counterparty":"DIGITALOCEAN.COM","amount":-21.47},
  {"date":"2022-04-01","counterparty":"DIGITALOCEAN.COM","amount":-58.06},
  {"date":"2022-04-12","counterparty":"LOSSLESS GMBH","amount":1000.00}
]
```

## Running the Container

**GPU (recommended):**
```bash
docker run -d --gpus all -p 11434:11434 \
  -v ollama-data:/root/.ollama \
  -e MODEL_NAME="openbmb/minicpm-v4.5:q8_0" \
  ht-docker-ai:minicpm45v
```

**CPU (slower):**
```bash
docker run -d -p 11434:11434 \
  -v ollama-data:/root/.ollama \
  -e MODEL_NAME="openbmb/minicpm-v4.5:q4_0" \
  ht-docker-ai:minicpm45v-cpu
```
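
Before sending statements, confirm the container is up and the model tag is available — the test suite performs the same check against `/api/tags`. A minimal sketch:

```python
import requests

# List the models the Ollama server currently has available.
resp = requests.get('http://localhost:11434/api/tags', timeout=10)
resp.raise_for_status()
names = [m['name'] for m in resp.json()['models']]

if not any('minicpm-v4.5' in n for n in names):
    raise SystemExit(f'MiniCPM-V 4.5 not loaded yet; available models: {names}')
print('Model ready:', [n for n in names if 'minicpm-v4.5' in n])
```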

## Hardware Requirements

| Quantization | VRAM/RAM | Speed |
|--------------|----------|-------|
| Q8_0 (GPU)   | 10GB     | Fast  |
| Q4_0 (CPU)   | 8GB      | Slow  |

## Test Results

| Statement    | Pages | Transactions | Accuracy |
|--------------|-------|--------------|----------|
| bunq-2022-04 | 2     | 26           | 100%     |
| bunq-2021-06 | 3     | 28           | 100%     |

## Tips

1. **DPI matters**: 150 DPI causes missed rows; 300 DPI is optimal
2. **PNG over JPEG**: PNG preserves text clarity better
3. **Remove alpha**: Some models struggle with transparency
4. **Multi-page**: Pass all pages in a single request for context
5. **Temperature 0.1**: Low temperature gives consistent output
6. **European format**: Explicitly explain comma=decimal in the prompt (see the sketch after this list)
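
If the model ever echoes amounts in the raw European format instead of plain numbers, a small normalizer makes the output comparable to the expected values. A minimal sketch; the `parse_eu_amount` helper is illustrative, not part of the repository:

```python
def parse_eu_amount(raw: str) -> float:
    """Turn strings like '- 21,47 €' or '+ 1.000,00 €' into -21.47 / 1000.00."""
    cleaned = raw.replace('€', '').replace(' ', '')
    cleaned = cleaned.replace('.', '').replace(',', '.')  # drop thousands dots, comma becomes decimal point
    return float(cleaned)

assert parse_eu_amount('- 21,47 €') == -21.47
assert parse_eu_amount('+ 1.000,00 €') == 1000.00
```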

test/test.node.ts — new file (253 lines)
@@ -0,0 +1,253 @@
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execSync } from 'child_process';
import * as os from 'os';

const OLLAMA_URL = 'http://localhost:11434';
const MODEL = 'openbmb/minicpm-v4.5:q8_0';

const BANK_STATEMENT_PROMPT = `You are a bank statement parser. Extract EVERY transaction from the table.

Read the Amount column carefully:
- "- 21,47 €" means DEBIT, output as: -21.47
- "+ 1.000,00 €" means CREDIT, output as: 1000.00
- European format: comma = decimal point

For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}

Do not skip any rows. Return complete JSON array:`;

interface ITransaction {
  date: string;
  counterparty: string;
  amount: number;
}

/**
 * Convert PDF to PNG images using ImageMagick
 */
function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');

  try {
    execSync(
      `convert -density 300 -quality 100 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
      { stdio: 'pipe' }
    );

    const files = fs.readdirSync(tempDir).filter((f) => f.endsWith('.png')).sort();
    const images: string[] = [];

    for (const file of files) {
      const imagePath = path.join(tempDir, file);
      const imageData = fs.readFileSync(imagePath);
      images.push(imageData.toString('base64'));
    }

    return images;
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}

/**
 * Extract transactions from images using Ollama with streaming
 */
async function extractTransactionsStreaming(images: string[]): Promise<ITransaction[]> {
  const payload = {
    model: MODEL,
    prompt: BANK_STATEMENT_PROMPT,
    images,
    stream: true,
    options: {
      num_predict: 16384,
      temperature: 0.1,
    },
  };

  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }

  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('No response body');
  }

  const decoder = new TextDecoder();
  let fullText = '';
  let lineBuffer = '';

  // Stream and print output (buffer until newline for cleaner display)
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    const chunk = decoder.decode(value, { stream: true });
    const lines = chunk.split('\n').filter((l) => l.trim());

    for (const line of lines) {
      try {
        const json = JSON.parse(line);
        if (json.response) {
          fullText += json.response;
          lineBuffer += json.response;

          // Print complete lines
          if (lineBuffer.includes('\n')) {
            const parts = lineBuffer.split('\n');
            for (let i = 0; i < parts.length - 1; i++) {
              console.log(parts[i]);
            }
            lineBuffer = parts[parts.length - 1];
          }
        }
      } catch {
        // Skip invalid JSON lines
      }
    }
  }

  // Print any remaining buffer
  if (lineBuffer) {
    console.log(lineBuffer);
  }
  console.log('');

  // Parse JSON from response
  const startIdx = fullText.indexOf('[');
  const endIdx = fullText.lastIndexOf(']') + 1;

  if (startIdx < 0 || endIdx <= startIdx) {
    throw new Error('No JSON array found in response');
  }

  return JSON.parse(fullText.substring(startIdx, endIdx));
}

/**
 * Compare extracted transactions against expected
 */
function compareTransactions(
  extracted: ITransaction[],
  expected: ITransaction[]
): { matches: number; total: number; errors: string[] } {
  const errors: string[] = [];
  let matches = 0;

  for (let i = 0; i < expected.length; i++) {
    const exp = expected[i];
    const ext = extracted[i];

    if (!ext) {
      errors.push(`Missing transaction ${i}: ${exp.date} ${exp.counterparty}`);
      continue;
    }

    const dateMatch = ext.date === exp.date;
    const amountMatch = Math.abs(ext.amount - exp.amount) < 0.01;

    if (dateMatch && amountMatch) {
      matches++;
    } else {
      errors.push(
        `Mismatch at ${i}: expected ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`
      );
    }
  }

  if (extracted.length > expected.length) {
    errors.push(`Extra transactions: ${extracted.length - expected.length}`);
  }

  return { matches, total: expected.length, errors };
}

/**
 * Find all test cases (PDF + JSON pairs) in .nogit/
 */
function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const testDir = path.join(process.cwd(), '.nogit');
  if (!fs.existsSync(testDir)) {
    return [];
  }

  const files = fs.readdirSync(testDir);
  const pdfFiles = files.filter((f) => f.endsWith('.pdf'));
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];

  for (const pdf of pdfFiles) {
    const baseName = pdf.replace('.pdf', '');
    const jsonFile = `${baseName}.json`;
    if (files.includes(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }

  return testCases;
}

// Tests

tap.test('should connect to Ollama API', async () => {
  const response = await fetch(`${OLLAMA_URL}/api/tags`);
  expect(response.ok).toBeTrue();
  const data = await response.json();
  expect(data.models).toBeArray();
});

tap.test('should have MiniCPM-V 4.5 model loaded', async () => {
  const response = await fetch(`${OLLAMA_URL}/api/tags`);
  const data = await response.json();
  const modelNames = data.models.map((m: { name: string }) => m.name);
  expect(modelNames.some((name: string) => name.includes('minicpm-v4.5'))).toBeTrue();
});

// Dynamic test for each PDF/JSON pair
const testCases = findTestCases();
for (const testCase of testCases) {
  tap.test(`should extract transactions from ${testCase.name}`, async () => {
    // Load expected transactions
    const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
    console.log(`\n=== ${testCase.name} ===`);
    console.log(`Expected: ${expected.length} transactions`);

    // Convert PDF to images
    console.log('Converting PDF to images...');
    const images = convertPdfToImages(testCase.pdfPath);
    console.log(`Converted: ${images.length} pages`);

    // Extract transactions with streaming output
    console.log('Extracting transactions (streaming)...\n');
    const extracted = await extractTransactionsStreaming(images);
    console.log(`Extracted: ${extracted.length} transactions`);

    // Compare results
    const result = compareTransactions(extracted, expected);
    console.log(`Matches: ${result.matches}/${result.total}`);

    if (result.errors.length > 0) {
      console.log('Errors:');
      result.errors.forEach((e) => console.log(` - ${e}`));
    }

    // Assert high accuracy
    const accuracy = result.matches / result.total;
    expect(accuracy).toBeGreaterThan(0.95);
    expect(extracted.length).toEqual(expected.length);
  });
}

export default tap.start();

tsconfig.json — new file (13 lines)
@@ -0,0 +1,13 @@
{
  "compilerOptions": {
    "target": "ES2022",
    "module": "NodeNext",
    "moduleResolution": "NodeNext",
    "esModuleInterop": true,
    "strict": true,
    "skipLibCheck": true,
    "outDir": "./dist_ts",
    "declaration": true
  },
  "include": ["ts/**/*", "test/**/*"]
}