update

2026-01-16 03:58:39 +00:00
parent 6e464cb7e7
commit 3dc1881d8b
6 changed files with 8815 additions and 1 deletions
--- a/assets/enable-nvidia-host.sh
+++ b/assets/enable-nvidia-host.sh
--- a/package.json
+++ b/package.json
@@ -1,6 +1,7 @@
 {
  "name": "@host.today/ht-docker-ai",
  "version": "1.0.0",
+  "type": "module",
  "private": false,
  "description": "Docker images for AI vision-language models including MiniCPM-V 4.5",
  "main": "dist_ts/index.js",
@@ -9,7 +10,11 @@
  "license": "MIT",
  "scripts": {
    "build": "./build-images.sh",
-    "test": "./test-images.sh"
+    "test": "tstest test/ --verbose"
+  },
+  "devDependencies": {
+    "@git.zone/tstest": "^1.0.90",
+    "@git.zone/tsrun": "^1.3.3"
  },
  "repository": {
    "type": "git",
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
--- a/recipes/document.md
+++ b/recipes/document.md
@@ -0,0 +1,129 @@
+# Bank Statement Parsing with MiniCPM-V 4.5
+
+Recipe for extracting transactions from bank statement PDFs using vision-language AI.
+
+## Model
+
+- **Model**: MiniCPM-V 4.5 (8B parameters)
+- **Ollama Name**: `openbmb/minicpm-v4.5:q8_0`
+- **Quantization**: Q8_0 (9.8GB VRAM)
+- **Runtime**: Ollama on GPU
+
+## Image Conversion
+
+Convert PDF to PNG at 300 DPI for optimal OCR accuracy.
+
+```bash
+convert -density 300 -quality 100 input.pdf \
+  -background white -alpha remove \
+  page-%d.png
+```
+
+**Parameters:**
+- `-density 300`: 300 DPI resolution (critical for accuracy)
+- `-quality 100`: Maximum quality
+- `-background white -alpha remove`: Flatten transparency onto a white background
+- `page-%d.png`: Writes one file per page: page-0.png, page-1.png, etc.
+
+**Dependencies:**
+```bash
+apt-get install imagemagick
+```
+
+## Prompt
+
+```
+You are a bank statement parser. Extract EVERY transaction from the table.
+
+Read the Amount column carefully:
+- "- 21,47 €" means DEBIT, output as: -21.47
+- "+ 1.000,00 €" means CREDIT, output as: 1000.00
+- European format: comma = decimal point
+
+For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
+
+Do not skip any rows. Return complete JSON array:
+```
+
+## API Call
+
+```python
+import base64
+import requests
+
+# Load images
+with open('page-0.png', 'rb') as f:
+    page0 = base64.b64encode(f.read()).decode('utf-8')
+with open('page-1.png', 'rb') as f:
+    page1 = base64.b64encode(f.read()).decode('utf-8')
+
+# `prompt` must hold the text from the "Prompt" section above
+payload = {
+    "model": "openbmb/minicpm-v4.5:q8_0",
+    "prompt": prompt,
+    "images": [page0, page1],  # Multiple pages supported
+    "stream": False,
+    "options": {
+        "num_predict": 16384,
+        "temperature": 0.1
+    }
+}
+
+response = requests.post(
+    'http://localhost:11434/api/generate',
+    json=payload,
+    timeout=600
+)
+
+result = response.json()['response']
+```
+
+## Output Format
+
+```json
+[
+  {"date":"2022-04-01","counterparty":"DIGITALOCEAN.COM","amount":-21.47},
+  {"date":"2022-04-01","counterparty":"DIGITALOCEAN.COM","amount":-58.06},
+  {"date":"2022-04-12","counterparty":"LOSSLESS GMBH","amount":1000.00}
+]
+```
+
+## Running the Container
+
+**GPU (recommended):**
+```bash
+docker run -d --gpus all -p 11434:11434 \
+  -v ollama-data:/root/.ollama \
+  -e MODEL_NAME="openbmb/minicpm-v4.5:q8_0" \
+  ht-docker-ai:minicpm45v
+```
+
+**CPU (slower):**
+```bash
+docker run -d -p 11434:11434 \
+  -v ollama-data:/root/.ollama \
+  -e MODEL_NAME="openbmb/minicpm-v4.5:q4_0" \
+  ht-docker-ai:minicpm45v-cpu
+```
+
+## Hardware Requirements
+
+| Quantization | VRAM/RAM | Speed |
+|--------------|----------|-------|
+| Q8_0 (GPU)   | 10GB     | Fast  |
+| Q4_0 (CPU)   | 8GB      | Slow  |
+
+## Test Results
+
+| Statement | Pages | Transactions | Accuracy |
+|-----------|-------|--------------|----------|
+| bunq-2022-04 | 2 | 26 | 100% |
+| bunq-2021-06 | 3 | 28 | 100% |
+
+## Tips
+
+1. **DPI matters**: 150 DPI causes missed rows; 300 DPI is optimal
+2. **PNG over JPEG**: PNG preserves text clarity better
+3. **Remove alpha**: Some models struggle with transparency
+4. **Multi-page**: Pass all pages in a single request so the model sees the whole statement
+5. **Temperature 0.1**: Low temperature for consistent output
+6. **European format**: Explicitly explain comma=decimal in prompt
--- a/test/test.node.ts
+++ b/test/test.node.ts
@@ -0,0 +1,253 @@
+import { tap, expect } from '@git.zone/tstest/tapbundle';
+import * as fs from 'fs';
+import * as path from 'path';
+import { execFileSync, execSync } from 'child_process';
+import * as os from 'os';
+
+// Base URL of the Ollama HTTP API that every request in this file is sent to.
+const OLLAMA_URL = 'http://localhost:11434';
+// Model tag placed in the `model` field of the generate request.
+const MODEL = 'openbmb/minicpm-v4.5:q8_0';
+
+// Prompt sent with each extraction request. It asks for one JSON object per
+// statement row with `date`, `counterparty` and `amount` keys, and explains
+// how signed, European-formatted amounts are to be converted.
+const BANK_STATEMENT_PROMPT = `You are a bank statement parser. Extract EVERY transaction from the table.
+
+Read the Amount column carefully:
+- "- 21,47 €" means DEBIT, output as: -21.47
+- "+ 1.000,00 €" means CREDIT, output as: 1000.00
+- European format: comma = decimal point
+
+For each row output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}
+
+Do not skip any rows. Return complete JSON array:`;
+
+/**
+ * One bank-statement row, in the shape the extraction prompt asks the model
+ * to emit and the expected-result JSON files are parsed as.
+ */
+interface ITransaction {
+  /** Transaction date; the prompt requests the `YYYY-MM-DD` format. */
+  date: string;
+  /** Counterparty name as read from the statement row. */
+  counterparty: string;
+  /** Signed amount; the prompt maps debits to negative and credits to positive values. */
+  amount: number;
+}
+
+/**
+ * Convert a PDF into PNG page images using ImageMagick's `convert`.
+ *
+ * Pages are rendered at 300 DPI with transparency flattened onto white, into
+ * a freshly created temp directory that is removed again before returning,
+ * whether or not the conversion succeeded.
+ *
+ * @param pdfPath Path of the PDF file to render.
+ * @returns Base64-encoded PNG data, one entry per page, in page order.
+ * @throws If `convert` cannot be started or exits with a non-zero status.
+ */
+function convertPdfToImages(pdfPath: string): string[] {
+  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
+  const outputPattern = path.join(tempDir, 'page-%d.png');
+
+  // Page index encoded in a "page-<n>.png" file name (0 if none is found).
+  const pageNumber = (file: string): number => Number(/(\d+)\.png$/.exec(file)?.[1] ?? 0);
+
+  try {
+    // Arguments are passed as an array, without a shell, so quotes, spaces,
+    // `$` or backticks in pdfPath cannot break or alter the command.
+    execFileSync(
+      'convert',
+      [
+        '-density', '300',
+        '-quality', '100',
+        pdfPath,
+        '-background', 'white',
+        '-alpha', 'remove',
+        outputPattern,
+      ],
+      { stdio: 'pipe' }
+    );
+
+    // Sort by page number: a plain string sort would place page-10.png
+    // before page-2.png and scramble statements with more than ten pages.
+    return fs
+      .readdirSync(tempDir)
+      .filter((f) => f.endsWith('.png'))
+      .sort((a, b) => pageNumber(a) - pageNumber(b))
+      .map((file) => fs.readFileSync(path.join(tempDir, file)).toString('base64'));
+  } finally {
+    fs.rmSync(tempDir, { recursive: true, force: true });
+  }
+}
+
+/**
+ * Extract transactions from page images using Ollama with streaming.
+ *
+ * The response body is read as newline-delimited JSON. The `response` text of
+ * each record is echoed to the console line by line as it arrives and is
+ * accumulated; afterwards the span from the first `[` to the last `]` of the
+ * accumulated text is parsed as the transaction array.
+ *
+ * @param images Base64-encoded page images, sent as the request's `images` field.
+ * @returns The parsed JSON array from the model output (elements are not validated).
+ * @throws If the HTTP status is not OK, the response has no body, the output
+ *         contains no `[...]` span, or that span is not valid JSON.
+ */
+async function extractTransactionsStreaming(images: string[]): Promise<ITransaction[]> {
+  const payload = {
+    model: MODEL,
+    prompt: BANK_STATEMENT_PROMPT,
+    images,
+    stream: true,
+    options: {
+      num_predict: 16384,
+      temperature: 0.1,
+    },
+  };
+
+  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(payload),
+  });
+
+  if (!response.ok) {
+    throw new Error(`Ollama API error: ${response.status}`);
+  }
+
+  const reader = response.body?.getReader();
+  if (!reader) {
+    throw new Error('No response body');
+  }
+
+  const decoder = new TextDecoder();
+  let fullText = '';
+  let lineBuffer = ''; // model output not yet printed (no newline seen so far)
+  let pending = ''; // raw stream text after the last newline, i.e. an incomplete record
+
+  // Handle one complete NDJSON record: accumulate its text and print finished lines.
+  const consumeRecord = (line: string): void => {
+    if (!line.trim()) return;
+
+    let record: unknown;
+    try {
+      record = JSON.parse(line);
+    } catch {
+      // Skip invalid JSON lines
+      return;
+    }
+
+    const text =
+      typeof record === 'object' && record !== null
+        ? (record as { response?: unknown }).response
+        : undefined;
+    if (typeof text !== 'string' || text === '') return;
+
+    fullText += text;
+    lineBuffer += text;
+
+    // Print complete lines, keep the unfinished tail for the next record
+    if (lineBuffer.includes('\n')) {
+      const parts = lineBuffer.split('\n');
+      lineBuffer = parts.pop() ?? '';
+      for (const part of parts) {
+        console.log(part);
+      }
+    }
+  };
+
+  // Stream and print output. Network chunks do not have to end on a record
+  // boundary, so only text up to the last newline is parsed; the remainder is
+  // carried over and completed by the next chunk instead of being dropped.
+  while (true) {
+    const { done, value } = await reader.read();
+    if (done) break;
+
+    pending += decoder.decode(value, { stream: true });
+    const lines = pending.split('\n');
+    pending = lines.pop() ?? '';
+
+    for (const line of lines) {
+      consumeRecord(line);
+    }
+  }
+
+  // Flush bytes still held by the decoder and a final record without trailing newline
+  pending += decoder.decode();
+  consumeRecord(pending);
+
+  // Print any remaining buffer
+  if (lineBuffer) {
+    console.log(lineBuffer);
+  }
+  console.log('');
+
+  // Parse JSON from response
+  const startIdx = fullText.indexOf('[');
+  const endIdx = fullText.lastIndexOf(']') + 1;
+
+  if (startIdx < 0 || endIdx <= startIdx) {
+    throw new Error('No JSON array found in response');
+  }
+
+  const parsed: unknown = JSON.parse(fullText.substring(startIdx, endIdx));
+  if (!Array.isArray(parsed)) {
+    throw new Error('No JSON array found in response');
+  }
+  return parsed as ITransaction[];
+}
+
+/**
+ * Check extracted transactions against the expected list, position by position.
+ *
+ * An entry counts as a match when the dates are identical and the amounts
+ * differ by less than 0.01; the counterparty is not compared.
+ *
+ * @param extracted Transactions returned by the model.
+ * @param expected Reference transactions.
+ * @returns The number of matches, the number of expected transactions, and one
+ *          message per missing or mismatching entry plus one for surplus entries.
+ */
+function compareTransactions(
+  extracted: ITransaction[],
+  expected: ITransaction[]
+): { matches: number; total: number; errors: string[] } {
+  let matches = 0;
+  const errors: string[] = [];
+
+  for (const [idx, want] of expected.entries()) {
+    const got = extracted[idx];
+
+    // Nothing was extracted at this position
+    if (!got) {
+      errors.push(`Missing transaction ${idx}: ${want.date} ${want.counterparty}`);
+      continue;
+    }
+
+    const sameDate = got.date === want.date;
+    const sameAmount = Math.abs(got.amount - want.amount) < 0.01;
+
+    if (!sameDate || !sameAmount) {
+      errors.push(
+        `Mismatch at ${idx}: expected ${want.date}/${want.amount}, got ${got.date}/${got.amount}`
+      );
+      continue;
+    }
+
+    matches += 1;
+  }
+
+  // Report how many entries the model produced beyond the expected count
+  const surplus = extracted.length - expected.length;
+  if (surplus > 0) {
+    errors.push(`Extra transactions: ${surplus}`);
+  }
+
+  return { matches, total: expected.length, errors };
+}
+
+/**
+ * Find all test cases (PDF + JSON pairs) in .nogit/
+ *
+ * A file `<name>.pdf` becomes a test case only when `<name>.json` exists in
+ * the same directory. The directory is resolved against the current working
+ * directory.
+ *
+ * @returns One entry per pair, sorted by file name; an empty array when the
+ *          `.nogit` directory does not exist.
+ */
+function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
+  const testDir = path.join(process.cwd(), '.nogit');
+  if (!fs.existsSync(testDir)) {
+    return [];
+  }
+
+  const pdfExt = '.pdf';
+  const files = fs.readdirSync(testDir);
+  const known = new Set(files);
+  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];
+
+  // Directory listing order is not guaranteed; sort for a stable test order
+  const pdfFiles = files.filter((f) => f.endsWith(pdfExt)).sort();
+
+  for (const pdf of pdfFiles) {
+    // Strip only the trailing extension: replace('.pdf', '') would remove the
+    // first occurrence and mangle names such as "a.pdf.v2.pdf".
+    const baseName = pdf.slice(0, -pdfExt.length);
+    const jsonFile = `${baseName}.json`;
+    if (!known.has(jsonFile)) {
+      continue;
+    }
+
+    testCases.push({
+      name: baseName,
+      pdfPath: path.join(testDir, pdf),
+      jsonPath: path.join(testDir, jsonFile),
+    });
+  }
+
+  return testCases;
+}
+
+// Tests
+
+// Smoke test: the tags endpoint answers and returns a `models` array.
+tap.test('should connect to Ollama API', async () => {
+  const tagsResponse = await fetch(`${OLLAMA_URL}/api/tags`);
+  expect(tagsResponse.ok).toBeTrue();
+
+  const body = await tagsResponse.json();
+  expect(body.models).toBeArray();
+});
+
+// The model list must contain an entry whose name includes "minicpm-v4.5".
+tap.test('should have MiniCPM-V 4.5 model loaded', async () => {
+  const tagsResponse = await fetch(`${OLLAMA_URL}/api/tags`);
+  const body = await tagsResponse.json();
+
+  const installed: string[] = body.models.map((model: { name: string }) => model.name);
+  const found = installed.some((installedName: string) => installedName.includes('minicpm-v4.5'));
+  expect(found).toBeTrue();
+});
+
+// Dynamic test for each PDF/JSON pair
+const testCases = findTestCases();
+if (testCases.length === 0) {
+  // Make it visible that the extraction tests did not run at all
+  console.log('No PDF/JSON test cases found in .nogit/ - skipping extraction tests');
+}
+for (const testCase of testCases) {
+  tap.test(`should extract transactions from ${testCase.name}`, async () => {
+    // Load expected transactions
+    const expected: ITransaction[] = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
+    console.log(`\n=== ${testCase.name} ===`);
+    console.log(`Expected: ${expected.length} transactions`);
+
+    // Convert PDF to images
+    console.log('Converting PDF to images...');
+    const images = convertPdfToImages(testCase.pdfPath);
+    console.log(`Converted: ${images.length} pages`);
+
+    // Extract transactions with streaming output
+    console.log('Extracting transactions (streaming)...\n');
+    const extracted = await extractTransactionsStreaming(images);
+    console.log(`Extracted: ${extracted.length} transactions`);
+
+    // Compare results
+    const result = compareTransactions(extracted, expected);
+    console.log(`Matches: ${result.matches}/${result.total}`);
+
+    if (result.errors.length > 0) {
+      console.log('Errors:');
+      for (const e of result.errors) {
+        console.log(`  - ${e}`);
+      }
+    }
+
+    // Assert high accuracy. With an empty expected list matches/total would be
+    // 0/0 = NaN and fail the threshold even when nothing was (correctly)
+    // extracted; the length check below still rejects surplus rows.
+    const accuracy = result.total === 0 ? 1 : result.matches / result.total;
+    expect(accuracy).toBeGreaterThan(0.95);
+    expect(extracted.length).toEqual(expected.length);
+  });
+}
+
+export default tap.start();
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -0,0 +1,13 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "module": "NodeNext",
+    "moduleResolution": "NodeNext",
+    "esModuleInterop": true,
+    "strict": true,
+    "skipLibCheck": true,
+    "outDir": "./dist_ts",
+    "declaration": true
+  },
+  "include": ["ts/**/*", "test/**/*"]
+}