// ht-docker-ai/test/test.bankstatements.paddleocr-vl.ts

/**
 * Bank statement extraction test using PaddleOCR-VL Full Pipeline
 *
 * This tests the complete PaddleOCR-VL pipeline for bank statements:
 *   1. PP-DocLayoutV2 for layout detection
 *   2. PaddleOCR-VL for recognition (tables with proper structure)
 *   3. Structured Markdown output with tables
 *   4. MiniCPM extracts transactions from structured tables
 *
 * The structured Markdown has properly formatted tables,
 * making it much easier for MiniCPM to extract transaction data.
 */
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execFileSync, execSync } from 'child_process';
import * as os from 'os';
import { ensurePaddleOcrVlFull, ensureMiniCpm } from './helpers/docker.js';

// Base URL of the PaddleOCR-VL service; page images are POSTed to `${PADDLEOCR_VL_URL}/parse`
const PADDLEOCR_VL_URL = 'http://localhost:8000';
// Base URL of the Ollama server; prompts are POSTed to `${OLLAMA_URL}/api/generate`
const OLLAMA_URL = 'http://localhost:11434';
// Model name sent in the `model` field of every Ollama generate request
const MINICPM_MODEL = 'minicpm-v:latest';

/**
 * A single bank statement transaction, used both for the expected
 * fixtures (JSON files) and for the values returned by the model.
 */
interface ITransaction {
  // Booking date; the extraction prompt asks the model for "YYYY-MM-DD",
  // and matching compares this string for exact equality
  date: string;
  // Name of the other party of the transaction
  counterparty: string;
  // Signed amount; the extraction prompt asks for negative values on debits
  // and positive values on credits
  amount: number;
}

/**
 * Convert a PDF to PNG images using ImageMagick's `convert`.
 *
 * Each page is rendered into a temporary directory (300 dpi, white
 * background, alpha removed), read back and returned as a base64 string.
 * The temporary directory is always removed, even when conversion fails.
 *
 * @param pdfPath Path of the PDF file to render.
 * @returns One base64-encoded PNG per page, in page order.
 * @throws If the `convert` process cannot be started or exits with an error.
 */
function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');

  try {
    // Arguments are passed as an array (no shell), so paths containing
    // quotes, `$` or backticks cannot break or alter the command.
    execFileSync(
      'convert',
      [
        '-density',
        '300',
        '-quality',
        '100',
        pdfPath,
        '-background',
        'white',
        '-alpha',
        'remove',
        outputPattern,
      ],
      { stdio: 'pipe' }
    );

    return (
      fs
        .readdirSync(tempDir)
        .filter((f: string) => f.endsWith('.png'))
        // Numeric-aware ordering: a plain lexicographic sort would place
        // "page-10.png" before "page-2.png" and scramble long statements.
        .sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))
        .map((file) => fs.readFileSync(path.join(tempDir, file)).toString('base64'))
    );
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}

/**
 * Send one page image to the PaddleOCR-VL Full Pipeline and return the
 * structured Markdown it produces.
 *
 * @param imageBase64 Base64-encoded page image.
 * @returns The Markdown from `result.markdown`, or an empty string when absent.
 * @throws On a non-OK HTTP status or when the service reports `success` as falsy.
 */
async function parseDocument(imageBase64: string): Promise<string> {
  const requestBody = JSON.stringify({
    image: imageBase64,
    output_format: 'markdown',
  });

  const response = await fetch(`${PADDLEOCR_VL_URL}/parse`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: requestBody,
  });

  // Transport-level failure: surface the status and the raw body
  if (!response.ok) {
    const errorBody = await response.text();
    throw new Error(`PaddleOCR-VL API error: ${response.status} - ${errorBody}`);
  }

  // Application-level failure reported inside a successful HTTP response
  const payload = await response.json();
  if (!payload.success) {
    throw new Error(`PaddleOCR-VL error: ${payload.error}`);
  }

  const markdown = payload.result?.markdown;
  return markdown ? markdown : '';
}

/**
 * Extract transactions from structured Markdown using MiniCPM.
 *
 * Sends the Markdown inside a fixed prompt to the Ollama generate endpoint
 * with streaming enabled, concatenates the `response` field of every
 * streamed JSON line, and parses the JSON array found in the combined text.
 *
 * @param markdown Structured Markdown of one statement page.
 * @returns The transactions parsed from the model's answer.
 * @throws On a non-OK HTTP status, a missing response body, an answer that
 *   contains no `[...]` span, or a span that is not valid JSON.
 */
async function extractTransactionsFromMarkdown(markdown: string): Promise<ITransaction[]> {
  console.log(`    [Extract] Processing ${markdown.length} chars of Markdown`);

  const prompt = `/nothink
Convert this bank statement to a JSON array of transactions.

Read the Amount values carefully:
- "- 21,47 €" means DEBIT, output as: -21.47
- "+ 1.000,00 €" means CREDIT, output as: 1000.00
- European format: comma = decimal point, dot = thousands

For each transaction output: {"date":"YYYY-MM-DD","counterparty":"NAME","amount":-21.47}

Return ONLY the JSON array, no explanation.

Document:
${markdown}`;

  const payload = {
    model: MINICPM_MODEL,
    prompt,
    stream: true,
    options: {
      num_predict: 16384,
      temperature: 0.1,
    },
  };

  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }

  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('No response body');
  }

  const decoder = new TextDecoder();
  let fullText = '';
  // Tail of the stream that has not yet been terminated by a newline
  let pending = '';

  // Append the `response` text of one complete streamed line, if it has any
  const consumeLine = (line: string): void => {
    if (!line.trim()) return;
    try {
      const json = JSON.parse(line);
      if (json.response) {
        fullText += json.response;
      }
    } catch {
      // Skip invalid JSON lines
    }
  };

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    // Network chunks do not respect line boundaries: only parse lines that
    // are complete and carry the unfinished remainder over to the next read,
    // otherwise a line split across two chunks would be dropped entirely.
    pending += decoder.decode(value, { stream: true });
    const lines = pending.split('\n');
    pending = lines.pop() ?? '';

    for (const line of lines) {
      consumeLine(line);
    }
  }

  // Flush bytes buffered by the decoder and a final line without a newline
  pending += decoder.decode();
  consumeLine(pending);

  // Extract JSON array from response
  const startIdx = fullText.indexOf('[');
  const endIdx = fullText.lastIndexOf(']') + 1;

  if (startIdx < 0 || endIdx <= startIdx) {
    throw new Error(`No JSON array found in response: ${fullText.substring(0, 200)}`);
  }

  const jsonStr = fullText.substring(startIdx, endIdx);
  return JSON.parse(jsonStr);
}

/**
 * Run every page of a statement through the pipeline and collect the
 * transactions of all pages into one list.
 *
 * A failure while extracting transactions from a page is logged and that
 * page is skipped; a failure while parsing a page image is not caught here.
 *
 * @param images Base64-encoded page images, in page order.
 * @returns Transactions of all pages that could be extracted.
 */
async function extractAllTransactions(images: string[]): Promise<ITransaction[]> {
  const collected: ITransaction[] = [];
  const pageCount = images.length;

  for (const [pageIndex, image] of images.entries()) {
    console.log(`  Processing page ${pageIndex + 1}/${pageCount}...`);

    // Step 1: image -> structured Markdown
    const markdown = await parseDocument(image);
    const lineCount = markdown.split('\n').length;
    console.log(`    [Parse] Got ${lineCount} lines of Markdown`);

    // Step 2: Markdown -> transactions (best effort per page)
    try {
      const pageTransactions = await extractTransactionsFromMarkdown(markdown);
      console.log(`    [Extracted] ${pageTransactions.length} transactions`);
      collected.push(...pageTransactions);
    } catch (err) {
      console.log(`    [Error] ${err}`);
    }
  }

  return collected;
}

/**
 * Look up the first expected transaction that corresponds to `tx`.
 *
 * A candidate corresponds when the dates are equal, the amounts differ by
 * less than 0.02, and one lower-cased counterparty contains the first ten
 * characters of the other lower-cased counterparty.
 *
 * @param tx Transaction to look up.
 * @param expectedList Candidates to search, in order.
 * @returns The first corresponding candidate, or `undefined` if none does.
 */
function findMatchingTransaction(
  tx: ITransaction,
  expectedList: ITransaction[]
): ITransaction | undefined {
  for (const candidate of expectedList) {
    const sameDate = candidate.date === tx.date;
    const closeAmount = Math.abs(tx.amount - candidate.amount) < 0.02;

    const txName = tx.counterparty?.toLowerCase();
    const candidateName = candidate.counterparty?.toLowerCase();
    // Prefix containment in either direction tolerates truncated or padded names
    const namesOverlap =
      txName?.includes(candidateName?.slice(0, 10)) ||
      candidateName?.includes(txName?.slice(0, 10));

    if (sameDate && closeAmount && namesOverlap) {
      return candidate;
    }
  }

  return undefined;
}

/**
 * Score extracted transactions against the expected ones.
 *
 * Each extracted transaction claims the first not-yet-claimed expected
 * transaction with the same date and an amount within 0.02; counterparty is
 * not considered. Every expected transaction can be claimed at most once.
 *
 * @param extracted Transactions produced by the pipeline.
 * @param expected Ground-truth transactions.
 * @returns Number of claimed expected transactions, the number of expected
 *   transactions, and the percentage claimed (0 when nothing is expected).
 */
function calculateAccuracy(
  extracted: ITransaction[],
  expected: ITransaction[]
): { matched: number; total: number; accuracy: number } {
  // Indices into `expected` that already have a partner
  const claimed = new Set<number>();

  for (const tx of extracted) {
    const hit = expected.findIndex(
      (candidate, idx) =>
        !claimed.has(idx) &&
        tx.date === candidate.date &&
        Math.abs(tx.amount - candidate.amount) < 0.02
    );

    if (hit !== -1) {
      claimed.add(hit);
    }
  }

  const matched = claimed.size;
  const total = expected.length;
  const accuracy = total > 0 ? (matched / total) * 100 : 0;

  return { matched, total, accuracy };
}

/**
 * Find all test cases (PDF + JSON pairs) in .nogit/bankstatements/
 * below the current working directory.
 *
 * A PDF only becomes a test case when a JSON file with the same base name
 * exists next to it.
 *
 * @returns Test cases sorted by name; empty when the directory is missing.
 */
function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const testDir = path.join(process.cwd(), '.nogit/bankstatements');
  if (!fs.existsSync(testDir)) {
    return [];
  }

  const pdfSuffix = '.pdf';
  const files = fs.readdirSync(testDir);
  // Set lookup instead of scanning the whole listing once per PDF
  const available = new Set(files);
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];

  for (const pdf of files) {
    if (!pdf.endsWith(pdfSuffix)) continue;

    // Strip only the trailing extension; replacing the first ".pdf" would
    // mangle names such as "a.pdf.scan.pdf" and lose their JSON pair.
    const baseName = pdf.slice(0, -pdfSuffix.length);
    const jsonFile = `${baseName}.json`;
    if (available.has(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }

  testCases.sort((a, b) => a.name.localeCompare(b.name));
  return testCases;
}

// Tests

tap.test('setup: ensure Docker containers are running', async () => {
  console.log('\n[Setup] Checking Docker containers...\n');

  // The PaddleOCR-VL Full Pipeline must be up before any page can be parsed
  expect(await ensurePaddleOcrVlFull()).toBeTrue();

  // MiniCPM must be up as well; it extracts the fields from the Markdown
  expect(await ensureMiniCpm()).toBeTrue();

  console.log('\n[Setup] All containers ready!\n');
});

// Register one test per PDF/JSON pair found on disk
const testCases = findTestCases();
console.log(`\nFound ${testCases.length} bank statement test cases (PaddleOCR-VL Full Pipeline)\n`);

// Per-statement scores; filled while the tests run and read by the summary test
const results: Array<{ name: string; accuracy: number; matched: number; total: number }> = [];

for (const { name, pdfPath, jsonPath } of testCases) {
  tap.test(`should extract bank statement: ${name}`, async () => {
    // Ground truth for this statement
    const expected: ITransaction[] = JSON.parse(fs.readFileSync(jsonPath, 'utf-8'));
    console.log(`\n=== ${name} ===`);
    console.log(`Expected: ${expected.length} transactions`);

    // Time the whole pipeline: rendering, parsing and extraction
    const startedAt = Date.now();

    const images = convertPdfToImages(pdfPath);
    console.log(`  Pages: ${images.length}`);

    const extracted = await extractAllTransactions(images);

    const elapsedMs = Date.now() - startedAt;

    // Score the run and remember it for the summary
    const { matched, total, accuracy } = calculateAccuracy(extracted, expected);
    results.push({ name, accuracy, matched, total });

    console.log(`  Extracted: ${extracted.length} transactions`);
    console.log(`  Matched: ${matched}/${total} (${accuracy.toFixed(1)}%)`);
    console.log(`  Time: ${(elapsedMs / 1000).toFixed(1)}s`);

    // The statement passes only when strictly more than 50% were matched
    expect(accuracy).toBeGreaterThan(50);
  });
}

tap.test('summary', async () => {
  // Fold all per-statement results in a single pass
  let accuracySum = 0;
  let totalMatched = 0;
  let totalExpected = 0;
  for (const entry of results) {
    accuracySum += entry.accuracy;
    totalMatched += entry.matched;
    totalExpected += entry.total;
  }

  const totalStatements = results.length;
  // Mean of the per-statement percentages; 0 when no statement was run
  const avgAccuracy = totalStatements > 0 ? accuracySum / totalStatements : 0;

  const report = [
    `\n======================================================`,
    `  Bank Statement Extraction Summary (PaddleOCR-VL Full)`,
    `======================================================`,
    `  Method:      PaddleOCR-VL Full Pipeline -> MiniCPM`,
    `  Statements:  ${totalStatements}`,
    `  Transactions: ${totalMatched}/${totalExpected} matched`,
    `  Avg accuracy: ${avgAccuracy.toFixed(1)}%`,
    `======================================================\n`,
  ];
  for (const line of report) {
    console.log(line);
  }
});

export default tap.start();