feat(tests): integrate SmartAi/DualAgentOrchestrator into extraction tests and add JSON self-validation

2026-01-20 01:17:41 +00:00
parent b202e024a4
commit 77d57e80bd
7 changed files with 562 additions and 575 deletions
--- a/test/test.invoices.minicpm.ts
+++ b/test/test.invoices.minicpm.ts
@@ -1,10 +1,10 @@
 /**
- * Invoice extraction test using MiniCPM-V (visual extraction)
+ * Invoice extraction test using MiniCPM-V via smartagent DualAgentOrchestrator
 *
- * Consensus approach:
- * 1. Pass 1: Fast JSON extraction
- * 2. Pass 2: Confirm with thinking enabled
- * 3. If mismatch: repeat until consensus or max attempts
+ * Uses vision-capable orchestrator with JsonValidatorTool for self-validation:
+ * 1. Pass images to the orchestrator
+ * 2. Driver extracts invoice data and validates JSON before completing
+ * 3. If validation fails, driver retries within the same task
 */
 import { tap, expect } from '@git.zone/tstest/tapbundle';
 import * as fs from 'fs';
@@ -12,6 +12,8 @@ import * as path from 'path';
 import { execSync } from 'child_process';
 import * as os from 'os';
 import { ensureMiniCpm } from './helpers/docker.js';
+import { SmartAi } from '@push.rocks/smartai';
+import { DualAgentOrchestrator, JsonValidatorTool } from '@push.rocks/smartagent';

 const OLLAMA_URL = 'http://localhost:11434';
 const MODEL = 'openbmb/minicpm-v4.5:q8_0';
@@ -26,6 +28,10 @@ interface IInvoice {
  total_amount: number;
 }

+// SmartAi instance and orchestrator (initialized in setup)
+let smartAi: SmartAi;
+let orchestrator: DualAgentOrchestrator;
+
 /**
 * Convert PDF to PNG images using ImageMagick
 */
@@ -54,7 +60,9 @@ function convertPdfToImages(pdfPath: string): string[] {
  }
 }

-const JSON_PROMPT = `Extract invoice data from this image. Return ONLY a JSON object with these exact fields:
+const EXTRACTION_PROMPT = `Extract invoice data from the provided image(s).
+
+IMPORTANT: You must output a valid JSON object with these exact fields:
 {
  "invoice_number": "the invoice number (not VAT ID, not customer ID)",
  "invoice_date": "YYYY-MM-DD format",
@@ -64,150 +72,16 @@ const JSON_PROMPT = `Extract invoice data from this image. Return ONLY a JSON ob
  "vat_amount": 0.00,
  "total_amount": 0.00
 }
-Return only the JSON, no explanation.`;

-/**
- * Query MiniCPM-V for JSON output (fast, no thinking) with streaming
- */
-async function queryJsonFast(images: string[]): Promise<string> {
-  const startTime = Date.now();
-  process.stdout.write(`      [Fast] `);
+Before completing, use the json.validate tool to verify your output is valid JSON with all required fields.

-  const response = await fetch(`${OLLAMA_URL}/api/chat`, {
-    method: 'POST',
-    headers: { 'Content-Type': 'application/json' },
-    body: JSON.stringify({
-      model: MODEL,
-      messages: [{
-        role: 'user',
-        content: JSON_PROMPT,
-        images: images,
-      }],
-      stream: true,
-      options: {
-        num_ctx: 32768,
-        num_predict: 1000,
-        temperature: 0.1,
-      },
-    }),
-  });
+<tool_call>
+  <tool>json</tool>
+  <action>validate</action>
+  <params>{"jsonString": "YOUR_JSON_HERE", "requiredFields": ["invoice_number", "invoice_date", "vendor_name", "currency", "net_amount", "vat_amount", "total_amount"]}</params>
+</tool_call>

-  if (!response.ok) {
-    throw new Error(`Ollama API error: ${response.status}`);
-  }
-
-  let content = '';
-  const reader = response.body!.getReader();
-  const decoder = new TextDecoder();
-
-  try {
-    while (true) {
-      const { done, value } = await reader.read();
-      if (done) break;
-
-      const chunk = decoder.decode(value, { stream: true });
-      for (const line of chunk.split('\n').filter(l => l.trim())) {
-        try {
-          const json = JSON.parse(line);
-          const token = json.message?.content || '';
-          if (token) {
-            process.stdout.write(token);
-            content += token;
-          }
-        } catch {
-          // Ignore parse errors for partial chunks
-        }
-      }
-    }
-  } finally {
-    const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
-    process.stdout.write(` (${elapsed}s)\n`);
-  }
-
-  return content.trim();
-}
-
-/**
- * Query MiniCPM-V for JSON output with thinking enabled (slower, more accurate) with streaming
- */
-async function queryJsonWithThinking(images: string[]): Promise<string> {
-  const startTime = Date.now();
-  process.stdout.write(`      [Think] `);
-
-  const response = await fetch(`${OLLAMA_URL}/api/chat`, {
-    method: 'POST',
-    headers: { 'Content-Type': 'application/json' },
-    body: JSON.stringify({
-      model: MODEL,
-      messages: [{
-        role: 'user',
-        content: `Think carefully about this invoice image, then ${JSON_PROMPT}`,
-        images: images,
-      }],
-      stream: true,
-      options: {
-        num_ctx: 32768,
-        num_predict: 2000,
-        temperature: 0.1,
-      },
-    }),
-  });
-
-  if (!response.ok) {
-    throw new Error(`Ollama API error: ${response.status}`);
-  }
-
-  let content = '';
-  let thinkingContent = '';
-  let thinkingStarted = false;
-  let outputStarted = false;
-  const reader = response.body!.getReader();
-  const decoder = new TextDecoder();
-
-  try {
-    while (true) {
-      const { done, value } = await reader.read();
-      if (done) break;
-
-      const chunk = decoder.decode(value, { stream: true });
-      for (const line of chunk.split('\n').filter(l => l.trim())) {
-        try {
-          const json = JSON.parse(line);
-
-          // Stream thinking tokens
-          const thinking = json.message?.thinking || '';
-          if (thinking) {
-            if (!thinkingStarted) {
-              process.stdout.write(`THINKING: `);
-              thinkingStarted = true;
-            }
-            process.stdout.write(thinking);
-            thinkingContent += thinking;
-          }
-
-          // Stream content tokens
-          const token = json.message?.content || '';
-          if (token) {
-            if (!outputStarted) {
-              if (thinkingStarted) process.stdout.write('\n      [Think] ');
-              process.stdout.write(`OUTPUT: `);
-              outputStarted = true;
-            }
-            process.stdout.write(token);
-            content += token;
-          }
-        } catch {
-          // Ignore parse errors for partial chunks
-        }
-      }
-    }
-  } finally {
-    const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
-    process.stdout.write(` (${elapsed}s)\n`);
-  }
-
-  return content.trim();
-}
+Only complete the task after validation passes. Output the final JSON in <task_complete> tags.`;

 /**
 * Parse amount from string (handles European format)
@@ -273,9 +147,31 @@ function extractCurrency(s: string | undefined): string {
 }

 /**
- * Extract JSON from response (handles markdown code blocks)
+ * Extract JSON from response (handles markdown code blocks and task_complete tags)
 */
 function extractJsonFromResponse(response: string): Record<string, unknown> | null {
+  // Try to find JSON in task_complete tags
+  const completeMatch = response.match(/<task_complete>([\s\S]*?)<\/task_complete>/);
+  if (completeMatch) {
+    const content = completeMatch[1].trim();
+    // Try to find JSON in the content
+    const codeBlockMatch = content.match(/```(?:json)?\s*([\s\S]*?)```/);
+    const jsonStr = codeBlockMatch ? codeBlockMatch[1].trim() : content;
+    try {
+      return JSON.parse(jsonStr);
+    } catch {
+      // Try to find JSON object pattern
+      const jsonMatch = jsonStr.match(/\{[\s\S]*\}/);
+      if (jsonMatch) {
+        try {
+          return JSON.parse(jsonMatch[0]);
+        } catch {
+          return null;
+        }
+      }
+    }
+  }
+
  // Try to find JSON in markdown code block
  const codeBlockMatch = response.match(/```(?:json)?\s*([\s\S]*?)```/);
  const jsonStr = codeBlockMatch ? codeBlockMatch[1].trim() : response.trim();
@@ -315,76 +211,27 @@ function parseJsonToInvoice(response: string): IInvoice | null {
 }

 /**
- * Compare two invoices for consensus (key fields must match)
- */
-function invoicesMatch(a: IInvoice, b: IInvoice): boolean {
-  const numMatch = a.invoice_number.toLowerCase() === b.invoice_number.toLowerCase();
-  const dateMatch = a.invoice_date === b.invoice_date;
-  const totalMatch = Math.abs(a.total_amount - b.total_amount) < 0.02;
-  return numMatch && dateMatch && totalMatch;
-}
-
-/**
- * Extract invoice data using consensus approach:
- * 1. Pass 1: Fast JSON extraction
- * 2. Pass 2: Confirm with thinking enabled
- * 3. If mismatch: repeat until consensus or max 5 attempts
+ * Extract invoice data using smartagent orchestrator with vision
 */
 async function extractInvoiceFromImages(images: string[]): Promise<IInvoice> {
-  console.log(`    [Vision] Processing ${images.length} page(s) with ${MODEL} (consensus)`);
+  console.log(`    [Vision] Processing ${images.length} page(s) with smartagent DualAgentOrchestrator`);

-  const MAX_ATTEMPTS = 5;
-  let attempt = 0;
+  const startTime = Date.now();

-  while (attempt < MAX_ATTEMPTS) {
-    attempt++;
-    console.log(`    [Attempt ${attempt}/${MAX_ATTEMPTS}]`);
+  const result = await orchestrator.run(EXTRACTION_PROMPT, { images });

-    // PASS 1: Fast JSON extraction
-    console.log(`    [Pass 1] Fast extraction...`);
-    const fastResponse = await queryJsonFast(images);
-    const fastInvoice = parseJsonToInvoice(fastResponse);
+  const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
+  console.log(`    [Vision] Completed in ${elapsed}s (${result.iterations} iterations, status: ${result.status})`);

-    if (!fastInvoice) {
-      console.log(`    [Pass 1] JSON parsing failed, retrying...`);
-      continue;
-    }
-    console.log(`    [Pass 1] Result: ${fastInvoice.invoice_number} | ${fastInvoice.invoice_date} | ${fastInvoice.total_amount} ${fastInvoice.currency}`);
+  const invoice = parseJsonToInvoice(result.result);

-    // PASS 2: Confirm with thinking
-    console.log(`    [Pass 2] Thinking confirmation...`);
-    const thinkResponse = await queryJsonWithThinking(images);
-    const thinkInvoice = parseJsonToInvoice(thinkResponse);
-
-    if (!thinkInvoice) {
-      console.log(`    [Pass 2] JSON parsing failed, retrying...`);
-      continue;
-    }
-    console.log(`    [Pass 2] Result: ${thinkInvoice.invoice_number} | ${thinkInvoice.invoice_date} | ${thinkInvoice.total_amount} ${thinkInvoice.currency}`);
-
-    // Check consensus
-    if (invoicesMatch(fastInvoice, thinkInvoice)) {
-      console.log(`    [Consensus] MATCH - using result`);
-      return thinkInvoice; // Prefer thinking result
-    }
-
-    console.log(`    [Consensus] MISMATCH - repeating...`);
-    console.log(`      Fast:  ${fastInvoice.invoice_number} | ${fastInvoice.invoice_date} | ${fastInvoice.total_amount}`);
-    console.log(`      Think: ${thinkInvoice.invoice_number} | ${thinkInvoice.invoice_date} | ${thinkInvoice.total_amount}`);
+  if (invoice) {
+    console.log(`    [Result] ${invoice.invoice_number} | ${invoice.invoice_date} | ${invoice.total_amount} ${invoice.currency}`);
+    return invoice;
  }

-  // Max attempts reached - do one final thinking pass and use that
-  console.log(`    [Final] Max attempts reached, using final thinking pass`);
-  const finalResponse = await queryJsonWithThinking(images);
-  const finalInvoice = parseJsonToInvoice(finalResponse);
-
-  if (finalInvoice) {
-    console.log(`    [Final] Result: ${finalInvoice.invoice_number} | ${finalInvoice.invoice_date} | ${finalInvoice.total_amount} ${finalInvoice.currency}`);
-    return finalInvoice;
-  }
-
-  // Return empty invoice if all else fails
-  console.log(`    [Final] All parsing failed, returning empty`);
+  // Return empty invoice if parsing failed
+  console.log(`    [Result] Parsing failed, returning empty invoice`);
  return {
    invoice_number: '',
    invoice_date: '',
@@ -493,6 +340,79 @@ tap.test('setup: ensure Docker containers are running', async () => {
  console.log('\n[Setup] All containers ready!\n');
 });

+tap.test('setup: initialize smartagent orchestrator', async () => {
+  console.log('[Setup] Initializing SmartAi and DualAgentOrchestrator...');
+  // Assigns the module-level smartAi and orchestrator; extractInvoiceFromImages later calls orchestrator.run().
+  smartAi = new SmartAi({
+    ollama: {
+      baseUrl: OLLAMA_URL,
+      model: MODEL,
+      defaultOptions: {
+        num_ctx: 32768,
+        temperature: 0.1,
+      },
+      defaultTimeout: 300000, // 5 minutes for vision tasks
+    },
+  });
+
+  await smartAi.start(); // started here, before the instance is handed to the orchestrator
+
+  orchestrator = new DualAgentOrchestrator({
+    smartAiInstance: smartAi, // the instance created and started above
+    defaultProvider: 'ollama',
+    guardianPolicyPrompt: `You are a Guardian agent overseeing invoice extraction tasks.
+
+APPROVE all tool calls that:
+- Use the json.validate action to verify JSON output
+- Are reasonable attempts to complete the extraction task
+
+REJECT tool calls that:
+- Attempt to access external resources
+- Try to execute arbitrary code
+- Are clearly unrelated to invoice extraction`,
+    driverSystemMessage: `You are an AI assistant that extracts invoice data from images.
+
+Your task is to analyze invoice images and extract structured data.
+You have access to a json.validate tool to verify your JSON output.
+
+IMPORTANT: Always validate your JSON before completing the task.
+
+## Tool Usage Format
+When you need to validate JSON, output:
+
+<tool_call>
+  <tool>json</tool>
+  <action>validate</action>
+  <params>{"jsonString": "YOUR_JSON", "requiredFields": ["invoice_number", "invoice_date", "vendor_name", "currency", "net_amount", "vat_amount", "total_amount"]}</params>
+</tool_call>
+
+## Completion Format
+After validation passes, complete the task:
+
+<task_complete>
+{"invoice_number": "...", "invoice_date": "YYYY-MM-DD", ...}
+</task_complete>`,
+    maxIterations: 5,
+    maxConsecutiveRejections: 3,
+    onToken: (token, source) => {
+      if (source === 'driver') { // echo only driver-sourced tokens to stdout
+        process.stdout.write(token);
+      }
+    },
+    onProgress: (event) => {
+      if (event.logLevel === 'error') { // print only error-level progress events
+        console.error(event.logMessage);
+      }
+    },
+  });
+
+  // Register the JsonValidatorTool before start() — presumably it backs the json.validate action named in the prompts; confirm
+  orchestrator.registerTool(new JsonValidatorTool());
+
+  await orchestrator.start();
+  console.log('[Setup] Orchestrator initialized!\n');
+});
+
 tap.test('should have MiniCPM-V model loaded', async () => {
  const response = await fetch(`${OLLAMA_URL}/api/tags`);
  const data = await response.json();
@@ -501,7 +421,7 @@ tap.test('should have MiniCPM-V model loaded', async () => {
 });

 const testCases = findTestCases();
-console.log(`\nFound ${testCases.length} invoice test cases (MiniCPM-V)\n`);
+console.log(`\nFound ${testCases.length} invoice test cases (smartagent + MiniCPM-V)\n`);

 let passedCount = 0;
 let failedCount = 0;
@@ -538,6 +458,13 @@ for (const testCase of testCases) {
  });
 }

+tap.test('cleanup: stop orchestrator', async () => {
+  // Setup started smartAi itself before handing it to the orchestrator, so stop both (null-safe if setup failed).
+  await orchestrator?.stop();
+  await smartAi?.stop();
+  console.log('[Cleanup] Orchestrator stopped');
+});
+
 tap.test('summary', async () => {
  const totalInvoices = testCases.length;
  const accuracy = totalInvoices > 0 ? (passedCount / totalInvoices) * 100 : 0;
@@ -545,9 +472,10 @@ tap.test('summary', async () => {
  const avgTimeSec = processingTimes.length > 0 ? totalTimeMs / processingTimes.length / 1000 : 0;

  console.log(`\n========================================`);
-  console.log(`   Invoice Extraction Summary (${MODEL})`);
+  console.log(`   Invoice Extraction Summary`);
+  console.log(`   (smartagent + ${MODEL})`);
  console.log(`========================================`);
-  console.log(`  Method:    Consensus (fast + thinking)`);
+  console.log(`  Method:    DualAgentOrchestrator with vision`);
  console.log(`  Passed:    ${passedCount}/${totalInvoices}`);
  console.log(`  Failed:    ${failedCount}/${totalInvoices}`);
  console.log(`  Accuracy:  ${accuracy.toFixed(1)}%`);