/**
 * Invoice extraction using Nanonets-OCR2-3B + GPT-OSS 20B (sequential two-stage pipeline)
 *
 * Stage 1: Nanonets-OCR2-3B converts ALL document pages to markdown (stop after completion)
 * Stage 2: GPT-OSS 20B extracts structured JSON from saved markdown (after Nanonets stops)
 *
 * This approach avoids GPU contention by running services sequentially.
 */
import { tap , expect } from '@git.zone/tstest/tapbundle' ;
import * as fs from 'fs' ;
import * as path from 'path' ;
import { execSync } from 'child_process' ;
import * as os from 'os' ;
2026-01-18 23:00:24 +00:00
import { ensureNanonetsOcr , ensureMiniCpm , isContainerRunning } from './helpers/docker.js' ;
2026-01-20 00:39:36 +00:00
import { SmartAi } from '@push.rocks/smartai' ;
import { DualAgentOrchestrator } from '@push.rocks/smartagent' ;
2026-01-18 15:54:16 +00:00
// --- Service endpoints and model identifiers ---

// vLLM-compatible endpoint serving the Nanonets OCR model (Stage 1).
const NANONETS_URL = 'http://localhost:8000/v1';
const NANONETS_MODEL = 'nanonets/Nanonets-OCR2-3B';

// Local Ollama endpoint used for structured extraction (Stage 2).
const OLLAMA_URL = 'http://localhost:11434';
const EXTRACTION_MODEL = 'gpt-oss:20b';

// Persistent cache directory for storing markdown between runs
const MD_CACHE_DIR = path.join(process.cwd(), '.nogit/invoices-md');

// SmartAi instance for Ollama with optimized settings
const smartAi = new SmartAi({
  ollama: {
    baseUrl: OLLAMA_URL,
    model: EXTRACTION_MODEL,
    defaultOptions: {
      num_ctx: 32768, // Larger context for long invoices + thinking
      temperature: 0, // Deterministic for JSON extraction
    },
    defaultTimeout: 600000, // 10 minute timeout for large documents
  },
});

// DualAgentOrchestrator for structured task execution.
// NOTE: initialized later, in the "Stage 2: Setup" test, once Ollama is up.
let orchestrator: DualAgentOrchestrator;
2026-01-18 23:00:24 +00:00
2026-01-18 15:54:16 +00:00
/** Normalized invoice fields, compared against the ground-truth JSON file. */
interface IInvoice {
  invoice_number: string;
  invoice_date: string; // YYYY-MM-DD after normalization
  vendor_name: string;
  currency: string; // EUR, USD, or GBP
  net_amount: number; // total before tax
  vat_amount: number; // tax amount
  total_amount: number; // final total including tax
}
2026-01-19 11:51:23 +00:00
/** One rendered PDF page: base64-encoded JPEG plus pixel dimensions (used for token estimation). */
interface IImageData {
  base64: string;
  width: number;
  height: number;
  pageNum: number; // 1-based page index
}
2026-01-18 23:00:24 +00:00
/** A discovered invoice test case: source PDF, expected JSON, and (after Stage 1) its markdown. */
interface ITestCase {
  name: string;
  pdfPath: string;
  jsonPath: string;
  markdownPath?: string; // set once Stage 1 has produced or found cached markdown
}
2026-01-18 15:54:16 +00:00
// Nanonets-specific prompt for document OCR to markdown
const NANONETS_OCR_PROMPT = ` Extract the text from the above document as if you were reading it naturally.
Return the tables in html format .
Return the equations in LaTeX representation .
If there is an image in the document and image caption is not present , add a small description inside < img > < / img > tag .
Watermarks should be wrapped in brackets . Ex : < watermark > OFFICIAL COPY < / watermark > .
Page numbers should be wrapped in brackets . Ex : < page_number > 14 < / page_number > . ` ;
2026-01-19 21:19:37 +00:00
// JSON extraction prompt for GPT-OSS 20B (sent AFTER the invoice text is provided)
const JSON_EXTRACTION_PROMPT = ` Extract key fields from the invoice. Return ONLY valid JSON.
2026-01-18 15:54:16 +00:00
2026-01-19 21:19:37 +00:00
WHERE TO FIND DATA :
- invoice_number , invoice_date , vendor_name : Look in the HEADER section at the TOP of PAGE 1 ( near "Invoice no." , "Invoice date:" , "Rechnungsnummer" )
- net_amount , vat_amount , total_amount : Look in the SUMMARY section at the BOTTOM ( look for "Total" , "Amount due" , "Gesamtbetrag" )
RULES :
1 . invoice_number : Extract ONLY the value ( e . g . , "R0015632540" ) , NOT the label "Invoice no."
2 . invoice_date : Convert to YYYY - MM - DD format ( e . g . , "14/04/2022" → "2022-04-14" )
3 . vendor_name : The company issuing the invoice
2026-01-18 15:54:16 +00:00
4 . currency : EUR , USD , or GBP
2026-01-19 21:19:37 +00:00
5 . net_amount : Total before tax
6 . vat_amount : Tax amount
7 . total_amount : Final total with tax
2026-01-18 15:54:16 +00:00
2026-01-19 21:19:37 +00:00
JSON only :
2026-01-20 00:39:36 +00:00
{ "invoice_number" : "X" , "invoice_date" : "YYYY-MM-DD" , "vendor_name" : "X" , "currency" : "EUR" , "net_amount" : 0 , "vat_amount" : 0 , "total_amount" : 0 }
Double check for valid JSON syntax .
` ;
2026-01-18 15:54:16 +00:00
2026-01-19 11:51:23 +00:00
// Constants for smart batching
const PATCH_SIZE = 14 ; // Qwen2.5-VL uses 14x14 patches
/ * *
* Estimate visual tokens for an image based on dimensions
* /
function estimateVisualTokens ( width : number , height : number ) : number {
return Math . ceil ( ( width * height ) / ( PATCH_SIZE * PATCH_SIZE ) ) ;
}
/ * *
2026-01-19 21:05:51 +00:00
* Process images one page at a time for reliability
2026-01-19 11:51:23 +00:00
* /
function batchImages ( images : IImageData [ ] ) : IImageData [ ] [ ] {
2026-01-19 21:05:51 +00:00
// One page per batch for reliable processing
return images . map ( img = > [ img ] ) ;
2026-01-19 11:51:23 +00:00
}
2026-01-18 15:54:16 +00:00
/ * *
2026-01-19 11:51:23 +00:00
* Convert PDF to JPEG images using ImageMagick with dimension tracking
2026-01-18 15:54:16 +00:00
* /
2026-01-19 11:51:23 +00:00
function convertPdfToImages ( pdfPath : string ) : IImageData [ ] {
2026-01-18 15:54:16 +00:00
const tempDir = fs . mkdtempSync ( path . join ( os . tmpdir ( ) , 'pdf-convert-' ) ) ;
2026-01-19 11:51:23 +00:00
const outputPattern = path . join ( tempDir , 'page-%d.jpg' ) ;
2026-01-18 15:54:16 +00:00
try {
execSync (
` convert -density 150 -quality 90 " ${ pdfPath } " -background white -alpha remove " ${ outputPattern } " ` ,
{ stdio : 'pipe' }
) ;
2026-01-19 11:51:23 +00:00
const files = fs . readdirSync ( tempDir ) . filter ( ( f : string ) = > f . endsWith ( '.jpg' ) ) . sort ( ) ;
const images : IImageData [ ] = [ ] ;
2026-01-18 15:54:16 +00:00
2026-01-19 11:51:23 +00:00
for ( let i = 0 ; i < files . length ; i ++ ) {
const file = files [ i ] ;
2026-01-18 15:54:16 +00:00
const imagePath = path . join ( tempDir , file ) ;
const imageData = fs . readFileSync ( imagePath ) ;
2026-01-19 11:51:23 +00:00
// Get image dimensions using identify command
const dimensions = execSync ( ` identify -format "%w %h" " ${ imagePath } " ` , { encoding : 'utf-8' } ) . trim ( ) ;
const [ width , height ] = dimensions . split ( ' ' ) . map ( Number ) ;
images . push ( {
base64 : imageData.toString ( 'base64' ) ,
width ,
height ,
pageNum : i + 1 ,
} ) ;
2026-01-18 15:54:16 +00:00
}
return images ;
} finally {
fs . rmSync ( tempDir , { recursive : true , force : true } ) ;
}
}
/ * *
2026-01-19 11:51:23 +00:00
* Convert a batch of pages to markdown using Nanonets - OCR - s
2026-01-18 15:54:16 +00:00
* /
2026-01-19 11:51:23 +00:00
async function convertBatchToMarkdown ( batch : IImageData [ ] ) : Promise < string > {
2026-01-18 15:54:16 +00:00
const startTime = Date . now ( ) ;
2026-01-19 11:51:23 +00:00
const pageNums = batch . map ( img = > img . pageNum ) . join ( ', ' ) ;
// Build content array with all images first, then the prompt
const content : Array < { type : string ; image_url ? : { url : string } ; text? : string } > = [ ] ;
for ( const img of batch ) {
content . push ( {
type : 'image_url' ,
image_url : { url : ` data:image/jpeg;base64, ${ img . base64 } ` } ,
} ) ;
}
// Add prompt with page separator instruction if multiple pages
const promptText = batch . length > 1
? ` ${ NANONETS_OCR_PROMPT } \ n \ nPlease clearly separate each page's content with "--- PAGE N ---" markers, where N is the page number starting from ${ batch [ 0 ] . pageNum } . `
: NANONETS_OCR_PROMPT ;
content . push ( { type : 'text' , text : promptText } ) ;
2026-01-18 15:54:16 +00:00
const response = await fetch ( ` ${ NANONETS_URL } /chat/completions ` , {
method : 'POST' ,
headers : {
'Content-Type' : 'application/json' ,
'Authorization' : 'Bearer dummy' ,
} ,
body : JSON.stringify ( {
model : NANONETS_MODEL ,
messages : [ {
role : 'user' ,
2026-01-19 11:51:23 +00:00
content ,
2026-01-18 15:54:16 +00:00
} ] ,
2026-01-19 11:51:23 +00:00
max_tokens : 4096 * batch . length , // Scale output tokens with batch size
2026-01-18 15:54:16 +00:00
temperature : 0.0 ,
} ) ,
2026-01-19 21:05:51 +00:00
signal : AbortSignal.timeout ( 600000 ) , // 10 minute timeout for OCR
2026-01-18 15:54:16 +00:00
} ) ;
const elapsed = ( ( Date . now ( ) - startTime ) / 1000 ) . toFixed ( 1 ) ;
if ( ! response . ok ) {
const errorText = await response . text ( ) ;
2026-01-18 23:00:24 +00:00
throw new Error ( ` Nanonets API error: ${ response . status } - ${ errorText } ` ) ;
2026-01-18 15:54:16 +00:00
}
const data = await response . json ( ) ;
2026-01-19 11:51:23 +00:00
let responseContent = ( data . choices ? . [ 0 ] ? . message ? . content || '' ) . trim ( ) ;
// For single-page batches, add page marker if not present
if ( batch . length === 1 && ! responseContent . includes ( '--- PAGE' ) ) {
responseContent = ` --- PAGE ${ batch [ 0 ] . pageNum } --- \ n ${ responseContent } ` ;
}
console . log ( ` Pages [ ${ pageNums } ]: ${ responseContent . length } chars ( ${ elapsed } s) ` ) ;
return responseContent ;
2026-01-18 15:54:16 +00:00
}
/ * *
2026-01-19 11:51:23 +00:00
* Convert all pages of a document to markdown using smart batching
2026-01-18 15:54:16 +00:00
* /
2026-01-19 11:51:23 +00:00
async function convertDocumentToMarkdown ( images : IImageData [ ] , docName : string ) : Promise < string > {
const batches = batchImages ( images ) ;
console . log ( ` [ ${ docName } ] Processing ${ images . length } page(s) in ${ batches . length } batch(es)... ` ) ;
const markdownParts : string [ ] = [ ] ;
for ( let i = 0 ; i < batches . length ; i ++ ) {
const batch = batches [ i ] ;
const batchTokens = batch . reduce ( ( sum , img ) = > sum + estimateVisualTokens ( img . width , img . height ) , 0 ) ;
console . log ( ` Batch ${ i + 1 } : ${ batch . length } page(s), ~ ${ batchTokens } tokens ` ) ;
const markdown = await convertBatchToMarkdown ( batch ) ;
markdownParts . push ( markdown ) ;
2026-01-18 15:54:16 +00:00
}
2026-01-19 11:51:23 +00:00
const fullMarkdown = markdownParts . join ( '\n\n' ) ;
2026-01-18 23:00:24 +00:00
console . log ( ` [ ${ docName } ] Complete: ${ fullMarkdown . length } chars total ` ) ;
2026-01-18 15:54:16 +00:00
return fullMarkdown ;
}
2026-01-18 23:00:24 +00:00
/**
 * Stop the Nanonets docker container (best effort), then wait a few seconds
 * so the GPU can release memory before Stage 2 starts.
 */
function stopNanonets(): void {
  console.log('  [Docker] Stopping Nanonets container...');
  try {
    // '|| true' makes docker-stop best-effort; the catch is belt-and-braces.
    execSync('docker stop nanonets-test 2>/dev/null || true', { stdio: 'pipe' });
    // Brief pause so the container/GPU fully releases resources.
    execSync('sleep 5', { stdio: 'pipe' });
    console.log('  [Docker] Nanonets stopped');
  } catch {
    console.log('  [Docker] Nanonets was not running');
  }
}
2026-01-18 15:54:16 +00:00
/ * *
2026-01-19 11:51:23 +00:00
* Ensure GPT - OSS 20 B model is available
2026-01-18 15:54:16 +00:00
* /
2026-01-19 11:51:23 +00:00
async function ensureExtractionModel ( ) : Promise < boolean > {
2026-01-18 15:54:16 +00:00
try {
const response = await fetch ( ` ${ OLLAMA_URL } /api/tags ` ) ;
if ( response . ok ) {
const data = await response . json ( ) ;
const models = data . models || [ ] ;
2026-01-19 11:51:23 +00:00
if ( models . some ( ( m : { name : string } ) = > m . name === EXTRACTION_MODEL ) ) {
console . log ( ` [Ollama] Model available: ${ EXTRACTION_MODEL } ` ) ;
2026-01-18 15:54:16 +00:00
return true ;
}
}
} catch {
return false ;
}
2026-01-19 11:51:23 +00:00
console . log ( ` [Ollama] Pulling ${ EXTRACTION_MODEL } ... ` ) ;
2026-01-18 15:54:16 +00:00
const pullResponse = await fetch ( ` ${ OLLAMA_URL } /api/pull ` , {
method : 'POST' ,
headers : { 'Content-Type' : 'application/json' } ,
2026-01-19 11:51:23 +00:00
body : JSON.stringify ( { name : EXTRACTION_MODEL , stream : false } ) ,
2026-01-18 15:54:16 +00:00
} ) ;
return pullResponse . ok ;
}
/ * *
* Parse amount from string ( handles European format )
* /
function parseAmount ( s : string | number | undefined ) : number {
if ( s === undefined || s === null ) return 0 ;
if ( typeof s === 'number' ) return s ;
const match = s . match ( /([\d.,]+)/ ) ;
if ( ! match ) return 0 ;
const numStr = match [ 1 ] ;
const normalized = numStr . includes ( ',' ) && numStr . indexOf ( ',' ) > numStr . lastIndexOf ( '.' )
? numStr . replace ( /\./g , '' ) . replace ( ',' , '.' )
: numStr . replace ( /,/g , '' ) ;
return parseFloat ( normalized ) || 0 ;
}
/ * *
* Extract invoice number from potentially verbose response
* /
function extractInvoiceNumber ( s : string | undefined ) : string {
if ( ! s ) return '' ;
let clean = s . replace ( /\*\*/g , '' ) . replace ( /`/g , '' ) . trim ( ) ;
const patterns = [
2026-01-18 23:00:24 +00:00
/\b([A-Z]{2,3}\d{10,})\b/i ,
/\b([A-Z]\d{8,})\b/i ,
/\b(INV[-\s]?\d{4}[-\s]?\d+)\b/i ,
/\b(\d{7,})\b/ ,
2026-01-18 15:54:16 +00:00
] ;
for ( const pattern of patterns ) {
const match = clean . match ( pattern ) ;
if ( match ) return match [ 1 ] ;
}
return clean . replace ( /[^A-Z0-9-]/gi , '' ) . trim ( ) || clean ;
}
/ * *
* Extract date ( YYYY - MM - DD ) from response
* /
function extractDate ( s : string | undefined ) : string {
if ( ! s ) return '' ;
let clean = s . replace ( /\*\*/g , '' ) . replace ( /`/g , '' ) . trim ( ) ;
const isoMatch = clean . match ( /(\d{4}-\d{2}-\d{2})/ ) ;
if ( isoMatch ) return isoMatch [ 1 ] ;
const dmyMatch = clean . match ( /(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})/ ) ;
if ( dmyMatch ) {
return ` ${ dmyMatch [ 3 ] } - ${ dmyMatch [ 2 ] . padStart ( 2 , '0' ) } - ${ dmyMatch [ 1 ] . padStart ( 2 , '0' ) } ` ;
}
return clean . replace ( /[^\d-]/g , '' ) . trim ( ) ;
}
/ * *
* Extract currency
* /
function extractCurrency ( s : string | undefined ) : string {
if ( ! s ) return 'EUR' ;
const upper = s . toUpperCase ( ) ;
if ( upper . includes ( 'EUR' ) || upper . includes ( '€' ) ) return 'EUR' ;
if ( upper . includes ( 'USD' ) || upper . includes ( '$' ) ) return 'USD' ;
if ( upper . includes ( 'GBP' ) || upper . includes ( '£' ) ) return 'GBP' ;
return 'EUR' ;
}
/ * *
2026-01-18 23:00:24 +00:00
* Extract JSON from response
2026-01-18 15:54:16 +00:00
* /
function extractJsonFromResponse ( response : string ) : Record < string , unknown > | null {
let cleanResponse = response . replace ( /<think>[\s\S]*?<\/think>/g , '' ) . trim ( ) ;
const codeBlockMatch = cleanResponse . match ( /```(?:json)?\s*([\s\S]*?)```/ ) ;
const jsonStr = codeBlockMatch ? codeBlockMatch [ 1 ] . trim ( ) : cleanResponse ;
try {
return JSON . parse ( jsonStr ) ;
} catch {
const jsonMatch = jsonStr . match ( /\{[\s\S]*\}/ ) ;
if ( jsonMatch ) {
try {
return JSON . parse ( jsonMatch [ 0 ] ) ;
} catch {
return null ;
}
}
return null ;
}
}
/ * *
* Parse JSON response into IInvoice
* /
function parseJsonToInvoice ( response : string ) : IInvoice | null {
const parsed = extractJsonFromResponse ( response ) ;
if ( ! parsed ) return null ;
return {
invoice_number : extractInvoiceNumber ( String ( parsed . invoice_number || '' ) ) ,
invoice_date : extractDate ( String ( parsed . invoice_date || '' ) ) ,
vendor_name : String ( parsed . vendor_name || '' ) . replace ( /\*\*/g , '' ) . replace ( /`/g , '' ) . trim ( ) ,
currency : extractCurrency ( String ( parsed . currency || '' ) ) ,
net_amount : parseAmount ( parsed . net_amount as string | number ) ,
vat_amount : parseAmount ( parsed . vat_amount as string | number ) ,
total_amount : parseAmount ( parsed . total_amount as string | number ) ,
} ;
}
/ * *
2026-01-20 00:39:36 +00:00
* Extract invoice from markdown using smartagent DualAgentOrchestrator
2026-01-18 15:54:16 +00:00
* /
async function extractInvoiceFromMarkdown ( markdown : string , queryId : string ) : Promise < IInvoice | null > {
const startTime = Date . now ( ) ;
2026-01-19 11:51:23 +00:00
2026-01-20 00:39:36 +00:00
console . log ( ` [ ${ queryId } ] Invoice: ${ markdown . length } chars ` ) ;
2026-01-18 15:54:16 +00:00
2026-01-20 00:39:36 +00:00
// Build the extraction task with document context
const taskPrompt = ` Extract the invoice data from this document and output ONLY the JSON:
2026-01-18 15:54:16 +00:00
2026-01-20 00:39:36 +00:00
$ { markdown }
2026-01-18 15:54:16 +00:00
2026-01-20 00:39:36 +00:00
$ { JSON_EXTRACTION_PROMPT } ` ;
2026-01-18 15:54:16 +00:00
2026-01-19 11:51:23 +00:00
try {
2026-01-20 00:39:36 +00:00
const result = await orchestrator . run ( taskPrompt ) ;
const elapsed = ( ( Date . now ( ) - startTime ) / 1000 ) . toFixed ( 1 ) ;
console . log ( ` [ ${ queryId } ] Status: ${ result . status } , Iterations: ${ result . iterations } ( ${ elapsed } s) ` ) ;
if ( result . success && result . result ) {
console . log ( ` [ ${ queryId } ] Result: ${ result . result . substring ( 0 , 100 ) } ... ` ) ;
return parseJsonToInvoice ( result . result ) ;
2026-01-19 11:51:23 +00:00
}
2026-01-18 15:54:16 +00:00
2026-01-20 00:39:36 +00:00
// Fallback: try parsing from history
if ( result . history ? . length > 0 ) {
const lastMessage = result . history [ result . history . length - 1 ] ;
if ( lastMessage ? . content ) {
return parseJsonToInvoice ( lastMessage . content ) ;
}
}
2026-01-19 11:51:23 +00:00
2026-01-20 00:39:36 +00:00
return null ;
} catch ( error ) {
const elapsed = ( ( Date . now ( ) - startTime ) / 1000 ) . toFixed ( 1 ) ;
console . log ( ` [ ${ queryId } ] ERROR: ${ error } ( ${ elapsed } s) ` ) ;
throw error ;
}
2026-01-18 15:54:16 +00:00
}
/ * *
2026-01-19 11:51:23 +00:00
* Extract invoice ( single pass - GPT - OSS is more reliable )
2026-01-18 15:54:16 +00:00
* /
2026-01-19 11:51:23 +00:00
async function extractInvoice ( markdown : string , docName : string ) : Promise < IInvoice > {
console . log ( ` [ ${ docName } ] Extracting... ` ) ;
const invoice = await extractInvoiceFromMarkdown ( markdown , docName ) ;
if ( ! invoice ) {
return {
invoice_number : '' ,
invoice_date : '' ,
vendor_name : '' ,
currency : 'EUR' ,
net_amount : 0 ,
vat_amount : 0 ,
total_amount : 0 ,
} ;
2026-01-18 15:54:16 +00:00
}
2026-01-19 11:51:23 +00:00
console . log ( ` [ ${ docName } ] Extracted: ${ invoice . invoice_number } ` ) ;
return invoice ;
2026-01-18 15:54:16 +00:00
}
/ * *
* Normalize date to YYYY - MM - DD
* /
function normalizeDate ( dateStr : string | null ) : string {
if ( ! dateStr ) return '' ;
if ( /^\d{4}-\d{2}-\d{2}$/ . test ( dateStr ) ) return dateStr ;
const monthMap : Record < string , string > = {
JAN : '01' , FEB : '02' , MAR : '03' , APR : '04' , MAY : '05' , JUN : '06' ,
JUL : '07' , AUG : '08' , SEP : '09' , OCT : '10' , NOV : '11' , DEC : '12' ,
} ;
let match = dateStr . match ( /^(\d{1,2})-([A-Z]{3})-(\d{4})$/i ) ;
if ( match ) {
return ` ${ match [ 3 ] } - ${ monthMap [ match [ 2 ] . toUpperCase ( ) ] || '01' } - ${ match [ 1 ] . padStart ( 2 , '0' ) } ` ;
}
match = dateStr . match ( /^(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})$/ ) ;
if ( match ) {
return ` ${ match [ 3 ] } - ${ match [ 2 ] . padStart ( 2 , '0' ) } - ${ match [ 1 ] . padStart ( 2 , '0' ) } ` ;
}
return dateStr ;
}
/ * *
* Compare extracted invoice against expected
* /
function compareInvoice (
extracted : IInvoice ,
expected : IInvoice
) : { match : boolean ; errors : string [ ] } {
const errors : string [ ] = [ ] ;
const extNum = extracted . invoice_number ? . replace ( /\s/g , '' ) . toLowerCase ( ) || '' ;
const expNum = expected . invoice_number ? . replace ( /\s/g , '' ) . toLowerCase ( ) || '' ;
if ( extNum !== expNum ) {
2026-01-18 23:00:24 +00:00
errors . push ( ` invoice_number: exp " ${ expected . invoice_number } ", got " ${ extracted . invoice_number } " ` ) ;
2026-01-18 15:54:16 +00:00
}
if ( normalizeDate ( extracted . invoice_date ) !== normalizeDate ( expected . invoice_date ) ) {
2026-01-18 23:00:24 +00:00
errors . push ( ` invoice_date: exp " ${ expected . invoice_date } ", got " ${ extracted . invoice_date } " ` ) ;
2026-01-18 15:54:16 +00:00
}
if ( Math . abs ( extracted . total_amount - expected . total_amount ) > 0.02 ) {
2026-01-18 23:00:24 +00:00
errors . push ( ` total_amount: exp ${ expected . total_amount } , got ${ extracted . total_amount } ` ) ;
2026-01-18 15:54:16 +00:00
}
if ( extracted . currency ? . toUpperCase ( ) !== expected . currency ? . toUpperCase ( ) ) {
2026-01-18 23:00:24 +00:00
errors . push ( ` currency: exp " ${ expected . currency } ", got " ${ extracted . currency } " ` ) ;
2026-01-18 15:54:16 +00:00
}
return { match : errors.length === 0 , errors } ;
}
/ * *
2026-01-18 23:00:24 +00:00
* Find all test cases
2026-01-18 15:54:16 +00:00
* /
2026-01-18 23:00:24 +00:00
function findTestCases ( ) : ITestCase [ ] {
2026-01-18 15:54:16 +00:00
const testDir = path . join ( process . cwd ( ) , '.nogit/invoices' ) ;
2026-01-18 23:00:24 +00:00
if ( ! fs . existsSync ( testDir ) ) return [ ] ;
2026-01-18 15:54:16 +00:00
const files = fs . readdirSync ( testDir ) ;
2026-01-18 23:00:24 +00:00
const testCases : ITestCase [ ] = [ ] ;
2026-01-18 15:54:16 +00:00
2026-01-18 23:00:24 +00:00
for ( const pdf of files . filter ( ( f ) = > f . endsWith ( '.pdf' ) ) ) {
2026-01-18 15:54:16 +00:00
const baseName = pdf . replace ( '.pdf' , '' ) ;
const jsonFile = ` ${ baseName } .json ` ;
if ( files . includes ( jsonFile ) ) {
testCases . push ( {
name : baseName ,
pdfPath : path.join ( testDir , pdf ) ,
jsonPath : path.join ( testDir , jsonFile ) ,
} ) ;
}
}
2026-01-18 23:00:24 +00:00
return testCases . sort ( ( a , b ) = > a . name . localeCompare ( b . name ) ) ;
2026-01-18 15:54:16 +00:00
}
2026-01-18 23:00:24 +00:00
// ============ TESTS ============

// Discover test cases once at module load; tap tests below iterate this list.
const testCases = findTestCases();
console.log(`\nFound ${testCases.length} invoice test cases\n`);

// Ensure the markdown cache directory exists before Stage 1 runs.
if (!fs.existsSync(MD_CACHE_DIR)) {
  fs.mkdirSync(MD_CACHE_DIR, { recursive: true });
}
2026-01-18 15:54:16 +00:00
2026-01-18 23:00:24 +00:00
// -------- STAGE 1: OCR with Nanonets --------

tap.test('Stage 1: Convert invoices to markdown (with caching)', async () => {
  console.log('\n========== STAGE 1: Nanonets OCR ==========\n');

  // Partition test cases: already-cached markdown vs. pages needing OCR.
  const needsConversion: ITestCase[] = [];
  let cachedCount = 0;
  for (const tc of testCases) {
    const mdPath = path.join(MD_CACHE_DIR, `${tc.name}.md`);
    if (fs.existsSync(mdPath)) {
      cachedCount++;
      tc.markdownPath = mdPath;
      console.log(`  [CACHED] ${tc.name} - using cached markdown`);
    } else {
      needsConversion.push(tc);
    }
  }

  console.log(`\n  Summary: ${cachedCount} cached, ${needsConversion.length} need conversion\n`);
  if (needsConversion.length === 0) {
    console.log('  All invoices already cached, skipping Nanonets OCR\n');
    return;
  }

  // Start Nanonets only if there are files to convert (container is GPU-heavy).
  console.log('  Starting Nanonets for OCR conversion...\n');
  const ok = await ensureNanonetsOcr();
  expect(ok).toBeTrue();

  // Convert only the invoices that need conversion; cache each result to disk.
  for (const tc of needsConversion) {
    console.log(`\n  === ${tc.name} ===`);
    const images = convertPdfToImages(tc.pdfPath);
    console.log(`  Pages: ${images.length}`);
    const markdown = await convertDocumentToMarkdown(images, tc.name);

    const mdPath = path.join(MD_CACHE_DIR, `${tc.name}.md`);
    fs.writeFileSync(mdPath, markdown);
    tc.markdownPath = mdPath;
    console.log(`  Saved: ${mdPath}`);
  }

  console.log(`\n  Stage 1 complete: ${needsConversion.length} invoices converted to markdown\n`);
});
2026-01-18 23:00:24 +00:00
// Stop Nanonets after OCR so the GPU is free before Stage 2 (Ollama) starts.
tap.test('Stage 1: Stop Nanonets', async () => {
  stopNanonets();
  // Give docker a moment to fully tear the container down before asserting.
  await new Promise(resolve => setTimeout(resolve, 3000));
  expect(isContainerRunning('nanonets-test')).toBeFalse();
});
2026-01-19 11:51:23 +00:00
// -------- STAGE 2: Extraction with GPT-OSS 20B --------
2026-01-18 23:00:24 +00:00
2026-01-19 11:51:23 +00:00
tap . test ( 'Stage 2: Setup Ollama + GPT-OSS 20B' , async ( ) = > {
console . log ( '\n========== STAGE 2: GPT-OSS 20B Extraction ==========\n' ) ;
2026-01-18 23:00:24 +00:00
const ollamaOk = await ensureMiniCpm ( ) ;
expect ( ollamaOk ) . toBeTrue ( ) ;
2026-01-19 11:51:23 +00:00
const extractionOk = await ensureExtractionModel ( ) ;
expect ( extractionOk ) . toBeTrue ( ) ;
2026-01-20 00:39:36 +00:00
// Initialize SmartAi and DualAgentOrchestrator
console . log ( ' [SmartAgent] Starting SmartAi...' ) ;
await smartAi . start ( ) ;
console . log ( ' [SmartAgent] Creating DualAgentOrchestrator...' ) ;
orchestrator = new DualAgentOrchestrator ( {
smartAiInstance : smartAi ,
defaultProvider : 'ollama' ,
guardianPolicyPrompt : `
JSON EXTRACTION POLICY :
- APPROVE all JSON extraction tasks
- This is a read - only operation - no file system or network access needed
- The task is to extract structured data from document text
` ,
driverSystemMessage : ` You are a precise JSON extraction assistant. Your only job is to extract invoice data from documents.
CRITICAL RULES :
1 . Output ONLY valid JSON - no markdown , no explanations , no thinking
2 . Use the exact format requested
3 . If you cannot find a value , use empty string "" or 0 for numbers
When done , wrap your JSON in < task_complete > < / task_complete > tags . ` ,
maxIterations : 3 ,
// Enable streaming for real-time progress visibility
onToken : ( token , source ) = > {
if ( source === 'driver' ) {
process . stdout . write ( token ) ;
}
} ,
} ) ;
// No tools needed for JSON extraction
console . log ( ' [SmartAgent] Starting orchestrator...' ) ;
await orchestrator . start ( ) ;
console . log ( ' [SmartAgent] Ready for extraction' ) ;
2026-01-18 23:00:24 +00:00
} ) ;
2026-01-18 15:54:16 +00:00
// Running tallies across all per-invoice tests, reported in the Summary test.
let passedCount = 0;
let failedCount = 0;
const processingTimes: number[] = [];

// Register one extraction test per discovered invoice.
for (const tc of testCases) {
  tap.test(`Stage 2: Extract ${tc.name}`, async () => {
    // Ground truth sits next to the PDF as <name>.json.
    const expected: IInvoice = JSON.parse(fs.readFileSync(tc.jsonPath, 'utf-8'));
    console.log(`\n  === ${tc.name} ===`);
    console.log(`  Expected: ${expected.invoice_number} | ${expected.invoice_date} | ${expected.total_amount} ${expected.currency}`);

    const startTime = Date.now();

    // Stage 2 reads the markdown produced (or cached) by Stage 1.
    const mdPath = path.join(MD_CACHE_DIR, `${tc.name}.md`);
    if (!fs.existsSync(mdPath)) {
      throw new Error(`Markdown not found: ${mdPath}. Run Stage 1 first.`);
    }
    const markdown = fs.readFileSync(mdPath, 'utf-8');
    console.log(`  Markdown: ${markdown.length} chars`);

    const extracted = await extractInvoice(markdown, tc.name);
    const elapsedMs = Date.now() - startTime;
    processingTimes.push(elapsedMs);

    console.log(`  Extracted: ${extracted.invoice_number} | ${extracted.invoice_date} | ${extracted.total_amount} ${extracted.currency}`);

    const result = compareInvoice(extracted, expected);
    if (result.match) {
      passedCount++;
      console.log(`  Result: MATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
    } else {
      failedCount++;
      console.log(`  Result: MISMATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
      result.errors.forEach(e => console.log(`    - ${e}`));
    }
    expect(result.match).toBeTrue();
  });
}
2026-01-18 23:00:24 +00:00
// Final test: tear down SmartAgent/SmartAi, then print aggregate statistics.
tap.test('Summary', async () => {
  // Cleanup orchestrator and SmartAi
  if (orchestrator) {
    console.log('\n  [SmartAgent] Stopping orchestrator...');
    await orchestrator.stop();
  }
  console.log('  [SmartAgent] Stopping SmartAi...');
  await smartAi.stop();

  const totalInvoices = testCases.length;
  const accuracy = totalInvoices > 0 ? (passedCount / totalInvoices) * 100 : 0;
  const totalTimeMs = processingTimes.reduce((a, b) => a + b, 0);
  const avgTimeSec = processingTimes.length > 0 ? totalTimeMs / processingTimes.length / 1000 : 0;

  console.log(`\n========================================`);
  console.log(`  Invoice Summary (Nanonets + GPT-OSS 20B)`);
  console.log(`========================================`);
  console.log(`  Stage 1: Nanonets-OCR-s (doc -> md)`);
  console.log(`  Stage 2: GPT-OSS 20B + SmartAgent (md -> JSON)`);
  console.log(`  Passed: ${passedCount}/${totalInvoices}`);
  console.log(`  Failed: ${failedCount}/${totalInvoices}`);
  console.log(`  Accuracy: ${accuracy.toFixed(1)}%`);
  console.log(`----------------------------------------`);
  console.log(`  Total time: ${(totalTimeMs / 1000).toFixed(1)}s`);
  console.log(`  Avg per inv: ${avgTimeSec.toFixed(1)}s`);
  console.log(`========================================\n`);
  console.log(`  Cache location: ${MD_CACHE_DIR}\n`);
});

export default tap.start();