/**
 * Invoice extraction using Nanonets-OCR-s + Qwen3 (sequential two-stage pipeline)
 *
 * Stage 1: Nanonets-OCR-s converts ALL document pages to markdown (stop after completion)
 * Stage 2: Qwen3 extracts structured JSON from saved markdown (after Nanonets stops)
 *
 * This approach avoids GPU contention by running services sequentially.
 */
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execFileSync, execSync } from 'child_process';
import * as os from 'os';
import { ensureNanonetsOcr, ensureMiniCpm, isContainerRunning } from './helpers/docker.js';
// Endpoints and model identifiers used by the two pipeline stages.
const NANONETS_URL = 'http://localhost:8000/v1';
const NANONETS_MODEL = 'nanonets/Nanonets-OCR-s';
const OLLAMA_URL = 'http://localhost:11434';
const QWEN_MODEL = 'qwen3:8b';

// Temp directory for storing markdown between stages
const TEMP_MD_DIR = path.join(os.tmpdir(), 'nanonets-invoices-markdown');
/**
 * Invoice fields produced by the extraction stage and compared against the
 * expected-result JSON that sits next to each test PDF.
 */
interface IInvoice {
  /** Unique invoice/document number. */
  invoice_number: string;
  /** Invoice date; the extraction prompt asks for YYYY-MM-DD. */
  invoice_date: string;
  /** Company that issued the invoice. */
  vendor_name: string;
  /** Currency code; the extraction prompt asks for EUR, USD, or GBP. */
  currency: string;
  /** Amount before tax. */
  net_amount: number;
  /** Tax / VAT amount. */
  vat_amount: number;
  /** Final (gross) total. */
  total_amount: number;
}
/** One invoice fixture: a PDF plus the JSON file holding its expected values. */
interface ITestCase {
  /** Fixture base name (file name without the `.pdf` extension). */
  name: string;
  /** Path of the invoice PDF. */
  pdfPath: string;
  /** Path of the expected-result JSON. */
  jsonPath: string;
  /** Path of the markdown written by Stage 1, once it exists. */
  markdownPath?: string;
}
// Nanonets-specific prompt for document OCR to markdown.
// The tag examples must be written without inner spaces (`<img></img>`, not `< img > < / img >`)
// so the model is shown the exact markup it is expected to emit.
const NANONETS_OCR_PROMPT = `Extract the text from the above document as if you were reading it naturally.
Return the tables in html format.
Return the equations in LaTeX representation.
If there is an image in the document and image caption is not present, add a small description inside <img></img> tag.
Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>.
Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number>.`;
// JSON extraction prompt for Qwen3.
// The invoice markdown is appended directly after the trailing "INVOICE TEXT:" line.
// The date format and the JSON example are spelled without stray spaces so the model
// is not nudged towards output such as "2024 - 01 - 15".
const JSON_EXTRACTION_PROMPT = `You are an invoice data extractor. Below is an invoice document converted to text/markdown. Extract the key invoice fields as JSON.
IMPORTANT RULES:
1. invoice_number: The unique invoice/document number (NOT VAT ID, NOT customer ID)
2. invoice_date: Format as YYYY-MM-DD
3. vendor_name: The company that issued the invoice
4. currency: EUR, USD, or GBP
5. net_amount: Amount before tax
6. vat_amount: Tax/VAT amount
7. total_amount: Final total (gross amount)
Return ONLY this JSON format, no explanation:
{
"invoice_number": "INV-2024-001",
"invoice_date": "2024-01-15",
"vendor_name": "Company Name",
"currency": "EUR",
"net_amount": 100.00,
"vat_amount": 19.00,
"total_amount": 119.00
}
INVOICE TEXT:
`;
/**
 * Convert PDF to PNG images
 *
 * Renders the PDF with the `convert` CLI (ImageMagick) into a scratch directory,
 * reads the generated `page-<n>.png` files back in page order and returns them
 * base64-encoded. The scratch directory is removed even when rendering fails.
 *
 * @param pdfPath - Path of the PDF to render.
 * @returns One base64-encoded PNG per generated page file, ordered by page index.
 * @throws If `convert` cannot be started or exits with a non-zero status.
 */
function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');

  // Numeric index taken from "page-<n>.png"; names without one sort last.
  const pageIndex = (fileName: string): number => {
    const digits = /(\d+)\.png$/.exec(fileName)?.[1];
    return digits === undefined ? Number.MAX_SAFE_INTEGER : Number(digits);
  };

  try {
    // Argument array instead of a shell string: the path is passed verbatim, no quoting needed.
    execFileSync(
      'convert',
      ['-density', '150', '-quality', '90', pdfPath, '-background', 'white', '-alpha', 'remove', outputPattern],
      { stdio: 'pipe' },
    );

    return fs
      .readdirSync(tempDir)
      .filter((f) => f.endsWith('.png'))
      // A plain string sort would place page-10 before page-2 and scramble long documents.
      .sort((a, b) => pageIndex(a) - pageIndex(b) || a.localeCompare(b))
      .map((f) => fs.readFileSync(path.join(tempDir, f)).toString('base64'));
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}
/**
 * Convert a single page to markdown using Nanonets-OCR-s
 *
 * Posts the page image together with the OCR prompt to the chat-completions
 * endpoint under NANONETS_URL and returns the trimmed text of the first choice.
 *
 * @param image - Base64-encoded PNG of the page.
 * @param pageNum - 1-based page number, used only for logging.
 * @returns The trimmed markdown, or '' when the response carries no text content.
 * @throws Error when the endpoint answers with a non-OK status.
 */
async function convertPageToMarkdown(image: string, pageNum: number): Promise<string> {
  const startTime = Date.now();

  const response = await fetch(`${NANONETS_URL}/chat/completions`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': 'Bearer dummy',
    },
    body: JSON.stringify({
      model: NANONETS_MODEL,
      messages: [{
        role: 'user',
        content: [
          { type: 'image_url', image_url: { url: `data:image/png;base64,${image}` } },
          { type: 'text', text: NANONETS_OCR_PROMPT },
        ],
      }],
      max_tokens: 4096,
      temperature: 0.0,
    }),
  });

  const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);

  if (!response.ok) {
    const errorText = await response.text();
    throw new Error(`Nanonets API error: ${response.status} - ${errorText}`);
  }

  const data = await response.json();
  // Anything other than a string (missing choice, null content, ...) counts as empty output.
  const raw: unknown = data.choices?.[0]?.message?.content;
  const content = typeof raw === 'string' ? raw.trim() : '';
  console.log(`  Page ${pageNum}: ${content.length} chars (${elapsed}s)`);
  return content;
}
/**
 * Convert all pages of a document to markdown
 *
 * Pages are converted one after another; each page's markdown is prefixed with
 * a `--- PAGE n ---` marker line and the pages are joined with a blank line.
 *
 * @param images - Base64-encoded page images, in page order.
 * @param docName - Document name, used only for logging.
 * @returns The concatenated markdown of all pages.
 */
async function convertDocumentToMarkdown(images: string[], docName: string): Promise<string> {
  console.log(`  [${docName}] Converting ${images.length} page(s)...`);

  const markdownPages: string[] = [];
  for (const [index, image] of images.entries()) {
    const pageNum = index + 1;
    const markdown = await convertPageToMarkdown(image, pageNum);
    markdownPages.push(`--- PAGE ${pageNum} ---\n${markdown}`);
  }

  const fullMarkdown = markdownPages.join('\n\n');
  console.log(`  [${docName}] Complete: ${fullMarkdown.length} chars total`);
  return fullMarkdown;
}
/**
 * Stop Nanonets container
 *
 * Runs `docker stop nanonets-test` and then blocks for five seconds via `sleep`.
 * Failures of either command are logged, not thrown.
 */
function stopNanonets(): void {
  console.log(' [Docker] Stopping Nanonets container...');
  try {
    // `|| true` keeps the shell's exit status at 0 even if `docker stop` itself fails.
    execSync('docker stop nanonets-test 2>/dev/null || true', { stdio: 'pipe' });
    // Fixed pause after the stop command before continuing.
    execSync('sleep 5', { stdio: 'pipe' });
    console.log(' [Docker] Nanonets stopped');
  } catch {
    console.log(' [Docker] Nanonets was not running');
  }
}
/**
 * Ensure Qwen3 model is available
 *
 * Looks for QWEN_MODEL in Ollama's `/api/tags` listing and, when it is not
 * listed (or the listing request answers non-OK), asks Ollama to pull it.
 *
 * @returns true when the model is already listed or the pull request answers OK;
 *          false when the listing request itself throws (e.g. Ollama unreachable)
 *          or the pull request answers non-OK.
 */
async function ensureQwen3(): Promise<boolean> {
  try {
    const response = await fetch(`${OLLAMA_URL}/api/tags`);
    if (response.ok) {
      const data = await response.json();
      const models = data.models || [];
      if (models.some((m: { name: string }) => m.name === QWEN_MODEL)) {
        console.log(`  [Ollama] Model available: ${QWEN_MODEL}`);
        return true;
      }
    }
  } catch {
    return false;
  }

  console.log(`  [Ollama] Pulling ${QWEN_MODEL}...`);
  const pullResponse = await fetch(`${OLLAMA_URL}/api/pull`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    // stream: false -> a single response once the pull has finished.
    body: JSON.stringify({ name: QWEN_MODEL, stream: false }),
  });

  return pullResponse.ok;
}
/**
 * Parse amount from string (handles European format)
 *
 * Numbers are returned unchanged. For strings, the first run of digits, dots
 * and commas is taken; if it contains a comma that comes after the last dot
 * (e.g. "1.234,56" or "119,00") dots are treated as thousands separators and
 * the comma as the decimal point, otherwise commas are dropped ("1,234.56").
 *
 * @param s - Raw amount as found in the model's JSON.
 * @returns The parsed amount, or 0 when nothing numeric can be extracted.
 */
function parseAmount(s: string | number | undefined): number {
  if (s === undefined || s === null) return 0;
  if (typeof s === 'number') return s;
  // The value comes from untyped JSON: a boolean/object/array has no `.match` and no amount.
  if (typeof s !== 'string') return 0;

  const numStr = s.match(/([\d.,]+)/)?.[1];
  if (!numStr) return 0;

  const commaIsDecimal = numStr.includes(',') && numStr.indexOf(',') > numStr.lastIndexOf('.');
  const normalized = commaIsDecimal
    ? numStr.replace(/\./g, '').replace(',', '.')
    : numStr.replace(/,/g, '');

  return parseFloat(normalized) || 0;
}
/**
 * Extract invoice number from potentially verbose response
 *
 * Markdown emphasis (`**`) and backticks are stripped first. The cleaned text
 * is then tried against a list of known number shapes, most specific first;
 * if none matches, everything except letters, digits and hyphens is removed.
 *
 * @param s - Raw invoice number text from the model.
 * @returns The extracted number, or '' for empty input.
 */
function extractInvoiceNumber(s: string | undefined): string {
  if (!s) return '';
  const clean = s.replace(/\*\*/g, '').replace(/`/g, '').trim();

  const patterns = [
    /\b([A-Z]{2,3}\d{10,})\b/i, // 2-3 letters followed by 10+ digits
    /\b([A-Z]\d{8,})\b/i, // 1 letter followed by 8+ digits
    /\b(INV[-\s]?\d{4}[-\s]?\d+)\b/i, // INV-2024-001 style
    /\b(\d{7,})\b/, // bare run of 7+ digits
  ];

  for (const pattern of patterns) {
    const found = clean.match(pattern)?.[1];
    if (found) return found;
  }

  // No known shape: keep only identifier characters, or the cleaned text if nothing is left.
  return clean.replace(/[^A-Z0-9-]/gi, '').trim() || clean;
}
/**
 * Extract date (YYYY-MM-DD) from response
 *
 * An ISO date anywhere in the text wins. Otherwise a day/month/year date
 * separated by `/` or `.` is rewritten to ISO with zero-padded month and day.
 * As a last resort everything except digits and hyphens is stripped.
 *
 * @param s - Raw date text from the model.
 * @returns The ISO-formatted date when one is recognised, otherwise the stripped text.
 */
function extractDate(s: string | undefined): string {
  if (!s) return '';
  const clean = s.replace(/\*\*/g, '').replace(/`/g, '').trim();

  const iso = clean.match(/(\d{4}-\d{2}-\d{2})/)?.[1];
  if (iso) return iso;

  const dmy = /(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})/.exec(clean);
  if (dmy) {
    const [, day = '', month = '', year = ''] = dmy;
    // No spaces around the hyphens: the result is compared verbatim against ISO dates.
    return `${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`;
  }

  return clean.replace(/[^\d-]/g, '').trim();
}
/**
 * Extract currency
 *
 * Recognises EUR, USD and GBP by code or symbol (case-insensitive), checked in
 * that order. Empty or unrecognised input falls back to 'EUR'.
 *
 * @param s - Raw currency text from the model.
 * @returns 'EUR', 'USD' or 'GBP'.
 */
function extractCurrency(s: string | undefined): string {
  if (!s) return 'EUR';
  const upper = s.toUpperCase();

  // Order matters: the first currency whose code or symbol appears wins.
  const known: ReadonlyArray<[code: string, symbol: string]> = [
    ['EUR', '€'],
    ['USD', '$'],
    ['GBP', '£'],
  ];
  for (const [code, symbol] of known) {
    if (upper.includes(code) || upper.includes(symbol)) return code;
  }

  return 'EUR';
}
/**
 * Extract JSON from response
 *
 * Closed `<think>...</think>` sections are removed first. If the remainder
 * contains a fenced code block its content is used, otherwise the whole text.
 * That candidate is parsed directly; if it does not yield a JSON object, the
 * span from the first `{` to the last `}` is tried instead.
 *
 * @param response - Raw model output.
 * @returns The parsed JSON object, or null when no JSON object can be recovered.
 *          Arrays and primitives are rejected so callers never get a non-object
 *          typed as a record.
 */
function extractJsonFromResponse(response: string): Record<string, unknown> | null {
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === 'object' && value !== null && !Array.isArray(value);

  const parseObject = (text: string): Record<string, unknown> | null => {
    try {
      const value: unknown = JSON.parse(text);
      return isRecord(value) ? value : null;
    } catch {
      return null;
    }
  };

  const cleanResponse = response.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
  const fenced = cleanResponse.match(/```(?:json)?\s*([\s\S]*?)```/);
  const jsonStr = fenced ? (fenced[1] ?? '').trim() : cleanResponse;

  const direct = parseObject(jsonStr);
  if (direct) return direct;

  // Fall back to the outermost brace pair, e.g. when prose surrounds the JSON.
  const braced = jsonStr.match(/\{[\s\S]*\}/);
  return braced ? parseObject(braced[0]) : null;
}
/**
 * Parse JSON response into IInvoice
 *
 * Each field of the recovered JSON object is run through its own cleaner, so
 * verbose or markdown-decorated values still end up in the expected shape.
 *
 * @param response - Raw model output.
 * @returns The normalised invoice, or null when no JSON could be recovered.
 */
function parseJsonToInvoice(response: string): IInvoice | null {
  const parsed = extractJsonFromResponse(response);
  if (!parsed) return null;

  // Missing/falsy text fields become '' before cleaning.
  const text = (value: unknown): string => String(value || '');

  return {
    invoice_number: extractInvoiceNumber(text(parsed.invoice_number)),
    invoice_date: extractDate(text(parsed.invoice_date)),
    vendor_name: text(parsed.vendor_name).replace(/\*\*/g, '').replace(/`/g, '').trim(),
    currency: extractCurrency(text(parsed.currency)),
    net_amount: parseAmount(parsed.net_amount as string | number),
    vat_amount: parseAmount(parsed.vat_amount as string | number),
    total_amount: parseAmount(parsed.total_amount as string | number),
  };
}
/**
 * Extract invoice from markdown using Qwen3
 *
 * Sends the extraction prompt followed by the markdown to Ollama's `/api/chat`
 * (non-streaming) and parses the reply into an invoice.
 *
 * @param markdown - Document markdown produced by Stage 1.
 * @param queryId - Label used only in log output.
 * @returns The parsed invoice, or null when the reply holds no usable JSON.
 * @throws Error when Ollama answers with a non-OK status; the fetch itself
 *         rejects when the 10 minute abort timeout elapses.
 */
async function extractInvoiceFromMarkdown(markdown: string, queryId: string): Promise<IInvoice | null> {
  console.log(`  [${queryId}] Sending to ${QWEN_MODEL}...`);
  const startTime = Date.now();

  const response = await fetch(`${OLLAMA_URL}/api/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    signal: AbortSignal.timeout(600000), // 10 minute timeout for large documents
    body: JSON.stringify({
      model: QWEN_MODEL,
      messages: [{
        role: 'user',
        content: JSON_EXTRACTION_PROMPT + markdown,
      }],
      stream: false,
      options: {
        num_predict: 2000,
        temperature: 0.1,
      },
    }),
  });

  const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);

  if (!response.ok) {
    console.log(`  [${queryId}] ERROR: ${response.status} (${elapsed}s)`);
    throw new Error(`Ollama API error: ${response.status}`);
  }

  const data = await response.json();
  const content = (data.message?.content || '').trim();
  console.log(`  [${queryId}] Response: ${content.length} chars (${elapsed}s)`);

  return parseJsonToInvoice(content);
}
/**
 * Compare two invoices for consensus
 *
 * Two extractions agree when the invoice number matches case-insensitively,
 * the date matches exactly and the totals differ by less than 0.02.
 *
 * @param a - First extraction.
 * @param b - Second extraction.
 * @returns true when all three checks pass.
 */
function invoicesMatch(a: IInvoice, b: IInvoice): boolean {
  if (a.invoice_number.toLowerCase() !== b.invoice_number.toLowerCase()) return false;
  if (a.invoice_date !== b.invoice_date) return false;
  // Tolerance absorbs rounding differences between the two runs.
  return Math.abs(a.total_amount - b.total_amount) < 0.02;
}
/**
 * Extract with consensus
 *
 * Up to three attempts are made; each attempt queries the model twice, one
 * query after the other, and accepts the result when both extractions agree
 * (see invoicesMatch). If no attempt reaches consensus, one more extraction is
 * used as-is; if even that yields nothing, an empty invoice is returned.
 *
 * @param markdown - Document markdown produced by Stage 1.
 * @param docName - Document name, used for logging and query labels.
 * @returns The agreed invoice, the fallback extraction, or an empty invoice.
 * @throws Whatever extractInvoiceFromMarkdown throws (API errors are not retried here).
 */
async function extractWithConsensus(markdown: string, docName: string): Promise<IInvoice> {
  const MAX_ATTEMPTS = 3;

  // One-line rendering of the fields that decide consensus.
  const brief = (inv: IInvoice): string =>
    `${inv.invoice_number} | ${inv.invoice_date} | ${inv.total_amount}`;

  for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
    console.log(`  [${docName}] Attempt ${attempt}/${MAX_ATTEMPTS}`);

    const inv1 = await extractInvoiceFromMarkdown(markdown, `${docName}-A${attempt}Q1`);
    const inv2 = await extractInvoiceFromMarkdown(markdown, `${docName}-A${attempt}Q2`);

    if (!inv1 || !inv2) {
      console.log(`  [${docName}] Parsing failed, retrying...`);
      continue;
    }

    console.log(`  [${docName}] Q1: ${brief(inv1)}`);
    console.log(`  [${docName}] Q2: ${brief(inv2)}`);

    if (invoicesMatch(inv1, inv2)) {
      console.log(`  [${docName}] CONSENSUS`);
      return inv2;
    }
    console.log(`  [${docName}] No consensus`);
  }

  // Fallback
  const fallback = await extractInvoiceFromMarkdown(markdown, `${docName}-FALLBACK`);
  if (fallback) {
    console.log(`  [${docName}] FALLBACK: ${brief(fallback)}`);
    return fallback;
  }

  return {
    invoice_number: '',
    invoice_date: '',
    vendor_name: '',
    currency: 'EUR',
    net_amount: 0,
    vat_amount: 0,
    total_amount: 0,
  };
}
/**
 * Normalize date to YYYY-MM-DD
 *
 * Accepts ISO dates (returned unchanged), `DD-MON-YYYY` with a three-letter
 * English month abbreviation, and `DD/MM/YYYY` or `DD.MM.YYYY`. Any other input
 * is returned as-is.
 *
 * @param dateStr - Date text, or null.
 * @returns The ISO date, '' for empty input, or the unchanged input when the
 *          format is not recognised.
 */
function normalizeDate(dateStr: string | null): string {
  if (!dateStr) return '';
  if (/^\d{4}-\d{2}-\d{2}$/.test(dateStr)) return dateStr;

  const monthMap: Record<string, string> = {
    JAN: '01', FEB: '02', MAR: '03', APR: '04', MAY: '05', JUN: '06',
    JUL: '07', AUG: '08', SEP: '09', OCT: '10', NOV: '11', DEC: '12',
  };

  const named = /^(\d{1,2})-([A-Z]{3})-(\d{4})$/i.exec(dateStr);
  if (named) {
    const [, day = '', mon = '', year = ''] = named;
    // Unknown abbreviations fall back to '01'.
    return `${year}-${monthMap[mon.toUpperCase()] || '01'}-${day.padStart(2, '0')}`;
  }

  const numeric = /^(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})$/.exec(dateStr);
  if (numeric) {
    const [, day = '', month = '', year = ''] = numeric;
    return `${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`;
  }

  return dateStr;
}
/**
 * Compare extracted invoice against expected
 *
 * Checks, in this order: invoice number (whitespace and case ignored), date
 * (both sides normalised), total amount (difference up to 0.02 allowed) and
 * currency (case ignored). Vendor, net and VAT amounts are not compared.
 *
 * @param extracted - Invoice produced by the pipeline.
 * @param expected - Invoice from the fixture JSON.
 * @returns `match` is true when no check failed; `errors` holds one message per failed check.
 */
function compareInvoice(
  extracted: IInvoice,
  expected: IInvoice
): { match: boolean; errors: string[] } {
  const errors: string[] = [];

  // Whitespace- and case-insensitive form used for the number comparison.
  const squash = (value: string | undefined): string =>
    value?.replace(/\s/g, '').toLowerCase() || '';

  if (squash(extracted.invoice_number) !== squash(expected.invoice_number)) {
    errors.push(`invoice_number: exp "${expected.invoice_number}", got "${extracted.invoice_number}"`);
  }

  if (normalizeDate(extracted.invoice_date) !== normalizeDate(expected.invoice_date)) {
    errors.push(`invoice_date: exp "${expected.invoice_date}", got "${extracted.invoice_date}"`);
  }

  if (Math.abs(extracted.total_amount - expected.total_amount) > 0.02) {
    errors.push(`total_amount: exp ${expected.total_amount}, got ${extracted.total_amount}`);
  }

  if (extracted.currency?.toUpperCase() !== expected.currency?.toUpperCase()) {
    errors.push(`currency: exp "${expected.currency}", got "${extracted.currency}"`);
  }

  return { match: errors.length === 0, errors };
}
/**
 * Find all test cases
 *
 * Scans `.nogit/invoices` under the current working directory and pairs every
 * `<name>.pdf` with a `<name>.json` in the same directory. PDFs without a
 * matching JSON are skipped.
 *
 * @returns The paired fixtures sorted by name; [] when the directory does not exist.
 */
function findTestCases(): ITestCase[] {
  const testDir = path.join(process.cwd(), '.nogit/invoices');
  if (!fs.existsSync(testDir)) return [];

  const PDF_EXT = '.pdf';
  const files = fs.readdirSync(testDir);
  const available = new Set(files);
  const testCases: ITestCase[] = [];

  for (const pdf of files) {
    if (!pdf.endsWith(PDF_EXT)) continue;

    // Strip only the trailing extension; a name like "a.pdf.v2.pdf" keeps its inner ".pdf".
    const baseName = pdf.slice(0, -PDF_EXT.length);
    const jsonFile = `${baseName}.json`;
    if (!available.has(jsonFile)) continue;

    testCases.push({
      name: baseName,
      pdfPath: path.join(testDir, pdf),
      jsonPath: path.join(testDir, jsonFile),
    });
  }

  return testCases.sort((a, b) => a.name.localeCompare(b.name));
}
// ============ TESTS ============

// Fixtures are discovered once at load time; the per-invoice tests below are registered from this list.
const testCases = findTestCases();
console.log(`\nFound ${testCases.length} invoice test cases\n`);

// Ensure temp directory exists (recursive mkdir is a no-op when it already does)
fs.mkdirSync(TEMP_MD_DIR, { recursive: true });
// -------- STAGE 1: OCR with Nanonets --------

tap.test('Stage 1: Setup Nanonets', async () => {
  console.log('\n========== STAGE 1: Nanonets OCR ==========\n');
  const ok = await ensureNanonetsOcr();
  expect(ok).toBeTrue();
});

tap.test('Stage 1: Convert all invoices to markdown', async () => {
  console.log('\n Converting all invoice PDFs to markdown with Nanonets-OCR-s...\n');

  for (const tc of testCases) {
    console.log(`\n  === ${tc.name} ===`);

    const images = convertPdfToImages(tc.pdfPath);
    console.log(`  Pages: ${images.length}`);

    const markdown = await convertDocumentToMarkdown(images, tc.name);

    // Persist the markdown so Stage 2 can run after the OCR service has been stopped.
    const mdPath = path.join(TEMP_MD_DIR, `${tc.name}.md`);
    fs.writeFileSync(mdPath, markdown);
    tc.markdownPath = mdPath;
    console.log(`  Saved: ${mdPath}`);
  }

  console.log('\n Stage 1 complete: All invoices converted to markdown\n');
});

tap.test('Stage 1: Stop Nanonets', async () => {
  stopNanonets();
  // Extra grace period before checking the container state.
  await new Promise((resolve) => setTimeout(resolve, 3000));
  expect(isContainerRunning('nanonets-test')).toBeFalse();
});
// -------- STAGE 2: Extraction with Qwen3 --------

tap.test('Stage 2: Setup Ollama + Qwen3', async () => {
  console.log('\n========== STAGE 2: Qwen3 Extraction ==========\n');

  // NOTE(review): ensureMiniCpm is presumably what brings up the Ollama service used
  // by Qwen3 as well - confirm against helpers/docker.js.
  const ollamaOk = await ensureMiniCpm();
  expect(ollamaOk).toBeTrue();

  const qwenOk = await ensureQwen3();
  expect(qwenOk).toBeTrue();
});
// Running totals filled in by the per-invoice tests and reported by the Summary test.
let passedCount = 0;
let failedCount = 0;
const processingTimes: number[] = [];

// One tap test per fixture: read the Stage 1 markdown, extract, compare with the expected JSON.
for (const tc of testCases) {
  tap.test(`Stage 2: Extract ${tc.name}`, async () => {
    const expected: IInvoice = JSON.parse(fs.readFileSync(tc.jsonPath, 'utf-8'));
    console.log(`\n  === ${tc.name} ===`);
    console.log(`  Expected: ${expected.invoice_number} | ${expected.invoice_date} | ${expected.total_amount} ${expected.currency}`);

    const startTime = Date.now();

    // The path is rebuilt from the fixture name rather than read from tc.markdownPath.
    const mdPath = path.join(TEMP_MD_DIR, `${tc.name}.md`);
    if (!fs.existsSync(mdPath)) {
      throw new Error(`Markdown not found: ${mdPath}. Run Stage 1 first.`);
    }
    const markdown = fs.readFileSync(mdPath, 'utf-8');
    console.log(`  Markdown: ${markdown.length} chars`);

    const extracted = await extractWithConsensus(markdown, tc.name);

    const elapsedMs = Date.now() - startTime;
    processingTimes.push(elapsedMs);
    const elapsedSec = (elapsedMs / 1000).toFixed(1);

    console.log(`  Extracted: ${extracted.invoice_number} | ${extracted.invoice_date} | ${extracted.total_amount} ${extracted.currency}`);

    const result = compareInvoice(extracted, expected);

    if (result.match) {
      passedCount++;
      console.log(`  Result: MATCH (${elapsedSec}s)`);
    } else {
      failedCount++;
      console.log(`  Result: MISMATCH (${elapsedSec}s)`);
      result.errors.forEach((e) => console.log(`    - ${e}`));
    }

    expect(result.match).toBeTrue();
  });
}
// Final report: pass/fail counts, accuracy and timing, followed by removal of the Stage 1 markdown.
tap.test('Summary', async () => {
  const totalInvoices = testCases.length;
  const accuracy = totalInvoices > 0 ? (passedCount / totalInvoices) * 100 : 0;
  const totalTimeMs = processingTimes.reduce((a, b) => a + b, 0);
  const avgTimeSec = processingTimes.length > 0 ? totalTimeMs / processingTimes.length / 1000 : 0;

  console.log(`\n========================================`);
  console.log(`  Invoice Summary (Nanonets + Qwen3)`);
  console.log(`========================================`);
  console.log(`  Stage 1:     Nanonets-OCR-s (doc -> md)`);
  console.log(`  Stage 2:     Qwen3 8B (md -> JSON)`);
  console.log(`  Passed:      ${passedCount}/${totalInvoices}`);
  console.log(`  Failed:      ${failedCount}/${totalInvoices}`);
  console.log(`  Accuracy:    ${accuracy.toFixed(1)}%`);
  console.log(`----------------------------------------`);
  console.log(`  Total time:  ${(totalTimeMs / 1000).toFixed(1)}s`);
  console.log(`  Avg per inv: ${avgTimeSec.toFixed(1)}s`);
  console.log(`========================================\n`);

  // Cleanup temp files
  try {
    fs.rmSync(TEMP_MD_DIR, { recursive: true, force: true });
    console.log(`  Cleaned up temp directory: ${TEMP_MD_DIR}\n`);
  } catch {
    // Best-effort cleanup: a leftover temp directory must not fail the run.
  }
});

export default tap.start();