/**
 * Invoice extraction using Nanonets-OCR-s + GPT-OSS 20B (sequential two-stage pipeline)
 *
 * Stage 1: Nanonets-OCR-s converts ALL document pages to markdown (stop after completion)
 * Stage 2: GPT-OSS 20B extracts structured JSON from saved markdown (after Nanonets stops)
 *
 * This approach avoids GPU contention by running services sequentially.
 */
import { tap , expect } from '@git.zone/tstest/tapbundle' ;
import * as fs from 'fs' ;
import * as path from 'path' ;
import { execSync } from 'child_process' ;
import * as os from 'os' ;
import { ensureNanonetsOcr , ensureMiniCpm , isContainerRunning } from './helpers/docker.js' ;
const NANONETS_URL = 'http://localhost:8000/v1' ;
const NANONETS_MODEL = 'nanonets/Nanonets-OCR-s' ;
const OLLAMA_URL = 'http://localhost:11434' ;
2026-01-19 11:51:23 +00:00
const EXTRACTION_MODEL = 'gpt-oss:20b' ;
2026-01-18 15:54:16 +00:00
2026-01-18 23:00:24 +00:00
// Temp directory for storing markdown between stages
const TEMP_MD_DIR = path . join ( os . tmpdir ( ) , 'nanonets-invoices-markdown' ) ;
2026-01-18 15:54:16 +00:00
interface IInvoice {
invoice_number : string ;
invoice_date : string ;
vendor_name : string ;
currency : string ;
net_amount : number ;
vat_amount : number ;
total_amount : number ;
}
2026-01-19 11:51:23 +00:00
interface IImageData {
base64 : string ;
width : number ;
height : number ;
pageNum : number ;
}
2026-01-18 23:00:24 +00:00
interface ITestCase {
name : string ;
pdfPath : string ;
jsonPath : string ;
markdownPath? : string ;
}
2026-01-18 15:54:16 +00:00
// Nanonets-specific prompt for document OCR to markdown
const NANONETS_OCR_PROMPT = ` Extract the text from the above document as if you were reading it naturally.
Return the tables in html format .
Return the equations in LaTeX representation .
If there is an image in the document and image caption is not present , add a small description inside < img > < / img > tag .
Watermarks should be wrapped in brackets . Ex : < watermark > OFFICIAL COPY < / watermark > .
Page numbers should be wrapped in brackets . Ex : < page_number > 14 < / page_number > . ` ;
2026-01-19 11:51:23 +00:00
// JSON extraction prompt for GPT-OSS 20B
2026-01-18 15:54:16 +00:00
const JSON_EXTRACTION_PROMPT = ` You are an invoice data extractor. Below is an invoice document converted to text/markdown. Extract the key invoice fields as JSON.
IMPORTANT RULES :
1 . invoice_number : The unique invoice / document number ( NOT VAT ID , NOT customer ID )
2 . invoice_date : Format as YYYY - MM - DD
3 . vendor_name : The company that issued the invoice
4 . currency : EUR , USD , or GBP
5 . net_amount : Amount before tax
6 . vat_amount : Tax / VAT amount
7 . total_amount : Final total ( gross amount )
Return ONLY this JSON format , no explanation :
{
"invoice_number" : "INV-2024-001" ,
"invoice_date" : "2024-01-15" ,
"vendor_name" : "Company Name" ,
"currency" : "EUR" ,
"net_amount" : 100.00 ,
"vat_amount" : 19.00 ,
"total_amount" : 119.00
}
INVOICE TEXT :
` ;
2026-01-19 11:51:23 +00:00
// Constants for smart batching
const MAX_VISUAL_TOKENS = 28000 ; // ~32K context minus prompt/output headroom
const PATCH_SIZE = 14 ; // Qwen2.5-VL uses 14x14 patches
/ * *
* Estimate visual tokens for an image based on dimensions
* /
function estimateVisualTokens ( width : number , height : number ) : number {
return Math . ceil ( ( width * height ) / ( PATCH_SIZE * PATCH_SIZE ) ) ;
}
/ * *
* Batch images to fit within context window
* /
function batchImages ( images : IImageData [ ] ) : IImageData [ ] [ ] {
const batches : IImageData [ ] [ ] = [ ] ;
let currentBatch : IImageData [ ] = [ ] ;
let currentTokens = 0 ;
for ( const img of images ) {
const imgTokens = estimateVisualTokens ( img . width , img . height ) ;
if ( currentTokens + imgTokens > MAX_VISUAL_TOKENS && currentBatch . length > 0 ) {
batches . push ( currentBatch ) ;
currentBatch = [ img ] ;
currentTokens = imgTokens ;
} else {
currentBatch . push ( img ) ;
currentTokens += imgTokens ;
}
}
if ( currentBatch . length > 0 ) batches . push ( currentBatch ) ;
return batches ;
}
2026-01-18 15:54:16 +00:00
/ * *
2026-01-19 11:51:23 +00:00
* Convert PDF to JPEG images using ImageMagick with dimension tracking
2026-01-18 15:54:16 +00:00
* /
2026-01-19 11:51:23 +00:00
function convertPdfToImages ( pdfPath : string ) : IImageData [ ] {
2026-01-18 15:54:16 +00:00
const tempDir = fs . mkdtempSync ( path . join ( os . tmpdir ( ) , 'pdf-convert-' ) ) ;
2026-01-19 11:51:23 +00:00
const outputPattern = path . join ( tempDir , 'page-%d.jpg' ) ;
2026-01-18 15:54:16 +00:00
try {
execSync (
` convert -density 150 -quality 90 " ${ pdfPath } " -background white -alpha remove " ${ outputPattern } " ` ,
{ stdio : 'pipe' }
) ;
2026-01-19 11:51:23 +00:00
const files = fs . readdirSync ( tempDir ) . filter ( ( f : string ) = > f . endsWith ( '.jpg' ) ) . sort ( ) ;
const images : IImageData [ ] = [ ] ;
2026-01-18 15:54:16 +00:00
2026-01-19 11:51:23 +00:00
for ( let i = 0 ; i < files . length ; i ++ ) {
const file = files [ i ] ;
2026-01-18 15:54:16 +00:00
const imagePath = path . join ( tempDir , file ) ;
const imageData = fs . readFileSync ( imagePath ) ;
2026-01-19 11:51:23 +00:00
// Get image dimensions using identify command
const dimensions = execSync ( ` identify -format "%w %h" " ${ imagePath } " ` , { encoding : 'utf-8' } ) . trim ( ) ;
const [ width , height ] = dimensions . split ( ' ' ) . map ( Number ) ;
images . push ( {
base64 : imageData.toString ( 'base64' ) ,
width ,
height ,
pageNum : i + 1 ,
} ) ;
2026-01-18 15:54:16 +00:00
}
return images ;
} finally {
fs . rmSync ( tempDir , { recursive : true , force : true } ) ;
}
}
/ * *
2026-01-19 11:51:23 +00:00
* Convert a batch of pages to markdown using Nanonets - OCR - s
2026-01-18 15:54:16 +00:00
* /
2026-01-19 11:51:23 +00:00
async function convertBatchToMarkdown ( batch : IImageData [ ] ) : Promise < string > {
2026-01-18 15:54:16 +00:00
const startTime = Date . now ( ) ;
2026-01-19 11:51:23 +00:00
const pageNums = batch . map ( img = > img . pageNum ) . join ( ', ' ) ;
// Build content array with all images first, then the prompt
const content : Array < { type : string ; image_url ? : { url : string } ; text? : string } > = [ ] ;
for ( const img of batch ) {
content . push ( {
type : 'image_url' ,
image_url : { url : ` data:image/jpeg;base64, ${ img . base64 } ` } ,
} ) ;
}
// Add prompt with page separator instruction if multiple pages
const promptText = batch . length > 1
? ` ${ NANONETS_OCR_PROMPT } \ n \ nPlease clearly separate each page's content with "--- PAGE N ---" markers, where N is the page number starting from ${ batch [ 0 ] . pageNum } . `
: NANONETS_OCR_PROMPT ;
content . push ( { type : 'text' , text : promptText } ) ;
2026-01-18 15:54:16 +00:00
const response = await fetch ( ` ${ NANONETS_URL } /chat/completions ` , {
method : 'POST' ,
headers : {
'Content-Type' : 'application/json' ,
'Authorization' : 'Bearer dummy' ,
} ,
body : JSON.stringify ( {
model : NANONETS_MODEL ,
messages : [ {
role : 'user' ,
2026-01-19 11:51:23 +00:00
content ,
2026-01-18 15:54:16 +00:00
} ] ,
2026-01-19 11:51:23 +00:00
max_tokens : 4096 * batch . length , // Scale output tokens with batch size
2026-01-18 15:54:16 +00:00
temperature : 0.0 ,
} ) ,
} ) ;
const elapsed = ( ( Date . now ( ) - startTime ) / 1000 ) . toFixed ( 1 ) ;
if ( ! response . ok ) {
const errorText = await response . text ( ) ;
2026-01-18 23:00:24 +00:00
throw new Error ( ` Nanonets API error: ${ response . status } - ${ errorText } ` ) ;
2026-01-18 15:54:16 +00:00
}
const data = await response . json ( ) ;
2026-01-19 11:51:23 +00:00
let responseContent = ( data . choices ? . [ 0 ] ? . message ? . content || '' ) . trim ( ) ;
// For single-page batches, add page marker if not present
if ( batch . length === 1 && ! responseContent . includes ( '--- PAGE' ) ) {
responseContent = ` --- PAGE ${ batch [ 0 ] . pageNum } --- \ n ${ responseContent } ` ;
}
console . log ( ` Pages [ ${ pageNums } ]: ${ responseContent . length } chars ( ${ elapsed } s) ` ) ;
return responseContent ;
2026-01-18 15:54:16 +00:00
}
/ * *
2026-01-19 11:51:23 +00:00
* Convert all pages of a document to markdown using smart batching
2026-01-18 15:54:16 +00:00
* /
2026-01-19 11:51:23 +00:00
async function convertDocumentToMarkdown ( images : IImageData [ ] , docName : string ) : Promise < string > {
const batches = batchImages ( images ) ;
console . log ( ` [ ${ docName } ] Processing ${ images . length } page(s) in ${ batches . length } batch(es)... ` ) ;
const markdownParts : string [ ] = [ ] ;
for ( let i = 0 ; i < batches . length ; i ++ ) {
const batch = batches [ i ] ;
const batchTokens = batch . reduce ( ( sum , img ) = > sum + estimateVisualTokens ( img . width , img . height ) , 0 ) ;
console . log ( ` Batch ${ i + 1 } : ${ batch . length } page(s), ~ ${ batchTokens } tokens ` ) ;
const markdown = await convertBatchToMarkdown ( batch ) ;
markdownParts . push ( markdown ) ;
2026-01-18 15:54:16 +00:00
}
2026-01-19 11:51:23 +00:00
const fullMarkdown = markdownParts . join ( '\n\n' ) ;
2026-01-18 23:00:24 +00:00
console . log ( ` [ ${ docName } ] Complete: ${ fullMarkdown . length } chars total ` ) ;
2026-01-18 15:54:16 +00:00
return fullMarkdown ;
}
2026-01-18 23:00:24 +00:00
/ * *
* Stop Nanonets container
* /
function stopNanonets ( ) : void {
console . log ( ' [Docker] Stopping Nanonets container...' ) ;
try {
execSync ( 'docker stop nanonets-test 2>/dev/null || true' , { stdio : 'pipe' } ) ;
execSync ( 'sleep 5' , { stdio : 'pipe' } ) ;
console . log ( ' [Docker] Nanonets stopped' ) ;
} catch {
console . log ( ' [Docker] Nanonets was not running' ) ;
}
}
2026-01-18 15:54:16 +00:00
/ * *
2026-01-19 11:51:23 +00:00
* Ensure GPT - OSS 20 B model is available
2026-01-18 15:54:16 +00:00
* /
2026-01-19 11:51:23 +00:00
async function ensureExtractionModel ( ) : Promise < boolean > {
2026-01-18 15:54:16 +00:00
try {
const response = await fetch ( ` ${ OLLAMA_URL } /api/tags ` ) ;
if ( response . ok ) {
const data = await response . json ( ) ;
const models = data . models || [ ] ;
2026-01-19 11:51:23 +00:00
if ( models . some ( ( m : { name : string } ) = > m . name === EXTRACTION_MODEL ) ) {
console . log ( ` [Ollama] Model available: ${ EXTRACTION_MODEL } ` ) ;
2026-01-18 15:54:16 +00:00
return true ;
}
}
} catch {
return false ;
}
2026-01-19 11:51:23 +00:00
console . log ( ` [Ollama] Pulling ${ EXTRACTION_MODEL } ... ` ) ;
2026-01-18 15:54:16 +00:00
const pullResponse = await fetch ( ` ${ OLLAMA_URL } /api/pull ` , {
method : 'POST' ,
headers : { 'Content-Type' : 'application/json' } ,
2026-01-19 11:51:23 +00:00
body : JSON.stringify ( { name : EXTRACTION_MODEL , stream : false } ) ,
2026-01-18 15:54:16 +00:00
} ) ;
return pullResponse . ok ;
}
/ * *
* Parse amount from string ( handles European format )
* /
function parseAmount ( s : string | number | undefined ) : number {
if ( s === undefined || s === null ) return 0 ;
if ( typeof s === 'number' ) return s ;
const match = s . match ( /([\d.,]+)/ ) ;
if ( ! match ) return 0 ;
const numStr = match [ 1 ] ;
const normalized = numStr . includes ( ',' ) && numStr . indexOf ( ',' ) > numStr . lastIndexOf ( '.' )
? numStr . replace ( /\./g , '' ) . replace ( ',' , '.' )
: numStr . replace ( /,/g , '' ) ;
return parseFloat ( normalized ) || 0 ;
}
/ * *
* Extract invoice number from potentially verbose response
* /
function extractInvoiceNumber ( s : string | undefined ) : string {
if ( ! s ) return '' ;
let clean = s . replace ( /\*\*/g , '' ) . replace ( /`/g , '' ) . trim ( ) ;
const patterns = [
2026-01-18 23:00:24 +00:00
/\b([A-Z]{2,3}\d{10,})\b/i ,
/\b([A-Z]\d{8,})\b/i ,
/\b(INV[-\s]?\d{4}[-\s]?\d+)\b/i ,
/\b(\d{7,})\b/ ,
2026-01-18 15:54:16 +00:00
] ;
for ( const pattern of patterns ) {
const match = clean . match ( pattern ) ;
if ( match ) return match [ 1 ] ;
}
return clean . replace ( /[^A-Z0-9-]/gi , '' ) . trim ( ) || clean ;
}
/ * *
* Extract date ( YYYY - MM - DD ) from response
* /
function extractDate ( s : string | undefined ) : string {
if ( ! s ) return '' ;
let clean = s . replace ( /\*\*/g , '' ) . replace ( /`/g , '' ) . trim ( ) ;
const isoMatch = clean . match ( /(\d{4}-\d{2}-\d{2})/ ) ;
if ( isoMatch ) return isoMatch [ 1 ] ;
const dmyMatch = clean . match ( /(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})/ ) ;
if ( dmyMatch ) {
return ` ${ dmyMatch [ 3 ] } - ${ dmyMatch [ 2 ] . padStart ( 2 , '0' ) } - ${ dmyMatch [ 1 ] . padStart ( 2 , '0' ) } ` ;
}
return clean . replace ( /[^\d-]/g , '' ) . trim ( ) ;
}
/ * *
* Extract currency
* /
function extractCurrency ( s : string | undefined ) : string {
if ( ! s ) return 'EUR' ;
const upper = s . toUpperCase ( ) ;
if ( upper . includes ( 'EUR' ) || upper . includes ( '€' ) ) return 'EUR' ;
if ( upper . includes ( 'USD' ) || upper . includes ( '$' ) ) return 'USD' ;
if ( upper . includes ( 'GBP' ) || upper . includes ( '£' ) ) return 'GBP' ;
return 'EUR' ;
}
/ * *
2026-01-18 23:00:24 +00:00
* Extract JSON from response
2026-01-18 15:54:16 +00:00
* /
function extractJsonFromResponse ( response : string ) : Record < string , unknown > | null {
let cleanResponse = response . replace ( /<think>[\s\S]*?<\/think>/g , '' ) . trim ( ) ;
const codeBlockMatch = cleanResponse . match ( /```(?:json)?\s*([\s\S]*?)```/ ) ;
const jsonStr = codeBlockMatch ? codeBlockMatch [ 1 ] . trim ( ) : cleanResponse ;
try {
return JSON . parse ( jsonStr ) ;
} catch {
const jsonMatch = jsonStr . match ( /\{[\s\S]*\}/ ) ;
if ( jsonMatch ) {
try {
return JSON . parse ( jsonMatch [ 0 ] ) ;
} catch {
return null ;
}
}
return null ;
}
}
/ * *
* Parse JSON response into IInvoice
* /
function parseJsonToInvoice ( response : string ) : IInvoice | null {
const parsed = extractJsonFromResponse ( response ) ;
if ( ! parsed ) return null ;
return {
invoice_number : extractInvoiceNumber ( String ( parsed . invoice_number || '' ) ) ,
invoice_date : extractDate ( String ( parsed . invoice_date || '' ) ) ,
vendor_name : String ( parsed . vendor_name || '' ) . replace ( /\*\*/g , '' ) . replace ( /`/g , '' ) . trim ( ) ,
currency : extractCurrency ( String ( parsed . currency || '' ) ) ,
net_amount : parseAmount ( parsed . net_amount as string | number ) ,
vat_amount : parseAmount ( parsed . vat_amount as string | number ) ,
total_amount : parseAmount ( parsed . total_amount as string | number ) ,
} ;
}
/ * *
2026-01-19 11:51:23 +00:00
* Extract invoice from markdown using GPT - OSS 20 B ( streaming )
2026-01-18 15:54:16 +00:00
* /
async function extractInvoiceFromMarkdown ( markdown : string , queryId : string ) : Promise < IInvoice | null > {
const startTime = Date . now ( ) ;
2026-01-19 11:51:23 +00:00
const fullPrompt = JSON_EXTRACTION_PROMPT + markdown ;
// Log exact prompt
console . log ( ` \ n [ ${ queryId } ] ===== PROMPT ===== ` ) ;
console . log ( fullPrompt ) ;
console . log ( ` [ ${ queryId } ] ===== END PROMPT ( ${ fullPrompt . length } chars) ===== \ n ` ) ;
2026-01-18 15:54:16 +00:00
const response = await fetch ( ` ${ OLLAMA_URL } /api/chat ` , {
method : 'POST' ,
headers : { 'Content-Type' : 'application/json' } ,
body : JSON.stringify ( {
2026-01-19 11:51:23 +00:00
model : EXTRACTION_MODEL ,
messages : [
{ role : 'user' , content : 'Hi there, how are you?' } ,
{ role : 'assistant' , content : 'Good, how can I help you today?' } ,
{ role : 'user' , content : fullPrompt } ,
] ,
stream : true ,
2026-01-18 15:54:16 +00:00
} ) ,
2026-01-19 11:51:23 +00:00
signal : AbortSignal.timeout ( 600000 ) , // 10 minute timeout for large documents
2026-01-18 15:54:16 +00:00
} ) ;
if ( ! response . ok ) {
2026-01-19 11:51:23 +00:00
const elapsed = ( ( Date . now ( ) - startTime ) / 1000 ) . toFixed ( 1 ) ;
2026-01-18 15:54:16 +00:00
console . log ( ` [ ${ queryId } ] ERROR: ${ response . status } ( ${ elapsed } s) ` ) ;
throw new Error ( ` Ollama API error: ${ response . status } ` ) ;
}
2026-01-19 11:51:23 +00:00
// Stream the response
let content = '' ;
let thinkingContent = '' ;
let thinkingStarted = false ;
let outputStarted = false ;
const reader = response . body ! . getReader ( ) ;
const decoder = new TextDecoder ( ) ;
2026-01-18 15:54:16 +00:00
2026-01-19 11:51:23 +00:00
try {
while ( true ) {
const { done , value } = await reader . read ( ) ;
if ( done ) break ;
const chunk = decoder . decode ( value , { stream : true } ) ;
// Each line is a JSON object
for ( const line of chunk . split ( '\n' ) . filter ( l = > l . trim ( ) ) ) {
try {
const json = JSON . parse ( line ) ;
// Stream thinking tokens
const thinking = json . message ? . thinking || '' ;
if ( thinking ) {
if ( ! thinkingStarted ) {
process . stdout . write ( ` [ ${ queryId } ] THINKING: ` ) ;
thinkingStarted = true ;
}
process . stdout . write ( thinking ) ;
thinkingContent += thinking ;
}
// Stream content tokens
const token = json . message ? . content || '' ;
if ( token ) {
if ( ! outputStarted ) {
if ( thinkingStarted ) process . stdout . write ( '\n' ) ;
process . stdout . write ( ` [ ${ queryId } ] OUTPUT: ` ) ;
outputStarted = true ;
}
process . stdout . write ( token ) ;
content += token ;
}
} catch {
// Ignore parse errors for partial chunks
}
}
}
} finally {
if ( thinkingStarted || outputStarted ) process . stdout . write ( '\n' ) ;
}
2026-01-18 15:54:16 +00:00
2026-01-19 11:51:23 +00:00
const elapsed = ( ( Date . now ( ) - startTime ) / 1000 ) . toFixed ( 1 ) ;
console . log ( ` [ ${ queryId } ] Done: ${ thinkingContent . length } thinking chars, ${ content . length } output chars ( ${ elapsed } s) ` ) ;
return parseJsonToInvoice ( content ) ;
2026-01-18 15:54:16 +00:00
}
/ * *
2026-01-19 11:51:23 +00:00
* Extract invoice ( single pass - GPT - OSS is more reliable )
2026-01-18 15:54:16 +00:00
* /
2026-01-19 11:51:23 +00:00
async function extractInvoice ( markdown : string , docName : string ) : Promise < IInvoice > {
console . log ( ` [ ${ docName } ] Extracting... ` ) ;
const invoice = await extractInvoiceFromMarkdown ( markdown , docName ) ;
if ( ! invoice ) {
return {
invoice_number : '' ,
invoice_date : '' ,
vendor_name : '' ,
currency : 'EUR' ,
net_amount : 0 ,
vat_amount : 0 ,
total_amount : 0 ,
} ;
2026-01-18 15:54:16 +00:00
}
2026-01-19 11:51:23 +00:00
console . log ( ` [ ${ docName } ] Extracted: ${ invoice . invoice_number } ` ) ;
return invoice ;
2026-01-18 15:54:16 +00:00
}
/ * *
* Normalize date to YYYY - MM - DD
* /
function normalizeDate ( dateStr : string | null ) : string {
if ( ! dateStr ) return '' ;
if ( /^\d{4}-\d{2}-\d{2}$/ . test ( dateStr ) ) return dateStr ;
const monthMap : Record < string , string > = {
JAN : '01' , FEB : '02' , MAR : '03' , APR : '04' , MAY : '05' , JUN : '06' ,
JUL : '07' , AUG : '08' , SEP : '09' , OCT : '10' , NOV : '11' , DEC : '12' ,
} ;
let match = dateStr . match ( /^(\d{1,2})-([A-Z]{3})-(\d{4})$/i ) ;
if ( match ) {
return ` ${ match [ 3 ] } - ${ monthMap [ match [ 2 ] . toUpperCase ( ) ] || '01' } - ${ match [ 1 ] . padStart ( 2 , '0' ) } ` ;
}
match = dateStr . match ( /^(\d{1,2})[\/.](\d{1,2})[\/.](\d{4})$/ ) ;
if ( match ) {
return ` ${ match [ 3 ] } - ${ match [ 2 ] . padStart ( 2 , '0' ) } - ${ match [ 1 ] . padStart ( 2 , '0' ) } ` ;
}
return dateStr ;
}
/ * *
* Compare extracted invoice against expected
* /
function compareInvoice (
extracted : IInvoice ,
expected : IInvoice
) : { match : boolean ; errors : string [ ] } {
const errors : string [ ] = [ ] ;
const extNum = extracted . invoice_number ? . replace ( /\s/g , '' ) . toLowerCase ( ) || '' ;
const expNum = expected . invoice_number ? . replace ( /\s/g , '' ) . toLowerCase ( ) || '' ;
if ( extNum !== expNum ) {
2026-01-18 23:00:24 +00:00
errors . push ( ` invoice_number: exp " ${ expected . invoice_number } ", got " ${ extracted . invoice_number } " ` ) ;
2026-01-18 15:54:16 +00:00
}
if ( normalizeDate ( extracted . invoice_date ) !== normalizeDate ( expected . invoice_date ) ) {
2026-01-18 23:00:24 +00:00
errors . push ( ` invoice_date: exp " ${ expected . invoice_date } ", got " ${ extracted . invoice_date } " ` ) ;
2026-01-18 15:54:16 +00:00
}
if ( Math . abs ( extracted . total_amount - expected . total_amount ) > 0.02 ) {
2026-01-18 23:00:24 +00:00
errors . push ( ` total_amount: exp ${ expected . total_amount } , got ${ extracted . total_amount } ` ) ;
2026-01-18 15:54:16 +00:00
}
if ( extracted . currency ? . toUpperCase ( ) !== expected . currency ? . toUpperCase ( ) ) {
2026-01-18 23:00:24 +00:00
errors . push ( ` currency: exp " ${ expected . currency } ", got " ${ extracted . currency } " ` ) ;
2026-01-18 15:54:16 +00:00
}
return { match : errors.length === 0 , errors } ;
}
/ * *
2026-01-18 23:00:24 +00:00
* Find all test cases
2026-01-18 15:54:16 +00:00
* /
2026-01-18 23:00:24 +00:00
function findTestCases ( ) : ITestCase [ ] {
2026-01-18 15:54:16 +00:00
const testDir = path . join ( process . cwd ( ) , '.nogit/invoices' ) ;
2026-01-18 23:00:24 +00:00
if ( ! fs . existsSync ( testDir ) ) return [ ] ;
2026-01-18 15:54:16 +00:00
const files = fs . readdirSync ( testDir ) ;
2026-01-18 23:00:24 +00:00
const testCases : ITestCase [ ] = [ ] ;
2026-01-18 15:54:16 +00:00
2026-01-18 23:00:24 +00:00
for ( const pdf of files . filter ( ( f ) = > f . endsWith ( '.pdf' ) ) ) {
2026-01-18 15:54:16 +00:00
const baseName = pdf . replace ( '.pdf' , '' ) ;
const jsonFile = ` ${ baseName } .json ` ;
if ( files . includes ( jsonFile ) ) {
testCases . push ( {
name : baseName ,
pdfPath : path.join ( testDir , pdf ) ,
jsonPath : path.join ( testDir , jsonFile ) ,
} ) ;
}
}
2026-01-18 23:00:24 +00:00
return testCases . sort ( ( a , b ) = > a . name . localeCompare ( b . name ) ) ;
2026-01-18 15:54:16 +00:00
}
2026-01-18 23:00:24 +00:00
// ============ TESTS ============
2026-01-18 15:54:16 +00:00
2026-01-18 23:00:24 +00:00
const testCases = findTestCases ( ) ;
console . log ( ` \ nFound ${ testCases . length } invoice test cases \ n ` ) ;
2026-01-18 15:54:16 +00:00
2026-01-18 23:00:24 +00:00
// Ensure temp directory exists
if ( ! fs . existsSync ( TEMP_MD_DIR ) ) {
fs . mkdirSync ( TEMP_MD_DIR , { recursive : true } ) ;
}
2026-01-18 15:54:16 +00:00
2026-01-18 23:00:24 +00:00
// -------- STAGE 1: OCR with Nanonets --------
2026-01-18 15:54:16 +00:00
2026-01-18 23:00:24 +00:00
tap . test ( 'Stage 1: Setup Nanonets' , async ( ) = > {
console . log ( '\n========== STAGE 1: Nanonets OCR ==========\n' ) ;
const ok = await ensureNanonetsOcr ( ) ;
expect ( ok ) . toBeTrue ( ) ;
} ) ;
tap . test ( 'Stage 1: Convert all invoices to markdown' , async ( ) = > {
console . log ( '\n Converting all invoice PDFs to markdown with Nanonets-OCR-s...\n' ) ;
for ( const tc of testCases ) {
console . log ( ` \ n === ${ tc . name } === ` ) ;
const images = convertPdfToImages ( tc . pdfPath ) ;
console . log ( ` Pages: ${ images . length } ` ) ;
const markdown = await convertDocumentToMarkdown ( images , tc . name ) ;
2026-01-18 15:54:16 +00:00
2026-01-18 23:00:24 +00:00
const mdPath = path . join ( TEMP_MD_DIR , ` ${ tc . name } .md ` ) ;
fs . writeFileSync ( mdPath , markdown ) ;
tc . markdownPath = mdPath ;
console . log ( ` Saved: ${ mdPath } ` ) ;
}
console . log ( '\n Stage 1 complete: All invoices converted to markdown\n' ) ;
2026-01-18 15:54:16 +00:00
} ) ;
2026-01-18 23:00:24 +00:00
tap . test ( 'Stage 1: Stop Nanonets' , async ( ) = > {
stopNanonets ( ) ;
await new Promise ( resolve = > setTimeout ( resolve , 3000 ) ) ;
expect ( isContainerRunning ( 'nanonets-test' ) ) . toBeFalse ( ) ;
2026-01-18 15:54:16 +00:00
} ) ;
2026-01-19 11:51:23 +00:00
// -------- STAGE 2: Extraction with GPT-OSS 20B --------
2026-01-18 23:00:24 +00:00
2026-01-19 11:51:23 +00:00
tap . test ( 'Stage 2: Setup Ollama + GPT-OSS 20B' , async ( ) = > {
console . log ( '\n========== STAGE 2: GPT-OSS 20B Extraction ==========\n' ) ;
2026-01-18 23:00:24 +00:00
const ollamaOk = await ensureMiniCpm ( ) ;
expect ( ollamaOk ) . toBeTrue ( ) ;
2026-01-19 11:51:23 +00:00
const extractionOk = await ensureExtractionModel ( ) ;
expect ( extractionOk ) . toBeTrue ( ) ;
2026-01-18 23:00:24 +00:00
} ) ;
2026-01-18 15:54:16 +00:00
let passedCount = 0 ;
let failedCount = 0 ;
const processingTimes : number [ ] = [ ] ;
2026-01-18 23:00:24 +00:00
for ( const tc of testCases ) {
tap . test ( ` Stage 2: Extract ${ tc . name } ` , async ( ) = > {
const expected : IInvoice = JSON . parse ( fs . readFileSync ( tc . jsonPath , 'utf-8' ) ) ;
console . log ( ` \ n === ${ tc . name } === ` ) ;
console . log ( ` Expected: ${ expected . invoice_number } | ${ expected . invoice_date } | ${ expected . total_amount } ${ expected . currency } ` ) ;
2026-01-18 15:54:16 +00:00
const startTime = Date . now ( ) ;
2026-01-18 23:00:24 +00:00
const mdPath = path . join ( TEMP_MD_DIR , ` ${ tc . name } .md ` ) ;
if ( ! fs . existsSync ( mdPath ) ) {
throw new Error ( ` Markdown not found: ${ mdPath } . Run Stage 1 first. ` ) ;
}
const markdown = fs . readFileSync ( mdPath , 'utf-8' ) ;
console . log ( ` Markdown: ${ markdown . length } chars ` ) ;
2026-01-19 11:51:23 +00:00
const extracted = await extractInvoice ( markdown , tc . name ) ;
2026-01-18 15:54:16 +00:00
const elapsedMs = Date . now ( ) - startTime ;
processingTimes . push ( elapsedMs ) ;
2026-01-18 23:00:24 +00:00
console . log ( ` Extracted: ${ extracted . invoice_number } | ${ extracted . invoice_date } | ${ extracted . total_amount } ${ extracted . currency } ` ) ;
2026-01-18 15:54:16 +00:00
const result = compareInvoice ( extracted , expected ) ;
if ( result . match ) {
passedCount ++ ;
2026-01-18 23:00:24 +00:00
console . log ( ` Result: MATCH ( ${ ( elapsedMs / 1000 ) . toFixed ( 1 ) } s) ` ) ;
2026-01-18 15:54:16 +00:00
} else {
failedCount ++ ;
2026-01-18 23:00:24 +00:00
console . log ( ` Result: MISMATCH ( ${ ( elapsedMs / 1000 ) . toFixed ( 1 ) } s) ` ) ;
result . errors . forEach ( e = > console . log ( ` - ${ e } ` ) ) ;
2026-01-18 15:54:16 +00:00
}
expect ( result . match ) . toBeTrue ( ) ;
} ) ;
}
2026-01-18 23:00:24 +00:00
tap . test ( 'Summary' , async ( ) = > {
2026-01-18 15:54:16 +00:00
const totalInvoices = testCases . length ;
const accuracy = totalInvoices > 0 ? ( passedCount / totalInvoices ) * 100 : 0 ;
const totalTimeMs = processingTimes . reduce ( ( a , b ) = > a + b , 0 ) ;
const avgTimeSec = processingTimes . length > 0 ? totalTimeMs / processingTimes . length / 1000 : 0 ;
console . log ( ` \ n======================================== ` ) ;
2026-01-19 11:51:23 +00:00
console . log ( ` Invoice Summary (Nanonets + GPT-OSS 20B) ` ) ;
2026-01-18 15:54:16 +00:00
console . log ( ` ======================================== ` ) ;
console . log ( ` Stage 1: Nanonets-OCR-s (doc -> md) ` ) ;
2026-01-19 11:51:23 +00:00
console . log ( ` Stage 2: GPT-OSS 20B (md -> JSON) ` ) ;
2026-01-18 15:54:16 +00:00
console . log ( ` Passed: ${ passedCount } / ${ totalInvoices } ` ) ;
console . log ( ` Failed: ${ failedCount } / ${ totalInvoices } ` ) ;
console . log ( ` Accuracy: ${ accuracy . toFixed ( 1 ) } % ` ) ;
console . log ( ` ---------------------------------------- ` ) ;
console . log ( ` Total time: ${ ( totalTimeMs / 1000 ) . toFixed ( 1 ) } s ` ) ;
console . log ( ` Avg per inv: ${ avgTimeSec . toFixed ( 1 ) } s ` ) ;
console . log ( ` ======================================== \ n ` ) ;
2026-01-18 23:00:24 +00:00
// Cleanup temp files
try {
fs . rmSync ( TEMP_MD_DIR , { recursive : true , force : true } ) ;
console . log ( ` Cleaned up temp directory: ${ TEMP_MD_DIR } \ n ` ) ;
} catch {
// Ignore
}
2026-01-18 15:54:16 +00:00
} ) ;
export default tap . start ( ) ;