/**
 * Bank statement extraction using Nanonets-OCR-s + GPT-OSS 20B (sequential two-stage pipeline)
 *
 * Stage 1: Nanonets-OCR-s converts ALL document pages to markdown (stop after completion)
 * Stage 2: GPT-OSS 20B extracts structured JSON from the saved markdown (after Nanonets stops)
 *
 * This approach avoids GPU contention by running the two services sequentially.
 */

import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execSync } from 'child_process';
import * as os from 'os';
import { ensureNanonetsOcr, ensureMiniCpm, removeContainer, isContainerRunning } from './helpers/docker.js';

const NANONETS_URL = 'http://localhost:8000/v1';
const NANONETS_MODEL = 'nanonets/Nanonets-OCR-s';
const OLLAMA_URL = 'http://localhost:11434';
const EXTRACTION_MODEL = 'gpt-oss:20b';

// Temp directory for storing markdown between stages
const TEMP_MD_DIR = path.join(os.tmpdir(), 'nanonets-markdown');

interface ITransaction {
  date: string;
  counterparty: string;
  amount: number;
}
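
// Illustrative shape of both the ground-truth JSON and the extracted output
// (values are made up):
//   [
//     { "date": "2024-03-01", "counterparty": "ACME GMBH", "amount": -25.99 },
//     { "date": "2024-03-02", "counterparty": "SALARY", "amount": 2500.00 }
//   ]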

interface IImageData {
  base64: string;
  width: number;
  height: number;
  pageNum: number;
}

interface ITestCase {
  name: string;
  pdfPath: string;
  jsonPath: string;
  markdownPath?: string;
  images?: IImageData[];
}

// Nanonets-specific prompt for document OCR to markdown
const NANONETS_OCR_PROMPT = `Extract the text from the above document as if you were reading it naturally.
Return the tables in html format.
Return the equations in LaTeX representation.
If there is an image in the document and image caption is not present, add a small description inside <img></img> tag.
Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>.
Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number>.`;

// JSON extraction prompt for GPT-OSS 20B
const JSON_EXTRACTION_PROMPT = `Extract ALL transactions from this bank statement as JSON array. Each transaction: {"date": "YYYY-MM-DD", "counterparty": "NAME", "amount": -25.99}. Amount negative for debits, positive for credits. Only include actual transactions, not balances. Return ONLY JSON array, no explanation.

STATEMENT:
`;

// Constants for smart batching
const MAX_VISUAL_TOKENS = 28000; // ~32K context minus prompt/output headroom
const PATCH_SIZE = 14; // Qwen2.5-VL uses 14x14 patches

/**
 * Estimate visual tokens for an image based on its pixel dimensions
 */
function estimateVisualTokens(width: number, height: number): number {
  return Math.ceil((width * height) / (PATCH_SIZE * PATCH_SIZE));
}
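
// Worked example (illustrative): an A4 page rendered at 150 DPI is roughly
// 1240x1754 px, so estimateVisualTokens(1240, 1754) = ceil(2174960 / 196)
// = 11097 tokens, i.e. about two such pages fit in one 28000-token batch.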

/**
 * Batch images to fit within the context window
 */
function batchImages(images: IImageData[]): IImageData[][] {
  const batches: IImageData[][] = [];
  let currentBatch: IImageData[] = [];
  let currentTokens = 0;
  for (const img of images) {
    const imgTokens = estimateVisualTokens(img.width, img.height);
    if (currentTokens + imgTokens > MAX_VISUAL_TOKENS && currentBatch.length > 0) {
      batches.push(currentBatch);
      currentBatch = [img];
      currentTokens = imgTokens;
    } else {
      currentBatch.push(img);
      currentTokens += imgTokens;
    }
  }
  if (currentBatch.length > 0) batches.push(currentBatch);
  return batches;
}
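
// Illustrative behavior: with the ~11K-token pages from the example above,
// batchImages yields [[p1, p2], [p3, p4], ...] -- a greedy first-fit split
// that never reorders or splits pages.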

/**
 * Convert PDF to JPEG images using ImageMagick, tracking page dimensions
 */
function convertPdfToImages(pdfPath: string): IImageData[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.jpg');
  try {
    execSync(
      `convert -density 150 -quality 90 "${pdfPath}" -background white -alpha remove "${outputPattern}"`,
      { stdio: 'pipe' }
    );

    // Sort numerically: a plain lexical sort would put page-10 before page-2
    const files = fs.readdirSync(tempDir)
      .filter((f: string) => f.endsWith('.jpg'))
      .sort((a, b) => parseInt(a.replace(/\D/g, ''), 10) - parseInt(b.replace(/\D/g, ''), 10));
    const images: IImageData[] = [];

    for (let i = 0; i < files.length; i++) {
      const file = files[i];
      const imagePath = path.join(tempDir, file);
      const imageData = fs.readFileSync(imagePath);

      // Get image dimensions using the identify command
      const dimensions = execSync(`identify -format "%w %h" "${imagePath}"`, { encoding: 'utf-8' }).trim();
      const [width, height] = dimensions.split(' ').map(Number);
      images.push({
        base64: imageData.toString('base64'),
        width,
        height,
        pageNum: i + 1,
      });
    }
    return images;
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}
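
// Environment note (assumption, not verified here): `convert` and `identify`
// come from ImageMagick, and rasterizing PDFs additionally requires its
// Ghostscript delegate to be installed.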

/**
 * Convert a batch of pages to markdown using Nanonets-OCR-s
 */
async function convertBatchToMarkdown(batch: IImageData[]): Promise<string> {
  const startTime = Date.now();
  const pageNums = batch.map(img => img.pageNum).join(', ');

  // Build content array with all images first, then the prompt
  const content: Array<{ type: string; image_url?: { url: string }; text?: string }> = [];
  for (const img of batch) {
    content.push({
      type: 'image_url',
      image_url: { url: `data:image/jpeg;base64,${img.base64}` },
    });
  }

  // Add prompt with a page-separator instruction if the batch has multiple pages
  const promptText = batch.length > 1
    ? `${NANONETS_OCR_PROMPT}\n\nPlease clearly separate each page's content with "--- PAGE N ---" markers, where N is the page number starting from ${batch[0].pageNum}.`
    : NANONETS_OCR_PROMPT;
  content.push({ type: 'text', text: promptText });

  const response = await fetch(`${NANONETS_URL}/chat/completions`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': 'Bearer dummy',
    },
    body: JSON.stringify({
      model: NANONETS_MODEL,
      messages: [{
        role: 'user',
        content,
      }],
      max_tokens: 4096 * batch.length, // Scale output tokens with batch size
      temperature: 0.0,
    }),
  });
  const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
  if (!response.ok) {
    const errorText = await response.text();
    throw new Error(`Nanonets API error: ${response.status} - ${errorText}`);
  }
  const data = await response.json();
  let responseContent = (data.choices?.[0]?.message?.content || '').trim();

  // For single-page batches, add a page marker if not present
  if (batch.length === 1 && !responseContent.includes('--- PAGE')) {
    responseContent = `--- PAGE ${batch[0].pageNum} ---\n${responseContent}`;
  }
  console.log(`  Pages [${pageNums}]: ${responseContent.length} chars (${elapsed}s)`);
  return responseContent;
}
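
// For a two-page batch the returned markdown looks roughly like (illustrative):
//   --- PAGE 1 ---
//   <table>...</table>
//   --- PAGE 2 ---
//   ...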

/**
 * Convert all pages of a document to markdown using smart batching
 */
async function convertDocumentToMarkdown(images: IImageData[], docName: string): Promise<string> {
  const batches = batchImages(images);
  console.log(`  [${docName}] Processing ${images.length} page(s) in ${batches.length} batch(es)...`);
  const markdownParts: string[] = [];
  for (let i = 0; i < batches.length; i++) {
    const batch = batches[i];
    const batchTokens = batch.reduce((sum, img) => sum + estimateVisualTokens(img.width, img.height), 0);
    console.log(`  Batch ${i + 1}: ${batch.length} page(s), ~${batchTokens} tokens`);
    const markdown = await convertBatchToMarkdown(batch);
    markdownParts.push(markdown);
  }
  const fullMarkdown = markdownParts.join('\n\n');
  console.log(`  [${docName}] Complete: ${fullMarkdown.length} chars total`);
  return fullMarkdown;
}

/**
 * Stop the Nanonets container
 */
function stopNanonets(): void {
  console.log('  [Docker] Stopping Nanonets container...');
  try {
    execSync('docker stop nanonets-test 2>/dev/null || true', { stdio: 'pipe' });
    // Wait for GPU memory to be released
    execSync('sleep 5', { stdio: 'pipe' });
    console.log('  [Docker] Nanonets stopped');
  } catch {
    console.log('  [Docker] Nanonets was not running');
  }
}

/**
 * Ensure the GPT-OSS 20B model is available in Ollama, pulling it if missing
 */
async function ensureExtractionModel(): Promise<boolean> {
  try {
    const response = await fetch(`${OLLAMA_URL}/api/tags`);
    if (response.ok) {
      const data = await response.json();
      const models = data.models || [];
      if (models.some((m: { name: string }) => m.name === EXTRACTION_MODEL)) {
        console.log(`  [Ollama] Model available: ${EXTRACTION_MODEL}`);
        return true;
      }
    }
  } catch {
    return false;
  }
  console.log(`  [Ollama] Pulling ${EXTRACTION_MODEL}...`);
  const pullResponse = await fetch(`${OLLAMA_URL}/api/pull`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ name: EXTRACTION_MODEL, stream: false }),
  });
  return pullResponse.ok;
}
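
// Note: with stream: false, Ollama's /api/pull only responds once the full
// download has completed, so the first run may block here for several minutes.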

/**
 * Extract transactions from markdown using GPT-OSS 20B (streaming)
 */
async function extractTransactionsFromMarkdown(markdown: string, queryId: string): Promise<ITransaction[]> {
  const startTime = Date.now();
  const fullPrompt = JSON_EXTRACTION_PROMPT + markdown;

  // Log the exact prompt
  console.log(`\n  [${queryId}] ===== PROMPT =====`);
  console.log(fullPrompt);
  console.log(`  [${queryId}] ===== END PROMPT (${fullPrompt.length} chars) =====\n`);

  const response = await fetch(`${OLLAMA_URL}/api/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: EXTRACTION_MODEL,
      // A short warm-up exchange precedes the real prompt
      messages: [
        { role: 'user', content: 'Hi there, how are you?' },
        { role: 'assistant', content: 'Good, how can I help you today?' },
        { role: 'user', content: fullPrompt },
      ],
      stream: true,
    }),
    signal: AbortSignal.timeout(600000), // 10 minute timeout
  });
  if (!response.ok) {
    const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
    console.log(`  [${queryId}] ERROR: ${response.status} (${elapsed}s)`);
    throw new Error(`Ollama API error: ${response.status}`);
  }

  // Stream the response; Ollama emits newline-delimited JSON objects
  let content = '';
  let thinkingContent = '';
  let thinkingStarted = false;
  let outputStarted = false;
  const reader = response.body!.getReader();
  const decoder = new TextDecoder();
  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      const chunk = decoder.decode(value, { stream: true });
      // Each line is a JSON object
      for (const line of chunk.split('\n').filter(l => l.trim())) {
        try {
          const json = JSON.parse(line);
          // Stream thinking tokens
          const thinking = json.message?.thinking || '';
          if (thinking) {
            if (!thinkingStarted) {
              process.stdout.write(`  [${queryId}] THINKING: `);
              thinkingStarted = true;
            }
            process.stdout.write(thinking);
            thinkingContent += thinking;
          }
          // Stream content tokens
          const token = json.message?.content || '';
          if (token) {
            if (!outputStarted) {
              if (thinkingStarted) process.stdout.write('\n');
              process.stdout.write(`  [${queryId}] OUTPUT: `);
              outputStarted = true;
            }
            process.stdout.write(token);
            content += token;
          }
        } catch {
          // Ignore parse errors for partial chunks
        }
      }
    }
  } finally {
    if (thinkingStarted || outputStarted) process.stdout.write('\n');
  }
  const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
  console.log(`  [${queryId}] Done: ${thinkingContent.length} thinking chars, ${content.length} output chars (${elapsed}s)`);
  return parseJsonResponse(content, queryId);
}

/**
 * Sanitize a JSON string emitted by the model before parsing
 */
function sanitizeJson(jsonStr: string): string {
  let s = jsonStr;
  // Drop explicit plus signs on amounts ("+25.99" is not valid JSON)
  s = s.replace(/"amount"\s*:\s*\+/g, '"amount": ');
  s = s.replace(/:\s*\+(\d)/g, ': $1');
  // Collapse dotted thousands formatting like 1.234.56 into 1234.56
  s = s.replace(/"amount"\s*:\s*(-?)(\d{1,3})\.(\d{3})\.(\d{2})\b/g, '"amount": $1$2$3.$4');
  // Remove trailing commas before closing brackets
  s = s.replace(/,\s*([}\]])/g, '$1');
  // Replace raw newlines/tabs inside string literals with spaces
  s = s.replace(/"([^"\\]*)\n([^"]*)"/g, '"$1 $2"');
  s = s.replace(/"([^"\\]*)\t([^"]*)"/g, '"$1 $2"');
  // Strip remaining control characters
  s = s.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g, ' ');
  return s;
}
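
// Illustrative fixes (inputs are made-up examples):
//   '{"amount": +25.99}'        -> '{"amount": 25.99}'
//   '{"amount": 1.234.56}'      -> '{"amount": 1234.56}'
//   '[{"date": "2024-01-01"},]' -> '[{"date": "2024-01-01"}]'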

/**
 * Parse an amount from various numeric and string formats
 */
function parseAmount(value: unknown): number {
  if (typeof value === 'number') return value;
  if (typeof value !== 'string') return 0;
  // Strip currency symbols/whitespace and normalize unicode minus/dash signs
  let s = value.replace(/[€$£\s]/g, '').replace('−', '-').replace('–', '-');
  // Comma after the last dot => European format: "1.234,56" -> "1234.56"
  if (s.includes(',') && s.indexOf(',') > s.lastIndexOf('.')) {
    s = s.replace(/\./g, '').replace(',', '.');
  } else {
    s = s.replace(/,/g, '');
  }
  return parseFloat(s) || 0;
}
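
// Illustrative conversions:
//   parseAmount("€1.234,56") -> 1234.56   (European format)
//   parseAmount("$1,234.56") -> 1234.56   (US format)
//   parseAmount("−25.99")    -> -25.99    (unicode minus)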

/**
 * Parse the model's JSON response into transactions
 */
function parseJsonResponse(response: string, queryId: string): ITransaction[] {
  // Remove thinking tags if present
  const cleanResponse = response.replace(/<think>[\s\S]*?<\/think>/g, '').trim();

  // Debug: show what we're working with
  console.log(`  [${queryId}] Response preview: ${cleanResponse.substring(0, 300)}...`);

  const codeBlockMatch = cleanResponse.match(/```(?:json)?\s*([\s\S]*?)```/);
  let jsonStr = codeBlockMatch ? codeBlockMatch[1].trim() : cleanResponse;
  jsonStr = sanitizeJson(jsonStr);
  try {
    const parsed = JSON.parse(jsonStr);
    if (Array.isArray(parsed)) {
      const txs = parsed.map(tx => ({
        date: String(tx.date || ''),
        counterparty: String(tx.counterparty || tx.description || ''),
        amount: parseAmount(tx.amount),
      }));
      console.log(`  [${queryId}] Parsed ${txs.length} transactions`);
      return txs;
    }
  } catch {
    // Fall back to finding a JSON array embedded in the text
    const arrayMatch = jsonStr.match(/\[[\s\S]*\]/);
    if (arrayMatch) {
      console.log(`  [${queryId}] Array match found: ${arrayMatch[0].length} chars`);
      try {
        const parsed = JSON.parse(sanitizeJson(arrayMatch[0]));
        if (Array.isArray(parsed)) {
          const txs = parsed.map(tx => ({
            date: String(tx.date || ''),
            counterparty: String(tx.counterparty || tx.description || ''),
            amount: parseAmount(tx.amount),
          }));
          console.log(`  [${queryId}] Parsed ${txs.length} transactions (array match)`);
          return txs;
        }
      } catch (innerErr) {
        console.log(`  [${queryId}] Array parse error: ${(innerErr as Error).message}`);
      }
    } else {
      console.log(`  [${queryId}] No JSON array found in response`);
    }
  }
  console.log(`  [${queryId}] PARSE FAILED`);
  return [];
}

/**
 * Extract transactions (single pass)
 */
async function extractTransactions(markdown: string, docName: string): Promise<ITransaction[]> {
  console.log(`  [${docName}] Extracting...`);
  const txs = await extractTransactionsFromMarkdown(markdown, docName);
  console.log(`  [${docName}] Extracted ${txs.length} transactions`);
  return txs;
}

/**
 * Compare extracted transactions against the expected list, position by position
 */
function compareTransactions(
  extracted: ITransaction[],
  expected: ITransaction[]
): { matches: number; total: number; errors: string[] } {
  const errors: string[] = [];
  let matches = 0;
  for (let i = 0; i < expected.length; i++) {
    const exp = expected[i];
    const ext = extracted[i];
    if (!ext) {
      errors.push(`Missing tx ${i}: ${exp.date} ${exp.counterparty}`);
      continue;
    }
    const dateMatch = ext.date === exp.date;
    const amountMatch = Math.abs(ext.amount - exp.amount) < 0.01;
    if (dateMatch && amountMatch) {
      matches++;
    } else {
      errors.push(`Mismatch ${i}: exp ${exp.date}/${exp.amount}, got ${ext.date}/${ext.amount}`);
    }
  }
  if (extracted.length > expected.length) {
    errors.push(`Extra transactions: ${extracted.length - expected.length}`);
  }
  return { matches, total: expected.length, errors };
}

/**
 * Find all test cases: paired <name>.pdf / <name>.json files under .nogit
 */
function findTestCases(): ITestCase[] {
  const testDir = path.join(process.cwd(), '.nogit');
  if (!fs.existsSync(testDir)) return [];
  const files = fs.readdirSync(testDir);
  const testCases: ITestCase[] = [];

  for (const pdf of files.filter((f: string) => f.endsWith('.pdf'))) {
    const baseName = pdf.replace('.pdf', '');
    const jsonFile = `${baseName}.json`;
    if (files.includes(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }
  return testCases.sort((a, b) => a.name.localeCompare(b.name));
}
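
// Expected layout (illustrative file names):
//   .nogit/statement-jan.pdf    <- input document
//   .nogit/statement-jan.json   <- ground-truth ITransaction[] for that PDF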

// ============ TESTS ============

const testCases = findTestCases();
console.log(`\nFound ${testCases.length} bank statement test cases\n`);

// Ensure temp directory exists
if (!fs.existsSync(TEMP_MD_DIR)) {
  fs.mkdirSync(TEMP_MD_DIR, { recursive: true });
}

// -------- STAGE 1: OCR with Nanonets --------

// Check if all markdown files already exist
function allMarkdownFilesExist(): boolean {
  for (const tc of testCases) {
    const mdPath = path.join(TEMP_MD_DIR, `${tc.name}.md`);
    if (!fs.existsSync(mdPath)) {
      return false;
    }
  }
  return true;
}

// Track whether we need to run Stage 1
const stage1Needed = !allMarkdownFilesExist();

tap.test('Stage 1: Setup Nanonets', async () => {
  console.log('\n========== STAGE 1: Nanonets OCR ==========\n');
  if (!stage1Needed) {
    console.log('  [SKIP] All markdown files already exist, skipping Nanonets setup');
    return;
  }
  const ok = await ensureNanonetsOcr();
  expect(ok).toBeTrue();
});

tap.test('Stage 1: Convert all documents to markdown', async () => {
  if (!stage1Needed) {
    console.log('  [SKIP] Using existing markdown files from previous run\n');
    // Load existing markdown paths
    for (const tc of testCases) {
      tc.markdownPath = path.join(TEMP_MD_DIR, `${tc.name}.md`);
      console.log(`  Loaded: ${tc.markdownPath}`);
    }
    return;
  }
  console.log('\n  Converting all PDFs to markdown with Nanonets-OCR-s...\n');
  for (const tc of testCases) {
    console.log(`\n  === ${tc.name} ===`);

    // Convert PDF to images
    const images = convertPdfToImages(tc.pdfPath);
    console.log(`  Pages: ${images.length}`);

    // Convert to markdown
    const markdown = await convertDocumentToMarkdown(images, tc.name);

    // Save markdown to a temp file for Stage 2
    const mdPath = path.join(TEMP_MD_DIR, `${tc.name}.md`);
    fs.writeFileSync(mdPath, markdown);
    tc.markdownPath = mdPath;
    console.log(`  Saved: ${mdPath}`);
  }

  console.log('\n  Stage 1 complete: All documents converted to markdown\n');
});

tap.test('Stage 1: Stop Nanonets', async () => {
  if (!stage1Needed) {
    console.log('  [SKIP] Nanonets was not started');
    return;
  }
  stopNanonets();
  // Verify it's stopped
  await new Promise(resolve => setTimeout(resolve, 3000));
  expect(isContainerRunning('nanonets-test')).toBeFalse();
});

// -------- STAGE 2: Extraction with GPT-OSS 20B --------

tap.test('Stage 2: Setup Ollama + GPT-OSS 20B', async () => {
  console.log('\n========== STAGE 2: GPT-OSS 20B Extraction ==========\n');
  // ensureMiniCpm is reused here to bring up the shared Ollama service
  const ollamaOk = await ensureMiniCpm();
  expect(ollamaOk).toBeTrue();
  const extractionOk = await ensureExtractionModel();
  expect(extractionOk).toBeTrue();
});

let passedCount = 0;
let failedCount = 0;

for (const tc of testCases) {
  tap.test(`Stage 2: Extract ${tc.name}`, async () => {
    const expected: ITransaction[] = JSON.parse(fs.readFileSync(tc.jsonPath, 'utf-8'));
    console.log(`\n  === ${tc.name} ===`);
    console.log(`  Expected: ${expected.length} transactions`);

    // Load the markdown saved in Stage 1
    const mdPath = path.join(TEMP_MD_DIR, `${tc.name}.md`);
    if (!fs.existsSync(mdPath)) {
      throw new Error(`Markdown not found: ${mdPath}. Run Stage 1 first.`);
    }
    const markdown = fs.readFileSync(mdPath, 'utf-8');
    console.log(`  Markdown: ${markdown.length} chars`);

    // Extract transactions (single pass)
    const extracted = await extractTransactions(markdown, tc.name);

    // Log the first few results
    console.log(`  Extracted: ${extracted.length} transactions`);
    for (let i = 0; i < Math.min(extracted.length, 5); i++) {
      const tx = extracted[i];
      console.log(`    ${i + 1}. ${tx.date} | ${tx.counterparty.substring(0, 25).padEnd(25)} | ${tx.amount >= 0 ? '+' : ''}${tx.amount.toFixed(2)}`);
    }
    if (extracted.length > 5) {
      console.log(`    ... and ${extracted.length - 5} more`);
    }

    // Compare
    const result = compareTransactions(extracted, expected);
    const pass = result.matches === result.total && extracted.length === expected.length;

    if (pass) {
      passedCount++;
      console.log(`  Result: PASS (${result.matches}/${result.total})`);
    } else {
      failedCount++;
      console.log(`  Result: FAIL (${result.matches}/${result.total})`);
      result.errors.slice(0, 5).forEach(e => console.log(`    - ${e}`));
    }
    expect(result.matches).toEqual(result.total);
    expect(extracted.length).toEqual(expected.length);
  });
}

tap.test('Summary', async () => {
  console.log(`\n======================================================`);
  console.log(`  Bank Statement Summary (Nanonets + GPT-OSS 20B Sequential)`);
  console.log(`======================================================`);
  console.log(`  Stage 1: Nanonets-OCR-s (document -> markdown)`);
  console.log(`  Stage 2: GPT-OSS 20B (markdown -> JSON)`);
  console.log(`  Passed: ${passedCount}/${testCases.length}`);
  console.log(`  Failed: ${failedCount}/${testCases.length}`);
  console.log(`======================================================\n`);

  // Only clean up temp files if ALL tests passed
  if (failedCount === 0 && passedCount === testCases.length) {
    try {
      fs.rmSync(TEMP_MD_DIR, { recursive: true, force: true });
      console.log(`  Cleaned up temp directory: ${TEMP_MD_DIR}\n`);
    } catch {
      // Ignore
    }
  } else {
    console.log(`  Keeping temp directory for debugging: ${TEMP_MD_DIR}\n`);
  }
});

export default tap.start();