/**
 * Invoice extraction test using PaddleOCR-VL Full Pipeline
 *
 * This tests the complete PaddleOCR-VL pipeline:
 * 1. PP-DocLayoutV2 for layout detection
 * 2. PaddleOCR-VL for recognition
 * 3. Structured HTML output (semantic tags with proper tables)
 * 4. Qwen2.5 extracts invoice fields from structured HTML
 *
 * HTML output is used instead of Markdown because:
 * - <table> tags are unambiguous (no parser variations)
 * - LLMs are heavily trained on web/HTML data
 * - Semantic tags (header, footer, section) provide clear structure
 */
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
import * as path from 'path';
import { execFileSync, execSync } from 'child_process';
import * as os from 'os';
import { ensurePaddleOcrVlFull, ensureQwen25 } from './helpers/docker.js';
/** Base URL of the PaddleOCR-VL full-pipeline service used for document parsing. */
const PADDLEOCR_VL_URL = 'http://localhost:8000';

/** Base URL of the Ollama server that is asked to run the text model. */
const OLLAMA_URL = 'http://localhost:11434';

// Use Qwen2.5 for text-only JSON extraction (not MiniCPM which is vision-focused)
const TEXT_MODEL = 'qwen2.5:7b';
/**
 * Invoice fields that are extracted from a document and compared against the
 * expected JSON fixture stored next to each PDF.
 *
 * NOTE(review): the extraction step substitutes `null` for missing string
 * fields at runtime, so consumers should not assume the strings are non-empty.
 */
interface IInvoice {
  /** Invoice / receipt / document number. */
  invoice_number: string;
  /** Invoice date; the extraction prompt asks for YYYY-MM-DD. */
  invoice_date: string;
  /** Company that issued the invoice. */
  vendor_name: string;
  /** Currency code such as EUR, USD or GBP. */
  currency: string;
  /** Amount before tax. */
  net_amount: number;
  /** Tax / VAT amount. */
  vat_amount: number;
  /** Final total amount. */
  total_amount: number;
}
/**
 * Convert a PDF into PNG page images using ImageMagick (`convert`).
 *
 * The pages are rendered into a temporary directory which is always removed
 * again, whether or not the conversion succeeds.
 *
 * @param pdfPath - Path of the PDF file to rasterize.
 * @returns Base64-encoded PNG data, one entry per page, in page order.
 * @throws If `convert` cannot be started or exits with a non-zero status.
 */
function convertPdfToImages(pdfPath: string): string[] {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-convert-'));
  const outputPattern = path.join(tempDir, 'page-%d.png');

  // Numeric page index taken from "page-<n>.png"; a plain string sort would
  // place "page-10.png" before "page-2.png".
  const pageIndex = (fileName: string): number => {
    const found = /(\d+)\.png$/.exec(fileName);
    return found ? Number(found[1]) : 0;
  };

  try {
    // Arguments are passed as an array (no shell), so quotes, `$` or `;` in the
    // PDF path can neither break the command line nor inject extra commands.
    execFileSync(
      'convert',
      [
        '-density', '200',
        '-quality', '90',
        pdfPath,
        '-background', 'white',
        '-alpha', 'remove',
        outputPattern,
      ],
      { stdio: 'pipe' }
    );

    return fs
      .readdirSync(tempDir)
      .filter((fileName) => fileName.endsWith('.png'))
      .sort((a, b) => pageIndex(a) - pageIndex(b) || a.localeCompare(b))
      .map((fileName) => fs.readFileSync(path.join(tempDir, fileName)).toString('base64'));
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}
/**
 * Parse document using PaddleOCR-VL Full Pipeline (returns structured HTML).
 *
 * Sends the page image to the `/parse` endpoint and requests HTML output.
 *
 * @param imageBase64 - Base64-encoded page image.
 * @returns The HTML from `result.html`, or an empty string when the response
 *          carries no HTML.
 * @throws If the HTTP request is not OK or the service reports `success: false`.
 */
async function parseDocument(imageBase64: string): Promise<string> {
  const response = await fetch(`${PADDLEOCR_VL_URL}/parse`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      image: imageBase64,
      output_format: 'html',
    }),
  });

  if (!response.ok) {
    // Include the response body: it usually carries the service's own error text.
    const text = await response.text();
    throw new Error(`PaddleOCR-VL API error: ${response.status} - ${text}`);
  }

  const data = await response.json();

  if (!data.success) {
    throw new Error(`PaddleOCR-VL error: ${data.error}`);
  }

  return data.result?.html || '';
}
/**
 * Collect the generated text from a streaming Ollama `/api/generate` response.
 *
 * The body is newline-delimited JSON. A network chunk may end in the middle of
 * a line, so the unfinished tail is kept and completed by the next chunk
 * instead of being parsed (and discarded) as invalid JSON.
 */
async function readOllamaResponseText(response: Response): Promise<string> {
  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('No response body');
  }

  const decoder = new TextDecoder();
  let pending = '';
  let fullText = '';

  const consumeLine = (line: string): void => {
    if (!line.trim()) return;
    try {
      const json = JSON.parse(line);
      if (json.response) {
        fullText += json.response;
      }
    } catch {
      // Skip invalid JSON lines
    }
  };

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    pending += decoder.decode(value, { stream: true });
    const lines = pending.split('\n');
    // The last element is either '' (chunk ended on a newline) or a partial line.
    pending = lines.pop() ?? '';
    lines.forEach(consumeLine);
  }

  // Flush bytes still buffered in the decoder and a final unterminated line.
  pending += decoder.decode();
  consumeLine(pending);

  return fullText;
}

/**
 * Locate the outermost `{ ... }` in the model output and map it to an invoice.
 * Missing strings become null, missing/invalid numbers become 0 and a missing
 * currency defaults to 'EUR'.
 */
function parseInvoiceFromModelText(fullText: string): IInvoice {
  const startIdx = fullText.indexOf('{');
  const endIdx = fullText.lastIndexOf('}') + 1;

  if (startIdx < 0 || endIdx <= startIdx) {
    throw new Error(`No JSON object found in response: ${fullText.substring(0, 200)}`);
  }

  const jsonStr = fullText.substring(startIdx, endIdx);
  let parsed;
  try {
    parsed = JSON.parse(jsonStr);
  } catch (err) {
    throw new Error(`Invalid JSON in response (${err}): ${jsonStr.substring(0, 200)}`);
  }

  // Ensure numeric fields are actually numbers
  return {
    invoice_number: parsed.invoice_number || null,
    invoice_date: parsed.invoice_date || null,
    vendor_name: parsed.vendor_name || null,
    currency: parsed.currency || 'EUR',
    net_amount: parseFloat(parsed.net_amount) || 0,
    vat_amount: parseFloat(parsed.vat_amount) || 0,
    total_amount: parseFloat(parsed.total_amount) || 0,
  };
}

/**
 * Extract invoice fields from structured HTML using Qwen2.5 (text-only model).
 *
 * @param html - Structured HTML produced by the OCR pipeline; only the first
 *               16000 characters are sent to the model.
 * @returns The extracted invoice fields.
 * @throws If the Ollama request fails, the response has no body, or the model
 *         output contains no parseable JSON object.
 */
async function extractInvoiceFromHtml(html: string): Promise<IInvoice> {
  // Truncate if too long (HTML is more valuable per byte, allow more)
  const maxHtmlChars = 16000;
  const truncated = html.length > maxHtmlChars ? html.slice(0, maxHtmlChars) : html;
  console.log(`  [Extract] Processing ${truncated.length} chars of HTML`);

  const prompt = `You are an invoice data extractor. Extract the following fields from this HTML document (OCR output with semantic structure) and return ONLY a valid JSON object.

The HTML uses semantic tags:
- <table> with <thead>/<tbody> for structured tables (invoice line items, totals)
- <header> for document header (company info, invoice number)
- <footer> for document footer (payment terms, legal text)
- <section class="table-region"> for table regions
- data-type and data-y attributes indicate block type and vertical position

Required fields:
- invoice_number: The invoice/receipt/document number
- invoice_date: Date in YYYY-MM-DD format (convert from any format)
- vendor_name: Company that issued the invoice
- currency: EUR, USD, GBP, etc.
- net_amount: Amount before tax (number)
- vat_amount: Tax/VAT amount (number, use 0 if reverse charge or not shown)
- total_amount: Final total amount (number)

Example output format:
{"invoice_number": "INV-123", "invoice_date": "2022-01-28", "vendor_name": "Adobe", "currency": "EUR", "net_amount": 24.99, "vat_amount": 0, "total_amount": 24.99}

Rules:
- Return ONLY the JSON object, no explanation or markdown
- Use null for missing string fields
- Use 0 for missing numeric fields
- Convert dates to YYYY-MM-DD format (e.g., "28-JAN-2022" becomes "2022-01-28")
- Extract numbers without currency symbols
- Look for totals in <table> sections, especially rows with "Total", "Amount Due", "Grand Total"

HTML Document:
${truncated}

JSON:`;

  const payload = {
    model: TEXT_MODEL,
    prompt,
    stream: true,
    options: {
      num_predict: 512,
      temperature: 0.1,
    },
  };

  const response = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.status}`);
  }

  const fullText = await readOllamaResponseText(response);

  // Extract JSON from response
  return parseInvoiceFromModelText(fullText);
}
/**
 * Single extraction pass: Parse with PaddleOCR-VL Full, extract with Qwen2.5 (text-only).
 *
 * Only the first page image is parsed.
 *
 * @param images - Base64-encoded page images of the invoice.
 * @param passNum - Number of the current pass; currently not used by this function.
 * @returns The invoice fields extracted in this pass.
 * @throws If `images` is empty, or if parsing/extraction fails.
 */
async function extractOnce(images: string[], passNum: number): Promise<IInvoice> {
  const firstPage = images[0];
  if (firstPage === undefined) {
    // Fail with a clear message instead of posting `undefined` to the parser.
    throw new Error('No page images to parse');
  }

  // Parse document with full pipeline (PaddleOCR-VL) -> returns HTML
  const html = await parseDocument(firstPage);
  console.log(`  [Parse] Got ${html.split('\n').length} lines of HTML`);

  // Extract invoice fields from HTML using text-only model (no images)
  return extractInvoiceFromHtml(html);
}
/**
 * Create a hash of invoice for comparison (using key fields).
 *
 * The key is `<invoice_number>|<invoice_date>|<total>` with the total always
 * formatted to two decimals, so the number 24.9 and the string "24.9" yield the
 * same key. A missing or non-numeric total is treated as 0.
 *
 * @param invoice - Invoice to fingerprint.
 * @returns The comparison key.
 */
function hashInvoice(invoice: IInvoice): string {
  // Coerce first: model output may carry the total as a string at runtime.
  const total = Number(invoice.total_amount);
  const amount = (Number.isFinite(total) ? total : 0).toFixed(2);
  return `${invoice.invoice_number}|${invoice.invoice_date}|${amount}`;
}
/**
 * Extract with consensus voting.
 *
 * Runs up to `maxPasses` extraction passes and returns as soon as two passes
 * produce the same invoice hash. Passes that throw are logged and skipped. If
 * no two passes agree, the most frequent result is returned (the earliest one
 * on a tie).
 *
 * @param images - Base64-encoded page images of the invoice.
 * @param invoiceName - Name used in the error message.
 * @param maxPasses - Maximum number of extraction passes (default 5).
 * @returns The agreed-upon (or most common) invoice.
 * @throws If every pass failed.
 */
async function extractWithConsensus(images: string[], invoiceName: string, maxPasses: number = 5): Promise<IInvoice> {
  // One entry per distinct hash: the first invoice seen with it and how often it occurred.
  const tally = new Map<string, { invoice: IInvoice; count: number }>();

  for (let pass = 1; pass <= maxPasses; pass++) {
    try {
      const invoice = await extractOnce(images, pass);
      const hash = hashInvoice(invoice);

      const entry = tally.get(hash) ?? { invoice, count: 0 };
      entry.count += 1;
      tally.set(hash, entry);

      console.log(
        `  [Pass ${pass}] ${invoice.invoice_number} | ${invoice.invoice_date} | ${invoice.total_amount} ${invoice.currency}`
      );

      if (entry.count >= 2) {
        console.log(`  [Consensus] Reached after ${pass} passes`);
        return invoice;
      }
    } catch (err) {
      console.log(`  [Pass ${pass}] Error: ${err}`);
    }
  }

  // No consensus reached - return the most common result
  let best: { invoice: IInvoice; count: number } | undefined;
  for (const entry of tally.values()) {
    if (!best || entry.count > best.count) {
      best = entry;
    }
  }

  if (!best) {
    throw new Error(`No valid results for ${invoiceName}`);
  }

  console.log(`  [No consensus] Using most common result (${best.count}/${maxPasses} passes)`);
  return best.invoice;
}
/**
 * Normalize date to YYYY-MM-DD format.
 *
 * Recognized inputs (surrounding whitespace is ignored):
 * - `YYYY-MM-DD` (returned as is)
 * - `DD-MMM-YYYY` with an English three-letter month, e.g. "28-JUN-2022"
 * - `DD/MM/YYYY`, `DD.MM.YYYY` or `DD-MM-YYYY`
 *
 * @param dateStr - Date text, or null.
 * @returns The normalized date, an empty string for null/empty input, or the
 *          trimmed input unchanged when the format (or month name) is not
 *          recognized.
 */
function normalizeDate(dateStr: string | null): string {
  if (!dateStr) return '';

  const value = dateStr.trim();

  // Already in correct format
  if (/^\d{4}-\d{2}-\d{2}$/.test(value)) {
    return value;
  }

  // Handle DD-MMM-YYYY format (e.g., "28-JUN-2022")
  const monthMap: Record<string, string> = {
    JAN: '01', FEB: '02', MAR: '03', APR: '04', MAY: '05', JUN: '06',
    JUL: '07', AUG: '08', SEP: '09', OCT: '10', NOV: '11', DEC: '12',
  };

  const named = value.match(/^(\d{1,2})-([A-Z]{3})-(\d{4})$/i);
  if (named) {
    const month = monthMap[named[2].toUpperCase()];
    // An unknown abbreviation must not silently turn into January.
    if (!month) return value;
    return `${named[3]}-${month}-${named[1].padStart(2, '0')}`;
  }

  // Handle DD/MM/YYYY, DD.MM.YYYY or DD-MM-YYYY
  const numeric = value.match(/^(\d{1,2})[\/.-](\d{1,2})[\/.-](\d{4})$/);
  if (numeric) {
    const day = numeric[1].padStart(2, '0');
    const month = numeric[2].padStart(2, '0');
    return `${numeric[3]}-${month}-${day}`;
  }

  return value;
}
/**
 * Compare extracted invoice against expected.
 *
 * Checks invoice number (ignoring whitespace and case), date (after
 * normalization), total amount (within 0.02) and currency (ignoring case).
 *
 * @param extracted - Invoice produced by the extraction pipeline.
 * @param expected - Invoice loaded from the expected JSON fixture.
 * @returns `match` is true when no field differs; `errors` describes each mismatch.
 */
function compareInvoice(
  extracted: IInvoice,
  expected: IInvoice
): { match: boolean; errors: string[] } {
  const errors: string[] = [];

  // Compare invoice number (normalize by removing spaces and case)
  const extNum = extracted.invoice_number?.replace(/\s/g, '').toLowerCase() || '';
  const expNum = expected.invoice_number?.replace(/\s/g, '').toLowerCase() || '';
  if (extNum !== expNum) {
    errors.push(`invoice_number: expected "${expected.invoice_number}", got "${extracted.invoice_number}"`);
  }

  // Compare date (normalize format first)
  const extDate = normalizeDate(extracted.invoice_date);
  const expDate = normalizeDate(expected.invoice_date);
  if (extDate !== expDate) {
    errors.push(`invoice_date: expected "${expected.invoice_date}", got "${extracted.invoice_date}"`);
  }

  // Compare total amount (with tolerance). Written as a negated "<=" so that a
  // NaN difference (missing or non-numeric total) is reported as a mismatch;
  // `NaN > 0.02` would be false and let it pass silently.
  const totalDiff = Math.abs(extracted.total_amount - expected.total_amount);
  if (!(totalDiff <= 0.02)) {
    errors.push(`total_amount: expected ${expected.total_amount}, got ${extracted.total_amount}`);
  }

  // Compare currency
  if (extracted.currency?.toUpperCase() !== expected.currency?.toUpperCase()) {
    errors.push(`currency: expected "${expected.currency}", got "${extracted.currency}"`);
  }

  return { match: errors.length === 0, errors };
}
/**
 * Find all test cases (PDF + JSON pairs) in .nogit/invoices/.
 *
 * A PDF only becomes a test case when a JSON file with the same base name
 * exists in the same directory.
 *
 * @returns Test cases sorted alphabetically by name; empty when the directory
 *          does not exist.
 */
function findTestCases(): Array<{ name: string; pdfPath: string; jsonPath: string }> {
  const testDir = path.join(process.cwd(), '.nogit/invoices');
  if (!fs.existsSync(testDir)) {
    return [];
  }

  const pdfSuffix = '.pdf';
  const files = fs.readdirSync(testDir);
  const available = new Set(files);
  const testCases: Array<{ name: string; pdfPath: string; jsonPath: string }> = [];

  for (const pdf of files) {
    if (!pdf.endsWith(pdfSuffix)) continue;

    // Strip only the trailing extension; String.replace would remove the first
    // ".pdf" occurrence, which is wrong for names like "a.pdf.v2.pdf".
    const baseName = pdf.slice(0, -pdfSuffix.length);
    const jsonFile = `${baseName}.json`;
    if (available.has(jsonFile)) {
      testCases.push({
        name: baseName,
        pdfPath: path.join(testDir, pdf),
        jsonPath: path.join(testDir, jsonFile),
      });
    }
  }

  // Sort alphabetically
  testCases.sort((a, b) => a.name.localeCompare(b.name));

  return testCases;
}
// Tests

// Setup: both backing services must report ready before any invoice test runs.
tap.test('setup: ensure Docker containers are running', async () => {
  console.log('\n[Setup] Checking Docker containers...\n');

  // Ensure PaddleOCR-VL Full Pipeline is running
  const paddleOk = await ensurePaddleOcrVlFull();
  expect(paddleOk).toBeTrue();

  // Ensure Qwen2.5 is available (for text-only JSON extraction)
  const qwenOk = await ensureQwen25();
  expect(qwenOk).toBeTrue();

  console.log('\n[Setup] All containers ready!\n');
});
// Dynamic test for each PDF/JSON pair
const testCases = findTestCases();
console.log(`\nFound ${testCases.length} invoice test cases (PaddleOCR-VL Full Pipeline)\n`);

// Counters and timings read by the summary test.
let passedCount = 0;
let failedCount = 0;
const processingTimes: number[] = [];

for (const testCase of testCases) {
  tap.test(`should extract invoice: ${testCase.name}`, async () => {
    // Load expected data
    const expected: IInvoice = JSON.parse(fs.readFileSync(testCase.jsonPath, 'utf-8'));
    console.log(`\n=== ${testCase.name} ===`);
    console.log(`Expected: ${expected.invoice_number} | ${expected.invoice_date} | ${expected.total_amount} ${expected.currency}`);

    const startTime = Date.now();

    let extracted: IInvoice;
    try {
      // Convert PDF to images
      const images = convertPdfToImages(testCase.pdfPath);
      console.log(`  Pages: ${images.length}`);

      // Extract with consensus voting (PaddleOCR-VL Full -> Qwen2.5)
      extracted = await extractWithConsensus(images, testCase.name);
    } catch (err) {
      // A crashed extraction is still a failed invoice; count it so the
      // summary's passed/failed/accuracy numbers add up, then fail the test.
      const failedMs = Date.now() - startTime;
      processingTimes.push(failedMs);
      failedCount++;
      console.log(`  Result: ERROR (${(failedMs / 1000).toFixed(1)}s)`);
      console.log(`    - ${err}`);
      throw err;
    }

    const elapsedMs = Date.now() - startTime;
    processingTimes.push(elapsedMs);

    // Compare results
    const result = compareInvoice(extracted, expected);

    if (result.match) {
      passedCount++;
      console.log(`  Result: MATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
    } else {
      failedCount++;
      console.log(`  Result: MISMATCH (${(elapsedMs / 1000).toFixed(1)}s)`);
      result.errors.forEach((e) => console.log(`    - ${e}`));
    }

    // Assert match
    expect(result.match).toBeTrue();
  });
}
// Summary: prints pass/fail counts, accuracy and timing for the whole run.
tap.test('summary', async () => {
  const totalInvoices = testCases.length;
  // Guard the divisions so an empty test set prints 0 instead of NaN.
  const accuracy = totalInvoices > 0 ? (passedCount / totalInvoices) * 100 : 0;
  const totalTimeMs = processingTimes.reduce((a, b) => a + b, 0);
  const avgTimeMs = processingTimes.length > 0 ? totalTimeMs / processingTimes.length : 0;
  const avgTimeSec = avgTimeMs / 1000;
  const totalTimeSec = totalTimeMs / 1000;

  console.log(`\n======================================================`);
  console.log(`  Invoice Extraction Summary (PaddleOCR-VL Full)`);
  console.log(`======================================================`);
  console.log(`  Method:      PaddleOCR-VL Full Pipeline (HTML) -> Qwen2.5 (text-only)`);
  console.log(`  Passed:      ${passedCount}/${totalInvoices}`);
  console.log(`  Failed:      ${failedCount}/${totalInvoices}`);
  console.log(`  Accuracy:    ${accuracy.toFixed(1)}%`);
  console.log(`------------------------------------------------------`);
  console.log(`  Total time:  ${totalTimeSec.toFixed(1)}s`);
  console.log(`  Avg per inv: ${avgTimeSec.toFixed(1)}s`);
  console.log(`======================================================\n`);
});

export default tap.start();