feat(paddleocr-vl): add structured HTML output and table parsing for PaddleOCR-VL, update API, tests, and README
This commit is contained in:
@@ -4,11 +4,13 @@
|
||||
* This tests the complete PaddleOCR-VL pipeline:
|
||||
* 1. PP-DocLayoutV2 for layout detection
|
||||
* 2. PaddleOCR-VL for recognition
|
||||
* 3. Structured Markdown output
|
||||
* 4. MiniCPM extracts invoice fields from structured Markdown
|
||||
* 3. Structured HTML output (semantic tags with proper tables)
|
||||
* 4. Qwen2.5 extracts invoice fields from structured HTML
|
||||
*
|
||||
* The structured Markdown has proper tables and formatting,
|
||||
* making it much easier for MiniCPM to extract invoice data.
|
||||
* HTML output is used instead of Markdown because:
|
||||
* - <table> tags are unambiguous (no parser variations)
|
||||
* - LLMs are heavily trained on web/HTML data
|
||||
* - Semantic tags (header, footer, section) provide clear structure
|
||||
*/
|
||||
import { tap, expect } from '@git.zone/tstest/tapbundle';
|
||||
import * as fs from 'fs';
|
||||
@@ -61,7 +63,7 @@ function convertPdfToImages(pdfPath: string): string[] {
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse document using PaddleOCR-VL Full Pipeline (returns structured Markdown)
|
||||
* Parse document using PaddleOCR-VL Full Pipeline (returns structured HTML)
|
||||
*/
|
||||
async function parseDocument(imageBase64: string): Promise<string> {
|
||||
const response = await fetch(`${PADDLEOCR_VL_URL}/parse`, {
|
||||
@@ -69,7 +71,7 @@ async function parseDocument(imageBase64: string): Promise<string> {
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
image: imageBase64,
|
||||
output_format: 'markdown',
|
||||
output_format: 'html',
|
||||
}),
|
||||
});
|
||||
|
||||
@@ -84,18 +86,25 @@ async function parseDocument(imageBase64: string): Promise<string> {
|
||||
throw new Error(`PaddleOCR-VL error: ${data.error}`);
|
||||
}
|
||||
|
||||
return data.result?.markdown || '';
|
||||
return data.result?.html || '';
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract invoice fields from structured Markdown using Qwen2.5 (text-only model)
|
||||
* Extract invoice fields from structured HTML using Qwen2.5 (text-only model)
|
||||
*/
|
||||
async function extractInvoiceFromMarkdown(markdown: string): Promise<IInvoice> {
|
||||
// Truncate if too long
|
||||
const truncated = markdown.length > 12000 ? markdown.slice(0, 12000) : markdown;
|
||||
console.log(` [Extract] Processing ${truncated.length} chars of Markdown`);
|
||||
async function extractInvoiceFromHtml(html: string): Promise<IInvoice> {
|
||||
// Truncate if too long (HTML is more valuable per byte, allow more)
|
||||
const truncated = html.length > 16000 ? html.slice(0, 16000) : html;
|
||||
console.log(` [Extract] Processing ${truncated.length} chars of HTML`);
|
||||
|
||||
const prompt = `You are an invoice data extractor. Extract the following fields from this OCR text and return ONLY a valid JSON object.
|
||||
const prompt = `You are an invoice data extractor. Extract the following fields from this HTML document (OCR output with semantic structure) and return ONLY a valid JSON object.
|
||||
|
||||
The HTML uses semantic tags:
|
||||
- <table> with <thead>/<tbody> for structured tables (invoice line items, totals)
|
||||
- <header> for document header (company info, invoice number)
|
||||
- <footer> for document footer (payment terms, legal text)
|
||||
- <section class="table-region"> for table regions
|
||||
- data-type and data-y attributes indicate block type and vertical position
|
||||
|
||||
Required fields:
|
||||
- invoice_number: The invoice/receipt/document number
|
||||
@@ -115,8 +124,9 @@ Rules:
|
||||
- Use 0 for missing numeric fields
|
||||
- Convert dates to YYYY-MM-DD format (e.g., "28-JAN-2022" becomes "2022-01-28")
|
||||
- Extract numbers without currency symbols
|
||||
- Look for totals in <table> sections, especially rows with "Total", "Amount Due", "Grand Total"
|
||||
|
||||
OCR Text:
|
||||
HTML Document:
|
||||
${truncated}
|
||||
|
||||
JSON:`;
|
||||
@@ -195,12 +205,12 @@ JSON:`;
|
||||
* Single extraction pass: Parse with PaddleOCR-VL Full, extract with Qwen2.5 (text-only)
|
||||
*/
|
||||
async function extractOnce(images: string[], passNum: number): Promise<IInvoice> {
|
||||
// Parse document with full pipeline (PaddleOCR-VL)
|
||||
const markdown = await parseDocument(images[0]);
|
||||
console.log(` [Parse] Got ${markdown.split('\n').length} lines of Markdown`);
|
||||
// Parse document with full pipeline (PaddleOCR-VL) -> returns HTML
|
||||
const html = await parseDocument(images[0]);
|
||||
console.log(` [Parse] Got ${html.split('\n').length} lines of HTML`);
|
||||
|
||||
// Extract invoice fields from Markdown using text-only model (no images)
|
||||
return extractInvoiceFromMarkdown(markdown);
|
||||
// Extract invoice fields from HTML using text-only model (no images)
|
||||
return extractInvoiceFromHtml(html);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -438,7 +448,7 @@ tap.test('summary', async () => {
|
||||
console.log(`\n======================================================`);
|
||||
console.log(` Invoice Extraction Summary (PaddleOCR-VL Full)`);
|
||||
console.log(`======================================================`);
|
||||
console.log(` Method: PaddleOCR-VL Full Pipeline -> Qwen2.5 (text-only)`);
|
||||
console.log(` Method: PaddleOCR-VL Full Pipeline (HTML) -> Qwen2.5 (text-only)`);
|
||||
console.log(` Passed: ${passedCount}/${totalInvoices}`);
|
||||
console.log(` Failed: ${failedCount}/${totalInvoices}`);
|
||||
console.log(` Accuracy: ${accuracy.toFixed(1)}%`);
|
||||
|
||||
Reference in New Issue
Block a user