feat(paddleocr-vl): add structured HTML output and table parsing for PaddleOCR-VL, update API, tests, and README

This commit is contained in:
2026-01-18 00:11:17 +00:00
parent 0d8a1ebac2
commit f0d88fcbe0
4 changed files with 486 additions and 82 deletions

View File

@@ -4,11 +4,13 @@
* This tests the complete PaddleOCR-VL pipeline:
* 1. PP-DocLayoutV2 for layout detection
* 2. PaddleOCR-VL for recognition
* 3. Structured Markdown output
* 4. MiniCPM extracts invoice fields from structured Markdown
* 3. Structured HTML output (semantic tags with proper tables)
* 4. Qwen2.5 extracts invoice fields from structured HTML
*
* The structured Markdown has proper tables and formatting,
* making it much easier for MiniCPM to extract invoice data.
* HTML output is used instead of Markdown because:
* - <table> tags are unambiguous (no parser variations)
* - LLMs are heavily trained on web/HTML data
* - Semantic tags (header, footer, section) provide clear structure
*/
import { tap, expect } from '@git.zone/tstest/tapbundle';
import * as fs from 'fs';
@@ -61,7 +63,7 @@ function convertPdfToImages(pdfPath: string): string[] {
}
/**
* Parse document using PaddleOCR-VL Full Pipeline (returns structured Markdown)
* Parse document using PaddleOCR-VL Full Pipeline (returns structured HTML)
*/
async function parseDocument(imageBase64: string): Promise<string> {
const response = await fetch(`${PADDLEOCR_VL_URL}/parse`, {
@@ -69,7 +71,7 @@ async function parseDocument(imageBase64: string): Promise<string> {
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
image: imageBase64,
output_format: 'markdown',
output_format: 'html',
}),
});
@@ -84,18 +86,25 @@ async function parseDocument(imageBase64: string): Promise<string> {
throw new Error(`PaddleOCR-VL error: ${data.error}`);
}
return data.result?.markdown || '';
return data.result?.html || '';
}
/**
* Extract invoice fields from structured Markdown using Qwen2.5 (text-only model)
* Extract invoice fields from structured HTML using Qwen2.5 (text-only model)
*/
async function extractInvoiceFromMarkdown(markdown: string): Promise<IInvoice> {
// Truncate if too long
const truncated = markdown.length > 12000 ? markdown.slice(0, 12000) : markdown;
console.log(` [Extract] Processing ${truncated.length} chars of Markdown`);
async function extractInvoiceFromHtml(html: string): Promise<IInvoice> {
// Truncate if too long (HTML is more valuable per byte, allow more)
const truncated = html.length > 16000 ? html.slice(0, 16000) : html;
console.log(` [Extract] Processing ${truncated.length} chars of HTML`);
const prompt = `You are an invoice data extractor. Extract the following fields from this OCR text and return ONLY a valid JSON object.
const prompt = `You are an invoice data extractor. Extract the following fields from this HTML document (OCR output with semantic structure) and return ONLY a valid JSON object.
The HTML uses semantic tags:
- <table> with <thead>/<tbody> for structured tables (invoice line items, totals)
- <header> for document header (company info, invoice number)
- <footer> for document footer (payment terms, legal text)
- <section class="table-region"> for table regions
- data-type and data-y attributes indicate block type and vertical position
Required fields:
- invoice_number: The invoice/receipt/document number
@@ -115,8 +124,9 @@ Rules:
- Use 0 for missing numeric fields
- Convert dates to YYYY-MM-DD format (e.g., "28-JAN-2022" becomes "2022-01-28")
- Extract numbers without currency symbols
- Look for totals in <table> sections, especially rows with "Total", "Amount Due", "Grand Total"
OCR Text:
HTML Document:
${truncated}
JSON:`;
@@ -195,12 +205,12 @@ JSON:`;
* Single extraction pass: Parse with PaddleOCR-VL Full, extract with Qwen2.5 (text-only)
*/
async function extractOnce(images: string[], passNum: number): Promise<IInvoice> {
// Parse document with full pipeline (PaddleOCR-VL)
const markdown = await parseDocument(images[0]);
console.log(` [Parse] Got ${markdown.split('\n').length} lines of Markdown`);
// Parse document with full pipeline (PaddleOCR-VL) -> returns HTML
const html = await parseDocument(images[0]);
console.log(` [Parse] Got ${html.split('\n').length} lines of HTML`);
// Extract invoice fields from Markdown using text-only model (no images)
return extractInvoiceFromMarkdown(markdown);
// Extract invoice fields from HTML using text-only model (no images)
return extractInvoiceFromHtml(html);
}
/**
@@ -438,7 +448,7 @@ tap.test('summary', async () => {
console.log(`\n======================================================`);
console.log(` Invoice Extraction Summary (PaddleOCR-VL Full)`);
console.log(`======================================================`);
console.log(` Method: PaddleOCR-VL Full Pipeline -> Qwen2.5 (text-only)`);
console.log(` Method: PaddleOCR-VL Full Pipeline (HTML) -> Qwen2.5 (text-only)`);
console.log(` Passed: ${passedCount}/${totalInvoices}`);
console.log(` Failed: ${failedCount}/${totalInvoices}`);
console.log(` Accuracy: ${accuracy.toFixed(1)}%`);