feat(paddleocr-vl): add structured HTML output and table parsing for PaddleOCR-VL, update API, tests, and README

2026-01-18 00:11:17 +00:00
parent 0d8a1ebac2
commit f0d88fcbe0
4 changed files with 486 additions and 82 deletions
--- a/test/test.invoices.paddleocr-vl.ts
+++ b/test/test.invoices.paddleocr-vl.ts
@@ -4,11 +4,13 @@
 * This tests the complete PaddleOCR-VL pipeline:
 *   1. PP-DocLayoutV2 for layout detection
 *   2. PaddleOCR-VL for recognition
- *   3. Structured Markdown output
- *   4. MiniCPM extracts invoice fields from structured Markdown
+ *   3. Structured HTML output (semantic tags with proper tables)
+ *   4. Qwen2.5 extracts invoice fields from structured HTML
 *
- * The structured Markdown has proper tables and formatting,
- * making it much easier for MiniCPM to extract invoice data.
+ * HTML output is used instead of Markdown because:
+ * - <table> tags are unambiguous (no parser variations)
+ * - LLMs are heavily trained on web/HTML data
+ * - Semantic tags (header, footer, section) provide clear structure
 */
 import { tap, expect } from '@git.zone/tstest/tapbundle';
 import * as fs from 'fs';
@@ -61,7 +63,7 @@ function convertPdfToImages(pdfPath: string): string[] {
 }

 /**
- * Parse document using PaddleOCR-VL Full Pipeline (returns structured Markdown)
+ * Parse document using PaddleOCR-VL Full Pipeline (returns structured HTML)
 */
 async function parseDocument(imageBase64: string): Promise<string> {
  const response = await fetch(`${PADDLEOCR_VL_URL}/parse`, {
@@ -69,7 +71,7 @@ async function parseDocument(imageBase64: string): Promise<string> {
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      image: imageBase64,
-      output_format: 'markdown',
+      output_format: 'html',
    }),
  });

@@ -84,18 +86,25 @@ async function parseDocument(imageBase64: string): Promise<string> {
    throw new Error(`PaddleOCR-VL error: ${data.error}`);
  }

-  return data.result?.markdown || '';
+  return data.result?.html || '';
 }

 /**
- * Extract invoice fields from structured Markdown using Qwen2.5 (text-only model)
+ * Extract invoice fields from structured HTML using Qwen2.5 (text-only model)
 */
-async function extractInvoiceFromMarkdown(markdown: string): Promise<IInvoice> {
-  // Truncate if too long
-  const truncated = markdown.length > 12000 ? markdown.slice(0, 12000) : markdown;
-  console.log(`    [Extract] Processing ${truncated.length} chars of Markdown`);
+async function extractInvoiceFromHtml(html: string): Promise<IInvoice> {
+  // Truncate if too long (HTML markup adds overhead, so the limit is generous: 16000 chars)
+  const truncated = html.length > 16000 ? html.slice(0, 16000) : html;
+  console.log(`    [Extract] Processing ${truncated.length} chars of HTML`);

-  const prompt = `You are an invoice data extractor. Extract the following fields from this OCR text and return ONLY a valid JSON object.
+  const prompt = `You are an invoice data extractor. Extract the following fields from this HTML document (OCR output with semantic structure) and return ONLY a valid JSON object.
+
+The HTML uses semantic tags:
+- <table> with <thead>/<tbody> for structured tables (invoice line items, totals)
+- <header> for document header (company info, invoice number)
+- <footer> for document footer (payment terms, legal text)
+- <section class="table-region"> for table regions
+- data-type and data-y attributes indicate block type and vertical position

 Required fields:
 - invoice_number: The invoice/receipt/document number
@@ -115,8 +124,9 @@ Rules:
 - Use 0 for missing numeric fields
 - Convert dates to YYYY-MM-DD format (e.g., "28-JAN-2022" becomes "2022-01-28")
 - Extract numbers without currency symbols
+- Look for totals in <table> sections, especially rows with "Total", "Amount Due", "Grand Total"

-OCR Text:
+HTML Document:
 ${truncated}

 JSON:`;
@@ -195,12 +205,12 @@ JSON:`;
 * Single extraction pass: Parse with PaddleOCR-VL Full, extract with Qwen2.5 (text-only)
 */
 async function extractOnce(images: string[], passNum: number): Promise<IInvoice> {
-  // Parse document with full pipeline (PaddleOCR-VL)
-  const markdown = await parseDocument(images[0]);
-  console.log(`    [Parse] Got ${markdown.split('\n').length} lines of Markdown`);
+  // Parse only the first page image with the full pipeline (PaddleOCR-VL) -> returns HTML
+  const html = await parseDocument(images[0]);
+  console.log(`    [Parse] Got ${html.split('\n').length} lines of HTML`);

-  // Extract invoice fields from Markdown using text-only model (no images)
-  return extractInvoiceFromMarkdown(markdown);
+  // Extract invoice fields from HTML using text-only model (no images)
+  return extractInvoiceFromHtml(html);
 }

 /**
@@ -438,7 +448,7 @@ tap.test('summary', async () => {
  console.log(`\n======================================================`);
  console.log(`   Invoice Extraction Summary (PaddleOCR-VL Full)`);
  console.log(`======================================================`);
-  console.log(`  Method:    PaddleOCR-VL Full Pipeline -> Qwen2.5 (text-only)`);
+  console.log(`  Method:    PaddleOCR-VL Full Pipeline (HTML) -> Qwen2.5 (text-only)`);
  console.log(`  Passed:    ${passedCount}/${totalInvoices}`);
  console.log(`  Failed:    ${failedCount}/${totalInvoices}`);
  console.log(`  Accuracy:  ${accuracy.toFixed(1)}%`);