feat(paddleocr-vl): add structured HTML output and table parsing for PaddleOCR-VL, update API, tests, and README

2026-01-18 00:11:17 +00:00
parent 0d8a1ebac2
commit f0d88fcbe0
4 changed files with 486 additions and 82 deletions
--- a/image_support_files/paddleocr_vl_full_server.py
+++ b/image_support_files/paddleocr_vl_full_server.py
@@ -10,6 +10,7 @@ Provides REST API for document parsing using:

 import os
 import io
+import re
 import base64
 import logging
 import tempfile
@@ -261,23 +262,210 @@ def process_document(image: Image.Image) -> dict:


 def result_to_markdown(result: dict) -> str:
-    """Convert result to Markdown format"""
+    """Convert result to Markdown format with structural hints for LLM processing.
+
+    Blocks with empty content are skipped; the rest are joined by blank lines.
+    The first matching rule wins, so type rules override the position zones:
+    - "table" types get a **[TABLE]** line; "title" types become "# " headings
+    - "formula"/"math" types are wrapped in $$ ... $$
+    - "figure"/"chart" types are rendered as *[Figure: ...]*
+    - Other blocks with bbox[1] below 15% of the image height are bolded
+    - Other blocks with bbox[3] above 85% of it are preceded by a "---" rule
+    """
    lines = []
+    image_height = result.get("image_size", [0, 1000])[1]

    for block in result.get("blocks", []):
-        block_type = block.get("type", "text")
-        content = block.get("content", "")
+        block_type = block.get("type", "text").lower()
+        content = block.get("content", "").strip()
+        bbox = block.get("bbox", [])

-        if "table" in block_type.lower():
-            lines.append(f"\n{content}\n")
-        elif "formula" in block_type.lower():
+        if not content:
+            continue
+
+        # Zone from bbox[1]/bbox[3] (presumably y-top/y-bottom - confirm); no bbox -> 0
+        y_pos = bbox[1] if bbox and len(bbox) > 1 else 0
+        y_end = bbox[3] if bbox and len(bbox) > 3 else y_pos
+        is_header_zone = y_pos < image_height * 0.15
+        is_footer_zone = y_end > image_height * 0.85
+
+        # First matching branch wins: type checks come before the position zones
+        if "table" in block_type:
+            lines.append(f"\n**[TABLE]**\n{content}\n")
+        elif "title" in block_type:
+            lines.append(f"# {content}")
+        elif "formula" in block_type or "math" in block_type:
            lines.append(f"\n$$\n{content}\n$$\n")
+        elif "figure" in block_type or "chart" in block_type:
+            lines.append(f"*[Figure: {content}]*")
+        elif is_header_zone:
+            lines.append(f"**{content}**")
+        elif is_footer_zone:
+            lines.append(f"---\n{content}")
        else:
            lines.append(content)

    return "\n\n".join(lines)


+def parse_markdown_table(content: str) -> str:
+    """Convert table content to HTML table.
+
+    - Content containing <fcel> or <nl> is delegated to parse_paddleocr_table.
+    - Otherwise pipe-delimited rows are parsed; separator rows (|---|---|) are
+      skipped. The first data row becomes <thead> unless a separator row
+      precedes it; every other row goes into <tbody>.
+    - Empty cells are kept so columns stay aligned; cell text is HTML-escaped.
+    - Content without pipes or without data rows is returned inside <pre>.
+    """
+    from html import escape  # local import: keeps this function self-contained
+
+    content_stripped = content.strip()
+    # Check for PaddleOCR-VL table format (<fcel>, <lcel>, <ecel>, <nl>)
+    if '<fcel>' in content_stripped or '<nl>' in content_stripped:
+        return parse_paddleocr_table(content_stripped)
+
+    # Check if it looks like a markdown table (split() never returns [])
+    lines = content_stripped.split('\n')
+    if not any('|' in line for line in lines):
+        return f'<pre>{content}</pre>'
+
+    html_rows = []
+    is_header = True
+
+    for line in lines:
+        line = line.strip()
+        if '|' not in line:  # also covers blank lines
+            continue
+
+        # Skip separator rows (|---|---|)
+        if re.match(r'^[\|\s\-:]+$', line):
+            is_header = False
+            continue
+
+        # Drop only the outer border pipes so empty inner cells keep their column
+        inner = line[1:] if line.startswith('|') else line
+        inner = inner[:-1] if inner.endswith('|') else inner
+        cells = [escape(c.strip(), quote=False) for c in inner.split('|')]
+
+        if is_header:
+            row = '<tr>' + ''.join(f'<th>{c}</th>' for c in cells) + '</tr>'
+            html_rows.append(f'<thead>{row}</thead>')
+            is_header = False
+        else:
+            row = '<tr>' + ''.join(f'<td>{c}</td>' for c in cells) + '</tr>'
+            html_rows.append(row)
+
+    if html_rows:
+        # Wrap body rows in tbody
+        header = html_rows[0] if '<thead>' in html_rows[0] else ''
+        body_rows = [r for r in html_rows if '<thead>' not in r]
+        body = f'<tbody>{"".join(body_rows)}</tbody>' if body_rows else ''
+        return f'<table>{header}{body}</table>'
+
+    return f'<pre>{content}</pre>'
+
+
+def parse_paddleocr_table(content: str) -> str:
+    """Convert PaddleOCR-VL table format to HTML table.
+
+    Rows are separated by <nl>. Inside a row each <fcel>, <lcel>, <ecel>,
+    <ucel> or <xcel> marker starts a new cell holding the text that follows it.
+    Empty cells are kept so columns stay aligned, rows without any text are
+    skipped, and cell text is HTML-escaped. The first row with text becomes
+    <thead>; later rows go into <tbody>. Without such a row, the content is
+    returned inside <pre>.
+
+    Example input:
+    <fcel>Header1<lcel>Header2<nl><fcel>Value1<lcel>Value2<nl>
+    """
+    from html import escape  # local import: keeps this function self-contained
+
+    # NOTE(review): these look like OTSL tokens, where <lcel>/<ucel>/<xcel> mark
+    # merged cells; spans are not reproduced here, only column positions - confirm.
+    marker = re.compile(r'<fcel>|<lcel>|<ecel>|<ucel>|<xcel>')
+    html_rows = []
+    is_first_row = True
+
+    for row_content in re.split(r'<nl>', content):
+        row_content = row_content.strip()
+        if not row_content:
+            continue
+
+        parts = [part.strip() for part in marker.split(row_content)]
+        if not parts[0]:
+            # Nothing precedes the first marker, so that piece is not a cell
+            parts = parts[1:]
+        cells = [escape(part, quote=False) for part in parts]
+
+        # Skip rows that carry no text at all (e.g. only <ecel> markers)
+        if not any(cells):
+            continue
+
+        # First row is header
+        if is_first_row:
+            row_html = '<tr>' + ''.join(f'<th>{c}</th>' for c in cells) + '</tr>'
+            html_rows.append(f'<thead>{row_html}</thead>')
+            is_first_row = False
+        else:
+            row_html = '<tr>' + ''.join(f'<td>{c}</td>' for c in cells) + '</tr>'
+            html_rows.append(row_html)
+
+    # The header row, when present, is always the first entry of html_rows
+    if html_rows:
+        header = html_rows[0] if '<thead>' in html_rows[0] else ''
+        body_rows = [r for r in html_rows if '<thead>' not in r]
+        body = f'<tbody>{"".join(body_rows)}</tbody>' if body_rows else ''
+        return f'<table>{header}{body}</table>'
+
+    return f'<pre>{content}</pre>'
+
+
+def result_to_html(result: dict) -> str:
+    """Convert result to semantic HTML for optimal LLM processing.
+
+    Each non-empty block becomes one element with data-type / data-y attributes
+    (data-y is bbox[1] as a fraction of the image height). Table blocks go
+    through parse_markdown_table; all other text is HTML-escaped.
+    """
+    from html import escape  # local import: keeps this function self-contained
+
+    parts = ['<!DOCTYPE html><html><body>']
+    image_height = result.get("image_size", [0, 1000])[1]
+
+    for block in result.get("blocks", []):
+        block_type = block.get("type", "text").lower()
+        content = block.get("content", "").strip()
+        bbox = block.get("bbox", [])
+        if not content:
+            continue
+
+        # Relative top/bottom edges; zero height or missing bbox falls back to 0
+        has_y = bool(image_height and bbox and len(bbox) > 1)
+        y_pos = bbox[1] / image_height if has_y else 0
+        y_end = bbox[3] / image_height if has_y and len(bbox) > 3 else y_pos
+        data_attrs = f'data-type="{escape(block_type)}" data-y="{y_pos:.2f}"'
+        text = escape(content, quote=False)  # OCR text may contain <, > or &
+
+        if "table" in block_type:
+            table_html = parse_markdown_table(content)
+            parts.append(f'<section {data_attrs} class="table-region">{table_html}</section>')
+        elif "title" in block_type:
+            parts.append(f'<h1 {data_attrs}>{text}</h1>')
+        elif "formula" in block_type or "math" in block_type:
+            parts.append(f'<div {data_attrs} class="formula"><code>{text}</code></div>')
+        elif "figure" in block_type or "chart" in block_type:
+            parts.append(f'<figure {data_attrs}><figcaption>{text}</figcaption></figure>')
+        elif y_pos < 0.15:
+            parts.append(f'<header {data_attrs}><strong>{text}</strong></header>')
+        elif y_end > 0.85:  # bottom edge, same test as result_to_markdown
+            parts.append(f'<footer {data_attrs}>{text}</footer>')
+        else:
+            parts.append(f'<p {data_attrs}>{text}</p>')
+
+    return '\n'.join(parts + ['</body></html>'])
+
+
 # Request/Response models
 class ParseRequest(BaseModel):
    image: str  # base64 encoded image
@@ -331,7 +519,7 @@ async def health_check():
 async def supported_formats():
    """List supported output formats"""
    return {
-        "output_formats": ["json", "markdown"],
+        "output_formats": ["json", "markdown", "html"],
        "image_formats": ["PNG", "JPEG", "WebP", "BMP", "GIF", "TIFF"],
        "capabilities": [
            "Layout detection (PP-DocLayoutV2)",
@@ -356,6 +544,9 @@ async def parse_document_endpoint(request: ParseRequest):
        if request.output_format == "markdown":
            markdown = result_to_markdown(result)
            output = {"markdown": markdown}
+        elif request.output_format == "html":
+            html = result_to_html(result)
+            output = {"html": html}
        else:
            output = result

@@ -408,6 +599,8 @@ async def chat_completions(request: dict):

        if output_format == "markdown":
            content = result_to_markdown(result)
+        elif output_format == "html":
+            content = result_to_html(result)
        else:
            content = json.dumps(result, ensure_ascii=False, indent=2)