feat(paddleocr-vl): add structured HTML output and table parsing for PaddleOCR-VL, update API, tests, and README

This commit is contained in:
2026-01-18 00:11:17 +00:00
parent 0d8a1ebac2
commit f0d88fcbe0
4 changed files with 486 additions and 82 deletions

View File

@@ -10,6 +10,7 @@ Provides REST API for document parsing using:
import os
import io
import re
import base64
import logging
import tempfile
@@ -261,23 +262,210 @@ def process_document(image: Image.Image) -> dict:
def result_to_markdown(result: dict) -> str:
"""Convert result to Markdown format"""
"""Convert result to Markdown format with structural hints for LLM processing.
Adds positional and type-based formatting to help downstream LLMs
understand document structure:
- Tables are marked with **[TABLE]** prefix
- Header zone content (top 15%) is bolded
- Footer zone content (bottom 15%) is separated with horizontal rule
- Titles are formatted as # headers
- Figures/charts are marked with *[Figure: ...]*
"""
lines = []
image_height = result.get("image_size", [0, 1000])[1]
for block in result.get("blocks", []):
block_type = block.get("type", "text")
content = block.get("content", "")
block_type = block.get("type", "text").lower()
content = block.get("content", "").strip()
bbox = block.get("bbox", [])
if "table" in block_type.lower():
lines.append(f"\n{content}\n")
elif "formula" in block_type.lower():
if not content:
continue
# Determine position zone (top 15%, middle, bottom 15%)
y_pos = bbox[1] if bbox and len(bbox) > 1 else 0
y_end = bbox[3] if bbox and len(bbox) > 3 else y_pos
is_header_zone = y_pos < image_height * 0.15
is_footer_zone = y_end > image_height * 0.85
# Format based on type and position
if "table" in block_type:
lines.append(f"\n**[TABLE]**\n{content}\n")
elif "title" in block_type:
lines.append(f"# {content}")
elif "formula" in block_type or "math" in block_type:
lines.append(f"\n$$\n{content}\n$$\n")
elif "figure" in block_type or "chart" in block_type:
lines.append(f"*[Figure: {content}]*")
elif is_header_zone:
lines.append(f"**{content}**")
elif is_footer_zone:
lines.append(f"---\n{content}")
else:
lines.append(content)
return "\n\n".join(lines)
def parse_markdown_table(content: str) -> str:
"""Convert table content to HTML table.
Handles:
- PaddleOCR-VL format: <fcel>cell<lcel>cell<nl> (detected by <fcel> tags)
- Pipe-delimited tables: | Header | Header |
- Separator rows: |---|---|
- Returns HTML <table> structure
"""
content_stripped = content.strip()
# Check for PaddleOCR-VL table format (<fcel>, <lcel>, <ecel>, <nl>)
if '<fcel>' in content_stripped or '<nl>' in content_stripped:
return parse_paddleocr_table(content_stripped)
lines = content_stripped.split('\n')
if not lines:
return f'<pre>{content}</pre>'
# Check if it looks like a markdown table
if not any('|' in line for line in lines):
return f'<pre>{content}</pre>'
html_rows = []
is_header = True
for line in lines:
line = line.strip()
if not line or line.startswith('|') == False and '|' not in line:
continue
# Skip separator rows (|---|---|)
if re.match(r'^[\|\s\-:]+$', line):
is_header = False
continue
# Parse cells
cells = [c.strip() for c in line.split('|')]
cells = [c for c in cells if c] # Remove empty from edges
if is_header:
row = '<tr>' + ''.join(f'<th>{c}</th>' for c in cells) + '</tr>'
html_rows.append(f'<thead>{row}</thead>')
is_header = False
else:
row = '<tr>' + ''.join(f'<td>{c}</td>' for c in cells) + '</tr>'
html_rows.append(row)
if html_rows:
# Wrap body rows in tbody
header = html_rows[0] if '<thead>' in html_rows[0] else ''
body_rows = [r for r in html_rows if '<thead>' not in r]
body = f'<tbody>{"".join(body_rows)}</tbody>' if body_rows else ''
return f'<table>{header}{body}</table>'
return f'<pre>{content}</pre>'
def parse_paddleocr_table(content: str) -> str:
"""Convert PaddleOCR-VL table format to HTML table.
PaddleOCR-VL uses:
- <fcel> = first cell in a row
- <lcel> = subsequent cells
- <ecel> = empty cell
- <nl> = row separator (newline)
Example input:
<fcel>Header1<lcel>Header2<nl><fcel>Value1<lcel>Value2<nl>
"""
# Split into rows by <nl>
rows_raw = re.split(r'<nl>', content)
html_rows = []
is_first_row = True
for row_content in rows_raw:
row_content = row_content.strip()
if not row_content:
continue
# Extract cells: split by <fcel>, <lcel>, or <ecel>
# Each cell is the text between these markers
cells = []
# Pattern to match cell markers and capture content
# Content is everything between markers
parts = re.split(r'<fcel>|<lcel>|<ecel>', row_content)
for part in parts:
part = part.strip()
if part:
cells.append(part)
if not cells:
continue
# First row is header
if is_first_row:
row_html = '<tr>' + ''.join(f'<th>{c}</th>' for c in cells) + '</tr>'
html_rows.append(f'<thead>{row_html}</thead>')
is_first_row = False
else:
row_html = '<tr>' + ''.join(f'<td>{c}</td>' for c in cells) + '</tr>'
html_rows.append(row_html)
if html_rows:
header = html_rows[0] if '<thead>' in html_rows[0] else ''
body_rows = [r for r in html_rows if '<thead>' not in r]
body = f'<tbody>{"".join(body_rows)}</tbody>' if body_rows else ''
return f'<table>{header}{body}</table>'
return f'<pre>{content}</pre>'
def result_to_html(result: dict) -> str:
"""Convert result to semantic HTML for optimal LLM processing.
Uses semantic HTML5 tags with position metadata as data-* attributes.
Markdown tables are converted to proper HTML <table> tags for
unambiguous parsing by downstream LLMs.
"""
parts = []
image_height = result.get("image_size", [0, 1000])[1]
parts.append('<!DOCTYPE html><html><body>')
for block in result.get("blocks", []):
block_type = block.get("type", "text").lower()
content = block.get("content", "").strip()
bbox = block.get("bbox", [])
if not content:
continue
# Position metadata
y_pos = bbox[1] / image_height if bbox and len(bbox) > 1 else 0
data_attrs = f'data-type="{block_type}" data-y="{y_pos:.2f}"'
# Format based on type
if "table" in block_type:
table_html = parse_markdown_table(content)
parts.append(f'<section {data_attrs} class="table-region">{table_html}</section>')
elif "title" in block_type:
parts.append(f'<h1 {data_attrs}>{content}</h1>')
elif "formula" in block_type or "math" in block_type:
parts.append(f'<div {data_attrs} class="formula"><code>{content}</code></div>')
elif "figure" in block_type or "chart" in block_type:
parts.append(f'<figure {data_attrs}><figcaption>{content}</figcaption></figure>')
elif y_pos < 0.15:
parts.append(f'<header {data_attrs}><strong>{content}</strong></header>')
elif y_pos > 0.85:
parts.append(f'<footer {data_attrs}>{content}</footer>')
else:
parts.append(f'<p {data_attrs}>{content}</p>')
parts.append('</body></html>')
return '\n'.join(parts)
# Request/Response models
class ParseRequest(BaseModel):
image: str # base64 encoded image
@@ -331,7 +519,7 @@ async def health_check():
async def supported_formats():
"""List supported output formats"""
return {
"output_formats": ["json", "markdown"],
"output_formats": ["json", "markdown", "html"],
"image_formats": ["PNG", "JPEG", "WebP", "BMP", "GIF", "TIFF"],
"capabilities": [
"Layout detection (PP-DocLayoutV2)",
@@ -356,6 +544,9 @@ async def parse_document_endpoint(request: ParseRequest):
if request.output_format == "markdown":
markdown = result_to_markdown(result)
output = {"markdown": markdown}
elif request.output_format == "html":
html = result_to_html(result)
output = {"html": html}
else:
output = result
@@ -408,6 +599,8 @@ async def chat_completions(request: dict):
if output_format == "markdown":
content = result_to_markdown(result)
elif output_format == "html":
content = result_to_html(result)
else:
content = json.dumps(result, ensure_ascii=False, indent=2)