feat(paddleocr-vl): add structured HTML output and table parsing for PaddleOCR-VL, update API, tests, and README
This commit is contained in:
@@ -10,6 +10,7 @@ Provides REST API for document parsing using:
|
||||
|
||||
import os
|
||||
import io
|
||||
import re
|
||||
import base64
|
||||
import logging
|
||||
import tempfile
|
||||
@@ -261,23 +262,210 @@ def process_document(image: Image.Image) -> dict:
|
||||
|
||||
|
||||
def result_to_markdown(result: dict) -> str:
    """Convert a parse result to Markdown with structural hints for LLMs.

    Each block in ``result["blocks"]`` is rendered according to its type
    and, for plain text, its vertical position on the page:

    - Tables are marked with a ``**[TABLE]**`` prefix
    - Titles are formatted as ``#`` headers
    - Formulas are wrapped in ``$$`` fences
    - Figures/charts are marked with ``*[Figure: ...]*``
    - Other content starting in the top 15% of the page is bolded
    - Other content ending in the bottom 15% is preceded by a horizontal rule

    Args:
        result: Mapping with an optional ``"image_size"`` entry (the height is
            read from index 1; 1000 is assumed when the entry is absent) and
            an optional ``"blocks"`` list of dicts carrying ``"type"``,
            ``"content"`` and ``"bbox"`` (index 1 is read as the top edge and
            index 3 as the bottom edge).

    Returns:
        The rendered blocks joined by blank lines. Blocks whose content is
        empty after stripping are skipped.
    """
    # A short or non-numeric "image_size" must not crash rendering; a height
    # of 0 disables zone detection instead of tagging every block as footer.
    size = result.get("image_size") or [0, 1000]
    try:
        image_height = float(size[1])
    except (IndexError, TypeError, ValueError):
        image_height = 0.0

    lines = []
    for block in result.get("blocks", []):
        block_type = (block.get("type") or "text").lower()
        content = (block.get("content") or "").strip()
        if not content:
            continue

        # Position zones are only meaningful when both the page height and the
        # block's box are known; an unknown position is treated as body text
        # rather than defaulting to y=0 (which would look like a header).
        bbox = block.get("bbox") or []
        has_position = image_height > 0 and len(bbox) > 1
        is_header_zone = False
        is_footer_zone = False
        if has_position:
            y_top = bbox[1]
            y_bottom = bbox[3] if len(bbox) > 3 else y_top
            is_header_zone = y_top < image_height * 0.15
            is_footer_zone = y_bottom > image_height * 0.85

        # Block type takes precedence over position.
        if "table" in block_type:
            lines.append(f"\n**[TABLE]**\n{content}\n")
        elif "title" in block_type:
            lines.append(f"# {content}")
        elif "formula" in block_type or "math" in block_type:
            lines.append(f"\n$$\n{content}\n$$\n")
        elif "figure" in block_type or "chart" in block_type:
            lines.append(f"*[Figure: {content}]*")
        elif is_header_zone:
            lines.append(f"**{content}**")
        elif is_footer_zone:
            lines.append(f"---\n{content}")
        else:
            lines.append(content)

    return "\n\n".join(lines)
|
||||
|
||||
|
||||
def parse_markdown_table(content: str) -> str:
    """Convert table content to an HTML ``<table>``.

    Handles:
    - PaddleOCR-VL format (detected by ``<fcel>`` or ``<nl>`` tags), which is
      delegated to ``parse_paddleocr_table``
    - Pipe-delimited tables: ``| Header | Header |``
    - Separator rows: ``|---|---|``

    The first data row is rendered as ``<thead>`` unless a separator row
    precedes it; all remaining rows go into ``<tbody>``. Empty cells are kept
    so that columns stay aligned, and cell text is HTML-escaped.

    Args:
        content: Raw table text from the OCR result.

    Returns:
        An HTML ``<table>`` string, or the escaped input wrapped in ``<pre>``
        when no pipe-delimited row is found.
    """
    from html import escape  # function-scope: stdlib only, no new file-level import

    content_stripped = content.strip()

    # Check for PaddleOCR-VL table format (<fcel>, <lcel>, <ecel>, <nl>)
    if '<fcel>' in content_stripped or '<nl>' in content_stripped:
        return parse_paddleocr_table(content_stripped)

    header_cells = None
    body_rows = []
    expect_header = True

    for raw_line in content_stripped.split('\n'):
        line = raw_line.strip()
        if '|' not in line:
            continue

        # Separator rows (|---|:---:|) consist only of pipes, dashes, colons
        # and whitespace. Requiring a dash keeps rows of empty cells ("| | |")
        # from being mistaken for a separator.
        if '-' in line and re.match(r'^[\|\s\-:]+$', line):
            expect_header = False
            continue

        cells = [cell.strip() for cell in line.split('|')]
        # Only the artefacts of the outer pipes are dropped; empty cells in
        # the middle of a row are real cells and must keep their column.
        if cells and line.startswith('|'):
            cells = cells[1:]
        if cells and line.endswith('|'):
            cells = cells[:-1]
        if not cells:
            continue

        if expect_header:
            header_cells = cells
            expect_header = False
        else:
            body_rows.append(cells)

    if header_cells is None and not body_rows:
        return f'<pre>{escape(content, quote=False)}</pre>'

    header = ''
    if header_cells is not None:
        header_row = ''.join(
            f'<th>{escape(cell, quote=False)}</th>' for cell in header_cells
        )
        header = f'<thead><tr>{header_row}</tr></thead>'

    body = ''
    if body_rows:
        rendered = ''.join(
            '<tr>'
            + ''.join(f'<td>{escape(cell, quote=False)}</td>' for cell in row)
            + '</tr>'
            for row in body_rows
        )
        body = f'<tbody>{rendered}</tbody>'

    return f'<table>{header}{body}</table>'
|
||||
|
||||
|
||||
def parse_paddleocr_table(content: str) -> str:
    """Convert PaddleOCR-VL table markup to an HTML ``<table>``.

    The markup is interpreted as:
    - ``<fcel>``, ``<lcel>``, ``<ecel>`` each start a new cell; the text up to
      the next marker is that cell's content (possibly empty)
    - ``<nl>`` ends a row

    Example input:
        <fcel>Header1<lcel>Header2<nl><fcel>Value1<lcel>Value2<nl>

    The first non-empty row becomes ``<thead>``; the rest go into ``<tbody>``.
    Empty cells are kept so columns stay aligned, and cell text is
    HTML-escaped.

    NOTE(review): in OTSL-style notation ``<lcel>`` conventionally marks a
    cell merged with its left neighbour rather than an ordinary cell -
    confirm against the model's output spec before adding colspan handling.

    Args:
        content: Raw table markup from the OCR result.

    Returns:
        An HTML ``<table>`` string, or the escaped input wrapped in ``<pre>``
        when no row contains a cell.
    """
    from html import escape  # function-scope: stdlib only, no new file-level import

    rows = []
    for row_content in content.split('<nl>'):
        row_content = row_content.strip()
        if not row_content:
            continue

        # parts[0] is any text before the first marker; every later element
        # is the text following one marker, i.e. exactly one cell.
        parts = re.split(r'<fcel>|<lcel>|<ecel>', row_content)
        leading = parts[0].strip()
        cells = [leading] if leading else []
        cells.extend(part.strip() for part in parts[1:])
        if cells:
            rows.append(cells)

    if not rows:
        return f'<pre>{escape(content, quote=False)}</pre>'

    # First row is the header
    header_row = ''.join(f'<th>{escape(cell, quote=False)}</th>' for cell in rows[0])
    header = f'<thead><tr>{header_row}</tr></thead>'

    body = ''
    if len(rows) > 1:
        rendered = ''.join(
            '<tr>'
            + ''.join(f'<td>{escape(cell, quote=False)}</td>' for cell in row)
            + '</tr>'
            for row in rows[1:]
        )
        body = f'<tbody>{rendered}</tbody>'

    return f'<table>{header}{body}</table>'
|
||||
|
||||
|
||||
def result_to_html(result: dict) -> str:
    """Convert a parse result to semantic HTML for LLM processing.

    Each block becomes one element carrying ``data-type`` (the lower-cased
    block type) and ``data-y`` (top edge divided by image height, two
    decimals; ``0.00`` when the position is unknown). Table blocks are passed
    through ``parse_markdown_table`` so they become real ``<table>`` markup.
    All other text is HTML-escaped before being embedded.

    Args:
        result: Mapping with an optional ``"image_size"`` entry (the height is
            read from index 1; 1000 is assumed when the entry is absent) and
            an optional ``"blocks"`` list of dicts carrying ``"type"``,
            ``"content"`` and ``"bbox"`` (index 1 is read as the top edge).

    Returns:
        A complete HTML document, one block per line. Blocks whose content is
        empty after stripping are skipped.
    """
    from html import escape  # function-scope: stdlib only, no new file-level import

    # A short or non-numeric "image_size" must not crash rendering, and a
    # height of 0 must not reach the division below.
    size = result.get("image_size") or [0, 1000]
    try:
        image_height = float(size[1])
    except (IndexError, TypeError, ValueError):
        image_height = 0.0

    parts = ['<!DOCTYPE html><html><body>']

    for block in result.get("blocks", []):
        block_type = (block.get("type") or "text").lower()
        content = (block.get("content") or "").strip()
        if not content:
            continue

        # Position metadata. Without a usable box or page height the block is
        # rendered as body text instead of defaulting into the header zone.
        bbox = block.get("bbox") or []
        has_position = image_height > 0 and len(bbox) > 1
        y_pos = bbox[1] / image_height if has_position else 0.0
        data_attrs = f'data-type="{escape(block_type)}" data-y="{y_pos:.2f}"'
        text = escape(content, quote=False)

        # Block type takes precedence over position.
        if "table" in block_type:
            # The table parser receives the raw content and produces the markup.
            table_html = parse_markdown_table(content)
            parts.append(f'<section {data_attrs} class="table-region">{table_html}</section>')
        elif "title" in block_type:
            parts.append(f'<h1 {data_attrs}>{text}</h1>')
        elif "formula" in block_type or "math" in block_type:
            parts.append(f'<div {data_attrs} class="formula"><code>{text}</code></div>')
        elif "figure" in block_type or "chart" in block_type:
            parts.append(f'<figure {data_attrs}><figcaption>{text}</figcaption></figure>')
        elif has_position and y_pos < 0.15:
            parts.append(f'<header {data_attrs}><strong>{text}</strong></header>')
        # NOTE(review): the footer test uses the block's top edge, whereas
        # result_to_markdown compares the bottom edge - confirm which is intended.
        elif has_position and y_pos > 0.85:
            parts.append(f'<footer {data_attrs}>{text}</footer>')
        else:
            parts.append(f'<p {data_attrs}>{text}</p>')

    parts.append('</body></html>')
    return '\n'.join(parts)
|
||||
|
||||
|
||||
# Request/Response models
|
||||
class ParseRequest(BaseModel):
|
||||
image: str # base64 encoded image
|
||||
@@ -331,7 +519,7 @@ async def health_check():
|
||||
async def supported_formats():
|
||||
"""List supported output formats"""
|
||||
return {
|
||||
"output_formats": ["json", "markdown"],
|
||||
"output_formats": ["json", "markdown", "html"],
|
||||
"image_formats": ["PNG", "JPEG", "WebP", "BMP", "GIF", "TIFF"],
|
||||
"capabilities": [
|
||||
"Layout detection (PP-DocLayoutV2)",
|
||||
@@ -356,6 +544,9 @@ async def parse_document_endpoint(request: ParseRequest):
|
||||
if request.output_format == "markdown":
|
||||
markdown = result_to_markdown(result)
|
||||
output = {"markdown": markdown}
|
||||
elif request.output_format == "html":
|
||||
html = result_to_html(result)
|
||||
output = {"html": html}
|
||||
else:
|
||||
output = result
|
||||
|
||||
@@ -408,6 +599,8 @@ async def chat_completions(request: dict):
|
||||
|
||||
if output_format == "markdown":
|
||||
content = result_to_markdown(result)
|
||||
elif output_format == "html":
|
||||
content = result_to_html(result)
|
||||
else:
|
||||
content = json.dumps(result, ensure_ascii=False, indent=2)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user