Files
LegacyHUB/app/ingestion/docling_extractor.py
Vadim Malanov 7f72171572 chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 16:41:50 +03:00

385 lines
11 KiB
Python

"""Docling structured extraction.
Docling produces a hierarchical document model with reading order, layout, tables
and figures. We export both Markdown and a JSON representation, then walk the
JSON to emit normalized blocks (title, heading, paragraph, list, table caption,
figure caption) for downstream chunking.
The extractor is intentionally defensive: Docling's exact Python API has
shifted across releases. We probe for the safest exporter methods and fall
back to ``str(document)`` only as a last resort.
"""
from __future__ import annotations
import json
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from app.config import settings
from app.logging_config import get_logger
# Module-level logger; this module calls it with an event name plus keyword fields.
logger = get_logger(__name__)
@dataclass
class ExtractedBlock:
    """One normalized text block built from a Docling text item."""

    # Page the block is attributed to, as resolved by ``_page_of``.
    page_number: int
    # Normalized block type, i.e. a value of ``_DOCLING_LABEL_TO_BLOCK``.
    block_type: str
    # Text of the block; ``_walk_blocks`` strips surrounding whitespace.
    text: str
    # The source item's ``self_ref`` or ``id`` when one is present.
    block_id: str | None = None
    # Additional metadata; ``_walk_blocks`` stores the lowercased source
    # label under the ``"label"`` key.
    extra: dict[str, Any] = field(default_factory=dict)
@dataclass
class ExtractedTable:
    """One table taken from the ``tables`` list of the exported payload."""

    # Page the table is attributed to, as resolved by ``_page_of``.
    page_number: int
    # Position of the table inside the payload's ``tables`` list.
    table_index: int
    # Markdown rendering from ``_table_markdown``; may be an empty string.
    markdown: str
    # CSV rendering from ``_table_csv``; ``None`` when no grid was found.
    csv_text: str | None = None
    # The raw table dict exactly as it appears in the payload.
    json_data: dict[str, Any] | None = None
    # The source item's ``self_ref`` or ``id`` when one is present.
    block_id: str | None = None
@dataclass
class ExtractedFigure:
    """One figure taken from the ``pictures``/``figures`` list of the payload."""

    # Page the figure is attributed to, as resolved by ``_page_of``.
    page_number: int
    # Position of the figure inside the payload's figure list.
    figure_index: int
    # Caption text, or ``None`` when the figure has no usable caption.
    caption: str | None
    # The source item's ``self_ref`` or ``id`` when one is present.
    block_id: str | None = None
    # Raw image data; ``_walk_figures`` in this module leaves it as ``None``.
    image_bytes: bytes | None = None
    # File extension that goes with ``image_bytes``.
    image_ext: str = "png"
@dataclass
class ExtractedPage:
    """Per-page summary assembled by ``_walk_pages``."""

    # Page number this summary describes.
    page_number: int
    # Texts of the page's blocks joined with blank lines.
    text: str
    # True when at least one extracted table is attributed to this page.
    has_tables: bool = False
    # True when at least one extracted figure is attributed to this page.
    has_figures: bool = False
    # Not set by ``_walk_pages``; keeps its default unless a caller fills it.
    has_handwriting: bool = False
    # Not set by ``_walk_pages``; keeps its default unless a caller fills it.
    ocr_confidence: float | None = None
@dataclass
class ExtractionResult:
    """Everything ``extract`` returns for a single document."""

    # Markdown export of the whole document.
    markdown: str
    # Dict export of the document; the lists below are derived from it.
    json_payload: dict[str, Any]
    # Normalized text blocks, in the order they appear in the payload.
    blocks: list[ExtractedBlock]
    # Tables found in the payload.
    tables: list[ExtractedTable]
    # Figures found in the payload.
    figures: list[ExtractedFigure]
    # One summary per page, sorted by page number.
    pages: list[ExtractedPage]
def extract(pdf_path: Path) -> ExtractionResult:
    """Run Docling on ``pdf_path`` and return a normalized result.

    Args:
        pdf_path: Path of the PDF to convert; it is handed to Docling as a
            string.

    Returns:
        An ``ExtractionResult`` with the Markdown export, the dict export and
        the blocks, tables, figures and pages derived from that dict.

    Errors raised by Docling's converter itself are not caught here.
    """
    # Docling is imported inside the function, so it is only needed when an
    # extraction actually runs.
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    pipeline_options = PdfPipelineOptions()
    # We let OCRmyPDF do the heavy OCR; Docling OCR is opt-in.
    pipeline_options.do_ocr = settings.docling_ocr_enabled
    pipeline_options.do_table_structure = True

    # The next two options are best-effort: a failure to set them must not
    # abort extraction, but it is logged instead of being dropped silently.
    try:
        pipeline_options.table_structure_options.do_cell_matching = True
    except Exception as exc:  # noqa: BLE001 - older docling versions lack this
        logger.debug(
            "docling.option_unsupported",
            option="table_structure_options.do_cell_matching",
            error=str(exc),
        )
    try:
        pipeline_options.generate_page_images = True
    except Exception as exc:  # noqa: BLE001
        logger.debug(
            "docling.option_unsupported",
            option="generate_page_images",
            error=str(exc),
        )

    converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
    )
    logger.info("docling.start", input=str(pdf_path))
    conv = converter.convert(str(pdf_path))
    doc = conv.document

    # Both exports come from the same document; every list below is derived
    # from the dict export.
    markdown = _safe_export_markdown(doc)
    json_payload = _safe_export_dict(doc)
    blocks = _walk_blocks(json_payload)
    tables = _walk_tables(doc, json_payload)
    figures = _walk_figures(doc, json_payload)
    pages = _walk_pages(json_payload, blocks, tables, figures)

    logger.info(
        "docling.done",
        pages=len(pages),
        blocks=len(blocks),
        tables=len(tables),
        figures=len(figures),
    )
    return ExtractionResult(
        markdown=markdown,
        json_payload=json_payload,
        blocks=blocks,
        tables=tables,
        figures=figures,
        pages=pages,
    )
# ---------------- Internal helpers ----------------
def _safe_export_markdown(doc: Any) -> str:
for attr in ("export_to_markdown", "to_markdown"):
fn = getattr(doc, attr, None)
if callable(fn):
try:
return fn()
except Exception: # noqa: BLE001
continue
return str(doc)
def _safe_export_dict(doc: Any) -> dict[str, Any]:
for attr in ("export_to_dict", "model_dump", "dict"):
fn = getattr(doc, attr, None)
if callable(fn):
try:
data = fn()
if isinstance(data, dict):
return data
except Exception: # noqa: BLE001
continue
# Last resort: serialize via JSON round-trip
try:
return json.loads(getattr(doc, "model_dump_json", lambda: "{}")())
except Exception: # noqa: BLE001
return {}
# Maps a lowercased Docling item label to the normalized block type stored on
# ``ExtractedBlock.block_type``. ``_walk_blocks`` falls back to "paragraph"
# for any label that is not listed here.
_DOCLING_LABEL_TO_BLOCK: dict[str, str] = {
    "title": "title",
    # Underscore and hyphen spellings are both listed.
    "section_header": "heading",
    "section-header": "heading",
    "subtitle": "heading",
    # Page headers are grouped with headings as well.
    "page_header": "heading",
    "header": "heading",
    "list_item": "list",
    "list-item": "list",
    "list": "list",
    "paragraph": "paragraph",
    "text": "paragraph",
    # Captions and figure labels both become figure captions.
    "caption": "figure_caption",
    "figure": "figure_caption",
    "table": "table",
    # Footnotes are folded into ordinary paragraphs.
    "footnote": "paragraph",
}
def _walk_blocks(payload: dict[str, Any]) -> list[ExtractedBlock]:
"""Flatten Docling's text items into ordered blocks per page."""
blocks: list[ExtractedBlock] = []
items = (
payload.get("texts")
or payload.get("text_items")
or payload.get("body", {}).get("text_items", [])
or []
)
if not isinstance(items, list):
return blocks
for item in items:
if not isinstance(item, dict):
continue
label = (item.get("label") or item.get("category") or "paragraph").lower()
text = (item.get("text") or "").strip()
if not text:
continue
block_type = _DOCLING_LABEL_TO_BLOCK.get(label, "paragraph")
page = _page_of(item)
blocks.append(
ExtractedBlock(
page_number=page,
block_type=block_type,
text=text,
block_id=item.get("self_ref") or item.get("id"),
extra={"label": label},
)
)
return blocks
def _walk_tables(doc: Any, payload: dict[str, Any]) -> list[ExtractedTable]:
tables: list[ExtractedTable] = []
raw_tables = payload.get("tables") or []
for idx, t in enumerate(raw_tables):
if not isinstance(t, dict):
continue
page = _page_of(t)
md = _table_markdown(doc, t, idx)
csv_text = _table_csv(t)
tables.append(
ExtractedTable(
page_number=page,
table_index=idx,
markdown=md,
csv_text=csv_text,
json_data=t,
block_id=t.get("self_ref") or t.get("id"),
)
)
return tables
def _walk_figures(doc: Any, payload: dict[str, Any]) -> list[ExtractedFigure]:
figures: list[ExtractedFigure] = []
raw_figures = payload.get("pictures") or payload.get("figures") or []
for idx, f in enumerate(raw_figures):
if not isinstance(f, dict):
continue
page = _page_of(f)
caption = (f.get("caption") or "").strip() or None
figures.append(
ExtractedFigure(
page_number=page,
figure_index=idx,
caption=caption,
block_id=f.get("self_ref") or f.get("id"),
)
)
return figures
def _walk_pages(
    payload: dict[str, Any],
    blocks: list[ExtractedBlock],
    tables: list[ExtractedTable],
    figures: list[ExtractedFigure],
) -> list[ExtractedPage]:
    """Assemble one ``ExtractedPage`` per known page number.

    Page numbers come from ``payload["pages"]`` (dict keys convertible to
    ``int``, or list entries carrying an integer ``page_no``/``page``/
    ``number``) together with the pages referenced by blocks, tables and
    figures. Page 0 is dropped, and page 1 is used when nothing else is known.

    Args:
        payload: Dict export of the document.
        blocks: Extracted text blocks; their texts form each page's text.
        tables: Extracted tables, used for ``has_tables``.
        figures: Extracted figures, used for ``has_figures``.

    Returns:
        Page summaries sorted by page number.
    """
    declared = payload.get("pages") or {}
    known: set[int] = set()

    if isinstance(declared, dict):
        for key in declared:
            try:
                known.add(int(key))
            except (ValueError, TypeError):
                continue
    elif isinstance(declared, list):
        for entry in declared:
            if not isinstance(entry, dict):
                continue
            number = entry.get("page_no") or entry.get("page") or entry.get("number")
            if isinstance(number, int):
                known.add(number)

    table_pages = {table.page_number for table in tables}
    figure_pages = {figure.page_number for figure in figures}

    known.update(block.page_number for block in blocks)
    known |= table_pages
    known |= figure_pages
    known.discard(0)
    if not known:
        known = {1}

    # Group block texts by page, keeping the order of ``blocks``.
    texts_by_page: dict[int, list[str]] = {}
    for block in blocks:
        texts_by_page.setdefault(block.page_number, []).append(block.text)

    pages: list[ExtractedPage] = []
    for number in sorted(known):
        pages.append(
            ExtractedPage(
                page_number=number,
                text="\n\n".join(texts_by_page.get(number, [])),
                has_tables=number in table_pages,
                has_figures=number in figure_pages,
            )
        )
    return pages
def _page_of(item: dict[str, Any]) -> int:
prov = item.get("prov") or item.get("provenance")
if isinstance(prov, list) and prov:
first = prov[0]
if isinstance(first, dict):
pn = first.get("page_no") or first.get("page") or first.get("page_number")
if isinstance(pn, int):
return pn
pn = item.get("page_no") or item.get("page") or item.get("page_number")
if isinstance(pn, int):
return pn
return 1
def _table_markdown(doc: Any, raw: dict[str, Any], idx: int) -> str:
# Try Docling's own export first (per-table).
try:
export = getattr(doc, "export_table_to_markdown", None)
if callable(export):
return export(idx)
except Exception: # noqa: BLE001
pass
grid = raw.get("data") or raw.get("table_cells") or raw.get("grid")
if isinstance(grid, list) and grid and isinstance(grid[0], list):
return _grid_to_markdown(grid)
cells = raw.get("table_cells")
if isinstance(cells, list):
return _cells_to_markdown(cells)
return ""
def _grid_to_markdown(grid: list[list[Any]]) -> str:
if not grid:
return ""
def _cell(c: Any) -> str:
if isinstance(c, dict):
return str(c.get("text") or c.get("value") or "").replace("|", "\\|").strip()
return str(c).replace("|", "\\|").strip()
header = grid[0]
body = grid[1:] if len(grid) > 1 else []
cols = len(header)
out = ["| " + " | ".join(_cell(c) for c in header) + " |"]
out.append("| " + " | ".join(["---"] * cols) + " |")
for row in body:
cells = [_cell(c) for c in row]
if len(cells) < cols:
cells += [""] * (cols - len(cells))
out.append("| " + " | ".join(cells[:cols]) + " |")
return "\n".join(out)
def _cells_to_markdown(cells: list[Any]) -> str:
rows: dict[int, dict[int, str]] = {}
for c in cells:
if not isinstance(c, dict):
continue
r = c.get("start_row_offset_idx", c.get("row", 0)) or 0
col = c.get("start_col_offset_idx", c.get("col", 0)) or 0
rows.setdefault(r, {})[col] = (c.get("text") or "").replace("|", "\\|").strip()
if not rows:
return ""
max_col = max((max(r.keys()) for r in rows.values()), default=0)
grid = []
for r_idx in sorted(rows):
row = [rows[r_idx].get(c, "") for c in range(max_col + 1)]
grid.append(row)
return _grid_to_markdown(grid)
def _table_csv(raw: dict[str, Any]) -> str | None:
grid = raw.get("data") or raw.get("grid")
if not (isinstance(grid, list) and grid and isinstance(grid[0], list)):
return None
import csv
import io
buf = io.StringIO()
writer = csv.writer(buf)
for row in grid:
writer.writerow([
(c.get("text") if isinstance(c, dict) else c) or "" for c in row
])
return buf.getvalue()