# LegacyHUB/app/ingestion/docling_extractor.py

"""Docling structured extraction.

Docling produces a hierarchical document model with reading order, layout, tables
and figures. We export both Markdown and a JSON representation, then walk the
JSON to emit normalized blocks (title, heading, paragraph, list, table caption,
figure caption) for downstream chunking.

The extractor is intentionally defensive: Docling's exact Python API has
shifted across releases. We probe for the safest exporter methods and fall
back to ``str(document)`` only as a last resort.
"""

from __future__ import annotations

import json
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

from app.config import settings
from app.logging_config import get_logger

logger = get_logger(__name__)


@dataclass
class ExtractedBlock:
    """One normalized text block, as emitted by ``_walk_blocks``."""

    # Page the block is attributed to (resolved via ``_page_of``, which falls back to 1).
    page_number: int
    # Normalized type looked up in ``_DOCLING_LABEL_TO_BLOCK`` ("paragraph" when unmapped).
    block_type: str
    # Block text; ``_walk_blocks`` strips it and skips empty results.
    text: str
    # Source item's ``self_ref`` (or ``id``) when present.
    block_id: str | None = None
    # Free-form metadata; ``_walk_blocks`` stores the lower-cased source label under "label".
    extra: dict[str, Any] = field(default_factory=dict)


@dataclass
class ExtractedTable:
    """One table, as emitted by ``_walk_tables``."""

    # Page the table is attributed to (resolved via ``_page_of``, which falls back to 1).
    page_number: int
    # Position of the table within the payload's "tables" list.
    table_index: int
    # Markdown rendering from ``_table_markdown`` (may be an empty string).
    markdown: str
    # CSV rendering from ``_table_csv``; None when no row/column grid was found.
    csv_text: str | None = None
    # The raw table dict taken from the exported payload.
    json_data: dict[str, Any] | None = None
    # Source item's ``self_ref`` (or ``id``) when present.
    block_id: str | None = None


@dataclass
class ExtractedFigure:
    """One figure/picture, as emitted by ``_walk_figures``."""

    # Page the figure is attributed to (resolved via ``_page_of``, which falls back to 1).
    page_number: int
    # Position of the figure within the payload's "pictures" (or "figures") list.
    figure_index: int
    # Caption text, or None when no non-empty caption was found.
    caption: str | None
    # Source item's ``self_ref`` (or ``id``) when present.
    block_id: str | None = None
    # Raw image payload; ``_walk_figures`` does not set it, so it keeps this default
    # unless a caller fills it in.
    image_bytes: bytes | None = None
    # Image format label; presumably the file extension for ``image_bytes`` -- TODO confirm.
    image_ext: str = "png"


@dataclass
class ExtractedPage:
    """Per-page summary, as assembled by ``_walk_pages``."""

    # Page number; ``_walk_pages`` discards page 0 and emits pages in ascending order.
    page_number: int
    # Texts of the page's blocks joined with blank lines ("\n\n").
    text: str
    # True when at least one extracted table is attributed to this page.
    has_tables: bool = False
    # True when at least one extracted figure is attributed to this page.
    has_figures: bool = False
    # Not set by ``_walk_pages``; keeps this default unless a caller fills it in.
    has_handwriting: bool = False
    # Not set by ``_walk_pages``; keeps this default unless a caller fills it in.
    ocr_confidence: float | None = None


@dataclass
class ExtractionResult:
    """Everything ``extract`` returns for one document."""

    # Markdown export of the whole document (see ``_safe_export_markdown``).
    markdown: str
    # Dict export of the document (see ``_safe_export_dict``); may be empty.
    json_payload: dict[str, Any]
    # Normalized text blocks from ``_walk_blocks``.
    blocks: list[ExtractedBlock]
    # Tables from ``_walk_tables``.
    tables: list[ExtractedTable]
    # Figures from ``_walk_figures``.
    figures: list[ExtractedFigure]
    # Per-page summaries from ``_walk_pages``.
    pages: list[ExtractedPage]


def extract(pdf_path: Path) -> ExtractionResult:
    """Convert ``pdf_path`` with Docling and normalize the output.

    Args:
        pdf_path: Location of the PDF to convert.

    Returns:
        An ``ExtractionResult`` holding the Markdown export, the dict export,
        and the blocks, tables, figures and pages derived from the dict export.
    """
    # Docling is imported lazily, inside the function, exactly as before.
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    options = PdfPipelineOptions()
    # Docling's own OCR is driven by configuration (the original note says
    # OCRmyPDF handles the heavy OCR upstream).
    options.do_ocr = settings.docling_ocr_enabled
    options.do_table_structure = True
    # The next two options are best-effort: a failure to set them is ignored
    # (the original comment attributes this to older docling versions).
    try:
        options.table_structure_options.do_cell_matching = True
    except Exception:  # noqa: BLE001 - older docling versions lack this
        pass
    try:
        options.generate_page_images = True
    except Exception:  # noqa: BLE001
        pass

    pdf_format = PdfFormatOption(pipeline_options=options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    logger.info("docling.start", input=str(pdf_path))
    document = converter.convert(str(pdf_path)).document

    exported_markdown = _safe_export_markdown(document)
    exported_dict = _safe_export_dict(document)

    found_blocks = _walk_blocks(exported_dict)
    found_tables = _walk_tables(document, exported_dict)
    found_figures = _walk_figures(document, exported_dict)
    found_pages = _walk_pages(exported_dict, found_blocks, found_tables, found_figures)

    logger.info(
        "docling.done",
        pages=len(found_pages),
        blocks=len(found_blocks),
        tables=len(found_tables),
        figures=len(found_figures),
    )
    return ExtractionResult(
        markdown=exported_markdown,
        json_payload=exported_dict,
        blocks=found_blocks,
        tables=found_tables,
        figures=found_figures,
        pages=found_pages,
    )


# ---------------- Internal helpers ----------------

def _safe_export_markdown(doc: Any) -> str:
    for attr in ("export_to_markdown", "to_markdown"):
        fn = getattr(doc, attr, None)
        if callable(fn):
            try:
                return fn()
            except Exception:  # noqa: BLE001
                continue
    return str(doc)


def _safe_export_dict(doc: Any) -> dict[str, Any]:
    for attr in ("export_to_dict", "model_dump", "dict"):
        fn = getattr(doc, attr, None)
        if callable(fn):
            try:
                data = fn()
                if isinstance(data, dict):
                    return data
            except Exception:  # noqa: BLE001
                continue
    # Last resort: serialize via JSON round-trip
    try:
        return json.loads(getattr(doc, "model_dump_json", lambda: "{}")())
    except Exception:  # noqa: BLE001
        return {}


_DOCLING_LABEL_TO_BLOCK = {
    "title": "title",
    "section_header": "heading",
    "section-header": "heading",
    "subtitle": "heading",
    "page_header": "heading",
    "header": "heading",
    "list_item": "list",
    "list-item": "list",
    "list": "list",
    "paragraph": "paragraph",
    "text": "paragraph",
    "caption": "figure_caption",
    "figure": "figure_caption",
    "table": "table",
    "footnote": "paragraph",
}


def _walk_blocks(payload: dict[str, Any]) -> list[ExtractedBlock]:
    """Flatten Docling's text items into ordered blocks per page."""
    blocks: list[ExtractedBlock] = []
    items = (
        payload.get("texts")
        or payload.get("text_items")
        or payload.get("body", {}).get("text_items", [])
        or []
    )
    if not isinstance(items, list):
        return blocks

    for item in items:
        if not isinstance(item, dict):
            continue
        label = (item.get("label") or item.get("category") or "paragraph").lower()
        text = (item.get("text") or "").strip()
        if not text:
            continue
        block_type = _DOCLING_LABEL_TO_BLOCK.get(label, "paragraph")
        page = _page_of(item)
        blocks.append(
            ExtractedBlock(
                page_number=page,
                block_type=block_type,
                text=text,
                block_id=item.get("self_ref") or item.get("id"),
                extra={"label": label},
            )
        )
    return blocks


def _walk_tables(doc: Any, payload: dict[str, Any]) -> list[ExtractedTable]:
    tables: list[ExtractedTable] = []
    raw_tables = payload.get("tables") or []
    for idx, t in enumerate(raw_tables):
        if not isinstance(t, dict):
            continue
        page = _page_of(t)
        md = _table_markdown(doc, t, idx)
        csv_text = _table_csv(t)
        tables.append(
            ExtractedTable(
                page_number=page,
                table_index=idx,
                markdown=md,
                csv_text=csv_text,
                json_data=t,
                block_id=t.get("self_ref") or t.get("id"),
            )
        )
    return tables


def _walk_figures(doc: Any, payload: dict[str, Any]) -> list[ExtractedFigure]:
    figures: list[ExtractedFigure] = []
    raw_figures = payload.get("pictures") or payload.get("figures") or []
    for idx, f in enumerate(raw_figures):
        if not isinstance(f, dict):
            continue
        page = _page_of(f)
        caption = (f.get("caption") or "").strip() or None
        figures.append(
            ExtractedFigure(
                page_number=page,
                figure_index=idx,
                caption=caption,
                block_id=f.get("self_ref") or f.get("id"),
            )
        )
    return figures


def _walk_pages(
    payload: dict[str, Any],
    blocks: list[ExtractedBlock],
    tables: list[ExtractedTable],
    figures: list[ExtractedFigure],
) -> list[ExtractedPage]:
    """Assemble one ``ExtractedPage`` per known page, in ascending order.

    A page is known when it is declared in ``payload["pages"]`` (dict keys
    convertible to int, or list entries carrying an int under ``page_no`` /
    ``page`` / ``number``) or when a block, table or figure is attributed to
    it. Page 0 is dropped; if nothing remains, a single page 1 is emitted.
    """
    declared = payload.get("pages") or {}
    known: set[int] = set()

    if isinstance(declared, dict):
        for key in declared:
            try:
                known.add(int(key))
            except (ValueError, TypeError):
                continue
    elif isinstance(declared, list):
        for entry in declared:
            if not isinstance(entry, dict):
                continue
            number = entry.get("page_no") or entry.get("page") or entry.get("number")
            if isinstance(number, int):
                known.add(number)

    # Pages that only show up through extracted content count as well.
    pages_with_tables = {table.page_number for table in tables}
    pages_with_figures = {figure.page_number for figure in figures}
    known.update(block.page_number for block in blocks)
    known |= pages_with_tables
    known |= pages_with_figures

    known.discard(0)
    if not known:
        known = {1}

    texts_by_page: dict[int, list[str]] = {number: [] for number in known}
    for block in blocks:
        texts_by_page.setdefault(block.page_number, []).append(block.text)

    result: list[ExtractedPage] = []
    for number in sorted(known):
        result.append(
            ExtractedPage(
                page_number=number,
                text="\n\n".join(texts_by_page.get(number, [])),
                has_tables=number in pages_with_tables,
                has_figures=number in pages_with_figures,
            )
        )
    return result


def _page_of(item: dict[str, Any]) -> int:
    prov = item.get("prov") or item.get("provenance")
    if isinstance(prov, list) and prov:
        first = prov[0]
        if isinstance(first, dict):
            pn = first.get("page_no") or first.get("page") or first.get("page_number")
            if isinstance(pn, int):
                return pn
    pn = item.get("page_no") or item.get("page") or item.get("page_number")
    if isinstance(pn, int):
        return pn
    return 1


def _table_markdown(doc: Any, raw: dict[str, Any], idx: int) -> str:
    # Try Docling's own export first (per-table).
    try:
        export = getattr(doc, "export_table_to_markdown", None)
        if callable(export):
            return export(idx)
    except Exception:  # noqa: BLE001
        pass

    grid = raw.get("data") or raw.get("table_cells") or raw.get("grid")
    if isinstance(grid, list) and grid and isinstance(grid[0], list):
        return _grid_to_markdown(grid)
    cells = raw.get("table_cells")
    if isinstance(cells, list):
        return _cells_to_markdown(cells)
    return ""


def _grid_to_markdown(grid: list[list[Any]]) -> str:
    if not grid:
        return ""

    def _cell(c: Any) -> str:
        if isinstance(c, dict):
            return str(c.get("text") or c.get("value") or "").replace("|", "\\|").strip()
        return str(c).replace("|", "\\|").strip()

    header = grid[0]
    body = grid[1:] if len(grid) > 1 else []
    cols = len(header)
    out = ["| " + " | ".join(_cell(c) for c in header) + " |"]
    out.append("| " + " | ".join(["---"] * cols) + " |")
    for row in body:
        cells = [_cell(c) for c in row]
        if len(cells) < cols:
            cells += [""] * (cols - len(cells))
        out.append("| " + " | ".join(cells[:cols]) + " |")
    return "\n".join(out)


def _cells_to_markdown(cells: list[Any]) -> str:
    rows: dict[int, dict[int, str]] = {}
    for c in cells:
        if not isinstance(c, dict):
            continue
        r = c.get("start_row_offset_idx", c.get("row", 0)) or 0
        col = c.get("start_col_offset_idx", c.get("col", 0)) or 0
        rows.setdefault(r, {})[col] = (c.get("text") or "").replace("|", "\\|").strip()
    if not rows:
        return ""
    max_col = max((max(r.keys()) for r in rows.values()), default=0)
    grid = []
    for r_idx in sorted(rows):
        row = [rows[r_idx].get(c, "") for c in range(max_col + 1)]
        grid.append(row)
    return _grid_to_markdown(grid)


def _table_csv(raw: dict[str, Any]) -> str | None:
    grid = raw.get("data") or raw.get("grid")
    if not (isinstance(grid, list) and grid and isinstance(grid[0], list)):
        return None
    import csv
    import io

    buf = io.StringIO()
    writer = csv.writer(buf)
    for row in grid:
        writer.writerow([
            (c.get("text") if isinstance(c, dict) else c) or "" for c in row
        ])
    return buf.getvalue()