"""Docling structured extraction. Docling produces a hierarchical document model with reading order, layout, tables and figures. We export both Markdown and a JSON representation, then walk the JSON to emit normalized blocks (title, heading, paragraph, list, table caption, figure caption) for downstream chunking. The extractor is intentionally defensive: Docling's exact Python API has shifted across releases. We probe for the safest exporter methods and fall back to ``str(document)`` only as a last resort. """ from __future__ import annotations import json from dataclasses import dataclass, field from pathlib import Path from typing import Any from app.config import settings from app.logging_config import get_logger logger = get_logger(__name__) @dataclass class ExtractedBlock: page_number: int block_type: str text: str block_id: str | None = None extra: dict[str, Any] = field(default_factory=dict) @dataclass class ExtractedTable: page_number: int table_index: int markdown: str csv_text: str | None = None json_data: dict[str, Any] | None = None block_id: str | None = None @dataclass class ExtractedFigure: page_number: int figure_index: int caption: str | None block_id: str | None = None image_bytes: bytes | None = None image_ext: str = "png" @dataclass class ExtractedPage: page_number: int text: str has_tables: bool = False has_figures: bool = False has_handwriting: bool = False ocr_confidence: float | None = None @dataclass class ExtractionResult: markdown: str json_payload: dict[str, Any] blocks: list[ExtractedBlock] tables: list[ExtractedTable] figures: list[ExtractedFigure] pages: list[ExtractedPage] def extract(pdf_path: Path) -> ExtractionResult: """Run Docling on ``pdf_path`` and return a normalized result.""" from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption pipeline_options = PdfPipelineOptions() # We let OCRmyPDF do the heavy OCR; Docling OCR is opt-in. pipeline_options.do_ocr = settings.docling_ocr_enabled pipeline_options.do_table_structure = True try: pipeline_options.table_structure_options.do_cell_matching = True except Exception: # noqa: BLE001 - older docling versions lack this pass try: pipeline_options.generate_page_images = True except Exception: # noqa: BLE001 pass converter = DocumentConverter( format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)} ) logger.info("docling.start", input=str(pdf_path)) conv = converter.convert(str(pdf_path)) doc = conv.document markdown = _safe_export_markdown(doc) json_payload = _safe_export_dict(doc) blocks = _walk_blocks(json_payload) tables = _walk_tables(doc, json_payload) figures = _walk_figures(doc, json_payload) pages = _walk_pages(json_payload, blocks, tables, figures) logger.info( "docling.done", pages=len(pages), blocks=len(blocks), tables=len(tables), figures=len(figures), ) return ExtractionResult( markdown=markdown, json_payload=json_payload, blocks=blocks, tables=tables, figures=figures, pages=pages, ) # ---------------- Internal helpers ---------------- def _safe_export_markdown(doc: Any) -> str: for attr in ("export_to_markdown", "to_markdown"): fn = getattr(doc, attr, None) if callable(fn): try: return fn() except Exception: # noqa: BLE001 continue return str(doc) def _safe_export_dict(doc: Any) -> dict[str, Any]: for attr in ("export_to_dict", "model_dump", "dict"): fn = getattr(doc, attr, None) if callable(fn): try: data = fn() if isinstance(data, dict): return data except Exception: # noqa: BLE001 continue # Last resort: serialize via JSON round-trip try: return json.loads(getattr(doc, "model_dump_json", lambda: "{}")()) except Exception: # noqa: BLE001 return {} _DOCLING_LABEL_TO_BLOCK = { "title": "title", "section_header": "heading", "section-header": "heading", "subtitle": "heading", "page_header": "heading", "header": "heading", "list_item": "list", "list-item": "list", "list": "list", "paragraph": "paragraph", "text": "paragraph", "caption": "figure_caption", "figure": "figure_caption", "table": "table", "footnote": "paragraph", } def _walk_blocks(payload: dict[str, Any]) -> list[ExtractedBlock]: """Flatten Docling's text items into ordered blocks per page.""" blocks: list[ExtractedBlock] = [] items = ( payload.get("texts") or payload.get("text_items") or payload.get("body", {}).get("text_items", []) or [] ) if not isinstance(items, list): return blocks for item in items: if not isinstance(item, dict): continue label = (item.get("label") or item.get("category") or "paragraph").lower() text = (item.get("text") or "").strip() if not text: continue block_type = _DOCLING_LABEL_TO_BLOCK.get(label, "paragraph") page = _page_of(item) blocks.append( ExtractedBlock( page_number=page, block_type=block_type, text=text, block_id=item.get("self_ref") or item.get("id"), extra={"label": label}, ) ) return blocks def _walk_tables(doc: Any, payload: dict[str, Any]) -> list[ExtractedTable]: tables: list[ExtractedTable] = [] raw_tables = payload.get("tables") or [] for idx, t in enumerate(raw_tables): if not isinstance(t, dict): continue page = _page_of(t) md = _table_markdown(doc, t, idx) csv_text = _table_csv(t) tables.append( ExtractedTable( page_number=page, table_index=idx, markdown=md, csv_text=csv_text, json_data=t, block_id=t.get("self_ref") or t.get("id"), ) ) return tables def _walk_figures(doc: Any, payload: dict[str, Any]) -> list[ExtractedFigure]: figures: list[ExtractedFigure] = [] raw_figures = payload.get("pictures") or payload.get("figures") or [] for idx, f in enumerate(raw_figures): if not isinstance(f, dict): continue page = _page_of(f) caption = (f.get("caption") or "").strip() or None figures.append( ExtractedFigure( page_number=page, figure_index=idx, caption=caption, block_id=f.get("self_ref") or f.get("id"), ) ) return figures def _walk_pages( payload: dict[str, Any], blocks: list[ExtractedBlock], tables: list[ExtractedTable], figures: list[ExtractedFigure], ) -> list[ExtractedPage]: pages_meta = payload.get("pages") or {} page_numbers: set[int] = set() if isinstance(pages_meta, dict): for k in pages_meta.keys(): try: page_numbers.add(int(k)) except (ValueError, TypeError): continue elif isinstance(pages_meta, list): for p in pages_meta: if isinstance(p, dict): pn = p.get("page_no") or p.get("page") or p.get("number") if isinstance(pn, int): page_numbers.add(pn) for b in blocks: page_numbers.add(b.page_number) for t in tables: page_numbers.add(t.page_number) for f in figures: page_numbers.add(f.page_number) page_numbers.discard(0) if not page_numbers: page_numbers = {1} by_page_text: dict[int, list[str]] = {pn: [] for pn in page_numbers} for b in blocks: by_page_text.setdefault(b.page_number, []).append(b.text) has_tables_set = {t.page_number for t in tables} has_figures_set = {f.page_number for f in figures} return [ ExtractedPage( page_number=pn, text="\n\n".join(by_page_text.get(pn, [])), has_tables=pn in has_tables_set, has_figures=pn in has_figures_set, ) for pn in sorted(page_numbers) ] def _page_of(item: dict[str, Any]) -> int: prov = item.get("prov") or item.get("provenance") if isinstance(prov, list) and prov: first = prov[0] if isinstance(first, dict): pn = first.get("page_no") or first.get("page") or first.get("page_number") if isinstance(pn, int): return pn pn = item.get("page_no") or item.get("page") or item.get("page_number") if isinstance(pn, int): return pn return 1 def _table_markdown(doc: Any, raw: dict[str, Any], idx: int) -> str: # Try Docling's own export first (per-table). try: export = getattr(doc, "export_table_to_markdown", None) if callable(export): return export(idx) except Exception: # noqa: BLE001 pass grid = raw.get("data") or raw.get("table_cells") or raw.get("grid") if isinstance(grid, list) and grid and isinstance(grid[0], list): return _grid_to_markdown(grid) cells = raw.get("table_cells") if isinstance(cells, list): return _cells_to_markdown(cells) return "" def _grid_to_markdown(grid: list[list[Any]]) -> str: if not grid: return "" def _cell(c: Any) -> str: if isinstance(c, dict): return str(c.get("text") or c.get("value") or "").replace("|", "\\|").strip() return str(c).replace("|", "\\|").strip() header = grid[0] body = grid[1:] if len(grid) > 1 else [] cols = len(header) out = ["| " + " | ".join(_cell(c) for c in header) + " |"] out.append("| " + " | ".join(["---"] * cols) + " |") for row in body: cells = [_cell(c) for c in row] if len(cells) < cols: cells += [""] * (cols - len(cells)) out.append("| " + " | ".join(cells[:cols]) + " |") return "\n".join(out) def _cells_to_markdown(cells: list[Any]) -> str: rows: dict[int, dict[int, str]] = {} for c in cells: if not isinstance(c, dict): continue r = c.get("start_row_offset_idx", c.get("row", 0)) or 0 col = c.get("start_col_offset_idx", c.get("col", 0)) or 0 rows.setdefault(r, {})[col] = (c.get("text") or "").replace("|", "\\|").strip() if not rows: return "" max_col = max((max(r.keys()) for r in rows.values()), default=0) grid = [] for r_idx in sorted(rows): row = [rows[r_idx].get(c, "") for c in range(max_col + 1)] grid.append(row) return _grid_to_markdown(grid) def _table_csv(raw: dict[str, Any]) -> str | None: grid = raw.get("data") or raw.get("grid") if not (isinstance(grid, list) and grid and isinstance(grid[0], list)): return None import csv import io buf = io.StringIO() writer = csv.writer(buf) for row in grid: writer.writerow([ (c.get("text") if isinstance(c, dict) else c) or "" for c in row ]) return buf.getvalue()