chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
384
app/ingestion/docling_extractor.py
Normal file
384
app/ingestion/docling_extractor.py
Normal file
@@ -0,0 +1,384 @@
|
||||
"""Docling structured extraction.
|
||||
|
||||
Docling produces a hierarchical document model with reading order, layout, tables
|
||||
and figures. We export both Markdown and a JSON representation, then walk the
|
||||
JSON to emit normalized blocks (title, heading, paragraph, list, table caption,
|
||||
figure caption) for downstream chunking.
|
||||
|
||||
The extractor is intentionally defensive: Docling's exact Python API has
|
||||
shifted across releases. We probe for the safest exporter methods and fall
|
||||
back to ``str(document)`` only as a last resort.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from app.config import settings
|
||||
from app.logging_config import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ExtractedBlock:
    """A single normalized text block attributed to one page."""

    # Page the block belongs to.
    page_number: int
    # Normalized block type such as "title", "heading", "paragraph" or "list".
    block_type: str
    # Text content of the block.
    text: str
    # Identifier of the source item, when one is available.
    block_id: str | None = None
    # Free-form extra metadata; each instance gets its own dict.
    extra: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
class ExtractedTable:
    """A table found in the document, with its rendered representations."""

    # Page the table belongs to.
    page_number: int
    # Position of the table within the document's table list.
    table_index: int
    # Markdown rendering of the table (may be an empty string).
    markdown: str
    # CSV rendering, when one could be produced.
    csv_text: str | None = None
    # Raw table data as a dict, when available.
    json_data: dict[str, Any] | None = None
    # Identifier of the source item, when one is available.
    block_id: str | None = None
|
||||
|
||||
|
||||
@dataclass
class ExtractedFigure:
    """A figure/picture found in the document."""

    # Page the figure belongs to.
    page_number: int
    # Position of the figure within the document's figure list.
    figure_index: int
    # Caption text; required argument, but may be None when there is none.
    caption: str | None
    # Identifier of the source item, when one is available.
    block_id: str | None = None
    # Raw image payload, when available.
    image_bytes: bytes | None = None
    # File extension associated with ``image_bytes``.
    image_ext: str = "png"
|
||||
|
||||
|
||||
@dataclass
class ExtractedPage:
    """Per-page summary: text plus flags describing the page content."""

    # Page number.
    page_number: int
    # Text of the page.
    text: str
    # True when at least one table is attributed to this page.
    has_tables: bool = False
    # True when at least one figure is attributed to this page.
    has_figures: bool = False
    # Handwriting flag; defaults to False.
    has_handwriting: bool = False
    # OCR confidence, when known.
    ocr_confidence: float | None = None
|
||||
|
||||
|
||||
@dataclass
class ExtractionResult:
    """Everything produced by one extraction run; all fields are required."""

    # Markdown export of the whole document.
    markdown: str
    # Dict export of the document.
    json_payload: dict[str, Any]
    # Normalized text blocks.
    blocks: list[ExtractedBlock]
    # Extracted tables.
    tables: list[ExtractedTable]
    # Extracted figures.
    figures: list[ExtractedFigure]
    # Per-page summaries.
    pages: list[ExtractedPage]
|
||||
|
||||
|
||||
def extract(pdf_path: Path) -> ExtractionResult:
    """Run Docling on ``pdf_path`` and return a normalized result.

    The PDF pipeline is configured with OCR toggled by
    ``settings.docling_ocr_enabled`` and table-structure recognition always
    on. The converted document is exported to Markdown and to a dict; blocks,
    tables, figures and pages are then derived from those exports.
    """
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    opts = PdfPipelineOptions()
    # We let OCRmyPDF do the heavy OCR; Docling OCR is opt-in.
    opts.do_ocr = settings.docling_ocr_enabled
    opts.do_table_structure = True
    # The two optional switches below are best-effort: a failure to set
    # either one is ignored on purpose.
    try:
        opts.table_structure_options.do_cell_matching = True
    except Exception:  # noqa: BLE001 - older docling versions lack this
        pass
    try:
        opts.generate_page_images = True
    except Exception:  # noqa: BLE001
        pass

    pdf_format = PdfFormatOption(pipeline_options=opts)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    source = str(pdf_path)
    logger.info("docling.start", input=source)
    document = converter.convert(source).document

    markdown_text = _safe_export_markdown(document)
    payload = _safe_export_dict(document)

    found_blocks = _walk_blocks(payload)
    found_tables = _walk_tables(document, payload)
    found_figures = _walk_figures(document, payload)
    found_pages = _walk_pages(payload, found_blocks, found_tables, found_figures)

    logger.info(
        "docling.done",
        pages=len(found_pages),
        blocks=len(found_blocks),
        tables=len(found_tables),
        figures=len(found_figures),
    )
    return ExtractionResult(
        markdown=markdown_text,
        json_payload=payload,
        blocks=found_blocks,
        tables=found_tables,
        figures=found_figures,
        pages=found_pages,
    )
|
||||
|
||||
|
||||
# ---------------- Internal helpers ----------------
|
||||
|
||||
def _safe_export_markdown(doc: Any) -> str:
|
||||
for attr in ("export_to_markdown", "to_markdown"):
|
||||
fn = getattr(doc, attr, None)
|
||||
if callable(fn):
|
||||
try:
|
||||
return fn()
|
||||
except Exception: # noqa: BLE001
|
||||
continue
|
||||
return str(doc)
|
||||
|
||||
|
||||
def _safe_export_dict(doc: Any) -> dict[str, Any]:
|
||||
for attr in ("export_to_dict", "model_dump", "dict"):
|
||||
fn = getattr(doc, attr, None)
|
||||
if callable(fn):
|
||||
try:
|
||||
data = fn()
|
||||
if isinstance(data, dict):
|
||||
return data
|
||||
except Exception: # noqa: BLE001
|
||||
continue
|
||||
# Last resort: serialize via JSON round-trip
|
||||
try:
|
||||
return json.loads(getattr(doc, "model_dump_json", lambda: "{}")())
|
||||
except Exception: # noqa: BLE001
|
||||
return {}
|
||||
|
||||
|
||||
_DOCLING_LABEL_TO_BLOCK = {
|
||||
"title": "title",
|
||||
"section_header": "heading",
|
||||
"section-header": "heading",
|
||||
"subtitle": "heading",
|
||||
"page_header": "heading",
|
||||
"header": "heading",
|
||||
"list_item": "list",
|
||||
"list-item": "list",
|
||||
"list": "list",
|
||||
"paragraph": "paragraph",
|
||||
"text": "paragraph",
|
||||
"caption": "figure_caption",
|
||||
"figure": "figure_caption",
|
||||
"table": "table",
|
||||
"footnote": "paragraph",
|
||||
}
|
||||
|
||||
|
||||
def _walk_blocks(payload: dict[str, Any]) -> list[ExtractedBlock]:
    """Flatten Docling's text items into ordered blocks per page.

    Items are read from the first non-empty of ``payload["texts"]``,
    ``payload["text_items"]`` and ``payload["body"]["text_items"]``. Entries
    that are not dicts, or whose text is missing, blank or not a string, are
    skipped. The item's label (or category) is lower-cased and mapped through
    ``_DOCLING_LABEL_TO_BLOCK``; unknown labels become ``"paragraph"``.
    """
    blocks: list[ExtractedBlock] = []

    # ``body`` may be absent, None, or something other than a dict; only a
    # dict can carry ``text_items``.
    body = payload.get("body")
    items = (
        payload.get("texts")
        or payload.get("text_items")
        or (body.get("text_items") if isinstance(body, dict) else None)
        or []
    )
    if not isinstance(items, list):
        return blocks

    for item in items:
        if not isinstance(item, dict):
            continue

        raw_label = item.get("label") or item.get("category")
        # A non-string label cannot be normalized; treat it like a missing one.
        label = raw_label.lower() if isinstance(raw_label, str) else "paragraph"

        raw_text = item.get("text")
        text = raw_text.strip() if isinstance(raw_text, str) else ""
        if not text:
            continue

        blocks.append(
            ExtractedBlock(
                page_number=_page_of(item),
                block_type=_DOCLING_LABEL_TO_BLOCK.get(label, "paragraph"),
                text=text,
                block_id=item.get("self_ref") or item.get("id"),
                extra={"label": label},
            )
        )
    return blocks
|
||||
|
||||
|
||||
def _walk_tables(doc: Any, payload: dict[str, Any]) -> list[ExtractedTable]:
    """Build one ``ExtractedTable`` per dict entry in ``payload["tables"]``.

    Non-dict entries are skipped but still consume an index, so
    ``table_index`` is the entry's position in the raw list.
    """
    entries = payload.get("tables") or []
    return [
        ExtractedTable(
            page_number=_page_of(entry),
            table_index=position,
            markdown=_table_markdown(doc, entry, position),
            csv_text=_table_csv(entry),
            json_data=entry,
            block_id=entry.get("self_ref") or entry.get("id"),
        )
        for position, entry in enumerate(entries)
        if isinstance(entry, dict)
    ]
|
||||
|
||||
|
||||
def _walk_figures(doc: Any, payload: dict[str, Any]) -> list[ExtractedFigure]:
    """Build one ``ExtractedFigure`` per dict entry in the payload's figures.

    Figures are read from ``payload["pictures"]``, falling back to
    ``payload["figures"]``. Non-dict entries are skipped but still consume an
    index. ``doc`` is accepted for signature parity with the other walkers
    and is not used here.
    """
    figures: list[ExtractedFigure] = []
    raw_figures = payload.get("pictures") or payload.get("figures") or []
    for idx, f in enumerate(raw_figures):
        if not isinstance(f, dict):
            continue
        page = _page_of(f)
        # Only a plain string can be used as caption text; any other shape
        # (dict, list, ...) is treated as "no caption" instead of crashing
        # on ``.strip()``.
        raw_caption = f.get("caption")
        caption = (raw_caption.strip() or None) if isinstance(raw_caption, str) else None
        figures.append(
            ExtractedFigure(
                page_number=page,
                figure_index=idx,
                caption=caption,
                block_id=f.get("self_ref") or f.get("id"),
            )
        )
    return figures
|
||||
|
||||
|
||||
def _walk_pages(
    payload: dict[str, Any],
    blocks: list[ExtractedBlock],
    tables: list[ExtractedTable],
    figures: list[ExtractedFigure],
) -> list[ExtractedPage]:
    """Assemble per-page summaries, sorted by page number.

    Page numbers come from ``payload["pages"]`` (dict keys convertible to
    ``int``, or list entries carrying an integer page number) plus every page
    referenced by a block, table or figure. Page ``0`` is dropped, and a
    single page ``1`` is assumed when nothing else is known. Page text is the
    page's block texts joined by blank lines.
    """
    meta = payload.get("pages") or {}
    seen: set[int] = set()
    if isinstance(meta, dict):
        for key in meta.keys():
            try:
                seen.add(int(key))
            except (ValueError, TypeError):
                pass
    elif isinstance(meta, list):
        for entry in meta:
            if not isinstance(entry, dict):
                continue
            number = entry.get("page_no") or entry.get("page") or entry.get("number")
            if isinstance(number, int):
                seen.add(number)

    seen.update(b.page_number for b in blocks)
    seen.update(t.page_number for t in tables)
    seen.update(f.page_number for f in figures)
    seen.discard(0)
    if not seen:
        seen = {1}

    texts_by_page: dict[int, list[str]] = {number: [] for number in seen}
    for b in blocks:
        texts_by_page.setdefault(b.page_number, []).append(b.text)

    pages_with_tables = {t.page_number for t in tables}
    pages_with_figures = {f.page_number for f in figures}

    result: list[ExtractedPage] = []
    for number in sorted(seen):
        result.append(
            ExtractedPage(
                page_number=number,
                text="\n\n".join(texts_by_page.get(number, [])),
                has_tables=number in pages_with_tables,
                has_figures=number in pages_with_figures,
            )
        )
    return result
|
||||
|
||||
|
||||
def _page_of(item: dict[str, Any]) -> int:
|
||||
prov = item.get("prov") or item.get("provenance")
|
||||
if isinstance(prov, list) and prov:
|
||||
first = prov[0]
|
||||
if isinstance(first, dict):
|
||||
pn = first.get("page_no") or first.get("page") or first.get("page_number")
|
||||
if isinstance(pn, int):
|
||||
return pn
|
||||
pn = item.get("page_no") or item.get("page") or item.get("page_number")
|
||||
if isinstance(pn, int):
|
||||
return pn
|
||||
return 1
|
||||
|
||||
|
||||
def _table_markdown(doc: Any, raw: dict[str, Any], idx: int) -> str:
    """Render table ``idx`` as Markdown; return ``""`` when nothing usable exists.

    Order of attempts:
      1. ``doc.export_table_to_markdown(idx)`` when the document offers it.
      2. A grid (non-empty list whose first element is a list) found at
         ``raw["data"]``, ``raw["table_cells"]`` or ``raw["grid"]``.
      3. A flat cell list at ``raw["table_cells"]``.
      4. When ``raw["data"]`` is a dict, its nested ``grid`` and then its
         nested ``table_cells``.

    Each candidate is checked for its shape individually, so a truthy value
    of the wrong shape (for example a dict-valued ``data``) no longer hides a
    usable grid stored under another key.
    """
    # Try Docling's own export first (per-table).
    try:
        export = getattr(doc, "export_table_to_markdown", None)
        if callable(export):
            return export(idx)
    except Exception:  # noqa: BLE001 - best effort; fall back to the raw payload
        pass

    def _is_grid(value: Any) -> bool:
        return isinstance(value, list) and bool(value) and isinstance(value[0], list)

    data = raw.get("data")
    nested = data if isinstance(data, dict) else {}

    # Top-level candidates, in the original lookup order.
    for candidate in (data, raw.get("table_cells"), raw.get("grid")):
        if _is_grid(candidate):
            return _grid_to_markdown(candidate)
    cells = raw.get("table_cells")
    if isinstance(cells, list) and cells:
        return _cells_to_markdown(cells)

    # Nested layout: ``data`` is a dict carrying the grid and/or the cells.
    nested_grid = nested.get("grid")
    if _is_grid(nested_grid):
        return _grid_to_markdown(nested_grid)
    nested_cells = nested.get("table_cells")
    if isinstance(nested_cells, list):
        return _cells_to_markdown(nested_cells)
    return ""
|
||||
|
||||
|
||||
def _grid_to_markdown(grid: list[list[Any]]) -> str:
|
||||
if not grid:
|
||||
return ""
|
||||
|
||||
def _cell(c: Any) -> str:
|
||||
if isinstance(c, dict):
|
||||
return str(c.get("text") or c.get("value") or "").replace("|", "\\|").strip()
|
||||
return str(c).replace("|", "\\|").strip()
|
||||
|
||||
header = grid[0]
|
||||
body = grid[1:] if len(grid) > 1 else []
|
||||
cols = len(header)
|
||||
out = ["| " + " | ".join(_cell(c) for c in header) + " |"]
|
||||
out.append("| " + " | ".join(["---"] * cols) + " |")
|
||||
for row in body:
|
||||
cells = [_cell(c) for c in row]
|
||||
if len(cells) < cols:
|
||||
cells += [""] * (cols - len(cells))
|
||||
out.append("| " + " | ".join(cells[:cols]) + " |")
|
||||
return "\n".join(out)
|
||||
|
||||
|
||||
def _cells_to_markdown(cells: list[Any]) -> str:
    """Arrange a flat list of cell dicts into a grid and render it as Markdown.

    Each cell is placed by ``start_row_offset_idx`` / ``start_col_offset_idx``
    (falling back to ``row`` / ``col``, then ``0``). Non-dict entries are
    ignored and missing positions are filled with empty cells. Returns ``""``
    when no usable cell is present.
    """
    rows: dict[int, dict[int, Any]] = {}
    for c in cells:
        if not isinstance(c, dict):
            continue
        r = c.get("start_row_offset_idx", c.get("row", 0)) or 0
        col = c.get("start_col_offset_idx", c.get("col", 0)) or 0
        # Store the raw text: _grid_to_markdown stringifies, strips and
        # escapes "|" itself, so escaping here as well would produce "\\|".
        rows.setdefault(r, {})[col] = c.get("text") or ""
    if not rows:
        return ""

    max_col = max((max(r.keys()) for r in rows.values()), default=0)
    grid = [
        [rows[r_idx].get(c, "") for c in range(max_col + 1)]
        for r_idx in sorted(rows)
    ]
    return _grid_to_markdown(grid)
|
||||
|
||||
|
||||
def _table_csv(raw: dict[str, Any]) -> str | None:
|
||||
grid = raw.get("data") or raw.get("grid")
|
||||
if not (isinstance(grid, list) and grid and isinstance(grid[0], list)):
|
||||
return None
|
||||
import csv
|
||||
import io
|
||||
|
||||
buf = io.StringIO()
|
||||
writer = csv.writer(buf)
|
||||
for row in grid:
|
||||
writer.writerow([
|
||||
(c.get("text") if isinstance(c, dict) else c) or "" for c in row
|
||||
])
|
||||
return buf.getvalue()
|
||||
Reference in New Issue
Block a user