chore: bootstrap repository with governance docs

Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Vadim Malanov
2026-05-13 16:41:50 +03:00
commit 7f72171572
157 changed files with 11298 additions and 0 deletions

View File

317
app/ingestion/chunker.py Normal file
View File

@@ -0,0 +1,317 @@
"""Structure-aware chunking.
Rules (per spec):
- Chunk by document structure first, fixed-size second.
- Hierarchy: title > heading > paragraph > list > table > figure caption.
- Target 500-900 tokens (configurable).
- Overlap 80-120 tokens for long narrative text only.
- Never split tables - one table = one chunk (or one chunk per row group if huge).
- Every chunk carries citation metadata.
We use a deliberately simple ``len(text.split())`` token estimator. The downstream
embedding model has its own tokenizer; this estimator is only a budget proxy.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
from app.config import settings
from app.ingestion.docling_extractor import (
ExtractedBlock,
ExtractedFigure,
ExtractedTable,
ExtractionResult,
)
from app.ingestion.normalizer import normalize_block
from app.ingestion.quality import compute_quality_flags
@dataclass
class ChunkRecord:
    """One chunk produced by ``chunk_extraction``.

    Field order is part of the constructor interface; do not reorder.
    """

    # Running position assigned by chunk_extraction (tables, then figures, then narrative).
    chunk_index: int
    # Page the chunk is attributed to.
    page_number: int
    # Values set in this module: "table", "figure_caption", "figure_description",
    # "heading", or the block type of the first packed narrative block.
    block_type: str
    # Display text, i.e. the first element returned by normalize_block.
    text: str
    # Search text, i.e. the second element returned by normalize_block.
    normalized_text: str
    # Whitespace-word estimate of the display text, not a model tokenizer count.
    token_count: int
    # Identifier of the source block, when one is available.
    block_id: str | None = None
    # Result of compute_quality_flags for this chunk.
    quality_flags: dict[str, Any] = field(default_factory=dict)
    # Citation metadata (e.g. table_index/summary, figure_index, section_heading).
    metadata: dict[str, Any] = field(default_factory=dict)
def _estimate_tokens(text: str) -> int:
return max(1, len(text.split()))
def chunk_extraction(
    extraction: ExtractionResult,
    *,
    document_ocr_confidence: float | None = None,
) -> list[ChunkRecord]:
    """Turn an extraction result into an ordered list of chunk records.

    Emission order (``chunk_index`` follows it): every non-empty table, then
    every figure, then narrative blocks page by page. Tables are never split;
    narrative blocks are grouped by section and packed by ``_pack_group``
    using the token budgets from ``settings``.

    No overlap chunks are emitted and ``settings.chunk_overlap_tokens`` is not
    consulted: the previous overlap branch computed a tail and discarded it.

    Args:
        extraction: Tables, figures and text blocks to chunk.
        document_ocr_confidence: Forwarded unchanged to
            ``compute_quality_flags`` for every chunk.

    Returns:
        The chunk records, with ``chunk_index`` running from 0.
    """
    target = settings.chunk_target_tokens
    minimum = settings.chunk_min_tokens
    maximum = settings.chunk_max_tokens

    chunks: list[ChunkRecord] = []

    def emit(
        text: str,
        *,
        block_type: str,
        page_number: int,
        block_id: str | None,
        metadata: dict[str, Any],
    ) -> None:
        """Normalize ``text``, compute quality flags and append one record."""
        display, norm = normalize_block(text)
        flags = compute_quality_flags(
            text=display,
            block_type=block_type,
            ocr_confidence=document_ocr_confidence,
        )
        chunks.append(
            ChunkRecord(
                # The index is the record's position in the output list.
                chunk_index=len(chunks),
                page_number=page_number,
                block_type=block_type,
                text=display,
                normalized_text=norm,
                token_count=_estimate_tokens(display),
                block_id=block_id,
                quality_flags=flags,
                metadata=metadata,
            )
        )

    # 1) Tables first - one chunk per table, never split.
    for table in extraction.tables:
        body = (table.markdown or "").strip()
        if not body:
            continue
        summary = _summarize_table(table)
        emit(
            f"{summary}\n\n{body}" if summary else body,
            block_type="table",
            page_number=table.page_number,
            block_id=table.block_id or f"table:{table.table_index}",
            metadata={"table_index": table.table_index, "summary": summary or ""},
        )

    # 2) Figures - caption (when present) plus a placeholder description.
    for figure in extraction.figures:
        text_parts: list[str] = []
        if figure.caption:
            text_parts.append(f"Caption: {figure.caption}")
        text_parts.append(f"Figure detected on page {figure.page_number}.")
        emit(
            "\n".join(text_parts),
            block_type="figure_caption" if figure.caption else "figure_description",
            page_number=figure.page_number,
            block_id=figure.block_id or f"figure:{figure.figure_index}",
            metadata={"figure_index": figure.figure_index},
        )

    # 3) Narrative blocks grouped per page, then packed section by section.
    by_page: dict[int, list[ExtractedBlock]] = {}
    for block in extraction.blocks:
        by_page.setdefault(block.page_number, []).append(block)

    for page_no in sorted(by_page):
        for group in _group_by_section(by_page[page_no]):
            packed = _pack_group(group, target=target, maximum=maximum, minimum=minimum)
            for piece in packed:
                emit(
                    piece["text"],
                    block_type=piece["block_type"],
                    page_number=page_no,
                    block_id=piece.get("block_id"),
                    metadata={"section_heading": piece.get("section") or ""},
                )

    return chunks
# ---------------- Helpers ----------------
def _group_by_section(blocks: list[ExtractedBlock]) -> list[list[ExtractedBlock]]:
groups: list[list[ExtractedBlock]] = []
current: list[ExtractedBlock] = []
for b in blocks:
if b.block_type in ("title", "heading") and current:
groups.append(current)
current = [b]
else:
current.append(b)
if current:
groups.append(current)
return groups
def _pack_group(
group: list[ExtractedBlock], *, target: int, maximum: int, minimum: int
) -> list[dict[str, Any]]:
"""Pack a section's blocks into chunks at most ``maximum`` tokens.
Headings / titles attach to the next chunk as a section anchor.
"""
if not group:
return []
section_heading = ""
body_blocks: list[ExtractedBlock] = []
for b in group:
if b.block_type in ("title", "heading"):
section_heading = (section_heading + " > " + b.text).strip(" >") if section_heading else b.text
else:
body_blocks.append(b)
if not body_blocks:
# Heading-only group: emit as a single ``heading`` chunk so the title is searchable.
text = section_heading or group[0].text
return [
{
"text": text,
"block_type": "heading",
"block_id": group[0].block_id,
"section": section_heading,
}
]
out: list[dict[str, Any]] = []
buffer: list[str] = []
buffer_block_ids: list[str] = []
buffer_block_type = "paragraph"
buffer_tokens = 0
def flush():
nonlocal buffer, buffer_block_ids, buffer_block_type, buffer_tokens
if not buffer:
return
text = "\n\n".join(buffer).strip()
if not text:
buffer = []
buffer_block_ids = []
buffer_tokens = 0
return
# Prepend section heading for context (kept short).
if section_heading and len(section_heading) < 200:
text = f"# {section_heading}\n\n{text}"
out.append(
{
"text": text,
"block_type": buffer_block_type,
"block_id": buffer_block_ids[0] if buffer_block_ids else None,
"section": section_heading,
}
)
buffer = []
buffer_block_ids = []
buffer_tokens = 0
for b in body_blocks:
tokens = _estimate_tokens(b.text)
if tokens >= maximum:
# Hard split a giant block into sub-chunks of ~target tokens.
flush()
for sub in _split_long_text(b.text, target=target, maximum=maximum):
out.append(
{
"text": sub,
"block_type": b.block_type if b.block_type != "list" else "list",
"block_id": b.block_id,
"section": section_heading,
}
)
continue
if buffer_tokens + tokens > maximum and buffer_tokens >= minimum:
flush()
if not buffer:
buffer_block_type = b.block_type if b.block_type != "list" else "list"
buffer.append(b.text)
if b.block_id:
buffer_block_ids.append(b.block_id)
buffer_tokens += tokens
if buffer_tokens >= target:
flush()
flush()
return out
def _split_long_text(text: str, *, target: int, maximum: int) -> list[str]:
words = text.split()
if not words:
return []
pieces: list[str] = []
step = target
if step <= 0:
step = 500
i = 0
while i < len(words):
end = min(len(words), i + maximum)
# Aim for ``target`` words but extend up to ``maximum`` to reach a sentence boundary.
piece = " ".join(words[i : i + step])
pieces.append(piece)
i += step
if end - i < target // 4 and end - i > 0:
pieces[-1] = " ".join(words[i - step : end])
break
return pieces
def _tail_tokens(text: str, n: int) -> str:
words = text.split()
if len(words) <= n:
return text
return " ".join(words[-n:])
def _summarize_table(t: ExtractedTable) -> str:
"""Heuristic one-line summary for index recall."""
md = t.markdown or ""
first = next((line for line in md.splitlines() if line.startswith("|")), "")
header_cells = [c.strip() for c in first.strip("|").split("|") if c.strip()]
n_cols = len(header_cells)
n_rows = max(0, sum(1 for ln in md.splitlines() if ln.startswith("|")) - 2)
header_preview = ", ".join(header_cells[:6])
return (
f"Table on page {t.page_number}: {n_rows} rows x {n_cols} cols. "
f"Columns: {header_preview}." if header_cells else
f"Table on page {t.page_number}."
)

View File

@@ -0,0 +1,384 @@
"""Docling structured extraction.
Docling produces a hierarchical document model with reading order, layout, tables
and figures. We export both Markdown and a JSON representation, then walk the
JSON to emit normalized blocks (title, heading, paragraph, list, table caption,
figure caption) for downstream chunking.
The extractor is intentionally defensive: Docling's exact Python API has
shifted across releases. We probe for the safest exporter methods and fall
back to ``str(document)`` only as a last resort.
"""
from __future__ import annotations
import json
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from app.config import settings
from app.logging_config import get_logger
logger = get_logger(__name__)
@dataclass
class ExtractedBlock:
    """A single text item from the Docling payload, flattened for chunking."""

    # Page resolved by _page_of (falls back to 1 when no page is found).
    page_number: int
    # Normalized type from _DOCLING_LABEL_TO_BLOCK; unknown labels become "paragraph".
    block_type: str
    # Stripped item text; _walk_blocks never emits an empty one.
    text: str
    # The item's "self_ref" or "id", when present.
    block_id: str | None = None
    # _walk_blocks stores {"label": <lower-cased source label>} here.
    extra: dict[str, Any] = field(default_factory=dict)
@dataclass
class ExtractedTable:
    """A table taken from the Docling payload."""

    # Page resolved by _page_of (falls back to 1).
    page_number: int
    # Position of the table in the payload's "tables" list.
    table_index: int
    # Markdown rendering from _table_markdown; may be an empty string.
    markdown: str
    # CSV rendering from _table_csv, or None when no grid was available.
    csv_text: str | None = None
    # The raw table dict from the payload.
    json_data: dict[str, Any] | None = None
    # The table's "self_ref" or "id", when present.
    block_id: str | None = None
@dataclass
class ExtractedFigure:
    """A figure/picture entry taken from the Docling payload."""

    # Page resolved by _page_of (falls back to 1).
    page_number: int
    # Position of the figure in the payload's "pictures"/"figures" list.
    figure_index: int
    # Stripped caption text, or None when absent or blank.
    caption: str | None
    # The figure's "self_ref" or "id", when present.
    block_id: str | None = None
    # Cropped image content; _walk_figures in this module never sets it.
    image_bytes: bytes | None = None
    # Extension of the image content, without a leading dot.
    image_ext: str = "png"
@dataclass
class ExtractedPage:
    """Per-page roll-up built by _walk_pages."""

    page_number: int
    # Texts of the page's blocks joined with blank lines.
    text: str
    # True when at least one extracted table is on this page.
    has_tables: bool = False
    # True when at least one extracted figure is on this page.
    has_figures: bool = False
    # _walk_pages never sets this; it stays at the default.
    has_handwriting: bool = False
    # _walk_pages never sets this; it stays None.
    ocr_confidence: float | None = None
@dataclass
class ExtractionResult:
    """Everything ``extract`` returns for one PDF."""

    # Whole-document Markdown from _safe_export_markdown.
    markdown: str
    # Dict export of the Docling document ({} when no exporter worked).
    json_payload: dict[str, Any]
    # Text items, in payload order.
    blocks: list[ExtractedBlock]
    # Tables, in payload order.
    tables: list[ExtractedTable]
    # Figures, in payload order.
    figures: list[ExtractedFigure]
    # One entry per known page number, ascending.
    pages: list[ExtractedPage]
def extract(pdf_path: Path) -> ExtractionResult:
    """Convert ``pdf_path`` with Docling and flatten the result.

    Docling is imported inside the function rather than at module import time.
    Docling's own OCR follows ``settings.docling_ocr_enabled``; table
    structure recognition is always on. Cell matching and page-image
    generation are requested on a best-effort basis: failures to set those
    options are ignored.

    Returns:
        Markdown and dict exports plus the blocks, tables, figures and pages
        derived from the dict export.
    """
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    options = PdfPipelineOptions()
    # We let OCRmyPDF do the heavy OCR; Docling OCR is opt-in.
    options.do_ocr = settings.docling_ocr_enabled
    options.do_table_structure = True
    try:
        options.table_structure_options.do_cell_matching = True
    except Exception:  # noqa: BLE001 - older docling versions lack this
        pass
    try:
        options.generate_page_images = True
    except Exception:  # noqa: BLE001
        pass

    pdf_format = PdfFormatOption(pipeline_options=options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    logger.info("docling.start", input=str(pdf_path))
    document = converter.convert(str(pdf_path)).document

    markdown_text = _safe_export_markdown(document)
    payload = _safe_export_dict(document)
    text_blocks = _walk_blocks(payload)
    table_items = _walk_tables(document, payload)
    figure_items = _walk_figures(document, payload)
    page_items = _walk_pages(payload, text_blocks, table_items, figure_items)

    result = ExtractionResult(
        markdown=markdown_text,
        json_payload=payload,
        blocks=text_blocks,
        tables=table_items,
        figures=figure_items,
        pages=page_items,
    )
    logger.info(
        "docling.done",
        pages=len(result.pages),
        blocks=len(result.blocks),
        tables=len(result.tables),
        figures=len(result.figures),
    )
    return result
# ---------------- Internal helpers ----------------
def _safe_export_markdown(doc: Any) -> str:
for attr in ("export_to_markdown", "to_markdown"):
fn = getattr(doc, attr, None)
if callable(fn):
try:
return fn()
except Exception: # noqa: BLE001
continue
return str(doc)
def _safe_export_dict(doc: Any) -> dict[str, Any]:
for attr in ("export_to_dict", "model_dump", "dict"):
fn = getattr(doc, attr, None)
if callable(fn):
try:
data = fn()
if isinstance(data, dict):
return data
except Exception: # noqa: BLE001
continue
# Last resort: serialize via JSON round-trip
try:
return json.loads(getattr(doc, "model_dump_json", lambda: "{}")())
except Exception: # noqa: BLE001
return {}
# Maps item labels (lower-cased by _walk_blocks) to the block types emitted
# by this module. Labels missing from this table are treated as "paragraph".
# Underscore and hyphen spellings of the same label are both listed.
# NOTE(review): "page_header" and "header" map to "heading"; if these are
# running page headers, confirm that treating them as headings is intended.
_DOCLING_LABEL_TO_BLOCK = {
    "title": "title",
    "section_header": "heading",
    "section-header": "heading",
    "subtitle": "heading",
    "page_header": "heading",
    "header": "heading",
    "list_item": "list",
    "list-item": "list",
    "list": "list",
    "paragraph": "paragraph",
    "text": "paragraph",
    "caption": "figure_caption",
    "figure": "figure_caption",
    "table": "table",
    "footnote": "paragraph",
}
def _walk_blocks(payload: dict[str, Any]) -> list[ExtractedBlock]:
"""Flatten Docling's text items into ordered blocks per page."""
blocks: list[ExtractedBlock] = []
items = (
payload.get("texts")
or payload.get("text_items")
or payload.get("body", {}).get("text_items", [])
or []
)
if not isinstance(items, list):
return blocks
for item in items:
if not isinstance(item, dict):
continue
label = (item.get("label") or item.get("category") or "paragraph").lower()
text = (item.get("text") or "").strip()
if not text:
continue
block_type = _DOCLING_LABEL_TO_BLOCK.get(label, "paragraph")
page = _page_of(item)
blocks.append(
ExtractedBlock(
page_number=page,
block_type=block_type,
text=text,
block_id=item.get("self_ref") or item.get("id"),
extra={"label": label},
)
)
return blocks
def _walk_tables(doc: Any, payload: dict[str, Any]) -> list[ExtractedTable]:
tables: list[ExtractedTable] = []
raw_tables = payload.get("tables") or []
for idx, t in enumerate(raw_tables):
if not isinstance(t, dict):
continue
page = _page_of(t)
md = _table_markdown(doc, t, idx)
csv_text = _table_csv(t)
tables.append(
ExtractedTable(
page_number=page,
table_index=idx,
markdown=md,
csv_text=csv_text,
json_data=t,
block_id=t.get("self_ref") or t.get("id"),
)
)
return tables
def _walk_figures(doc: Any, payload: dict[str, Any]) -> list[ExtractedFigure]:
figures: list[ExtractedFigure] = []
raw_figures = payload.get("pictures") or payload.get("figures") or []
for idx, f in enumerate(raw_figures):
if not isinstance(f, dict):
continue
page = _page_of(f)
caption = (f.get("caption") or "").strip() or None
figures.append(
ExtractedFigure(
page_number=page,
figure_index=idx,
caption=caption,
block_id=f.get("self_ref") or f.get("id"),
)
)
return figures
def _walk_pages(
    payload: dict[str, Any],
    blocks: list[ExtractedBlock],
    tables: list[ExtractedTable],
    figures: list[ExtractedFigure],
) -> list[ExtractedPage]:
    """Derive the page list from payload metadata and the extracted items.

    Page numbers are collected from ``payload["pages"]`` (dict keys that
    parse as integers, or list entries with an integer ``page_no`` / ``page``
    / ``number``) and from every block, table and figure. Page 0 is
    discarded. When nothing is left, a single page 1 is assumed.

    Returns:
        Pages in ascending order. Each carries its block texts joined by
        blank lines and the table/figure presence flags.
    """
    meta = payload.get("pages") or {}
    numbers: set[int] = set()
    if isinstance(meta, dict):
        for key in meta:
            try:
                numbers.add(int(key))
            except (ValueError, TypeError):
                continue
    elif isinstance(meta, list):
        for entry in meta:
            if not isinstance(entry, dict):
                continue
            candidate = entry.get("page_no") or entry.get("page") or entry.get("number")
            if isinstance(candidate, int):
                numbers.add(candidate)

    table_pages = {t.page_number for t in tables}
    figure_pages = {f.page_number for f in figures}
    numbers.update(b.page_number for b in blocks)
    numbers |= table_pages
    numbers |= figure_pages

    numbers.discard(0)
    if not numbers:
        numbers = {1}

    texts_by_page: dict[int, list[str]] = {number: [] for number in numbers}
    for b in blocks:
        texts_by_page.setdefault(b.page_number, []).append(b.text)

    pages: list[ExtractedPage] = []
    for number in sorted(numbers):
        pages.append(
            ExtractedPage(
                page_number=number,
                text="\n\n".join(texts_by_page.get(number, [])),
                has_tables=number in table_pages,
                has_figures=number in figure_pages,
            )
        )
    return pages
def _page_of(item: dict[str, Any]) -> int:
prov = item.get("prov") or item.get("provenance")
if isinstance(prov, list) and prov:
first = prov[0]
if isinstance(first, dict):
pn = first.get("page_no") or first.get("page") or first.get("page_number")
if isinstance(pn, int):
return pn
pn = item.get("page_no") or item.get("page") or item.get("page_number")
if isinstance(pn, int):
return pn
return 1
def _table_markdown(doc: Any, raw: dict[str, Any], idx: int) -> str:
# Try Docling's own export first (per-table).
try:
export = getattr(doc, "export_table_to_markdown", None)
if callable(export):
return export(idx)
except Exception: # noqa: BLE001
pass
grid = raw.get("data") or raw.get("table_cells") or raw.get("grid")
if isinstance(grid, list) and grid and isinstance(grid[0], list):
return _grid_to_markdown(grid)
cells = raw.get("table_cells")
if isinstance(cells, list):
return _cells_to_markdown(cells)
return ""
def _grid_to_markdown(grid: list[list[Any]]) -> str:
if not grid:
return ""
def _cell(c: Any) -> str:
if isinstance(c, dict):
return str(c.get("text") or c.get("value") or "").replace("|", "\\|").strip()
return str(c).replace("|", "\\|").strip()
header = grid[0]
body = grid[1:] if len(grid) > 1 else []
cols = len(header)
out = ["| " + " | ".join(_cell(c) for c in header) + " |"]
out.append("| " + " | ".join(["---"] * cols) + " |")
for row in body:
cells = [_cell(c) for c in row]
if len(cells) < cols:
cells += [""] * (cols - len(cells))
out.append("| " + " | ".join(cells[:cols]) + " |")
return "\n".join(out)
def _cells_to_markdown(cells: list[Any]) -> str:
rows: dict[int, dict[int, str]] = {}
for c in cells:
if not isinstance(c, dict):
continue
r = c.get("start_row_offset_idx", c.get("row", 0)) or 0
col = c.get("start_col_offset_idx", c.get("col", 0)) or 0
rows.setdefault(r, {})[col] = (c.get("text") or "").replace("|", "\\|").strip()
if not rows:
return ""
max_col = max((max(r.keys()) for r in rows.values()), default=0)
grid = []
for r_idx in sorted(rows):
row = [rows[r_idx].get(c, "") for c in range(max_col + 1)]
grid.append(row)
return _grid_to_markdown(grid)
def _table_csv(raw: dict[str, Any]) -> str | None:
grid = raw.get("data") or raw.get("grid")
if not (isinstance(grid, list) and grid and isinstance(grid[0], list)):
return None
import csv
import io
buf = io.StringIO()
writer = csv.writer(buf)
for row in grid:
writer.writerow([
(c.get("text") if isinstance(c, dict) else c) or "" for c in row
])
return buf.getvalue()

View File

@@ -0,0 +1,78 @@
"""Persists Docling figures to PostgreSQL + MinIO (caption + optional crop)."""
from __future__ import annotations
import uuid
from sqlalchemy import select
from app.db.models import ArtifactType, DocumentArtifact, Figure
from app.ingestion.docling_extractor import ExtractedFigure
from app.logging_config import get_logger
from app.storage.local_paths import key_figure_crop
from app.storage.minio_client import MinioStorage
logger = get_logger(__name__)
def persist_figures(
    db,
    storage: MinioStorage,
    document_id: uuid.UUID,
    figures: list[ExtractedFigure],
    page_id_by_number: dict[int, uuid.UUID],
) -> int:
    """Upsert one ``Figure`` row per extracted figure and upload any image crop.

    Rows are matched on ``(document_id, figure_index)``. For an existing row
    the page reference is refreshed along with caption and description, so a
    re-run cannot leave a row pointing at one page while its description
    names another. When a figure carries ``image_bytes``, they are stored in
    the derived bucket and registered as a ``FIGURE_CROP`` artifact.

    Args:
        db: Session used for the lookups and inserts (not committed here).
        storage: Object storage wrapper providing ``derived_bucket`` and
            ``put_bytes``.
        document_id: Owning document.
        figures: Figures to persist.
        page_id_by_number: Maps page number to page row id. Pages missing
            from the map leave ``page_id`` as ``None``.

    Returns:
        The number of figures processed.
    """
    count = 0
    for f in figures:
        page_id = page_id_by_number.get(f.page_number)
        existing = db.execute(
            select(Figure).where(Figure.document_id == document_id, Figure.figure_index == f.figure_index)
        ).scalar_one_or_none()
        if existing is None:
            existing = Figure(
                document_id=document_id,
                page_id=page_id,
                page_number=f.page_number,
                figure_index=f.figure_index,
            )
            db.add(existing)
        else:
            # Keep the page reference in step with the rebuilt description.
            existing.page_id = page_id
            existing.page_number = f.page_number
        existing.caption = f.caption
        existing.description = (
            f"Figure detected on page {f.page_number}." if not f.caption else
            f"Figure on page {f.page_number}. Caption: {f.caption}"
        )
        if f.image_bytes:
            key = key_figure_crop(document_id, f.page_number, f.figure_index)
            storage.put_bytes(
                bucket=storage.derived_bucket,
                key=key,
                data=f.image_bytes,
                content_type=f"image/{f.image_ext}",
            )
            existing.storage_bucket = storage.derived_bucket
            existing.storage_key = key
            _ensure_artifact(db, document_id, ArtifactType.FIGURE_CROP, storage.derived_bucket, key, f.page_number)
        count += 1
    return count
def _ensure_artifact(db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, page: int | None) -> None:
    """Register a ``DocumentArtifact`` unless one with this storage key already exists.

    Existence is checked on ``(document_id, storage_key)`` only. An existing
    row is left untouched even if its type, bucket or page differ.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    if db.execute(lookup).scalar_one_or_none():
        return

    artifact = DocumentArtifact(
        document_id=document_id,
        artifact_type=artifact_type,
        storage_bucket=bucket,
        storage_key=key,
        page_number=page,
    )
    db.add(artifact)

View File

@@ -0,0 +1,12 @@
"""Block-level normalization wrappers around utils.text_cleaning."""
from __future__ import annotations
from app.utils.text_cleaning import clean_ocr_text, normalize_for_search
def normalize_block(text: str) -> tuple[str, str]:
    """Clean ``text`` for display and derive its search form.

    Returns:
        ``(display_text, normalized_text)``. The search form is computed from
        the cleaned display text, not from the raw input.
    """
    display_text = clean_ocr_text(text)
    return display_text, normalize_for_search(display_text)

87
app/ingestion/ocr.py Normal file
View File

@@ -0,0 +1,87 @@
"""OCRmyPDF integration with Tesseract.
We treat OCR as best-effort: if the input PDF already has a text layer (or OCR is
disabled by config), we skip OCR and use the original PDF. On failure, the
caller is expected to mark the document ``OCR_FAILED`` and continue without it.
"""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
import ocrmypdf
from app.config import settings
from app.logging_config import get_logger
from app.utils.pdf import has_searchable_text
logger = get_logger(__name__)
@dataclass
class OcrResult:
    """Outcome of ``run_ocr``."""

    # The PDF to use downstream (the ``output_pdf`` passed to run_ocr).
    output_path: Path
    # True when OCR was not run and the input was copied instead.
    skipped: bool
    # "ocr_completed", or the skip reason ("ocr_disabled", "already_searchable",
    # "prior_ocr_found", "digitally_signed").
    reason: str
    # Language string that was (or would have been) passed to OCRmyPDF.
    languages: str
def run_ocr(input_pdf: Path, output_pdf: Path, languages: str | None = None) -> OcrResult:
    """Produce ``output_pdf`` from ``input_pdf``, running OCRmyPDF only when needed.

    Skip paths (each still materializes ``output_pdf`` through ``_skip``):
    - ``settings.ocr_enabled`` is false -> reason ``ocr_disabled``;
    - ``has_searchable_text(input_pdf)`` is true -> ``already_searchable``;
    - OCRmyPDF raises ``PriorOcrFoundError`` -> ``prior_ocr_found``;
    - OCRmyPDF raises ``DigitalSignatureError`` -> ``digitally_signed``.

    ``EncryptedPdfError`` and ``MissingDependencyError`` are logged and
    re-raised. Any other exception propagates without being logged here, and
    the caller is expected to handle the status update.

    Args:
        input_pdf: Source PDF.
        output_pdf: Destination path; its parent directory is created.
        languages: Language string for OCR, defaulting to
            ``settings.ocr_languages``.
    """
    langs = languages or settings.ocr_languages

    if not settings.ocr_enabled:
        skip_reason = "ocr_disabled"
    elif has_searchable_text(input_pdf):
        skip_reason = "already_searchable"
    else:
        skip_reason = None
    if skip_reason is not None:
        return _skip(input_pdf, output_pdf, langs, skip_reason)

    output_pdf.parent.mkdir(parents=True, exist_ok=True)
    logger.info("ocr.start", input=str(input_pdf), output=str(output_pdf), languages=langs)
    try:
        ocr_options = {
            "input_file": str(input_pdf),
            "output_file": str(output_pdf),
            "language": langs,
            # With all three modes off, a PDF that already has text makes
            # OCRmyPDF raise PriorOcrFoundError, which is handled below.
            "skip_text": False,
            "redo_ocr": False,
            "force_ocr": False,
            "deskew": settings.ocr_deskew,
            "clean": settings.ocr_clean,
            "optimize": settings.ocr_optimize,
            "progress_bar": False,
            "jobs": 1,
            "output_type": "pdf",
            # NOTE(review): per the OCRmyPDF docs, skip_big is a megapixel
            # threshold above which pages are not OCR'd - confirm 200 is intended.
            "skip_big": 200.0,
        }
        ocrmypdf.ocr(**ocr_options)
    except ocrmypdf.exceptions.PriorOcrFoundError:
        logger.info("ocr.skip.prior_ocr", input=str(input_pdf))
        return _skip(input_pdf, output_pdf, langs, "prior_ocr_found")
    except ocrmypdf.exceptions.DigitalSignatureError:
        logger.warning("ocr.skip.signed_pdf", input=str(input_pdf))
        return _skip(input_pdf, output_pdf, langs, "digitally_signed")
    except ocrmypdf.exceptions.EncryptedPdfError as exc:
        logger.warning("ocr.encrypted", input=str(input_pdf), error=str(exc))
        raise
    except ocrmypdf.exceptions.MissingDependencyError as exc:
        logger.error("ocr.missing_dependency", error=str(exc))
        raise

    logger.info("ocr.done", output=str(output_pdf))
    return OcrResult(output_path=output_pdf, skipped=False, reason="ocr_completed", languages=langs)
def _skip(input_pdf: Path, output_pdf: Path, langs: str, reason: str) -> OcrResult:
    """Copy ``input_pdf`` to ``output_pdf`` and report a skipped OCR run.

    The copy is streamed with ``shutil.copyfile`` instead of loading the
    whole PDF into memory. Nothing is copied when ``output_pdf`` already
    resolves to the same path as ``input_pdf``, or when both names refer to
    the same file (e.g. a hard link).

    Returns:
        An ``OcrResult`` with ``skipped=True`` and the given ``reason``.
    """
    import shutil

    output_pdf.parent.mkdir(parents=True, exist_ok=True)
    if not output_pdf.exists() or output_pdf.resolve() != input_pdf.resolve():
        try:
            shutil.copyfile(input_pdf, output_pdf)
        except shutil.SameFileError:
            # Same underlying file under two names: it already has the content.
            pass
    return OcrResult(output_path=output_pdf, skipped=True, reason=reason, languages=langs)

384
app/ingestion/pipeline.py Normal file
View File

@@ -0,0 +1,384 @@
"""Per-document end-to-end pipeline: OCR -> Docling -> chunk -> persist -> index.
Called by the Celery worker. Idempotent: re-running on the same document deletes
existing chunks for that document and re-creates them, then re-indexes in
OpenSearch and Qdrant.
"""
from __future__ import annotations
import json
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from sqlalchemy import delete, select
from app.config import settings
from app.db.models import (
ArtifactType,
Chunk,
Document,
DocumentArtifact,
DocumentStatus,
Page,
ProcessingEvent,
)
from app.db.session import session_scope
from app.indexing import opensearch_client, qdrant_client
from app.indexing.embeddings import get_embedder
from app.ingestion.chunker import ChunkRecord, chunk_extraction
from app.ingestion.docling_extractor import ExtractionResult, extract
from app.ingestion.figure_processor import persist_figures
from app.ingestion.ocr import run_ocr
from app.ingestion.table_processor import persist_tables
from app.logging_config import get_logger
from app.storage.local_paths import (
key_docling_json,
key_markdown,
key_ocr_pdf,
work_dir_for,
)
from app.storage.minio_client import get_storage
from app.utils.language import detect_language
logger = get_logger(__name__)
def process_document_id(document_id: uuid.UUID, run_id: uuid.UUID | None = None) -> dict[str, Any]:
    """Run the full ingestion pipeline for one document (Celery task entry point).

    Any exception that escapes a stage is logged and turned into a ``FAILED``
    status through ``_fail``, so the document is not left in an intermediate
    state. This includes the stages without their own handler: object-storage
    uploads, chunking and persistence.

    Args:
        document_id: Document to process.
        run_id: Optional ingestion run recorded on the emitted events.

    Returns:
        ``{"status": "missing"}`` when the document row does not exist, the
        result of ``_fail`` on failure, otherwise the final status plus the
        number of chunks.
    """
    try:
        return _run_pipeline_stages(document_id, run_id)
    except Exception as exc:  # noqa: BLE001 - task boundary
        logger.exception("pipeline.unhandled_error", document_id=str(document_id))
        return _fail(document_id, run_id, DocumentStatus.FAILED, f"Pipeline failed: {exc}")


def _run_pipeline_stages(document_id: uuid.UUID, run_id: uuid.UUID | None) -> dict[str, Any]:
    """Execute OCR -> Docling -> chunk -> persist -> index for one document."""
    storage = get_storage()
    storage.ensure_buckets()

    # Read everything needed from the ORM rows while the session is open.
    with session_scope() as db:
        doc = db.get(Document, document_id)
        if doc is None:
            logger.warning("pipeline.document_missing", document_id=str(document_id))
            return {"status": "missing"}
        source_path = Path(doc.source_path)
        sha = doc.sha256
        original_artifact = db.execute(
            select(DocumentArtifact).where(
                DocumentArtifact.document_id == doc.id,
                DocumentArtifact.artifact_type == ArtifactType.ORIGINAL_PDF,
            )
        ).scalar_one_or_none()
        original_location = (
            (original_artifact.storage_bucket, original_artifact.storage_key)
            if original_artifact
            else None
        )

    work_dir = work_dir_for(document_id)
    local_pdf = work_dir / f"{sha}.pdf"
    if not local_pdf.exists():
        if source_path.exists():
            local_pdf.write_bytes(source_path.read_bytes())
        elif original_location is not None:
            storage.get_to_path(original_location[0], original_location[1], local_pdf)
        else:
            return _fail(
                document_id,
                run_id,
                DocumentStatus.OCR_FAILED,
                "Original PDF not available locally or in MinIO",
            )

    # ---------------- OCR ----------------
    ocr_pdf = work_dir / "ocr.pdf"
    try:
        _emit_event(document_id, run_id, DocumentStatus.OCR_STARTED, "OCR started")
        ocr_result = run_ocr(local_pdf, ocr_pdf, languages=settings.ocr_languages)
    except Exception as exc:  # noqa: BLE001
        logger.exception("pipeline.ocr_failed", document_id=str(document_id))
        return _fail(document_id, run_id, DocumentStatus.OCR_FAILED, f"OCR failed: {exc}")

    # Upload OCR PDF (even if we 'skipped' it - OCR PDF is the canonical input to Docling).
    ocr_key = key_ocr_pdf(document_id)
    storage.put_file(
        bucket=storage.derived_bucket,
        key=ocr_key,
        path=ocr_result.output_path,
        content_type="application/pdf",
    )
    with session_scope() as db:
        _ensure_artifact(db, document_id, ArtifactType.OCR_PDF, storage.derived_bucket, ocr_key)
        doc = db.get(Document, document_id)
        if doc is not None:
            doc.status = DocumentStatus.OCR_COMPLETED
        db.add(
            ProcessingEvent(
                run_id=run_id,
                document_id=document_id,
                stage=DocumentStatus.OCR_COMPLETED,
                level="INFO",
                message=f"OCR finished ({ocr_result.reason})",
                data={"skipped": ocr_result.skipped, "languages": ocr_result.languages},
            )
        )

    # ---------------- Docling ----------------
    try:
        _emit_event(document_id, run_id, DocumentStatus.EXTRACTION_STARTED, "Docling extraction started")
        extraction = extract(ocr_result.output_path)
    except Exception as exc:  # noqa: BLE001
        logger.exception("pipeline.docling_failed", document_id=str(document_id))
        return _fail(document_id, run_id, DocumentStatus.EXTRACTION_FAILED, f"Docling failed: {exc}")

    # Persist Markdown + JSON to MinIO.
    md_key = key_markdown(document_id)
    json_key = key_docling_json(document_id)
    storage.put_bytes(
        bucket=storage.derived_bucket,
        key=md_key,
        data=extraction.markdown.encode("utf-8"),
        content_type="text/markdown",
    )
    storage.put_bytes(
        bucket=storage.derived_bucket,
        key=json_key,
        data=json.dumps(extraction.json_payload, ensure_ascii=False).encode("utf-8"),
        content_type="application/json",
    )

    # ---------------- Persist pages, chunks, tables, figures ----------------
    chunk_records = chunk_extraction(extraction)
    # The language is detected from the text of the first three pages only.
    sample_text = "\n".join(p.text for p in extraction.pages[:3] if p.text)
    lang = detect_language(sample_text)

    with session_scope() as db:
        _ensure_artifact(db, document_id, ArtifactType.MARKDOWN, storage.derived_bucket, md_key)
        _ensure_artifact(db, document_id, ArtifactType.DOCLING_JSON, storage.derived_bucket, json_key)
        doc = db.get(Document, document_id)
        if doc is None:
            return {"status": "missing"}
        doc.status = DocumentStatus.EXTRACTION_COMPLETED
        if lang and not doc.language_hint:
            doc.language_hint = lang

        page_id_by_number = _upsert_pages(db, document_id, extraction)
        persist_tables(db, storage, document_id, extraction.tables, page_id_by_number)
        persist_figures(db, storage, document_id, extraction.figures, page_id_by_number)

        # Replace chunks idempotently: drop all and re-insert.
        db.execute(delete(Chunk).where(Chunk.document_id == document_id))
        for cr in chunk_records:
            db.add(_to_chunk_row(document_id, page_id_by_number, cr))
        doc.status = DocumentStatus.CHUNKING_COMPLETED
        db.add(
            ProcessingEvent(
                run_id=run_id,
                document_id=document_id,
                stage=DocumentStatus.CHUNKING_COMPLETED,
                level="INFO",
                message="Chunking complete",
                data={"chunks": len(chunk_records)},
            )
        )

    # ---------------- Indexing (OpenSearch + Qdrant) ----------------
    try:
        opensearch_client.ensure_index()
        qdrant_client.ensure_collection()
        # Remove this document's previous entries before re-indexing.
        opensearch_client.delete_by_document(str(document_id))
        qdrant_client.delete_by_document(str(document_id))
        os_docs, qdrant_points = _build_index_payloads(document_id, chunk_records, extraction, lang)
        if os_docs:
            opensearch_client.index_chunks(os_docs)
        if qdrant_points:
            embedder = get_embedder()
            texts_to_embed = [text for _, text, _ in qdrant_points]
            vectors = embedder.encode(texts_to_embed)
            triples = [
                (chunk_id, vec, payload)
                for (chunk_id, _text, payload), vec in zip(qdrant_points, vectors, strict=True)
            ]
            qdrant_client.upsert_chunks(triples)
    except Exception as exc:  # noqa: BLE001
        logger.exception("pipeline.indexing_failed", document_id=str(document_id))
        return _fail(document_id, run_id, DocumentStatus.FAILED, f"Indexing failed: {exc}")

    with session_scope() as db:
        doc = db.get(Document, document_id)
        if doc is not None:
            doc.status = DocumentStatus.INDEXING_COMPLETED
            doc.error_message = None
        db.add(
            ProcessingEvent(
                run_id=run_id,
                document_id=document_id,
                stage=DocumentStatus.INDEXING_COMPLETED,
                level="INFO",
                message="Indexing complete",
                data={"chunks": len(chunk_records)},
            )
        )
    return {"status": DocumentStatus.INDEXING_COMPLETED, "chunks": len(chunk_records)}
# ---------------- helpers ----------------
def _to_chunk_row(
    document_id: uuid.UUID, page_id_by_number: dict[int, uuid.UUID], cr: ChunkRecord
) -> Chunk:
    """Map a ``ChunkRecord`` onto a new, unsaved ``Chunk`` row.

    ``page_id`` is looked up by the record's page number and is ``None`` when
    the page is unknown. ``ocr_confidence`` is always stored as ``None``.
    """
    resolved_page_id = page_id_by_number.get(cr.page_number)
    row = Chunk(
        document_id=document_id,
        page_id=resolved_page_id,
        page_number=cr.page_number,
        block_id=cr.block_id,
        chunk_index=cr.chunk_index,
        block_type=cr.block_type,
        text=cr.text,
        normalized_text=cr.normalized_text,
        token_count=cr.token_count,
        ocr_confidence=None,
        quality_flags=cr.quality_flags,
        chunk_metadata=cr.metadata,
    )
    return row
def _upsert_pages(db, document_id: uuid.UUID, extraction: ExtractionResult) -> dict[int, uuid.UUID]:
    """Insert or update one ``Page`` row per extracted page.

    Existing rows are matched by page number and get their text and flags
    refreshed. ``ocr_confidence`` is refreshed only when the extraction
    provides a value, so a stored confidence is never overwritten with
    ``None``. New rows are added and flushed once at the end so their ids
    are available. A page number that appears twice in the extraction reuses
    the row created for its first occurrence. Rows for pages no longer
    present in the extraction are left in place.

    Returns:
        Mapping from page number to page row id.
    """
    known = {
        p.page_number: p
        for p in db.execute(select(Page).where(Page.document_id == document_id)).scalars()
    }
    touched: dict[int, Page] = {}
    created_any = False
    for ep in extraction.pages:
        page = known.get(ep.page_number)
        if page is None:
            page = Page(
                document_id=document_id,
                page_number=ep.page_number,
                text=ep.text,
                ocr_confidence=ep.ocr_confidence,
                has_tables=ep.has_tables,
                has_figures=ep.has_figures,
                has_handwriting=ep.has_handwriting,
            )
            db.add(page)
            known[ep.page_number] = page
            created_any = True
        else:
            page.text = ep.text
            if ep.ocr_confidence is not None:
                page.ocr_confidence = ep.ocr_confidence
            page.has_tables = ep.has_tables
            page.has_figures = ep.has_figures
            page.has_handwriting = ep.has_handwriting
        touched[ep.page_number] = page

    if created_any:
        # One flush assigns ids to every new row.
        db.flush()
    return {number: page.id for number, page in touched.items()}
def _build_index_payloads(
    document_id: uuid.UUID,
    chunks: list[ChunkRecord],
    extraction: ExtractionResult,
    language_hint: str | None,
) -> tuple[list[dict[str, Any]], list[tuple[str, str, dict[str, Any]]]]:
    """Build the two index payload lists for every stored chunk of a document.

    The chunk rows are re-read from the database; ``chunks`` and ``extraction``
    are accepted but not read by this function.

    Returns:
        A pair ``(os_docs, qdrant)``. ``os_docs`` holds one full document dict
        per chunk. ``qdrant`` holds ``(chunk_id, text, payload)`` triples whose
        payload carries a 512-character ``text_preview`` instead of the full
        text. Both lists are empty when the document row does not exist.
    """
    os_docs: list[dict[str, Any]] = []
    qdrant: list[tuple[str, str, dict[str, Any]]] = []

    with session_scope() as db:
        doc = db.get(Document, document_id)
        if doc is None:
            return [], []

        doc_key = str(document_id)
        # Document-level fields repeated on every payload.
        doc_fields = {
            "document_id": doc_key,
            "source_path": doc.source_path,
            "original_file_name": doc.original_file_name,
        }
        stored_chunks = (
            db.execute(select(Chunk).where(Chunk.document_id == document_id))
            .scalars()
            .all()
        )

        for row in stored_chunks:
            chunk_id = str(row.id)
            text = row.text or ""
            # Citation fields shared by both payload shapes, in the original key order.
            located = {
                **doc_fields,
                "page_number": row.page_number,
                "block_type": row.block_type,
                "block_id": row.block_id,
            }
            # Fall back to the current UTC time when the row has no creation timestamp.
            created = row.created_at or datetime.now(tz=timezone.utc)
            os_docs.append(
                {
                    "chunk_id": chunk_id,
                    **located,
                    "text": text,
                    "normalized_text": row.normalized_text,
                    "ocr_confidence": row.ocr_confidence,
                    "language_hint": language_hint,
                    "metadata": row.chunk_metadata or {},
                    "quality_flags": row.quality_flags or {},
                    "created_at": created.isoformat(),
                }
            )
            vector_payload = {
                **located,
                "text_preview": text[:512],
                "ocr_confidence": row.ocr_confidence,
                "quality_flags": row.quality_flags or {},
                "metadata": row.chunk_metadata or {},
            }
            qdrant.append((chunk_id, text, vector_payload))

    return os_docs, qdrant
def _ensure_artifact(db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str) -> None:
    """Add a ``DocumentArtifact`` row unless the document already has one for ``key``.

    The lookup matches on document id and storage key only; ``artifact_type``
    is not part of the match.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    already_recorded = db.execute(lookup).scalar_one_or_none()
    if not already_recorded:
        artifact = DocumentArtifact(
            document_id=document_id,
            artifact_type=artifact_type,
            storage_bucket=bucket,
            storage_key=key,
        )
        db.add(artifact)
def _emit_event(document_id: uuid.UUID, run_id: uuid.UUID | None, stage: str, message: str) -> None:
    """Record an INFO-level ``ProcessingEvent`` with empty data in its own session scope."""
    event = ProcessingEvent(
        run_id=run_id,
        document_id=document_id,
        stage=stage,
        level="INFO",
        message=message,
        data={},
    )
    with session_scope() as db:
        db.add(event)
def _fail(
    document_id: uuid.UUID, run_id: uuid.UUID | None, stage: str, message: str
) -> dict[str, Any]:
    """Record a pipeline failure and return the failure result dict.

    Sets the document's status to ``stage`` and stores the message (cut to 2000
    characters) when the document row exists, adds an ERROR-level
    ``ProcessingEvent`` carrying the full message, and logs the failure.

    Returns:
        ``{"status": stage, "error": message}``.
    """
    failure_event = ProcessingEvent(
        run_id=run_id,
        document_id=document_id,
        stage=stage,
        level="ERROR",
        message=message,
        data={},
    )
    with session_scope() as db:
        doc = db.get(Document, document_id)
        if doc is not None:
            doc.status = stage
            # Only the stored column value is truncated; event and log keep the full text.
            doc.error_message = message[:2000]
        db.add(failure_event)

    logger.error("pipeline.failed", document_id=str(document_id), stage=stage, message=message)
    result: dict[str, Any] = {"status": stage, "error": message}
    return result

41
app/ingestion/quality.py Normal file
View File

@@ -0,0 +1,41 @@
"""Quality flag computation for chunks."""
from __future__ import annotations
from typing import Any
from app.utils.text_cleaning import looks_garbled
# An OCR confidence strictly below this value sets the ``low_ocr_confidence`` flag.
LOW_OCR_CONFIDENCE_THRESHOLD = 0.6
# Non-empty text whose stripped length (in characters) is below this value sets
# the ``very_short_text`` flag.
SHORT_TEXT_THRESHOLD = 24
def compute_quality_flags(
    *,
    text: str,
    block_type: str,
    ocr_confidence: float | None,
    has_handwriting: bool = False,
) -> dict[str, Any]:
    """Compute the quality-flag dict for one chunk of text.

    Args:
        text: Chunk text to inspect.
        block_type: Structural type of the chunk; drives the table, figure and
            handwriting flags.
        ocr_confidence: OCR confidence, or ``None`` when unknown (never flagged
            as low in that case).
        has_handwriting: Externally supplied handwriting indicator.

    Returns:
        Dict of flags. ``needs_manual_review`` is set when the OCR confidence is
        low, the text looks garbled, or handwriting was detected.
    """
    low_confidence = (
        ocr_confidence is not None and ocr_confidence < LOW_OCR_CONFIDENCE_THRESHOLD
    )
    # Empty text is deliberately not counted as "very short"; only non-empty
    # text is measured after stripping.
    very_short = bool(text) and len(text.strip()) < SHORT_TEXT_THRESHOLD
    garbled = bool(looks_garbled(text))
    handwriting = has_handwriting or block_type == "handwriting"

    return {
        "low_ocr_confidence": low_confidence,
        "very_short_text": very_short,
        "possible_garbled_text": garbled,
        "table_detected": block_type == "table",
        "figure_detected": block_type in ("figure_caption", "figure_description"),
        "handwriting_detected": handwriting,
        "needs_manual_review": bool(low_confidence or garbled or handwriting),
    }

184
app/ingestion/scanner.py Normal file
View File

@@ -0,0 +1,184 @@
"""Folder scanner: discovers PDFs, deduplicates by SHA256, persists discovery rows.
The scanner does NOT trigger OCR or extraction. It only:
- enumerates PDF files,
- hashes each file,
- creates / reuses a ``Document`` row,
- uploads the original PDF to MinIO,
- emits ``DISCOVERED`` / ``STORED_ORIGINAL`` events.
Heavy work (OCR, Docling, indexing) is performed by the Celery worker pipeline.
"""
from __future__ import annotations
import os
import uuid
from collections.abc import Iterator
from dataclasses import dataclass
from pathlib import Path
from sqlalchemy import select
from app.db.models import (
ArtifactType,
Document,
DocumentArtifact,
DocumentStatus,
ProcessingEvent,
)
from app.db.session import session_scope
from app.logging_config import get_logger
from app.storage.local_paths import key_original_pdf
from app.storage.minio_client import get_storage
from app.utils.hashing import sha256_file
from app.utils.pdf import is_pdf
logger = get_logger(__name__)
@dataclass
class DiscoveryRecord:
    """Outcome of scanning a single candidate file."""

    # Path of the scanned file.
    path: Path
    # Hex SHA-256 of the file; None when the file could not be stat'ed or hashed.
    sha256: str | None
    # Id of the matching or newly created Document row; None for invalid records.
    document_id: uuid.UUID | None
    # True when a Document with the same SHA-256 already existed.
    duplicate: bool
    # True when the file could not be read/hashed or its original could not be stored.
    invalid: bool = False
def iter_pdf_files(root: Path, recursive: bool = True) -> Iterator[Path]:
    """Lazily yield every path under ``root`` that ``is_pdf`` accepts.

    Args:
        root: A single file or a directory.
        recursive: When ``root`` is a directory, walk the whole tree with
            ``os.walk`` if true; otherwise inspect only its direct entries.
    """
    if root.is_file():
        # A single file is its own (only) candidate.
        candidates = iter((root,))
    elif recursive:
        candidates = (
            Path(folder) / file_name
            for folder, _subdirs, file_names in os.walk(root)
            for file_name in file_names
        )
    else:
        candidates = root.iterdir()

    yield from (candidate for candidate in candidates if is_pdf(candidate))
def discover_documents(
    root: Path, recursive: bool = True, force: bool = False
) -> Iterator[DiscoveryRecord]:
    """Scan ``root`` for PDFs and yield one ``DiscoveryRecord`` per file.

    For each file the scanner hashes it, reuses or creates the ``Document`` row
    with that SHA-256, uploads the original to object storage if it is not
    there yet, and records the matching artifact and processing events.

    Every record is yielded only after the database session scope used for
    that file has exited. A generator suspended at a ``yield`` inside a ``with``
    block keeps the scope open until it is resumed, and closing the generator
    there raises ``GeneratorExit`` inside the scope; yielding afterwards means
    the consumer never runs while this file's session is still open.

    Args:
        root: File or directory to scan.
        recursive: Whether to descend into sub-directories.
        force: When true, an already known document is processed again instead
            of being reported as a plain duplicate.

    Yields:
        One ``DiscoveryRecord`` per candidate file; ``invalid=True`` when the
        file could not be stat'ed/hashed or the original could not be stored.
    """
    storage = get_storage()
    storage.ensure_buckets()
    for path in iter_pdf_files(root, recursive=recursive):
        try:
            size = path.stat().st_size
            sha = sha256_file(path)
        except Exception as exc:  # noqa: BLE001
            logger.warning("scan.invalid_file", path=str(path), error=str(exc))
            yield DiscoveryRecord(path=path, sha256=None, document_id=None, duplicate=False, invalid=True)
            continue
        # The helper returns once its session scope has exited, so no session
        # is held open across this yield.
        yield _discover_one(storage, path, sha, size, force)


def _discover_one(storage, path: Path, sha: str, size: int, force: bool) -> DiscoveryRecord:
    """Register one hashed PDF inside a single session scope and return its record."""
    with session_scope() as db:
        existing = db.execute(
            select(Document).where(Document.sha256 == sha)
        ).scalar_one_or_none()
        if existing and not force:
            logger.debug("scan.duplicate", path=str(path), sha256=sha, document_id=str(existing.id))
            return DiscoveryRecord(path=path, sha256=sha, document_id=existing.id, duplicate=True)

        doc = existing or Document(
            id=uuid.uuid4(),
            source_path=str(path),
            original_file_name=path.name,
            sha256=sha,
            file_size_bytes=size,
            mime_type="application/pdf",
            status=DocumentStatus.DISCOVERED,
        )
        if not existing:
            db.add(doc)
            db.flush()
        # NOTE(review): the DISCOVERED and STORED_ORIGINAL events are added
        # unconditionally here (also on a forced re-scan) — confirm against the
        # intended nesting.
        db.add(
            ProcessingEvent(
                document_id=doc.id,
                stage=DocumentStatus.DISCOVERED,
                level="INFO",
                message="Document discovered",
                data={"sha256": sha, "size": size, "path": str(path)},
            )
        )

        # Upload original (idempotent) and record artifact if missing.
        key = key_original_pdf(doc.id, sha)
        try:
            if not storage.exists(storage.originals_bucket, key):
                storage.put_file(
                    bucket=storage.originals_bucket,
                    key=key,
                    path=path,
                    content_type="application/pdf",
                    metadata={"sha256": sha, "original-name": path.name[:255]},
                )
            _ensure_artifact(
                db,
                doc.id,
                ArtifactType.ORIGINAL_PDF,
                storage.originals_bucket,
                key,
                sha,
            )
            # Only advance documents that are still at the initial stage.
            if doc.status == DocumentStatus.DISCOVERED:
                doc.status = DocumentStatus.STORED_ORIGINAL
            db.add(
                ProcessingEvent(
                    document_id=doc.id,
                    stage=DocumentStatus.STORED_ORIGINAL,
                    level="INFO",
                    message="Original stored to MinIO",
                    data={"bucket": storage.originals_bucket, "key": key},
                )
            )
        except Exception as exc:  # noqa: BLE001
            logger.error("scan.store_failed", path=str(path), error=str(exc))
            doc.status = DocumentStatus.FAILED
            doc.error_message = f"store_original: {exc}"
            db.add(
                ProcessingEvent(
                    document_id=doc.id,
                    stage="STORE_FAILED",
                    level="ERROR",
                    message=str(exc),
                    data={"path": str(path)},
                )
            )
            return DiscoveryRecord(path=path, sha256=sha, document_id=None, duplicate=False, invalid=True)

        # ``doc.id`` is read while the row is still attached to the session.
        return DiscoveryRecord(
            path=path, sha256=sha, document_id=doc.id, duplicate=bool(existing)
        )
def _ensure_artifact(
    db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, checksum: str | None
) -> None:
    """Add a ``DocumentArtifact`` row unless an identical one is already recorded.

    An artifact counts as already recorded when a row with the same document
    id, artifact type and storage key exists; ``checksum`` is stored on new
    rows but is not part of the match.
    """
    criteria = (
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.artifact_type == artifact_type,
        DocumentArtifact.storage_key == key,
    )
    found = db.execute(select(DocumentArtifact).where(*criteria)).scalar_one_or_none()
    if found:
        return

    new_artifact = DocumentArtifact(
        document_id=document_id,
        artifact_type=artifact_type,
        storage_bucket=bucket,
        storage_key=key,
        checksum=checksum,
    )
    db.add(new_artifact)

View File

@@ -0,0 +1,84 @@
"""Persists Docling tables to PostgreSQL + MinIO."""
from __future__ import annotations
import json
import uuid
from sqlalchemy import select
from app.db.models import ArtifactType, DocumentArtifact, Table
from app.ingestion.docling_extractor import ExtractedTable
from app.logging_config import get_logger
from app.storage.local_paths import key_table_json
from app.storage.minio_client import MinioStorage
logger = get_logger(__name__)
def persist_tables(
    db,
    storage: MinioStorage,
    document_id: uuid.UUID,
    tables: list[ExtractedTable],
    page_id_by_number: dict[int, uuid.UUID],
) -> int:
    """Upsert one ``Table`` row per extracted table and mirror its JSON to storage.

    A row is matched by ``(document_id, table_index)``; its markdown, CSV, JSON
    and summary are overwritten on every call, while the page fields are only
    set when the row is first created. Tables with non-empty ``json_data`` are
    also written to the derived bucket and registered as an artifact.

    Returns:
        Number of tables processed.
    """
    persisted = 0
    for extracted in tables:
        lookup = select(Table).where(
            Table.document_id == document_id, Table.table_index == extracted.table_index
        )
        row = db.execute(lookup).scalar_one_or_none()
        if row is None:
            row = Table(
                document_id=document_id,
                page_id=page_id_by_number.get(extracted.page_number),
                page_number=extracted.page_number,
                table_index=extracted.table_index,
            )
            db.add(row)

        row.markdown = extracted.markdown or ""
        row.csv_text = extracted.csv_text
        row.json_data = extracted.json_data
        row.summary = _summary(extracted)
        db.flush()

        # Persist json blob to MinIO for large/inspectable copies.
        if extracted.json_data:
            key = key_table_json(document_id, extracted.table_index)
            blob = json.dumps(extracted.json_data, ensure_ascii=False).encode("utf-8")
            storage.put_bytes(
                bucket=storage.derived_bucket,
                key=key,
                data=blob,
                content_type="application/json",
            )
            _ensure_artifact(
                db,
                document_id,
                ArtifactType.TABLE_JSON,
                storage.derived_bucket,
                key,
                extracted.page_number,
            )
        persisted += 1
    return persisted
def _summary(t: ExtractedTable) -> str:
    """Return a one-line description of a table: its index, page and row count.

    The row count is the number of markdown lines that begin with ``|`` minus
    two (header and separator lines), never below zero.
    """
    pipe_lines = [ln for ln in (t.markdown or "").splitlines() if ln[:1] == "|"]
    n_rows = len(pipe_lines) - 2
    if n_rows < 0:
        n_rows = 0
    return f"Table {t.table_index} on page {t.page_number} ({n_rows} rows)."
def _ensure_artifact(db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, page: int | None) -> None:
    """Add a ``DocumentArtifact`` row for ``key`` unless the document already has one.

    The lookup matches on document id and storage key only. ``page`` is stored
    as ``page_number`` on newly created rows.
    """
    same_key = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    if db.execute(same_key).scalar_one_or_none():
        # Already registered; nothing to do.
        return
    db.add(
        DocumentArtifact(
            document_id=document_id,
            artifact_type=artifact_type,
            storage_bucket=bucket,
            storage_key=key,
            page_number=page,
        )
    )