chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
0
app/ingestion/__init__.py
Normal file
0
app/ingestion/__init__.py
Normal file
317
app/ingestion/chunker.py
Normal file
317
app/ingestion/chunker.py
Normal file
@@ -0,0 +1,317 @@
|
||||
"""Structure-aware chunking.
|
||||
|
||||
Rules (per spec):
|
||||
- Chunk by document structure first, fixed-size second.
|
||||
- Hierarchy: title > heading > paragraph > list > table > figure caption.
|
||||
- Target 500-900 tokens (configurable).
|
||||
- Overlap 80-120 tokens for long narrative text only (no separate overlap chunks are emitted at present).
|
||||
- Never split tables - one table = one chunk (or one chunk per row group if huge).
|
||||
- Every chunk carries citation metadata.
|
||||
|
||||
We use a deliberately simple ``len(text.split())`` token estimator. The downstream
|
||||
embedding model has its own tokenizer; this estimator is only a budget proxy.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
from app.config import settings
|
||||
from app.ingestion.docling_extractor import (
|
||||
ExtractedBlock,
|
||||
ExtractedFigure,
|
||||
ExtractedTable,
|
||||
ExtractionResult,
|
||||
)
|
||||
from app.ingestion.normalizer import normalize_block
|
||||
from app.ingestion.quality import compute_quality_flags
|
||||
|
||||
|
||||
@dataclass
class ChunkRecord:
    """A single chunk emitted by the chunker together with its citation metadata."""

    # Running position of the chunk in the emitted list (assigned by the chunker).
    chunk_index: int
    # Page the chunk is attributed to.
    page_number: int
    # Structural kind, e.g. "table", "figure_caption", "paragraph", "heading".
    block_type: str
    # Display text: first element returned by ``normalize_block``.
    text: str
    # Search form: second element returned by ``normalize_block``.
    normalized_text: str
    # Whitespace-word estimate of ``text``; a budget proxy, not a tokenizer count.
    token_count: int

    # Identifier of the source block, when the extractor supplied one.
    block_id: str | None = None
    # Output of ``compute_quality_flags`` for this chunk.
    quality_flags: dict[str, Any] = field(default_factory=dict)
    # Free-form extras (table index, figure index, section heading, ...).
    metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
def _estimate_tokens(text: str) -> int:
    """Approximate the token count of ``text`` as its whitespace-word count (never below 1)."""
    word_count = len(text.split())
    return word_count if word_count > 1 else 1
|
||||
|
||||
|
||||
def chunk_extraction(
    extraction: ExtractionResult,
    *,
    document_ocr_confidence: float | None = None,
) -> list[ChunkRecord]:
    """Convert an extraction result into an ordered list of chunk records.

    Emission order is: every table, then every figure, then narrative blocks
    page by page (ascending page number). ``chunk_index`` is the position of
    the record in the returned list.

    Args:
        extraction: Tables, figures and text blocks produced by the extractor.
        document_ocr_confidence: Document-level OCR confidence, forwarded
            unchanged to ``compute_quality_flags`` for every chunk.

    Returns:
        The chunk records, in emission order.
    """
    target = settings.chunk_target_tokens
    minimum = settings.chunk_min_tokens
    maximum = settings.chunk_max_tokens

    chunks: list[ChunkRecord] = []

    def emit(
        raw_text: str,
        block_type: str,
        page_number: int,
        block_id: str | None,
        metadata: dict[str, Any],
    ) -> None:
        # Single construction path shared by tables, figures and narrative text.
        display, norm = normalize_block(raw_text)
        flags = compute_quality_flags(
            text=display,
            block_type=block_type,
            ocr_confidence=document_ocr_confidence,
        )
        chunks.append(
            ChunkRecord(
                chunk_index=len(chunks),
                page_number=page_number,
                block_type=block_type,
                text=display,
                normalized_text=norm,
                token_count=_estimate_tokens(display),
                block_id=block_id,
                quality_flags=flags,
                metadata=metadata,
            )
        )

    # 1) Tables first - one chunk per table, never split.
    for table in extraction.tables:
        body = (table.markdown or "").strip()
        if not body:
            continue
        summary = _summarize_table(table)
        emit(
            f"{summary}\n\n{body}" if summary else body,
            "table",
            table.page_number,
            table.block_id or f"table:{table.table_index}",
            {"table_index": table.table_index, "summary": summary or ""},
        )

    # 2) Figures - caption (when present) + placeholder description.
    for figure in extraction.figures:
        lines: list[str] = []
        if figure.caption:
            lines.append(f"Caption: {figure.caption}")
        lines.append(f"Figure detected on page {figure.page_number}.")
        emit(
            "\n".join(lines),
            "figure_caption" if figure.caption else "figure_description",
            figure.page_number,
            figure.block_id or f"figure:{figure.figure_index}",
            {"figure_index": figure.figure_index},
        )

    # 3) Narrative blocks grouped per page, packed by structure.
    by_page: dict[int, list[ExtractedBlock]] = {}
    for blk in extraction.blocks:
        by_page.setdefault(blk.page_number, []).append(blk)

    for page_no in sorted(by_page):
        for group in _group_by_section(by_page[page_no]):
            for piece in _pack_group(group, target=target, maximum=maximum, minimum=minimum):
                emit(
                    piece["text"],
                    piece["block_type"],
                    page_no,
                    piece.get("block_id"),
                    {"section_heading": piece.get("section") or ""},
                )
            # NOTE: no overlap chunks are emitted. The previous implementation
            # computed an overlap tail here and then discarded it (to avoid
            # index bloat); that dead computation has been removed. Overlap
            # emission, if wanted, would be added at this point.

    return chunks
|
||||
|
||||
|
||||
# ---------------- Helpers ----------------
|
||||
|
||||
def _group_by_section(blocks: list[ExtractedBlock]) -> list[list[ExtractedBlock]]:
    """Split ``blocks`` into consecutive runs, opening a new run at each title/heading.

    Blocks that precede the first title/heading form their own leading run.
    Input order is preserved; an empty input yields an empty list.
    """
    sections: list[list[ExtractedBlock]] = []
    for block in blocks:
        opens_section = block.block_type in ("title", "heading")
        # The very first block always opens a run, whatever its type.
        if opens_section or not sections:
            sections.append([])
        sections[-1].append(block)
    return sections
|
||||
|
||||
|
||||
def _pack_group(
    group: list[ExtractedBlock], *, target: int, maximum: int, minimum: int
) -> list[dict[str, Any]]:
    """Pack one section's blocks into chunk dictionaries.

    Title/heading blocks are not emitted on their own (unless the group has
    nothing else); their text becomes the ``section`` anchor and, when shorter
    than 200 characters, is prefixed to each packed chunk as ``# heading``.
    Sub-chunks produced by hard-splitting an oversized block do not get that
    prefix.

    Each returned dict has the keys ``text``, ``block_type``, ``block_id`` and
    ``section``.
    """
    if not group:
        return []

    heading_kinds = ("title", "heading")

    section_heading = ""
    for blk in group:
        if blk.block_type not in heading_kinds:
            continue
        if section_heading:
            section_heading = (section_heading + " > " + blk.text).strip(" >")
        else:
            section_heading = blk.text
    body_blocks = [blk for blk in group if blk.block_type not in heading_kinds]

    if not body_blocks:
        # Heading-only group: keep the title searchable as a ``heading`` chunk.
        return [
            {
                "text": section_heading or group[0].text,
                "block_type": "heading",
                "block_id": group[0].block_id,
                "section": section_heading,
            }
        ]

    pieces: list[dict[str, Any]] = []
    pending_texts: list[str] = []
    pending_ids: list[str] = []
    pending_type = "paragraph"
    pending_tokens = 0

    def emit_pending() -> None:
        """Turn the pending texts into one chunk (if non-empty) and reset the accumulator."""
        nonlocal pending_tokens
        joined = "\n\n".join(pending_texts).strip()
        if joined:
            # Short headings are prepended so the chunk carries its own context.
            if section_heading and len(section_heading) < 200:
                joined = f"# {section_heading}\n\n{joined}"
            pieces.append(
                {
                    "text": joined,
                    "block_type": pending_type,
                    "block_id": pending_ids[0] if pending_ids else None,
                    "section": section_heading,
                }
            )
        pending_texts.clear()
        pending_ids.clear()
        pending_tokens = 0

    for blk in body_blocks:
        size = _estimate_tokens(blk.text)

        if size >= maximum:
            # Oversized block: close what is pending, then hard-split the block.
            emit_pending()
            pieces.extend(
                {
                    "text": part,
                    "block_type": blk.block_type,
                    "block_id": blk.block_id,
                    "section": section_heading,
                }
                for part in _split_long_text(blk.text, target=target, maximum=maximum)
            )
            continue

        would_overflow = pending_tokens + size > maximum
        # Only close early when the pending chunk is already big enough to stand alone.
        if would_overflow and pending_tokens >= minimum:
            emit_pending()

        if not pending_texts:
            # The first block of a chunk decides the chunk's block type.
            pending_type = blk.block_type
        pending_texts.append(blk.text)
        if blk.block_id:
            pending_ids.append(blk.block_id)
        pending_tokens += size

        if pending_tokens >= target:
            emit_pending()

    emit_pending()
    return pieces
|
||||
|
||||
|
||||
def _split_long_text(text: str, *, target: int, maximum: int) -> list[str]:
    """Split ``text`` into consecutive word windows without losing any words.

    Windows hold ``target`` words (500 when ``target`` is not positive), capped
    at ``maximum`` when ``maximum`` is positive. If the words left over after a
    window would number fewer than ``target // 4``, they are folded into that
    window instead of forming a tiny trailing piece - but only when the merged
    window still fits within ``maximum``.

    Args:
        text: Text to split; words are whitespace-delimited.
        target: Preferred words per piece.
        maximum: Upper bound on words per piece.

    Returns:
        The pieces in order, each re-joined with single spaces. Empty or
        whitespace-only input returns ``[]``.
    """
    words = text.split()
    if not words:
        return []

    step = target if target > 0 else 500
    if maximum > 0:
        # Never produce a window larger than the hard limit.
        step = min(step, maximum)

    total = len(words)
    pieces: list[str] = []
    start = 0
    while start < total:
        stop = start + step
        leftover = total - stop
        # Fold a small tail into this window. The check is made against the
        # true end of the text so that nothing after the window is dropped.
        if 0 < leftover < target // 4 and total - start <= maximum:
            pieces.append(" ".join(words[start:]))
            break
        pieces.append(" ".join(words[start:stop]))
        start = stop
    return pieces
|
||||
|
||||
|
||||
def _tail_tokens(text: str, n: int) -> str:
    """Return the last ``n`` whitespace-delimited words of ``text``.

    Args:
        text: Source text.
        n: Number of trailing words wanted.

    Returns:
        ``""`` when ``n`` is zero or negative; ``text`` unchanged when it has
        at most ``n`` words; otherwise the last ``n`` words joined by single
        spaces.
    """
    if n <= 0:
        # ``words[-0:]`` would be the whole list, and a negative ``n`` would
        # slice from the front, so handle these explicitly.
        return ""
    words = text.split()
    if len(words) <= n:
        return text
    return " ".join(words[-n:])
|
||||
|
||||
|
||||
def _summarize_table(t: ExtractedTable) -> str:
    """Build a heuristic one-line summary of a markdown table for index recall.

    The first line starting with ``|`` is taken as the header row. Row count is
    the number of ``|`` lines minus two (header and separator), floored at 0.
    Cells are split on unescaped pipes only, so a header cell containing an
    escaped pipe (``\\|``) counts as one column and is shown with a plain ``|``.

    Args:
        t: Table whose ``markdown`` and ``page_number`` are read.

    Returns:
        ``"Table on page N: R rows x C cols. Columns: ..."`` when at least one
        non-empty header cell is found, otherwise ``"Table on page N."``.
    """
    import re

    md = t.markdown or ""
    table_lines = [line for line in md.splitlines() if line.startswith("|")]
    first = table_lines[0] if table_lines else ""

    # Split on pipes that are not preceded by a backslash; drop empty edge cells.
    header_cells = [
        cell.strip().replace("\\|", "|")
        for cell in re.split(r"(?<!\\)\|", first)
        if cell.strip()
    ]
    if not header_cells:
        return f"Table on page {t.page_number}."

    n_cols = len(header_cells)
    n_rows = max(0, len(table_lines) - 2)
    header_preview = ", ".join(header_cells[:6])
    return (
        f"Table on page {t.page_number}: {n_rows} rows x {n_cols} cols. "
        f"Columns: {header_preview}."
    )
|
||||
384
app/ingestion/docling_extractor.py
Normal file
384
app/ingestion/docling_extractor.py
Normal file
@@ -0,0 +1,384 @@
|
||||
"""Docling structured extraction.
|
||||
|
||||
Docling produces a hierarchical document model with reading order, layout, tables
|
||||
and figures. We export both Markdown and a JSON representation, then walk the
|
||||
JSON to emit normalized blocks (title, heading, paragraph, list, table caption,
|
||||
figure caption) for downstream chunking.
|
||||
|
||||
The extractor is intentionally defensive: Docling's exact Python API has
|
||||
shifted across releases. We probe for the safest exporter methods and fall
|
||||
back to ``str(document)`` only as a last resort.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from app.config import settings
|
||||
from app.logging_config import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ExtractedBlock:
    """One text item from the extractor, reduced to the fields the chunker uses."""

    # Page the block was attributed to.
    page_number: int
    # Normalized block kind (a value of ``_DOCLING_LABEL_TO_BLOCK`` or "paragraph").
    block_type: str
    # Stripped, non-empty text of the item.
    text: str

    # Source reference (``self_ref`` or ``id`` from the payload), if any.
    block_id: str | None = None
    # Extra details; the extractor stores the original label under "label".
    extra: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
class ExtractedTable:
    """One table from the extractor, carried in several serializations."""

    # Page the table was attributed to.
    page_number: int
    # Position of the table in the payload's table list.
    table_index: int
    # Markdown rendering; may be an empty string when no rendering was possible.
    markdown: str

    # CSV rendering, when a row grid was available.
    csv_text: str | None = None
    # The raw table dictionary from the payload.
    json_data: dict[str, Any] | None = None
    # Source reference (``self_ref`` or ``id`` from the payload), if any.
    block_id: str | None = None
|
||||
|
||||
|
||||
@dataclass
class ExtractedFigure:
    """One figure from the extractor, with an optional caption and image crop."""

    # Page the figure was attributed to.
    page_number: int
    # Position of the figure in the payload's picture/figure list.
    figure_index: int
    # Caption text, or None when the figure has none.
    caption: str | None

    # Source reference (``self_ref`` or ``id`` from the payload), if any.
    block_id: str | None = None
    # Encoded image data for the crop, when one was captured.
    image_bytes: bytes | None = None
    # File extension / image subtype that goes with ``image_bytes``.
    image_ext: str = "png"
|
||||
|
||||
|
||||
@dataclass
class ExtractedPage:
    """Per-page roll-up of the extracted content."""

    # Page number this roll-up describes.
    page_number: int
    # Text of the page's blocks, joined by blank lines.
    text: str

    # Whether any extracted table / figure was attributed to this page.
    has_tables: bool = False
    has_figures: bool = False
    # Not set by the visible page walker; stays at its default there.
    has_handwriting: bool = False
    # Page-level OCR confidence, when known.
    ocr_confidence: float | None = None
|
||||
|
||||
|
||||
@dataclass
class ExtractionResult:
    """Everything one extraction run produced for a document."""

    # Whole-document markdown export.
    markdown: str
    # Whole-document dictionary export that the walkers read from.
    json_payload: dict[str, Any]

    # Normalized items derived from the payload.
    blocks: list[ExtractedBlock]
    tables: list[ExtractedTable]
    figures: list[ExtractedFigure]
    pages: list[ExtractedPage]
|
||||
|
||||
|
||||
def extract(pdf_path: Path) -> ExtractionResult:
    """Convert ``pdf_path`` with Docling and return the normalized extraction.

    Docling is imported inside the function. Table-structure recognition is
    always switched on; Docling's own OCR follows ``settings.docling_ocr_enabled``.
    """
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    options = PdfPipelineOptions()
    # OCRmyPDF does the heavy OCR upstream; Docling OCR is opt-in.
    options.do_ocr = settings.docling_ocr_enabled
    options.do_table_structure = True

    # The next two switches are best-effort: failures are deliberately ignored.
    try:
        options.table_structure_options.do_cell_matching = True
    except Exception:  # noqa: BLE001 - older docling versions lack this
        pass
    try:
        options.generate_page_images = True
    except Exception:  # noqa: BLE001
        pass

    pdf_format = PdfFormatOption(pipeline_options=options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    logger.info("docling.start", input=str(pdf_path))
    document = converter.convert(str(pdf_path)).document

    markdown = _safe_export_markdown(document)
    payload = _safe_export_dict(document)

    # Every walker reads from the dictionary export; tables/figures also get the document.
    blocks = _walk_blocks(payload)
    tables = _walk_tables(document, payload)
    figures = _walk_figures(document, payload)
    pages = _walk_pages(payload, blocks, tables, figures)

    logger.info(
        "docling.done",
        pages=len(pages),
        blocks=len(blocks),
        tables=len(tables),
        figures=len(figures),
    )
    return ExtractionResult(
        markdown=markdown,
        json_payload=payload,
        blocks=blocks,
        tables=tables,
        figures=figures,
        pages=pages,
    )
|
||||
|
||||
|
||||
# ---------------- Internal helpers ----------------
|
||||
|
||||
def _safe_export_markdown(doc: Any) -> str:
    """Export ``doc`` as markdown using the first exporter method that works.

    ``export_to_markdown`` is tried before ``to_markdown``. An exporter that is
    missing, not callable, or raises is skipped. If none succeeds the result is
    ``str(doc)``.
    """
    exporters = (
        getattr(doc, method_name, None)
        for method_name in ("export_to_markdown", "to_markdown")
    )
    for exporter in exporters:
        if not callable(exporter):
            continue
        try:
            return exporter()
        except Exception:  # noqa: BLE001
            # Best-effort probing: move on to the next candidate.
            continue
    return str(doc)
|
||||
|
||||
|
||||
def _safe_export_dict(doc: Any) -> dict[str, Any]:
    """Export ``doc`` as a dictionary using the first exporter that works.

    ``export_to_dict``, ``model_dump`` and ``dict`` are tried in that order; an
    exporter that is missing, raises, or returns something other than a dict is
    skipped. As a last resort ``model_dump_json`` is parsed as JSON.

    Returns:
        The exported dictionary, or ``{}`` when nothing usable was produced.
        The result is always a dict, including when the JSON fallback decodes
        to another JSON type such as a list.
    """
    for attr in ("export_to_dict", "model_dump", "dict"):
        exporter = getattr(doc, attr, None)
        if not callable(exporter):
            continue
        try:
            data = exporter()
        except Exception:  # noqa: BLE001
            continue
        if isinstance(data, dict):
            return data

    # Last resort: serialize via JSON round-trip.
    dump_json = getattr(doc, "model_dump_json", None)
    if callable(dump_json):
        try:
            data = json.loads(dump_json())
        except Exception:  # noqa: BLE001
            return {}
        # Callers use ``.get`` on the result, so only a JSON object is acceptable.
        if isinstance(data, dict):
            return data
    return {}
|
||||
|
||||
|
||||
# Maps a lower-cased item label from the Docling payload to the block type used
# by the chunker. Both underscore and hyphen spellings are listed for labels
# that have them. Labels missing from this table are treated as "paragraph" by
# ``_walk_blocks``.
_DOCLING_LABEL_TO_BLOCK = {
    "title": "title",
    "section_header": "heading",
    "section-header": "heading",
    "subtitle": "heading",
    "page_header": "heading",
    "header": "heading",
    "list_item": "list",
    "list-item": "list",
    "list": "list",
    "paragraph": "paragraph",
    "text": "paragraph",
    "caption": "figure_caption",
    "figure": "figure_caption",
    "table": "table",
    "footnote": "paragraph",
}
|
||||
|
||||
|
||||
def _walk_blocks(payload: dict[str, Any]) -> list[ExtractedBlock]:
    """Flatten the payload's text items into ``ExtractedBlock`` records, in payload order.

    Items are looked up under ``texts``, then ``text_items``, then
    ``body.text_items``. Non-dict items and items with empty text are dropped.
    """
    items = (
        payload.get("texts")
        or payload.get("text_items")
        or payload.get("body", {}).get("text_items", [])
        or []
    )
    if not isinstance(items, list):
        return []

    collected: list[ExtractedBlock] = []
    for entry in items:
        if not isinstance(entry, dict):
            continue
        raw_label = entry.get("label") or entry.get("category") or "paragraph"
        label = raw_label.lower()
        body = (entry.get("text") or "").strip()
        if not body:
            continue
        collected.append(
            ExtractedBlock(
                page_number=_page_of(entry),
                # Unknown labels are treated as plain paragraphs.
                block_type=_DOCLING_LABEL_TO_BLOCK.get(label, "paragraph"),
                text=body,
                block_id=entry.get("self_ref") or entry.get("id"),
                extra={"label": label},
            )
        )
    return collected
|
||||
|
||||
|
||||
def _walk_tables(doc: Any, payload: dict[str, Any]) -> list[ExtractedTable]:
    """Build an ``ExtractedTable`` for every dict entry under ``payload["tables"]``.

    ``table_index`` is the entry's position in the raw list, so skipped
    (non-dict) entries leave gaps rather than shifting later indices.
    """
    raw_tables = payload.get("tables") or []
    return [
        ExtractedTable(
            page_number=_page_of(entry),
            table_index=position,
            markdown=_table_markdown(doc, entry, position),
            csv_text=_table_csv(entry),
            json_data=entry,
            block_id=entry.get("self_ref") or entry.get("id"),
        )
        for position, entry in enumerate(raw_tables)
        if isinstance(entry, dict)
    ]
|
||||
|
||||
|
||||
def _walk_figures(doc: Any, payload: dict[str, Any]) -> list[ExtractedFigure]:
    """Build an ``ExtractedFigure`` for every dict entry under ``pictures`` (or ``figures``).

    ``figure_index`` is the entry's position in the raw list. A blank or
    missing caption is stored as ``None``. ``doc`` is accepted but not used.
    """
    raw_figures = payload.get("pictures") or payload.get("figures") or []
    return [
        ExtractedFigure(
            page_number=_page_of(entry),
            figure_index=position,
            caption=(entry.get("caption") or "").strip() or None,
            block_id=entry.get("self_ref") or entry.get("id"),
        )
        for position, entry in enumerate(raw_figures)
        if isinstance(entry, dict)
    ]
|
||||
|
||||
|
||||
def _walk_pages(
    payload: dict[str, Any],
    blocks: list[ExtractedBlock],
    tables: list[ExtractedTable],
    figures: list[ExtractedFigure],
) -> list[ExtractedPage]:
    """Produce one ``ExtractedPage`` per known page number, in ascending order.

    Page numbers come from ``payload["pages"]`` (dict keys, or list entries
    carrying an integer ``page_no``/``page``/``number``) plus every page
    referenced by a block, table or figure. Page 0 is dropped; if nothing is
    left, a single page 1 is reported.
    """
    declared = payload.get("pages") or {}
    numbers: set[int] = set()

    if isinstance(declared, dict):
        for key in declared:
            try:
                numbers.add(int(key))
            except (ValueError, TypeError):
                continue
    elif isinstance(declared, list):
        for entry in declared:
            if not isinstance(entry, dict):
                continue
            candidate = entry.get("page_no") or entry.get("page") or entry.get("number")
            if isinstance(candidate, int):
                numbers.add(candidate)

    numbers.update(b.page_number for b in blocks)
    numbers.update(t.page_number for t in tables)
    numbers.update(f.page_number for f in figures)
    numbers.discard(0)
    if not numbers:
        numbers = {1}

    texts_by_page: dict[int, list[str]] = {pn: [] for pn in numbers}
    for b in blocks:
        texts_by_page.setdefault(b.page_number, []).append(b.text)

    pages_with_tables = {t.page_number for t in tables}
    pages_with_figures = {f.page_number for f in figures}

    pages: list[ExtractedPage] = []
    for pn in sorted(numbers):
        pages.append(
            ExtractedPage(
                page_number=pn,
                text="\n\n".join(texts_by_page.get(pn, [])),
                has_tables=pn in pages_with_tables,
                has_figures=pn in pages_with_figures,
            )
        )
    return pages
|
||||
|
||||
|
||||
def _page_of(item: dict[str, Any]) -> int:
    """Return the page number recorded for ``item``, defaulting to 1.

    The first provenance entry (``prov`` or ``provenance``) is consulted
    before the item itself. In both places the keys ``page_no``, ``page`` and
    ``page_number`` are tried in that order, and only an integer is accepted.
    """

    def lookup(mapping: dict[str, Any]) -> int | None:
        value = mapping.get("page_no") or mapping.get("page") or mapping.get("page_number")
        return value if isinstance(value, int) else None

    provenance = item.get("prov") or item.get("provenance")
    if isinstance(provenance, list) and provenance and isinstance(provenance[0], dict):
        from_prov = lookup(provenance[0])
        if from_prov is not None:
            return from_prov

    from_item = lookup(item)
    return 1 if from_item is None else from_item
|
||||
|
||||
|
||||
def _table_markdown(doc: Any, raw: dict[str, Any], idx: int) -> str:
    """Render one table as markdown, preferring the document's own exporter.

    Order of attempts:

    1. ``doc.export_table_to_markdown(idx)`` when the document has such a
       method (any exception is swallowed and the next step is tried).
    2. A list-of-rows grid found under ``data``, ``table_cells`` or ``grid`` in
       ``raw``, or - when ``raw["data"]`` is itself a dictionary - under
       ``grid`` / ``table_cells`` inside it.
    3. A flat list of cell dictionaries under ``table_cells`` (top level or
       inside a dictionary-shaped ``data``).

    Args:
        doc: The converted document object.
        raw: The table's dictionary from the exported payload.
        idx: Position of the table in the payload's table list.

    Returns:
        The markdown table, or ``""`` when no usable data is found.
    """
    # Try Docling's own export first (per-table).
    try:
        export = getattr(doc, "export_table_to_markdown", None)
        if callable(export):
            return export(idx)
    except Exception:  # noqa: BLE001
        pass

    data = raw.get("data")
    # A dictionary-shaped ``data`` is truthy, so it used to short-circuit the
    # lookup below and hide the grid / cells nested inside it.
    nested = data if isinstance(data, dict) else {}

    grid_candidates = (
        data,
        raw.get("table_cells"),
        raw.get("grid"),
        nested.get("grid"),
        nested.get("table_cells"),
    )
    for candidate in grid_candidates:
        if isinstance(candidate, list) and candidate and isinstance(candidate[0], list):
            return _grid_to_markdown(candidate)

    for cells in (raw.get("table_cells"), nested.get("table_cells")):
        if isinstance(cells, list):
            return _cells_to_markdown(cells)
    return ""
|
||||
|
||||
|
||||
def _grid_to_markdown(grid: list[list[Any]]) -> str:
    """Render a list of rows as a pipe-delimited markdown table.

    The first row is the header. Body rows are padded with empty cells or
    truncated so that every row has as many columns as the header.

    Cell rendering:

    * a dict cell contributes its ``text`` (or else ``value``) entry;
    * ``None`` renders as an empty cell rather than the string "None";
    * line breaks inside a cell are collapsed to single spaces so each table
      row stays on one line;
    * literal pipes are escaped as ``\\|``.

    Args:
        grid: Rows of cells; cells may be dicts or plain values.

    Returns:
        The markdown table, or ``""`` for an empty grid.
    """
    if not grid:
        return ""

    def _cell(c: Any) -> str:
        if isinstance(c, dict):
            value = c.get("text") or c.get("value") or ""
        else:
            value = "" if c is None else c
        # One physical line per row: fold multi-line cell text into one line.
        lines = (part.strip() for part in str(value).splitlines())
        flat = " ".join(part for part in lines if part)
        return flat.replace("|", "\\|")

    header = grid[0]
    cols = len(header)
    out = [
        "| " + " | ".join(_cell(c) for c in header) + " |",
        "| " + " | ".join(["---"] * cols) + " |",
    ]
    for row in grid[1:]:
        cells = [_cell(c) for c in row]
        if len(cells) < cols:
            cells += [""] * (cols - len(cells))
        out.append("| " + " | ".join(cells[:cols]) + " |")
    return "\n".join(out)
|
||||
|
||||
|
||||
def _cells_to_markdown(cells: list[Any]) -> str:
    """Arrange a flat list of positioned cell dicts into a grid and render it as markdown.

    A cell's row comes from ``start_row_offset_idx`` (else ``row``) and its
    column from ``start_col_offset_idx`` (else ``col``); a missing or falsy
    position counts as 0. Non-dict entries are ignored. Positions with no
    cell become empty strings, and rows are emitted in ascending row order.
    """
    by_row: dict[int, dict[int, str]] = {}
    for cell in cells:
        if not isinstance(cell, dict):
            continue
        row_idx = cell.get("start_row_offset_idx", cell.get("row", 0)) or 0
        col_idx = cell.get("start_col_offset_idx", cell.get("col", 0)) or 0
        content = (cell.get("text") or "").replace("|", "\\|").strip()
        by_row.setdefault(row_idx, {})[col_idx] = content

    if not by_row:
        return ""

    widest = max((max(columns) for columns in by_row.values()), default=0)
    grid = [
        [by_row[row_idx].get(col_idx, "") for col_idx in range(widest + 1)]
        for row_idx in sorted(by_row)
    ]
    return _grid_to_markdown(grid)
|
||||
|
||||
|
||||
def _table_csv(raw: dict[str, Any]) -> str | None:
    """Render a table's row grid as CSV text.

    The grid (a non-empty list whose first element is a list) is looked up
    under ``data`` and ``grid`` in ``raw``, and also under ``grid`` inside
    ``raw["data"]`` when that value is a dictionary.

    Args:
        raw: The table's dictionary from the exported payload.

    Returns:
        CSV text as written by ``csv.writer`` with its default dialect, or
        ``None`` when no grid is found. A dict cell contributes its ``text``
        entry; a falsy cell value is written as an empty field.
    """
    import csv
    import io

    data = raw.get("data")
    # A dictionary-shaped ``data`` is truthy, so it used to short-circuit the
    # ``or`` chain and hide a grid stored inside it.
    nested_grid = data.get("grid") if isinstance(data, dict) else None

    grid = None
    for candidate in (data, raw.get("grid"), nested_grid):
        if isinstance(candidate, list) and candidate and isinstance(candidate[0], list):
            grid = candidate
            break
    if grid is None:
        return None

    buf = io.StringIO()
    writer = csv.writer(buf)
    for row in grid:
        writer.writerow(
            [(c.get("text") if isinstance(c, dict) else c) or "" for c in row]
        )
    return buf.getvalue()
|
||||
78
app/ingestion/figure_processor.py
Normal file
78
app/ingestion/figure_processor.py
Normal file
@@ -0,0 +1,78 @@
|
||||
"""Persists Docling figures to PostgreSQL + MinIO (caption + optional crop)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
|
||||
from sqlalchemy import select
|
||||
|
||||
from app.db.models import ArtifactType, DocumentArtifact, Figure
|
||||
from app.ingestion.docling_extractor import ExtractedFigure
|
||||
from app.logging_config import get_logger
|
||||
from app.storage.local_paths import key_figure_crop
|
||||
from app.storage.minio_client import MinioStorage
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
def persist_figures(
    db,
    storage: MinioStorage,
    document_id: uuid.UUID,
    figures: list[ExtractedFigure],
    page_id_by_number: dict[int, uuid.UUID],
) -> int:
    """Create or refresh a ``Figure`` row for each extracted figure.

    A row is looked up by ``(document_id, figure_index)`` and created when
    missing. Caption and description are always rewritten. When the figure
    carries image bytes they are uploaded to the derived bucket and recorded
    both on the row and as a ``FIGURE_CROP`` artifact.

    Returns:
        The number of figures processed.
    """
    processed = 0
    for fig in figures:
        lookup = select(Figure).where(
            Figure.document_id == document_id,
            Figure.figure_index == fig.figure_index,
        )
        row = db.execute(lookup).scalar_one_or_none()
        if row is None:
            # Page linkage is only set when the row is first created.
            row = Figure(
                document_id=document_id,
                page_id=page_id_by_number.get(fig.page_number),
                page_number=fig.page_number,
                figure_index=fig.figure_index,
            )
            db.add(row)

        row.caption = fig.caption
        if fig.caption:
            row.description = f"Figure on page {fig.page_number}. Caption: {fig.caption}"
        else:
            row.description = f"Figure detected on page {fig.page_number}."

        if fig.image_bytes:
            bucket = storage.derived_bucket
            crop_key = key_figure_crop(document_id, fig.page_number, fig.figure_index)
            storage.put_bytes(
                bucket=bucket,
                key=crop_key,
                data=fig.image_bytes,
                content_type=f"image/{fig.image_ext}",
            )
            row.storage_bucket = storage.derived_bucket
            row.storage_key = crop_key
            _ensure_artifact(
                db,
                document_id,
                ArtifactType.FIGURE_CROP,
                storage.derived_bucket,
                crop_key,
                fig.page_number,
            )

        processed += 1
    return processed
|
||||
|
||||
|
||||
def _ensure_artifact(db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, page: int | None) -> None:
    """Add a ``DocumentArtifact`` row for ``key`` unless the document already has one.

    Existence is decided by ``(document_id, storage_key)`` only; the artifact
    type and bucket are not part of the lookup.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    already_recorded = db.execute(lookup).scalar_one_or_none()
    if not already_recorded:
        db.add(
            DocumentArtifact(
                document_id=document_id,
                artifact_type=artifact_type,
                storage_bucket=bucket,
                storage_key=key,
                page_number=page,
            )
        )
|
||||
12
app/ingestion/normalizer.py
Normal file
12
app/ingestion/normalizer.py
Normal file
@@ -0,0 +1,12 @@
|
||||
"""Block-level normalization wrappers around utils.text_cleaning."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.utils.text_cleaning import clean_ocr_text, normalize_for_search
|
||||
|
||||
|
||||
def normalize_block(text: str) -> tuple[str, str]:
    """Return ``(display_text, normalized_text)`` for one block.

    The display text is ``clean_ocr_text(text)``; the normalized text is
    ``normalize_for_search`` applied to that display text.
    """
    cleaned = clean_ocr_text(text)
    return cleaned, normalize_for_search(cleaned)
|
||||
87
app/ingestion/ocr.py
Normal file
87
app/ingestion/ocr.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""OCRmyPDF integration with Tesseract.
|
||||
|
||||
We treat OCR as best-effort: if the input PDF already has a text layer (or OCR is
|
||||
disabled by config), we skip OCR and use the original PDF. On failure, the
|
||||
caller is expected to mark the document ``OCR_FAILED`` and continue without it.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import ocrmypdf
|
||||
|
||||
from app.config import settings
|
||||
from app.logging_config import get_logger
|
||||
from app.utils.pdf import has_searchable_text
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
class OcrResult:
    """Outcome of one OCR attempt."""

    # PDF to use downstream (the OCR output, or a copy of the input when skipped).
    output_path: Path
    # True when OCR was not run.
    skipped: bool
    # Short machine-readable reason, e.g. "ocr_completed" or "already_searchable".
    reason: str
    # Language string that was (or would have been) passed to the OCR engine.
    languages: str
|
||||
|
||||
|
||||
def run_ocr(input_pdf: Path, output_pdf: Path, languages: str | None = None) -> OcrResult:
    """Run OCRmyPDF on ``input_pdf``, writing the result to ``output_pdf``.

    OCR is skipped - and the input copied to ``output_pdf`` instead - when:

    - ``settings.ocr_enabled`` is false;
    - the input already has searchable text;
    - OCRmyPDF reports prior OCR or a digitally signed PDF.

    Encrypted-PDF and missing-dependency errors are logged and re-raised; any
    other exception propagates unchanged (the caller handles the status update).
    """
    langs = languages or settings.ocr_languages

    if not settings.ocr_enabled:
        return _skip(input_pdf, output_pdf, langs, "ocr_disabled")
    if has_searchable_text(input_pdf):
        return _skip(input_pdf, output_pdf, langs, "already_searchable")

    output_pdf.parent.mkdir(parents=True, exist_ok=True)
    logger.info("ocr.start", input=str(input_pdf), output=str(output_pdf), languages=langs)

    ocr_options = {
        "input_file": str(input_pdf),
        "output_file": str(output_pdf),
        "language": langs,
        "skip_text": False,
        "redo_ocr": False,
        "force_ocr": False,
        "deskew": settings.ocr_deskew,
        "clean": settings.ocr_clean,
        "optimize": settings.ocr_optimize,
        "progress_bar": False,
        "jobs": 1,
        "output_type": "pdf",
        # NOTE(review): ``skip_big`` looks like a page-size (megapixel) limit in
        # OCRmyPDF, not a switch for pages that already contain text - confirm
        # against the OCRmyPDF documentation.
        "skip_big": 200.0,
    }

    try:
        ocrmypdf.ocr(**ocr_options)
    except ocrmypdf.exceptions.PriorOcrFoundError:
        logger.info("ocr.skip.prior_ocr", input=str(input_pdf))
        return _skip(input_pdf, output_pdf, langs, "prior_ocr_found")
    except ocrmypdf.exceptions.DigitalSignatureError:
        logger.warning("ocr.skip.signed_pdf", input=str(input_pdf))
        return _skip(input_pdf, output_pdf, langs, "digitally_signed")
    except ocrmypdf.exceptions.EncryptedPdfError as exc:
        logger.warning("ocr.encrypted", input=str(input_pdf), error=str(exc))
        raise
    except ocrmypdf.exceptions.MissingDependencyError as exc:
        logger.error("ocr.missing_dependency", error=str(exc))
        raise

    logger.info("ocr.done", output=str(output_pdf))
    return OcrResult(
        output_path=output_pdf,
        skipped=False,
        reason="ocr_completed",
        languages=langs,
    )
|
||||
|
||||
|
||||
def _skip(input_pdf: Path, output_pdf: Path, langs: str, reason: str) -> OcrResult:
    """Copy the input to ``output_pdf`` (unless it is already that file) and report a skip.

    The copy is omitted only when ``output_pdf`` exists and resolves to the
    same path as ``input_pdf``.
    """
    output_pdf.parent.mkdir(parents=True, exist_ok=True)
    same_file = output_pdf.exists() and output_pdf.resolve() == input_pdf.resolve()
    if not same_file:
        payload = input_pdf.read_bytes()
        output_pdf.write_bytes(payload)
    return OcrResult(
        output_path=output_pdf,
        skipped=True,
        reason=reason,
        languages=langs,
    )
|
||||
384
app/ingestion/pipeline.py
Normal file
384
app/ingestion/pipeline.py
Normal file
@@ -0,0 +1,384 @@
|
||||
"""Per-document end-to-end pipeline: OCR -> Docling -> chunk -> persist -> index.
|
||||
|
||||
Called by the Celery worker. Idempotent: re-running on the same document deletes
|
||||
existing chunks for that document and re-creates them, then re-indexes in
|
||||
OpenSearch and Qdrant.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy import delete, select
|
||||
|
||||
from app.config import settings
|
||||
from app.db.models import (
|
||||
ArtifactType,
|
||||
Chunk,
|
||||
Document,
|
||||
DocumentArtifact,
|
||||
DocumentStatus,
|
||||
Page,
|
||||
ProcessingEvent,
|
||||
)
|
||||
from app.db.session import session_scope
|
||||
from app.indexing import opensearch_client, qdrant_client
|
||||
from app.indexing.embeddings import get_embedder
|
||||
from app.ingestion.chunker import ChunkRecord, chunk_extraction
|
||||
from app.ingestion.docling_extractor import ExtractionResult, extract
|
||||
from app.ingestion.figure_processor import persist_figures
|
||||
from app.ingestion.ocr import run_ocr
|
||||
from app.ingestion.table_processor import persist_tables
|
||||
from app.logging_config import get_logger
|
||||
from app.storage.local_paths import (
|
||||
key_docling_json,
|
||||
key_markdown,
|
||||
key_ocr_pdf,
|
||||
work_dir_for,
|
||||
)
|
||||
from app.storage.minio_client import get_storage
|
||||
from app.utils.language import detect_language
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
def process_document_id(document_id: uuid.UUID, run_id: uuid.UUID | None = None) -> dict[str, Any]:
    """Run the full ingestion pipeline for a single document.

    Top-level entry called by the Celery task. Stages run in order: obtain a
    local copy of the PDF, OCR, Docling extraction, persistence of pages /
    tables / figures / chunks, then indexing in OpenSearch and Qdrant.

    Failures raised by the OCR, extraction and indexing stages are caught,
    logged and recorded through ``_fail``. Storage uploads, chunking and the
    persistence transaction are NOT wrapped, so an exception raised there
    propagates to the caller.

    Args:
        document_id: Primary key of the ``Document`` row to process.
        run_id: Optional run identifier attached to every ``ProcessingEvent``.

    Returns:
        ``{"status": "missing"}`` when the document row does not exist,
        ``{"status": <stage>, "error": <message>}`` when a guarded stage fails,
        or ``{"status": DocumentStatus.INDEXING_COMPLETED, "chunks": <count>}``
        on success.
    """
    storage = get_storage()
    storage.ensure_buckets()

    with session_scope() as db:
        doc = db.get(Document, document_id)
        if doc is None:
            logger.warning("pipeline.document_missing", document_id=str(document_id))
            return {"status": "missing"}

        source_path = Path(doc.source_path)
        sha = doc.sha256
        original_artifact = db.execute(
            select(DocumentArtifact).where(
                DocumentArtifact.document_id == doc.id,
                DocumentArtifact.artifact_type == ArtifactType.ORIGINAL_PDF,
            )
        ).scalar_one_or_none()
        # Copy the storage coordinates into plain values while the session is
        # open, so the download below never touches an ORM instance.
        original_location = (
            (original_artifact.storage_bucket, original_artifact.storage_key)
            if original_artifact
            else None
        )

    # Materialise the PDF in the work dir: prefer the source path on disk,
    # fall back to the stored original.
    work_dir = work_dir_for(document_id)
    local_pdf = work_dir / f"{sha}.pdf"
    if not local_pdf.exists():
        if source_path.exists():
            local_pdf.write_bytes(source_path.read_bytes())
        elif original_location:
            original_bucket, original_key = original_location
            storage.get_to_path(original_bucket, original_key, local_pdf)
        else:
            # Use the status constant like every other failure path does.
            return _fail(
                document_id,
                run_id,
                DocumentStatus.OCR_FAILED,
                "Original PDF not available locally or in MinIO",
            )

    # ---------------- OCR ----------------
    ocr_pdf = work_dir / "ocr.pdf"
    try:
        _emit_event(document_id, run_id, DocumentStatus.OCR_STARTED, "OCR started")
        ocr_result = run_ocr(local_pdf, ocr_pdf, languages=settings.ocr_languages)
    except Exception as exc:  # noqa: BLE001
        logger.exception("pipeline.ocr_failed", document_id=str(document_id))
        return _fail(document_id, run_id, DocumentStatus.OCR_FAILED, f"OCR failed: {exc}")

    # Upload OCR PDF (even if we 'skipped' it - OCR PDF is the canonical input to Docling).
    ocr_key = key_ocr_pdf(document_id)
    storage.put_file(
        bucket=storage.derived_bucket,
        key=ocr_key,
        path=ocr_result.output_path,
        content_type="application/pdf",
    )
    with session_scope() as db:
        _ensure_artifact(db, document_id, ArtifactType.OCR_PDF, storage.derived_bucket, ocr_key)
        doc = db.get(Document, document_id)
        if doc is not None:
            doc.status = DocumentStatus.OCR_COMPLETED
        db.add(
            ProcessingEvent(
                run_id=run_id,
                document_id=document_id,
                stage=DocumentStatus.OCR_COMPLETED,
                level="INFO",
                message=f"OCR finished ({ocr_result.reason})",
                data={"skipped": ocr_result.skipped, "languages": ocr_result.languages},
            )
        )

    # ---------------- Docling ----------------
    try:
        _emit_event(document_id, run_id, DocumentStatus.EXTRACTION_STARTED, "Docling extraction started")
        extraction = extract(ocr_result.output_path)
    except Exception as exc:  # noqa: BLE001
        logger.exception("pipeline.docling_failed", document_id=str(document_id))
        return _fail(document_id, run_id, DocumentStatus.EXTRACTION_FAILED, f"Docling failed: {exc}")

    # Persist Markdown + JSON to MinIO.
    md_key = key_markdown(document_id)
    json_key = key_docling_json(document_id)
    storage.put_bytes(
        bucket=storage.derived_bucket,
        key=md_key,
        data=extraction.markdown.encode("utf-8"),
        content_type="text/markdown",
    )
    storage.put_bytes(
        bucket=storage.derived_bucket,
        key=json_key,
        data=json.dumps(extraction.json_payload, ensure_ascii=False).encode("utf-8"),
        content_type="application/json",
    )

    # ---------------- Persist pages, chunks, tables, figures ----------------
    chunk_records = chunk_extraction(extraction)
    # Language detection only samples the text of the first three pages.
    sample_text = "\n".join(p.text for p in extraction.pages[:3] if p.text)
    lang = detect_language(sample_text)

    with session_scope() as db:
        _ensure_artifact(db, document_id, ArtifactType.MARKDOWN, storage.derived_bucket, md_key)
        _ensure_artifact(db, document_id, ArtifactType.DOCLING_JSON, storage.derived_bucket, json_key)

        doc = db.get(Document, document_id)
        if doc is None:
            return {"status": "missing"}
        doc.status = DocumentStatus.EXTRACTION_COMPLETED
        # Never overwrite a language hint that is already set.
        if lang and not doc.language_hint:
            doc.language_hint = lang

        page_id_by_number = _upsert_pages(db, document_id, extraction)
        persist_tables(db, storage, document_id, extraction.tables, page_id_by_number)
        persist_figures(db, storage, document_id, extraction.figures, page_id_by_number)

        # Replace chunks idempotently: drop all and re-insert.
        db.execute(delete(Chunk).where(Chunk.document_id == document_id))
        for cr in chunk_records:
            db.add(_to_chunk_row(document_id, page_id_by_number, cr))

        doc.status = DocumentStatus.CHUNKING_COMPLETED
        db.add(
            ProcessingEvent(
                run_id=run_id,
                document_id=document_id,
                stage=DocumentStatus.CHUNKING_COMPLETED,
                level="INFO",
                message="Chunking complete",
                data={"chunks": len(chunk_records)},
            )
        )

    # ---------------- Indexing (OpenSearch + Qdrant) ----------------
    try:
        opensearch_client.ensure_index()
        qdrant_client.ensure_collection()
        # Remove this document's previous entries from both indexes first.
        opensearch_client.delete_by_document(str(document_id))
        qdrant_client.delete_by_document(str(document_id))

        os_docs, qdrant_points = _build_index_payloads(document_id, chunk_records, extraction, lang)
        if os_docs:
            opensearch_client.index_chunks(os_docs)
        if qdrant_points:
            embedder = get_embedder()
            texts_to_embed = [text for _, text, _ in qdrant_points]
            vectors = embedder.encode(texts_to_embed)
            # strict=True raises if the embedder returns a different number of vectors.
            triples = [
                (chunk_id, vec, payload)
                for (chunk_id, _text, payload), vec in zip(qdrant_points, vectors, strict=True)
            ]
            qdrant_client.upsert_chunks(triples)
    except Exception as exc:  # noqa: BLE001
        logger.exception("pipeline.indexing_failed", document_id=str(document_id))
        return _fail(document_id, run_id, DocumentStatus.FAILED, f"Indexing failed: {exc}")

    with session_scope() as db:
        doc = db.get(Document, document_id)
        if doc is not None:
            doc.status = DocumentStatus.INDEXING_COMPLETED
            # Clear any error left over from a previous failed run.
            doc.error_message = None
        db.add(
            ProcessingEvent(
                run_id=run_id,
                document_id=document_id,
                stage=DocumentStatus.INDEXING_COMPLETED,
                level="INFO",
                message="Indexing complete",
                data={"chunks": len(chunk_records)},
            )
        )

    return {"status": DocumentStatus.INDEXING_COMPLETED, "chunks": len(chunk_records)}
|
||||
|
||||
|
||||
# ---------------- helpers ----------------
|
||||
|
||||
def _to_chunk_row(
    document_id: uuid.UUID, page_id_by_number: dict[int, uuid.UUID], cr: ChunkRecord
) -> Chunk:
    """Build an unsaved ``Chunk`` row from a ``ChunkRecord``.

    The page id is looked up by the record's page number and is ``None`` when
    that page is not in ``page_id_by_number``. ``ocr_confidence`` is always
    stored as ``None`` here.
    """
    page_number = cr.page_number
    columns = {
        "document_id": document_id,
        "page_id": page_id_by_number.get(page_number),
        "page_number": page_number,
        "block_id": cr.block_id,
        "chunk_index": cr.chunk_index,
        "block_type": cr.block_type,
        "text": cr.text,
        "normalized_text": cr.normalized_text,
        "token_count": cr.token_count,
        "ocr_confidence": None,
        "quality_flags": cr.quality_flags,
        "chunk_metadata": cr.metadata,
    }
    return Chunk(**columns)
|
||||
|
||||
|
||||
def _upsert_pages(db, document_id: uuid.UUID, extraction: ExtractionResult) -> dict[int, uuid.UUID]:
    """Insert or refresh one ``Page`` row per extracted page.

    Existing rows are matched by page number and updated in place; missing
    rows are created. Rows for pages that are no longer present in
    ``extraction`` are left untouched.

    Args:
        db: Open database session.
        document_id: Owning document.
        extraction: Extraction result whose ``pages`` are persisted.

    Returns:
        Mapping of page number to the ``Page`` row id.
    """
    existing = {
        p.page_number: p
        for p in db.execute(select(Page).where(Page.document_id == document_id)).scalars()
    }
    out: dict[int, uuid.UUID] = {}
    for ep in extraction.pages:
        page = existing.get(ep.page_number)
        if page is None:
            page = Page(
                document_id=document_id,
                page_number=ep.page_number,
                text=ep.text,
                ocr_confidence=ep.ocr_confidence,
                has_tables=ep.has_tables,
                has_figures=ep.has_figures,
                has_handwriting=ep.has_handwriting,
            )
            db.add(page)
            # Flush so the new row has its id before it is read below.
            db.flush()
        else:
            page.text = ep.text
            # Keep the stored confidence in step with the new extraction,
            # exactly as the insert branch does.
            page.ocr_confidence = ep.ocr_confidence
            page.has_tables = ep.has_tables
            page.has_figures = ep.has_figures
            page.has_handwriting = ep.has_handwriting
        out[ep.page_number] = page.id
    return out
|
||||
|
||||
|
||||
def _build_index_payloads(
    document_id: uuid.UUID,
    chunks: list[ChunkRecord],
    extraction: ExtractionResult,
    language_hint: str | None,
) -> tuple[list[dict[str, Any]], list[tuple[str, str, dict[str, Any]]]]:
    """Build the OpenSearch documents and Qdrant entries for a document.

    Payloads are built from the ``Chunk`` rows stored in the database, so each
    one carries the persisted chunk id. ``chunks`` and ``extraction`` are
    accepted but not read by this function.

    Returns:
        ``(os_docs, qdrant)`` where ``os_docs`` is a list of OpenSearch
        documents and ``qdrant`` is a list of
        ``(chunk_id, full_text, payload)`` tuples. Both are empty when the
        document row does not exist.
    """
    doc_key = str(document_id)
    with session_scope() as db:
        document = db.get(Document, document_id)
        if document is None:
            return [], []
        file_name = document.original_file_name
        origin = document.source_path

        stored_chunks = db.execute(
            select(Chunk).where(Chunk.document_id == document_id)
        ).scalars().all()

        os_docs: list[dict[str, Any]] = []
        qdrant: list[tuple[str, str, dict[str, Any]]] = []

        for stored in stored_chunks:
            chunk_id = str(stored.id)
            body = stored.text or ""
            # Fields that appear, in this order, in both payloads.
            shared = {
                "document_id": doc_key,
                "source_path": origin,
                "original_file_name": file_name,
                "page_number": stored.page_number,
                "block_type": stored.block_type,
                "block_id": stored.block_id,
            }
            created = stored.created_at or datetime.now(tz=timezone.utc)
            os_docs.append(
                {
                    "chunk_id": chunk_id,
                    **shared,
                    "text": body,
                    "normalized_text": stored.normalized_text,
                    "ocr_confidence": stored.ocr_confidence,
                    "language_hint": language_hint,
                    "metadata": stored.chunk_metadata or {},
                    "quality_flags": stored.quality_flags or {},
                    "created_at": created.isoformat(),
                }
            )
            # The Qdrant payload keeps only a 512-character preview; the full
            # text travels as the tuple's second element.
            vector_payload = {
                **shared,
                "text_preview": body[:512],
                "ocr_confidence": stored.ocr_confidence,
                "quality_flags": stored.quality_flags or {},
                "metadata": stored.chunk_metadata or {},
            }
            qdrant.append((chunk_id, body, vector_payload))
    return os_docs, qdrant
|
||||
|
||||
|
||||
def _ensure_artifact(db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str) -> None:
    """Record a ``DocumentArtifact`` for ``key`` unless the document already has one.

    The lookup matches on document id and storage key only; ``artifact_type``
    is used for the new row but is not part of the existence check.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    already_recorded = db.execute(lookup).scalar_one_or_none()
    if not already_recorded:
        artifact = DocumentArtifact(
            document_id=document_id,
            artifact_type=artifact_type,
            storage_bucket=bucket,
            storage_key=key,
        )
        db.add(artifact)
|
||||
|
||||
|
||||
def _emit_event(document_id: uuid.UUID, run_id: uuid.UUID | None, stage: str, message: str) -> None:
    """Store one INFO-level ``ProcessingEvent`` with empty data in its own session scope."""
    with session_scope() as db:
        event = ProcessingEvent(
            run_id=run_id,
            document_id=document_id,
            stage=stage,
            level="INFO",
            message=message,
            data={},
        )
        db.add(event)
|
||||
|
||||
|
||||
def _fail(
    document_id: uuid.UUID, run_id: uuid.UUID | None, stage: str, message: str
) -> dict[str, Any]:
    """Record a pipeline failure and return the result dict for the caller.

    Sets the document's status to ``stage`` and stores the message (truncated
    to 2000 characters) when the document exists, adds an ERROR-level
    ``ProcessingEvent`` carrying the full message, and logs the failure.

    Returns:
        ``{"status": stage, "error": message}``.
    """
    with session_scope() as db:
        failed_doc = db.get(Document, document_id)
        if failed_doc is not None:
            failed_doc.status = stage
            failed_doc.error_message = message[:2000]
        error_event = ProcessingEvent(
            run_id=run_id,
            document_id=document_id,
            stage=stage,
            level="ERROR",
            message=message,
            data={},
        )
        db.add(error_event)

    logger.error("pipeline.failed", document_id=str(document_id), stage=stage, message=message)
    outcome: dict[str, Any] = {"status": stage, "error": message}
    return outcome
|
||||
41
app/ingestion/quality.py
Normal file
41
app/ingestion/quality.py
Normal file
@@ -0,0 +1,41 @@
|
||||
"""Quality flag computation for chunks."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from app.utils.text_cleaning import looks_garbled
|
||||
|
||||
LOW_OCR_CONFIDENCE_THRESHOLD = 0.6
|
||||
SHORT_TEXT_THRESHOLD = 24
|
||||
|
||||
|
||||
def compute_quality_flags(
    *,
    text: str,
    block_type: str,
    ocr_confidence: float | None,
    has_handwriting: bool = False,
) -> dict[str, Any]:
    """Derive the quality-flag dictionary for one chunk.

    A chunk needs manual review when its OCR confidence is below
    ``LOW_OCR_CONFIDENCE_THRESHOLD``, its text looks garbled, or handwriting
    was detected. Empty text is never flagged as very short; non-empty text is
    flagged when its stripped length is below ``SHORT_TEXT_THRESHOLD``.
    """
    low_confidence = bool(
        ocr_confidence is not None and ocr_confidence < LOW_OCR_CONFIDENCE_THRESHOLD
    )
    too_short = bool(text) and len(text.strip()) < SHORT_TEXT_THRESHOLD
    garbled = bool(looks_garbled(text))
    handwriting = has_handwriting or block_type == "handwriting"

    return {
        "low_ocr_confidence": low_confidence,
        "very_short_text": too_short,
        "possible_garbled_text": garbled,
        "table_detected": block_type == "table",
        "figure_detected": block_type in ("figure_caption", "figure_description"),
        "handwriting_detected": handwriting,
        "needs_manual_review": bool(low_confidence or garbled or handwriting),
    }
|
||||
184
app/ingestion/scanner.py
Normal file
184
app/ingestion/scanner.py
Normal file
@@ -0,0 +1,184 @@
|
||||
"""Folder scanner: discovers PDFs, deduplicates by SHA256, persists discovery rows.
|
||||
|
||||
The scanner does NOT trigger OCR or extraction. It only:
|
||||
- enumerates PDF files,
|
||||
- hashes each file,
|
||||
- creates / reuses a ``Document`` row,
|
||||
- uploads the original PDF to MinIO,
|
||||
- emits ``DISCOVERED`` / ``STORED_ORIGINAL`` events.
|
||||
|
||||
Heavy work (OCR, Docling, indexing) is performed by the Celery worker pipeline.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import uuid
|
||||
from collections.abc import Iterator
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from sqlalchemy import select
|
||||
|
||||
from app.db.models import (
|
||||
ArtifactType,
|
||||
Document,
|
||||
DocumentArtifact,
|
||||
DocumentStatus,
|
||||
ProcessingEvent,
|
||||
)
|
||||
from app.db.session import session_scope
|
||||
from app.logging_config import get_logger
|
||||
from app.storage.local_paths import key_original_pdf
|
||||
from app.storage.minio_client import get_storage
|
||||
from app.utils.hashing import sha256_file
|
||||
from app.utils.pdf import is_pdf
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
class DiscoveryRecord:
    """Outcome of scanning one candidate PDF path."""

    # Path of the scanned file.
    path: Path
    # Hash of the file contents; None when the file could not be stat'ed or hashed.
    sha256: str | None
    # Id of the matching or newly created Document row; None for invalid records.
    document_id: uuid.UUID | None
    # True when a Document with the same hash already existed.
    duplicate: bool
    # True when the file could not be read/hashed or storing the original failed.
    invalid: bool = False
|
||||
|
||||
|
||||
def iter_pdf_files(root: Path, recursive: bool = True) -> Iterator[Path]:
    """Lazily yield every path under ``root`` that ``is_pdf`` accepts.

    When ``root`` is a file it is the only candidate. Otherwise the directory
    tree is walked with ``os.walk`` (``recursive=True``) or only the direct
    entries of ``root`` are tested (``recursive=False``).
    """
    if root.is_file():
        candidates = iter((root,))
    elif recursive:
        candidates = (
            Path(folder) / file_name
            for folder, _subdirs, file_names in os.walk(root)
            for file_name in file_names
        )
    else:
        candidates = root.iterdir()
    yield from filter(is_pdf, candidates)
|
||||
|
||||
|
||||
def discover_documents(
    root: Path, recursive: bool = True, force: bool = False
) -> Iterator[DiscoveryRecord]:
    """Scan ``root`` for PDFs and yield one ``DiscoveryRecord`` per file.

    Each file is hashed, matched against existing ``Document`` rows by hash,
    registered if new, and its original uploaded to object storage.

    Each record is yielded only after the session scope for that file has
    exited, so the consumer never receives a ``document_id`` from a
    transaction that is still open (``session_scope`` is expected to commit on
    normal exit), and closing the generator early cannot interrupt that
    transaction.

    Args:
        root: File or directory to scan.
        recursive: Walk sub-directories when ``root`` is a directory.
        force: Re-process files whose hash already has a ``Document`` row
            instead of reporting them as duplicates and skipping them.

    Yields:
        A ``DiscoveryRecord`` for every candidate file, including invalid ones.
    """
    storage = get_storage()
    storage.ensure_buckets()

    for path in iter_pdf_files(root, recursive=recursive):
        try:
            stat = path.stat()
            sha = sha256_file(path)
        except Exception as exc:  # noqa: BLE001
            logger.warning("scan.invalid_file", path=str(path), error=str(exc))
            yield DiscoveryRecord(path=path, sha256=None, document_id=None, duplicate=False, invalid=True)
            continue

        with session_scope() as db:
            record = _discover_one(db, storage, path, sha, stat.st_size, force)
        # Hand the record out only once the session scope has been left.
        yield record


def _discover_one(db, storage, path: Path, sha: str, size: int, force: bool) -> DiscoveryRecord:
    """Register one hashed file and store its original inside the caller's session."""
    existing = db.execute(
        select(Document).where(Document.sha256 == sha)
    ).scalar_one_or_none()

    if existing and not force:
        logger.debug("scan.duplicate", path=str(path), sha256=sha, document_id=str(existing.id))
        return DiscoveryRecord(path=path, sha256=sha, document_id=existing.id, duplicate=True)

    doc = existing or Document(
        id=uuid.uuid4(),
        source_path=str(path),
        original_file_name=path.name,
        sha256=sha,
        file_size_bytes=size,
        mime_type="application/pdf",
        status=DocumentStatus.DISCOVERED,
    )
    if not existing:
        db.add(doc)
        db.flush()
        db.add(
            ProcessingEvent(
                document_id=doc.id,
                stage=DocumentStatus.DISCOVERED,
                level="INFO",
                message="Document discovered",
                data={"sha256": sha, "size": size, "path": str(path)},
            )
        )

    # Upload original (idempotent) and record artifact if missing.
    key = key_original_pdf(doc.id, sha)
    try:
        if not storage.exists(storage.originals_bucket, key):
            storage.put_file(
                bucket=storage.originals_bucket,
                key=key,
                path=path,
                content_type="application/pdf",
                metadata={"sha256": sha, "original-name": path.name[:255]},
            )
        _ensure_artifact(
            db,
            doc.id,
            ArtifactType.ORIGINAL_PDF,
            storage.originals_bucket,
            key,
            sha,
        )
        # Only advance freshly discovered documents; a forced re-scan must not
        # reset the status of one that has already progressed.
        if doc.status == DocumentStatus.DISCOVERED:
            doc.status = DocumentStatus.STORED_ORIGINAL
            db.add(
                ProcessingEvent(
                    document_id=doc.id,
                    stage=DocumentStatus.STORED_ORIGINAL,
                    level="INFO",
                    message="Original stored to MinIO",
                    data={"bucket": storage.originals_bucket, "key": key},
                )
            )
    except Exception as exc:  # noqa: BLE001
        logger.error("scan.store_failed", path=str(path), error=str(exc))
        doc.status = DocumentStatus.FAILED
        doc.error_message = f"store_original: {exc}"
        db.add(
            ProcessingEvent(
                document_id=doc.id,
                stage="STORE_FAILED",
                level="ERROR",
                message=str(exc),
                data={"path": str(path)},
            )
        )
        return DiscoveryRecord(path=path, sha256=sha, document_id=None, duplicate=False, invalid=True)

    return DiscoveryRecord(
        path=path, sha256=sha, document_id=doc.id, duplicate=bool(existing)
    )
|
||||
|
||||
|
||||
def _ensure_artifact(
    db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, checksum: str | None
) -> None:
    """Record a ``DocumentArtifact`` unless an identical one is already stored.

    An artifact counts as already stored when a row with the same document id,
    artifact type and storage key exists. ``checksum`` is saved on new rows only.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.artifact_type == artifact_type,
        DocumentArtifact.storage_key == key,
    )
    already_recorded = db.execute(lookup).scalar_one_or_none()
    if not already_recorded:
        artifact = DocumentArtifact(
            document_id=document_id,
            artifact_type=artifact_type,
            storage_bucket=bucket,
            storage_key=key,
            checksum=checksum,
        )
        db.add(artifact)
|
||||
84
app/ingestion/table_processor.py
Normal file
84
app/ingestion/table_processor.py
Normal file
@@ -0,0 +1,84 @@
|
||||
"""Persists Docling tables to PostgreSQL + MinIO."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import uuid
|
||||
|
||||
from sqlalchemy import select
|
||||
|
||||
from app.db.models import ArtifactType, DocumentArtifact, Table
|
||||
from app.ingestion.docling_extractor import ExtractedTable
|
||||
from app.logging_config import get_logger
|
||||
from app.storage.local_paths import key_table_json
|
||||
from app.storage.minio_client import MinioStorage
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
def persist_tables(
    db,
    storage: MinioStorage,
    document_id: uuid.UUID,
    tables: list[ExtractedTable],
    page_id_by_number: dict[int, uuid.UUID],
) -> int:
    """Upsert one ``Table`` row per extracted table and store its JSON in object storage.

    Rows are matched on ``(document_id, table_index)``. Existing rows have
    their content and their page reference refreshed, so a re-ingestion that
    places a table on a different page does not leave a stale location behind.

    Args:
        db: Open database session.
        storage: Object storage client used for the JSON copies.
        document_id: Owning document.
        tables: Tables produced by the extractor.
        page_id_by_number: Page number to ``Page`` id mapping for this document.

    Returns:
        Number of tables processed.
    """
    count = 0
    for t in tables:
        existing = db.execute(
            select(Table).where(Table.document_id == document_id, Table.table_index == t.table_index)
        ).scalar_one_or_none()
        if existing is None:
            existing = Table(
                document_id=document_id,
                page_id=page_id_by_number.get(t.page_number),
                page_number=t.page_number,
                table_index=t.table_index,
            )
            db.add(existing)
        else:
            # Re-point the existing row at the page of the current extraction.
            existing.page_id = page_id_by_number.get(t.page_number)
            existing.page_number = t.page_number

        existing.markdown = t.markdown or ""
        existing.csv_text = t.csv_text
        existing.json_data = t.json_data
        existing.summary = _summary(t)
        db.flush()

        # Persist json blob to MinIO for large/inspectable copies.
        if t.json_data:
            key = key_table_json(document_id, t.table_index)
            storage.put_bytes(
                bucket=storage.derived_bucket,
                key=key,
                data=json.dumps(t.json_data, ensure_ascii=False).encode("utf-8"),
                content_type="application/json",
            )
            _ensure_artifact(db, document_id, ArtifactType.TABLE_JSON, storage.derived_bucket, key, t.page_number)

        count += 1
    return count
|
||||
|
||||
|
||||
def _summary(t: ExtractedTable) -> str:
    """Return a one-line description of the table with its index, page and row count.

    The row count is the number of markdown lines starting with ``|`` minus
    two (header and separator lines), never below zero.
    """
    pipe_lines = [line for line in (t.markdown or "").splitlines() if line.startswith("|")]
    n_rows = len(pipe_lines) - 2
    if n_rows < 0:
        n_rows = 0
    return f"Table {t.table_index} on page {t.page_number} ({n_rows} rows)."
|
||||
|
||||
|
||||
def _ensure_artifact(db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, page: int | None) -> None:
    """Record a ``DocumentArtifact`` for ``key`` unless the document already has one.

    The lookup matches on document id and storage key only. ``page`` is saved
    as the new row's ``page_number``; an existing row is left unchanged.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    already_recorded = db.execute(lookup).scalar_one_or_none()
    if not already_recorded:
        artifact = DocumentArtifact(
            document_id=document_id,
            artifact_type=artifact_type,
            storage_bucket=bucket,
            storage_key=key,
            page_number=page,
        )
        db.add(artifact)
|
||||
Reference in New Issue
Block a user