chore: bootstrap repository with governance docs

Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Vadim Malanov
2026-05-13 16:41:50 +03:00
commit 7f72171572
157 changed files with 11298 additions and 0 deletions

41
app/ingestion/quality.py Normal file
View File

@@ -0,0 +1,41 @@
"""Quality flag computation for chunks."""
from __future__ import annotations
from typing import Any
from app.utils.text_cleaning import looks_garbled
# An ocr_confidence strictly below this value sets the "low_ocr_confidence"
# flag (a confidence of None is never flagged).
# NOTE(review): presumably confidence is on a 0.0-1.0 scale — confirm with the OCR producer.
LOW_OCR_CONFIDENCE_THRESHOLD = 0.6
# Non-empty text whose whitespace-stripped length is strictly below this many
# characters sets the "very_short_text" flag.
SHORT_TEXT_THRESHOLD = 24
def compute_quality_flags(
    *,
    text: str,
    block_type: str,
    ocr_confidence: float | None,
    has_handwriting: bool = False,
) -> dict[str, Any]:
    """Build the quality-flag mapping for a single chunk.

    Args:
        text: Chunk text; checked for shortness and passed to
            ``looks_garbled``.
        block_type: Block kind label. ``"table"``, ``"figure_caption"``,
            ``"figure_description"`` and ``"handwriting"`` each drive a
            detection flag.
        ocr_confidence: OCR confidence score, or ``None`` when there is no
            score (in which case the low-confidence flag stays ``False``).
        has_handwriting: Caller-supplied hint that the chunk contains
            handwriting, independent of ``block_type``.

    Returns:
        A dict with the keys ``low_ocr_confidence``, ``very_short_text``,
        ``possible_garbled_text``, ``table_detected``, ``figure_detected``,
        ``handwriting_detected`` and ``needs_manual_review``, in that order.
    """
    # Structural signals derived purely from the block type / caller hint.
    is_table = block_type == "table"
    is_figure = block_type in ("figure_caption", "figure_description")
    handwriting = has_handwriting or block_type == "handwriting"

    # Content signals. A missing confidence is never treated as "low", and
    # empty text is never treated as "very short".
    low_confidence = bool(
        ocr_confidence is not None
        and ocr_confidence < LOW_OCR_CONFIDENCE_THRESHOLD
    )
    too_short = bool(text and len(text.strip()) < SHORT_TEXT_THRESHOLD)
    garbled = bool(looks_garbled(text))

    # Short text, tables and figures alone do not trigger manual review.
    review = bool(low_confidence or garbled or handwriting)

    return {
        "low_ocr_confidence": low_confidence,
        "very_short_text": too_short,
        "possible_garbled_text": garbled,
        "table_detected": is_table,
        "figure_detected": is_figure,
        "handwriting_detected": handwriting,
        "needs_manual_review": review,
    }