chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
41
app/ingestion/quality.py
Normal file
41
app/ingestion/quality.py
Normal file
@@ -0,0 +1,41 @@
|
||||
"""Quality flag computation for chunks."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from app.utils.text_cleaning import looks_garbled
|
||||
|
||||
# OCR confidence strictly below this value sets the "low_ocr_confidence" flag.
LOW_OCR_CONFIDENCE_THRESHOLD = 0.6
|
||||
# Text whose stripped length is below this many characters sets "very_short_text".
SHORT_TEXT_THRESHOLD = 24
|
||||
|
||||
|
||||
def compute_quality_flags(
    *,
    text: str,
    block_type: str,
    ocr_confidence: float | None,
    has_handwriting: bool = False,
) -> dict[str, Any]:
    """Derive the quality flags for a single chunk.

    All arguments are keyword-only.

    Args:
        text: The chunk's text content.
        block_type: The block kind; compared against "table",
            "figure_caption", "figure_description" and "handwriting".
        ocr_confidence: OCR confidence score, or ``None`` when no score is
            available (in which case the low-confidence flag stays off).
        has_handwriting: Explicit handwriting marker supplied by the caller.

    Returns:
        A dict with the keys ``low_ocr_confidence``, ``very_short_text``,
        ``possible_garbled_text``, ``table_detected``, ``figure_detected``,
        ``handwriting_detected`` and ``needs_manual_review``.
    """
    # Flags derived purely from the block type / caller-supplied marker.
    is_table = block_type == "table"
    is_figure = block_type in ("figure_caption", "figure_description")
    handwriting = has_handwriting or block_type == "handwriting"

    # A missing confidence score never counts as low confidence.
    low_confidence = bool(
        ocr_confidence is not None
        and ocr_confidence < LOW_OCR_CONFIDENCE_THRESHOLD
    )

    # Empty text is deliberately left unflagged; the length check is applied
    # to the stripped text, so whitespace-only text counts as very short.
    too_short = bool(text and len(text.strip()) < SHORT_TEXT_THRESHOLD)

    garbled = bool(looks_garbled(text))

    # Any of these three conditions routes the chunk to manual review;
    # short text, tables and figures on their own do not.
    review = bool(low_confidence or garbled or handwriting)

    return {
        "low_ocr_confidence": low_confidence,
        "very_short_text": too_short,
        "possible_garbled_text": garbled,
        "table_detected": is_table,
        "figure_detected": is_figure,
        "handwriting_detected": handwriting,
        "needs_manual_review": review,
    }
|
||||
Reference in New Issue
Block a user