Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
42 lines
1.2 KiB
Python
42 lines
1.2 KiB
Python
"""Quality flag computation for chunks."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from typing import Any
|
|
|
|
from app.utils.text_cleaning import looks_garbled
|
|
|
|
# OCR confidence values strictly below this cut-off set the
# "low_ocr_confidence" flag in compute_quality_flags (a missing/None
# confidence is never flagged).
LOW_OCR_CONFIDENCE_THRESHOLD = 0.6
|
|
# Non-empty text whose whitespace-stripped length (in characters) is strictly
# below this value sets the "very_short_text" flag in compute_quality_flags.
SHORT_TEXT_THRESHOLD = 24
|
|
|
|
|
|
def compute_quality_flags(
    *,
    text: str,
    block_type: str,
    ocr_confidence: float | None,
    has_handwriting: bool = False,
) -> dict[str, Any]:
    """Build the quality-flag mapping for a single chunk.

    Args:
        text: Chunk text. Falsy text (e.g. ``""``) is never reported as very
            short, but it is still handed to ``looks_garbled``.
        block_type: Block kind label. ``"table"``, ``"figure_caption"``,
            ``"figure_description"`` and ``"handwriting"`` drive the
            corresponding detection flags.
        ocr_confidence: OCR confidence for the chunk, or ``None`` when no
            value is available (in which case it is never flagged as low).
        has_handwriting: Caller-supplied hint that the chunk contains
            handwriting.

    Returns:
        A dict with the keys ``low_ocr_confidence``, ``very_short_text``,
        ``possible_garbled_text``, ``table_detected``, ``figure_detected``,
        ``handwriting_detected`` and ``needs_manual_review``, in that order.
        ``needs_manual_review`` is set when the OCR confidence is low, the
        text looks garbled, or handwriting was detected.
    """
    # Signals derived purely from the block type / caller hint.
    is_table = block_type == "table"
    is_figure = block_type in ("figure_caption", "figure_description")
    handwriting = has_handwriting or block_type == "handwriting"

    # Signals derived from the OCR confidence and the text itself.
    low_confidence = bool(
        ocr_confidence is not None
        and ocr_confidence < LOW_OCR_CONFIDENCE_THRESHOLD
    )
    # Length is measured after stripping surrounding whitespace; empty text
    # is skipped by the truthiness guard.
    too_short = bool(text and len(text.strip()) < SHORT_TEXT_THRESHOLD)
    garbled = bool(looks_garbled(text))

    return {
        "low_ocr_confidence": low_confidence,
        "very_short_text": too_short,
        "possible_garbled_text": garbled,
        "table_detected": is_table,
        "figure_detected": is_figure,
        "handwriting_detected": handwriting,
        # Short text alone does not trigger review; only these three do.
        "needs_manual_review": bool(low_confidence or garbled or handwriting),
    }
|