"""Quality flag computation for chunks.""" from __future__ import annotations from typing import Any from app.utils.text_cleaning import looks_garbled LOW_OCR_CONFIDENCE_THRESHOLD = 0.6 SHORT_TEXT_THRESHOLD = 24 def compute_quality_flags( *, text: str, block_type: str, ocr_confidence: float | None, has_handwriting: bool = False, ) -> dict[str, Any]: flags: dict[str, Any] = { "low_ocr_confidence": False, "very_short_text": False, "possible_garbled_text": False, "table_detected": block_type == "table", "figure_detected": block_type in ("figure_caption", "figure_description"), "handwriting_detected": has_handwriting or block_type == "handwriting", "needs_manual_review": False, } if ocr_confidence is not None and ocr_confidence < LOW_OCR_CONFIDENCE_THRESHOLD: flags["low_ocr_confidence"] = True if text and len(text.strip()) < SHORT_TEXT_THRESHOLD: flags["very_short_text"] = True if looks_garbled(text): flags["possible_garbled_text"] = True if ( flags["low_ocr_confidence"] or flags["possible_garbled_text"] or flags["handwriting_detected"] ): flags["needs_manual_review"] = True return flags