Files
LegacyHUB/app/ingestion/quality.py
Vadim Malanov 7f72171572 chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 16:41:50 +03:00

42 lines
1.2 KiB
Python

"""Quality flag computation for chunks."""
from __future__ import annotations
from typing import Any
from app.utils.text_cleaning import looks_garbled
# OCR confidence values strictly below this threshold set the
# "low_ocr_confidence" flag in compute_quality_flags.
LOW_OCR_CONFIDENCE_THRESHOLD = 0.6
# Non-empty text whose whitespace-stripped length is below this many
# characters sets the "very_short_text" flag in compute_quality_flags.
SHORT_TEXT_THRESHOLD = 24
def compute_quality_flags(
    *,
    text: str,
    block_type: str,
    ocr_confidence: float | None,
    has_handwriting: bool = False,
) -> dict[str, Any]:
    """Build the quality-flag mapping for one chunk.

    All arguments are keyword-only.

    Args:
        text: Chunk text; checked for shortness and passed to
            ``looks_garbled``.
        block_type: Block type label. ``"table"``, ``"figure_caption"``,
            ``"figure_description"`` and ``"handwriting"`` drive the
            corresponding ``*_detected`` flags.
        ocr_confidence: OCR confidence score, or ``None`` when no score is
            available. ``None`` never sets ``low_ocr_confidence``.
        has_handwriting: Explicit handwriting marker; combined with
            ``block_type == "handwriting"``.

    Returns:
        A dict with the keys ``low_ocr_confidence``, ``very_short_text``,
        ``possible_garbled_text``, ``table_detected``, ``figure_detected``,
        ``handwriting_detected`` and ``needs_manual_review``.
        ``needs_manual_review`` is true when the OCR confidence is low, the
        text looks garbled, or handwriting was detected; short text alone
        does not request a review.
    """
    # Structural flags depend only on the block type (plus the explicit
    # handwriting marker).
    is_table = block_type == "table"
    is_figure = block_type in ("figure_caption", "figure_description")
    is_handwriting = has_handwriting or block_type == "handwriting"

    # A missing confidence score is not treated as low confidence.
    if ocr_confidence is None:
        ocr_is_low = False
    else:
        ocr_is_low = bool(ocr_confidence < LOW_OCR_CONFIDENCE_THRESHOLD)

    # Empty text is never flagged as very short; whitespace-only text is,
    # because the length is measured after stripping.
    is_short = bool(text) and len(text.strip()) < SHORT_TEXT_THRESHOLD

    is_garbled = bool(looks_garbled(text))

    return {
        "low_ocr_confidence": ocr_is_low,
        "very_short_text": is_short,
        "possible_garbled_text": is_garbled,
        "table_detected": is_table,
        "figure_detected": is_figure,
        "handwriting_detected": is_handwriting,
        "needs_manual_review": bool(ocr_is_low or is_garbled or is_handwriting),
    }