from __future__ import annotations

from app.ingestion.quality import compute_quality_flags
from app.utils.text_cleaning import clean_ocr_text, looks_garbled, normalize_for_search


def test_quality_low_confidence_flags_review():
    """An OCR confidence of 0.4 sets both the low-confidence and review flags."""
    result = compute_quality_flags(
        text="hello world",
        block_type="paragraph",
        ocr_confidence=0.4,
    )

    for key in ("low_ocr_confidence", "needs_manual_review"):
        assert result[key] is True


def test_quality_short_text():
    """A three-character paragraph is flagged as very short but not for review."""
    result = compute_quality_flags(
        text="abc",
        block_type="paragraph",
        ocr_confidence=0.95,
    )

    assert result["very_short_text"] is True
    assert result["needs_manual_review"] is False


def test_quality_handwriting_forces_review():
    """A handwriting block is flagged for manual review even at 0.9 confidence."""
    call_kwargs = {
        "text": "неразборчивый текст",
        "block_type": "handwriting",
        "ocr_confidence": 0.9,
        "has_handwriting": True,
    }
    result = compute_quality_flags(**call_kwargs)

    assert result["handwriting_detected"] is True
    assert result["needs_manual_review"] is True


def test_clean_ocr_text_removes_hyphenation():
    """A word hyphenated across a line break comes back rejoined."""
    hyphenated = "инвен-\nтарный номер 123"
    expected = "инвентарный номер 123"

    assert clean_ocr_text(hyphenated) == expected


def test_normalize_preserves_codes():
    """Document codes remain present in the search-normalized text."""
    normalized = normalize_for_search("ГОСТ 21.501-93 № 12/345")

    # Digits, dots, dashes and slashes inside the codes must not be stripped.
    for code in ("21.501-93", "12/345"):
        assert code in normalized


def test_looks_garbled_detects_noise():
    """Repeated symbol noise is reported as garbled; an ordinary sentence is not."""
    symbol_noise = "@@##$$%%^^&&**(()_)(*&^%$#@!" * 5
    ordinary_sentence = "Hello world, this is a perfectly ordinary line of text."

    assert looks_garbled(symbol_noise)
    assert not looks_garbled(ordinary_sentence)