Files
LegacyHUB/tests/test_quality.py
Vadim Malanov 7f72171572 chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 16:41:50 +03:00

46 lines
1.6 KiB
Python

from __future__ import annotations
from app.ingestion.quality import compute_quality_flags
from app.utils.text_cleaning import clean_ocr_text, looks_garbled, normalize_for_search
def test_quality_low_confidence_flags_review():
    """An OCR confidence of 0.4 sets both the low-confidence and review flags."""
    sample = {
        "text": "hello world",
        "block_type": "paragraph",
        "ocr_confidence": 0.4,
    }
    result = compute_quality_flags(**sample)
    # Both flags must be the literal ``True``, not merely truthy.
    for key in ("low_ocr_confidence", "needs_manual_review"):
        assert result[key] is True
def test_quality_short_text():
    """A three-character block is flagged as very short but not sent to review."""
    result = compute_quality_flags(
        text="abc",
        block_type="paragraph",
        ocr_confidence=0.95,
    )
    is_short = result["very_short_text"]
    wants_review = result["needs_manual_review"]
    assert is_short is True
    assert wants_review is False
def test_quality_handwriting_forces_review():
    """A handwriting block at 0.9 confidence is still marked for manual review."""
    sample = dict(
        text="неразборчивый текст",
        block_type="handwriting",
        ocr_confidence=0.9,
        has_handwriting=True,
    )
    result = compute_quality_flags(**sample)
    detected = result["handwriting_detected"]
    assert detected is True
    wants_review = result["needs_manual_review"]
    assert wants_review is True
def test_clean_ocr_text_removes_hyphenation():
    """A word split by a hyphen and line break is rejoined into one word."""
    expected = "инвентарный номер 123"
    cleaned = clean_ocr_text("инвен-\nтарный номер 123")
    assert cleaned == expected
def test_normalize_preserves_codes():
    """Document codes remain present as substrings after search normalization."""
    normalized = normalize_for_search("ГОСТ 21.501-93 № 12/345")
    # Standard codes (digits, slashes, dashes, dots) must come through intact.
    for code in ("21.501-93", "12/345"):
        assert code in normalized
def test_looks_garbled_detects_noise():
    """Repeated symbol noise is reported as garbled; a plain sentence is not."""
    noise = "@@##$$%%^^&&**(()_)(*&^%$#@!" * 5
    sentence = "Hello world, this is a perfectly ordinary line of text."
    assert looks_garbled(noise)
    assert not looks_garbled(sentence)