chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
45
tests/test_quality.py
Normal file
45
tests/test_quality.py
Normal file
@@ -0,0 +1,45 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.ingestion.quality import compute_quality_flags
|
||||
from app.utils.text_cleaning import clean_ocr_text, looks_garbled, normalize_for_search
|
||||
|
||||
|
||||
def test_quality_low_confidence_flags_review():
    """An OCR confidence of 0.4 must raise both the low-confidence and review flags."""
    result = compute_quality_flags(
        text="hello world",
        block_type="paragraph",
        ocr_confidence=0.4,
    )
    # Both flags are compared by identity: they must be the bool True,
    # not merely a truthy value.
    for flag_name in ("low_ocr_confidence", "needs_manual_review"):
        assert result[flag_name] is True
|
||||
|
||||
|
||||
def test_quality_short_text():
    """The text "abc" at confidence 0.95 is flagged as very short but not sent to review."""
    call_kwargs = {
        "text": "abc",
        "block_type": "paragraph",
        "ocr_confidence": 0.95,
    }
    result = compute_quality_flags(**call_kwargs)
    assert result["very_short_text"] is True
    # Short text alone is expected to leave the manual-review flag off.
    assert result["needs_manual_review"] is False
|
||||
|
||||
|
||||
def test_quality_handwriting_forces_review():
    """A handwriting block is flagged and sent to review even at confidence 0.9."""
    call_kwargs = dict(
        text="неразборчивый текст",
        block_type="handwriting",
        ocr_confidence=0.9,
        has_handwriting=True,
    )
    outcome = compute_quality_flags(**call_kwargs)
    assert outcome["handwriting_detected"] is True
    assert outcome["needs_manual_review"] is True
|
||||
|
||||
|
||||
def test_clean_ocr_text_removes_hyphenation():
    """A word split by a hyphen and a newline is expected to be rejoined."""
    # The input breaks one word across two lines with a trailing hyphen.
    hyphenated = "инвен-\nтарный номер 123"
    expected = "инвентарный номер 123"
    assert clean_ocr_text(hyphenated) == expected
|
||||
|
||||
|
||||
def test_normalize_preserves_codes():
    """Standard and document codes must still appear verbatim after normalization."""
    normalized = normalize_for_search("ГОСТ 21.501-93 № 12/345")
    # Standard codes survive normalization (digits, slashes, dashes, dots).
    for code in ("21.501-93", "12/345"):
        assert code in normalized
|
||||
|
||||
|
||||
def test_looks_garbled_detects_noise():
    """A punctuation-only string is reported as garbled; an ordinary sentence is not."""
    symbol_noise = "@@##$$%%^^&&**(()_)(*&^%$#@!" * 5
    ordinary_sentence = "Hello world, this is a perfectly ordinary line of text."
    assert looks_garbled(symbol_noise)
    assert not looks_garbled(ordinary_sentence)
|
||||
Reference in New Issue
Block a user