chore: bootstrap repository with governance docs

Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Vadim Malanov
2026-05-13 16:41:50 +03:00
commit 7f72171572
157 changed files with 11298 additions and 0 deletions

45
tests/test_quality.py Normal file
View File

@@ -0,0 +1,45 @@
from __future__ import annotations
from app.ingestion.quality import compute_quality_flags
from app.utils.text_cleaning import clean_ocr_text, looks_garbled, normalize_for_search
def test_quality_low_confidence_flags_review():
    """An OCR confidence of 0.4 must raise both the low-confidence and review flags."""
    result = compute_quality_flags(
        text="hello world",
        block_type="paragraph",
        ocr_confidence=0.4,
    )
    # Both flags are expected to be the literal ``True``, not merely truthy.
    for flag_name in ("low_ocr_confidence", "needs_manual_review"):
        assert result[flag_name] is True
def test_quality_short_text():
    """A three-character paragraph at 0.95 confidence is marked short but not sent to review."""
    result = compute_quality_flags(
        text="abc",
        block_type="paragraph",
        ocr_confidence=0.95,
    )
    expected_flags = (
        ("very_short_text", True),
        ("needs_manual_review", False),
    )
    # Identity comparison: the flags must be real booleans.
    for flag_name, wanted in expected_flags:
        assert result[flag_name] is wanted
def test_quality_handwriting_forces_review():
    """A handwriting block with has_handwriting=True is flagged for review at 0.9 confidence."""
    call_kwargs = {
        "text": "неразборчивый текст",
        "block_type": "handwriting",
        "ocr_confidence": 0.9,
        "has_handwriting": True,
    }
    result = compute_quality_flags(**call_kwargs)
    for flag_name in ("handwriting_detected", "needs_manual_review"):
        assert result[flag_name] is True
def test_clean_ocr_text_removes_hyphenation():
    """A word split by a hyphen and a line break is rejoined into a single word."""
    hyphenated = "инвен-\nтарный номер 123"
    expected = "инвентарный номер 123"
    cleaned = clean_ocr_text(hyphenated)
    assert cleaned == expected
def test_normalize_preserves_codes():
    """Search normalization must keep standard and document codes intact."""
    normalized = normalize_for_search("ГОСТ 21.501-93 № 12/345")
    # Standard codes survive normalization (digits, slashes, dashes, dots).
    for code in ("21.501-93", "12/345"):
        assert code in normalized
def test_looks_garbled_detects_noise():
    """Repeated symbol noise is reported as garbled; an ordinary sentence is not."""
    symbol_noise = "@@##$$%%^^&&**(()_)(*&^%$#@!" * 5
    plain_sentence = "Hello world, this is a perfectly ordinary line of text."
    assert looks_garbled(symbol_noise)
    assert not looks_garbled(plain_sentence)