chore: bootstrap repository with governance docs

Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 16:41:50 +03:00
commit 7f72171572
157 changed files with 11298 additions and 0 deletions
--- a/tests/__init__.py
+++ b/tests/__init__.py
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -0,0 +1,10 @@
+"""Pytest configuration - put repository root on sys.path."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+# The repository root is the second ancestor of this file's resolved path,
+# i.e. the directory that contains ``tests/``.
+ROOT = Path(__file__).resolve().parents[1]
+_root_entry = str(ROOT)
+# Prepend only when the entry is absent so it is never duplicated.
+if _root_entry not in sys.path:
+    sys.path.insert(0, _root_entry)
--- a/tests/test_chunker.py
+++ b/tests/test_chunker.py
@@ -0,0 +1,81 @@
+from __future__ import annotations
+
+from app.ingestion.chunker import chunk_extraction
+from app.ingestion.docling_extractor import (
+    ExtractedBlock,
+    ExtractedFigure,
+    ExtractedPage,
+    ExtractedTable,
+    ExtractionResult,
+)
+
+
+def _extraction(blocks=None, tables=None, figures=None, pages=None) -> ExtractionResult:
+    """Build an ``ExtractionResult`` for tests, filling in omitted parts.
+
+    A falsy ``blocks``, ``tables`` or ``figures`` argument is replaced by an
+    empty list, and a falsy ``pages`` argument by a single page numbered 1
+    with empty text. ``markdown`` is always ``""`` and ``json_payload`` is
+    always ``{}``.
+    """
+    if not pages:
+        pages = [ExtractedPage(page_number=1, text="")]
+    return ExtractionResult(
+        markdown="",
+        json_payload={},
+        blocks=blocks if blocks else [],
+        tables=tables if tables else [],
+        figures=figures if figures else [],
+        pages=pages,
+    )
+
+
+def test_chunker_emits_table_unsplit():
+    """A single extracted table yields exactly one ``table`` chunk."""
+    table_md = "| a | b |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |"
+    table = ExtractedTable(page_number=2, table_index=0, markdown=table_md)
+    produced = chunk_extraction(_extraction(tables=[table]))
+    tables_out = [chunk for chunk in produced if chunk.block_type == "table"]
+    # Exactly one chunk: the table must not be spread over several chunks.
+    assert len(tables_out) == 1
+    only = tables_out[0]
+    assert "| 1 | 2 |" in only.text
+    assert only.page_number == 2
+    assert only.quality_flags["table_detected"] is True
+
+
+def test_chunker_handles_paragraphs_with_section_heading():
+    """Heading text reaches a narrative chunk and every chunk has metadata."""
+    heading = ExtractedBlock(page_number=1, block_type="heading", text="Глава 1. Введение")
+    first_paragraph = ExtractedBlock(
+        page_number=1,
+        block_type="paragraph",
+        text="Первый параграф документа " * 30,
+    )
+    second_paragraph = ExtractedBlock(
+        page_number=1,
+        block_type="paragraph",
+        text="Второй параграф продолжает тему " * 30,
+    )
+    extraction = _extraction(blocks=[heading, first_paragraph, second_paragraph])
+    produced = chunk_extraction(extraction)
+    narrative = [
+        chunk for chunk in produced if chunk.block_type in ("paragraph", "heading")
+    ]
+    assert narrative, "expected at least one narrative chunk"
+    # At least one chunk must contain the section heading text as context.
+    assert any("Глава 1" in chunk.text for chunk in narrative)
+    # Every narrative chunk must carry its page number and quality flags.
+    for chunk in narrative:
+        assert chunk.page_number == 1
+        assert chunk.quality_flags is not None
+        assert "needs_manual_review" in chunk.quality_flags
+
+
+def test_chunker_emits_figure_caption_chunks():
+    """A figure caption becomes a chunk whose block type starts with ``figure``."""
+    figure = ExtractedFigure(page_number=4, figure_index=0, caption="Схема ремонта")
+    produced = chunk_extraction(_extraction(figures=[figure]))
+    figure_chunks = [
+        chunk for chunk in produced if chunk.block_type.startswith("figure")
+    ]
+    assert figure_chunks
+    leading = figure_chunks[0]
+    # The first figure chunk keeps the source page and the caption text.
+    assert leading.page_number == 4
+    assert "Схема ремонта" in leading.text
+
+
+def test_chunker_splits_giant_block():
+    """A 5000-word paragraph is split into at least two paragraph chunks."""
+    words = [f"word{i}" for i in range(5000)]
+    giant_block = ExtractedBlock(
+        page_number=1, block_type="paragraph", text=" ".join(words)
+    )
+    produced = chunk_extraction(_extraction(blocks=[giant_block]))
+    paragraph_count = sum(1 for chunk in produced if chunk.block_type == "paragraph")
+    # The giant block must be split, so a single chunk is not acceptable.
+    assert paragraph_count >= 2
--- a/tests/test_duplicates.py
+++ b/tests/test_duplicates.py
@@ -0,0 +1,22 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from app.utils.hashing import sha256_file
+
+
+def test_two_files_with_same_content_share_sha(tmp_path: Path):
+    """Two differently named files with identical bytes share one digest."""
+    payload = b"%PDF-1.4\n" + b"x" * 4096
+    first, second = tmp_path / "a.pdf", tmp_path / "b.pdf"
+    for target in (first, second):
+        target.write_bytes(payload)
+    assert sha256_file(first) == sha256_file(second)
+
+
+def test_one_byte_difference_changes_sha(tmp_path: Path):
+    """Files of equal length that differ only in the last byte hash differently."""
+    header = b"%PDF-1.4\n"
+    first = tmp_path / "a.pdf"
+    second = tmp_path / "b.pdf"
+    first.write_bytes(header + b"x" * 4096)
+    # Same length as the first file; only the final byte differs.
+    second.write_bytes(header + b"x" * 4095 + b"y")
+    assert sha256_file(first) != sha256_file(second)
--- a/tests/test_hashing.py
+++ b/tests/test_hashing.py
@@ -0,0 +1,26 @@
+from __future__ import annotations
+
+import hashlib
+from pathlib import Path
+
+from app.utils.hashing import sha256_bytes, sha256_file
+
+
+def test_sha256_bytes_matches_hashlib():
+    """``sha256_bytes`` returns the same hex digest as ``hashlib.sha256``."""
+    data = b"legacyhub" * 1000
+    expected = hashlib.sha256(data).hexdigest()
+    assert sha256_bytes(data) == expected
+
+
+def test_sha256_file_streaming_matches_hashlib(tmp_path: Path):
+    """``sha256_file`` on a large file matches hashing the same bytes in memory."""
+    # 15,000,000 bytes; presumably large enough to span several read chunks
+    # in sha256_file - confirm against its implementation.
+    payload = b"\x01\x02\x03" * 5_000_000
+    blob = tmp_path / "blob.bin"
+    blob.write_bytes(payload)
+    expected = hashlib.sha256(payload).hexdigest()
+    assert sha256_file(blob) == expected
+
+
+def test_sha256_file_distinguishes_content(tmp_path: Path):
+    """Files with different contents produce different digests."""
+    first, second = tmp_path / "a.bin", tmp_path / "b.bin"
+    first.write_bytes(b"alpha")
+    second.write_bytes(b"beta")
+    digest_first = sha256_file(first)
+    digest_second = sha256_file(second)
+    assert digest_first != digest_second
--- a/tests/test_hybrid_search.py
+++ b/tests/test_hybrid_search.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from app.api.schemas import SearchMode
+from app.indexing.hybrid_search import _Candidate, _merge
+
+
+def _make(chunk_id: str, *, bm25_rank: int | None = None, dense_rank: int | None = None) -> _Candidate:
+    """Build a ``_Candidate`` with placeholder metadata and rank-derived scores.
+
+    ``bm25_score`` is ``1.0 / bm25_rank`` and ``dense_score`` is
+    ``1.0 - 0.1 * dense_rank``; each score is ``None`` when the matching rank
+    is ``None``. All other fields are fixed placeholders, except ``text``,
+    which embeds ``chunk_id``.
+    """
+    bm25_score = None
+    if bm25_rank is not None:
+        bm25_score = 1.0 / bm25_rank
+
+    dense_score = None
+    if dense_rank is not None:
+        dense_score = 1.0 - 0.1 * dense_rank
+
+    return _Candidate(
+        chunk_id=chunk_id,
+        document_id="00000000-0000-0000-0000-000000000000",
+        page_number=1,
+        block_type="paragraph",
+        block_id=None,
+        text=f"text-{chunk_id}",
+        source_path="/tmp/doc.pdf",
+        original_file_name="doc.pdf",
+        quality_flags={},
+        metadata={},
+        bm25_score=bm25_score,
+        bm25_rank=bm25_rank,
+        dense_score=dense_score,
+        dense_rank=dense_rank,
+    )
+
+
+def test_merge_lexical_passes_through():
+    """In ``lexical`` mode with no semantic hits the lexical order is kept."""
+    lexical_hits = [_make("a", bm25_rank=1), _make("b", bm25_rank=2)]
+    merged = _merge(lexical_hits, [], "lexical")
+    merged_ids = [candidate.chunk_id for candidate in merged]
+    assert merged_ids == ["a", "b"]
+
+
+def test_merge_hybrid_uses_rrf_to_rank_intersected_results_higher():
+    """A chunk found by both retrievers is ranked first in ``hybrid`` mode."""
+    lexical_hits = [_make("a", bm25_rank=2), _make("b", bm25_rank=1)]
+    semantic_hits = [_make("a", dense_rank=1), _make("c", dense_rank=2)]
+    ranked_ids = [
+        candidate.chunk_id
+        for candidate in _merge(lexical_hits, semantic_hits, "hybrid")
+    ]
+    # ``a`` is the only id present in both lists, so the merge (RRF, per the
+    # test name) is expected to place it ahead of ``b`` and ``c``.
+    top_id = ranked_ids[0]
+    assert top_id == "a"
+    assert set(ranked_ids) == {"a", "b", "c"}
+
+
+def test_merge_hybrid_handles_disjoint_sets():
+    """With no overlap between the two lists, ``hybrid`` mode keeps both ids."""
+    lexical_hits = [_make("x", bm25_rank=1)]
+    semantic_hits = [_make("y", dense_rank=1)]
+    merged_ids = {
+        candidate.chunk_id
+        for candidate in _merge(lexical_hits, semantic_hits, "hybrid")
+    }
+    assert merged_ids == {"x", "y"}
+
+
+def test_search_mode_typed():
+    """Smoke check that the three mode strings are annotated as ``SearchMode``."""
+    # NOTE(review): the runtime assertion compares the list with an equal
+    # literal, so it cannot fail; the annotation presumably only matters to a
+    # static type checker - confirm whether a stronger check is wanted.
+    expected = ["lexical", "semantic", "hybrid"]
+    valid: list[SearchMode] = ["lexical", "semantic", "hybrid"]
+    assert valid == expected
--- a/tests/test_quality.py
+++ b/tests/test_quality.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+from app.ingestion.quality import compute_quality_flags
+from app.utils.text_cleaning import clean_ocr_text, looks_garbled, normalize_for_search
+
+
+def test_quality_low_confidence_flags_review():
+    """An OCR confidence of 0.4 sets the low-confidence and review flags."""
+    flags = compute_quality_flags(
+        text="hello world",
+        block_type="paragraph",
+        ocr_confidence=0.4,
+    )
+    for key in ("low_ocr_confidence", "needs_manual_review"):
+        assert flags[key] is True
+
+
+def test_quality_short_text():
+    """A three-character text is flagged as very short but not for review."""
+    result = compute_quality_flags(
+        text="abc",
+        block_type="paragraph",
+        ocr_confidence=0.95,
+    )
+    assert result["very_short_text"] is True
+    # Short text alone, at 0.95 confidence, must not request manual review.
+    assert result["needs_manual_review"] is False
+
+
+def test_quality_handwriting_forces_review():
+    """Handwriting sets the review flag even at an OCR confidence of 0.9."""
+    call_args = {
+        "text": "неразборчивый текст",
+        "block_type": "handwriting",
+        "ocr_confidence": 0.9,
+        "has_handwriting": True,
+    }
+    flags = compute_quality_flags(**call_args)
+    for key in ("handwriting_detected", "needs_manual_review"):
+        assert flags[key] is True
+
+
+def test_clean_ocr_text_removes_hyphenation():
+    """A word hyphenated across a line break is joined back together."""
+    raw = "инвен-\nтарный номер 123"
+    cleaned = clean_ocr_text(raw)
+    assert cleaned == "инвентарный номер 123"
+
+
+def test_normalize_preserves_codes():
+    """Standard codes with digits, dots, dashes and slashes survive normalization."""
+    normalized = normalize_for_search("ГОСТ 21.501-93 № 12/345")
+    # Both code fragments must appear verbatim in the normalized text.
+    for code in ("21.501-93", "12/345"):
+        assert code in normalized
+
+
+def test_looks_garbled_detects_noise():
+    """Symbol noise is reported as garbled; an ordinary sentence is not."""
+    noise = "@@##$$%%^^&&**(()_)(*&^%$#@!" * 5
+    ordinary = "Hello world, this is a perfectly ordinary line of text."
+    assert looks_garbled(noise)
+    assert not looks_garbled(ordinary)