chore: bootstrap repository with governance docs

Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Vadim Malanov
2026-05-13 16:41:50 +03:00
commit 7f72171572
157 changed files with 11298 additions and 0 deletions

0
tests/__init__.py Normal file
View File

10
tests/conftest.py Normal file
View File

@@ -0,0 +1,10 @@
"""Pytest configuration - put repository root on sys.path."""
from __future__ import annotations
import sys
from pathlib import Path
# Repository root: the directory two levels up from this file.
ROOT = Path(__file__).resolve().parent.parent

# Prepend the root to the import path once; skip if it is already listed.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

81
tests/test_chunker.py Normal file
View File

@@ -0,0 +1,81 @@
from __future__ import annotations
from app.ingestion.chunker import chunk_extraction
from app.ingestion.docling_extractor import (
ExtractedBlock,
ExtractedFigure,
ExtractedPage,
ExtractedTable,
ExtractionResult,
)
def _extraction(blocks=None, tables=None, figures=None, pages=None) -> ExtractionResult:
    """Build an ``ExtractionResult`` with empty markdown and JSON payload.

    Any falsy ``blocks``, ``tables`` or ``figures`` argument becomes an empty
    list; a falsy ``pages`` argument becomes a single empty page numbered 1.
    """
    if not pages:
        pages = [ExtractedPage(page_number=1, text="")]
    return ExtractionResult(
        markdown="",
        json_payload={},
        blocks=blocks if blocks else [],
        tables=tables if tables else [],
        figures=figures if figures else [],
        pages=pages,
    )
def test_chunker_emits_table_unsplit():
    """A single extracted table yields exactly one ``table`` chunk."""
    md = "| a | b |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |"
    table = ExtractedTable(page_number=2, table_index=0, markdown=md)

    produced = chunk_extraction(_extraction(tables=[table]))
    table_chunks = [chunk for chunk in produced if chunk.block_type == "table"]

    assert len(table_chunks) == 1
    only = table_chunks[0]
    assert "| 1 | 2 |" in only.text
    assert only.page_number == 2
    assert only.quality_flags["table_detected"] is True
def test_chunker_handles_paragraphs_with_section_heading():
    """A heading followed by two paragraphs yields narrative chunks with metadata."""
    heading = ExtractedBlock(page_number=1, block_type="heading", text="Глава 1. Введение")
    first_paragraph = ExtractedBlock(
        page_number=1,
        block_type="paragraph",
        text="Первый параграф документа " * 30,
    )
    second_paragraph = ExtractedBlock(
        page_number=1,
        block_type="paragraph",
        text="Второй параграф продолжает тему " * 30,
    )

    produced = chunk_extraction(
        _extraction(blocks=[heading, first_paragraph, second_paragraph])
    )

    narrative_types = ("paragraph", "heading")
    narrative = [chunk for chunk in produced if chunk.block_type in narrative_types]
    assert narrative, "expected at least one narrative chunk"

    # The heading text must surface in at least one narrative chunk.
    assert any("Глава 1" in chunk.text for chunk in narrative)

    # Every narrative chunk carries its page number and quality flags.
    for chunk in narrative:
        assert chunk.page_number == 1
        assert chunk.quality_flags is not None
        assert "needs_manual_review" in chunk.quality_flags
def test_chunker_emits_figure_caption_chunks():
    """A captioned figure produces a ``figure*`` chunk holding the caption."""
    figure = ExtractedFigure(page_number=4, figure_index=0, caption="Схема ремонта")

    produced = chunk_extraction(_extraction(figures=[figure]))
    figure_chunks = [chunk for chunk in produced if chunk.block_type.startswith("figure")]

    assert figure_chunks
    leading = figure_chunks[0]
    assert leading.page_number == 4
    assert "Схема ремонта" in leading.text
def test_chunker_splits_giant_block():
    """A 5000-word paragraph block comes back as two or more paragraph chunks."""
    words = [f"word{i}" for i in range(5000)]
    giant = ExtractedBlock(page_number=1, block_type="paragraph", text=" ".join(words))

    produced = chunk_extraction(_extraction(blocks=[giant]))
    paragraph_count = sum(1 for chunk in produced if chunk.block_type == "paragraph")

    # The oversized block must not survive as a single chunk.
    assert paragraph_count >= 2

22
tests/test_duplicates.py Normal file
View File

@@ -0,0 +1,22 @@
from __future__ import annotations
from pathlib import Path
from app.utils.hashing import sha256_file
def test_two_files_with_same_content_share_sha(tmp_path: Path):
    """Two differently named files with identical bytes hash to the same digest."""
    payload = b"%PDF-1.4\n" + b"x" * 4096
    first = tmp_path / "a.pdf"
    second = tmp_path / "b.pdf"
    for target in (first, second):
        target.write_bytes(payload)

    first_digest = sha256_file(first)
    second_digest = sha256_file(second)
    assert first_digest == second_digest
def test_one_byte_difference_changes_sha(tmp_path: Path):
    """Files that differ only in their final byte produce different digests."""
    header = b"%PDF-1.4\n"
    original = tmp_path / "a.pdf"
    altered = tmp_path / "b.pdf"

    original.write_bytes(header + b"x" * 4096)
    # Same length as the original; only the last byte differs.
    altered.write_bytes(header + b"x" * 4095 + b"y")

    assert sha256_file(original) != sha256_file(altered)

26
tests/test_hashing.py Normal file
View File

@@ -0,0 +1,26 @@
from __future__ import annotations
import hashlib
from pathlib import Path
from app.utils.hashing import sha256_bytes, sha256_file
def test_sha256_bytes_matches_hashlib():
    """``sha256_bytes`` agrees with ``hashlib.sha256(...).hexdigest()``."""
    sample = b"legacyhub" * 1000
    reference = hashlib.sha256(sample).hexdigest()
    assert sha256_bytes(sample) == reference
def test_sha256_file_streaming_matches_hashlib(tmp_path: Path):
    """``sha256_file`` on a large file agrees with hashing the bytes in memory."""
    # 15,000,000 bytes (about 14.3 MiB) - intended to exercise chunked reading.
    payload = b"\x01\x02\x03" * 5_000_000
    blob_path = tmp_path / "blob.bin"
    blob_path.write_bytes(payload)

    reference = hashlib.sha256(payload).hexdigest()
    assert sha256_file(blob_path) == reference
def test_sha256_file_distinguishes_content(tmp_path: Path):
    """Files with different contents produce different digests."""
    alpha_path = tmp_path / "a.bin"
    beta_path = tmp_path / "b.bin"
    alpha_path.write_bytes(b"alpha")
    beta_path.write_bytes(b"beta")

    alpha_digest = sha256_file(alpha_path)
    beta_digest = sha256_file(beta_path)
    assert alpha_digest != beta_digest

View File

@@ -0,0 +1,52 @@
from __future__ import annotations
from app.api.schemas import SearchMode
from app.indexing.hybrid_search import _Candidate, _merge
def _make(chunk_id: str, *, bm25_rank: int | None = None, dense_rank: int | None = None) -> _Candidate:
    """Build a ``_Candidate`` fixture whose scores are derived from its ranks.

    A rank of ``None`` leaves the matching score ``None``. Otherwise the BM25
    score is ``1.0 / bm25_rank`` and the dense score is
    ``1.0 - 0.1 * dense_rank``. All other fields are fixed placeholders.
    """
    if bm25_rank is None:
        bm25_score = None
    else:
        bm25_score = 1.0 / bm25_rank

    if dense_rank is None:
        dense_score = None
    else:
        dense_score = 1.0 - 0.1 * dense_rank

    return _Candidate(
        chunk_id=chunk_id,
        document_id="00000000-0000-0000-0000-000000000000",
        page_number=1,
        block_type="paragraph",
        block_id=None,
        text=f"text-{chunk_id}",
        source_path="/tmp/doc.pdf",
        original_file_name="doc.pdf",
        quality_flags={},
        metadata={},
        bm25_score=bm25_score,
        bm25_rank=bm25_rank,
        dense_score=dense_score,
        dense_rank=dense_rank,
    )
def test_merge_lexical_passes_through():
    """In ``lexical`` mode the lexical candidates come back in their given order."""
    lexical_hits = [
        _make("a", bm25_rank=1),
        _make("b", bm25_rank=2),
    ]
    merged = _merge(lexical_hits, [], "lexical")
    merged_ids = [candidate.chunk_id for candidate in merged]
    assert merged_ids == ["a", "b"]
def test_merge_hybrid_uses_rrf_to_rank_intersected_results_higher():
    """A chunk present in both result lists ranks first after a hybrid merge."""
    lexical_hits = [_make("a", bm25_rank=2), _make("b", bm25_rank=1)]
    semantic_hits = [_make("a", dense_rank=1), _make("c", dense_rank=2)]

    merged_ids = [
        candidate.chunk_id
        for candidate in _merge(lexical_hits, semantic_hits, "hybrid")
    ]

    # ``a`` is the only chunk found by both retrievers, so it should
    # outrank ``b`` and ``c`` after RRF.
    assert merged_ids[0] == "a"
    assert set(merged_ids) == {"a", "b", "c"}
def test_merge_hybrid_handles_disjoint_sets():
    """With no overlap between the lists, a hybrid merge keeps both candidates."""
    lexical_hits = [_make("x", bm25_rank=1)]
    semantic_hits = [_make("y", dense_rank=1)]

    merged = _merge(lexical_hits, semantic_hits, "hybrid")
    merged_ids = {candidate.chunk_id for candidate in merged}
    assert merged_ids == {"x", "y"}
def test_search_mode_typed():
    """Smoke test: the three search-mode strings are usable as ``SearchMode``."""
    expected = ["lexical", "semantic", "hybrid"]
    # The annotation matters to the type checker only; nothing is validated at runtime.
    valid: list[SearchMode] = ["lexical", "semantic", "hybrid"]
    assert valid == expected

45
tests/test_quality.py Normal file
View File

@@ -0,0 +1,45 @@
from __future__ import annotations
from app.ingestion.quality import compute_quality_flags
from app.utils.text_cleaning import clean_ocr_text, looks_garbled, normalize_for_search
def test_quality_low_confidence_flags_review():
    """An OCR confidence of 0.4 sets both the low-confidence and review flags."""
    flags = compute_quality_flags(
        text="hello world",
        block_type="paragraph",
        ocr_confidence=0.4,
    )
    for key in ("low_ocr_confidence", "needs_manual_review"):
        assert flags[key] is True
def test_quality_short_text():
    """A three-character, high-confidence paragraph is flagged short but not for review."""
    flags = compute_quality_flags(
        text="abc",
        block_type="paragraph",
        ocr_confidence=0.95,
    )
    is_short = flags["very_short_text"]
    assert is_short is True
    needs_review = flags["needs_manual_review"]
    assert needs_review is False
def test_quality_handwriting_forces_review():
    """Handwriting is flagged for manual review even at an OCR confidence of 0.9."""
    call_kwargs = {
        "text": "неразборчивый текст",
        "block_type": "handwriting",
        "ocr_confidence": 0.9,
        "has_handwriting": True,
    }
    flags = compute_quality_flags(**call_kwargs)

    assert flags["handwriting_detected"] is True
    assert flags["needs_manual_review"] is True
def test_clean_ocr_text_removes_hyphenation():
    """A word hyphenated across a line break is rejoined by ``clean_ocr_text``."""
    cleaned = clean_ocr_text("инвен-\nтарный номер 123")
    assert cleaned == "инвентарный номер 123"
def test_normalize_preserves_codes():
    """Standard and document codes remain intact after search normalization."""
    normalized = normalize_for_search("ГОСТ 21.501-93 № 12/345")
    # Codes built from digits, dots, dashes and slashes must survive unchanged.
    for code in ("21.501-93", "12/345"):
        assert code in normalized
def test_looks_garbled_detects_noise():
    """Symbol noise is reported as garbled; an ordinary sentence is not."""
    noise = "@@##$$%%^^&&**(()_)(*&^%$#@!" * 5
    ordinary = "Hello world, this is a perfectly ordinary line of text."

    assert looks_garbled(noise)
    assert not looks_garbled(ordinary)