"""Tests for the ``_merge`` helper of ``app.indexing.hybrid_search``."""

from __future__ import annotations

from app.api.schemas import SearchMode
from app.indexing.hybrid_search import _Candidate, _merge


def _make(chunk_id: str, *, bm25_rank: int | None = None, dense_rank: int | None = None) -> _Candidate:
    """Build a ``_Candidate`` fixture whose scores are derived from its ranks.

    Every field other than the id, text, ranks and scores is a fixed
    placeholder value.

    Args:
        chunk_id: Identifier of the chunk; also embedded in ``text`` as
            ``text-<chunk_id>``.
        bm25_rank: Lexical rank, or ``None`` when the chunk has no lexical
            hit. The BM25 score is set to ``1.0 / bm25_rank``.
        dense_rank: Semantic rank, or ``None`` when the chunk has no
            semantic hit. The dense score is set to ``1.0 - 0.1 * dense_rank``.

    Returns:
        The populated ``_Candidate``; a score is ``None`` whenever its
        matching rank is ``None``.
    """
    # Scores stay ``None`` unless the corresponding rank was supplied.
    lexical_score = None
    if bm25_rank is not None:
        lexical_score = 1.0 / bm25_rank

    semantic_score = None
    if dense_rank is not None:
        semantic_score = 1.0 - 0.1 * dense_rank

    return _Candidate(
        chunk_id=chunk_id,
        document_id="00000000-0000-0000-0000-000000000000",
        page_number=1,
        block_type="paragraph",
        block_id=None,
        text=f"text-{chunk_id}",
        source_path="/tmp/doc.pdf",
        original_file_name="doc.pdf",
        quality_flags={},
        metadata={},
        bm25_score=lexical_score,
        bm25_rank=bm25_rank,
        dense_score=semantic_score,
        dense_rank=dense_rank,
    )


def test_merge_lexical_passes_through():
    """In ``"lexical"`` mode the lexical candidates come back in input order."""
    lexical_hits = [_make("a", bm25_rank=1), _make("b", bm25_rank=2)]

    result = _merge(lexical_hits, [], "lexical")

    returned_ids = [hit.chunk_id for hit in result]
    assert returned_ids == ["a", "b"]


def test_merge_hybrid_uses_rrf_to_rank_intersected_results_higher():
    """A chunk present in both result lists is expected to rank first."""
    lexical_hits = [_make("a", bm25_rank=2), _make("b", bm25_rank=1)]
    semantic_hits = [_make("a", dense_rank=1), _make("c", dense_rank=2)]

    fused = _merge(lexical_hits, semantic_hits, "hybrid")
    fused_ids = [hit.chunk_id for hit in fused]

    # ``a`` is the only chunk found by both retrievers, so after RRF it
    # should outrank ``b`` and ``c``, which each appear in a single list.
    assert fused_ids[0] == "a"
    # Nothing may be dropped: the union of both inputs must survive.
    assert set(fused_ids) == {"a", "b", "c"}


def test_merge_hybrid_handles_disjoint_sets():
    """Hybrid merging keeps candidates from both lists when none overlap."""
    lexical_hits = [_make("x", bm25_rank=1)]
    semantic_hits = [_make("y", dense_rank=1)]

    fused = _merge(lexical_hits, semantic_hits, "hybrid")

    fused_ids = {hit.chunk_id for hit in fused}
    assert fused_ids == {"x", "y"}


def test_search_mode_typed():
    """Smoke check listing the three search modes under ``SearchMode``."""
    # Smoke - the literal type accepts the three valid values.
    modes: list[SearchMode] = ["lexical", "semantic", "hybrid"]
    assert modes == ["lexical", "semantic", "hybrid"]