Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
53 lines
1.8 KiB
Python
53 lines
1.8 KiB
Python
from __future__ import annotations
|
|
|
|
from app.api.schemas import SearchMode
|
|
from app.indexing.hybrid_search import _Candidate, _merge
|
|
|
|
|
|
def _make(chunk_id: str, *, bm25_rank: int | None = None, dense_rank: int | None = None) -> _Candidate:
    """Build a ``_Candidate`` test fixture identified by ``chunk_id``.

    Every field other than the id, ranks and scores is filled with a fixed
    placeholder value. Each score is derived from its rank and stays ``None``
    when the corresponding rank is not supplied.

    Args:
        chunk_id: Identifier of the candidate; also embedded in its ``text``.
        bm25_rank: Lexical rank, or ``None`` for no lexical hit.
        dense_rank: Dense rank, or ``None`` for no dense hit.

    Returns:
        The populated ``_Candidate``.
    """
    # Reciprocal of the rank: rank 1 -> 1.0, rank 2 -> 0.5, ...
    lexical_score = 1.0 / bm25_rank if bm25_rank is not None else None
    # Linearly decreasing with the rank: rank 1 -> 0.9, rank 2 -> 0.8, ...
    semantic_score = 1.0 - 0.1 * dense_rank if dense_rank is not None else None

    return _Candidate(
        chunk_id=chunk_id,
        document_id="00000000-0000-0000-0000-000000000000",
        page_number=1,
        block_type="paragraph",
        block_id=None,
        text=f"text-{chunk_id}",
        source_path="/tmp/doc.pdf",
        original_file_name="doc.pdf",
        quality_flags={},
        metadata={},
        bm25_score=lexical_score,
        bm25_rank=bm25_rank,
        dense_score=semantic_score,
        dense_rank=dense_rank,
    )
|
|
|
|
|
|
def test_merge_lexical_passes_through():
    """In ``"lexical"`` mode with no semantic candidates, ids come back as ``a, b``."""
    lexical_hits = [
        _make(chunk_id, bm25_rank=rank)
        for rank, chunk_id in enumerate(("a", "b"), start=1)
    ]

    merged = _merge(lexical_hits, [], "lexical")

    merged_ids = [hit.chunk_id for hit in merged]
    assert merged_ids == ["a", "b"]
|
|
|
|
|
|
def test_merge_hybrid_uses_rrf_to_rank_intersected_results_higher():
    """A chunk present in both result lists must come first in ``"hybrid"`` mode."""
    lexical_hits = [_make("a", bm25_rank=2), _make("b", bm25_rank=1)]
    semantic_hits = [_make("a", dense_rank=1), _make("c", dense_rank=2)]

    ranked_ids = [
        hit.chunk_id for hit in _merge(lexical_hits, semantic_hits, "hybrid")
    ]

    # ``a`` is the only chunk found by both retrievers, so after RRF it
    # should outrank ``b`` and ``c``, which each appear in one list only.
    assert ranked_ids[0] == "a"
    # No candidate may be dropped by the merge.
    assert set(ranked_ids) == {"a", "b", "c"}
|
|
|
|
|
|
def test_merge_hybrid_handles_disjoint_sets():
    """With no overlap between the lists, ``"hybrid"`` mode returns both chunks."""
    lexical_only = [_make("x", bm25_rank=1)]
    semantic_only = [_make("y", dense_rank=1)]

    merged_ids = {
        hit.chunk_id for hit in _merge(lexical_only, semantic_only, "hybrid")
    }

    assert merged_ids == {"x", "y"}
|
|
|
|
|
|
def test_search_mode_typed():
    """Smoke test: the three search-mode strings fit a ``list[SearchMode]``.

    The annotated assignment is the part a static type checker verifies; the
    runtime assertion only confirms the list holds the same three values.
    """
    # Keep the literals inline so the annotation is checked against them.
    modes: list[SearchMode] = ["lexical", "semantic", "hybrid"]
    assert modes == ["lexical", "semantic", "hybrid"]
|