perf(reranker): add benchmark harness and passage clipping

- scripts/benchmark_reranker.py exercises the configured reranker with synthetic queries or live OpenSearch samples and prints p50/p95/p99 latency, mean latency, and pairs/sec throughput. Supports --warmup, --candidates, --passage-length, --source, and a --json-only mode for CI.
- app/indexing/reranker.py clips passages to 2048 characters before scoring so a runaway chunk cannot push the cross-encoder input far beyond bge-reranker-v2-m3's training window.
- RUNBOOK.md gains a Reranker benchmark section with CPU/GPU SLO targets and a remediation ladder (lower top-K, raise batch size, switch device, disable reranker) to follow when measured p95 exceeds budget.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 17:08:04 +03:00
parent f42fb978a8
commit 349f4ea838
3 changed files with 236 additions and 1 deletions
--- a/app/indexing/reranker.py
+++ b/app/indexing/reranker.py
@@ -50,10 +50,16 @@ class Reranker:
    def available(self) -> bool:
        return self._impl is not None and self._model is not None

+    # bge-reranker-v2-m3 is trained at 512 tokens. Clipping by characters is a
+    # cheap proxy (2048 chars ~= 512 tokens at ~4 chars/token, not an exact cap)
+    # that bounds the input even when callers forget to limit candidate length.
+    _MAX_PASSAGE_CHARS = 2048
+
    def score(self, query: str, passages: Sequence[str]) -> list[float]:
        if not self.available or not passages:
            return [0.0] * len(passages)
-        pairs = [(query, p) for p in passages]
+        clipped = [p[: self._MAX_PASSAGE_CHARS] for p in passages]
+        pairs = [(query, p) for p in clipped]
        if self._impl == "flagembedding":
            scores = self._model.compute_score(pairs, batch_size=self.batch_size, normalize=True)  # type: ignore[union-attr]
        else: