perf(reranker): add benchmark harness and passage clipping

- scripts/benchmark_reranker.py exercises the configured reranker with synthetic queries or live OpenSearch samples and prints p50/p95/p99 latency, mean latency, and pairs/sec throughput. Supports --warmup, --candidates, --passage-length, --source, and a --json-only mode for CI. - app/indexing/reranker.py clips passages to 2048 characters before scoring so a runaway chunk cannot starve the cross-encoder beyond bge-reranker-v2-m3's training window. - RUNBOOK.md gains a Reranker benchmark section with CPU/GPU SLO targets and a remediation ladder (lower top-K, raise batch size, switch device, disable reranker) when measured p95 exceeds budget. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 17:08:04 +03:00
parent f42fb978a8
commit 349f4ea838
3 changed files with 236 additions and 1 deletions
--- a/RUNBOOK.md
+++ b/RUNBOOK.md
@@ -117,6 +117,40 @@ docker compose exec postgres psql -U legacyhub -d legacyhub -c \
   should not be rolled back casually. Restore from backup via the standard
   TeamHUB Suite backup runbook.
 ## Reranker benchmark
 The reranker is the latency-defining stage of the hybrid search path. Run the
 benchmark on every hardware change (CPU vs GPU, instance type, batch size)
 before promoting the configuration.
 ```bash
 # synthetic warmup + 32 queries x 40 candidates, ~700-char passages
 docker compose exec api python scripts/benchmark_reranker.py \
  --queries 32 --candidates 40 --warmup 4
 # real corpus sample (after some documents are indexed)
 docker compose exec api python scripts/benchmark_reranker.py \
  --source opensearch --query "ГОСТ 21.501-93" --candidates 40
 ```
 Target SLOs (subject to revision once staging numbers land):
 | Metric              | CPU target | GPU target |
 |---------------------|-----------:|-----------:|
 | p95 latency / query |  < 700 ms  |  < 120 ms  |
 | Throughput          | > 60 pair/s | > 600 pair/s |
 If the measured p95 exceeds the budget, options in order of preference:
 1. Lower `RERANK_CANDIDATES` (default 40 — reducing to 20 roughly halves work).
 2. Increase `RERANKER_BATCH_SIZE` (memory permitting).
 3. Switch `RERANKER_DEVICE=cuda` and use a GPU-capable image.
 4. Disable reranker (`RERANKER_ENABLED=false`) and accept raw RRF order — the
   API still returns useful results; the `reranked` field reports the truth.
 Passages are clipped to 2048 chars before being fed to the cross-encoder so a
 runaway chunk cannot starve the budget.
 ## Scaling notes (~70k PDFs)
 - Workers horizontally scale: `docker compose up -d --scale worker=8`.
--- a/app/indexing/reranker.py
+++ b/app/indexing/reranker.py
@@ -50,10 +50,16 @@ class Reranker:
    def available(self) -> bool:
        return self._impl is not None and self._model is not None
    # bge-reranker-v2-m3 is trained at 512 tokens; we truncate by chars so the
    # reranker stays inside its budget even when callers forget to limit the
    # candidate text length.
    _MAX_PASSAGE_CHARS = 2048
    def score(self, query: str, passages: Sequence[str]) -> list[float]:
        if not self.available or not passages:
            return [0.0] * len(passages)
-        pairs = [(query, p) for p in passages]
+        clipped = [p[: self._MAX_PASSAGE_CHARS] for p in passages]
        pairs = [(query, p) for p in clipped]
        if self._impl == "flagembedding":
            scores = self._model.compute_score(pairs, batch_size=self.batch_size, normalize=True)  # type: ignore[union-attr]
        else:
--- a/scripts/benchmark_reranker.py
+++ b/scripts/benchmark_reranker.py
@@ -0,0 +1,195 @@
 """Reranker latency / throughput benchmark.
 Measures BGE-reranker-v2-m3 (or whatever ``RERANKER_MODEL`` resolves to)
 against synthetic or live corpus passages and prints the standard set of
 percentiles plus throughput. Use this on staging hardware to verify whether
 the configured device meets the latency budget before committing to a target
 top-K.
 Usage:
  # 1) synthetic warm-up (no DB / OpenSearch needed)
  python scripts/benchmark_reranker.py --queries 32 --candidates 40 \
                                       --passage-length 700 --warmup 4
  # 2) live corpus pull (samples real chunks from OpenSearch)
  python scripts/benchmark_reranker.py --source opensearch \
                                       --query "ГОСТ 21.501-93" \
                                       --candidates 40
 Outputs JSON to stdout and a markdown summary table.
 """
 from __future__ import annotations
 import argparse
 import json
 import statistics
 import sys
 import time
 from dataclasses import asdict, dataclass
 from app.config import settings
 from app.indexing.reranker import get_reranker
 from app.logging_config import configure_logging, get_logger
 configure_logging()
 logger = get_logger(__name__)
@dataclass
 class BenchResult:
    model: str
    device: str
    queries: int
    candidates_per_query: int
    passage_chars: int
    warmup: int
    p50_ms: float
    p95_ms: float
    p99_ms: float
    mean_ms: float
    pairs_per_sec: float
    wall_seconds: float
 def percentile(values: list[float], q: float) -> float:
    if not values:
        return 0.0
    s = sorted(values)
    idx = max(0, min(len(s) - 1, int(round((q / 100.0) * (len(s) - 1)))))
    return s[idx]
 def synthetic_passages(n: int, chars: int) -> list[str]:
    seed = "ГОСТ 21.501-93 определяет правила выполнения архитектурно-строительных рабочих чертежей. "
    base = (seed * ((chars // len(seed)) + 2))[:chars]
    return [f"[{i}] {base}" for i in range(n)]
 def synthetic_queries(n: int) -> list[str]:
    samples = [
        "ГОСТ 21.501-93 рабочие чертежи",
        "класс бетона B25",
        "журнал ремонтов узлов",
        "правила производства земляных работ",
        "схема электропитания корпус 3",
        "контроль качества сварных соединений",
        "регламент технического обслуживания",
    ]
    return [samples[i % len(samples)] for i in range(n)]
 def passages_from_opensearch(query: str, top_k: int) -> list[str]:
    from app.indexing.opensearch_client import get_opensearch
    res = get_opensearch().search(
        index=settings.opensearch_index_chunks,
        body={
            "size": top_k,
            "query": {"multi_match": {"query": query, "fields": ["text", "text.ru", "text.en"]}},
            "_source": ["text"],
        },
        request_timeout=30,
    )
    return [h["_source"]["text"] for h in res["hits"]["hits"] if h["_source"].get("text")]
 def run(
    queries: list[str],
    candidates_per_query: int,
    passage_chars: int,
    warmup: int,
    source: str,
 ) -> BenchResult:
    reranker = get_reranker()
    if not reranker.available:
        print("ERROR: reranker model failed to load", file=sys.stderr)
        sys.exit(2)
    # Warmup so JIT / weight loading does not skew p50.
    if warmup > 0:
        warm = synthetic_passages(candidates_per_query, passage_chars)
        for q in queries[:warmup] or [queries[0]]:
            reranker.score(q, warm)
    latencies_ms: list[float] = []
    pair_count = 0
    t0 = time.perf_counter()
    for q in queries:
        if source == "opensearch":
            passages = passages_from_opensearch(q, candidates_per_query)
            if len(passages) < candidates_per_query:
                passages += synthetic_passages(candidates_per_query - len(passages), passage_chars)
        else:
            passages = synthetic_passages(candidates_per_query, passage_chars)
        start = time.perf_counter()
        reranker.score(q, passages)
        latencies_ms.append((time.perf_counter() - start) * 1000.0)
        pair_count += len(passages)
    wall = time.perf_counter() - t0
    return BenchResult(
        model=reranker.model_name,
        device=reranker.device,
        queries=len(queries),
        candidates_per_query=candidates_per_query,
        passage_chars=passage_chars,
        warmup=warmup,
        p50_ms=percentile(latencies_ms, 50),
        p95_ms=percentile(latencies_ms, 95),
        p99_ms=percentile(latencies_ms, 99),
        mean_ms=statistics.fmean(latencies_ms),
        pairs_per_sec=pair_count / wall if wall > 0 else 0.0,
        wall_seconds=wall,
    )
 def main() -> int:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--queries", type=int, default=32)
    parser.add_argument("--candidates", type=int, default=settings.rerank_candidates)
    parser.add_argument("--passage-length", type=int, default=700,
                        help="Synthetic passage character length")
    parser.add_argument("--warmup", type=int, default=2)
    parser.add_argument("--source", choices=["synthetic", "opensearch"], default="synthetic")
    parser.add_argument("--query", type=str, default=None,
                        help="Single query to use against OpenSearch (with --source opensearch)")
    parser.add_argument("--json-only", action="store_true")
    args = parser.parse_args()
    if args.source == "opensearch" and args.query:
        queries = [args.query] * args.queries
    else:
        queries = synthetic_queries(args.queries)
    result = run(
        queries=queries,
        candidates_per_query=args.candidates,
        passage_chars=args.passage_length,
        warmup=args.warmup,
        source=args.source,
    )
    payload = asdict(result)
    print(json.dumps(payload, indent=2))
    if not args.json_only:
        print()
        print("| Metric              | Value           |")
        print("|---------------------|-----------------|")
        print(f"| Model               | {result.model} |")
        print(f"| Device              | {result.device} |")
        print(f"| Queries             | {result.queries} |")
        print(f"| Candidates / query  | {result.candidates_per_query} |")
        print(f"| Passage chars       | {result.passage_chars} |")
        print(f"| p50 latency         | {result.p50_ms:.1f} ms |")
        print(f"| p95 latency         | {result.p95_ms:.1f} ms |")
        print(f"| p99 latency         | {result.p99_ms:.1f} ms |")
        print(f"| mean latency        | {result.mean_ms:.1f} ms |")
        print(f"| Throughput          | {result.pairs_per_sec:.1f} pairs/s |")
        print(f"| Wall time           | {result.wall_seconds:.2f} s |")
    return 0
 if __name__ == "__main__":
    sys.exit(main())