perf: add ingest and search load-test harnesses

scripts/generate_synthetic_pdfs.py builds real PDF/1.4 documents with a hand-written xref so we can generate tens of thousands of ~2 KB PDFs locally. Helvetica only covers latin-1, which is fine for a load generator (throughput, not retrieval relevance); the docstring calls this out so no one mistakes the output for a quality corpus. scripts/load_ingest.py drives POST /ingest/folder, then polls a hypothetical /documents/stats endpoint every poll-interval seconds to track terminal-state progression. Writes a JSON history report so results can be diffed between runs. scripts/locustfile_search.py defines a SearchUser profile mixing hybrid / lexical / semantic queries against POST /search plus a health-check sampler. Asserts non-empty results so a "200 with zero hits" regression surfaces as a failure rather than a green percentile graph. RUNBOOK gains a Load testing section with CPU/GPU SLO tables for both axes (sustained docs/min, search latency p50/p95/p99). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 17:11:08 +03:00
parent 349f4ea838
commit a97d0bbcfd
4 changed files with 379 additions and 0 deletions
--- a/scripts/generate_synthetic_pdfs.py
+++ b/scripts/generate_synthetic_pdfs.py
@@ -0,0 +1,150 @@
+"""Generate N synthetic single-page PDFs for load testing the ingest pipeline.
+
+Each PDF carries 4-8 paragraphs of seeded English + Cyrillic text. The
+generator embeds text via the standard Helvetica font, which only covers
+latin-1 - Cyrillic glyphs render as placeholders. That is acceptable for a
+*load* generator: the focus is throughput at scale, not retrieval relevance.
+For semantic regression tests, use a real corpus sample instead.
+
+Output directory layout::
+
+  <out>/
+    legacy_000001.pdf
+    legacy_000002.pdf
+    ...
+
+Usage:
+
+  python scripts/generate_synthetic_pdfs.py --count 1000 --out /data/input/load
+  python scripts/generate_synthetic_pdfs.py --count 100 --out ./tmp --seed 42
+"""
+
+from __future__ import annotations
+
+import argparse
+import random
+import sys
+from pathlib import Path
+
+try:
+    from pypdf import PdfWriter
+except Exception:  # noqa: BLE001
+    PdfWriter = None  # type: ignore[assignment]
+
+try:
+    import pikepdf
+except Exception:  # noqa: BLE001
+    pikepdf = None  # type: ignore[assignment]
+
+
+# Page dimensions in PDF points (1/72 inch), used for the page MediaBox and
+# for positioning the first text line near the top of the page.
+PAGE_W = 595  # A4 @ 72 dpi (close enough)
+PAGE_H = 842
+
+# Russian sample sentences. The content stream is encoded as latin-1 with
+# errors="replace", so the Cyrillic characters are substituted on output;
+# these sentences mainly contribute volume rather than readable text.
+SAMPLE_SENTENCES_RU = [
+    "ГОСТ 21.501-93 определяет правила выполнения архитектурно-строительных чертежей.",
+    "Класс бетона B25 применяется для несущих конструкций нижних этажей.",
+    "Все размеры приведены в миллиметрах, если иное не указано.",
+    "Контроль качества сварных соединений выполняется в соответствии с регламентом.",
+    "Технологический регламент технического обслуживания пересматривается ежегодно.",
+    "При производстве работ при пониженных температурах требуется дополнительное обогрев.",
+]
+# English sample sentences; make_text_pdf pools them with the Russian list
+# when sampling the sentences of each paragraph.
+SAMPLE_SENTENCES_EN = [
+    "The drawing follows the conventions established in the project specification.",
+    "All measurements are reported in SI units and validated against the cited standard.",
+    "Service intervals are detailed in the maintenance schedule appended at the back.",
+    "Quality control checkpoints precede each acceptance handoff.",
+]
+
+
+def make_text_pdf(path: Path, doc_id: int, rng: random.Random) -> None:
+    """Write one synthetic text PDF for document ``doc_id`` to ``path``.
+
+    The body is a title line followed by 4-8 paragraphs, each made of 2-4
+    distinct sentences drawn from the combined Russian/English sample pool.
+    All randomness comes from ``rng``, so a fixed seed reproduces the same
+    corpus. The file itself is emitted by ``_write_minimal_pdf`` from raw PDF
+    primitives; no third-party PDF library takes part, so none is required.
+
+    Args:
+        path: Destination file; overwritten if it already exists.
+        doc_id: Number embedded in the title line of the document.
+        rng: Random source used for paragraph and sentence selection.
+    """
+    # Build the pool once instead of concatenating the lists per paragraph.
+    pool = SAMPLE_SENTENCES_RU + SAMPLE_SENTENCES_EN
+
+    paragraphs = []
+    for _ in range(rng.randint(4, 8)):
+        # Draw the size first, then the sample, so the RNG call order (and
+        # therefore the seeded output) stays stable.
+        n_sentences = rng.randint(2, 4)
+        paragraphs.append(" ".join(rng.sample(pool, k=n_sentences)))
+
+    body = f"Legacy archive document #{doc_id}\n\n" + "\n\n".join(paragraphs)
+    _write_minimal_pdf(path, body)
+
+
+def _write_minimal_pdf(path: Path, body: str) -> None:
+    """Serialize ``body`` as a one-page Helvetica PDF and write it to ``path``.
+
+    The document is assembled by hand (five objects, a classic xref table and
+    a trailer) so each file stays tiny and tens of thousands of them can be
+    produced on a laptop. Each input line becomes its own text object; at
+    most the first 50 lines are emitted.
+    """
+    # Backslash must be escaped first so the escapes added for the
+    # parentheses are not escaped a second time.
+    escaped = body
+    for raw, quoted in (("\\", "\\\\"), ("(", "\\("), (")", "\\)")):
+        escaped = escaped.replace(raw, quoted)
+
+    line_height = 14
+    top = PAGE_H - 72
+    shown = escaped.split("\n")[:50]  # cap visible lines
+    text_ops = [
+        f"BT /F1 11 Tf 72 {top - row * line_height} Td ({text}) Tj ET"
+        for row, text in enumerate(shown)
+    ]
+    # Characters outside latin-1 are substituted rather than rejected.
+    stream = ("\n".join(text_ops) + "\n").encode("latin-1", errors="replace")
+
+    # Object numbers are fixed by position: 1 catalog, 2 page tree, 3 page,
+    # 4 content stream, 5 font. The cross references below rely on that.
+    catalog = b"<< /Type /Catalog /Pages 2 0 R >>"
+    page_tree = b"<< /Type /Pages /Count 1 /Kids [3 0 R] >>"
+    page = (
+        "<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 5 0 R >> >>"
+        f" /MediaBox [0 0 {PAGE_W} {PAGE_H}] /Contents 4 0 R >>"
+    ).encode("latin-1")
+    contents = b"".join((
+        b"<< /Length ",
+        str(len(stream)).encode("ascii"),
+        b" >>\nstream\n",
+        stream,
+        b"endstream",
+    ))
+    font = b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>"
+    objects = (catalog, page_tree, page, contents, font)
+
+    pdf = bytearray(b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n")
+    positions = []
+    for number, payload in enumerate(objects, start=1):
+        # Record the byte offset of each object for the xref table.
+        positions.append(len(pdf))
+        pdf += f"{number} 0 obj\n".encode("ascii") + payload + b"\nendobj\n"
+
+    entry_count = len(objects) + 1  # +1 for the mandatory free entry 0
+    xref_at = len(pdf)
+    tail = [
+        "xref\n",
+        f"0 {entry_count}\n",
+        "0000000000 65535 f \n",
+        *(f"{pos:010d} 00000 n \n" for pos in positions),
+        "trailer\n",
+        f"<< /Size {entry_count} /Root 1 0 R >>\n",
+        "startxref\n",
+        f"{xref_at}\n",
+        "%%EOF\n",
+    ]
+    pdf += "".join(tail).encode("ascii")
+    path.write_bytes(bytes(pdf))
+
+
+def main() -> int:
+    """Parse CLI arguments and generate the requested number of PDFs.
+
+    Files are written as ``legacy_<NNNNNN>.pdf`` directly inside ``--out``,
+    which is created if missing. A single ``random.Random`` seeded with
+    ``--seed`` drives every document, so equal arguments give equal output.
+
+    Returns:
+        Process exit status, 0 on success. Invalid arguments terminate the
+        process through ``parser.error`` (exit status 2).
+    """
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        # The module docstring holds a literal layout block and usage
+        # examples; the default formatter would re-wrap them into one blob.
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument("--count", type=int, required=True,
+                        help="number of PDFs to generate (>= 0)")
+    parser.add_argument("--out", type=Path, required=True,
+                        help="output directory, created if missing")
+    parser.add_argument("--seed", type=int, default=20260513,
+                        help="seed for the random text selection")
+    args = parser.parse_args()
+
+    # range() silently yields nothing for a negative count, which would end
+    # in a misleading "done: -N files" message.
+    if args.count < 0:
+        parser.error("--count must be >= 0")
+
+    args.out.mkdir(parents=True, exist_ok=True)
+    rng = random.Random(args.seed)
+
+    for i in range(1, args.count + 1):
+        target = args.out / f"legacy_{i:06d}.pdf"
+        make_text_pdf(target, i, rng)
+        if i % 500 == 0:
+            print(f"  generated {i}/{args.count}")
+    print(f"done: {args.count} files in {args.out}")
+    return 0
+
+
+if __name__ == "__main__":
+    # Hand main()'s return value to the interpreter as the exit status.
+    raise SystemExit(main())