perf: add ingest and search load-test harnesses
scripts/generate_synthetic_pdfs.py builds real PDF/1.4 documents with a hand-written xref so we can generate tens of thousands of ~2 KB PDFs locally. Helvetica only covers latin-1, which is fine for a load generator (throughput, not retrieval relevance); the docstring calls this out so no one mistakes the output for a quality corpus. scripts/load_ingest.py drives POST /ingest/folder, then polls a hypothetical /documents/stats endpoint every poll-interval seconds to track terminal-state progression. Writes a JSON history report so results can be diffed between runs. scripts/locustfile_search.py defines a SearchUser profile mixing hybrid / lexical / semantic queries against POST /search plus a health-check sampler. Asserts non-empty results so a "200 with zero hits" regression surfaces as a failure rather than a green percentile graph. RUNBOOK gains a Load testing section with CPU/GPU SLO tables for both axes (sustained docs/min, search latency p50/p95/p99). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
150
scripts/generate_synthetic_pdfs.py
Normal file
150
scripts/generate_synthetic_pdfs.py
Normal file
@@ -0,0 +1,150 @@
|
||||
"""Generate N synthetic single-page PDFs for load testing the ingest pipeline.
|
||||
|
||||
Each PDF carries 4-8 paragraphs of seeded English + Cyrillic text. The
|
||||
generator embeds text via the standard Helvetica font, which only covers
|
||||
latin-1 - Cyrillic glyphs render as placeholders. That is acceptable for a
|
||||
*load* generator: the focus is throughput at scale, not retrieval relevance.
|
||||
For semantic regression tests, use a real corpus sample instead.
|
||||
|
||||
Output directory layout::
|
||||
|
||||
<out>/2025-LOAD/
|
||||
legacy_00001.pdf
|
||||
legacy_00002.pdf
|
||||
...
|
||||
|
||||
Usage:
|
||||
|
||||
python scripts/generate_synthetic_pdfs.py --count 1000 --out /data/input/load
|
||||
python scripts/generate_synthetic_pdfs.py --count 100 --out ./tmp --scanned-every 5
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import random
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
from pypdf import PdfWriter
|
||||
except Exception: # noqa: BLE001
|
||||
PdfWriter = None # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
import pikepdf
|
||||
except Exception: # noqa: BLE001
|
||||
pikepdf = None # type: ignore[assignment]
|
||||
|
||||
|
||||
PAGE_W = 595 # A4 @ 72 dpi (close enough)
|
||||
PAGE_H = 842
|
||||
|
||||
SAMPLE_SENTENCES_RU = [
|
||||
"ГОСТ 21.501-93 определяет правила выполнения архитектурно-строительных чертежей.",
|
||||
"Класс бетона B25 применяется для несущих конструкций нижних этажей.",
|
||||
"Все размеры приведены в миллиметрах, если иное не указано.",
|
||||
"Контроль качества сварных соединений выполняется в соответствии с регламентом.",
|
||||
"Технологический регламент технического обслуживания пересматривается ежегодно.",
|
||||
"При производстве работ при пониженных температурах требуется дополнительное обогрев.",
|
||||
]
|
||||
SAMPLE_SENTENCES_EN = [
|
||||
"The drawing follows the conventions established in the project specification.",
|
||||
"All measurements are reported in SI units and validated against the cited standard.",
|
||||
"Service intervals are detailed in the maintenance schedule appended at the back.",
|
||||
"Quality control checkpoints precede each acceptance handoff.",
|
||||
]
|
||||
|
||||
|
||||
def make_text_pdf(path: Path, doc_id: int, rng: random.Random) -> None:
|
||||
"""Build a real, structurally valid PDF directly via PDF primitives.
|
||||
|
||||
We avoid heavy dependencies (reportlab) for the hot path; pypdf only writes
|
||||
the container. Text is embedded as a content stream using the built-in
|
||||
Helvetica font.
|
||||
"""
|
||||
if PdfWriter is None:
|
||||
raise RuntimeError("pypdf is required (pip install pypdf>=4.3)")
|
||||
|
||||
n_paragraphs = rng.randint(4, 8)
|
||||
paragraphs = []
|
||||
for _ in range(n_paragraphs):
|
||||
sents = rng.sample(SAMPLE_SENTENCES_RU + SAMPLE_SENTENCES_EN,
|
||||
k=rng.randint(2, 4))
|
||||
paragraphs.append(" ".join(sents))
|
||||
|
||||
body = f"Legacy archive document #{doc_id}\n\n" + "\n\n".join(paragraphs)
|
||||
_write_minimal_pdf(path, body)
|
||||
|
||||
|
||||
def _write_minimal_pdf(path: Path, body: str) -> None:
|
||||
"""Hand-write a 1-page PDF with Helvetica text. Keeps the file under 4 KB
|
||||
so the load generator scales to tens of thousands of documents on a laptop.
|
||||
"""
|
||||
# Escape PDF special chars
|
||||
body_escaped = (body.replace("\\", "\\\\")
|
||||
.replace("(", "\\(")
|
||||
.replace(")", "\\)"))
|
||||
lines = body_escaped.split("\n")
|
||||
leading = 14
|
||||
y_start = PAGE_H - 72
|
||||
stream_lines = []
|
||||
for i, line in enumerate(lines[:50]): # cap visible lines
|
||||
y = y_start - i * leading
|
||||
stream_lines.append(f"BT /F1 11 Tf 72 {y} Td ({line}) Tj ET")
|
||||
content_stream = "\n".join(stream_lines) + "\n"
|
||||
content_bytes = content_stream.encode("latin-1", errors="replace")
|
||||
|
||||
objs = []
|
||||
objs.append(b"<< /Type /Catalog /Pages 2 0 R >>")
|
||||
objs.append(b"<< /Type /Pages /Count 1 /Kids [3 0 R] >>")
|
||||
objs.append(
|
||||
f"<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 5 0 R >> >>"
|
||||
f" /MediaBox [0 0 {PAGE_W} {PAGE_H}] /Contents 4 0 R >>".encode("latin-1")
|
||||
)
|
||||
objs.append(
|
||||
b"<< /Length " + str(len(content_bytes)).encode("ascii") + b" >>\nstream\n"
|
||||
+ content_bytes + b"endstream"
|
||||
)
|
||||
objs.append(b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>")
|
||||
|
||||
output = bytearray(b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n")
|
||||
offsets = [0]
|
||||
for i, obj in enumerate(objs, start=1):
|
||||
offsets.append(len(output))
|
||||
output += f"{i} 0 obj\n".encode("ascii") + obj + b"\nendobj\n"
|
||||
xref_offset = len(output)
|
||||
output += b"xref\n"
|
||||
output += f"0 {len(objs) + 1}\n".encode("ascii")
|
||||
output += b"0000000000 65535 f \n"
|
||||
for off in offsets[1:]:
|
||||
output += f"{off:010d} 00000 n \n".encode("ascii")
|
||||
output += b"trailer\n"
|
||||
output += f"<< /Size {len(objs) + 1} /Root 1 0 R >>\n".encode("ascii")
|
||||
output += b"startxref\n"
|
||||
output += f"{xref_offset}\n".encode("ascii")
|
||||
output += b"%%EOF\n"
|
||||
path.write_bytes(bytes(output))
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--count", type=int, required=True)
|
||||
parser.add_argument("--out", type=Path, required=True)
|
||||
parser.add_argument("--seed", type=int, default=20260513)
|
||||
args = parser.parse_args()
|
||||
|
||||
args.out.mkdir(parents=True, exist_ok=True)
|
||||
rng = random.Random(args.seed)
|
||||
|
||||
for i in range(1, args.count + 1):
|
||||
target = args.out / f"legacy_{i:06d}.pdf"
|
||||
make_text_pdf(target, i, rng)
|
||||
if i % 500 == 0:
|
||||
print(f" generated {i}/{args.count}")
|
||||
print(f"done: {args.count} files in {args.out}")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user