scripts/generate_synthetic_pdfs.py builds real PDF/1.4 documents with a hand-written xref so we can generate tens of thousands of ~2 KB PDFs locally. Helvetica only covers latin-1, which is fine for a load generator (throughput, not retrieval relevance); the docstring calls this out so no one mistakes the output for a quality corpus. scripts/load_ingest.py drives POST /ingest/folder, then polls a hypothetical /documents/stats endpoint every poll-interval seconds to track terminal-state progression. Writes a JSON history report so results can be diffed between runs. scripts/locustfile_search.py defines a SearchUser profile mixing hybrid / lexical / semantic queries against POST /search plus a health-check sampler. Asserts non-empty results so a "200 with zero hits" regression surfaces as a failure rather than a green percentile graph. RUNBOOK gains a Load testing section with CPU/GPU SLO tables for both axes (sustained docs/min, search latency p50/p95/p99). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
151 lines
5.7 KiB
Python
151 lines
5.7 KiB
Python
"""Generate N synthetic single-page PDFs for load testing the ingest pipeline.

Each PDF carries 4-8 paragraphs of seeded English + Cyrillic sample text. The
generator embeds text via the standard Helvetica font and encodes the page
content as latin-1, so Cyrillic characters are written as ``?`` placeholders.
That is acceptable for a *load* generator: the focus is throughput at scale,
not retrieval relevance. For semantic regression tests, use a real corpus
sample instead.

Output directory layout (files are written directly into ``--out``)::

    <out>/
        legacy_000001.pdf
        legacy_000002.pdf
        ...

Usage:

    python scripts/generate_synthetic_pdfs.py --count 1000 --out /data/input/load
    python scripts/generate_synthetic_pdfs.py --count 100 --out ./tmp --seed 42

Only ``--count``, ``--out`` and ``--seed`` are accepted; there is no option
for producing scanned (image-only) pages.
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import random
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
try:
|
||
from pypdf import PdfWriter
|
||
except Exception: # noqa: BLE001
|
||
PdfWriter = None # type: ignore[assignment]
|
||
|
||
try:
|
||
import pikepdf
|
||
except Exception: # noqa: BLE001
|
||
pikepdf = None # type: ignore[assignment]
|
||
|
||
|
||
# Page size in PDF points (1/72 inch): A4, rounded to whole points.
PAGE_W, PAGE_H = 595, 842

# Russian sample sentences. Helvetica/latin-1 cannot represent them, so they
# exercise the "non-encodable text" path of the writer.
SAMPLE_SENTENCES_RU: list = [
    "ГОСТ 21.501-93 определяет правила выполнения архитектурно-строительных чертежей.",
    "Класс бетона B25 применяется для несущих конструкций нижних этажей.",
    "Все размеры приведены в миллиметрах, если иное не указано.",
    "Контроль качества сварных соединений выполняется в соответствии с регламентом.",
    "Технологический регламент технического обслуживания пересматривается ежегодно.",
    "При производстве работ при пониженных температурах требуется дополнительное обогрев.",
]

# English sample sentences; these survive the latin-1 encoding unchanged.
SAMPLE_SENTENCES_EN: list = [
    "The drawing follows the conventions established in the project specification.",
    "All measurements are reported in SI units and validated against the cited standard.",
    "Service intervals are detailed in the maintenance schedule appended at the back.",
    "Quality control checkpoints precede each acceptance handoff.",
]
|
||
|
||
|
||
def make_text_pdf(path: Path, doc_id: int, rng: random.Random) -> None:
    """Write one synthetic single-page text PDF to *path*.

    The body is a title line (``Legacy archive document #<doc_id>``) followed
    by 4-8 paragraphs, each made of 2-4 distinct sentences sampled from the
    combined Russian + English sentence pools. The file itself is produced by
    ``_write_minimal_pdf`` using only the standard library, so no third-party
    PDF package is needed here.

    Args:
        path: Destination file; its parent directory must already exist.
        doc_id: Number embedded in the title line.
        rng: Random source. All randomness is drawn from it, so the same
            seed and call sequence reproduce the same documents.
    """
    sentence_pool = SAMPLE_SENTENCES_RU + SAMPLE_SENTENCES_EN

    paragraphs = []
    for _ in range(rng.randint(4, 8)):
        # Draw the sentence count *before* sampling: this keeps the sequence
        # of RNG calls stable, and therefore the output for a given seed.
        sentence_count = rng.randint(2, 4)
        paragraphs.append(" ".join(rng.sample(sentence_pool, k=sentence_count)))

    body = f"Legacy archive document #{doc_id}\n\n" + "\n\n".join(paragraphs)
    _write_minimal_pdf(path, body)
|
||
|
||
|
||
def _write_minimal_pdf(path: Path, body: str) -> None:
    """Serialize *body* as a hand-assembled one-page PDF 1.4 file at *path*.

    The document has five indirect objects: catalog, page tree, page, content
    stream, and a Type1 Helvetica font. Each ``\\n``-separated line of *body*
    becomes its own text object at 11 pt, 72 pt from the left edge, starting
    72 pt below the top of the page and stepping down 14 pt per line. Only the
    first 50 lines are emitted and lines are not wrapped. Characters outside
    latin-1 are replaced with ``?``.

    The file is deliberately tiny so that tens of thousands of documents can
    be generated on a laptop.
    """
    # Backslash and parentheses delimit/escape PDF literal strings.
    escapes = str.maketrans({"\\": "\\\\", "(": "\\(", ")": "\\)"})
    visible_rows = body.translate(escapes).split("\n")[:50]  # cap visible lines

    top_baseline = PAGE_H - 72
    line_step = 14
    text_ops = "".join(
        f"BT /F1 11 Tf 72 {top_baseline - row * line_step} Td ({text}) Tj ET\n"
        for row, text in enumerate(visible_rows)
    )
    payload = text_ops.encode("latin-1", errors="replace")

    page_dict = (
        "<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 5 0 R >> >>"
        f" /MediaBox [0 0 {PAGE_W} {PAGE_H}] /Contents 4 0 R >>"
    ).encode("latin-1")
    # /Length is measured on the encoded bytes, not on the str.
    stream_obj = b"".join(
        (
            b"<< /Length ",
            str(len(payload)).encode("ascii"),
            b" >>\nstream\n",
            payload,
            b"endstream",
        )
    )
    # Order matters: object numbers (1..5) are referenced by the dictionaries.
    object_bodies = (
        b"<< /Type /Catalog /Pages 2 0 R >>",
        b"<< /Type /Pages /Count 1 /Kids [3 0 R] >>",
        page_dict,
        stream_obj,
        b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>",
    )

    # Header plus a binary-marker comment line.
    pdf = bytearray(b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n")
    positions = []
    for number, raw in enumerate(object_bodies, start=1):
        positions.append(len(pdf))  # byte offset recorded for the xref table
        pdf.extend(b"%d 0 obj\n" % number)
        pdf.extend(raw)
        pdf.extend(b"\nendobj\n")

    xref_at = len(pdf)
    entry_count = len(object_bodies) + 1  # +1 for the free-list head, object 0

    tail = [
        b"xref\n",
        b"0 %d\n" % entry_count,
        b"0000000000 65535 f \n",
    ]
    # Each xref entry is exactly 20 bytes: 10-digit offset, generation, flag.
    tail.extend(b"%010d 00000 n \n" % position for position in positions)
    tail.append(b"trailer\n")
    tail.append(b"<< /Size %d /Root 1 0 R >>\n" % entry_count)
    tail.append(b"startxref\n")
    tail.append(b"%d\n" % xref_at)
    tail.append(b"%%EOF\n")
    pdf.extend(b"".join(tail))

    path.write_bytes(bytes(pdf))
|
||
|
||
|
||
def main() -> int:
    """Parse command-line options and generate the requested PDFs.

    Creates ``--out`` if needed, then writes ``legacy_000001.pdf`` ..
    ``legacy_<count>.pdf`` into it, printing a progress line every 500 files.
    A single RNG seeded from ``--seed`` drives all documents, so a given
    seed/count pair reproduces the same corpus.

    Returns:
        0 on success. Invalid arguments terminate the process through
        argparse's own error handling.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's layout and usage blocks intact in --help
        # instead of letting argparse reflow them into one paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--count", type=int, required=True,
                        help="number of PDFs to generate")
    parser.add_argument("--out", type=Path, required=True,
                        help="output directory (created if missing)")
    parser.add_argument("--seed", type=int, default=20260513,
                        help="RNG seed for reproducible output")
    args = parser.parse_args()

    if args.count < 0:
        # A negative count would otherwise generate nothing yet report success.
        parser.error("--count must be >= 0")

    args.out.mkdir(parents=True, exist_ok=True)
    rng = random.Random(args.seed)

    for i in range(1, args.count + 1):
        target = args.out / f"legacy_{i:06d}.pdf"
        make_text_pdf(target, i, rng)
        if i % 500 == 0:
            print(f" generated {i}/{args.count}")
    print(f"done: {args.count} files in {args.out}")
    return 0
|
||
|
||
|
||
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|