# LegacyHUB/scripts/generate_synthetic_pdfs.py

"""Generate N synthetic single-page PDFs for load testing the ingest pipeline.

Each PDF carries 4-8 paragraphs of seeded English + Cyrillic text. The
generator embeds text via the standard Helvetica font, which only covers
latin-1 - Cyrillic glyphs render as placeholders. That is acceptable for a
*load* generator: the focus is throughput at scale, not retrieval relevance.
For semantic regression tests, use a real corpus sample instead.

Output directory layout (file numbers are zero-padded to six digits)::

  <out>/
    legacy_000001.pdf
    legacy_000002.pdf
    ...

Usage:

  python scripts/generate_synthetic_pdfs.py --count 1000 --out /data/input/load
  python scripts/generate_synthetic_pdfs.py --count 100 --out ./tmp --seed 42
"""

from __future__ import annotations

import argparse
import random
import sys
from pathlib import Path

try:
    from pypdf import PdfWriter
except Exception:  # noqa: BLE001
    PdfWriter = None  # type: ignore[assignment]

try:
    import pikepdf
except Exception:  # noqa: BLE001
    pikepdf = None  # type: ignore[assignment]


# Page size in PDF points. Used for the page /MediaBox and, for the height,
# to position the first line of text.
PAGE_W = 595  # A4 @ 72 dpi (close enough)
PAGE_H = 842

# Sentence pools sampled by make_text_pdf to build paragraph text.
#
# Russian (Cyrillic) sentences. These characters are not representable in
# latin-1, so the content-stream encoding step (errors="replace") turns each
# of them into "?" in the written file.
SAMPLE_SENTENCES_RU = [
    "ГОСТ 21.501-93 определяет правила выполнения архитектурно-строительных чертежей.",
    "Класс бетона B25 применяется для несущих конструкций нижних этажей.",
    "Все размеры приведены в миллиметрах, если иное не указано.",
    "Контроль качества сварных соединений выполняется в соответствии с регламентом.",
    "Технологический регламент технического обслуживания пересматривается ежегодно.",
    "При производстве работ при пониженных температурах требуется дополнительное обогрев.",
]
# English sentences; plain ASCII, so they survive the latin-1 encoding intact.
SAMPLE_SENTENCES_EN = [
    "The drawing follows the conventions established in the project specification.",
    "All measurements are reported in SI units and validated against the cited standard.",
    "Service intervals are detailed in the maintenance schedule appended at the back.",
    "Quality control checkpoints precede each acceptance handoff.",
]


def make_text_pdf(path: Path, doc_id: int, rng: random.Random) -> None:
    """Write one synthetic, structurally valid text PDF to *path*.

    The body is a title line (``Legacy archive document #<doc_id>``) followed
    by 4-8 paragraphs. Each paragraph is 2-4 distinct sentences drawn from the
    combined Russian and English sample pools and joined with spaces.

    The file itself is produced by ``_write_minimal_pdf`` from raw PDF
    primitives, so no third-party PDF library is needed here.

    Args:
        path: Destination file; its parent directory must already exist.
        doc_id: Number embedded in the title line.
        rng: Source of all randomness. Reusing a generator seeded with the
            same value reproduces the same sequence of document bodies.
    """
    # Build the combined pool once instead of re-concatenating per paragraph.
    pool = SAMPLE_SENTENCES_RU + SAMPLE_SENTENCES_EN

    n_paragraphs = rng.randint(4, 8)
    # Per paragraph the RNG is consumed in a fixed order: first the sentence
    # count (randint), then the selection (sample). Keeping that order keeps
    # output stable for a given seed.
    paragraphs = [
        " ".join(rng.sample(pool, k=rng.randint(2, 4)))
        for _ in range(n_paragraphs)
    ]

    body = f"Legacy archive document #{doc_id}\n\n" + "\n\n".join(paragraphs)
    _write_minimal_pdf(path, body)


def _write_minimal_pdf(path: Path, body: str) -> None:
    """Serialize *body* as a one-page PDF drawn in the built-in Helvetica font.

    The file is assembled by hand from five indirect objects (1 catalog,
    2 page tree, 3 page, 4 content stream, 5 font) followed by a classic
    cross-reference table and trailer. Output is deliberately tiny so that
    very large document counts stay cheap to generate.

    Only the first 50 newline-separated lines of *body* are drawn, one text
    object per line. Characters that cannot be encoded as latin-1 are written
    as ``?``.
    """
    line_height = 14
    top_y = PAGE_H - 72
    max_lines = 50

    # Backslash and parentheses delimit/escape PDF literal strings.
    escape_table = str.maketrans({"\\": "\\\\", "(": "\\(", ")": "\\)"})
    visible = body.translate(escape_table).split("\n")[:max_lines]

    text_ops = [
        f"BT /F1 11 Tf 72 {top_y - row * line_height} Td ({text}) Tj ET"
        for row, text in enumerate(visible)
    ]
    stream_data = ("\n".join(text_ops) + "\n").encode("latin-1", errors="replace")

    page_dict = (
        "<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 5 0 R >> >>"
        f" /MediaBox [0 0 {PAGE_W} {PAGE_H}] /Contents 4 0 R >>"
    ).encode("latin-1")
    stream_obj = b"".join((
        b"<< /Length ",
        str(len(stream_data)).encode("ascii"),
        b" >>\nstream\n",
        stream_data,
        b"endstream",
    ))
    # Order matters: object numbers are assigned by position (1-based) and
    # the dictionaries above refer to each other by those numbers.
    pdf_objects = (
        b"<< /Type /Catalog /Pages 2 0 R >>",
        b"<< /Type /Pages /Count 1 /Kids [3 0 R] >>",
        page_dict,
        stream_obj,
        b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>",
    )

    buf = bytearray(b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n")
    obj_offsets = []
    for number, payload in enumerate(pdf_objects, start=1):
        # Record the byte position of each object for the xref table.
        obj_offsets.append(len(buf))
        buf += str(number).encode("ascii") + b" 0 obj\n" + payload + b"\nendobj\n"

    xref_at = len(buf)
    entry_count = len(pdf_objects) + 1  # +1 for the mandatory free entry 0
    tail_lines = [
        "xref",
        f"0 {entry_count}",
        "0000000000 65535 f ",
        *(f"{offset:010d} 00000 n " for offset in obj_offsets),
        "trailer",
        f"<< /Size {entry_count} /Root 1 0 R >>",
        "startxref",
        f"{xref_at}",
        "%%EOF",
    ]
    buf += ("\n".join(tail_lines) + "\n").encode("ascii")

    path.write_bytes(bytes(buf))


def main() -> int:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--count", type=int, required=True)
    parser.add_argument("--out", type=Path, required=True)
    parser.add_argument("--seed", type=int, default=20260513)
    args = parser.parse_args()

    args.out.mkdir(parents=True, exist_ok=True)
    rng = random.Random(args.seed)

    for i in range(1, args.count + 1):
        target = args.out / f"legacy_{i:06d}.pdf"
        make_text_pdf(target, i, rng)
        if i % 500 == 0:
            print(f"  generated {i}/{args.count}")
    print(f"done: {args.count} files in {args.out}")
    return 0


# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())