chore: bootstrap repository with governance docs

Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Vadim Malanov
2026-05-13 16:41:50 +03:00
commit 7f72171572
157 changed files with 11298 additions and 0 deletions

0
scripts/__init__.py Normal file
View File

60
scripts/ingest_folder.py Normal file
View File

@@ -0,0 +1,60 @@
"""Synchronous CLI ingestion: discover -> queue -> process inline.
Use ``--mode celery`` to push tasks to Celery instead of running inline (the
default ``inline`` mode is convenient for ad-hoc runs without a worker container).
"""
from __future__ import annotations
import sys
import uuid
from pathlib import Path
import click
from app.ingestion.scanner import discover_documents
from app.logging_config import configure_logging, get_logger
configure_logging()
logger = get_logger(__name__)
@click.command()
@click.option(
    "--path",
    required=True,
    type=click.Path(exists=True, file_okay=True, dir_okay=True, path_type=Path),
)
@click.option("--recursive/--no-recursive", default=True)
@click.option("--force", is_flag=True, default=False, help="Re-process even if SHA already exists")
@click.option("--mode", type=click.Choice(["inline", "celery"]), default="inline")
def main(path: Path, recursive: bool, force: bool, mode: str) -> None:
    # Discover documents under `path` and either process each one in this
    # process (mode "inline") or hand it to the Celery task (mode "celery").
    # A one-line summary of the counters is echoed at the end.
    #
    # Documented with comments instead of a docstring on purpose: click would
    # publish a docstring as the command's --help text.
    run_id = uuid.uuid4()  # one id shared by every document handled in this invocation
    use_celery = mode == "celery"
    seen = handled = skipped_duplicates = rejected = 0

    for record in discover_documents(path, recursive=recursive, force=force):
        seen += 1

        if record.duplicate and not force:
            skipped_duplicates += 1
        elif not record.document_id:
            # Nothing to dispatch without a document id.
            rejected += 1
        elif use_celery:
            # Imported lazily so inline runs never need the worker module.
            from app.workers.tasks import process_document

            process_document.delay(str(record.document_id), str(run_id))
            handled += 1
        else:
            # Imported lazily so Celery-only runs never load the pipeline.
            from app.ingestion.pipeline import process_document_id

            try:
                result = process_document_id(record.document_id, run_id)
                logger.info("ingest.cli.processed", path=str(record.path), result=result)
            except Exception as exc:  # noqa: BLE001
                # One bad document must not abort the batch; it is logged and
                # tallied under "invalid".
                logger.exception("ingest.cli.failed", path=str(record.path), error=str(exc))
                rejected += 1
            else:
                handled += 1

    click.echo(
        f"discovered={seen} queued={handled} duplicates={skipped_duplicates} invalid={rejected} run={run_id}"
    )


if __name__ == "__main__":
    exit_status = main(standalone_mode=True) or 0
    sys.exit(exit_status)

25
scripts/init_db.py Normal file
View File

@@ -0,0 +1,25 @@
"""Apply Alembic migrations against the configured Postgres."""
from __future__ import annotations
import sys
from pathlib import Path
from alembic import command
from alembic.config import Config
from app.config import settings
def main() -> int:
    """Upgrade the configured database to the latest Alembic revision.

    Loads ``alembic.ini`` from the directory one level above this script's
    directory, points Alembic at ``app/db/migrations`` and at
    ``settings.database_url``, then runs ``upgrade head``.

    Returns:
        ``0`` once ``command.upgrade`` has returned; failures propagate as
        exceptions.
    """
    root = Path(__file__).resolve().parents[1]
    cfg = Config(str(root / "alembic.ini"))

    # Alembic's Config stores options through ConfigParser, which treats "%"
    # as the start of an interpolation. A literal percent sign (for example a
    # URL-encoded password such as "p%40ss") must be doubled, otherwise
    # set_main_option raises an interpolation error.
    script_location = str(root / "app" / "db" / "migrations").replace("%", "%%")
    database_url = settings.database_url.replace("%", "%%")

    cfg.set_main_option("script_location", script_location)
    cfg.set_main_option("sqlalchemy.url", database_url)
    command.upgrade(cfg, "head")
    print("alembic upgrade head: ok")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,17 @@
"""Bootstrap the OpenSearch chunk index."""
from __future__ import annotations
import sys
from app.indexing.opensearch_client import ensure_index
def main() -> int:
    """Call ``ensure_index`` for the OpenSearch chunk index and report success.

    Returns:
        Always ``0``; a failure surfaces as an exception raised by
        ``ensure_index`` before the confirmation line is printed.
    """
    ensure_index()
    print("opensearch index ensured")
    return 0


if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)

17
scripts/init_qdrant.py Normal file
View File

@@ -0,0 +1,17 @@
"""Bootstrap the Qdrant chunk collection."""
from __future__ import annotations
import sys
from app.indexing.qdrant_client import ensure_collection
def main() -> int:
    """Call ``ensure_collection`` for the Qdrant chunk collection and report success.

    Returns:
        Always ``0``; a failure surfaces as an exception raised by
        ``ensure_collection`` before the confirmation line is printed.
    """
    ensure_collection()
    print("qdrant collection ensured")
    return 0


if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)

View File

@@ -0,0 +1,24 @@
"""Re-run the pipeline for a single document by ID."""
from __future__ import annotations
import sys
import uuid
import click
from app.ingestion.pipeline import process_document_id
from app.logging_config import configure_logging
configure_logging()
@click.command()
@click.option(
    "--document-id",
    required=True,
    type=str,
    help="UUID of the document to re-process",
)
def main(document_id: str) -> None:
    """Re-run the pipeline for a single document and print the result."""
    # Validate up front so a typo yields a click usage error naming the
    # offending option instead of a raw ValueError traceback.
    try:
        parsed_id = uuid.UUID(document_id)
    except ValueError as exc:
        raise click.BadParameter(
            f"{document_id!r} is not a valid UUID",
            param_hint="--document-id",
        ) from exc

    # NOTE(review): scripts/ingest_folder.py passes a run_id as the second
    # argument to process_document_id; confirm that argument is optional.
    result = process_document_id(parsed_id)
    click.echo(result)


if __name__ == "__main__":
    sys.exit(main(standalone_mode=True) or 0)

74
scripts/smoke_test.py Normal file
View File

@@ -0,0 +1,74 @@
"""Smoke test - verify all infrastructure is reachable and indices are present.
Every check runs even if an earlier one fails; the exit code is non-zero if any
check failed.
"""
from __future__ import annotations
import sys
from sqlalchemy import text
from app.db.session import get_engine
from app.indexing.opensearch_client import ensure_index, get_opensearch
from app.indexing.qdrant_client import ensure_collection, get_qdrant
from app.logging_config import configure_logging, get_logger
from app.storage.minio_client import get_storage
configure_logging()
logger = get_logger(__name__)
def _check_postgres() -> None:
    """Open a connection from the engine and run ``SELECT 1``."""
    with get_engine().connect() as conn:
        conn.execute(text("SELECT 1"))
    print("[ok] postgres")


def _check_minio() -> None:
    """Call ``ensure_buckets`` and require the storage health status to be ``ok``."""
    storage = get_storage()
    storage.ensure_buckets()
    health = storage.health()
    if health.get("status") != "ok":
        raise RuntimeError(health)
    print("[ok] minio:", health.get("buckets"))


def _check_opensearch() -> None:
    """Call ``ensure_index`` and print the cluster health status."""
    ensure_index()
    cluster_health = get_opensearch().cluster.health()
    print("[ok] opensearch:", cluster_health.get("status"))


def _check_qdrant() -> None:
    """Call ``ensure_collection`` and print the collection names Qdrant reports."""
    ensure_collection()
    names = [c.name for c in get_qdrant().get_collections().collections]
    print("[ok] qdrant collections:", names)


def main() -> int:
    """Run every infrastructure check and summarise the outcome.

    Each check runs regardless of earlier failures; an exception raised by a
    check is printed as an ``[err]`` line and remembered for the summary.

    Returns:
        ``0`` when every check passed, ``1`` when at least one failed.
    """
    probes = (
        ("postgres", _check_postgres),
        ("minio", _check_minio),
        ("opensearch", _check_opensearch),
        ("qdrant", _check_qdrant),
    )

    failures: list[str] = []
    for label, probe in probes:
        try:
            probe()
        except Exception as exc:  # noqa: BLE001
            # Broad on purpose: any error from a backend counts as a failed check.
            failures.append(f"{label}: {exc}")
            print(f"[err] {label}: {exc}")

    if not failures:
        print("\nSMOKE OK")
        return 0

    print("\nSMOKE FAIL:")
    for failure in failures:
        print(" -", failure)
    return 1


if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)