chore: bootstrap repository with governance docs

Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Vadim Malanov
2026-05-13 16:41:50 +03:00
commit 7f72171572
157 changed files with 11298 additions and 0 deletions

111
app/config.py Normal file
View File

@@ -0,0 +1,111 @@
"""Centralized typed configuration loaded from environment variables.
All other modules import :data:`settings` and never touch ``os.environ`` directly.
"""
from __future__ import annotations
from functools import lru_cache
from typing import Literal
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Typed application settings populated from environment variables.

    Values come from the process environment and from a ``.env`` file read
    as UTF-8. Every field is bound to the upper-case alias given in its
    ``Field(...)``; name matching is case-insensitive and variables that do
    not correspond to a field are ignored. The first argument of each
    ``Field`` is the default used when the variable is not set.

    NOTE(review): the credential defaults below look like development
    placeholders — confirm they are always overridden in production.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # ---------------- App ----------------
    app_log_level: str = Field("INFO", alias="APP_LOG_LEVEL")
    app_host: str = Field("0.0.0.0", alias="APP_HOST")
    app_port: int = Field(8000, alias="APP_PORT")
    app_input_dir: str = Field("/data/input", alias="APP_INPUT_DIR")
    app_work_dir: str = Field("/data/work", alias="APP_WORK_DIR")
    app_api_prefix: str = Field("/api/v1", alias="APP_API_PREFIX")

    # ---------------- Postgres ----------------
    postgres_host: str = Field("postgres", alias="POSTGRES_HOST")
    postgres_port: int = Field(5432, alias="POSTGRES_PORT")
    postgres_db: str = Field("legacyhub", alias="POSTGRES_DB")
    postgres_user: str = Field("legacyhub", alias="POSTGRES_USER")
    postgres_password: str = Field("legacyhub", alias="POSTGRES_PASSWORD")

    @property
    def database_url(self) -> str:
        """Return the ``postgresql+psycopg://`` connection URL.

        The user name and password are percent-encoded so that reserved
        characters in credentials (``@``, ``:``, ``/``, ``#``, ``?``,
        ``%`` ...) cannot be mistaken for URL delimiters. Credentials made
        only of unreserved characters are emitted unchanged.

        The raw (unencoded) secret is expected in ``POSTGRES_USER`` /
        ``POSTGRES_PASSWORD``; a value that was already percent-encoded by
        hand would be encoded a second time.
        """
        # Function-scope import: the module header does not import urllib.
        from urllib.parse import quote

        # safe="" so that "/" is escaped too; quote() (not quote_plus) keeps
        # spaces as %20 rather than "+", which plain unquoting would not undo.
        user = quote(self.postgres_user, safe="")
        password = quote(self.postgres_password, safe="")
        return (
            f"postgresql+psycopg://{user}:{password}"
            f"@{self.postgres_host}:{self.postgres_port}/{self.postgres_db}"
        )

    # ---------------- MinIO ----------------
    minio_endpoint: str = Field("minio:9000", alias="MINIO_ENDPOINT")
    minio_access_key: str = Field("legacyhub", alias="MINIO_ACCESS_KEY")
    minio_secret_key: str = Field("legacyhub-secret", alias="MINIO_SECRET_KEY")
    minio_bucket_originals: str = Field(
        "legacyhub-originals", alias="MINIO_BUCKET_ORIGINALS"
    )
    minio_bucket_derived: str = Field(
        "legacyhub-derived", alias="MINIO_BUCKET_DERIVED"
    )
    minio_secure: bool = Field(False, alias="MINIO_SECURE")
    minio_region: str = Field("us-east-1", alias="MINIO_REGION")

    # ---------------- OpenSearch ----------------
    opensearch_host: str = Field("opensearch", alias="OPENSEARCH_HOST")
    opensearch_port: int = Field(9200, alias="OPENSEARCH_PORT")
    opensearch_use_ssl: bool = Field(False, alias="OPENSEARCH_USE_SSL")
    opensearch_verify_certs: bool = Field(False, alias="OPENSEARCH_VERIFY_CERTS")
    # Empty user/password by default — presumably means "no auth"; verify
    # against the OpenSearch client setup.
    opensearch_user: str = Field("", alias="OPENSEARCH_USER")
    opensearch_password: str = Field("", alias="OPENSEARCH_PASSWORD")
    opensearch_index_chunks: str = Field(
        "legacy_chunks", alias="OPENSEARCH_INDEX_CHUNKS"
    )

    # ---------------- Qdrant ----------------
    qdrant_host: str = Field("qdrant", alias="QDRANT_HOST")
    qdrant_port: int = Field(6333, alias="QDRANT_PORT")
    qdrant_api_key: str = Field("", alias="QDRANT_API_KEY")
    qdrant_collection_chunks: str = Field(
        "legacy_chunks", alias="QDRANT_COLLECTION_CHUNKS"
    )

    # ---------------- Redis ----------------
    redis_url: str = Field("redis://redis:6379/0", alias="REDIS_URL")

    # ---------------- OCR ----------------
    ocr_languages: str = Field("rus+eng", alias="OCR_LANGUAGES")
    ocr_enabled: bool = Field(True, alias="OCR_ENABLED")
    docling_ocr_enabled: bool = Field(False, alias="DOCLING_OCR_ENABLED")
    max_document_timeout_seconds: int = Field(
        180, alias="MAX_DOCUMENT_TIMEOUT_SECONDS"
    )
    ocr_deskew: bool = Field(True, alias="OCR_DESKEW")
    ocr_clean: bool = Field(True, alias="OCR_CLEAN")
    ocr_optimize: int = Field(1, alias="OCR_OPTIMIZE")

    # ---------------- Embeddings / Reranker ----------------
    embedding_model: str = Field("BAAI/bge-m3", alias="EMBEDDING_MODEL")
    # NOTE(review): presumably must equal the vector size produced by
    # ``embedding_model`` — confirm where the vector collection is created.
    embedding_dim: int = Field(1024, alias="EMBEDDING_DIM")
    embedding_device: Literal["cpu", "cuda", "mps"] = Field(
        "cpu", alias="EMBEDDING_DEVICE"
    )
    embedding_batch_size: int = Field(8, alias="EMBEDDING_BATCH_SIZE")
    embedding_normalize: bool = Field(True, alias="EMBEDDING_NORMALIZE")
    reranker_model: str = Field("BAAI/bge-reranker-v2-m3", alias="RERANKER_MODEL")
    reranker_device: Literal["cpu", "cuda", "mps"] = Field(
        "cpu", alias="RERANKER_DEVICE"
    )
    reranker_enabled: bool = Field(True, alias="RERANKER_ENABLED")
    reranker_batch_size: int = Field(8, alias="RERANKER_BATCH_SIZE")

    # ---------------- Chunking ----------------
    # No cross-field validation is performed here (e.g. min <= target <= max).
    chunk_target_tokens: int = Field(700, alias="CHUNK_TARGET_TOKENS")
    chunk_min_tokens: int = Field(120, alias="CHUNK_MIN_TOKENS")
    chunk_max_tokens: int = Field(900, alias="CHUNK_MAX_TOKENS")
    chunk_overlap_tokens: int = Field(100, alias="CHUNK_OVERLAP_TOKENS")

    # ---------------- Hybrid search ----------------
    hybrid_opensearch_top_k: int = Field(50, alias="HYBRID_OPENSEARCH_TOP_K")
    hybrid_qdrant_top_k: int = Field(50, alias="HYBRID_QDRANT_TOP_K")
    hybrid_rrf_k: int = Field(60, alias="HYBRID_RRF_K")
    rerank_candidates: int = Field(40, alias="RERANK_CANDIDATES")
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Build the :class:`Settings` object once and reuse it afterwards.

    ``lru_cache`` memoises the single zero-argument call, so every caller
    receives the same instance; ``get_settings.cache_clear()`` discards it
    so the next call constructs a fresh one.
    """
    # All fields have defaults / come from the environment, hence no args.
    loaded = Settings()  # type: ignore[call-arg]
    return loaded
# Module-level instance that other modules import; it is created when this
# module is first imported, so configuration is loaded at import time.
settings = get_settings()