Files
LegacyHUB/app/config.py
Vadim Malanov 7f72171572 chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 16:41:50 +03:00

112 lines
5.0 KiB
Python

"""Centralized typed configuration loaded from environment variables.
All other modules import :data:`settings` and never touch ``os.environ`` directly.
"""
from __future__ import annotations
from functools import lru_cache
from typing import Literal
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Typed application settings populated from environment variables.

    Every field declares its default value and, through ``alias``, the name
    of the environment variable that supplies it. ``model_config`` also
    points at a UTF-8 ``.env`` file, makes name matching case-insensitive and
    ignores keys that do not correspond to a declared field.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # ---------------- App ----------------
    app_log_level: str = Field("INFO", alias="APP_LOG_LEVEL")
    app_host: str = Field("0.0.0.0", alias="APP_HOST")
    app_port: int = Field(8000, alias="APP_PORT")
    app_input_dir: str = Field("/data/input", alias="APP_INPUT_DIR")
    app_work_dir: str = Field("/data/work", alias="APP_WORK_DIR")
    app_api_prefix: str = Field("/api/v1", alias="APP_API_PREFIX")

    # ---------------- Postgres ----------------
    postgres_host: str = Field("postgres", alias="POSTGRES_HOST")
    postgres_port: int = Field(5432, alias="POSTGRES_PORT")
    postgres_db: str = Field("legacyhub", alias="POSTGRES_DB")
    postgres_user: str = Field("legacyhub", alias="POSTGRES_USER")
    postgres_password: str = Field("legacyhub", alias="POSTGRES_PASSWORD")

    @property
    def database_url(self) -> str:
        """Build the ``postgresql+psycopg://`` connection URL.

        The user name and password are percent-encoded so that reserved
        characters in a credential (``@``, ``:``, ``/``, ``#``, ``%`` ...)
        cannot be mistaken for URL delimiters. Credentials made only of
        unreserved characters, including the defaults, are emitted unchanged.

        Returns:
            The URL assembled from the ``postgres_*`` fields.
        """
        # Local import keeps this block self-contained; stdlib only.
        from urllib.parse import quote

        # safe="" also encodes "/", which quote() would otherwise leave as-is.
        user = quote(self.postgres_user, safe="")
        password = quote(self.postgres_password, safe="")
        return (
            f"postgresql+psycopg://{user}:{password}"
            f"@{self.postgres_host}:{self.postgres_port}/{self.postgres_db}"
        )

    # ---------------- MinIO ----------------
    minio_endpoint: str = Field("minio:9000", alias="MINIO_ENDPOINT")
    minio_access_key: str = Field("legacyhub", alias="MINIO_ACCESS_KEY")
    minio_secret_key: str = Field("legacyhub-secret", alias="MINIO_SECRET_KEY")
    minio_bucket_originals: str = Field("legacyhub-originals", alias="MINIO_BUCKET_ORIGINALS")
    minio_bucket_derived: str = Field("legacyhub-derived", alias="MINIO_BUCKET_DERIVED")
    minio_secure: bool = Field(False, alias="MINIO_SECURE")
    minio_region: str = Field("us-east-1", alias="MINIO_REGION")

    # ---------------- OpenSearch ----------------
    opensearch_host: str = Field("opensearch", alias="OPENSEARCH_HOST")
    opensearch_port: int = Field(9200, alias="OPENSEARCH_PORT")
    opensearch_use_ssl: bool = Field(False, alias="OPENSEARCH_USE_SSL")
    opensearch_verify_certs: bool = Field(False, alias="OPENSEARCH_VERIFY_CERTS")
    opensearch_user: str = Field("", alias="OPENSEARCH_USER")
    opensearch_password: str = Field("", alias="OPENSEARCH_PASSWORD")
    opensearch_index_chunks: str = Field("legacy_chunks", alias="OPENSEARCH_INDEX_CHUNKS")

    # ---------------- Qdrant ----------------
    qdrant_host: str = Field("qdrant", alias="QDRANT_HOST")
    qdrant_port: int = Field(6333, alias="QDRANT_PORT")
    qdrant_api_key: str = Field("", alias="QDRANT_API_KEY")
    qdrant_collection_chunks: str = Field("legacy_chunks", alias="QDRANT_COLLECTION_CHUNKS")

    # ---------------- Redis ----------------
    redis_url: str = Field("redis://redis:6379/0", alias="REDIS_URL")

    # ---------------- OCR ----------------
    ocr_languages: str = Field("rus+eng", alias="OCR_LANGUAGES")
    ocr_enabled: bool = Field(True, alias="OCR_ENABLED")
    docling_ocr_enabled: bool = Field(False, alias="DOCLING_OCR_ENABLED")
    max_document_timeout_seconds: int = Field(180, alias="MAX_DOCUMENT_TIMEOUT_SECONDS")
    ocr_deskew: bool = Field(True, alias="OCR_DESKEW")
    ocr_clean: bool = Field(True, alias="OCR_CLEAN")
    ocr_optimize: int = Field(1, alias="OCR_OPTIMIZE")

    # ---------------- Embeddings / Reranker ----------------
    embedding_model: str = Field("BAAI/bge-m3", alias="EMBEDDING_MODEL")
    embedding_dim: int = Field(1024, alias="EMBEDDING_DIM")
    embedding_device: Literal["cpu", "cuda", "mps"] = Field("cpu", alias="EMBEDDING_DEVICE")
    embedding_batch_size: int = Field(8, alias="EMBEDDING_BATCH_SIZE")
    embedding_normalize: bool = Field(True, alias="EMBEDDING_NORMALIZE")
    reranker_model: str = Field("BAAI/bge-reranker-v2-m3", alias="RERANKER_MODEL")
    reranker_device: Literal["cpu", "cuda", "mps"] = Field("cpu", alias="RERANKER_DEVICE")
    reranker_enabled: bool = Field(True, alias="RERANKER_ENABLED")
    reranker_batch_size: int = Field(8, alias="RERANKER_BATCH_SIZE")

    # ---------------- Chunking ----------------
    chunk_target_tokens: int = Field(700, alias="CHUNK_TARGET_TOKENS")
    chunk_min_tokens: int = Field(120, alias="CHUNK_MIN_TOKENS")
    chunk_max_tokens: int = Field(900, alias="CHUNK_MAX_TOKENS")
    chunk_overlap_tokens: int = Field(100, alias="CHUNK_OVERLAP_TOKENS")

    # ---------------- Hybrid search ----------------
    hybrid_opensearch_top_k: int = Field(50, alias="HYBRID_OPENSEARCH_TOP_K")
    hybrid_qdrant_top_k: int = Field(50, alias="HYBRID_QDRANT_TOP_K")
    hybrid_rrf_k: int = Field(60, alias="HYBRID_RRF_K")
    rerank_candidates: int = Field(40, alias="RERANK_CANDIDATES")
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Return the shared :class:`Settings` instance.

    The first call constructs ``Settings()``; ``lru_cache(maxsize=1)``
    memoizes that result, so later calls hand back the same object.
    """
    loaded = Settings()  # type: ignore[call-arg]
    return loaded
# Module-level settings object, built once at import time; per the module
# docstring, other modules import this name instead of reading os.environ.
settings = get_settings()