Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
112 lines
5.0 KiB
Python
112 lines
5.0 KiB
Python
"""Centralized typed configuration loaded from environment variables.

All other modules import :data:`settings` and never touch ``os.environ`` directly.
"""
|
|
|
|
from __future__ import annotations

from functools import lru_cache
from typing import Literal
from urllib.parse import quote

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
|
|
|
|
class Settings(BaseSettings):
    """Typed application settings populated from the environment.

    Every field names its environment variable through ``alias`` and carries
    a default that is used when the variable is absent. Values are also read
    from a UTF-8 ``.env`` file, variable names are matched case-insensitively,
    and unknown variables are ignored (see ``model_config``).

    NOTE(review): the credential defaults below look like local/dev values;
    production deployments are presumably expected to override them.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # ---------------- App ----------------
    app_log_level: str = Field("INFO", alias="APP_LOG_LEVEL")
    app_host: str = Field("0.0.0.0", alias="APP_HOST")
    app_port: int = Field(8000, alias="APP_PORT")
    app_input_dir: str = Field("/data/input", alias="APP_INPUT_DIR")
    app_work_dir: str = Field("/data/work", alias="APP_WORK_DIR")
    app_api_prefix: str = Field("/api/v1", alias="APP_API_PREFIX")

    # ---------------- Postgres ----------------
    postgres_host: str = Field("postgres", alias="POSTGRES_HOST")
    postgres_port: int = Field(5432, alias="POSTGRES_PORT")
    postgres_db: str = Field("legacyhub", alias="POSTGRES_DB")
    postgres_user: str = Field("legacyhub", alias="POSTGRES_USER")
    postgres_password: str = Field("legacyhub", alias="POSTGRES_PASSWORD")

    @property
    def database_url(self) -> str:
        """Return the ``postgresql+psycopg://`` connection URL.

        The user name and password are percent-encoded so that reserved
        characters in a credential (``@``, ``:``, ``/``, ``%``, spaces, ...)
        cannot be mistaken for URL delimiters. Credentials made only of
        unreserved characters are emitted unchanged.

        Returns:
            The URL assembled from the ``postgres_*`` fields.
        """
        # safe="" also escapes "/", which quote() leaves untouched by default.
        user = quote(self.postgres_user, safe="")
        password = quote(self.postgres_password, safe="")
        return (
            f"postgresql+psycopg://{user}:{password}"
            f"@{self.postgres_host}:{self.postgres_port}/{self.postgres_db}"
        )

    # ---------------- MinIO ----------------
    minio_endpoint: str = Field("minio:9000", alias="MINIO_ENDPOINT")
    minio_access_key: str = Field("legacyhub", alias="MINIO_ACCESS_KEY")
    minio_secret_key: str = Field("legacyhub-secret", alias="MINIO_SECRET_KEY")
    minio_bucket_originals: str = Field("legacyhub-originals", alias="MINIO_BUCKET_ORIGINALS")
    minio_bucket_derived: str = Field("legacyhub-derived", alias="MINIO_BUCKET_DERIVED")
    minio_secure: bool = Field(False, alias="MINIO_SECURE")
    minio_region: str = Field("us-east-1", alias="MINIO_REGION")

    # ---------------- OpenSearch ----------------
    opensearch_host: str = Field("opensearch", alias="OPENSEARCH_HOST")
    opensearch_port: int = Field(9200, alias="OPENSEARCH_PORT")
    opensearch_use_ssl: bool = Field(False, alias="OPENSEARCH_USE_SSL")
    opensearch_verify_certs: bool = Field(False, alias="OPENSEARCH_VERIFY_CERTS")
    opensearch_user: str = Field("", alias="OPENSEARCH_USER")
    opensearch_password: str = Field("", alias="OPENSEARCH_PASSWORD")
    opensearch_index_chunks: str = Field("legacy_chunks", alias="OPENSEARCH_INDEX_CHUNKS")

    # ---------------- Qdrant ----------------
    qdrant_host: str = Field("qdrant", alias="QDRANT_HOST")
    qdrant_port: int = Field(6333, alias="QDRANT_PORT")
    qdrant_api_key: str = Field("", alias="QDRANT_API_KEY")
    qdrant_collection_chunks: str = Field("legacy_chunks", alias="QDRANT_COLLECTION_CHUNKS")

    # ---------------- Redis ----------------
    redis_url: str = Field("redis://redis:6379/0", alias="REDIS_URL")

    # ---------------- OCR ----------------
    ocr_languages: str = Field("rus+eng", alias="OCR_LANGUAGES")
    ocr_enabled: bool = Field(True, alias="OCR_ENABLED")
    docling_ocr_enabled: bool = Field(False, alias="DOCLING_OCR_ENABLED")
    max_document_timeout_seconds: int = Field(180, alias="MAX_DOCUMENT_TIMEOUT_SECONDS")
    ocr_deskew: bool = Field(True, alias="OCR_DESKEW")
    ocr_clean: bool = Field(True, alias="OCR_CLEAN")
    ocr_optimize: int = Field(1, alias="OCR_OPTIMIZE")

    # ---------------- Embeddings / Reranker ----------------
    embedding_model: str = Field("BAAI/bge-m3", alias="EMBEDDING_MODEL")
    embedding_dim: int = Field(1024, alias="EMBEDDING_DIM")
    embedding_device: Literal["cpu", "cuda", "mps"] = Field("cpu", alias="EMBEDDING_DEVICE")
    embedding_batch_size: int = Field(8, alias="EMBEDDING_BATCH_SIZE")
    embedding_normalize: bool = Field(True, alias="EMBEDDING_NORMALIZE")

    reranker_model: str = Field("BAAI/bge-reranker-v2-m3", alias="RERANKER_MODEL")
    reranker_device: Literal["cpu", "cuda", "mps"] = Field("cpu", alias="RERANKER_DEVICE")
    reranker_enabled: bool = Field(True, alias="RERANKER_ENABLED")
    reranker_batch_size: int = Field(8, alias="RERANKER_BATCH_SIZE")

    # ---------------- Chunking ----------------
    chunk_target_tokens: int = Field(700, alias="CHUNK_TARGET_TOKENS")
    chunk_min_tokens: int = Field(120, alias="CHUNK_MIN_TOKENS")
    chunk_max_tokens: int = Field(900, alias="CHUNK_MAX_TOKENS")
    chunk_overlap_tokens: int = Field(100, alias="CHUNK_OVERLAP_TOKENS")

    # ---------------- Hybrid search ----------------
    hybrid_opensearch_top_k: int = Field(50, alias="HYBRID_OPENSEARCH_TOP_K")
    hybrid_qdrant_top_k: int = Field(50, alias="HYBRID_QDRANT_TOP_K")
    hybrid_rrf_k: int = Field(60, alias="HYBRID_RRF_K")
    rerank_candidates: int = Field(40, alias="RERANK_CANDIDATES")
|
|
|
|
|
|
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Build the :class:`Settings` instance once and reuse it afterwards.

    The single-slot cache means the first call constructs the object and
    every later call returns that same instance.
    """
    loaded = Settings()  # type: ignore[call-arg]
    return loaded
|
|
|
|
|
|
# Shared module-level instance, created when this module is first imported.
# Per the module docstring, other modules import this name instead of
# reading ``os.environ`` themselves.
settings = get_settings()
|