chore: bootstrap repository with governance docs

Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Vadim Malanov
2026-05-13 16:41:50 +03:00
commit 7f72171572
157 changed files with 11298 additions and 0 deletions

0
app/api/__init__.py Normal file
View File

96
app/api/routes_health.py Normal file
View File

@@ -0,0 +1,96 @@
"""Health endpoint - probes Postgres, MinIO, OpenSearch, Qdrant, Redis."""
from __future__ import annotations
from typing import Any
from fastapi import APIRouter
from sqlalchemy import text
from app import __version__
from app.api.schemas import ComponentHealth, HealthResponse
from app.config import settings
from app.db.session import get_engine
from app.logging_config import get_logger
from app.storage.minio_client import get_storage
# Module-level logger obtained from the project's logging configuration.
logger = get_logger(__name__)
# Router that carries the health endpoint; its routes are tagged "health".
router = APIRouter(tags=["health"])
def _check_postgres() -> ComponentHealth:
    """Probe Postgres by running ``SELECT 1`` on a fresh connection.

    Returns:
        A ``ComponentHealth`` named ``"postgres"`` with status ``"ok"`` when
        the query succeeds, otherwise status ``"error"`` with the exception
        text under ``detail["error"]``.
    """
    try:
        engine = get_engine()
        with engine.connect() as connection:
            connection.execute(text("SELECT 1"))
        outcome = ComponentHealth(name="postgres", status="ok")
    except Exception as err:  # noqa: BLE001
        # A health probe must never raise: report the failure instead.
        outcome = ComponentHealth(
            name="postgres",
            status="error",
            detail={"error": str(err)},
        )
    return outcome
def _check_minio() -> ComponentHealth:
    """Probe object storage through the storage client's ``health()`` call.

    Returns:
        A ``ComponentHealth`` named ``"minio"``. Status is ``"ok"`` when the
        storage client reports ``{"status": "ok", ...}``; otherwise it is
        ``"error"``. The dict returned by the client is passed through as
        ``detail``. If obtaining the client or calling ``health()`` raises,
        the exception text is reported under ``detail["error"]``.
    """
    try:
        info: dict[str, Any] = get_storage().health()
    except Exception as exc:  # noqa: BLE001
        # Match the other probes: a failing dependency must surface as an
        # "error" component, not as an unhandled 500 from the endpoint.
        return ComponentHealth(name="minio", status="error", detail={"error": str(exc)})
    if info.get("status") == "ok":
        return ComponentHealth(name="minio", status="ok", detail=info)
    return ComponentHealth(name="minio", status="error", detail=info)
def _check_opensearch() -> ComponentHealth:
    """Probe OpenSearch via the cluster health API.

    Returns:
        A ``ComponentHealth`` named ``"opensearch"``. A cluster status of
        ``"green"`` or ``"yellow"`` maps to ``"ok"``; any other value maps to
        ``"degraded"``. ``detail`` carries the raw cluster status and the
        reported node count. Any exception (including a failed import of the
        client module) yields status ``"error"`` with the exception text.
    """
    try:
        # Imported lazily inside the ``try`` so an import failure is
        # reported as a component error as well.
        from app.indexing.opensearch_client import get_opensearch

        cluster_info = get_opensearch().cluster.health()
        cluster_status = cluster_info.get("status")
        if cluster_status in ("green", "yellow"):
            component_status = "ok"
        else:
            component_status = "degraded"
        detail = {
            "cluster_status": cluster_status,
            "nodes": cluster_info.get("number_of_nodes"),
        }
        return ComponentHealth(
            name="opensearch",
            status=component_status,  # type: ignore[arg-type]
            detail=detail,
        )
    except Exception as err:  # noqa: BLE001
        return ComponentHealth(name="opensearch", status="error", detail={"error": str(err)})
def _check_qdrant() -> ComponentHealth:
    """Probe Qdrant by listing its collections.

    Returns:
        A ``ComponentHealth`` named ``"qdrant"`` with status ``"ok"`` and the
        collection names under ``detail["collections"]``, or status
        ``"error"`` with the exception text if anything raises.
    """
    try:
        # Lazy import inside the ``try``: an import failure is reported as a
        # component error too.
        from app.indexing.qdrant_client import get_qdrant

        listing = get_qdrant().get_collections()
        collection_names = [collection.name for collection in listing.collections]
        return ComponentHealth(
            name="qdrant",
            status="ok",
            detail={"collections": collection_names},
        )
    except Exception as err:  # noqa: BLE001
        return ComponentHealth(name="qdrant", status="error", detail={"error": str(err)})
def _check_redis() -> ComponentHealth:
    """Probe Redis with a ``PING`` against ``settings.redis_url``.

    A short-lived client is created per call and closed afterwards so
    repeated health checks do not accumulate open connections.

    Returns:
        A ``ComponentHealth`` named ``"redis"`` with status ``"ok"`` when the
        ping succeeds, otherwise status ``"error"`` with the exception text
        under ``detail["error"]``.
    """
    try:
        # Lazy import inside the ``try``: a missing driver is reported as a
        # component error instead of breaking module import.
        import redis

        client = redis.Redis.from_url(
            settings.redis_url,
            socket_connect_timeout=2,
            # Bound the read as well: without it PING can block indefinitely
            # on a server that accepts the connection but never answers.
            socket_timeout=2,
        )
        try:
            client.ping()
        finally:
            # Release the client's connection(s) created for this probe.
            client.close()
        return ComponentHealth(name="redis", status="ok")
    except Exception as exc:  # noqa: BLE001
        return ComponentHealth(name="redis", status="error", detail={"error": str(exc)})
@router.get("/health", response_model=HealthResponse)
def health() -> HealthResponse:
    """Run every component probe and aggregate the results.

    Probes run sequentially in a fixed order (Postgres, MinIO, OpenSearch,
    Qdrant, Redis). The overall status is ``"error"`` if any component is in
    error, else ``"degraded"`` if any component is degraded, else ``"ok"``.

    Returns:
        A ``HealthResponse`` with the overall status, the application
        version, and the per-component results in probe order.
    """
    probes = (
        _check_postgres,
        _check_minio,
        _check_opensearch,
        _check_qdrant,
        _check_redis,
    )
    components = [probe() for probe in probes]

    # Worst status wins: error > degraded > ok.
    seen_statuses = {component.status for component in components}
    if "error" in seen_statuses:
        overall = "error"
    elif "degraded" in seen_statuses:
        overall = "degraded"
    else:
        overall = "ok"
    return HealthResponse(status=overall, version=__version__, components=components)  # type: ignore[arg-type]

View File

@@ -0,0 +1,63 @@
"""Ingestion endpoints."""
from __future__ import annotations
import uuid
from pathlib import Path
from fastapi import APIRouter, HTTPException
from app.api.schemas import IngestFolderRequest, IngestFolderResponse
from app.logging_config import get_logger
# Module-level logger obtained from the project's logging configuration.
logger = get_logger(__name__)
# Router for ingestion routes; every path is prefixed with "/ingest".
router = APIRouter(prefix="/ingest", tags=["ingestion"])
@router.post("/folder", response_model=IngestFolderResponse)
def ingest_folder(req: IngestFolderRequest) -> IngestFolderResponse:
    """Scan ``req.path`` for documents and enqueue each one for processing.

    The handler only performs the discovery pass and returns as soon as it
    finishes; per-document OCR / extraction / indexing is done
    asynchronously by Celery workers.

    Args:
        req: Folder path plus the ``recursive`` and ``force`` flags.

    Returns:
        Counters for the run: how many records were discovered, queued,
        skipped as duplicates, and rejected as invalid, together with the
        generated ``run_id``.

    Raises:
        HTTPException: 400 when ``req.path`` does not exist or is not a
            directory.
    """
    folder = Path(req.path)
    if not (folder.exists() and folder.is_dir()):
        raise HTTPException(status_code=400, detail=f"Folder not found: {req.path}")

    # Lazy import - keeps module load light.
    from app.ingestion.scanner import discover_documents
    from app.workers.tasks import process_document

    run_id = uuid.uuid4()
    # Keys mirror the IngestFolderResponse counter fields.
    counts = {
        "discovered": 0,
        "queued": 0,
        "skipped_duplicates": 0,
        "invalid_files": 0,
    }

    records = discover_documents(folder, recursive=req.recursive, force=req.force)
    for record in records:
        counts["discovered"] += 1
        if record.duplicate and not req.force:
            # Already-known document and no forced re-ingestion requested.
            counts["skipped_duplicates"] += 1
        elif not record.document_id:
            # No document id on the record: nothing to enqueue.
            counts["invalid_files"] += 1
        else:
            process_document.delay(str(record.document_id), str(run_id))
            counts["queued"] += 1

    logger.info(
        "ingest.folder.queued",
        path=str(folder),
        discovered=counts["discovered"],
        queued=counts["queued"],
        skipped_duplicates=counts["skipped_duplicates"],
        invalid=counts["invalid_files"],
        run_id=str(run_id),
    )
    return IngestFolderResponse(run_id=run_id, **counts)

16
app/api/routes_search.py Normal file
View File

@@ -0,0 +1,16 @@
"""Search endpoint - lexical / semantic / hybrid."""
from __future__ import annotations
from fastapi import APIRouter
from app.api.schemas import SearchRequest, SearchResponse
# Router for the search route; every path is prefixed with "/search".
router = APIRouter(prefix="/search", tags=["search"])
@router.post("", response_model=SearchResponse)
def search(req: SearchRequest) -> SearchResponse:
    """Delegate the search request to the hybrid search module.

    Args:
        req: Validated search request.

    Returns:
        Whatever ``run_search`` produces for ``req``.
    """
    # Imported at call time rather than at module import.
    from app.indexing.hybrid_search import run_search

    response = run_search(req)
    return response

99
app/api/schemas.py Normal file
View File

@@ -0,0 +1,99 @@
"""Pydantic request/response schemas for the LegacyHUB API."""
from __future__ import annotations
import uuid
from datetime import datetime
from typing import Any, Literal
from pydantic import BaseModel, Field
# ---------------- Health ----------------
class ComponentHealth(BaseModel):
    """Health result for one probed component."""

    # Component identifier (the health probes use e.g. "postgres", "redis").
    name: str
    # Probe outcome; only these three literal values validate.
    status: Literal["ok", "error", "degraded"]
    # Free-form probe details; defaults to a new empty dict per instance.
    detail: dict[str, Any] = Field(default_factory=dict)
class HealthResponse(BaseModel):
    """Aggregated health report: overall status plus per-component results."""

    # Overall status; same literal set as ComponentHealth.status.
    status: Literal["ok", "error", "degraded"]
    # Application version string.
    version: str
    # Individual component results.
    components: list[ComponentHealth]
# ---------------- Ingestion ----------------
class IngestFolderRequest(BaseModel):
    """Request body for folder ingestion."""

    # Required; described in the schema as an absolute path inside the API
    # container.
    path: str = Field(..., description="Absolute path inside the API container")
    # Defaults to True. Presumably controls descent into sub-folders -
    # confirm against the scanner.
    recursive: bool = True
    # Defaults to False. Presumably forces re-ingestion of known documents -
    # confirm against the scanner.
    force: bool = False
class IngestFolderResponse(BaseModel):
    """Counters returned after a folder ingestion discovery pass."""

    # Identifier of the ingestion run.
    run_id: uuid.UUID
    # Number of records discovered.
    discovered: int
    # Number of documents queued for processing.
    queued: int
    # Number of records skipped as duplicates.
    skipped_duplicates: int
    # Number of records counted as invalid.
    invalid_files: int
class DocumentSummary(BaseModel):
    """Summary view of a stored document."""

    # Document identifier.
    id: uuid.UUID
    # File name as originally supplied.
    original_file_name: str
    # Path the document was read from.
    source_path: str
    # Presumably the hex SHA-256 digest of the file content - TODO confirm.
    sha256: str
    # Processing status; a free-form string here (no Literal/enum constraint).
    status: str
    # File size in bytes.
    file_size_bytes: int
    # Creation timestamp; timezone handling is not visible here.
    created_at: datetime
# ---------------- Search ----------------
# Closed set of accepted search mode strings, shared by the request and
# response schemas.
SearchMode = Literal["lexical", "semantic", "hybrid"]
class SearchFilters(BaseModel):
    """Optional search filters; every field defaults to ``None``."""

    # NOTE(review): how each filter is applied is decided by the search
    # backend, which is not visible here.
    document_id: uuid.UUID | None = None
    source_path: str | None = None
    block_type: str | None = None
    # When set, must lie in the inclusive range [0.0, 1.0].
    min_ocr_confidence: float | None = Field(None, ge=0.0, le=1.0)
class SearchRequest(BaseModel):
    """Request body for the search endpoint."""

    # Required, at least one character long.
    query: str = Field(..., min_length=1)
    # Defaults to 10; validated to the inclusive range 1..100.
    limit: int = Field(10, ge=1, le=100)
    # Defaults to a new SearchFilters instance with no filters set.
    filters: SearchFilters = Field(default_factory=SearchFilters)
    # One of the SearchMode literals; defaults to "hybrid".
    search_mode: SearchMode = "hybrid"
class Citation(BaseModel):
    """Source reference attached to a search hit."""

    # Presumably the PDF file name or path - TODO confirm against the producer.
    pdf: str
    # Page number; whether it is 0- or 1-based is not visible here.
    page: int
    # Optional identifiers; each defaults to None.
    block_id: str | None = None
    table_id: str | None = None
    figure_id: str | None = None
class SearchHit(BaseModel):
    """One search result entry."""

    # Position of the hit in the result list; the numbering base is not
    # visible here.
    rank: int
    # Relevance score; the scale depends on the search backend.
    score: float
    # Identifiers of the owning document and of the matched chunk.
    document_id: uuid.UUID
    chunk_id: uuid.UUID
    # Origin of the matched content.
    original_file_name: str
    source_path: str
    page_number: int
    # Kind of content block; a free-form string here.
    block_type: str
    # Text of the hit.
    text: str
    # Source reference for the hit.
    citation: Citation
    # Free-form mappings; each defaults to a new empty dict per instance.
    quality_flags: dict[str, Any] = Field(default_factory=dict)
    metadata: dict[str, Any] = Field(default_factory=dict)
class SearchResponse(BaseModel):
    """Response body for the search endpoint."""

    # Query string this response belongs to.
    query: str
    # Search mode reported for this response (one of the SearchMode literals).
    mode: SearchMode
    # Candidate count; presumably taken before truncation to the requested
    # limit - TODO confirm.
    total_candidates: int
    # Flag indicating whether the results were reranked.
    reranked: bool
    # Result entries.
    results: list[SearchHit]