chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
0
app/api/__init__.py
Normal file
0
app/api/__init__.py
Normal file
96
app/api/routes_health.py
Normal file
96
app/api/routes_health.py
Normal file
@@ -0,0 +1,96 @@
|
||||
"""Health endpoint - probes Postgres, MinIO, OpenSearch, Qdrant, Redis."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter
|
||||
from sqlalchemy import text
|
||||
|
||||
from app import __version__
|
||||
from app.api.schemas import ComponentHealth, HealthResponse
|
||||
from app.config import settings
|
||||
from app.db.session import get_engine
|
||||
from app.logging_config import get_logger
|
||||
from app.storage.minio_client import get_storage
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
router = APIRouter(tags=["health"])
|
||||
|
||||
|
||||
def _check_postgres() -> ComponentHealth:
    """Probe Postgres by opening a connection and running ``SELECT 1``.

    Returns:
        A ``postgres`` component whose status is ``"ok"`` when the query
        runs, or ``"error"`` with the exception text under
        ``detail["error"]`` when anything raises.
    """
    try:
        with get_engine().connect() as connection:
            connection.execute(text("SELECT 1"))
    except Exception as exc:  # noqa: BLE001
        # A health probe must report the failure, never propagate it.
        failure = {"error": str(exc)}
        return ComponentHealth(name="postgres", status="error", detail=failure)
    else:
        return ComponentHealth(name="postgres", status="ok")
|
||||
|
||||
|
||||
def _check_minio() -> ComponentHealth:
    """Probe object storage through the storage client's ``health()`` call.

    Returns:
        A ``minio`` component. The status is ``"ok"`` only when the dict
        returned by ``health()`` has ``status == "ok"``; that dict is passed
        through as ``detail``. If obtaining the client or calling
        ``health()`` raises, the status is ``"error"`` and the exception
        text is reported under ``detail["error"]``.
    """
    try:
        info: dict[str, Any] = get_storage().health()
    except Exception as exc:  # noqa: BLE001
        # Match the sibling probes: a failing backend must surface as an
        # "error" component instead of turning /health into a 500.
        return ComponentHealth(name="minio", status="error", detail={"error": str(exc)})
    if info.get("status") == "ok":
        return ComponentHealth(name="minio", status="ok", detail=info)
    return ComponentHealth(name="minio", status="error", detail=info)
|
||||
|
||||
|
||||
def _check_opensearch() -> ComponentHealth:
    """Probe OpenSearch via the cluster health API.

    Returns:
        An ``opensearch`` component. A cluster status of ``"green"`` or
        ``"yellow"`` maps to ``"ok"``; any other value maps to
        ``"degraded"``. The raw cluster status and node count are included
        in ``detail``. Any exception (including a failed import of the
        client module) yields ``"error"`` with the exception text.
    """
    try:
        # Imported lazily so a broken client module is reported by the probe.
        from app.indexing.opensearch_client import get_opensearch

        report = get_opensearch().cluster.health()
        colour = report.get("status")
        if colour in ("green", "yellow"):
            verdict = "ok"
        else:
            verdict = "degraded"
        summary = {"cluster_status": colour, "nodes": report.get("number_of_nodes")}
        return ComponentHealth(
            name="opensearch",
            status=verdict,  # type: ignore[arg-type]
            detail=summary,
        )
    except Exception as exc:  # noqa: BLE001
        return ComponentHealth(name="opensearch", status="error", detail={"error": str(exc)})
|
||||
|
||||
|
||||
def _check_qdrant() -> ComponentHealth:
    """Probe Qdrant by listing its collections.

    Returns:
        A ``qdrant`` component with status ``"ok"`` and the collection names
        under ``detail["collections"]``, or ``"error"`` with the exception
        text when the import, connection or listing fails.
    """
    try:
        # Imported lazily so a broken client module is reported by the probe.
        from app.indexing.qdrant_client import get_qdrant

        listing = get_qdrant().get_collections()
        names = [collection.name for collection in listing.collections]
        return ComponentHealth(name="qdrant", status="ok", detail={"collections": names})
    except Exception as exc:  # noqa: BLE001
        return ComponentHealth(name="qdrant", status="error", detail={"error": str(exc)})
|
||||
|
||||
|
||||
def _check_redis() -> ComponentHealth:
    """Probe Redis with a ``PING`` against ``settings.redis_url``.

    A dedicated client is created for the probe and closed again before
    returning, so repeated health checks do not accumulate open connections.

    Returns:
        A ``redis`` component with status ``"ok"`` when the ping succeeds,
        or ``"error"`` with the exception text under ``detail["error"]``.
    """
    try:
        import redis

        client = redis.Redis.from_url(
            settings.redis_url,
            socket_connect_timeout=2,
            # Bound the PING itself too; without a read timeout an accepted
            # but unresponsive connection would block the health endpoint.
            socket_timeout=2,
        )
        try:
            client.ping()
        finally:
            # Release the connection pool created for this one-off probe.
            client.close()
        return ComponentHealth(name="redis", status="ok")
    except Exception as exc:  # noqa: BLE001
        return ComponentHealth(name="redis", status="error", detail={"error": str(exc)})
|
||||
|
||||
|
||||
@router.get("/health", response_model=HealthResponse)
def health() -> HealthResponse:
    """Run every backend probe and aggregate the results.

    Probes run sequentially in the order Postgres, MinIO, OpenSearch,
    Qdrant, Redis.

    Returns:
        A ``HealthResponse`` carrying the application version, the
        individual component results, and an overall status: ``"error"`` if
        any component reports ``"error"``, otherwise ``"degraded"`` if any
        reports ``"degraded"``, otherwise ``"ok"``.
    """
    probes = (
        _check_postgres,
        _check_minio,
        _check_opensearch,
        _check_qdrant,
        _check_redis,
    )
    components = [probe() for probe in probes]

    # Worst status wins: error > degraded > ok.
    seen = {component.status for component in components}
    if "error" in seen:
        overall = "error"
    elif "degraded" in seen:
        overall = "degraded"
    else:
        overall = "ok"
    return HealthResponse(status=overall, version=__version__, components=components)  # type: ignore[arg-type]
|
||||
63
app/api/routes_ingestion.py
Normal file
63
app/api/routes_ingestion.py
Normal file
@@ -0,0 +1,63 @@
|
||||
"""Ingestion endpoints."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
|
||||
from app.api.schemas import IngestFolderRequest, IngestFolderResponse
|
||||
from app.logging_config import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/ingest", tags=["ingestion"])
|
||||
|
||||
|
||||
@router.post("/folder", response_model=IngestFolderResponse)
def ingest_folder(req: IngestFolderRequest) -> IngestFolderResponse:
    """Discover all PDFs under ``path`` and queue them for processing.

    The request returns immediately after the discovery pass. Per-document
    OCR / extraction / indexing happens asynchronously in Celery workers.

    Args:
        req: Folder path plus the ``recursive`` and ``force`` flags.

    Returns:
        Counts of discovered, queued, duplicate-skipped and invalid files,
        together with the freshly generated run id shared by all queued
        tasks.

    Raises:
        HTTPException: 400 when ``req.path`` does not exist or is not a
            directory.
    """
    folder = Path(req.path)
    if not (folder.exists() and folder.is_dir()):
        raise HTTPException(status_code=400, detail=f"Folder not found: {req.path}")

    # Lazy import - keeps module load light.
    from app.ingestion.scanner import discover_documents
    from app.workers.tasks import process_document

    run_id = uuid.uuid4()
    run_ref = str(run_id)
    seen = enqueued = duplicates = rejected = 0

    records = discover_documents(folder, recursive=req.recursive, force=req.force)
    for record in records:
        seen += 1
        if record.duplicate and not req.force:
            # Already known and not forced: count it, do not queue it.
            duplicates += 1
        elif not record.document_id:
            # No document id to hand to a worker; counted as invalid.
            rejected += 1
        else:
            process_document.delay(str(record.document_id), run_ref)
            enqueued += 1

    logger.info(
        "ingest.folder.queued",
        path=str(folder),
        discovered=seen,
        queued=enqueued,
        skipped_duplicates=duplicates,
        invalid=rejected,
        run_id=run_ref,
    )

    return IngestFolderResponse(
        run_id=run_id,
        discovered=seen,
        queued=enqueued,
        skipped_duplicates=duplicates,
        invalid_files=rejected,
    )
|
||||
16
app/api/routes_search.py
Normal file
16
app/api/routes_search.py
Normal file
@@ -0,0 +1,16 @@
|
||||
"""Search endpoint - lexical / semantic / hybrid."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter
|
||||
|
||||
from app.api.schemas import SearchRequest, SearchResponse
|
||||
|
||||
router = APIRouter(prefix="/search", tags=["search"])
|
||||
|
||||
|
||||
@router.post("", response_model=SearchResponse)
def search(req: SearchRequest) -> SearchResponse:
    """Execute a search request and return the result unchanged.

    The work is delegated to ``run_search``; this handler adds no logic of
    its own.
    """
    # Lazy import: the search module is only loaded on first use.
    from app.indexing.hybrid_search import run_search

    response = run_search(req)
    return response
|
||||
99
app/api/schemas.py
Normal file
99
app/api/schemas.py
Normal file
@@ -0,0 +1,99 @@
|
||||
"""Pydantic request/response schemas for the LegacyHUB API."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
# ---------------- Health ----------------
|
||||
|
||||
class ComponentHealth(BaseModel):
    """Result of probing a single backend component."""

    # Component identifier, e.g. "postgres" or "redis".
    name: str
    # Probe outcome; restricted to the three literal values.
    status: Literal["ok", "error", "degraded"]
    # Free-form diagnostic payload; defaults to a fresh empty dict.
    detail: dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class HealthResponse(BaseModel):
    """Aggregated health report returned by the health endpoint."""

    # Overall status across all components.
    status: Literal["ok", "error", "degraded"]
    # Application version string.
    version: str
    # Per-component probe results.
    components: list[ComponentHealth]
|
||||
|
||||
|
||||
# ---------------- Ingestion ----------------
|
||||
|
||||
class IngestFolderRequest(BaseModel):
    """Request body for queuing a folder of documents for ingestion."""

    # Must be non-empty: an empty string would become Path(""), which
    # resolves to the current working directory and pass the folder check.
    path: str = Field(..., min_length=1, description="Absolute path inside the API container")
    # Whether discovery descends into sub-folders.
    recursive: bool = True
    # When true, records flagged as duplicates are queued anyway.
    force: bool = False
|
||||
|
||||
|
||||
class IngestFolderResponse(BaseModel):
    """Summary of one folder discovery pass."""

    # Identifier generated for this ingestion run.
    run_id: uuid.UUID
    # Number of records yielded by discovery.
    discovered: int
    # Number of documents handed to the worker queue.
    queued: int
    # Number of records skipped because they were duplicates.
    skipped_duplicates: int
    # Number of records that could not be queued.
    invalid_files: int
|
||||
|
||||
|
||||
class DocumentSummary(BaseModel):
    """Compact description of a stored document."""

    id: uuid.UUID
    original_file_name: str
    source_path: str
    # Presumably the hex digest of the file content - verify against producer.
    sha256: str
    status: str
    file_size_bytes: int
    created_at: datetime
|
||||
|
||||
|
||||
# ---------------- Search ----------------
|
||||
|
||||
# Allowed retrieval strategies accepted by the search API.
SearchMode = Literal["lexical", "semantic", "hybrid"]
|
||||
|
||||
|
||||
class SearchFilters(BaseModel):
    """Optional filters for a search; every field defaults to ``None``."""

    document_id: uuid.UUID | None = None
    source_path: str | None = None
    block_type: str | None = None
    # When given, must lie within [0.0, 1.0].
    min_ocr_confidence: float | None = Field(default=None, ge=0.0, le=1.0)
|
||||
|
||||
|
||||
class SearchRequest(BaseModel):
    """Request body for the search endpoint."""

    # Query text; must contain at least one character.
    query: str = Field(..., min_length=1)
    # Maximum number of hits, between 1 and 100 (default 10).
    limit: int = Field(default=10, ge=1, le=100)
    # Filters; defaults to a fresh SearchFilters with nothing set.
    filters: SearchFilters = Field(default_factory=SearchFilters)
    # Retrieval strategy; hybrid unless stated otherwise.
    search_mode: SearchMode = "hybrid"
|
||||
|
||||
|
||||
class Citation(BaseModel):
    """Pointer from a search hit back to its place in a PDF."""

    pdf: str
    page: int
    # Optional identifiers; each defaults to None when not applicable.
    block_id: str | None = None
    table_id: str | None = None
    figure_id: str | None = None
|
||||
|
||||
|
||||
class SearchHit(BaseModel):
    """A single ranked search result."""

    rank: int
    score: float
    document_id: uuid.UUID
    chunk_id: uuid.UUID
    original_file_name: str
    source_path: str
    page_number: int
    block_type: str
    text: str
    citation: Citation
    # Free-form dictionaries; each defaults to a fresh empty dict.
    quality_flags: dict[str, Any] = Field(default_factory=dict)
    metadata: dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class SearchResponse(BaseModel):
    """Response body for the search endpoint."""

    query: str
    # One of the SearchMode literals.
    mode: SearchMode
    total_candidates: int
    # Presumably true when a reranking step was applied - confirm with run_search.
    reranked: bool
    results: list[SearchHit]
|
||||
Reference in New Issue
Block a user