chore: bootstrap repository with governance docs

Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Vadim Malanov
2026-05-13 16:41:50 +03:00
commit 7f72171572
157 changed files with 11298 additions and 0 deletions

View File

@@ -0,0 +1,103 @@
"""Qdrant client + collection bootstrap + chunk upsert."""
from __future__ import annotations
from functools import lru_cache
from typing import Any, Sequence
from qdrant_client import QdrantClient
from qdrant_client.http import models as qm
from app.config import settings
from app.logging_config import get_logger
# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)
# Name of the named vector in the collection. The same key is used when the
# collection is created (ensure_collection) and when points are built
# (upsert_chunks), so the two must stay in sync.
DENSE_VECTOR_NAME = "dense"
@lru_cache(maxsize=1)
def get_qdrant() -> QdrantClient:
    """Return the shared Qdrant client, building it on first use.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    client is constructed once and the same instance is handed back on every
    later call. Connection parameters come from ``settings``; a falsy API key
    is passed on as ``None``.
    """
    connection = {
        "host": settings.qdrant_host,
        "port": settings.qdrant_port,
        "api_key": settings.qdrant_api_key or None,
        "timeout": 60,
    }
    return QdrantClient(**connection)
def ensure_collection(collection: str | None = None, dim: int | None = None) -> None:
    """Create the chunk collection and its payload indexes when it is missing.

    Args:
        collection: Collection name; a falsy value falls back to
            ``settings.qdrant_collection_chunks``.
        dim: Size of the dense vector; a falsy value falls back to
            ``settings.embedding_dim``.

    If a collection with that name is already listed by the server, the
    function logs at debug level and returns without modifying anything, so
    the payload indexes below are only created together with a new collection.
    """
    target = collection or settings.qdrant_collection_chunks
    size = dim or settings.embedding_dim
    qdrant = get_qdrant()

    listed = qdrant.get_collections().collections
    if any(item.name == target for item in listed):
        logger.debug("qdrant.collection.exists", collection=target)
        return

    logger.info("qdrant.collection.create", collection=target, dim=size)
    dense_params = qm.VectorParams(size=size, distance=qm.Distance.COSINE)
    qdrant.create_collection(
        collection_name=target,
        vectors_config={DENSE_VECTOR_NAME: dense_params},
        optimizers_config=qm.OptimizersConfigDiff(default_segment_number=2),
    )

    # Payload indexes for filtering, created in this order.
    keyword = qm.PayloadSchemaType.KEYWORD
    indexed_fields = (
        ("document_id", keyword),
        ("source_path", keyword),
        ("block_type", keyword),
        ("page_number", qm.PayloadSchemaType.INTEGER),
        ("ocr_confidence", qm.PayloadSchemaType.FLOAT),
    )
    for field_name, schema in indexed_fields:
        qdrant.create_payload_index(
            collection_name=target,
            field_name=field_name,
            field_schema=schema,
        )
def upsert_chunks(
    points: Sequence[tuple[str, list[float], dict[str, Any]]],
    collection: str | None = None,
) -> int:
    """Send ``(chunk_id, vector, payload)`` triples to Qdrant as an upsert.

    Args:
        points: Triples to store. Each vector is stored under the
            ``DENSE_VECTOR_NAME`` named vector, and ``chunk_id`` is written
            into the stored payload, overriding any ``"chunk_id"`` key the
            payload already carries.
        collection: Collection name; a falsy value falls back to
            ``settings.qdrant_collection_chunks``.

    Returns:
        The number of points submitted, or ``0`` when ``points`` is empty
        (no request is made in that case). The upsert is issued with
        ``wait=False``.
    """
    target = collection or settings.qdrant_collection_chunks
    if not points:
        return 0

    batch = []
    for chunk_id, vector, payload in points:
        stored_payload = {**payload, "chunk_id": chunk_id}
        batch.append(
            qm.PointStruct(
                id=_qid(chunk_id),
                vector={DENSE_VECTOR_NAME: vector},
                payload=stored_payload,
            )
        )

    get_qdrant().upsert(collection_name=target, points=batch, wait=False)
    return len(batch)
def delete_by_document(document_id: str, collection: str | None = None) -> int:
    """Delete every point whose ``document_id`` payload equals *document_id*.

    Args:
        document_id: Value matched against the ``document_id`` payload field.
        collection: Collection name; a falsy value falls back to
            ``settings.qdrant_collection_chunks``.

    Returns:
        Always ``1``. The value is a constant, not the number of points that
        were removed.
    """
    target = collection or settings.qdrant_collection_chunks
    qdrant = get_qdrant()

    same_document = qm.FieldCondition(
        key="document_id",
        match=qm.MatchValue(value=document_id),
    )
    selector = qm.FilterSelector(filter=qm.Filter(must=[same_document]))
    qdrant.delete(collection_name=target, points_selector=selector)
    return 1
def _qid(chunk_id: str) -> str:
"""Qdrant accepts UUID strings or unsigned ints. Chunks are UUIDs already."""
return chunk_id