Files
LegacyHUB/app/indexing/qdrant_client.py
Vadim Malanov 7f72171572 chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 16:41:50 +03:00

104 lines
3.1 KiB
Python

"""Qdrant client + collection bootstrap + chunk upsert."""
from __future__ import annotations
from functools import lru_cache
from typing import Any, Sequence
from qdrant_client import QdrantClient
from qdrant_client.http import models as qm
from app.config import settings
from app.logging_config import get_logger
# Module-level logger; calls in this module pass an event name plus keyword fields.
logger = get_logger(__name__)
# Name of the single named vector. It is the key in ``vectors_config`` when the
# collection is created and the key of each point's ``vector`` mapping on upsert.
DENSE_VECTOR_NAME = "dense"
@lru_cache(maxsize=1)
def get_qdrant() -> QdrantClient:
    """Return the shared Qdrant client, building it on the first call.

    Host, port and API key are read from ``settings``. Because the function
    takes no arguments and is wrapped in ``lru_cache(maxsize=1)``, the client
    is constructed once and the same instance is handed back afterwards.
    """
    connection_kwargs = {
        "host": settings.qdrant_host,
        "port": settings.qdrant_port,
        # A falsy key (e.g. empty string) is forwarded as None.
        "api_key": settings.qdrant_api_key or None,
        "timeout": 60,
    }
    return QdrantClient(**connection_kwargs)
def ensure_collection(collection: str | None = None, dim: int | None = None) -> None:
    """Create the chunk collection and its payload indexes when it is missing.

    Args:
        collection: Collection name. A falsy value selects
            ``settings.qdrant_collection_chunks``.
        dim: Size of the dense vector. A falsy value selects
            ``settings.embedding_dim``.

    If the server already lists a collection with that name, the function
    logs at debug level and returns without creating anything, payload
    indexes included.
    """
    target = collection if collection else settings.qdrant_collection_chunks
    size = dim if dim else settings.embedding_dim
    qdrant = get_qdrant()

    known_names = set()
    for info in qdrant.get_collections().collections:
        known_names.add(info.name)
    if target in known_names:
        logger.debug("qdrant.collection.exists", collection=target)
        return

    logger.info("qdrant.collection.create", collection=target, dim=size)
    dense_params = qm.VectorParams(size=size, distance=qm.Distance.COSINE)
    qdrant.create_collection(
        collection_name=target,
        vectors_config={DENSE_VECTOR_NAME: dense_params},
        optimizers_config=qm.OptimizersConfigDiff(default_segment_number=2),
    )

    # Payload indexes for filtering, created in this order.
    index_plan = (
        ("document_id", qm.PayloadSchemaType.KEYWORD),
        ("source_path", qm.PayloadSchemaType.KEYWORD),
        ("block_type", qm.PayloadSchemaType.KEYWORD),
        ("page_number", qm.PayloadSchemaType.INTEGER),
        ("ocr_confidence", qm.PayloadSchemaType.FLOAT),
    )
    for field_name, schema in index_plan:
        qdrant.create_payload_index(
            collection_name=target,
            field_name=field_name,
            field_schema=schema,
        )
def upsert_chunks(
    points: Sequence[tuple[str, list[float], dict[str, Any]]],
    collection: str | None = None,
) -> int:
    """Send ``(chunk_id, vector, payload)`` triples to Qdrant as an upsert.

    Args:
        points: Triples to write. Each payload is copied and gets a
            ``"chunk_id"`` entry set to the triple's chunk id.
        collection: Collection name. A falsy value selects
            ``settings.qdrant_collection_chunks``.

    Returns:
        The number of points submitted, or 0 when ``points`` is empty, in
        which case no client call is made. The request is issued with
        ``wait=False``.
    """
    target = collection if collection else settings.qdrant_collection_chunks
    if not points:
        return 0

    structs = []
    for chunk_id, vector, payload in points:
        # Copy so the caller's payload dict is not mutated.
        stored_payload = dict(payload)
        stored_payload["chunk_id"] = chunk_id
        structs.append(
            qm.PointStruct(
                id=_qid(chunk_id),
                vector={DENSE_VECTOR_NAME: vector},
                payload=stored_payload,
            )
        )

    get_qdrant().upsert(collection_name=target, points=structs, wait=False)
    return len(structs)
def delete_by_document(document_id: str, collection: str | None = None) -> int:
    """Delete every point whose ``document_id`` payload field matches.

    Args:
        document_id: Value matched against the ``document_id`` payload field.
        collection: Collection name. A falsy value selects
            ``settings.qdrant_collection_chunks``.

    Returns:
        Always ``1``. This is a constant, not a count of removed points.
    """
    target = collection if collection else settings.qdrant_collection_chunks
    qdrant = get_qdrant()

    same_document = qm.FieldCondition(
        key="document_id",
        match=qm.MatchValue(value=document_id),
    )
    selector = qm.FilterSelector(filter=qm.Filter(must=[same_document]))
    qdrant.delete(collection_name=target, points_selector=selector)
    return 1
def _qid(chunk_id: str) -> str:
"""Qdrant accepts UUID strings or unsigned ints. Chunks are UUIDs already."""
return chunk_id