"""Qdrant client + collection bootstrap + chunk upsert.""" from __future__ import annotations from functools import lru_cache from typing import Any, Sequence from qdrant_client import QdrantClient from qdrant_client.http import models as qm from app.config import settings from app.logging_config import get_logger logger = get_logger(__name__) DENSE_VECTOR_NAME = "dense" @lru_cache(maxsize=1) def get_qdrant() -> QdrantClient: return QdrantClient( host=settings.qdrant_host, port=settings.qdrant_port, api_key=settings.qdrant_api_key or None, timeout=60, ) def ensure_collection(collection: str | None = None, dim: int | None = None) -> None: name = collection or settings.qdrant_collection_chunks vector_size = dim or settings.embedding_dim client = get_qdrant() existing = {c.name for c in client.get_collections().collections} if name in existing: logger.debug("qdrant.collection.exists", collection=name) return logger.info("qdrant.collection.create", collection=name, dim=vector_size) client.create_collection( collection_name=name, vectors_config={ DENSE_VECTOR_NAME: qm.VectorParams( size=vector_size, distance=qm.Distance.COSINE, ) }, optimizers_config=qm.OptimizersConfigDiff(default_segment_number=2), ) # Payload indexes for filtering. for field in ("document_id", "source_path", "block_type"): client.create_payload_index( collection_name=name, field_name=field, field_schema=qm.PayloadSchemaType.KEYWORD, ) client.create_payload_index( collection_name=name, field_name="page_number", field_schema=qm.PayloadSchemaType.INTEGER, ) client.create_payload_index( collection_name=name, field_name="ocr_confidence", field_schema=qm.PayloadSchemaType.FLOAT, ) def upsert_chunks( points: Sequence[tuple[str, list[float], dict[str, Any]]], collection: str | None = None, ) -> int: """Upsert (chunk_id, vector, payload) triples. Returns count upserted.""" name = collection or settings.qdrant_collection_chunks if not points: return 0 qpoints = [ qm.PointStruct( id=chunk_id, vector={DENSE_VECTOR_NAME: vector}, payload={**payload, "chunk_id": chunk_id}, ) for chunk_id, vector, payload in points ] get_qdrant().upsert(collection_name=name, points=qpoints, wait=False) return len(qpoints) def delete_by_document(document_id: str, collection: str | None = None) -> int: name = collection or settings.qdrant_collection_chunks client = get_qdrant() client.delete( collection_name=name, points_selector=qm.FilterSelector( filter=qm.Filter( must=[qm.FieldCondition(key="document_id", match=qm.MatchValue(value=document_id))] ) ), ) return 1