Files
LegacyHUB/app/indexing/qdrant_client.py
Vadim Malanov f42fb978a8 chore: drop dead _qid helper and surface ocr_confidence on SearchHit
- app/indexing/qdrant_client.py: remove the identity-only _qid()
  helper and pass chunk_id straight to PointStruct (Qdrant accepts
  the UUID string directly).
- services/types.ts: SearchHit gets an explicit, optional
  ocr_confidence field so consumers can type the value instead of
  casting through metadata.
- widgets/SearchResultCard.tsx: replaces the
  (hit.metadata as { ocr_confidence? }) cast with the new field. No
  behavior change when the backend omits it.

tsc --noEmit: clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 16:55:32 +03:00

99 lines
3.0 KiB
Python

"""Qdrant client + collection bootstrap + chunk upsert."""
from __future__ import annotations
from functools import lru_cache
from typing import Any, Sequence
from qdrant_client import QdrantClient
from qdrant_client.http import models as qm
from app.config import settings
from app.logging_config import get_logger
# Module-level logger; events in this file are emitted with keyword fields.
logger = get_logger(__name__)
# Key of the named vector used both in the collection's ``vectors_config``
# and in the ``vector`` mapping of every upserted point.
DENSE_VECTOR_NAME = "dense"
@lru_cache(maxsize=1)
def get_qdrant() -> QdrantClient:
    """Return the shared Qdrant client, constructing it on the first call.

    The function takes no arguments and is wrapped in ``lru_cache(maxsize=1)``,
    so the client built by the first call is returned to every later caller.

    Returns:
        A ``QdrantClient`` configured from ``settings`` (host, port, API key).
    """
    host = settings.qdrant_host
    port = settings.qdrant_port
    # A falsy API key in settings (e.g. an empty string) is passed as None.
    api_key = settings.qdrant_api_key or None
    return QdrantClient(host=host, port=port, api_key=api_key, timeout=60)
def ensure_collection(collection: str | None = None, dim: int | None = None) -> None:
    """Create the chunk collection and its payload indexes when it is absent.

    Args:
        collection: Collection name. A falsy value falls back to
            ``settings.qdrant_collection_chunks``.
        dim: Size of the dense vector. A falsy value falls back to
            ``settings.embedding_dim``.

    If a collection with that name is already listed by the server, only a
    debug event is logged and nothing is created or modified.
    """
    name = collection or settings.qdrant_collection_chunks
    vector_size = dim or settings.embedding_dim
    client = get_qdrant()

    # Guard clause: leave an existing collection untouched.
    listed = client.get_collections().collections
    if any(entry.name == name for entry in listed):
        logger.debug("qdrant.collection.exists", collection=name)
        return

    logger.info("qdrant.collection.create", collection=name, dim=vector_size)
    dense_params = qm.VectorParams(size=vector_size, distance=qm.Distance.COSINE)
    client.create_collection(
        collection_name=name,
        vectors_config={DENSE_VECTOR_NAME: dense_params},
        optimizers_config=qm.OptimizersConfigDiff(default_segment_number=2),
    )

    # Payload indexes for filtering, created in this order.
    index_schemas = (
        ("document_id", qm.PayloadSchemaType.KEYWORD),
        ("source_path", qm.PayloadSchemaType.KEYWORD),
        ("block_type", qm.PayloadSchemaType.KEYWORD),
        ("page_number", qm.PayloadSchemaType.INTEGER),
        ("ocr_confidence", qm.PayloadSchemaType.FLOAT),
    )
    for field_name, schema in index_schemas:
        client.create_payload_index(
            collection_name=name,
            field_name=field_name,
            field_schema=schema,
        )
def upsert_chunks(
    points: Sequence[tuple[str, list[float], dict[str, Any]]],
    collection: str | None = None,
) -> int:
    """Upsert ``(chunk_id, vector, payload)`` triples into a Qdrant collection.

    Args:
        points: Triples of chunk id, dense vector and payload. The chunk id is
            used as the point id and is also written into the stored payload
            under the ``"chunk_id"`` key (overriding any value already there).
        collection: Target collection. A falsy value falls back to
            ``settings.qdrant_collection_chunks``.

    Returns:
        The number of points submitted; 0 when ``points`` is empty, in which
        case no request is made. The upsert is issued with ``wait=False``.
    """
    name = collection or settings.qdrant_collection_chunks
    if not points:
        return 0

    batch = []
    for chunk_id, vector, payload in points:
        # Copy so the caller's payload dict is never mutated.
        stored_payload = dict(payload)
        stored_payload["chunk_id"] = chunk_id
        batch.append(
            qm.PointStruct(
                id=chunk_id,
                vector={DENSE_VECTOR_NAME: vector},
                payload=stored_payload,
            )
        )

    get_qdrant().upsert(collection_name=name, points=batch, wait=False)
    return len(batch)
def delete_by_document(document_id: str, collection: str | None = None) -> int:
    """Delete all points whose ``document_id`` payload matches *document_id*.

    Args:
        document_id: Value matched against the ``"document_id"`` payload key.
        collection: Target collection. A falsy value falls back to
            ``settings.qdrant_collection_chunks``.

    Returns:
        Always ``1``. This is a constant, not the number of points removed.
    """
    name = collection or settings.qdrant_collection_chunks
    client = get_qdrant()

    same_document = qm.FieldCondition(
        key="document_id",
        match=qm.MatchValue(value=document_id),
    )
    selector = qm.FilterSelector(filter=qm.Filter(must=[same_document]))
    client.delete(collection_name=name, points_selector=selector)
    return 1