- app/indexing/qdrant_client.py: remove the identity-only _qid()
helper and pass chunk_id straight to PointStruct (Qdrant accepts
the UUID string directly).
- services/types.ts: SearchHit gets an explicit, optional
ocr_confidence field so consumers can type the value instead of
casting through metadata.
- widgets/SearchResultCard.tsx: replace the
(hit.metadata as { ocr_confidence? }) cast with the new field. No
behavior change when the backend omits it.
tsc --noEmit: clean.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
99 lines
3.0 KiB
Python
99 lines
3.0 KiB
Python
"""Qdrant client + collection bootstrap + chunk upsert."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from functools import lru_cache
|
|
from typing import Any, Sequence
|
|
|
|
from qdrant_client import QdrantClient
|
|
from qdrant_client.http import models as qm
|
|
|
|
from app.config import settings
|
|
from app.logging_config import get_logger
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
DENSE_VECTOR_NAME = "dense"
|
|
|
|
|
|
@lru_cache(maxsize=1)
def get_qdrant() -> QdrantClient:
    """Return the shared Qdrant client, building it on the first call.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    client constructed on the first call is handed back on later calls.
    Connection parameters are read from ``settings``.
    """
    connection = {
        "host": settings.qdrant_host,
        "port": settings.qdrant_port,
        # A falsy key (e.g. an empty string) is normalised to None.
        "api_key": settings.qdrant_api_key or None,
        "timeout": 60,
    }
    return QdrantClient(**connection)
|
|
|
|
|
|
def ensure_collection(collection: str | None = None, dim: int | None = None) -> None:
    """Create the chunk collection and its payload indexes when it is absent.

    Args:
        collection: Collection name; a falsy value falls back to
            ``settings.qdrant_collection_chunks``.
        dim: Size of the dense vector; a falsy value (``None`` or ``0``)
            falls back to ``settings.embedding_dim``.

    If a collection with the resolved name is already listed by the server,
    the function logs and returns without touching it. Payload indexes are
    therefore only created on the path that also creates the collection.
    """
    target = collection or settings.qdrant_collection_chunks
    size = dim or settings.embedding_dim
    qdrant = get_qdrant()

    known_names = {entry.name for entry in qdrant.get_collections().collections}
    if target in known_names:
        logger.debug("qdrant.collection.exists", collection=target)
        return

    logger.info("qdrant.collection.create", collection=target, dim=size)
    dense_params = qm.VectorParams(size=size, distance=qm.Distance.COSINE)
    qdrant.create_collection(
        collection_name=target,
        vectors_config={DENSE_VECTOR_NAME: dense_params},
        optimizers_config=qm.OptimizersConfigDiff(default_segment_number=2),
    )

    # Payload indexes for filtering, created in this order.
    index_plan = (
        ("document_id", qm.PayloadSchemaType.KEYWORD),
        ("source_path", qm.PayloadSchemaType.KEYWORD),
        ("block_type", qm.PayloadSchemaType.KEYWORD),
        ("page_number", qm.PayloadSchemaType.INTEGER),
        ("ocr_confidence", qm.PayloadSchemaType.FLOAT),
    )
    for payload_field, schema in index_plan:
        qdrant.create_payload_index(
            collection_name=target,
            field_name=payload_field,
            field_schema=schema,
        )
|
|
|
|
|
|
def upsert_chunks(
    points: Sequence[tuple[str, list[float], dict[str, Any]]],
    collection: str | None = None,
) -> int:
    """Submit ``(chunk_id, vector, payload)`` triples to Qdrant as one upsert.

    Args:
        points: Triples to store. The chunk id is used as the point id, the
            vector is stored under ``DENSE_VECTOR_NAME``, and the payload is
            copied with a ``"chunk_id"`` key set to the chunk id.
        collection: Collection name; a falsy value falls back to
            ``settings.qdrant_collection_chunks``.

    Returns:
        The number of points handed to the client, or ``0`` when *points* is
        empty (no request is made in that case). The upsert is issued with
        ``wait=False``, so the value is the size of the submitted batch.
    """
    target = collection or settings.qdrant_collection_chunks
    if not points:
        return 0

    batch = []
    for chunk_id, vector, payload in points:
        # The input payload is left untouched; chunk_id is set on the copy.
        enriched = {**payload, "chunk_id": chunk_id}
        batch.append(
            qm.PointStruct(
                id=chunk_id,
                vector={DENSE_VECTOR_NAME: vector},
                payload=enriched,
            )
        )

    get_qdrant().upsert(collection_name=target, points=batch, wait=False)
    return len(batch)
|
|
|
|
|
|
def delete_by_document(document_id: str, collection: str | None = None) -> int:
    """Delete the points whose ``document_id`` payload matches *document_id*.

    Args:
        document_id: Value matched against the ``document_id`` payload key.
        collection: Collection name; a falsy value falls back to
            ``settings.qdrant_collection_chunks``.

    Returns:
        Always ``1``. The value is a constant and does not report how many
        points were removed.
    """
    target = collection or settings.qdrant_collection_chunks
    qdrant = get_qdrant()

    same_document = qm.FieldCondition(
        key="document_id",
        match=qm.MatchValue(value=document_id),
    )
    selector = qm.FilterSelector(filter=qm.Filter(must=[same_document]))
    qdrant.delete(collection_name=target, points_selector=selector)

    # NOTE(review): constant return, not a deletion count — confirm callers
    # do not treat it as one.
    return 1
|