Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
104 lines
3.1 KiB
Python
104 lines
3.1 KiB
Python
"""Qdrant client + collection bootstrap + chunk upsert."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from functools import lru_cache
|
|
from typing import Any, Sequence
|
|
|
|
from qdrant_client import QdrantClient
|
|
from qdrant_client.http import models as qm
|
|
|
|
from app.config import settings
|
|
from app.logging_config import get_logger
|
|
|
|
# Module-level logger from the project's logging_config helper; the calls in
# this module pass an event name plus keyword fields.
logger = get_logger(__name__)
|
|
|
|
# Name of the named vector: used as the key in the collection's vectors_config
# and as the key of each upserted point's vector dict, so both must agree.
DENSE_VECTOR_NAME = "dense"
|
|
|
|
|
|
@lru_cache(maxsize=1)
def get_qdrant() -> QdrantClient:
    """Return the shared Qdrant client, building it on the first call.

    The function takes no arguments and is wrapped in ``lru_cache(maxsize=1)``,
    so the client is constructed once and the same instance is handed back on
    every later call.

    Returns:
        A ``QdrantClient`` configured from ``settings`` (host, port, API key)
        with a fixed timeout of 60.
    """
    connection_kwargs = {
        "host": settings.qdrant_host,
        "port": settings.qdrant_port,
        # A falsy key (e.g. empty string) is normalized to None.
        "api_key": settings.qdrant_api_key or None,
        "timeout": 60,
    }
    return QdrantClient(**connection_kwargs)
|
|
|
|
|
|
def ensure_collection(collection: str | None = None, dim: int | None = None) -> None:
    """Create the chunk collection and its payload indexes when it is missing.

    Args:
        collection: Collection name. A falsy value falls back to
            ``settings.qdrant_collection_chunks``.
        dim: Size of the dense vector. A falsy value falls back to
            ``settings.embedding_dim``.

    When a collection with that name is already listed by the server, the
    function only logs and returns; neither the collection nor its payload
    indexes are touched.
    """
    name = collection or settings.qdrant_collection_chunks
    vector_size = dim or settings.embedding_dim
    client = get_qdrant()

    listed = client.get_collections().collections
    if any(info.name == name for info in listed):
        logger.debug("qdrant.collection.exists", collection=name)
        return

    logger.info("qdrant.collection.create", collection=name, dim=vector_size)
    dense_params = qm.VectorParams(size=vector_size, distance=qm.Distance.COSINE)
    client.create_collection(
        collection_name=name,
        vectors_config={DENSE_VECTOR_NAME: dense_params},
        optimizers_config=qm.OptimizersConfigDiff(default_segment_number=2),
    )

    # Payload indexes for filtering, created in the order listed here.
    payload_indexes = (
        ("document_id", qm.PayloadSchemaType.KEYWORD),
        ("source_path", qm.PayloadSchemaType.KEYWORD),
        ("block_type", qm.PayloadSchemaType.KEYWORD),
        ("page_number", qm.PayloadSchemaType.INTEGER),
        ("ocr_confidence", qm.PayloadSchemaType.FLOAT),
    )
    for indexed_field, schema in payload_indexes:
        client.create_payload_index(
            collection_name=name,
            field_name=indexed_field,
            field_schema=schema,
        )
|
|
|
|
|
|
def upsert_chunks(
    points: Sequence[tuple[str, list[float], dict[str, Any]]],
    collection: str | None = None,
) -> int:
    """Submit ``(chunk_id, vector, payload)`` triples to Qdrant as one upsert.

    Args:
        points: Triples of chunk id, dense vector, and payload dict. Each
            payload is copied and gets a ``"chunk_id"`` entry set to the
            triple's chunk id (overriding any existing entry of that name).
        collection: Target collection. A falsy value falls back to
            ``settings.qdrant_collection_chunks``.

    Returns:
        The number of points handed to the client, or ``0`` (without
        contacting Qdrant) when ``points`` is empty.
    """
    name = collection or settings.qdrant_collection_chunks
    if not points:
        return 0

    structs = []
    for chunk_id, vector, payload in points:
        enriched_payload = {**payload, "chunk_id": chunk_id}
        structs.append(
            qm.PointStruct(
                id=_qid(chunk_id),
                vector={DENSE_VECTOR_NAME: vector},
                payload=enriched_payload,
            )
        )

    # wait=False is passed through to the client, so the returned count is
    # what was submitted in this call.
    get_qdrant().upsert(collection_name=name, points=structs, wait=False)
    return len(structs)
|
|
|
|
|
|
def delete_by_document(document_id: str, collection: str | None = None) -> int:
    """Delete the points whose ``document_id`` payload field equals the given id.

    Args:
        document_id: Value matched against the ``document_id`` payload field.
        collection: Target collection. A falsy value falls back to
            ``settings.qdrant_collection_chunks``.

    Returns:
        Always ``1``. The value is a constant and does not reflect how many
        points were removed.
    """
    name = collection or settings.qdrant_collection_chunks
    client = get_qdrant()

    same_document = qm.FieldCondition(
        key="document_id",
        match=qm.MatchValue(value=document_id),
    )
    selector = qm.FilterSelector(filter=qm.Filter(must=[same_document]))
    client.delete(collection_name=name, points_selector=selector)
    return 1
|
|
|
|
|
|
def _qid(chunk_id: str) -> str:
|
|
"""Qdrant accepts UUID strings or unsigned ints. Chunks are UUIDs already."""
|
|
return chunk_id
|