"""OpenSearch client + index bootstrap + chunk indexing helpers.""" from __future__ import annotations from functools import lru_cache from typing import Any, Iterable from opensearchpy import OpenSearch, RequestsHttpConnection from opensearchpy.helpers import bulk from app.config import settings from app.logging_config import get_logger logger = get_logger(__name__) # Index settings: 3 analyzers (russian, english, standard). # We index ``text`` with multi-fields (.ru, .en, .raw) so we can boost per language at query time. INDEX_SETTINGS: dict[str, Any] = { "settings": { "number_of_shards": 1, "number_of_replicas": 0, "analysis": { "filter": { "ru_stop": {"type": "stop", "stopwords": "_russian_"}, "ru_stemmer": {"type": "stemmer", "language": "russian"}, "en_stop": {"type": "stop", "stopwords": "_english_"}, "en_stemmer": {"type": "stemmer", "language": "english"}, }, "analyzer": { "ru_analyzer": { "type": "custom", "tokenizer": "standard", "filter": ["lowercase", "ru_stop", "ru_stemmer"], }, "en_analyzer": { "type": "custom", "tokenizer": "standard", "filter": ["lowercase", "en_stop", "en_stemmer"], }, "code_analyzer": { "type": "custom", "tokenizer": "standard", "filter": ["lowercase"], }, }, }, }, "mappings": { "dynamic": "strict", "properties": { "chunk_id": {"type": "keyword"}, "document_id": {"type": "keyword"}, "source_path": {"type": "keyword"}, "original_file_name": { "type": "text", "fields": {"keyword": {"type": "keyword", "ignore_above": 512}}, }, "page_number": {"type": "integer"}, "block_type": {"type": "keyword"}, "block_id": {"type": "keyword"}, "text": { "type": "text", "analyzer": "code_analyzer", "fields": { "ru": {"type": "text", "analyzer": "ru_analyzer"}, "en": {"type": "text", "analyzer": "en_analyzer"}, }, }, "normalized_text": { "type": "text", "analyzer": "code_analyzer", }, "ocr_confidence": {"type": "float"}, "language_hint": {"type": "keyword"}, "metadata": {"type": "object", "enabled": True}, "quality_flags": {"type": "object", "enabled": True}, "created_at": {"type": "date"}, }, }, } @lru_cache(maxsize=1) def get_opensearch() -> OpenSearch: auth = None if settings.opensearch_user and settings.opensearch_password: auth = (settings.opensearch_user, settings.opensearch_password) return OpenSearch( hosts=[{"host": settings.opensearch_host, "port": settings.opensearch_port}], http_auth=auth, use_ssl=settings.opensearch_use_ssl, verify_certs=settings.opensearch_verify_certs, ssl_show_warn=False, connection_class=RequestsHttpConnection, timeout=30, max_retries=3, retry_on_timeout=True, ) def ensure_index(index: str | None = None) -> None: name = index or settings.opensearch_index_chunks client = get_opensearch() if client.indices.exists(index=name): logger.debug("opensearch.index.exists", index=name) return logger.info("opensearch.index.create", index=name) client.indices.create(index=name, body=INDEX_SETTINGS) def index_chunks(docs: Iterable[dict[str, Any]], index: str | None = None) -> tuple[int, int]: """Bulk-upsert chunks. Returns (success, errors).""" name = index or settings.opensearch_index_chunks actions: list[dict[str, Any]] = [] for d in docs: actions.append( { "_op_type": "index", "_index": name, "_id": d["chunk_id"], "_source": d, } ) if not actions: return 0, 0 success, errors = bulk(get_opensearch(), actions, raise_on_error=False, request_timeout=120) if errors: logger.warning("opensearch.bulk.errors", count=len(errors)) return success, len(errors) if isinstance(errors, list) else 0 def delete_by_document(document_id: str, index: str | None = None) -> int: name = index or settings.opensearch_index_chunks client = get_opensearch() if not client.indices.exists(index=name): return 0 res = client.delete_by_query( index=name, body={"query": {"term": {"document_id": document_id}}}, refresh=True, ) return int(res.get("deleted", 0))