# LegacyHUB/app/indexing/opensearch_client.py

"""OpenSearch client + index bootstrap + chunk indexing helpers."""

from __future__ import annotations

from functools import lru_cache
from typing import Any, Iterable

from opensearchpy import OpenSearch, RequestsHttpConnection
from opensearchpy.helpers import bulk

from app.config import settings
from app.logging_config import get_logger

logger = get_logger(__name__)

# Index settings: three custom analyzers (ru_analyzer, en_analyzer, code_analyzer).
# ``text`` is indexed with ``code_analyzer`` (lowercase only, no stop words or stemming)
# and additionally through the ``.ru`` and ``.en`` multi-fields, so we can boost per
# language at query time.
INDEX_SETTINGS: dict[str, Any] = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": {
            # Token filters referenced by the custom analyzers below.
            "filter": {
                "ru_stop": {"type": "stop", "stopwords": "_russian_"},
                "ru_stemmer": {"type": "stemmer", "language": "russian"},
                "en_stop": {"type": "stop", "stopwords": "_english_"},
                "en_stemmer": {"type": "stemmer", "language": "english"},
            },
            "analyzer": {
                "ru_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "ru_stop", "ru_stemmer"],
                },
                "en_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "en_stop", "en_stemmer"],
                },
                # Language-neutral analyzer: tokens are only lowercased.
                "code_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase"],
                },
            },
        },
    },
    "mappings": {
        # Unknown top-level fields are rejected instead of being auto-mapped.
        "dynamic": "strict",
        "properties": {
            "chunk_id": {"type": "keyword"},
            "document_id": {"type": "keyword"},
            "source_path": {"type": "keyword"},
            "original_file_name": {
                "type": "text",
                "fields": {"keyword": {"type": "keyword", "ignore_above": 512}},
            },
            "page_number": {"type": "integer"},
            "block_type": {"type": "keyword"},
            "block_id": {"type": "keyword"},
            "text": {
                "type": "text",
                "analyzer": "code_analyzer",
                "fields": {
                    "ru": {"type": "text", "analyzer": "ru_analyzer"},
                    "en": {"type": "text", "analyzer": "en_analyzer"},
                },
            },
            "normalized_text": {
                "type": "text",
                "analyzer": "code_analyzer",
            },
            "ocr_confidence": {"type": "float"},
            "language_hint": {"type": "keyword"},
            # Inner objects inherit ``dynamic`` from their parent. These two objects
            # declare no sub-properties, so under the root "strict" setting any key
            # inside them would be rejected at index time. Re-enable dynamic mapping
            # for them explicitly so free-form content is accepted.
            "metadata": {"type": "object", "enabled": True, "dynamic": True},
            "quality_flags": {"type": "object", "enabled": True, "dynamic": True},
            "created_at": {"type": "date"},
        },
    },
}


@lru_cache(maxsize=1)
def get_opensearch() -> OpenSearch:
    """Build the OpenSearch client from ``settings``; the instance is cached and reused.

    HTTP basic auth is attached only when both ``opensearch_user`` and
    ``opensearch_password`` are set; otherwise the client is created without
    credentials.
    """
    use_basic_auth = bool(settings.opensearch_user and settings.opensearch_password)
    credentials = (
        (settings.opensearch_user, settings.opensearch_password) if use_basic_auth else None
    )
    endpoint = {"host": settings.opensearch_host, "port": settings.opensearch_port}
    return OpenSearch(
        hosts=[endpoint],
        http_auth=credentials,
        use_ssl=settings.opensearch_use_ssl,
        verify_certs=settings.opensearch_verify_certs,
        ssl_show_warn=False,
        connection_class=RequestsHttpConnection,
        timeout=30,
        max_retries=3,
        retry_on_timeout=True,
    )


def ensure_index(index: str | None = None) -> None:
    """Create the chunk index with ``INDEX_SETTINGS`` unless it already exists.

    Args:
        index: Index name; falls back to ``settings.opensearch_index_chunks``
            when omitted or empty.
    """
    target = index or settings.opensearch_index_chunks
    os_client = get_opensearch()
    if not os_client.indices.exists(index=target):
        logger.info("opensearch.index.create", index=target)
        os_client.indices.create(index=target, body=INDEX_SETTINGS)
        return
    # Nothing to do: the index is already there.
    logger.debug("opensearch.index.exists", index=target)


def index_chunks(docs: Iterable[dict[str, Any]], index: str | None = None) -> tuple[int, int]:
    """Bulk-index chunk documents, keyed by their ``chunk_id``.

    Each document is sent as an ``index`` action whose ``_id`` is the document's
    ``chunk_id``, so re-sending a chunk overwrites the stored copy.

    Args:
        docs: Chunk documents; every one must contain a ``chunk_id`` key.
        index: Target index; falls back to ``settings.opensearch_index_chunks``
            when omitted or empty.

    Returns:
        ``(success, errors)`` — the number of actions that succeeded and the
        number of per-item errors reported by the bulk helper. ``(0, 0)`` when
        ``docs`` is empty (no request is made).
    """
    target = index or settings.opensearch_index_chunks
    bulk_actions: list[dict[str, Any]] = [
        {
            "_op_type": "index",
            "_index": target,
            "_id": doc["chunk_id"],
            "_source": doc,
        }
        for doc in docs
    ]
    if not bulk_actions:
        return 0, 0

    # raise_on_error=False: per-item failures are collected and returned rather than raised.
    ok_count, failures = bulk(
        get_opensearch(),
        bulk_actions,
        raise_on_error=False,
        request_timeout=120,
    )
    if failures:
        logger.warning("opensearch.bulk.errors", count=len(failures))
    failed_count = len(failures) if isinstance(failures, list) else 0
    return ok_count, failed_count


def delete_by_document(document_id: str, index: str | None = None) -> int:
    """Delete every chunk whose ``document_id`` matches and return how many were removed.

    Args:
        document_id: Value matched against the ``document_id`` field (term query).
        index: Target index; falls back to ``settings.opensearch_index_chunks``
            when omitted or empty.

    Returns:
        The ``deleted`` count from the delete-by-query response, or ``0`` when
        the index does not exist or the response carries no such count.
    """
    target = index or settings.opensearch_index_chunks
    os_client = get_opensearch()
    if os_client.indices.exists(index=target):
        by_document = {"query": {"term": {"document_id": document_id}}}
        # The request is sent with refresh=True.
        response = os_client.delete_by_query(index=target, body=by_document, refresh=True)
        return int(response.get("deleted", 0))
    return 0