Files
LegacyHUB/app/indexing/opensearch_client.py
Vadim Malanov 7f72171572 chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 16:41:50 +03:00

143 lines
4.9 KiB
Python

"""OpenSearch client + index bootstrap + chunk indexing helpers."""
from __future__ import annotations
from functools import lru_cache
from typing import Any, Iterable
from opensearchpy import OpenSearch, RequestsHttpConnection
from opensearchpy.helpers import bulk
from app.config import settings
from app.logging_config import get_logger
logger = get_logger(__name__)
# Index definition (settings + mappings) passed to ``indices.create``.
#
# Analysis: three custom analyzers, all built on the ``standard`` tokenizer
# with lowercasing:
#   * ``ru_analyzer``   - adds Russian stop-word removal and stemming
#   * ``en_analyzer``   - adds English stop-word removal and stemming
#   * ``code_analyzer`` - lowercase only
#
# ``text`` is analyzed with ``code_analyzer`` and is additionally indexed as
# the ``text.ru`` / ``text.en`` sub-fields, so a query can target or boost a
# language-specific variant.
# NOTE(review): no ``.raw`` sub-field is defined on ``text``; add one if
# unanalyzed matching on the chunk text is expected.
_CHUNK_ANALYSIS: dict[str, Any] = {
    "filter": {
        "ru_stop": {"type": "stop", "stopwords": "_russian_"},
        "ru_stemmer": {"type": "stemmer", "language": "russian"},
        "en_stop": {"type": "stop", "stopwords": "_english_"},
        "en_stemmer": {"type": "stemmer", "language": "english"},
    },
    "analyzer": {
        "ru_analyzer": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["lowercase", "ru_stop", "ru_stemmer"],
        },
        "en_analyzer": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["lowercase", "en_stop", "en_stemmer"],
        },
        "code_analyzer": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["lowercase"],
        },
    },
}

# Field mappings for one chunk document.
# NOTE(review): the root mapping below is ``dynamic: strict`` while
# ``metadata`` and ``quality_flags`` declare no properties; confirm that
# payloads with arbitrary keys under those objects are accepted by the cluster.
_CHUNK_PROPERTIES: dict[str, Any] = {
    "chunk_id": {"type": "keyword"},
    "document_id": {"type": "keyword"},
    "source_path": {"type": "keyword"},
    "original_file_name": {
        "type": "text",
        "fields": {"keyword": {"type": "keyword", "ignore_above": 512}},
    },
    "page_number": {"type": "integer"},
    "block_type": {"type": "keyword"},
    "block_id": {"type": "keyword"},
    "text": {
        "type": "text",
        "analyzer": "code_analyzer",
        "fields": {
            "ru": {"type": "text", "analyzer": "ru_analyzer"},
            "en": {"type": "text", "analyzer": "en_analyzer"},
        },
    },
    "normalized_text": {
        "type": "text",
        "analyzer": "code_analyzer",
    },
    "ocr_confidence": {"type": "float"},
    "language_hint": {"type": "keyword"},
    "metadata": {"type": "object", "enabled": True},
    "quality_flags": {"type": "object", "enabled": True},
    "created_at": {"type": "date"},
}

INDEX_SETTINGS: dict[str, Any] = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": _CHUNK_ANALYSIS,
    },
    "mappings": {
        "dynamic": "strict",
        "properties": _CHUNK_PROPERTIES,
    },
}
@lru_cache(maxsize=1)
def get_opensearch() -> OpenSearch:
    """Build the OpenSearch client from ``settings``.

    The function takes no arguments and is wrapped in ``lru_cache(maxsize=1)``,
    so the client is constructed on the first call and the same instance is
    returned afterwards.

    Returns:
        An ``OpenSearch`` client using ``RequestsHttpConnection``.
    """
    # Basic auth is sent only when both user and password are configured.
    credentials = (
        (settings.opensearch_user, settings.opensearch_password)
        if settings.opensearch_user and settings.opensearch_password
        else None
    )
    node = {"host": settings.opensearch_host, "port": settings.opensearch_port}
    return OpenSearch(
        hosts=[node],
        http_auth=credentials,
        use_ssl=settings.opensearch_use_ssl,
        verify_certs=settings.opensearch_verify_certs,
        ssl_show_warn=False,
        connection_class=RequestsHttpConnection,
        timeout=30,
        max_retries=3,
        retry_on_timeout=True,
    )
def ensure_index(index: str | None = None) -> None:
    """Create the index with ``INDEX_SETTINGS`` unless it already exists.

    Args:
        index: Index name. Falls back to ``settings.opensearch_index_chunks``
            when omitted or empty.
    """
    target = index or settings.opensearch_index_chunks
    es = get_opensearch()
    if not es.indices.exists(index=target):
        logger.info("opensearch.index.create", index=target)
        es.indices.create(index=target, body=INDEX_SETTINGS)
        return
    # Nothing to do: the index is already present.
    logger.debug("opensearch.index.exists", index=target)
def index_chunks(docs: Iterable[dict[str, Any]], index: str | None = None) -> tuple[int, int]:
    """Bulk-index chunk documents, keyed by their ``chunk_id``.

    Each document is sent as an ``index`` action whose ``_id`` is the
    document's ``chunk_id`` and whose ``_source`` is the document itself.

    Args:
        docs: Chunk documents; every one must contain a ``chunk_id`` key.
        index: Target index. Falls back to ``settings.opensearch_index_chunks``
            when omitted or empty.

    Returns:
        ``(success, errors)``: the success count reported by ``bulk`` and the
        number of per-item errors. ``(0, 0)`` when ``docs`` is empty.

    Raises:
        KeyError: If a document has no ``chunk_id``.
    """
    target = index or settings.opensearch_index_chunks
    payload: list[dict[str, Any]] = [
        {
            "_op_type": "index",
            "_index": target,
            "_id": doc["chunk_id"],
            "_source": doc,
        }
        for doc in docs
    ]
    # Empty input: return before a client is requested.
    if not payload:
        return 0, 0

    # raise_on_error=False: per-item failures are returned, not raised.
    succeeded, failures = bulk(
        get_opensearch(),
        payload,
        raise_on_error=False,
        request_timeout=120,
    )
    if failures:
        logger.warning("opensearch.bulk.errors", count=len(failures))
    failed = len(failures) if isinstance(failures, list) else 0
    return succeeded, failed
def delete_by_document(document_id: str, index: str | None = None) -> int:
    """Delete all chunks whose ``document_id`` field equals ``document_id``.

    Args:
        document_id: Value matched with a ``term`` query on ``document_id``.
        index: Target index. Falls back to ``settings.opensearch_index_chunks``
            when omitted or empty.

    Returns:
        The ``deleted`` count from the delete-by-query response, or ``0`` when
        the index does not exist or the response has no ``deleted`` key.
    """
    target = index or settings.opensearch_index_chunks
    es = get_opensearch()
    if es.indices.exists(index=target):
        term_query = {"query": {"term": {"document_id": document_id}}}
        # The request is issued with refresh=True.
        response = es.delete_by_query(index=target, body=term_query, refresh=True)
        return int(response.get("deleted", 0))
    return 0