chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
142
app/indexing/opensearch_client.py
Normal file
142
app/indexing/opensearch_client.py
Normal file
@@ -0,0 +1,142 @@
|
||||
"""OpenSearch client + index bootstrap + chunk indexing helpers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from functools import lru_cache
|
||||
from typing import Any, Iterable
|
||||
|
||||
from opensearchpy import OpenSearch, RequestsHttpConnection
|
||||
from opensearchpy.helpers import bulk
|
||||
|
||||
from app.config import settings
|
||||
from app.logging_config import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
# Index settings: 3 custom analyzers (ru_analyzer, en_analyzer, code_analyzer).
# We index ``text`` with code_analyzer plus multi-fields (.ru, .en) so we can boost per language at query time.
|
||||
# Token filters: stop-word removal and stemming, one pair per language.
_TOKEN_FILTERS: dict[str, Any] = {
    "ru_stop": {"type": "stop", "stopwords": "_russian_"},
    "ru_stemmer": {"type": "stemmer", "language": "russian"},
    "en_stop": {"type": "stop", "stopwords": "_english_"},
    "en_stemmer": {"type": "stemmer", "language": "english"},
}

# Custom analyzers; all share the standard tokenizer and lowercase first.
# ``code_analyzer`` applies neither stop-word removal nor stemming.
_ANALYZERS: dict[str, Any] = {
    "ru_analyzer": {
        "type": "custom",
        "tokenizer": "standard",
        "filter": ["lowercase", "ru_stop", "ru_stemmer"],
    },
    "en_analyzer": {
        "type": "custom",
        "tokenizer": "standard",
        "filter": ["lowercase", "en_stop", "en_stemmer"],
    },
    "code_analyzer": {
        "type": "custom",
        "tokenizer": "standard",
        "filter": ["lowercase"],
    },
}

# Field mappings for one chunk document.
# NOTE(review): the root mapping below is ``"dynamic": "strict"`` and, per the
# OpenSearch mapping docs, inner objects inherit that setting unless they
# override it. ``metadata`` and ``quality_flags`` declare no sub-properties, so
# documents carrying keys inside them may be rejected -- confirm this is intended.
_CHUNK_PROPERTIES: dict[str, Any] = {
    "chunk_id": {"type": "keyword"},
    "document_id": {"type": "keyword"},
    "source_path": {"type": "keyword"},
    "original_file_name": {
        "type": "text",
        "fields": {"keyword": {"type": "keyword", "ignore_above": 512}},
    },
    "page_number": {"type": "integer"},
    "block_type": {"type": "keyword"},
    "block_id": {"type": "keyword"},
    # Base field uses code_analyzer; ``text.ru`` / ``text.en`` re-analyze the
    # same content with the language-specific analyzers.
    "text": {
        "type": "text",
        "analyzer": "code_analyzer",
        "fields": {
            "ru": {"type": "text", "analyzer": "ru_analyzer"},
            "en": {"type": "text", "analyzer": "en_analyzer"},
        },
    },
    "normalized_text": {
        "type": "text",
        "analyzer": "code_analyzer",
    },
    "ocr_confidence": {"type": "float"},
    "language_hint": {"type": "keyword"},
    "metadata": {"type": "object", "enabled": True},
    "quality_flags": {"type": "object", "enabled": True},
    "created_at": {"type": "date"},
}

# Body passed to ``indices.create``: a single shard with no replicas, the
# analysis chain above, and a strict (no unmapped fields) chunk mapping.
INDEX_SETTINGS: dict[str, Any] = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": {
            "filter": _TOKEN_FILTERS,
            "analyzer": _ANALYZERS,
        },
    },
    "mappings": {
        "dynamic": "strict",
        "properties": _CHUNK_PROPERTIES,
    },
}
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_opensearch() -> OpenSearch:
    """Return the OpenSearch client configured from ``settings``.

    The function takes no arguments and is wrapped in ``lru_cache(maxsize=1)``,
    so the client is constructed on the first call and reused afterwards.
    """
    user = settings.opensearch_user
    # Basic auth is sent only when both a user and a password are configured.
    credentials = (user, settings.opensearch_password) if user and settings.opensearch_password else None

    endpoint = {"host": settings.opensearch_host, "port": settings.opensearch_port}
    client_options: dict = {
        "http_auth": credentials,
        "use_ssl": settings.opensearch_use_ssl,
        "verify_certs": settings.opensearch_verify_certs,
        "ssl_show_warn": False,
        "connection_class": RequestsHttpConnection,
        "timeout": 30,
        "max_retries": 3,
        "retry_on_timeout": True,
    }
    return OpenSearch(hosts=[endpoint], **client_options)
|
||||
|
||||
|
||||
def ensure_index(index: str | None = None) -> None:
    """Create the index with ``INDEX_SETTINGS`` if it does not exist yet.

    Args:
        index: Index name; falls back to ``settings.opensearch_index_chunks``
            when omitted or empty.
    """
    target = index if index else settings.opensearch_index_chunks
    indices = get_opensearch().indices

    if not indices.exists(index=target):
        logger.info("opensearch.index.create", index=target)
        indices.create(index=target, body=INDEX_SETTINGS)
        return

    # Nothing to do: an existing index is left untouched.
    logger.debug("opensearch.index.exists", index=target)
|
||||
|
||||
|
||||
def index_chunks(docs: Iterable[dict[str, Any]], index: str | None = None) -> tuple[int, int]:
|
||||
"""Bulk-upsert chunks. Returns (success, errors)."""
|
||||
name = index or settings.opensearch_index_chunks
|
||||
actions: list[dict[str, Any]] = []
|
||||
for d in docs:
|
||||
actions.append(
|
||||
{
|
||||
"_op_type": "index",
|
||||
"_index": name,
|
||||
"_id": d["chunk_id"],
|
||||
"_source": d,
|
||||
}
|
||||
)
|
||||
if not actions:
|
||||
return 0, 0
|
||||
success, errors = bulk(get_opensearch(), actions, raise_on_error=False, request_timeout=120)
|
||||
if errors:
|
||||
logger.warning("opensearch.bulk.errors", count=len(errors))
|
||||
return success, len(errors) if isinstance(errors, list) else 0
|
||||
|
||||
|
||||
def delete_by_document(document_id: str, index: str | None = None) -> int:
    """Delete all chunks whose ``document_id`` equals the given value.

    Args:
        document_id: Value matched with a ``term`` query on ``document_id``.
        index: Index to delete from; falls back to
            ``settings.opensearch_index_chunks`` when omitted or empty.

    Returns:
        The ``deleted`` count reported by OpenSearch, or 0 when the index does
        not exist or the response carries no such count.
    """
    target = index if index else settings.opensearch_index_chunks
    os_client = get_opensearch()

    if os_client.indices.exists(index=target):
        term_query = {"query": {"term": {"document_id": document_id}}}
        # refresh=True is passed so the index is refreshed after the delete.
        response = os_client.delete_by_query(index=target, body=term_query, refresh=True)
        return int(response.get("deleted", 0))

    return 0
|
||||
Reference in New Issue
Block a user