chore: bootstrap repository with governance docs

Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 16:41:50 +03:00
commit 7f72171572
157 changed files with 11298 additions and 0 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1,16 @@
 .git
 .gitignore
 .venv
 venv
 __pycache__
 *.pyc
 .env
 .env.local
 data
 tests
 .pytest_cache
 .mypy_cache
 .ruff_cache
 .idea
 .vscode
 README.md
--- a/.env.example
+++ b/.env.example
@@ -0,0 +1,82 @@
 # ---- DEVELOPMENT TEMPLATE ----
 # Copy to .env. Values below are intentionally weak defaults for local Docker
 # Compose. NEVER use them in production — see .env.prod.example.
 # ==== PostgreSQL ====
 POSTGRES_HOST=postgres
 POSTGRES_PORT=5432
 POSTGRES_DB=legacyhub
 POSTGRES_USER=legacyhub
 POSTGRES_PASSWORD=legacyhub
 # ==== MinIO ====
 MINIO_ENDPOINT=minio:9000
 MINIO_ACCESS_KEY=legacyhub
 MINIO_SECRET_KEY=legacyhub-secret
 MINIO_BUCKET_ORIGINALS=legacyhub-originals
 MINIO_BUCKET_DERIVED=legacyhub-derived
 MINIO_SECURE=false
 MINIO_REGION=us-east-1
 # ==== OpenSearch ====
 OPENSEARCH_HOST=opensearch
 OPENSEARCH_PORT=9200
 OPENSEARCH_USE_SSL=false
 OPENSEARCH_VERIFY_CERTS=false
 OPENSEARCH_USER=
 OPENSEARCH_PASSWORD=
 OPENSEARCH_INDEX_CHUNKS=legacy_chunks
 # ==== Qdrant ====
 QDRANT_HOST=qdrant
 QDRANT_PORT=6333
 QDRANT_API_KEY=
 QDRANT_COLLECTION_CHUNKS=legacy_chunks
 # ==== Redis ====
 REDIS_URL=redis://redis:6379/0
 # ==== OCR ====
 OCR_LANGUAGES=rus+eng
 OCR_ENABLED=true
 DOCLING_OCR_ENABLED=false
 MAX_DOCUMENT_TIMEOUT_SECONDS=180
 OCR_DESKEW=true
 OCR_CLEAN=true
 OCR_OPTIMIZE=1
 # ==== Embeddings / Reranker ====
 EMBEDDING_MODEL=BAAI/bge-m3
 EMBEDDING_DIM=1024
 EMBEDDING_DEVICE=cpu
 EMBEDDING_BATCH_SIZE=8
 EMBEDDING_NORMALIZE=true
 RERANKER_MODEL=BAAI/bge-reranker-v2-m3
 RERANKER_DEVICE=cpu
 RERANKER_ENABLED=true
 RERANKER_BATCH_SIZE=8
 # ==== Chunking ====
 CHUNK_TARGET_TOKENS=700
 CHUNK_MIN_TOKENS=120
 CHUNK_MAX_TOKENS=900
 CHUNK_OVERLAP_TOKENS=100
 # ==== Search ====
 HYBRID_OPENSEARCH_TOP_K=50
 HYBRID_QDRANT_TOP_K=50
 HYBRID_RRF_K=60
 RERANK_CANDIDATES=40
 # ==== App ====
 APP_LOG_LEVEL=INFO
 APP_HOST=0.0.0.0
 APP_PORT=8000
 APP_INPUT_DIR=/data/input
 APP_WORK_DIR=/data/work
 APP_API_PREFIX=/api/v1
 # Comma-separated list of allowed origins for the browser. Use specific origins
 # in production; * is accepted only for local development.
 CORS_ALLOWED_ORIGINS=http://localhost:5173,http://localhost:5273,http://localhost:4173
--- a/.env.prod.example
+++ b/.env.prod.example
@@ -0,0 +1,74 @@
 # ---- PRODUCTION TEMPLATE ----
 # Copy to .env.prod and replace every PLACEHOLDER value.
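 # Never commit .env.prod. NOTE(review): .gitignore currently lists only .env
 # and .env.local — add .env.prod there before creating the file.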
 # All values below are placeholders — rotation required before use.
 # ==== PostgreSQL ====
 POSTGRES_HOST=postgres
 POSTGRES_PORT=5432
 POSTGRES_DB=legacyhub
 POSTGRES_USER=legacyhub_prod
 POSTGRES_PASSWORD=__ROTATE_ME__
 # ==== MinIO ====
 MINIO_ENDPOINT=minio:9000
 MINIO_ACCESS_KEY=__ROTATE_ME__
 MINIO_SECRET_KEY=__ROTATE_ME__
 MINIO_BUCKET_ORIGINALS=legacyhub-originals
 MINIO_BUCKET_DERIVED=legacyhub-derived
 MINIO_SECURE=true
 MINIO_REGION=us-east-1
 # ==== OpenSearch (security plugin ON in prod overlay) ====
 OPENSEARCH_HOST=opensearch
 OPENSEARCH_PORT=9200
 OPENSEARCH_USE_SSL=true
 OPENSEARCH_VERIFY_CERTS=true
 OPENSEARCH_USER=admin
 OPENSEARCH_PASSWORD=__ROTATE_ME__
 OPENSEARCH_INDEX_CHUNKS=legacy_chunks
 OPENSEARCH_ADMIN_PASSWORD=__ROTATE_ME__
 # ==== Qdrant ====
 QDRANT_HOST=qdrant
 QDRANT_PORT=6333
 QDRANT_API_KEY=__ROTATE_ME__
 QDRANT_COLLECTION_CHUNKS=legacy_chunks
 # ==== Redis ====
 REDIS_URL=redis://:__ROTATE_ME__@redis:6379/0
 # ==== OCR ====
 OCR_LANGUAGES=rus+eng
 OCR_ENABLED=true
 DOCLING_OCR_ENABLED=false
 MAX_DOCUMENT_TIMEOUT_SECONDS=300
 # ==== Embeddings / Reranker ====
 EMBEDDING_MODEL=BAAI/bge-m3
 EMBEDDING_DIM=1024
 EMBEDDING_DEVICE=cuda
 EMBEDDING_BATCH_SIZE=32
 EMBEDDING_NORMALIZE=true
 RERANKER_MODEL=BAAI/bge-reranker-v2-m3
 RERANKER_DEVICE=cuda
 RERANKER_ENABLED=true
 RERANKER_BATCH_SIZE=32
 # ==== Hybrid search ====
 HYBRID_OPENSEARCH_TOP_K=50
 HYBRID_QDRANT_TOP_K=50
 HYBRID_RRF_K=60
 RERANK_CANDIDATES=40
 # ==== App ====
 APP_LOG_LEVEL=INFO
 APP_HOST=0.0.0.0
 APP_PORT=8000
 APP_INPUT_DIR=/data/input
 APP_WORK_DIR=/data/work
 APP_API_PREFIX=/api/v1
 # Comma-separated list of allowed origins. NEVER use * in production.
 CORS_ALLOWED_ORIGINS=https://legacyhub.teamhub.example
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,23 @@
 * text=auto eol=lf
 *.py    text eol=lf
 *.ts    text eol=lf
 *.tsx   text eol=lf
 *.css   text eol=lf
 *.md    text eol=lf
 *.json  text eol=lf
 *.yml   text eol=lf
 *.yaml  text eol=lf
 *.toml  text eol=lf
 *.ini   text eol=lf
 *.mako  text eol=lf
 *.svg   text eol=lf
 *.cfg   text eol=lf
 Dockerfile text eol=lf
 Makefile   text eol=lf
 *.png   binary
 *.jpg   binary
 *.gif   binary
 *.pdf   binary
 *.ico   binary
 *.woff  binary
 *.woff2 binary
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,21 @@
 __pycache__/
 *.py[cod]
 *.egg-info/
 .eggs/
 build/
 dist/
 .venv/
 venv/
 .env
 .env.local
 .idea/
 .vscode/
 .mypy_cache/
 .pytest_cache/
 .ruff_cache/
 data/input/*
 data/work/*
 !data/input/.gitkeep
 !data/work/.gitkeep
 *.log
 .DS_Store
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -0,0 +1,147 @@
 # AGENTS — LegacyHUB
 Operating instructions for AI agents working inside this repository.
 ## What this project is
 LegacyHUB ingests legacy PDF archives at scale (~70k docs), runs OCR
 (OCRmyPDF/Tesseract), extracts structured content with Docling, indexes chunks
 into PostgreSQL + OpenSearch (BM25) + Qdrant (BGE-M3 dense), and serves a
 hybrid lexical + semantic search API (FastAPI) reranked by BGE.
 It is one module of the TeamHUB Suite.
 ## Stack (canonical)
 | Layer    | Tech                                          |
 |----------|-----------------------------------------------|
 | API      | FastAPI, Pydantic v2, SQLAlchemy 2, Alembic   |
 | Workers  | Celery + Redis                                |
 | OCR      | OCRmyPDF + Tesseract (rus+eng)                |
 | Extract  | Docling                                       |
 | Store    | PostgreSQL 16, MinIO, OpenSearch 2.x, Qdrant  |
 | ML       | BAAI/bge-m3 (dense, 1024), bge-reranker-v2-m3 |
 | Frontend | React 18, TS 5, Vite 5, Tailwind, shadcn, TanStack Query, Zustand, Framer Motion, Recharts |
 | Tests    | pytest                                        |
 | CI       | GitHub Actions                                |
 ## Entry points
 - **Backend API** — `app/main.py` (`uvicorn app.main:app`)
 - **Celery worker** — `celery -A app.workers.celery_app worker`
 - **CLI scripts** — `scripts/init_db.py`, `scripts/init_opensearch.py`,
  `scripts/init_qdrant.py`, `scripts/ingest_folder.py`,
  `scripts/reindex_document.py`, `scripts/smoke_test.py`
 - **Frontend dev** — `cd frontend && npm run dev` (port 5273)
 - **Docker** — `docker compose up -d --build` (dev), `docker compose -f
  docker-compose.yml -f docker-compose.prod.yml ...` (prod)
 ## Inventory
 ```text
 legacy-knowledge-indexer/
  app/
    api/             routers + Pydantic schemas
    db/              SQLAlchemy models + Alembic migrations
    indexing/        OpenSearch + Qdrant clients, embeddings, reranker, hybrid
    ingestion/       scanner, OCR, Docling, chunker, table/figure processors,
                     quality, pipeline
    storage/         MinIO client + key conventions + ensure_artifact helper
    utils/           hashing, text cleaning, language detection, pdf helpers
    workers/         Celery app + tasks
  scripts/           init / ingest / reindex / smoke CLIs
  tests/             pytest suite
  docker/Dockerfile  API + worker image (OCRmyPDF + tesseract-rus+eng)
  docker-compose.yml dev orchestration
  docker-compose.prod.yml  production overlay
  frontend/          React app — see frontend/README.md
  .github/workflows  CI gate (ruff + pytest + tsc + vite build + compose config)
 ```
 ## Code discovery order
 Bounded discovery order for this repo. Use the first available that returns a
 usable answer; mark the rest "not available" for the task.
 1. **Grep / rg** — reliable fallback, always available. First choice for
   strings, configs, docs, scripts, route paths, hashes.
 2. **Glob** — file shape lookups (`app/**/*.py`).
 3. **Semantic search** (if Sourcegraph, Zoekt, or Serena MCP is configured at
   user level) — go-to-symbol, references. Document the smoke command before
   relying on results.
 4. **Docling / extracted Markdown in MinIO** — for content questions about
   ingested documents, not source code.
 Smoke command for layer 1:
 ```bash
 rg --version && rg "@router" app/api -n
 ```
 If any indexer times out or returns stale results, capture the error and fall
 through. Do not retry the same failing indexer.
 ## Module contracts (high level)
 - `app/ingestion/pipeline.py::process_document_id(document_id, run_id)` — single
  document end-to-end. Idempotent. Returns `{status, chunks, error?}`.
 - `app/indexing/hybrid_search.py::run_search(SearchRequest) -> SearchResponse` —
  the only public search entry. Lexical + semantic + reranker.
 - `app/storage/artifacts.py::ensure_artifact(...)` — single source of truth for
  `document_artifacts` upsert. Used by scanner, pipeline, table_processor,
  figure_processor.
 - `app/storage/minio_client.py::MinioStorage` — bucket bootstrap + retryable
  put/get. Never bypass for object IO.
 - `app/indexing/opensearch_client.py::ensure_index() / index_chunks()` — chunk
  index lifecycle.
 - `app/indexing/qdrant_client.py::ensure_collection() / upsert_chunks()` —
  vector index lifecycle.
 ## Runtime vs legacy scope
 Everything under `app/` is runtime. `scripts/` are operational tools. `tests/`
 are non-runtime. There is no archived/legacy code yet.
 ## Baseline checks
 ```bash
 # Backend
 python -m pip check
 python -m compileall -q app scripts tests
 python -m pytest tests/ -q
 # Frontend
 cd frontend
 npx tsc --noEmit
 npm run lint
 npm run build
 # Docker
 docker compose config --quiet
 ```
 ## Operating rules for agents
 - Inspect before changing. `git status` first.
 - Small reviewable commits. One ownership boundary per commit.
 - Do not delete files, routes, migrations, or env vars without evidence (see
  `software-project-delivery-governance` skill).
 - Do not invent secret values. Use `.env.example` placeholders.
 - Use `ensure_artifact` instead of re-implementing artifact upsert.
 - Use existing UI primitives in `frontend/src/components/ui/*` before adding new
  ones.
 - Never commit `node_modules/`, `dist/`, `.env`, `data/input/*`, `data/work/*`.
 - Failures must be logged via `processing_events` (backend) or `sonner` toast
  (frontend) — not silenced.
 ## Ownership
 - Backend, ingestion, search — Vadim Malanov.
 - Frontend, design system — Vadim Malanov.
 ## Where to update what
 - New behavior — update `README.md`.
 - New repeated agent rule — update this file.
 - New deployment / recovery step — update `RUNBOOK.md`.
 - Cleanup findings — `docs/cleanup-report.md` (create on demand).
--- a/201
+++ b/201
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/
   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
   1. Definitions.
      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.
      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.
      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.
      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.
      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.
      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.
      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).
      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.
      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."
      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.
   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.
   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.
   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:
      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and
      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and
      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and
      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.
      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.
   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.
   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.
   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.
   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.
   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.
   END OF TERMS AND CONDITIONS
   APPENDIX: How to apply the Apache License to your work.
      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.
   Copyright 2026 TeamHUB
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
   implied. See the License for the specific language governing
   permissions and limitations under the License.
--- a/39
+++ b/39
@@ -0,0 +1,39 @@
 .PHONY: help up down logs build api worker init smoke test fmt lint
 help:
 	@echo "make up        - start all services"
 	@echo "make down      - stop all services"
 	@echo "make build     - rebuild api/worker image"
 	@echo "make init      - run db migrations + bootstrap opensearch + qdrant"
 	@echo "make smoke     - run the smoke test inside the api container"
 	@echo "make test      - pytest"
 	@echo "make logs      - tail api+worker logs"
 up:
 	docker compose up -d --build
 down:
 	docker compose down
 build:
 	docker compose build api worker
 logs:
 	docker compose logs -f api worker
 init:
 	docker compose exec api python scripts/init_db.py
 	docker compose exec api python scripts/init_opensearch.py
 	docker compose exec api python scripts/init_qdrant.py
 smoke:
 	docker compose exec api python scripts/smoke_test.py
 test:
 	pytest -q
 fmt:
 	ruff format app scripts tests
 lint:
 	ruff check app scripts tests
--- a/README.md
+++ b/README.md
@@ -0,0 +1,233 @@
 # LegacyHUB - Knowledge Indexing & Hybrid Search for Legacy PDF Archives
 LegacyHUB is a production-oriented, fully open-source backend for ingesting,
 OCR-ing, structurally extracting, and hybrid-searching large legacy PDF
 archives (designed for ~70,000 documents).
 It is part of the **TeamHUB** suite.
 ```
 PDFs ──▶ Scanner ──▶ MinIO (originals)
                  └▶ OCRmyPDF (Tesseract) ──▶ MinIO (ocr_pdf)
                                          └▶ Docling ──▶ MD + JSON ──▶ MinIO
                                                       └▶ blocks/tables/figures
                                                                ├▶ PostgreSQL
                                                                ├▶ OpenSearch (BM25)
                                                                └▶ Qdrant (BGE-M3 dense)
                                                                          │
 FastAPI /search ◀── BGE Reranker ◀── RRF merge ◀───────────────────────────┘
 ```
 ## Stack
 | Component        | Tech                                     |
 |------------------|------------------------------------------|
 | OCR              | OCRmyPDF + Tesseract (rus + eng)         |
 | Extraction       | Docling (layout, tables, figures)        |
 | Object storage   | MinIO (S3-compatible)                    |
 | Relational store | PostgreSQL 16                            |
 | Lexical search   | OpenSearch 2.x (BM25 + ru/en analyzers)  |
 | Vector search    | Qdrant 1.x (named dense vector)          |
 | Embeddings       | BAAI/bge-m3 (dense, 1024d)               |
 | Reranker         | BAAI/bge-reranker-v2-m3                  |
 | API              | FastAPI + Uvicorn                        |
 | Workers          | Celery + Redis                           |
 | Logging          | structlog (JSON)                         |
 ## Quick start
 ```bash
 cp .env.example .env
 docker compose up -d --build
 docker compose exec api python scripts/init_db.py
 docker compose exec api python scripts/init_opensearch.py
 docker compose exec api python scripts/init_qdrant.py
 docker compose exec api python scripts/smoke_test.py
 ```
 Health check:
 ```bash
 curl http://localhost:8000/api/v1/health | jq .
 ```
 Open the interactive Swagger docs at <http://localhost:8000/docs>.
 ## Ingest documents
 Mount a folder into the container at `/data/input` (the compose file already
 mounts `./data/input` for you), drop PDFs into it, and call:
 ```bash
 curl -X POST http://localhost:8000/api/v1/ingest/folder \
  -H "Content-Type: application/json" \
  -d '{"path":"/data/input","recursive":true,"force":false}'
 ```
 Or run inline (no Celery, useful for ad-hoc tests):
 ```bash
 docker compose exec api python scripts/ingest_folder.py \
  --path /data/input --recursive --mode inline
 ```
 To re-process a single document by ID:
 ```bash
 docker compose exec api python scripts/reindex_document.py \
  --document-id <uuid>
 ```
 ## Search
 ```bash
 curl -X POST http://localhost:8000/api/v1/search \
  -H "Content-Type: application/json" \
  -d '{
        "query": "ГОСТ 21.501-93 рабочие чертежи",
        "limit": 10,
        "search_mode": "hybrid",
        "filters": {"min_ocr_confidence": 0.5}
      }' | jq .
 ```
 `search_mode` can be `lexical`, `semantic`, or `hybrid`. Hybrid mode does:
 1. BM25 top-K from OpenSearch
 2. Dense top-K from Qdrant (BGE-M3)
 3. Reciprocal Rank Fusion merge
 4. Top 30-50 candidates re-scored by the BGE reranker (if available)
 5. Final top-N returned with citation metadata
 Each hit includes the document name, page, block id, table/figure id where
 applicable, and quality flags - so AI consumers can produce verifiable answers
 with citations.
 ## Inspect the system
 | Service       | URL                                  | Credentials                |
 |---------------|--------------------------------------|----------------------------|
 | API docs      | <http://localhost:8000/docs>         | -                          |
 | MinIO console | <http://localhost:9001>              | `legacyhub` / `legacyhub-secret` |
 | OpenSearch    | <http://localhost:9200>              | -                          |
 | Qdrant UI     | <http://localhost:6333/dashboard>    | -                          |
 | Postgres      | `localhost:5432`                     | `legacyhub` / `legacyhub`  |
 ```bash
 # Count docs in OpenSearch
 curl 'http://localhost:9200/legacy_chunks/_count'
 # Inspect Qdrant collection
 curl 'http://localhost:6333/collections/legacy_chunks'
 # Browse Postgres
 docker compose exec postgres psql -U legacyhub -d legacyhub \
  -c "SELECT id, original_file_name, status FROM documents LIMIT 20;"
 ```
 ## Environment variables
 See [`.env.example`](.env.example) for the full list. Key ones:
 - `OCR_LANGUAGES` - Tesseract language packs (default `rus+eng`).
 - `OCR_ENABLED` - set `false` to skip OCR completely.
 - `DOCLING_OCR_ENABLED` - enables OCR inside Docling. Keep it `false` while
  OCRmyPDF is in use (`OCR_ENABLED=true`); enable it only if you do not run
  OCRmyPDF.
 - `EMBEDDING_DEVICE` / `RERANKER_DEVICE` - `cpu`, `cuda`, or `mps`.
 - `MAX_DOCUMENT_TIMEOUT_SECONDS` - per-document soft timeout for extraction.
 ## Handling poor OCR
 - The pipeline computes per-chunk `quality_flags`:
  - `low_ocr_confidence`, `very_short_text`, `possible_garbled_text`
  - `table_detected`, `figure_detected`, `handwriting_detected`
  - `needs_manual_review` (any of the above except table/figure detection)
 - Garbled chunks are still indexed - so they remain searchable - but the flags
  let you filter them out at query time via `filters.min_ocr_confidence`.
 - Original text is always preserved verbatim (no destructive cleaning); the
  `normalized_text` field is a derived form used purely for recall.
 - We deliberately preserve technical / legal identifiers (ГОСТ, document
  numbers, dates, serials, slashes, dashes, dots, brackets) during normalization.
 ## Handling handwriting
 - We do not attempt to recognize handwriting reliably. Suspected handwritten
  fragments are flagged with `block_type=handwriting` and
  `quality_flags.handwriting_detected=true` plus `needs_manual_review=true`.
 - The API does not present handwriting recognition output as authoritative.
 ## Idempotency
 - Document identity = SHA256 of the original PDF. Re-ingesting the same PDF
  reuses the existing `documents` row.
 - The pipeline deletes existing chunks for the document and re-creates them
  before re-indexing; OpenSearch and Qdrant entries are deleted-by-document
  before re-upsert. So re-running ingestion does not duplicate data.
 ## Failure handling
 - Each pipeline stage records a row in `processing_events` with `level` and
  `data` JSON.
 - A document that fails OCR is marked `OCR_FAILED` and the pipeline moves on.
 - A document that fails Docling is marked `EXTRACTION_FAILED`.
 - Indexing failures bring the document to `FAILED`; re-running
  `scripts/reindex_document.py` resumes processing.
 ## Scaling notes (~70k PDFs)
 - The Celery `worker` service is horizontally scalable: `docker compose up -d
  --scale worker=8` (or run several Compose stacks pointing at the same
  Postgres / MinIO / OpenSearch / Qdrant).
 - The embedding step is the biggest cost. Set `EMBEDDING_DEVICE=cuda` and a
  GPU-aware worker image if available.
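 - OpenSearch defaults to 1 shard / 0 replicas - increase for production.
  Replicas can be raised on the live index (`PUT /legacy_chunks/_settings`);
  the primary shard count is fixed at index creation, so changing it requires
  recreating the index (or using the `_split` API).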
 - Qdrant is single-node by default; for very large corpora use the cluster
  build of Qdrant or shard by document hash.
 - For 70k PDFs at ~50 chunks each, expect ~3.5M vectors. BGE-M3 dense at 1024d
  (float32, 4 bytes per dimension) is ~14 GB on disk before index overhead;
  budget memory accordingly.
 ## Tests
 ```bash
 pip install -e ".[dev]"
 pytest -q
 ```
 The unit suite covers hashing, chunking, quality flags, hybrid result merging,
 and duplicate detection. Integration tests run against the live Compose stack
 via `scripts/smoke_test.py`.
 ## Repository layout
 ```
 legacy-knowledge-indexer/
  app/
    api/            # FastAPI routes & schemas
    db/             # SQLAlchemy models + Alembic migrations
    indexing/       # OpenSearch, Qdrant, embeddings, reranker, hybrid search
    ingestion/      # scanner, OCR, Docling, chunking, quality, pipeline
    storage/        # MinIO client + key conventions
    utils/          # hashing, text cleaning, language detection, PDF helpers
    workers/        # Celery app + tasks
  scripts/          # init / ingest / reindex / smoke
  tests/            # unit tests
  docker/Dockerfile # API + worker image
  docker-compose.yml
  .env.example
  pyproject.toml
  alembic.ini
 ```
 ## Known limitations
 - Docling's exact JSON shape varies between versions. The extractor uses
  defensive lookups and falls back to `paragraph` when a label is unknown.
 - We do not currently ship a sparse vector path (BGE-M3 supports it). Hybrid
  recall is achieved via OpenSearch BM25 + Qdrant dense, merged with RRF -
  which has been observed to outperform sparse-only or dense-only setups on
  noisy OCR.
 - Figure description does not invoke a VLM; captions plus a placeholder are
  used. Plug a VLM into `figure_processor.persist_figures` if needed.
 - No authentication on the API surface - put it behind your reverse proxy.
 ## License
 Apache-2.0.
--- a/RUNBOOK.md
+++ b/RUNBOOK.md
@@ -0,0 +1,146 @@
 # LegacyHUB — Operational Runbook
 ## Quick boot (dev)
 ```bash
 cp .env.example .env
 docker compose up -d --build
 docker compose exec api python scripts/init_db.py
 docker compose exec api python scripts/init_opensearch.py
 docker compose exec api python scripts/init_qdrant.py
 docker compose exec api python scripts/smoke_test.py
 ```
 Verify:
 ```bash
 curl -fsS http://localhost:8000/api/v1/health | jq .
 ```
 Frontend dev:
 ```bash
 cd frontend && cp .env.example .env && npm install && npm run dev
 # http://localhost:5273
 ```
 ## Production deploy
 Production overlay enables OpenSearch security plugin, removes default ports,
 forces externally-supplied credentials, and disables debug routes.
 ```bash
 # 1. Ensure secrets exist
 cp .env.prod.example .env.prod
 $EDITOR .env.prod          # rotate every credential, never commit
 # 2. Build + recreate
 docker compose \
  -f docker-compose.yml -f docker-compose.prod.yml \
  --env-file .env.prod \
  up -d --build --force-recreate api worker
 # 3. Migrations
 docker compose -f docker-compose.yml -f docker-compose.prod.yml \
  --env-file .env.prod exec api python scripts/init_db.py
 # 4. Health gate
 docker compose -f docker-compose.yml -f docker-compose.prod.yml \
  --env-file .env.prod exec api python scripts/smoke_test.py
 curl -fsS https://<host>/api/v1/health | jq -e '.status == "ok"'
 ```
 Hardening notes (mandatory for prod):
 - Rotate every credential in `.env.prod` from `.env.prod.example` placeholders.
 - Put OpenSearch behind TLS and admin password. Remove
  `DISABLE_SECURITY_PLUGIN=true` (handled by overlay).
 - Front the API with a reverse proxy that performs auth + TLS termination.
 - Restrict CORS via `CORS_ALLOWED_ORIGINS` (comma-separated) — never `*` in
  prod.
 - MinIO root key/secret in prod must come from a secret store, not the repo.
 - Mount `data/input` and `data/work` from durable storage, not the workstation.
 ## Ingestion
 ```bash
 # trigger from the API
 curl -X POST http://localhost:8000/api/v1/ingest/folder \
  -H "Content-Type: application/json" \
  -d '{"path":"/data/input","recursive":true,"force":false}'
 # or inline (no Celery)
 docker compose exec api python scripts/ingest_folder.py \
  --path /data/input --recursive --mode inline
 # re-index a single doc
 docker compose exec api python scripts/reindex_document.py \
  --document-id <uuid>
 ```
 ## Failure handling
 Each stage emits a row to `processing_events` with `level` and `data`. Inspect:
 ```bash
 docker compose exec postgres psql -U legacyhub -d legacyhub -c \
  "SELECT created_at, stage, level, message FROM processing_events
   ORDER BY created_at DESC LIMIT 50;"
 ```
 | Failure              | Where to look                                       | Fix                              |
 |----------------------|-----------------------------------------------------|----------------------------------|
 | `OCR_FAILED`         | `processing_events` → `OCR_STARTED` then error      | Confirm `tesseract-ocr-rus` package; rerun `scripts/reindex_document.py` |
 | `EXTRACTION_FAILED`  | `processing_events` → Docling stage                 | Check timeout; verify Docling version pin |
 | Indexing stuck       | OpenSearch + Qdrant health                          | `scripts/init_opensearch.py`, `scripts/init_qdrant.py` |
 | Reranker disabled    | API logs → `reranker.disabled`                      | Ensure `RERANKER_ENABLED=true`; HF cache mounted |
 ## Verification gates (per change)
 1. `python -m pytest tests/ -q` — full unit suite (19+ tests).
 2. `python -m compileall -q app scripts tests`.
 3. `docker compose config --quiet`.
 4. Frontend: `npx tsc --noEmit && npm run build`.
 5. `/api/v1/health` returns `{"status":"ok"}`.
 6. One smoke ingest of a known PDF; verify `/search` returns a result.
 ## Rollback
 1. Capture deployed commit SHA before deploy (`git rev-parse HEAD`).
 2. To roll back the API/worker image only, check out the previously deployed SHA (`git checkout <sha>`) and rebuild:
   ```bash
   docker compose -f docker-compose.yml -f docker-compose.prod.yml \
     --env-file .env.prod up -d --build --force-recreate api worker \
     --no-deps  # keep PG/MinIO/OS/Qdrant intact
   ```
 3. Data services (PostgreSQL, MinIO, OpenSearch, Qdrant) are stateful and
   should not be rolled back casually. Restore from backup via the standard
   TeamHUB Suite backup runbook.
 ## Scaling notes (~70k PDFs)
 - Workers horizontally scale: `docker compose up -d --scale worker=8`.
 - Set `EMBEDDING_DEVICE=cuda` on a GPU-capable worker image for ~10× embedding
  throughput.
 - OpenSearch single shard suffices to ~10M chunks; increase shards and add
  replicas in prod.
 - Qdrant single-node OK for ~5M vectors; switch to cluster build beyond that.
 ## Common one-liners
 ```bash
 # count indexed chunks in OpenSearch
 curl 'http://localhost:9200/legacy_chunks/_count' | jq .
 # inspect Qdrant collection
 curl 'http://localhost:6333/collections/legacy_chunks' | jq .
 # list MinIO buckets
 docker compose exec minio mc alias set local http://localhost:9000 \
  "$MINIO_ACCESS_KEY" "$MINIO_SECRET_KEY"
 docker compose exec minio mc ls local
 # document counts per status (check the INDEXING_COMPLETED row)
 docker compose exec postgres psql -U legacyhub -d legacyhub -c \
  "SELECT status, COUNT(*) FROM documents GROUP BY status;"
 ```
--- a/alembic.ini
+++ b/alembic.ini
@@ -0,0 +1,40 @@
 [alembic]
 script_location = app/db/migrations
 prepend_sys_path = .
 sqlalchemy.url = driver://user:pass@host/dbname
 [post_write_hooks]
 [loggers]
 keys = root,sqlalchemy,alembic
 [handlers]
 keys = console
 [formatters]
 keys = generic
 [logger_root]
 level = WARN
 handlers = console
 qualname =
 [logger_sqlalchemy]
 level = WARN
 handlers =
 qualname = sqlalchemy.engine
 [logger_alembic]
 level = INFO
 handlers =
 qualname = alembic
 [handler_console]
 class = StreamHandler
 args = (sys.stderr,)
 level = NOTSET
 formatter = generic
 [formatter_generic]
 format = %(levelname)-5.5s [%(name)s] %(message)s
 datefmt = %H:%M:%S
--- a/app/init.py
+++ b/app/init.py
@@ -0,0 +1,3 @@
 """LegacyHUB - knowledge indexing and hybrid search over legacy PDF archives."""
 __version__ = "0.1.0"
--- a/app/api/init.py
+++ b/app/api/init.py
--- a/app/api/routes_health.py
+++ b/app/api/routes_health.py
@@ -0,0 +1,96 @@
 """Health endpoint - probes Postgres, MinIO, OpenSearch, Qdrant, Redis."""
 from __future__ import annotations
 from typing import Any
 from fastapi import APIRouter
 from sqlalchemy import text
 from app import __version__
 from app.api.schemas import ComponentHealth, HealthResponse
 from app.config import settings
 from app.db.session import get_engine
 from app.logging_config import get_logger
 from app.storage.minio_client import get_storage
 logger = get_logger(__name__)
 router = APIRouter(tags=["health"])
 def _check_postgres() -> ComponentHealth:
    """Probe PostgreSQL by running ``SELECT 1`` on a connection from the engine.

    Never raises: any failure is reported as an ``error`` component whose
    ``detail`` carries the exception text.
    """
    try:
        engine = get_engine()
        with engine.connect() as connection:
            connection.execute(text("SELECT 1"))
    except Exception as failure:  # noqa: BLE001
        return ComponentHealth(
            name="postgres",
            status="error",
            detail={"error": str(failure)},
        )
    else:
        return ComponentHealth(name="postgres", status="ok")
 def _check_minio() -> ComponentHealth:
    """Probe MinIO through the storage client's own ``health()`` report.

    Returns an ``ok`` component when the report's ``status`` key equals
    ``"ok"`` and an ``error`` component otherwise; in both cases the report is
    passed through as ``detail``.

    Like the other probes in this module, this never raises: if obtaining the
    client or calling ``health()`` throws, the exception text is returned as an
    ``error`` component, so a storage outage cannot turn ``/health`` into a 500.
    """
    try:
        info: dict[str, Any] = get_storage().health()
    except Exception as exc:  # noqa: BLE001
        return ComponentHealth(name="minio", status="error", detail={"error": str(exc)})
    if info.get("status") == "ok":
        return ComponentHealth(name="minio", status="ok", detail=info)
    return ComponentHealth(name="minio", status="error", detail=info)
 def _check_opensearch() -> ComponentHealth:
    """Probe OpenSearch via the cluster health API.

    A ``green`` or ``yellow`` cluster maps to ``ok``; any other cluster status
    maps to ``degraded``. Exceptions (including a failing client import) are
    reported as an ``error`` component instead of being raised.
    """
    try:
        # Imported inside the guard so an import failure is reported, not raised.
        from app.indexing.opensearch_client import get_opensearch

        report = get_opensearch().cluster.health()
        colour = report.get("status")
        if colour in ("green", "yellow"):
            verdict = "ok"
        else:
            verdict = "degraded"
        return ComponentHealth(
            name="opensearch",
            status=verdict,  # type: ignore[arg-type]
            detail={"cluster_status": colour, "nodes": report.get("number_of_nodes")},
        )
    except Exception as failure:  # noqa: BLE001
        return ComponentHealth(name="opensearch", status="error", detail={"error": str(failure)})
 def _check_qdrant() -> ComponentHealth:
    """Probe Qdrant by listing its collections.

    On success the collection names are returned in ``detail``; any exception
    (including a failing client import) becomes an ``error`` component.
    """
    try:
        # Imported inside the guard so an import failure is reported, not raised.
        from app.indexing.qdrant_client import get_qdrant

        listing = get_qdrant().get_collections()
        names = [collection.name for collection in listing.collections]
        return ComponentHealth(name="qdrant", status="ok", detail={"collections": names})
    except Exception as failure:  # noqa: BLE001
        return ComponentHealth(name="qdrant", status="error", detail={"error": str(failure)})
 def _check_redis() -> ComponentHealth:
    """Probe Redis with a ``PING`` against ``settings.redis_url``.

    Both the connect and the read are bounded to 2 seconds so a half-open
    connection cannot stall the health endpoint, and the client is closed after
    the probe so repeated health checks do not accumulate connections.

    Never raises: any failure (including a missing ``redis`` package) is
    reported as an ``error`` component whose ``detail`` carries the exception
    text.
    """
    try:
        import redis

        client = redis.Redis.from_url(
            settings.redis_url,
            socket_connect_timeout=2,
            socket_timeout=2,
        )
        try:
            client.ping()
        finally:
            # Release the client's connections whether or not the ping succeeded.
            client.close()
        return ComponentHealth(name="redis", status="ok")
    except Exception as exc:  # noqa: BLE001
        return ComponentHealth(name="redis", status="error", detail={"error": str(exc)})
@router.get("/health", response_model=HealthResponse)
 def health() -> HealthResponse:
    components = [
        _check_postgres(),
        _check_minio(),
        _check_opensearch(),
        _check_qdrant(),
        _check_redis(),
    ]
    if any(c.status == "error" for c in components):
        overall = "error"
    elif any(c.status == "degraded" for c in components):
        overall = "degraded"
    else:
        overall = "ok"
    return HealthResponse(status=overall, version=__version__, components=components)  # type: ignore[arg-type]
--- a/app/api/routes_ingestion.py
+++ b/app/api/routes_ingestion.py
@@ -0,0 +1,63 @@
 """Ingestion endpoints."""
 from __future__ import annotations
 import uuid
 from pathlib import Path
 from fastapi import APIRouter, HTTPException
 from app.api.schemas import IngestFolderRequest, IngestFolderResponse
 from app.logging_config import get_logger
 logger = get_logger(__name__)
 router = APIRouter(prefix="/ingest", tags=["ingestion"])
@router.post("/folder", response_model=IngestFolderResponse)
 def ingest_folder(req: IngestFolderRequest) -> IngestFolderResponse:
    """Discover all PDFs under ``path`` and queue them for processing.
    The request returns immediately after the discovery pass. Per-document
    OCR / extraction / indexing happens asynchronously in Celery workers.
    """
    folder = Path(req.path)
    if not folder.exists() or not folder.is_dir():
        raise HTTPException(status_code=400, detail=f"Folder not found: {req.path}")
    # Lazy import - keeps module load light.
    from app.ingestion.scanner import discover_documents
    from app.workers.tasks import process_document
    run_id = uuid.uuid4()
    discovered, queued, dups, invalid = 0, 0, 0, 0
    for record in discover_documents(folder, recursive=req.recursive, force=req.force):
        discovered += 1
        if record.duplicate and not req.force:
            dups += 1
            continue
        if not record.document_id:
            invalid += 1
            continue
        process_document.delay(str(record.document_id), str(run_id))
        queued += 1
    logger.info(
        "ingest.folder.queued",
        path=str(folder),
        discovered=discovered,
        queued=queued,
        skipped_duplicates=dups,
        invalid=invalid,
        run_id=str(run_id),
    )
    return IngestFolderResponse(
        run_id=run_id,
        discovered=discovered,
        queued=queued,
        skipped_duplicates=dups,
        invalid_files=invalid,
    )
--- a/app/api/routes_search.py
+++ b/app/api/routes_search.py
@@ -0,0 +1,16 @@
 """Search endpoint - lexical / semantic / hybrid."""
 from __future__ import annotations
 from fastapi import APIRouter
 from app.api.schemas import SearchRequest, SearchResponse
 router = APIRouter(prefix="/search", tags=["search"])
@router.post("", response_model=SearchResponse)
 def search(req: SearchRequest) -> SearchResponse:
    from app.indexing.hybrid_search import run_search
    return run_search(req)
--- a/app/api/schemas.py
+++ b/app/api/schemas.py
@@ -0,0 +1,99 @@
 """Pydantic request/response schemas for the LegacyHUB API."""
 from __future__ import annotations
 import uuid
 from datetime import datetime
 from typing import Any, Literal
 from pydantic import BaseModel, Field
 # ---------------- Health ----------------
 class ComponentHealth(BaseModel):
    # Health report for a single component.
    # name: component identifier.
    name: str
    # status: exactly one of "ok", "error" or "degraded".
    status: Literal["ok", "error", "degraded"]
    # detail: free-form diagnostic payload; a fresh empty dict when omitted.
    detail: dict[str, Any] = Field(default_factory=dict)
 class HealthResponse(BaseModel):
    # Aggregate health report.
    # status: overall verdict, one of "ok", "error" or "degraded".
    status: Literal["ok", "error", "degraded"]
    # version: version string reported alongside the status.
    version: str
    # components: one entry per component report.
    components: list[ComponentHealth]
 # ---------------- Ingestion ----------------
 class IngestFolderRequest(BaseModel):
    # Request body for folder ingestion.
    # path: required; described as an absolute path inside the API container.
    path: str = Field(..., description="Absolute path inside the API container")
    # recursive: defaults to True.
    recursive: bool = True
    # force: defaults to False.
    force: bool = False
 class IngestFolderResponse(BaseModel):
    # Counters describing the outcome of one folder ingestion request.
    # run_id: identifier of the ingestion run.
    run_id: uuid.UUID
    # discovered / queued / skipped_duplicates / invalid_files: per-outcome counts.
    discovered: int
    queued: int
    skipped_duplicates: int
    invalid_files: int
 class DocumentSummary(BaseModel):
    # Summary view of a document: identity, source location, hash, status,
    # size and creation time.
    # NOTE(review): field names mirror columns of the `documents` table -
    # presumably populated from it; confirm against the caller.
    id: uuid.UUID
    original_file_name: str
    source_path: str
    sha256: str
    status: str
    file_size_bytes: int
    created_at: datetime
 # ---------------- Search ----------------
 SearchMode = Literal["lexical", "semantic", "hybrid"]
 class SearchFilters(BaseModel):
    # Optional search filters; every field defaults to None (unset).
    document_id: uuid.UUID | None = None
    source_path: str | None = None
    block_type: str | None = None
    # Validated to lie within [0.0, 1.0] when provided.
    min_ocr_confidence: float | None = Field(None, ge=0.0, le=1.0)
 class SearchRequest(BaseModel):
    # Search request body.
    # query: required, at least one character long.
    query: str = Field(..., min_length=1)
    # limit: defaults to 10; validated to lie within 1..100.
    limit: int = Field(10, ge=1, le=100)
    # filters: defaults to a SearchFilters with nothing set.
    filters: SearchFilters = Field(default_factory=SearchFilters)
    # search_mode: "lexical", "semantic" or "hybrid"; defaults to "hybrid".
    search_mode: SearchMode = "hybrid"
 class Citation(BaseModel):
    # Source reference attached to a search hit: a PDF and page, plus optional
    # block / table / figure identifiers (each None when not set).
    pdf: str
    page: int
    block_id: str | None = None
    table_id: str | None = None
    figure_id: str | None = None
 class SearchHit(BaseModel):
    # One ranked search result.
    # rank / score: position and score of the hit.
    rank: int
    score: float
    # Identity and location of the matched chunk.
    document_id: uuid.UUID
    chunk_id: uuid.UUID
    original_file_name: str
    source_path: str
    page_number: int
    block_type: str
    # text: text of the hit.
    text: str
    # citation: structured source reference for the hit.
    citation: Citation
    # quality_flags / metadata: free-form dicts; fresh empty dicts when omitted.
    quality_flags: dict[str, Any] = Field(default_factory=dict)
    metadata: dict[str, Any] = Field(default_factory=dict)
 class SearchResponse(BaseModel):
    # Search response envelope.
    # query / mode: the query string and a search mode ("lexical", "semantic" or "hybrid").
    query: str
    mode: SearchMode
    # total_candidates: candidate count reported with the results.
    total_candidates: int
    # reranked: boolean flag - presumably whether a reranker was applied; confirm in run_search.
    reranked: bool
    # results: the list of hits.
    results: list[SearchHit]
--- a/app/config.py
+++ b/app/config.py
@@ -0,0 +1,111 @@
 """Centralized typed configuration loaded from environment variables.
 All other modules import :data:`settings` and never touch ``os.environ`` directly.
 """
 from __future__ import annotations
 from functools import lru_cache
 from typing import Literal
 from pydantic import Field
 from pydantic_settings import BaseSettings, SettingsConfigDict
 class Settings(BaseSettings):
    """Typed application settings read from the environment and ``.env``.

    Each field's ``alias`` is the environment variable it is populated from.
    Variable names are matched case-insensitively, the ``.env`` file is read as
    UTF-8, and variables that match no field are ignored. Every field has a
    default, so the class can be instantiated with no environment at all.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )
    # ---------------- App ----------------
    app_log_level: str = Field("INFO", alias="APP_LOG_LEVEL")
    app_host: str = Field("0.0.0.0", alias="APP_HOST")
    app_port: int = Field(8000, alias="APP_PORT")
    app_input_dir: str = Field("/data/input", alias="APP_INPUT_DIR")
    app_work_dir: str = Field("/data/work", alias="APP_WORK_DIR")
    app_api_prefix: str = Field("/api/v1", alias="APP_API_PREFIX")
    # ---------------- Postgres ----------------
    postgres_host: str = Field("postgres", alias="POSTGRES_HOST")
    postgres_port: int = Field(5432, alias="POSTGRES_PORT")
    postgres_db: str = Field("legacyhub", alias="POSTGRES_DB")
    postgres_user: str = Field("legacyhub", alias="POSTGRES_USER")
    postgres_password: str = Field("legacyhub", alias="POSTGRES_PASSWORD")

    @property
    def database_url(self) -> str:
        """SQLAlchemy URL for the configured PostgreSQL database (psycopg driver).

        The user name and password are percent-encoded so that credentials
        containing URL-reserved characters (``@``, ``:``, ``/``, ``%`` ...)
        still yield a URL that parses back to the original values. Values made
        only of unreserved characters are emitted unchanged.

        Because the result may contain ``%`` escapes, callers that pass it to a
        ConfigParser-backed API must double each ``%`` first.
        """
        # Function-scope import keeps this change local to the property.
        from urllib.parse import quote

        # safe="" so that "/" is escaped too; quote() (not quote_plus) is used
        # so that a space becomes "%20" rather than "+".
        user = quote(self.postgres_user, safe="")
        password = quote(self.postgres_password, safe="")
        return (
            f"postgresql+psycopg://{user}:{password}"
            f"@{self.postgres_host}:{self.postgres_port}/{self.postgres_db}"
        )

    # ---------------- MinIO ----------------
    minio_endpoint: str = Field("minio:9000", alias="MINIO_ENDPOINT")
    minio_access_key: str = Field("legacyhub", alias="MINIO_ACCESS_KEY")
    minio_secret_key: str = Field("legacyhub-secret", alias="MINIO_SECRET_KEY")
    minio_bucket_originals: str = Field("legacyhub-originals", alias="MINIO_BUCKET_ORIGINALS")
    minio_bucket_derived: str = Field("legacyhub-derived", alias="MINIO_BUCKET_DERIVED")
    minio_secure: bool = Field(False, alias="MINIO_SECURE")
    minio_region: str = Field("us-east-1", alias="MINIO_REGION")
    # ---------------- OpenSearch ----------------
    opensearch_host: str = Field("opensearch", alias="OPENSEARCH_HOST")
    opensearch_port: int = Field(9200, alias="OPENSEARCH_PORT")
    opensearch_use_ssl: bool = Field(False, alias="OPENSEARCH_USE_SSL")
    opensearch_verify_certs: bool = Field(False, alias="OPENSEARCH_VERIFY_CERTS")
    opensearch_user: str = Field("", alias="OPENSEARCH_USER")
    opensearch_password: str = Field("", alias="OPENSEARCH_PASSWORD")
    opensearch_index_chunks: str = Field("legacy_chunks", alias="OPENSEARCH_INDEX_CHUNKS")
    # ---------------- Qdrant ----------------
    qdrant_host: str = Field("qdrant", alias="QDRANT_HOST")
    qdrant_port: int = Field(6333, alias="QDRANT_PORT")
    qdrant_api_key: str = Field("", alias="QDRANT_API_KEY")
    qdrant_collection_chunks: str = Field("legacy_chunks", alias="QDRANT_COLLECTION_CHUNKS")
    # ---------------- Redis ----------------
    redis_url: str = Field("redis://redis:6379/0", alias="REDIS_URL")
    # ---------------- OCR ----------------
    ocr_languages: str = Field("rus+eng", alias="OCR_LANGUAGES")
    ocr_enabled: bool = Field(True, alias="OCR_ENABLED")
    docling_ocr_enabled: bool = Field(False, alias="DOCLING_OCR_ENABLED")
    max_document_timeout_seconds: int = Field(180, alias="MAX_DOCUMENT_TIMEOUT_SECONDS")
    ocr_deskew: bool = Field(True, alias="OCR_DESKEW")
    ocr_clean: bool = Field(True, alias="OCR_CLEAN")
    ocr_optimize: int = Field(1, alias="OCR_OPTIMIZE")
    # ---------------- Embeddings / Reranker ----------------
    embedding_model: str = Field("BAAI/bge-m3", alias="EMBEDDING_MODEL")
    embedding_dim: int = Field(1024, alias="EMBEDDING_DIM")
    embedding_device: Literal["cpu", "cuda", "mps"] = Field("cpu", alias="EMBEDDING_DEVICE")
    embedding_batch_size: int = Field(8, alias="EMBEDDING_BATCH_SIZE")
    embedding_normalize: bool = Field(True, alias="EMBEDDING_NORMALIZE")
    reranker_model: str = Field("BAAI/bge-reranker-v2-m3", alias="RERANKER_MODEL")
    reranker_device: Literal["cpu", "cuda", "mps"] = Field("cpu", alias="RERANKER_DEVICE")
    reranker_enabled: bool = Field(True, alias="RERANKER_ENABLED")
    reranker_batch_size: int = Field(8, alias="RERANKER_BATCH_SIZE")
    # ---------------- Chunking ----------------
    chunk_target_tokens: int = Field(700, alias="CHUNK_TARGET_TOKENS")
    chunk_min_tokens: int = Field(120, alias="CHUNK_MIN_TOKENS")
    chunk_max_tokens: int = Field(900, alias="CHUNK_MAX_TOKENS")
    chunk_overlap_tokens: int = Field(100, alias="CHUNK_OVERLAP_TOKENS")
    # ---------------- Hybrid search ----------------
    hybrid_opensearch_top_k: int = Field(50, alias="HYBRID_OPENSEARCH_TOP_K")
    hybrid_qdrant_top_k: int = Field(50, alias="HYBRID_QDRANT_TOP_K")
    hybrid_rrf_k: int = Field(60, alias="HYBRID_RRF_K")
    rerank_candidates: int = Field(40, alias="RERANK_CANDIDATES")
@lru_cache(maxsize=1)
 def get_settings() -> Settings:
    return Settings()  # type: ignore[call-arg]
 settings = get_settings()
--- a/app/db/init.py
+++ b/app/db/init.py
@@ -0,0 +1,3 @@
 from app.db.models import Base
 __all__ = ["Base"]
--- a/app/db/migrations/env.py
+++ b/app/db/migrations/env.py
@@ -0,0 +1,55 @@
 """Alembic environment - online & offline migrations using app config."""
 from __future__ import annotations
 from logging.config import fileConfig
 from alembic import context
 from sqlalchemy import engine_from_config, pool
 from app.config import settings
 from app.db.models import Base
 config = context.config
 # Alembic stores main options in a ConfigParser, which treats "%" as
 # interpolation syntax. A database URL containing "%" (for example inside the
 # password) would make set_main_option raise, so each "%" is doubled here;
 # reading the option back yields the original URL.
 config.set_main_option("sqlalchemy.url", settings.database_url.replace("%", "%%"))
 # Apply the logging sections of the ini file when Alembic was given one.
 if config.config_file_name is not None:
    fileConfig(config.config_file_name)
 # Metadata that the migration context is configured against below.
 target_metadata = Base.metadata
 def run_migrations_offline() -> None:
    """Run migrations in offline mode.

    The context is configured from the database URL alone - no engine or
    connection is created - with literal binds and the ``named`` paramstyle.
    """
    offline_options = {
        "url": settings.database_url,
        "target_metadata": target_metadata,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
        "compare_type": True,
    }
    context.configure(**offline_options)
    with context.begin_transaction():
        context.run_migrations()
 def run_migrations_online() -> None:
    """Run migrations over a live connection.

    The engine is built from the ini section with its URL overridden by the
    application settings, and uses ``NullPool``.
    """
    engine_options = config.get_section(config.config_ini_section, {})
    # The application settings, not the ini file, decide which database is used.
    engine_options["sqlalchemy.url"] = settings.database_url
    engine = engine_from_config(engine_options, prefix="sqlalchemy.", poolclass=pool.NullPool)
    with engine.connect() as live_connection:
        context.configure(
            connection=live_connection,
            target_metadata=target_metadata,
            compare_type=True,
        )
        with context.begin_transaction():
            context.run_migrations()
 # Module-level entry point: choose the runner matching Alembic's current mode and call it.
 (run_migrations_offline if context.is_offline_mode() else run_migrations_online)()
--- a/app/db/migrations/script.py.mako
+++ b/app/db/migrations/script.py.mako
@@ -0,0 +1,27 @@
 """${message}
 Revision ID: ${up_revision}
 Revises: ${down_revision | comma,n}
 Create Date: ${create_date}
 """
 from __future__ import annotations
 from collections.abc import Sequence
 from alembic import op
 import sqlalchemy as sa
 ${imports if imports else ""}
 revision: str = ${repr(up_revision)}
 down_revision: str | None = ${repr(down_revision)}
 branch_labels: str | Sequence[str] | None = ${repr(branch_labels)}
 depends_on: str | Sequence[str] | None = ${repr(depends_on)}
 def upgrade() -> None:
    ${upgrades if upgrades else "pass"}
 def downgrade() -> None:
    ${downgrades if downgrades else "pass"}
--- a/app/db/migrations/versions/0001_initial.py
+++ b/app/db/migrations/versions/0001_initial.py
@@ -0,0 +1,171 @@
 """initial schema
 Revision ID: 0001_initial
 Revises:
 Create Date: 2026-05-10
 """
 from __future__ import annotations
 from collections.abc import Sequence
 import sqlalchemy as sa
 from alembic import op
 from sqlalchemy.dialects import postgresql
 revision: str = "0001_initial"
 down_revision: str | None = None
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 def upgrade() -> None:
    """Create the initial schema: eight tables and their secondary indexes.

    Tables are created parents-first (``documents`` before everything that
    references ``documents.id``, ``pages`` before the tables that reference
    ``pages.id``) so every foreign-key target exists when it is referenced.
    """
    # documents: file identity (path, name, sha256, size) plus processing status.
    op.create_table(
        "documents",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("source_path", sa.Text, nullable=False),
        sa.Column("original_file_name", sa.Text, nullable=False),
        sa.Column("sha256", sa.String(64), nullable=False, unique=True),
        sa.Column("file_size_bytes", sa.BigInteger, nullable=False),
        sa.Column("mime_type", sa.Text, nullable=False, server_default="application/pdf"),
        sa.Column("language_hint", sa.Text, nullable=True),
        sa.Column("status", sa.String(64), nullable=False, server_default="DISCOVERED"),
        sa.Column("error_message", sa.Text, nullable=True),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
        sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
    )
    op.create_index("ix_documents_status", "documents", ["status"])
    # sha256 is already declared unique above; this adds a separately named
    # index on the same column.
    op.create_index("ix_documents_sha256", "documents", ["sha256"])
    # document_artifacts: storage bucket/key of each artifact; rows are removed
    # with their document (ON DELETE CASCADE).
    op.create_table(
        "document_artifacts",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("document_id", postgresql.UUID(as_uuid=True),
                  sa.ForeignKey("documents.id", ondelete="CASCADE"), nullable=False),
        sa.Column("artifact_type", sa.String(64), nullable=False),
        sa.Column("storage_bucket", sa.Text, nullable=False),
        sa.Column("storage_key", sa.Text, nullable=False),
        sa.Column("page_number", sa.Integer, nullable=True),
        sa.Column("checksum", sa.String(64), nullable=True),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
    )
    op.create_index("ix_artifacts_doc_type", "document_artifacts", ["document_id", "artifact_type"])
    # pages: per-page text and content flags; (document_id, page_number) is unique.
    op.create_table(
        "pages",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("document_id", postgresql.UUID(as_uuid=True),
                  sa.ForeignKey("documents.id", ondelete="CASCADE"), nullable=False),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("text", sa.Text, nullable=False, server_default=""),
        sa.Column("ocr_confidence", sa.Float, nullable=True),
        sa.Column("has_tables", sa.Boolean, nullable=False, server_default=sa.false()),
        sa.Column("has_figures", sa.Boolean, nullable=False, server_default=sa.false()),
        sa.Column("has_handwriting", sa.Boolean, nullable=False, server_default=sa.false()),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
        sa.UniqueConstraint("document_id", "page_number", name="uq_pages_doc_page"),
    )
    # chunks: text chunks per document; (document_id, chunk_index) is unique.
    # page_id is nulled (not cascaded) when the referenced page row is deleted.
    op.create_table(
        "chunks",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("document_id", postgresql.UUID(as_uuid=True),
                  sa.ForeignKey("documents.id", ondelete="CASCADE"), nullable=False),
        sa.Column("page_id", postgresql.UUID(as_uuid=True),
                  sa.ForeignKey("pages.id", ondelete="SET NULL"), nullable=True),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("block_id", sa.Text, nullable=True),
        sa.Column("chunk_index", sa.Integer, nullable=False),
        sa.Column("block_type", sa.String(32), nullable=False, server_default="paragraph"),
        sa.Column("text", sa.Text, nullable=False),
        sa.Column("normalized_text", sa.Text, nullable=False, server_default=""),
        sa.Column("token_count", sa.Integer, nullable=True),
        sa.Column("ocr_confidence", sa.Float, nullable=True),
        sa.Column("quality_flags", postgresql.JSONB, nullable=False, server_default=sa.text("'{}'::jsonb")),
        sa.Column("metadata", postgresql.JSONB, nullable=False, server_default=sa.text("'{}'::jsonb")),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
        sa.UniqueConstraint("document_id", "chunk_index", name="uq_chunks_doc_idx"),
    )
    op.create_index("ix_chunks_doc_page", "chunks", ["document_id", "page_number"])
    op.create_index("ix_chunks_block_type", "chunks", ["block_type"])
    # tables: extracted tables (markdown / CSV / JSON forms); (document_id, table_index) is unique.
    op.create_table(
        "tables",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("document_id", postgresql.UUID(as_uuid=True),
                  sa.ForeignKey("documents.id", ondelete="CASCADE"), nullable=False),
        sa.Column("page_id", postgresql.UUID(as_uuid=True),
                  sa.ForeignKey("pages.id", ondelete="SET NULL"), nullable=True),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("table_index", sa.Integer, nullable=False),
        sa.Column("markdown", sa.Text, nullable=False, server_default=""),
        sa.Column("csv_text", sa.Text, nullable=True),
        sa.Column("json_data", postgresql.JSONB, nullable=True),
        sa.Column("summary", sa.Text, nullable=True),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
        sa.UniqueConstraint("document_id", "table_index", name="uq_tables_doc_idx"),
    )
    # figures: caption, description and optional storage location; (document_id, figure_index) is unique.
    op.create_table(
        "figures",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("document_id", postgresql.UUID(as_uuid=True),
                  sa.ForeignKey("documents.id", ondelete="CASCADE"), nullable=False),
        sa.Column("page_id", postgresql.UUID(as_uuid=True),
                  sa.ForeignKey("pages.id", ondelete="SET NULL"), nullable=True),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("figure_index", sa.Integer, nullable=False),
        sa.Column("caption", sa.Text, nullable=True),
        sa.Column("description", sa.Text, nullable=True),
        sa.Column("storage_bucket", sa.Text, nullable=True),
        sa.Column("storage_key", sa.Text, nullable=True),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
        sa.UniqueConstraint("document_id", "figure_index", name="uq_figures_doc_idx"),
    )
    # ingestion_runs: per-run status and file counters for a source folder.
    op.create_table(
        "ingestion_runs",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("started_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
        sa.Column("finished_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column("status", sa.String(32), nullable=False, server_default="RUNNING"),
        sa.Column("source_folder", sa.Text, nullable=False),
        sa.Column("total_files", sa.Integer, nullable=False, server_default="0"),
        sa.Column("processed_files", sa.Integer, nullable=False, server_default="0"),
        sa.Column("failed_files", sa.Integer, nullable=False, server_default="0"),
        sa.Column("metadata", postgresql.JSONB, nullable=False, server_default=sa.text("'{}'::jsonb")),
    )
    # processing_events: stage/level/message log rows. run_id and document_id
    # are plain nullable UUID columns here - no foreign keys are declared.
    op.create_table(
        "processing_events",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("run_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("document_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("stage", sa.String(64), nullable=False),
        sa.Column("level", sa.String(16), nullable=False, server_default="INFO"),
        sa.Column("message", sa.Text, nullable=False),
        sa.Column("data", postgresql.JSONB, nullable=False, server_default=sa.text("'{}'::jsonb")),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
    )
    op.create_index("ix_events_doc", "processing_events", ["document_id"])
    op.create_index("ix_events_run", "processing_events", ["run_id"])
    op.create_index("ix_events_stage", "processing_events", ["stage"])
 def downgrade() -> None:
    """Drop everything the initial migration created.

    Tables are dropped in the reverse of their creation order, and for each
    table its explicitly created indexes are dropped first.
    """
    # (indexes to drop first, table) pairs, in execution order.
    teardown_plan = (
        (("ix_events_stage", "ix_events_run", "ix_events_doc"), "processing_events"),
        ((), "ingestion_runs"),
        ((), "figures"),
        ((), "tables"),
        (("ix_chunks_block_type", "ix_chunks_doc_page"), "chunks"),
        ((), "pages"),
        (("ix_artifacts_doc_type",), "document_artifacts"),
        (("ix_documents_sha256", "ix_documents_status"), "documents"),
    )
    for index_names, table in teardown_plan:
        for index_name in index_names:
            op.drop_index(index_name, table_name=table)
        op.drop_table(table)
--- a/app/db/models.py
+++ b/app/db/models.py
@@ -0,0 +1,266 @@
 """SQLAlchemy ORM models for LegacyHUB."""
 from __future__ import annotations
 import uuid
 from datetime import datetime
 from typing import Any
 from sqlalchemy import (
    BigInteger,
    Boolean,
    DateTime,
    Float,
    ForeignKey,
    Index,
    Integer,
    String,
    Text,
    UniqueConstraint,
    func,
 )
 from sqlalchemy.dialects.postgresql import JSONB, UUID
 from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
 class Base(DeclarativeBase):
    """Declarative base class that every ORM model in this module inherits from."""
    pass
 # ---- Status / type literals (kept as plain strings to avoid PG enum churn) ----
 class DocumentStatus:
    """Plain-string status values; ``Document.status`` defaults to ``DISCOVERED``.

    The names suggest a pipeline order (store -> OCR -> extraction -> chunking ->
    indexing) with per-stage and generic failure states; the transitions
    themselves are not defined in this module.
    """
    DISCOVERED = "DISCOVERED"
    STORED_ORIGINAL = "STORED_ORIGINAL"
    OCR_STARTED = "OCR_STARTED"
    OCR_COMPLETED = "OCR_COMPLETED"
    OCR_FAILED = "OCR_FAILED"
    EXTRACTION_STARTED = "EXTRACTION_STARTED"
    EXTRACTION_COMPLETED = "EXTRACTION_COMPLETED"
    EXTRACTION_FAILED = "EXTRACTION_FAILED"
    CHUNKING_COMPLETED = "CHUNKING_COMPLETED"
    INDEXING_COMPLETED = "INDEXING_COMPLETED"
    FAILED = "FAILED"
 class ArtifactType:
    """Plain-string names for kinds of stored artifact.

    NOTE(review): presumably the values written to
    ``DocumentArtifact.artifact_type`` - confirm against the writers.
    """
    ORIGINAL_PDF = "original_pdf"
    OCR_PDF = "ocr_pdf"
    DOCLING_JSON = "docling_json"
    MARKDOWN = "markdown"
    PAGE_IMAGE = "page_image"
    FIGURE_CROP = "figure_crop"
    TABLE_JSON = "table_json"
 class BlockType:
    """Plain-string block kinds; ``Chunk.block_type`` defaults to ``PARAGRAPH``."""
    TITLE = "title"
    HEADING = "heading"
    PARAGRAPH = "paragraph"
    LIST = "list"
    TABLE = "table"
    FIGURE_CAPTION = "figure_caption"
    FIGURE_DESCRIPTION = "figure_description"
    HANDWRITING = "handwriting"
    UNKNOWN = "unknown"
 # ---- Tables ----
 class Document(Base):
    """One row of the ``documents`` table: a source file tracked by the pipeline.

    Owns its artifacts, pages and chunks: all three relationships cascade
    ``all, delete-orphan``.
    """
    __tablename__ = "documents"
    # Primary key is generated client-side with uuid4.
    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    source_path: Mapped[str] = mapped_column(Text, nullable=False)
    original_file_name: Mapped[str] = mapped_column(Text, nullable=False)
    # Unique and indexed, so the same digest cannot be stored twice.
    sha256: Mapped[str] = mapped_column(String(64), nullable=False, unique=True, index=True)
    file_size_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False)
    mime_type: Mapped[str] = mapped_column(Text, nullable=False, default="application/pdf")
    language_hint: Mapped[str | None] = mapped_column(Text, nullable=True)
    # One of the DocumentStatus string constants; indexed for status lookups.
    status: Mapped[str] = mapped_column(
        String(64), nullable=False, default=DocumentStatus.DISCOVERED, index=True
    )
    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
    # Refreshed with func.now() on every ORM update via ``onupdate``.
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False
    )
    artifacts: Mapped[list[DocumentArtifact]] = relationship(
        back_populates="document", cascade="all, delete-orphan"
    )
    pages: Mapped[list[Page]] = relationship(
        back_populates="document", cascade="all, delete-orphan"
    )
    chunks: Mapped[list[Chunk]] = relationship(
        back_populates="document", cascade="all, delete-orphan"
    )
 class DocumentArtifact(Base):
    """One row of ``document_artifacts``: an object-store reference for a document.

    The object is addressed by ``storage_bucket`` + ``storage_key``; rows are
    indexed by ``(document_id, artifact_type)``.
    """
    __tablename__ = "document_artifacts"
    __table_args__ = (
        Index("ix_artifacts_doc_type", "document_id", "artifact_type"),
    )
    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Deleting the parent document deletes its artifact rows (ON DELETE CASCADE).
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False
    )
    # NOTE(review): presumably one of the ArtifactType constants - confirm.
    artifact_type: Mapped[str] = mapped_column(String(64), nullable=False)
    storage_bucket: Mapped[str] = mapped_column(Text, nullable=False)
    storage_key: Mapped[str] = mapped_column(Text, nullable=False)
    # Nullable: not every artifact is tied to a single page.
    page_number: Mapped[int | None] = mapped_column(Integer, nullable=True)
    checksum: Mapped[str | None] = mapped_column(String(64), nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
    document: Mapped[Document] = relationship(back_populates="artifacts")
 class Page(Base):
    """One row of ``pages``: the text and content flags of a single document page.

    ``(document_id, page_number)`` is unique.
    """
    __tablename__ = "pages"
    __table_args__ = (
        UniqueConstraint("document_id", "page_number", name="uq_pages_doc_page"),
    )
    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Deleting the parent document deletes its pages (ON DELETE CASCADE).
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False
    )
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    text: Mapped[str] = mapped_column(Text, nullable=False, default="")
    ocr_confidence: Mapped[float | None] = mapped_column(Float, nullable=True)
    # Content flags, all defaulting to False.
    has_tables: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    has_figures: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    has_handwriting: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
    document: Mapped[Document] = relationship(back_populates="pages")
    # No cascade here: chunks are owned by the document, not by the page.
    chunks: Mapped[list[Chunk]] = relationship(back_populates="page")
 class Chunk(Base):
    """One row of ``chunks``: a text block of a document, the unit that gets indexed.

    ``(document_id, chunk_index)`` is unique; extra indexes cover
    ``(document_id, page_number)`` and ``block_type``.
    """
    __tablename__ = "chunks"
    __table_args__ = (
        UniqueConstraint("document_id", "chunk_index", name="uq_chunks_doc_idx"),
        Index("ix_chunks_doc_page", "document_id", "page_number"),
        Index("ix_chunks_block_type", "block_type"),
    )
    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Deleting the parent document deletes its chunks (ON DELETE CASCADE).
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False
    )
    # Optional link to the page row; nulled (not deleted) when the page goes away.
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True), ForeignKey("pages.id", ondelete="SET NULL"), nullable=True
    )
    # Kept as its own column, so it survives page_id being set to NULL.
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    block_id: Mapped[str | None] = mapped_column(Text, nullable=True)
    chunk_index: Mapped[int] = mapped_column(Integer, nullable=False)
    block_type: Mapped[str] = mapped_column(String(32), nullable=False, default=BlockType.PARAGRAPH)
    text: Mapped[str] = mapped_column(Text, nullable=False)
    normalized_text: Mapped[str] = mapped_column(Text, nullable=False, default="")
    token_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    ocr_confidence: Mapped[float | None] = mapped_column(Float, nullable=True)
    quality_flags: Mapped[dict[str, Any]] = mapped_column(JSONB, nullable=False, default=dict)
    # Stored in the "metadata" column; the attribute has a different name,
    # presumably because ``metadata`` is reserved on declarative classes.
    chunk_metadata: Mapped[dict[str, Any]] = mapped_column(
        "metadata", JSONB, nullable=False, default=dict
    )
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
    document: Mapped[Document] = relationship(back_populates="chunks")
    page: Mapped[Page | None] = relationship(back_populates="chunks")
 class Table(Base):
    """One row of ``tables``: a table extracted from a document page.

    ``(document_id, table_index)`` is unique. The content can be held as
    markdown, CSV text and/or JSON. No ORM relationships are declared here.
    """
    __tablename__ = "tables"
    __table_args__ = (
        UniqueConstraint("document_id", "table_index", name="uq_tables_doc_idx"),
    )
    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Deleting the parent document deletes its tables (ON DELETE CASCADE).
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False
    )
    # Optional link to the page row; nulled (not deleted) when the page goes away.
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True), ForeignKey("pages.id", ondelete="SET NULL"), nullable=True
    )
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    table_index: Mapped[int] = mapped_column(Integer, nullable=False)
    # Markdown is always present (empty string by default); the rest is optional.
    markdown: Mapped[str] = mapped_column(Text, nullable=False, default="")
    csv_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    json_data: Mapped[dict[str, Any] | None] = mapped_column(JSONB, nullable=True)
    summary: Mapped[str | None] = mapped_column(Text, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
 class Figure(Base):
    """One row of ``figures``: a figure found on a document page.

    ``(document_id, figure_index)`` is unique. The object-store location is
    optional. No ORM relationships are declared here.
    """
    __tablename__ = "figures"
    __table_args__ = (
        UniqueConstraint("document_id", "figure_index", name="uq_figures_doc_idx"),
    )
    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Deleting the parent document deletes its figures (ON DELETE CASCADE).
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False
    )
    # Optional link to the page row; nulled (not deleted) when the page goes away.
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True), ForeignKey("pages.id", ondelete="SET NULL"), nullable=True
    )
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    figure_index: Mapped[int] = mapped_column(Integer, nullable=False)
    caption: Mapped[str | None] = mapped_column(Text, nullable=True)
    description: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Both nullable: a figure row may exist without a stored object.
    storage_bucket: Mapped[str | None] = mapped_column(Text, nullable=True)
    storage_key: Mapped[str | None] = mapped_column(Text, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
 class IngestionRun(Base):
    """One row of ``ingestion_runs``: bookkeeping for one ingestion of a source folder."""
    __tablename__ = "ingestion_runs"
    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    started_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
    # Nullable, so a run row can exist before it has a finish time.
    finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
    status: Mapped[str] = mapped_column(String(32), nullable=False, default="RUNNING")
    source_folder: Mapped[str] = mapped_column(Text, nullable=False)
    # File counters, all starting at 0.
    total_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    processed_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    failed_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    # Stored in the "metadata" column; the attribute has a different name,
    # presumably because ``metadata`` is reserved on declarative classes.
    run_metadata: Mapped[dict[str, Any]] = mapped_column(
        "metadata", JSONB, nullable=False, default=dict
    )
 class ProcessingEvent(Base):
    """One row of ``processing_events``: a log-style event for a pipeline stage.

    ``run_id`` and ``document_id`` are plain nullable UUID columns without
    foreign keys, so event rows are not tied to the referenced rows by the
    database. Indexed by document, run and stage.
    """
    __tablename__ = "processing_events"
    __table_args__ = (
        Index("ix_events_doc", "document_id"),
        Index("ix_events_run", "run_id"),
        Index("ix_events_stage", "stage"),
    )
    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    run_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), nullable=True)
    document_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), nullable=True)
    stage: Mapped[str] = mapped_column(String(64), nullable=False)
    # Severity label stored as text; defaults to "INFO".
    level: Mapped[str] = mapped_column(String(16), nullable=False, default="INFO")
    message: Mapped[str] = mapped_column(Text, nullable=False)
    # Free-form JSON payload attached to the event.
    data: Mapped[dict[str, Any]] = mapped_column(JSONB, nullable=False, default=dict)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
--- a/app/db/session.py
+++ b/app/db/session.py
@@ -0,0 +1,66 @@
 """SQLAlchemy engine and session factory."""
 from __future__ import annotations
 from collections.abc import Iterator
 from contextlib import contextmanager
 from sqlalchemy import create_engine
 from sqlalchemy.engine import Engine
 from sqlalchemy.orm import Session, sessionmaker
 from app.config import settings
 _engine: Engine | None = None
 _SessionFactory: sessionmaker[Session] | None = None
 def get_engine() -> Engine:
    """Return the module-level engine, building it from ``settings.database_url`` on first use."""
    global _engine
    if _engine is not None:
        return _engine
    # Connections are pinged before use; 10 pooled plus up to 20 overflow.
    pool_options = {"pool_pre_ping": True, "pool_size": 10, "max_overflow": 20}
    _engine = create_engine(settings.database_url, future=True, **pool_options)
    return _engine
 def get_session_factory() -> sessionmaker[Session]:
    """Return the module-level ``sessionmaker``, creating it (and the engine) on first use."""
    global _SessionFactory
    if _SessionFactory is not None:
        return _SessionFactory
    # Sessions neither autoflush nor expire attributes after commit.
    session_options = {"autoflush": False, "autocommit": False, "expire_on_commit": False}
    _SessionFactory = sessionmaker(bind=get_engine(), future=True, **session_options)
    return _SessionFactory
@contextmanager
 def session_scope() -> Iterator[Session]:
    """Provide a transactional scope: commits on success, rolls back on error."""
    factory = get_session_factory()
    session = factory()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
 def get_db() -> Iterator[Session]:
    """FastAPI dependency: yield a session and close it afterwards.

    No commit is issued here; committing is left to the code using the session.
    """
    # Leaving the ``with`` block closes the session.
    with get_session_factory()() as session:
        yield session
--- a/app/indexing/init.py
+++ b/app/indexing/init.py
--- a/app/indexing/embeddings.py
+++ b/app/indexing/embeddings.py
@@ -0,0 +1,90 @@
 """BGE-M3 dense embedder with batching and CPU/GPU support.
 We prefer FlagEmbedding's ``BGEM3FlagModel`` because it is the canonical
 implementation and supports dense + sparse output. We fall back to
 ``sentence-transformers`` for portability.
 """
 from __future__ import annotations
 from functools import lru_cache
 from typing import Sequence
 import numpy as np
 from app.config import settings
 from app.logging_config import get_logger
 logger = get_logger(__name__)
 class Embedder:
    """Dense text embedder with two interchangeable backends.

    ``_load`` tries FlagEmbedding's ``BGEM3FlagModel`` first and falls back to
    ``sentence-transformers`` when that fails; ``_impl`` records which one is
    active and ``encode`` dispatches on it.
    """
    def __init__(self, model_name: str, device: str, normalize: bool, batch_size: int) -> None:
        self.model_name, self.device = model_name, device
        self.normalize, self.batch_size = normalize, batch_size
        # Backend handles; _load() fills in the one that matches ``_impl``.
        self._impl = "flagembedding"
        self._model = None
        self._st_model = None
        self._load()
    def _load(self) -> None:
        """Instantiate the preferred backend, or the fallback if that raises."""
        try:
            from FlagEmbedding import BGEM3FlagModel  # type: ignore
            # fp16 is requested for every device except "cpu".
            self._model = BGEM3FlagModel(
                self.model_name, use_fp16=(self.device != "cpu"), devices=self.device
            )
            self._impl = "flagembedding"
            logger.info("embedder.loaded", impl="flagembedding", model=self.model_name, device=self.device)
            return
        except Exception as exc:  # noqa: BLE001
            logger.warning("embedder.flagembedding_failed", error=str(exc))
        from sentence_transformers import SentenceTransformer
        self._st_model = SentenceTransformer(self.model_name, device=self.device)
        self._impl = "sentence-transformers"
        logger.info("embedder.loaded", impl="sentence-transformers", model=self.model_name, device=self.device)
    def encode(self, texts: Sequence[str]) -> list[list[float]]:
        """Embed ``texts`` and return one float32-derived vector (as a list) per text.

        An empty input yields an empty list without touching the model.
        """
        if not texts:
            return []
        batch = list(texts)
        if self._impl != "flagembedding":
            # sentence-transformers normalises internally when asked to.
            vectors = self._st_model.encode(  # type: ignore[union-attr]
                batch,
                batch_size=self.batch_size,
                normalize_embeddings=self.normalize,
                convert_to_numpy=True,
                show_progress_bar=False,
            )
            return vectors.astype(np.float32).tolist()
        raw = self._model.encode(  # type: ignore[union-attr]
            batch,
            batch_size=self.batch_size,
            max_length=8192,
            return_dense=True,
            return_sparse=False,
            return_colbert_vecs=False,
        )
        # The result is either a dict carrying "dense_vecs" or the vectors themselves.
        if isinstance(raw, dict):
            raw = raw["dense_vecs"]
        vectors = np.asarray(raw, dtype=np.float32)
        if self.normalize:
            lengths = np.linalg.norm(vectors, axis=1, keepdims=True)
            lengths[lengths == 0] = 1.0  # leave all-zero rows untouched
            vectors = vectors / lengths
        return vectors.tolist()
    def encode_one(self, text: str) -> list[float]:
        """Embed a single text and return its vector."""
        vectors = self.encode([text])
        return vectors[0]
@lru_cache(maxsize=1)
 def get_embedder() -> Embedder:
    return Embedder(
        model_name=settings.embedding_model,
        device=settings.embedding_device,
        normalize=settings.embedding_normalize,
        batch_size=settings.embedding_batch_size,
    )
--- a/app/indexing/hybrid_search.py
+++ b/app/indexing/hybrid_search.py
@@ -0,0 +1,327 @@
 """Hybrid search: lexical (OpenSearch BM25) + semantic (Qdrant) + RRF + reranker.
 Always returns ``SearchResponse`` (never throws on missing index/collection -
 empty results are valid).
 """
 from __future__ import annotations
 import uuid
 from collections import defaultdict
 from dataclasses import dataclass
 from typing import Any
 from qdrant_client.http import models as qm
 from app.api.schemas import (
    Citation,
    SearchFilters,
    SearchHit,
    SearchMode,
    SearchRequest,
    SearchResponse,
 )
 from app.config import settings
 from app.indexing.embeddings import get_embedder
 from app.indexing.opensearch_client import get_opensearch
 from app.indexing.qdrant_client import DENSE_VECTOR_NAME, get_qdrant
 from app.indexing.reranker import get_reranker
 from app.logging_config import get_logger
 from app.utils.text_cleaning import normalize_for_search
 logger = get_logger(__name__)
@dataclass
 class _Candidate:
    """Internal search candidate shared by the lexical and semantic paths.

    Holds the chunk's identity, display fields and per-backend score/rank.
    Instances are mutated in place while merging, hydrating and reranking.
    """
    # Identity and location of the chunk (ids are kept as strings here).
    chunk_id: str
    document_id: str
    page_number: int
    block_type: str
    block_id: str | None
    # May be a preview when the candidate came from Qdrant only.
    text: str
    source_path: str
    original_file_name: str
    quality_flags: dict[str, Any]
    metadata: dict[str, Any]
    # Set by the lexical (OpenSearch) path; rank is 1-based.
    bm25_score: float | None = None
    bm25_rank: int | None = None
    # Set by the semantic (Qdrant) path; rank is 1-based.
    # run_search also overwrites dense_score with the reranker score.
    dense_score: float | None = None
    dense_rank: int | None = None
 def run_search(req: SearchRequest) -> SearchResponse:
    """Run one search request end to end and package the ranked hits.

    Candidates are retrieved from the backends selected by ``req.search_mode``
    and merged. The first ``settings.rerank_candidates`` are optionally rescored
    by the reranker, and the top ``req.limit`` become ``SearchHit`` objects. A
    lexical or semantic backend that raises is logged and treated as empty.

    NOTE(review): in hybrid mode without reranking the reported ``score`` is the
    BM25 score (0.0 for dense-only hits), not the fused rank score used for
    ordering - confirm this is intended.
    """
    mode: SearchMode = req.search_mode
    filters = req.filters
    lexical: list[_Candidate] = []
    semantic: list[_Candidate] = []
    if mode in ("lexical", "hybrid"):
        try:
            lexical = _lexical_search(req.query, filters, settings.hybrid_opensearch_top_k)
        except Exception as exc:  # noqa: BLE001
            logger.warning("search.lexical_failed", error=str(exc))
    if mode in ("semantic", "hybrid"):
        try:
            semantic = _semantic_search(req.query, filters, settings.hybrid_qdrant_top_k)
        except Exception as exc:  # noqa: BLE001
            logger.warning("search.semantic_failed", error=str(exc))
    merged = _merge(lexical, semantic, mode)
    shortlist = merged[: settings.rerank_candidates]
    reranker = get_reranker()
    reranked = False
    if settings.reranker_enabled and reranker.available and shortlist:
        rerank_scores = reranker.score(req.query, [cand.text for cand in shortlist])
        # The reranker score replaces the dense score and drives the final order.
        for cand, rerank_score in zip(shortlist, rerank_scores, strict=True):
            cand.dense_score = rerank_score
        shortlist.sort(key=lambda cand: cand.dense_score or 0.0, reverse=True)
        reranked = True

    def _reported_score(cand: _Candidate) -> Any:
        # Reranked: reranker score as-is; otherwise the mode's native score or 0.0.
        if reranked:
            return cand.dense_score
        if mode == "semantic":
            return cand.dense_score or 0.0
        return cand.bm25_score or 0.0

    def _citation_id(cand: _Candidate, key: str) -> str | None:
        value = cand.metadata.get(key)
        return None if value is None else str(value)

    hits: list[SearchHit] = [
        SearchHit(
            rank=position,
            score=float(_reported_score(cand)),
            document_id=uuid.UUID(cand.document_id),
            chunk_id=uuid.UUID(cand.chunk_id),
            original_file_name=cand.original_file_name,
            source_path=cand.source_path,
            page_number=cand.page_number,
            block_type=cand.block_type,
            text=cand.text,
            citation=Citation(
                pdf=cand.original_file_name,
                page=cand.page_number,
                block_id=cand.block_id,
                table_id=_citation_id(cand, "table_index"),
                figure_id=_citation_id(cand, "figure_index"),
            ),
            quality_flags=cand.quality_flags,
            metadata=cand.metadata,
        )
        for position, cand in enumerate(shortlist[: req.limit], start=1)
    ]
    return SearchResponse(
        query=req.query,
        mode=mode,
        total_candidates=len(merged),
        reranked=reranked,
        results=hits,
    )
 # ---------------- lexical ----------------
 def _lexical_search(query: str, filters: SearchFilters, top_k: int) -> list[_Candidate]:
    """Query the OpenSearch chunk index and return hits as candidates.

    Args:
        query: Raw user query; matched against ``text``, its ``.ru``/``.en``
            sub-fields and ``normalized_text``.
        filters: Optional restrictions, translated by ``_opensearch_filters``.
        top_k: Maximum number of hits requested.

    Returns:
        Candidates in OpenSearch order with ``bm25_score`` and a 1-based
        ``bm25_rank``. Empty when the index does not exist.

    Optional ``_source`` fields that are missing *or explicitly null* fall back
    to neutral defaults, so one odd document cannot fail the whole lexical leg
    (``int(None)``) or leak ``None`` into the ``str`` fields that later code
    calls ``len()`` on.
    """
    client = get_opensearch()
    index_name = settings.opensearch_index_chunks
    if not client.indices.exists(index=index_name):
        return []
    must: list[dict[str, Any]] = [
        {
            "multi_match": {
                "query": query,
                "fields": ["text^1.0", "text.ru^1.5", "text.en^1.5", "normalized_text^0.7"],
                "type": "best_fields",
                "operator": "or",
            }
        }
    ]
    # Add a lower-boost clause only when normalisation changed the query.
    norm = normalize_for_search(query)
    if norm and norm != query.lower():
        must.append({"match": {"normalized_text": {"query": norm, "boost": 0.5}}})
    body = {
        "size": top_k,
        "query": {"bool": {"must": must, "filter": _opensearch_filters(filters)}},
        "_source": [
            "chunk_id",
            "document_id",
            "source_path",
            "original_file_name",
            "page_number",
            "block_type",
            "block_id",
            "text",
            "quality_flags",
            "metadata",
        ],
    }
    res = client.search(index=index_name, body=body, request_timeout=30)
    out: list[_Candidate] = []
    for rank, hit in enumerate(res.get("hits", {}).get("hits", []), start=1):
        s = hit.get("_source") or {}
        out.append(
            _Candidate(
                # The two ids are mandatory: a hit without them is a real error.
                chunk_id=s["chunk_id"],
                document_id=s["document_id"],
                page_number=int(s.get("page_number") or 0),
                block_type=s.get("block_type") or "paragraph",
                block_id=s.get("block_id"),
                text=s.get("text") or "",
                source_path=s.get("source_path") or "",
                original_file_name=s.get("original_file_name") or "",
                quality_flags=s.get("quality_flags") or {},
                metadata=s.get("metadata") or {},
                bm25_score=float(hit.get("_score") or 0.0),
                bm25_rank=rank,
            )
        )
    return out
 def _opensearch_filters(filters: SearchFilters) -> list[dict[str, Any]]:
    clauses: list[dict[str, Any]] = []
    if filters.document_id:
        clauses.append({"term": {"document_id": str(filters.document_id)}})
    if filters.source_path:
        clauses.append({"term": {"source_path": filters.source_path}})
    if filters.block_type:
        clauses.append({"term": {"block_type": filters.block_type}})
    if filters.min_ocr_confidence is not None:
        clauses.append({"range": {"ocr_confidence": {"gte": filters.min_ocr_confidence}}})
    return clauses
 # ---------------- semantic ----------------
 def _semantic_search(query: str, filters: SearchFilters, top_k: int) -> list[_Candidate]:
    """Embed the query, search the Qdrant chunk collection and return candidates.

    Args:
        query: Raw user query; embedded with the shared embedder.
        filters: Optional restrictions, translated by ``_qdrant_filter``.
        top_k: Maximum number of points requested.

    Returns:
        Candidates in Qdrant order with ``dense_score`` and a 1-based
        ``dense_rank``. ``text`` is the payload's ``text_preview``.

    Payload values that are missing *or explicitly null* fall back to neutral
    defaults, so ``None`` never lands in the ``str`` fields that later code
    calls ``len()`` on.
    """
    vector = get_embedder().encode_one(query)
    qf = _qdrant_filter(filters)
    client = get_qdrant()
    try:
        results = client.query_points(
            collection_name=settings.qdrant_collection_chunks,
            query=vector,
            using=DENSE_VECTOR_NAME,
            limit=top_k,
            with_payload=True,
            query_filter=qf,
        ).points
    except Exception as exc:  # noqa: BLE001
        # Any failure of query_points is retried once through client.search.
        logger.debug("qdrant.query_points_fallback", error=str(exc))
        results = client.search(
            collection_name=settings.qdrant_collection_chunks,
            query_vector=(DENSE_VECTOR_NAME, vector),
            query_filter=qf,
            limit=top_k,
            with_payload=True,
        )
    out: list[_Candidate] = []
    for rank, p in enumerate(results, start=1):
        payload = p.payload or {}
        # Fall back to the point id when the payload carries no chunk_id.
        chunk_id = payload.get("chunk_id") or str(p.id)
        out.append(
            _Candidate(
                chunk_id=str(chunk_id),
                document_id=str(payload.get("document_id", "")),
                page_number=int(payload.get("page_number") or 0),
                block_type=payload.get("block_type") or "paragraph",
                block_id=payload.get("block_id"),
                text=payload.get("text_preview") or "",
                source_path=payload.get("source_path") or "",
                original_file_name=payload.get("original_file_name") or "",
                quality_flags=payload.get("quality_flags") or {},
                metadata=payload.get("metadata") or {},
                dense_score=float(p.score or 0.0),
                dense_rank=rank,
            )
        )
    return out
 def _qdrant_filter(filters: SearchFilters) -> qm.Filter | None:
    must: list[qm.FieldCondition | qm.Range] = []
    if filters.document_id:
        must.append(qm.FieldCondition(key="document_id", match=qm.MatchValue(value=str(filters.document_id))))
    if filters.source_path:
        must.append(qm.FieldCondition(key="source_path", match=qm.MatchValue(value=filters.source_path)))
    if filters.block_type:
        must.append(qm.FieldCondition(key="block_type", match=qm.MatchValue(value=filters.block_type)))
    if filters.min_ocr_confidence is not None:
        must.append(qm.FieldCondition(key="ocr_confidence", range=qm.Range(gte=filters.min_ocr_confidence)))
    if not must:
        return None
    return qm.Filter(must=must)
 # ---------------- merge ----------------
 def _merge(lexical: list[_Candidate], semantic: list[_Candidate], mode: SearchMode) -> list[_Candidate]:
    if mode == "lexical":
        return lexical
    if mode == "semantic":
        return _hydrate_semantic_text(semantic)
    by_id: dict[str, _Candidate] = {}
    for c in lexical:
        by_id[c.chunk_id] = c
    for c in semantic:
        if c.chunk_id in by_id:
            by_id[c.chunk_id].dense_score = c.dense_score
            by_id[c.chunk_id].dense_rank = c.dense_rank
            if not by_id[c.chunk_id].text:
                by_id[c.chunk_id].text = c.text
        else:
            by_id[c.chunk_id] = c
    rrf: dict[str, float] = defaultdict(float)
    k = settings.hybrid_rrf_k
    for c in lexical:
        if c.bm25_rank is not None:
            rrf[c.chunk_id] += 1.0 / (k + c.bm25_rank)
    for c in semantic:
        if c.dense_rank is not None:
            rrf[c.chunk_id] += 1.0 / (k + c.dense_rank)
    items = sorted(by_id.values(), key=lambda c: rrf.get(c.chunk_id, 0.0), reverse=True)
    return _hydrate_full_text(items)
 def _hydrate_full_text(candidates: list[_Candidate]) -> list[_Candidate]:
    """For candidates whose text came only from Qdrant payload (preview), pull
    the full chunk text from OpenSearch by id so the reranker sees full content.
    """
    missing = [c for c in candidates if len(c.text) <= 512]
    if not missing:
        return candidates
    client = get_opensearch()
    ids = [c.chunk_id for c in missing]
    try:
        res = client.mget(index=settings.opensearch_index_chunks, body={"ids": ids})
    except Exception:
        return candidates
    by_id = {d["_id"]: d.get("_source", {}) for d in res.get("docs", []) if d.get("found")}
    for c in missing:
        s = by_id.get(c.chunk_id)
        if s and s.get("text"):
            c.text = s["text"]
            if not c.original_file_name:
                c.original_file_name = s.get("original_file_name", "")
            if not c.source_path:
                c.source_path = s.get("source_path", "")
            if not c.metadata:
                c.metadata = s.get("metadata") or {}
            if not c.quality_flags:
                c.quality_flags = s.get("quality_flags") or {}
    return candidates
 def _hydrate_semantic_text(candidates: list[_Candidate]) -> list[_Candidate]:
    """Alias for ``_hydrate_full_text``, used by ``_merge`` in semantic mode."""
    return _hydrate_full_text(candidates)
--- a/app/indexing/opensearch_client.py
+++ b/app/indexing/opensearch_client.py
@@ -0,0 +1,142 @@
 """OpenSearch client + index bootstrap + chunk indexing helpers."""
 from __future__ import annotations
 from functools import lru_cache
 from typing import Any, Iterable
 from opensearchpy import OpenSearch, RequestsHttpConnection
 from opensearchpy.helpers import bulk
 from app.config import settings
 from app.logging_config import get_logger
 logger = get_logger(__name__)
 # Index settings: three custom analyzers (ru_analyzer, en_analyzer, code_analyzer).
 # ``text`` is analysed with code_analyzer and also indexed as the sub-fields
 # ``text.ru`` and ``text.en`` so each language can be boosted at query time.
 # (The mapping below defines no ``text.raw`` sub-field.)
 INDEX_SETTINGS: dict[str, Any] = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": {
            "filter": {
                "ru_stop": {"type": "stop", "stopwords": "_russian_"},
                "ru_stemmer": {"type": "stemmer", "language": "russian"},
                "en_stop": {"type": "stop", "stopwords": "_english_"},
                "en_stemmer": {"type": "stemmer", "language": "english"},
            },
            "analyzer": {
                "ru_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "ru_stop", "ru_stemmer"],
                },
                "en_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "en_stop", "en_stemmer"],
                },
                # Standard tokenizer + lowercase only: no stop words, no stemming.
                "code_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase"],
                },
            },
        },
    },
    "mappings": {
        "dynamic": "strict",
        "properties": {
            "chunk_id": {"type": "keyword"},
            "document_id": {"type": "keyword"},
            "source_path": {"type": "keyword"},
            "original_file_name": {
                "type": "text",
                "fields": {"keyword": {"type": "keyword", "ignore_above": 512}},
            },
            "page_number": {"type": "integer"},
            "block_type": {"type": "keyword"},
            "block_id": {"type": "keyword"},
            # Base field plus per-language sub-fields (text.ru, text.en).
            "text": {
                "type": "text",
                "analyzer": "code_analyzer",
                "fields": {
                    "ru": {"type": "text", "analyzer": "ru_analyzer"},
                    "en": {"type": "text", "analyzer": "en_analyzer"},
                },
            },
            "normalized_text": {
                "type": "text",
                "analyzer": "code_analyzer",
            },
            "ocr_confidence": {"type": "float"},
            "language_hint": {"type": "keyword"},
            "metadata": {"type": "object", "enabled": True},
            "quality_flags": {"type": "object", "enabled": True},
            "created_at": {"type": "date"},
        },
    },
 }
@lru_cache(maxsize=1)
 def get_opensearch() -> OpenSearch:
    auth = None
    if settings.opensearch_user and settings.opensearch_password:
        auth = (settings.opensearch_user, settings.opensearch_password)
    return OpenSearch(
        hosts=[{"host": settings.opensearch_host, "port": settings.opensearch_port}],
        http_auth=auth,
        use_ssl=settings.opensearch_use_ssl,
        verify_certs=settings.opensearch_verify_certs,
        ssl_show_warn=False,
        connection_class=RequestsHttpConnection,
        timeout=30,
        max_retries=3,
        retry_on_timeout=True,
    )
 def ensure_index(index: str | None = None) -> None:
    """Create the chunk index with ``INDEX_SETTINGS`` unless it already exists.

    ``index`` defaults to ``settings.opensearch_index_chunks``.
    """
    name = index or settings.opensearch_index_chunks
    client = get_opensearch()
    if not client.indices.exists(index=name):
        logger.info("opensearch.index.create", index=name)
        client.indices.create(index=name, body=INDEX_SETTINGS)
        return
    logger.debug("opensearch.index.exists", index=name)
 def index_chunks(docs: Iterable[dict[str, Any]], index: str | None = None) -> tuple[int, int]:
    """Bulk-index chunk documents, keyed by their ``chunk_id``.

    Returns ``(success, errors)``: the number of indexed documents and the
    number of per-document failures. Nothing is sent when ``docs`` is empty.
    """
    name = index or settings.opensearch_index_chunks
    # "index" with an explicit _id overwrites an existing document with that id.
    actions: list[dict[str, Any]] = [
        {"_op_type": "index", "_index": name, "_id": doc["chunk_id"], "_source": doc}
        for doc in docs
    ]
    if not actions:
        return 0, 0
    success, errors = bulk(get_opensearch(), actions, raise_on_error=False, request_timeout=120)
    if errors:
        logger.warning("opensearch.bulk.errors", count=len(errors))
    error_count = len(errors) if isinstance(errors, list) else 0
    return success, error_count
 def delete_by_document(document_id: str, index: str | None = None) -> int:
    """Delete every indexed chunk of ``document_id`` and return how many were removed.

    Returns 0 without issuing a delete when the index does not exist.
    """
    name = index or settings.opensearch_index_chunks
    client = get_opensearch()
    if client.indices.exists(index=name):
        by_document = {"query": {"term": {"document_id": document_id}}}
        # refresh=True is passed so the deletion is refreshed as part of the call.
        res = client.delete_by_query(index=name, body=by_document, refresh=True)
        return int(res.get("deleted", 0))
    return 0
--- a/app/indexing/qdrant_client.py
+++ b/app/indexing/qdrant_client.py
@@ -0,0 +1,103 @@
 """Qdrant client + collection bootstrap + chunk upsert."""
 from __future__ import annotations
 from functools import lru_cache
 from typing import Any, Sequence
 from qdrant_client import QdrantClient
 from qdrant_client.http import models as qm
 from app.config import settings
 from app.logging_config import get_logger
 logger = get_logger(__name__)
 DENSE_VECTOR_NAME = "dense"
@lru_cache(maxsize=1)
 def get_qdrant() -> QdrantClient:
    """Build the Qdrant client from ``settings``.

    The result is memoized by ``lru_cache(maxsize=1)``, so later calls return
    the client created by the first call.
    """
    return QdrantClient(
        host=settings.qdrant_host,
        port=settings.qdrant_port,
        # An empty key is passed as None.
        api_key=settings.qdrant_api_key or None,
        timeout=60,
    )
 def ensure_collection(collection: str | None = None, dim: int | None = None) -> None:
    """Create the chunk collection and its payload indexes if it is missing.

    ``collection`` and ``dim`` fall back to ``settings.qdrant_collection_chunks``
    and ``settings.embedding_dim``. The collection gets one named vector
    (``DENSE_VECTOR_NAME``) using cosine distance. Nothing is changed when a
    collection with that name already exists.
    """
    target = collection or settings.qdrant_collection_chunks
    size = dim or settings.embedding_dim
    client = get_qdrant()
    known_names = [c.name for c in client.get_collections().collections]
    if target in known_names:
        logger.debug("qdrant.collection.exists", collection=target)
        return

    logger.info("qdrant.collection.create", collection=target, dim=size)
    dense_params = qm.VectorParams(size=size, distance=qm.Distance.COSINE)
    client.create_collection(
        collection_name=target,
        vectors_config={DENSE_VECTOR_NAME: dense_params},
        optimizers_config=qm.OptimizersConfigDiff(default_segment_number=2),
    )

    # Payload indexes for filtering, created in this order.
    payload_indexes = (
        ("document_id", qm.PayloadSchemaType.KEYWORD),
        ("source_path", qm.PayloadSchemaType.KEYWORD),
        ("block_type", qm.PayloadSchemaType.KEYWORD),
        ("page_number", qm.PayloadSchemaType.INTEGER),
        ("ocr_confidence", qm.PayloadSchemaType.FLOAT),
    )
    for field_name, schema in payload_indexes:
        client.create_payload_index(
            collection_name=target,
            field_name=field_name,
            field_schema=schema,
        )
 def upsert_chunks(
    points: Sequence[tuple[str, list[float], dict[str, Any]]],
    collection: str | None = None,
 ) -> int:
    """Upsert ``(chunk_id, vector, payload)`` triples; returns how many were sent.

    Each payload is copied and gets ``chunk_id`` added (overriding any existing
    key of that name). The upsert request is issued with ``wait=False``.
    """
    target = collection or settings.qdrant_collection_chunks
    if not points:
        return 0
    structs = []
    for chunk_id, vector, payload in points:
        enriched = dict(payload)
        enriched["chunk_id"] = chunk_id
        structs.append(
            qm.PointStruct(
                id=_qid(chunk_id),
                vector={DENSE_VECTOR_NAME: vector},
                payload=enriched,
            )
        )
    get_qdrant().upsert(collection_name=target, points=structs, wait=False)
    return len(structs)
 def delete_by_document(document_id: str, collection: str | None = None) -> int:
    """Delete every point whose ``document_id`` payload equals the given id.

    Always returns 1: the number of removed points is not reported here.
    """
    target = collection or settings.qdrant_collection_chunks
    same_document = qm.FieldCondition(key="document_id", match=qm.MatchValue(value=document_id))
    selector = qm.FilterSelector(filter=qm.Filter(must=[same_document]))
    get_qdrant().delete(collection_name=target, points_selector=selector)
    return 1
 def _qid(chunk_id: str) -> str:
    """Qdrant accepts UUID strings or unsigned ints. Chunks are UUIDs already."""
    # Identity mapping: the chunk id is used as the Qdrant point id unchanged.
    return chunk_id
--- a/app/indexing/reranker.py
+++ b/app/indexing/reranker.py
@@ -0,0 +1,75 @@
 """BGE reranker - cross-encoder style scoring of (query, passage) pairs.
 Designed to degrade gracefully:
 - If the model fails to load, ``rerank`` returns inputs unchanged with the
  ``reranked`` flag set to False so the API can report the truth to clients.
 """
 from __future__ import annotations
 from functools import lru_cache
 from typing import Sequence
 from app.config import settings
 from app.logging_config import get_logger
 logger = get_logger(__name__)
 class Reranker:
    """Scores (query, passage) pairs with a cross-encoder style model.

    Loading tries FlagEmbedding first and then sentence-transformers. If both
    fail the instance is still usable: ``available`` is False and ``score``
    returns zeros.
    """

    def __init__(self, model_name: str, device: str, batch_size: int) -> None:
        self.model_name = model_name
        self.device = device
        self.batch_size = batch_size
        self._impl: str | None = None
        self._model = None
        self._load()

    def _load(self) -> None:
        """Populate ``_model`` / ``_impl`` from the first backend that loads."""
        if not self._load_flagembedding():
            self._load_cross_encoder()

    def _load_flagembedding(self) -> bool:
        """Try the FlagEmbedding backend; return True on success."""
        try:
            from FlagEmbedding import FlagReranker  # type: ignore

            # fp16 is requested for every device except "cpu".
            half_precision = self.device != "cpu"
            self._model = FlagReranker(self.model_name, use_fp16=half_precision, devices=self.device)
            self._impl = "flagembedding"
            logger.info("reranker.loaded", impl="flagembedding", model=self.model_name, device=self.device)
        except Exception as exc:  # noqa: BLE001
            logger.warning("reranker.flagembedding_failed", error=str(exc))
            return False
        return True

    def _load_cross_encoder(self) -> None:
        """Try the sentence-transformers backend; disable reranking on failure."""
        try:
            from sentence_transformers import CrossEncoder

            self._model = CrossEncoder(self.model_name, device=self.device)
            self._impl = "sentence-transformers"
            logger.info("reranker.loaded", impl="sentence-transformers", model=self.model_name)
        except Exception as exc:  # noqa: BLE001
            logger.error("reranker.disabled", error=str(exc))
            self._impl = None
            self._model = None

    @property
    def available(self) -> bool:
        """True when a backend was loaded."""
        return not (self._impl is None or self._model is None)

    def score(self, query: str, passages: Sequence[str]) -> list[float]:
        """Return one float score per passage, in input order.

        Returns zeros when no model is available or ``passages`` is empty.
        """
        if not (self.available and passages):
            return [0.0] * len(passages)
        pairs = [(query, passage) for passage in passages]
        if self._impl != "flagembedding":
            raw = self._model.predict(pairs, batch_size=self.batch_size)  # type: ignore[union-attr]
        else:
            raw = self._model.compute_score(pairs, batch_size=self.batch_size, normalize=True)  # type: ignore[union-attr]
        if isinstance(raw, list):
            values = raw
        else:
            # Backends may return an iterable (converted) or a bare scalar.
            try:
                values = list(raw)
            except TypeError:
                values = [float(raw)]
        return [float(v) for v in values]
@lru_cache(maxsize=1)
 def get_reranker() -> Reranker:
    """Build the ``Reranker`` from ``settings``.

    Memoized by ``lru_cache(maxsize=1)``: the model load triggered by the
    ``Reranker`` constructor happens on the first call only.
    """
    return Reranker(
        model_name=settings.reranker_model,
        device=settings.reranker_device,
        batch_size=settings.reranker_batch_size,
    )
--- a/app/ingestion/init.py
+++ b/app/ingestion/init.py
--- a/app/ingestion/chunker.py
+++ b/app/ingestion/chunker.py
@@ -0,0 +1,317 @@
 """Structure-aware chunking.
 Rules (per spec):
 - Chunk by document structure first, fixed-size second.
 - Hierarchy: title > heading > paragraph > list > table > figure caption.
 - Target 500-900 tokens (configurable).
 - Overlap 80-120 tokens for long narrative text only.
 - Never split tables - one table = one chunk (or one chunk per row group if huge).
 - Every chunk carries citation metadata.
 We use a deliberately simple ``len(text.split())`` token estimator. The downstream
 embedding model has its own tokenizer; this estimator is only a budget proxy.
 """
 from __future__ import annotations
 from dataclasses import dataclass, field
 from typing import Any
 from app.config import settings
 from app.ingestion.docling_extractor import (
    ExtractedBlock,
    ExtractedFigure,
    ExtractedTable,
    ExtractionResult,
 )
 from app.ingestion.normalizer import normalize_block
 from app.ingestion.quality import compute_quality_flags
@dataclass
 class ChunkRecord:
    """One chunk produced by ``chunk_extraction``, with its citation metadata."""
    chunk_index: int  # 0-based position in the list returned by chunk_extraction
    page_number: int
    block_type: str  # e.g. "table", "figure_caption", "figure_description", "heading", "paragraph"
    text: str  # display text returned by normalize_block
    normalized_text: str  # search-normalized text returned by normalize_block
    token_count: int  # _estimate_tokens(text): whitespace word count, at least 1
    block_id: str | None = None
    quality_flags: dict[str, Any] = field(default_factory=dict)  # result of compute_quality_flags
    metadata: dict[str, Any] = field(default_factory=dict)  # e.g. table_index/summary, figure_index, section_heading
 def _estimate_tokens(text: str) -> int:
    return max(1, len(text.split()))
 def chunk_extraction(
    extraction: ExtractionResult,
    *,
    document_ocr_confidence: float | None = None,
 ) -> list[ChunkRecord]:
    """Turn an extraction result into an ordered list of ``ChunkRecord`` objects.

    Emission order: one chunk per non-empty table, one chunk per figure, then
    narrative blocks page by page (grouped by ``_group_by_section`` and packed
    by ``_pack_group``). ``chunk_index`` is the 0-based position in the
    returned list.

    ``document_ocr_confidence`` is forwarded unchanged to
    ``compute_quality_flags`` for every chunk.

    No overlap chunks are emitted: adjacent groups already carry the context
    and duplicated tails would bloat the index, so
    ``settings.chunk_overlap_tokens`` is deliberately not consulted here.
    """
    target = settings.chunk_target_tokens
    minimum = settings.chunk_min_tokens
    maximum = settings.chunk_max_tokens
    chunks: list[ChunkRecord] = []

    def emit(
        *,
        page_number: int,
        block_type: str,
        text: str,
        block_id: str | None,
        metadata: dict[str, Any],
    ) -> None:
        # Shared tail of every branch: normalize, compute quality flags, append.
        display, norm = normalize_block(text)
        flags = compute_quality_flags(
            text=display,
            block_type=block_type,
            ocr_confidence=document_ocr_confidence,
        )
        chunks.append(
            ChunkRecord(
                chunk_index=len(chunks),
                page_number=page_number,
                block_type=block_type,
                text=display,
                normalized_text=norm,
                token_count=_estimate_tokens(display),
                block_id=block_id,
                quality_flags=flags,
                metadata=metadata,
            )
        )

    # 1) Tables first - one chunk per table, never split.
    for t in extraction.tables:
        body = (t.markdown or "").strip()
        if not body:
            continue
        summary = _summarize_table(t)
        emit(
            page_number=t.page_number,
            block_type="table",
            text=f"{summary}\n\n{body}" if summary else body,
            block_id=t.block_id or f"table:{t.table_index}",
            metadata={"table_index": t.table_index, "summary": summary or ""},
        )

    # 2) Figures - caption + placeholder description.
    for f in extraction.figures:
        text_parts: list[str] = []
        if f.caption:
            text_parts.append(f"Caption: {f.caption}")
        text_parts.append(f"Figure detected on page {f.page_number}.")
        emit(
            page_number=f.page_number,
            block_type="figure_caption" if f.caption else "figure_description",
            text="\n".join(text_parts),
            block_id=f.block_id or f"figure:{f.figure_index}",
            metadata={"figure_index": f.figure_index},
        )

    # 3) Narrative blocks grouped per page, packed by structure.
    by_page: dict[int, list[ExtractedBlock]] = {}
    for b in extraction.blocks:
        by_page.setdefault(b.page_number, []).append(b)
    for page_no in sorted(by_page):
        for group in _group_by_section(by_page[page_no]):
            for piece in _pack_group(group, target=target, maximum=maximum, minimum=minimum):
                emit(
                    page_number=page_no,
                    block_type=piece["block_type"],
                    text=piece["text"],
                    block_id=piece.get("block_id"),
                    metadata={"section_heading": piece.get("section") or ""},
                )
    return chunks
 # ---------------- Helpers ----------------
 def _group_by_section(blocks: list[ExtractedBlock]) -> list[list[ExtractedBlock]]:
    groups: list[list[ExtractedBlock]] = []
    current: list[ExtractedBlock] = []
    for b in blocks:
        if b.block_type in ("title", "heading") and current:
            groups.append(current)
            current = [b]
        else:
            current.append(b)
    if current:
        groups.append(current)
    return groups
 def _pack_group(
    group: list[ExtractedBlock], *, target: int, maximum: int, minimum: int
 ) -> list[dict[str, Any]]:
    """Pack a section's blocks into chunks at most ``maximum`` tokens.
    Headings / titles attach to the next chunk as a section anchor.
    """
    if not group:
        return []
    section_heading = ""
    body_blocks: list[ExtractedBlock] = []
    for b in group:
        if b.block_type in ("title", "heading"):
            section_heading = (section_heading + " > " + b.text).strip(" >") if section_heading else b.text
        else:
            body_blocks.append(b)
    if not body_blocks:
        # Heading-only group: emit as a single ``heading`` chunk so the title is searchable.
        text = section_heading or group[0].text
        return [
            {
                "text": text,
                "block_type": "heading",
                "block_id": group[0].block_id,
                "section": section_heading,
            }
        ]
    out: list[dict[str, Any]] = []
    buffer: list[str] = []
    buffer_block_ids: list[str] = []
    buffer_block_type = "paragraph"
    buffer_tokens = 0
    def flush():
        nonlocal buffer, buffer_block_ids, buffer_block_type, buffer_tokens
        if not buffer:
            return
        text = "\n\n".join(buffer).strip()
        if not text:
            buffer = []
            buffer_block_ids = []
            buffer_tokens = 0
            return
        # Prepend section heading for context (kept short).
        if section_heading and len(section_heading) < 200:
            text = f"# {section_heading}\n\n{text}"
        out.append(
            {
                "text": text,
                "block_type": buffer_block_type,
                "block_id": buffer_block_ids[0] if buffer_block_ids else None,
                "section": section_heading,
            }
        )
        buffer = []
        buffer_block_ids = []
        buffer_tokens = 0
    for b in body_blocks:
        tokens = _estimate_tokens(b.text)
        if tokens >= maximum:
            # Hard split a giant block into sub-chunks of ~target tokens.
            flush()
            for sub in _split_long_text(b.text, target=target, maximum=maximum):
                out.append(
                    {
                        "text": sub,
                        "block_type": b.block_type if b.block_type != "list" else "list",
                        "block_id": b.block_id,
                        "section": section_heading,
                    }
                )
            continue
        if buffer_tokens + tokens > maximum and buffer_tokens >= minimum:
            flush()
        if not buffer:
            buffer_block_type = b.block_type if b.block_type != "list" else "list"
        buffer.append(b.text)
        if b.block_id:
            buffer_block_ids.append(b.block_id)
        buffer_tokens += tokens
        if buffer_tokens >= target:
            flush()
    flush()
    return out
 def _split_long_text(text: str, *, target: int, maximum: int) -> list[str]:
    words = text.split()
    if not words:
        return []
    pieces: list[str] = []
    step = target
    if step <= 0:
        step = 500
    i = 0
    while i < len(words):
        end = min(len(words), i + maximum)
        # Aim for ``target`` words but extend up to ``maximum`` to reach a sentence boundary.
        piece = " ".join(words[i : i + step])
        pieces.append(piece)
        i += step
        if end - i < target // 4 and end - i > 0:
            pieces[-1] = " ".join(words[i - step : end])
            break
    return pieces
 def _tail_tokens(text: str, n: int) -> str:
    words = text.split()
    if len(words) <= n:
        return text
    return " ".join(words[-n:])
 def _summarize_table(t: ExtractedTable) -> str:
    """Heuristic one-line summary for index recall."""
    md = t.markdown or ""
    first = next((line for line in md.splitlines() if line.startswith("|")), "")
    header_cells = [c.strip() for c in first.strip("|").split("|") if c.strip()]
    n_cols = len(header_cells)
    n_rows = max(0, sum(1 for ln in md.splitlines() if ln.startswith("|")) - 2)
    header_preview = ", ".join(header_cells[:6])
    return (
        f"Table on page {t.page_number}: {n_rows} rows x {n_cols} cols. "
        f"Columns: {header_preview}." if header_cells else
        f"Table on page {t.page_number}."
    )
--- a/app/ingestion/docling_extractor.py
+++ b/app/ingestion/docling_extractor.py
@@ -0,0 +1,384 @@
 """Docling structured extraction.
 Docling produces a hierarchical document model with reading order, layout, tables
 and figures. We export both Markdown and a JSON representation, then walk the
 JSON to emit normalized blocks (title, heading, paragraph, list, table caption,
 figure caption) for downstream chunking.
 The extractor is intentionally defensive: Docling's exact Python API has
 shifted across releases. We probe for the safest exporter methods and fall
 back to ``str(document)`` only as a last resort.
 """
 from __future__ import annotations
 import json
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
 from app.config import settings
 from app.logging_config import get_logger
 logger = get_logger(__name__)
@dataclass
 class ExtractedBlock:
    """One text item from the Docling payload, as produced by ``_walk_blocks``."""
    page_number: int  # resolved by _page_of; 1 when the item carries no page info
    block_type: str  # value from _DOCLING_LABEL_TO_BLOCK; "paragraph" for unknown labels
    text: str  # stripped item text; items with empty text are not emitted
    block_id: str | None = None  # the item's "self_ref", else its "id"
    extra: dict[str, Any] = field(default_factory=dict)  # _walk_blocks stores {"label": <lower-cased label>}
@dataclass
 class ExtractedTable:
    """One table from the Docling payload, as produced by ``_walk_tables``."""
    page_number: int  # resolved by _page_of; 1 when the table carries no page info
    table_index: int  # position of the table in payload["tables"]
    markdown: str  # result of _table_markdown; "" when no table data could be rendered
    csv_text: str | None = None  # result of _table_csv; None when no grid is available
    json_data: dict[str, Any] | None = None  # the raw table dict from the payload
    block_id: str | None = None  # the table's "self_ref", else its "id"
@dataclass
 class ExtractedFigure:
    """One figure from the Docling payload, as produced by ``_walk_figures``."""
    page_number: int  # resolved by _page_of; 1 when the figure carries no page info
    figure_index: int  # position in payload["pictures"] (or payload["figures"])
    caption: str | None  # stripped caption, or None when missing/empty
    block_id: str | None = None  # the figure's "self_ref", else its "id"
    image_bytes: bytes | None = None  # not populated by _walk_figures
    image_ext: str = "png"
@dataclass
 class ExtractedPage:
    """Per-page rollup built by ``_walk_pages``."""
    page_number: int
    text: str  # the page's block texts joined with blank lines
    has_tables: bool = False  # True when any extracted table is on this page
    has_figures: bool = False  # True when any extracted figure is on this page
    has_handwriting: bool = False  # not set by _walk_pages (keeps the default)
    ocr_confidence: float | None = None  # not set by _walk_pages (keeps the default)
@dataclass
 class ExtractionResult:
    """Everything ``extract`` returns for one PDF."""
    markdown: str  # whole-document export from _safe_export_markdown
    json_payload: dict[str, Any]  # whole-document dict from _safe_export_dict ({} if export failed)
    blocks: list[ExtractedBlock]  # text items, in payload order
    tables: list[ExtractedTable]
    figures: list[ExtractedFigure]
    pages: list[ExtractedPage]  # sorted by page number
 def extract(pdf_path: Path) -> ExtractionResult:
    """Convert ``pdf_path`` with Docling and flatten the result for chunking.

    Docling is imported lazily inside the function. Table structure detection
    is always on; Docling's OCR follows ``settings.docling_ocr_enabled``.
    Options that may be missing on some Docling versions are set best-effort.
    """
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    options = PdfPipelineOptions()
    # We let OCRmyPDF do the heavy OCR; Docling OCR is opt-in.
    options.do_ocr = settings.docling_ocr_enabled
    options.do_table_structure = True
    try:
        options.table_structure_options.do_cell_matching = True
    except Exception:  # noqa: BLE001 - older docling versions lack this
        pass
    try:
        options.generate_page_images = True
    except Exception:  # noqa: BLE001
        pass

    source = str(pdf_path)
    pdf_option = PdfFormatOption(pipeline_options=options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_option})
    logger.info("docling.start", input=source)
    document = converter.convert(source).document

    exported_markdown = _safe_export_markdown(document)
    payload = _safe_export_dict(document)
    found_blocks = _walk_blocks(payload)
    found_tables = _walk_tables(document, payload)
    found_figures = _walk_figures(document, payload)
    found_pages = _walk_pages(payload, found_blocks, found_tables, found_figures)

    logger.info(
        "docling.done",
        pages=len(found_pages),
        blocks=len(found_blocks),
        tables=len(found_tables),
        figures=len(found_figures),
    )
    return ExtractionResult(
        markdown=exported_markdown,
        json_payload=payload,
        blocks=found_blocks,
        tables=found_tables,
        figures=found_figures,
        pages=found_pages,
    )
 # ---------------- Internal helpers ----------------
 def _safe_export_markdown(doc: Any) -> str:
    for attr in ("export_to_markdown", "to_markdown"):
        fn = getattr(doc, attr, None)
        if callable(fn):
            try:
                return fn()
            except Exception:  # noqa: BLE001
                continue
    return str(doc)
 def _safe_export_dict(doc: Any) -> dict[str, Any]:
    for attr in ("export_to_dict", "model_dump", "dict"):
        fn = getattr(doc, attr, None)
        if callable(fn):
            try:
                data = fn()
                if isinstance(data, dict):
                    return data
            except Exception:  # noqa: BLE001
                continue
    # Last resort: serialize via JSON round-trip
    try:
        return json.loads(getattr(doc, "model_dump_json", lambda: "{}")())
    except Exception:  # noqa: BLE001
        return {}
 # Maps Docling item labels (lower-cased by _walk_blocks before lookup) to this
 # pipeline's block types. Both underscore and hyphen spellings are listed.
 # Labels missing from this table are treated as "paragraph" by _walk_blocks.
 _DOCLING_LABEL_TO_BLOCK = {
    "title": "title",
    "section_header": "heading",
    "section-header": "heading",
    "subtitle": "heading",
    "page_header": "heading",
    "header": "heading",
    "list_item": "list",
    "list-item": "list",
    "list": "list",
    "paragraph": "paragraph",
    "text": "paragraph",
    "caption": "figure_caption",
    "figure": "figure_caption",
    "table": "table",
    "footnote": "paragraph",
 }
 def _walk_blocks(payload: dict[str, Any]) -> list[ExtractedBlock]:
    """Flatten the payload's text items into ``ExtractedBlock`` objects, in payload order.

    Items are read from ``texts``, else ``text_items``, else
    ``body.text_items``. Non-dict items and items with empty text are skipped.
    """
    raw_items = payload.get("texts") or payload.get("text_items")
    if not raw_items:
        raw_items = payload.get("body", {}).get("text_items", []) or []
    if not isinstance(raw_items, list):
        return []

    collected: list[ExtractedBlock] = []
    for entry in raw_items:
        if not isinstance(entry, dict):
            continue
        label = (entry.get("label") or entry.get("category") or "paragraph").lower()
        content = (entry.get("text") or "").strip()
        if content:
            collected.append(
                ExtractedBlock(
                    page_number=_page_of(entry),
                    block_type=_DOCLING_LABEL_TO_BLOCK.get(label, "paragraph"),
                    text=content,
                    block_id=entry.get("self_ref") or entry.get("id"),
                    extra={"label": label},
                )
            )
    return collected
 def _walk_tables(doc: Any, payload: dict[str, Any]) -> list[ExtractedTable]:
    """Build one ``ExtractedTable`` per dict entry of ``payload["tables"]``.

    ``table_index`` is the entry's position in the raw list, so skipped
    non-dict entries leave gaps in the numbering.
    """
    return [
        ExtractedTable(
            page_number=_page_of(entry),
            table_index=position,
            markdown=_table_markdown(doc, entry, position),
            csv_text=_table_csv(entry),
            json_data=entry,
            block_id=entry.get("self_ref") or entry.get("id"),
        )
        for position, entry in enumerate(payload.get("tables") or [])
        if isinstance(entry, dict)
    ]
 def _walk_figures(doc: Any, payload: dict[str, Any]) -> list[ExtractedFigure]:
    """Build one ``ExtractedFigure`` per dict entry of ``pictures`` (else ``figures``).

    ``figure_index`` is the entry's position in the raw list. An empty or
    missing caption becomes ``None``. ``doc`` is accepted but not used.
    """
    raw_entries = payload.get("pictures") or payload.get("figures") or []
    return [
        ExtractedFigure(
            page_number=_page_of(entry),
            figure_index=position,
            caption=(entry.get("caption") or "").strip() or None,
            block_id=entry.get("self_ref") or entry.get("id"),
        )
        for position, entry in enumerate(raw_entries)
        if isinstance(entry, dict)
    ]
 def _walk_pages(
    payload: dict[str, Any],
    blocks: list[ExtractedBlock],
    tables: list[ExtractedTable],
    figures: list[ExtractedFigure],
 ) -> list[ExtractedPage]:
    pages_meta = payload.get("pages") or {}
    page_numbers: set[int] = set()
    if isinstance(pages_meta, dict):
        for k in pages_meta.keys():
            try:
                page_numbers.add(int(k))
            except (ValueError, TypeError):
                continue
    elif isinstance(pages_meta, list):
        for p in pages_meta:
            if isinstance(p, dict):
                pn = p.get("page_no") or p.get("page") or p.get("number")
                if isinstance(pn, int):
                    page_numbers.add(pn)
    for b in blocks:
        page_numbers.add(b.page_number)
    for t in tables:
        page_numbers.add(t.page_number)
    for f in figures:
        page_numbers.add(f.page_number)
    page_numbers.discard(0)
    if not page_numbers:
        page_numbers = {1}
    by_page_text: dict[int, list[str]] = {pn: [] for pn in page_numbers}
    for b in blocks:
        by_page_text.setdefault(b.page_number, []).append(b.text)
    has_tables_set = {t.page_number for t in tables}
    has_figures_set = {f.page_number for f in figures}
    return [
        ExtractedPage(
            page_number=pn,
            text="\n\n".join(by_page_text.get(pn, [])),
            has_tables=pn in has_tables_set,
            has_figures=pn in has_figures_set,
        )
        for pn in sorted(page_numbers)
    ]
 def _page_of(item: dict[str, Any]) -> int:
    prov = item.get("prov") or item.get("provenance")
    if isinstance(prov, list) and prov:
        first = prov[0]
        if isinstance(first, dict):
            pn = first.get("page_no") or first.get("page") or first.get("page_number")
            if isinstance(pn, int):
                return pn
    pn = item.get("page_no") or item.get("page") or item.get("page_number")
    if isinstance(pn, int):
        return pn
    return 1
 def _table_markdown(doc: Any, raw: dict[str, Any], idx: int) -> str:
    # Try Docling's own export first (per-table).
    try:
        export = getattr(doc, "export_table_to_markdown", None)
        if callable(export):
            return export(idx)
    except Exception:  # noqa: BLE001
        pass
    grid = raw.get("data") or raw.get("table_cells") or raw.get("grid")
    if isinstance(grid, list) and grid and isinstance(grid[0], list):
        return _grid_to_markdown(grid)
    cells = raw.get("table_cells")
    if isinstance(cells, list):
        return _cells_to_markdown(cells)
    return ""
 def _grid_to_markdown(grid: list[list[Any]]) -> str:
    if not grid:
        return ""
    def _cell(c: Any) -> str:
        if isinstance(c, dict):
            return str(c.get("text") or c.get("value") or "").replace("|", "\\|").strip()
        return str(c).replace("|", "\\|").strip()
    header = grid[0]
    body = grid[1:] if len(grid) > 1 else []
    cols = len(header)
    out = ["| " + " | ".join(_cell(c) for c in header) + " |"]
    out.append("| " + " | ".join(["---"] * cols) + " |")
    for row in body:
        cells = [_cell(c) for c in row]
        if len(cells) < cols:
            cells += [""] * (cols - len(cells))
        out.append("| " + " | ".join(cells[:cols]) + " |")
    return "\n".join(out)
 def _cells_to_markdown(cells: list[Any]) -> str:
    rows: dict[int, dict[int, str]] = {}
    for c in cells:
        if not isinstance(c, dict):
            continue
        r = c.get("start_row_offset_idx", c.get("row", 0)) or 0
        col = c.get("start_col_offset_idx", c.get("col", 0)) or 0
        rows.setdefault(r, {})[col] = (c.get("text") or "").replace("|", "\\|").strip()
    if not rows:
        return ""
    max_col = max((max(r.keys()) for r in rows.values()), default=0)
    grid = []
    for r_idx in sorted(rows):
        row = [rows[r_idx].get(c, "") for c in range(max_col + 1)]
        grid.append(row)
    return _grid_to_markdown(grid)
 def _table_csv(raw: dict[str, Any]) -> str | None:
    grid = raw.get("data") or raw.get("grid")
    if not (isinstance(grid, list) and grid and isinstance(grid[0], list)):
        return None
    import csv
    import io
    buf = io.StringIO()
    writer = csv.writer(buf)
    for row in grid:
        writer.writerow([
            (c.get("text") if isinstance(c, dict) else c) or "" for c in row
        ])
    return buf.getvalue()
--- a/app/ingestion/figure_processor.py
+++ b/app/ingestion/figure_processor.py
@@ -0,0 +1,78 @@
 """Persists Docling figures to PostgreSQL + MinIO (caption + optional crop)."""
 from __future__ import annotations
 import uuid
 from sqlalchemy import select
 from app.db.models import ArtifactType, DocumentArtifact, Figure
 from app.ingestion.docling_extractor import ExtractedFigure
 from app.logging_config import get_logger
 from app.storage.local_paths import key_figure_crop
 from app.storage.minio_client import MinioStorage
 logger = get_logger(__name__)
 def persist_figures(
    db,
    storage: MinioStorage,
    document_id: uuid.UUID,
    figures: list[ExtractedFigure],
    page_id_by_number: dict[int, uuid.UUID],
 ) -> int:
    """Upsert one ``Figure`` row per extracted figure; returns how many were processed.

    Rows are matched on ``(document_id, figure_index)``; caption and
    description are refreshed on every call. When a figure carries image
    bytes, they are stored in the derived bucket and a ``FIGURE_CROP``
    artifact is recorded. Nothing is committed here.
    """
    processed = 0
    for fig in figures:
        lookup = select(Figure).where(
            Figure.document_id == document_id, Figure.figure_index == fig.figure_index
        )
        row = db.execute(lookup).scalar_one_or_none()
        if row is None:
            row = Figure(
                document_id=document_id,
                page_id=page_id_by_number.get(fig.page_number),
                page_number=fig.page_number,
                figure_index=fig.figure_index,
            )
            db.add(row)

        row.caption = fig.caption
        if fig.caption:
            row.description = f"Figure on page {fig.page_number}. Caption: {fig.caption}"
        else:
            row.description = f"Figure detected on page {fig.page_number}."

        if fig.image_bytes:
            bucket = storage.derived_bucket
            key = key_figure_crop(document_id, fig.page_number, fig.figure_index)
            storage.put_bytes(
                bucket=bucket,
                key=key,
                data=fig.image_bytes,
                content_type=f"image/{fig.image_ext}",
            )
            row.storage_bucket = storage.derived_bucket
            row.storage_key = key
            _ensure_artifact(
                db, document_id, ArtifactType.FIGURE_CROP, storage.derived_bucket, key, fig.page_number
            )
        processed += 1
    return processed
 def _ensure_artifact(db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, page: int | None) -> None:
    """Add a ``DocumentArtifact`` row unless one with the same document and storage key exists."""
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    already_recorded = db.execute(lookup).scalar_one_or_none()
    if not already_recorded:
        artifact = DocumentArtifact(
            document_id=document_id,
            artifact_type=artifact_type,
            storage_bucket=bucket,
            storage_key=key,
            page_number=page,
        )
        db.add(artifact)
--- a/app/ingestion/normalizer.py
+++ b/app/ingestion/normalizer.py
@@ -0,0 +1,12 @@
 """Block-level normalization wrappers around utils.text_cleaning."""
 from __future__ import annotations
 from app.utils.text_cleaning import clean_ocr_text, normalize_for_search
 def normalize_block(text: str) -> tuple[str, str]:
    """Return ``(display_text, normalized_text)``.

    The display text is ``clean_ocr_text(text)``; the normalized text is
    ``normalize_for_search`` applied to that display text.
    """
    cleaned = clean_ocr_text(text)
    return cleaned, normalize_for_search(cleaned)
--- a/app/ingestion/ocr.py
+++ b/app/ingestion/ocr.py
@@ -0,0 +1,87 @@
 """OCRmyPDF integration with Tesseract.
 We treat OCR as best-effort: if the input PDF already has a text layer (or OCR is
 disabled by config), we skip OCR and use the original PDF. On failure, the
 caller is expected to mark the document ``OCR_FAILED`` and stop processing it.
 """
 from __future__ import annotations
 from dataclasses import dataclass
 from pathlib import Path
 import ocrmypdf
 from app.config import settings
 from app.logging_config import get_logger
 from app.utils.pdf import has_searchable_text
 logger = get_logger(__name__)
@dataclass
 class OcrResult:
    """Outcome of ``run_ocr`` for a single PDF."""
    # PDF that downstream stages should read.
    output_path: Path
    # True when OCRmyPDF did not produce the output and the input was copied instead.
    skipped: bool
    # Machine-readable reason: ``ocr_disabled``, ``already_searchable``,
    # ``prior_ocr_found``, ``digitally_signed`` or ``ocr_completed``.
    reason: str
    # Language string that was (or would have been) passed to OCRmyPDF.
    languages: str
 def run_ocr(input_pdf: Path, output_pdf: Path, languages: str | None = None) -> OcrResult:
    """Produce ``output_pdf`` from ``input_pdf``, running OCRmyPDF only when needed.

    Decision order:

    1. ``settings.ocr_enabled`` is false: the input is copied to
       ``output_pdf`` and the result reports ``ocr_disabled``.
    2. ``has_searchable_text(input_pdf)`` is true: same copy, reason
       ``already_searchable``.
    3. Otherwise OCRmyPDF runs with ``languages`` (falling back to
       ``settings.ocr_languages``).

    ``PriorOcrFoundError`` and ``DigitalSignatureError`` from OCRmyPDF are
    converted into skips. ``EncryptedPdfError`` and ``MissingDependencyError``
    are logged and re-raised; every other exception propagates untouched so
    the caller can record the failure.
    """
    langs = languages or settings.ocr_languages

    skip_reason: str | None = None
    if not settings.ocr_enabled:
        skip_reason = "ocr_disabled"
    elif has_searchable_text(input_pdf):
        skip_reason = "already_searchable"
    if skip_reason is not None:
        return _skip(input_pdf, output_pdf, langs, skip_reason)

    output_pdf.parent.mkdir(parents=True, exist_ok=True)
    logger.info("ocr.start", input=str(input_pdf), output=str(output_pdf), languages=langs)

    ocr_options = dict(
        input_file=str(input_pdf),
        output_file=str(output_pdf),
        language=langs,
        skip_text=False,
        redo_ocr=False,
        force_ocr=False,
        deskew=settings.ocr_deskew,
        clean=settings.ocr_clean,
        optimize=settings.ocr_optimize,
        progress_bar=False,
        jobs=1,
        output_type="pdf",
        # NOTE(review): in OCRmyPDF ``skip_big`` appears to be a megapixel cap
        # on pages sent to OCR, not a switch for pages that already carry
        # text - confirm this is the intended setting.
        skip_big=200.0,
    )
    try:
        ocrmypdf.ocr(**ocr_options)
    except ocrmypdf.exceptions.PriorOcrFoundError:
        # With skip_text/redo_ocr/force_ocr all off, existing text aborts the run.
        logger.info("ocr.skip.prior_ocr", input=str(input_pdf))
        return _skip(input_pdf, output_pdf, langs, "prior_ocr_found")
    except ocrmypdf.exceptions.DigitalSignatureError:
        logger.warning("ocr.skip.signed_pdf", input=str(input_pdf))
        return _skip(input_pdf, output_pdf, langs, "digitally_signed")
    except ocrmypdf.exceptions.EncryptedPdfError as exc:
        logger.warning("ocr.encrypted", input=str(input_pdf), error=str(exc))
        raise
    except ocrmypdf.exceptions.MissingDependencyError as exc:
        logger.error("ocr.missing_dependency", error=str(exc))
        raise

    logger.info("ocr.done", output=str(output_pdf))
    return OcrResult(output_path=output_pdf, skipped=False, reason="ocr_completed", languages=langs)
 def _skip(input_pdf: Path, output_pdf: Path, langs: str, reason: str) -> OcrResult:
    """Make ``output_pdf`` a byte-for-byte copy of the input and report a skip.

    The copy is omitted only when ``output_pdf`` already exists and resolves
    to the same file as ``input_pdf``.
    """
    output_pdf.parent.mkdir(parents=True, exist_ok=True)
    already_same_file = output_pdf.exists() and output_pdf.resolve() == input_pdf.resolve()
    if not already_same_file:
        payload = input_pdf.read_bytes()
        output_pdf.write_bytes(payload)
    return OcrResult(output_path=output_pdf, skipped=True, reason=reason, languages=langs)
--- a/app/ingestion/pipeline.py
+++ b/app/ingestion/pipeline.py
@@ -0,0 +1,384 @@
 """Per-document end-to-end pipeline: OCR -> Docling -> chunk -> persist -> index.
 Called by the Celery worker. Idempotent: re-running on the same document deletes
 existing chunks for that document and re-creates them, then re-indexes in
 OpenSearch and Qdrant.
 """
 from __future__ import annotations
 import json
 import uuid
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any
 from sqlalchemy import delete, select
 from app.config import settings
 from app.db.models import (
    ArtifactType,
    Chunk,
    Document,
    DocumentArtifact,
    DocumentStatus,
    Page,
    ProcessingEvent,
 )
 from app.db.session import session_scope
 from app.indexing import opensearch_client, qdrant_client
 from app.indexing.embeddings import get_embedder
 from app.ingestion.chunker import ChunkRecord, chunk_extraction
 from app.ingestion.docling_extractor import ExtractionResult, extract
 from app.ingestion.figure_processor import persist_figures
 from app.ingestion.ocr import run_ocr
 from app.ingestion.table_processor import persist_tables
 from app.logging_config import get_logger
 from app.storage.local_paths import (
    key_docling_json,
    key_markdown,
    key_ocr_pdf,
    work_dir_for,
 )
 from app.storage.minio_client import get_storage
 from app.utils.language import detect_language
 logger = get_logger(__name__)
 def process_document_id(document_id: uuid.UUID, run_id: uuid.UUID | None = None) -> dict[str, Any]:
    """Top-level entry called by the Celery task.

    Delegates to ``_run_pipeline``. Failures the pipeline handles itself
    (missing original, OCR, Docling, indexing) come back as the dict produced
    by ``_fail``. Any other exception (storage upload, database write, local
    file copy) is recorded on the document as ``DocumentStatus.FAILED`` and
    then re-raised unchanged, so the caller still sees the original error
    while the document no longer stays stuck in an intermediate status.

    Args:
        document_id: Primary key of the ``Document`` to process.
        run_id: Optional ingestion-run identifier attached to emitted events.

    Returns:
        ``{"status": "missing"}`` when the document row does not exist, the
        ``_fail`` payload for a handled stage failure, or
        ``{"status": DocumentStatus.INDEXING_COMPLETED, "chunks": <count>}``.
    """
    try:
        return _run_pipeline(document_id, run_id)
    except Exception as exc:  # noqa: BLE001
        logger.exception("pipeline.unhandled_error", document_id=str(document_id))
        try:
            _fail(document_id, run_id, DocumentStatus.FAILED, f"Pipeline failed: {exc}")
        except Exception:  # noqa: BLE001
            # Recording the failure is best-effort; never mask the original error.
            logger.exception("pipeline.fail_record_failed", document_id=str(document_id))
        raise
 def _run_pipeline(document_id: uuid.UUID, run_id: uuid.UUID | None) -> dict[str, Any]:
    """Run OCR -> Docling -> persist -> index for one document."""
    storage = get_storage()
    storage.ensure_buckets()
    with session_scope() as db:
        doc = db.get(Document, document_id)
        if doc is None:
            logger.warning("pipeline.document_missing", document_id=str(document_id))
            return {"status": "missing"}
        source_path = Path(doc.source_path)
        sha = doc.sha256
        original_artifact = db.execute(
            select(DocumentArtifact).where(
                DocumentArtifact.document_id == doc.id,
                DocumentArtifact.artifact_type == ArtifactType.ORIGINAL_PDF,
            )
        ).scalar_one_or_none()
        # Copy plain values out while the session is open: the ORM instance
        # must not be touched once the session has been closed.
        original_location = (
            (original_artifact.storage_bucket, original_artifact.storage_key)
            if original_artifact
            else None
        )
    work_dir = work_dir_for(document_id)
    local_pdf = work_dir / f"{sha}.pdf"
    if not local_pdf.exists():
        if source_path.exists():
            local_pdf.write_bytes(source_path.read_bytes())
        elif original_location:
            original_bucket, original_key = original_location
            storage.get_to_path(original_bucket, original_key, local_pdf)
        else:
            return _fail(
                document_id,
                run_id,
                DocumentStatus.OCR_FAILED,
                "Original PDF not available locally or in MinIO",
            )
    # ---------------- OCR ----------------
    ocr_pdf = work_dir / "ocr.pdf"
    try:
        _emit_event(document_id, run_id, DocumentStatus.OCR_STARTED, "OCR started")
        ocr_result = run_ocr(local_pdf, ocr_pdf, languages=settings.ocr_languages)
    except Exception as exc:  # noqa: BLE001
        logger.exception("pipeline.ocr_failed", document_id=str(document_id))
        return _fail(document_id, run_id, DocumentStatus.OCR_FAILED, f"OCR failed: {exc}")
    # Upload OCR PDF (even if we 'skipped' it - OCR PDF is the canonical input to Docling).
    ocr_key = key_ocr_pdf(document_id)
    storage.put_file(
        bucket=storage.derived_bucket,
        key=ocr_key,
        path=ocr_result.output_path,
        content_type="application/pdf",
    )
    with session_scope() as db:
        _ensure_artifact(db, document_id, ArtifactType.OCR_PDF, storage.derived_bucket, ocr_key)
        doc = db.get(Document, document_id)
        if doc is not None:
            doc.status = DocumentStatus.OCR_COMPLETED
        db.add(
            ProcessingEvent(
                run_id=run_id,
                document_id=document_id,
                stage=DocumentStatus.OCR_COMPLETED,
                level="INFO",
                message=f"OCR finished ({ocr_result.reason})",
                data={"skipped": ocr_result.skipped, "languages": ocr_result.languages},
            )
        )
    # ---------------- Docling ----------------
    try:
        _emit_event(document_id, run_id, DocumentStatus.EXTRACTION_STARTED, "Docling extraction started")
        extraction = extract(ocr_result.output_path)
    except Exception as exc:  # noqa: BLE001
        logger.exception("pipeline.docling_failed", document_id=str(document_id))
        return _fail(document_id, run_id, DocumentStatus.EXTRACTION_FAILED, f"Docling failed: {exc}")
    # Persist Markdown + JSON to MinIO.
    md_key = key_markdown(document_id)
    json_key = key_docling_json(document_id)
    storage.put_bytes(
        bucket=storage.derived_bucket,
        key=md_key,
        data=extraction.markdown.encode("utf-8"),
        content_type="text/markdown",
    )
    storage.put_bytes(
        bucket=storage.derived_bucket,
        key=json_key,
        data=json.dumps(extraction.json_payload, ensure_ascii=False).encode("utf-8"),
        content_type="application/json",
    )
    # ---------------- Persist pages, chunks, tables, figures ----------------
    chunk_records = chunk_extraction(extraction)
    # Language detection only looks at the text of the first three pages.
    sample_text = "\n".join(p.text for p in extraction.pages[:3] if p.text)
    lang = detect_language(sample_text)
    with session_scope() as db:
        _ensure_artifact(db, document_id, ArtifactType.MARKDOWN, storage.derived_bucket, md_key)
        _ensure_artifact(db, document_id, ArtifactType.DOCLING_JSON, storage.derived_bucket, json_key)
        doc = db.get(Document, document_id)
        if doc is None:
            return {"status": "missing"}
        doc.status = DocumentStatus.EXTRACTION_COMPLETED
        if lang and not doc.language_hint:
            doc.language_hint = lang
        page_id_by_number = _upsert_pages(db, document_id, extraction)
        persist_tables(db, storage, document_id, extraction.tables, page_id_by_number)
        persist_figures(db, storage, document_id, extraction.figures, page_id_by_number)
        # Replace chunks idempotently: drop all and re-insert.
        db.execute(delete(Chunk).where(Chunk.document_id == document_id))
        for cr in chunk_records:
            db.add(_to_chunk_row(document_id, page_id_by_number, cr))
        doc.status = DocumentStatus.CHUNKING_COMPLETED
        db.add(
            ProcessingEvent(
                run_id=run_id,
                document_id=document_id,
                stage=DocumentStatus.CHUNKING_COMPLETED,
                level="INFO",
                message="Chunking complete",
                data={"chunks": len(chunk_records)},
            )
        )
    # ---------------- Indexing (OpenSearch + Qdrant) ----------------
    try:
        opensearch_client.ensure_index()
        qdrant_client.ensure_collection()
        # Remove anything indexed by a previous run before re-indexing.
        opensearch_client.delete_by_document(str(document_id))
        qdrant_client.delete_by_document(str(document_id))
        os_docs, qdrant_points = _build_index_payloads(document_id, chunk_records, extraction, lang)
        if os_docs:
            opensearch_client.index_chunks(os_docs)
        if qdrant_points:
            embedder = get_embedder()
            texts_to_embed = [text for _, text, _ in qdrant_points]
            vectors = embedder.encode(texts_to_embed)
            triples = [
                (chunk_id, vec, payload)
                for (chunk_id, _text, payload), vec in zip(qdrant_points, vectors, strict=True)
            ]
            qdrant_client.upsert_chunks(triples)
    except Exception as exc:  # noqa: BLE001
        logger.exception("pipeline.indexing_failed", document_id=str(document_id))
        return _fail(document_id, run_id, DocumentStatus.FAILED, f"Indexing failed: {exc}")
    with session_scope() as db:
        doc = db.get(Document, document_id)
        if doc is not None:
            doc.status = DocumentStatus.INDEXING_COMPLETED
            doc.error_message = None
        db.add(
            ProcessingEvent(
                run_id=run_id,
                document_id=document_id,
                stage=DocumentStatus.INDEXING_COMPLETED,
                level="INFO",
                message="Indexing complete",
                data={"chunks": len(chunk_records)},
            )
        )
    return {"status": DocumentStatus.INDEXING_COMPLETED, "chunks": len(chunk_records)}
 # ---------------- helpers ----------------
 def _to_chunk_row(
    document_id: uuid.UUID, page_id_by_number: dict[int, uuid.UUID], cr: ChunkRecord
 ) -> Chunk:
    """Map one ``ChunkRecord`` onto an unsaved ``Chunk`` ORM row.

    ``page_id`` is looked up by the record's page number and is ``None`` when
    that page is not in ``page_id_by_number``. ``ocr_confidence`` is always
    stored as ``None`` here.
    """
    column_values = {
        "document_id": document_id,
        "page_id": page_id_by_number.get(cr.page_number),
        "page_number": cr.page_number,
        "block_id": cr.block_id,
        "chunk_index": cr.chunk_index,
        "block_type": cr.block_type,
        "text": cr.text,
        "normalized_text": cr.normalized_text,
        "token_count": cr.token_count,
        "ocr_confidence": None,
        "quality_flags": cr.quality_flags,
        "chunk_metadata": cr.metadata,
    }
    return Chunk(**column_values)
 def _upsert_pages(db, document_id: uuid.UUID, extraction: ExtractionResult) -> dict[int, uuid.UUID]:
    """Create or refresh one ``Page`` row per extracted page.

    Existing rows for the document are matched by ``page_number`` and updated
    in place, including ``ocr_confidence`` so a re-run does not keep the value
    from an earlier extraction. Pages that exist in the database but are
    absent from ``extraction`` are left untouched.

    Returns:
        Mapping of page number to the ``Page`` primary key, covering exactly
        the pages present in ``extraction.pages``.
    """
    existing = {
        p.page_number: p
        for p in db.execute(select(Page).where(Page.document_id == document_id)).scalars()
    }
    out: dict[int, uuid.UUID] = {}
    for ep in extraction.pages:
        page = existing.get(ep.page_number)
        if page is None:
            page = Page(
                document_id=document_id,
                page_number=ep.page_number,
                text=ep.text,
                ocr_confidence=ep.ocr_confidence,
                has_tables=ep.has_tables,
                has_figures=ep.has_figures,
                has_handwriting=ep.has_handwriting,
            )
            db.add(page)
            # Flush so the new row's id is available for the mapping below.
            db.flush()
        else:
            page.text = ep.text
            page.ocr_confidence = ep.ocr_confidence
            page.has_tables = ep.has_tables
            page.has_figures = ep.has_figures
            page.has_handwriting = ep.has_handwriting
        out[ep.page_number] = page.id
    return out
 def _build_index_payloads(
    document_id: uuid.UUID,
    chunks: list[ChunkRecord],
    extraction: ExtractionResult,
    language_hint: str | None,
 ) -> tuple[list[dict[str, Any]], list[tuple[str, str, dict[str, Any]]]]:
    """Build OpenSearch documents and Qdrant payloads from the persisted chunks.

    The payloads are derived from the ``Chunk`` rows stored in the database
    (so they carry the real chunk ids); ``chunks`` and ``extraction`` are
    accepted for interface compatibility but not read. All ORM attributes are
    read while the session is still open, so the result does not depend on
    instances remaining usable after the session closes.

    Returns:
        ``(os_docs, qdrant)`` where ``os_docs`` is a list of OpenSearch
        documents and ``qdrant`` is a list of ``(chunk_id, text, payload)``
        triples. Both are empty when the document row no longer exists.
    """
    os_docs: list[dict[str, Any]] = []
    qdrant: list[tuple[str, str, dict[str, Any]]] = []
    with session_scope() as db:
        doc = db.get(Document, document_id)
        if doc is None:
            return [], []
        original_file_name = doc.original_file_name
        source_path = doc.source_path
        chunk_rows = (
            db.execute(select(Chunk).where(Chunk.document_id == document_id))
            .scalars()
            .all()
        )
        for row in chunk_rows:
            chunk_id = str(row.id)
            text = row.text or ""
            os_docs.append(
                {
                    "chunk_id": chunk_id,
                    "document_id": str(document_id),
                    "source_path": source_path,
                    "original_file_name": original_file_name,
                    "page_number": row.page_number,
                    "block_type": row.block_type,
                    "block_id": row.block_id,
                    "text": text,
                    "normalized_text": row.normalized_text,
                    "ocr_confidence": row.ocr_confidence,
                    "language_hint": language_hint,
                    "metadata": row.chunk_metadata or {},
                    "quality_flags": row.quality_flags or {},
                    "created_at": (row.created_at or datetime.now(tz=timezone.utc)).isoformat(),
                }
            )
            # Qdrant stores only a 512-character preview; the full text is
            # returned alongside so the caller can embed it.
            text_preview = text[:512]
            qdrant.append(
                (
                    chunk_id,
                    text,
                    {
                        "document_id": str(document_id),
                        "source_path": source_path,
                        "original_file_name": original_file_name,
                        "page_number": row.page_number,
                        "block_type": row.block_type,
                        "block_id": row.block_id,
                        "text_preview": text_preview,
                        "ocr_confidence": row.ocr_confidence,
                        "quality_flags": row.quality_flags or {},
                        "metadata": row.chunk_metadata or {},
                    },
                )
            )
    return os_docs, qdrant
 def _ensure_artifact(db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str) -> None:
    """Add a ``DocumentArtifact`` row for ``key`` if the document has none yet.

    Presence is decided by ``document_id`` plus ``storage_key``;
    ``artifact_type`` is stored on new rows but is not part of the lookup.
    """
    by_document_and_key = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    already_recorded = db.execute(by_document_and_key).scalar_one_or_none()
    if not already_recorded:
        db.add(
            DocumentArtifact(
                document_id=document_id,
                artifact_type=artifact_type,
                storage_bucket=bucket,
                storage_key=key,
            )
        )
 def _emit_event(document_id: uuid.UUID, run_id: uuid.UUID | None, stage: str, message: str) -> None:
    """Store one INFO-level ``ProcessingEvent`` in its own session scope."""
    with session_scope() as db:
        info_event = ProcessingEvent(
            run_id=run_id,
            document_id=document_id,
            stage=stage,
            level="INFO",
            message=message,
            data={},
        )
        db.add(info_event)
 def _fail(
    document_id: uuid.UUID, run_id: uuid.UUID | None, stage: str, message: str
 ) -> dict[str, Any]:
    """Record a failure for the document and return the failure payload.

    Sets the document's status to ``stage`` and its ``error_message`` to the
    first 2000 characters of ``message`` (when the row exists), stores an
    ERROR-level ``ProcessingEvent`` carrying the full message, logs the
    failure, and returns ``{"status": stage, "error": message}``.
    """
    with session_scope() as db:
        document = db.get(Document, document_id)
        if document is not None:
            document.status = stage
            document.error_message = message[:2000]
        failure_event = ProcessingEvent(
            run_id=run_id,
            document_id=document_id,
            stage=stage,
            level="ERROR",
            message=message,
            data={},
        )
        db.add(failure_event)
    logger.error("pipeline.failed", document_id=str(document_id), stage=stage, message=message)
    result: dict[str, Any] = {"status": stage, "error": message}
    return result
--- a/app/ingestion/quality.py
+++ b/app/ingestion/quality.py
@@ -0,0 +1,41 @@
 """Quality flag computation for chunks."""
 from __future__ import annotations
 from typing import Any
 from app.utils.text_cleaning import looks_garbled
 # An OCR confidence strictly below this value sets ``low_ocr_confidence``.
 LOW_OCR_CONFIDENCE_THRESHOLD = 0.6
 # Non-empty text whose stripped length is below this many characters sets ``very_short_text``.
 SHORT_TEXT_THRESHOLD = 24
 def compute_quality_flags(
    *,
    text: str,
    block_type: str,
    ocr_confidence: float | None,
    has_handwriting: bool = False,
 ) -> dict[str, Any]:
    """Derive the quality-flag dictionary for one chunk.

    ``needs_manual_review`` is raised when the OCR confidence is low, the text
    looks garbled, or handwriting is involved. A short text alone does not
    trigger review, and empty text is never reported as ``very_short_text``.
    """
    low_confidence = ocr_confidence is not None and ocr_confidence < LOW_OCR_CONFIDENCE_THRESHOLD
    very_short = bool(text) and len(text.strip()) < SHORT_TEXT_THRESHOLD
    garbled = bool(looks_garbled(text))
    handwriting = has_handwriting or block_type == "handwriting"
    review_needed = bool(low_confidence or garbled or handwriting)
    return {
        "low_ocr_confidence": low_confidence,
        "very_short_text": very_short,
        "possible_garbled_text": garbled,
        "table_detected": block_type == "table",
        "figure_detected": block_type in ("figure_caption", "figure_description"),
        "handwriting_detected": handwriting,
        "needs_manual_review": review_needed,
    }
--- a/app/ingestion/scanner.py
+++ b/app/ingestion/scanner.py
@@ -0,0 +1,184 @@
 """Folder scanner: discovers PDFs, deduplicates by SHA256, persists discovery rows.
 The scanner does NOT trigger OCR or extraction. It only:
 - enumerates PDF files,
 - hashes each file,
 - creates / reuses a ``Document`` row,
 - uploads the original PDF to MinIO,
 - emits ``DISCOVERED`` / ``STORED_ORIGINAL`` events.
 Heavy work (OCR, Docling, indexing) is performed by the Celery worker pipeline.
 """
 from __future__ import annotations
 import os
 import uuid
 from collections.abc import Iterator
 from dataclasses import dataclass
 from pathlib import Path
 from sqlalchemy import select
 from app.db.models import (
    ArtifactType,
    Document,
    DocumentArtifact,
    DocumentStatus,
    ProcessingEvent,
 )
 from app.db.session import session_scope
 from app.logging_config import get_logger
 from app.storage.local_paths import key_original_pdf
 from app.storage.minio_client import get_storage
 from app.utils.hashing import sha256_file
 from app.utils.pdf import is_pdf
 logger = get_logger(__name__)
@dataclass
 class DiscoveryRecord:
    """Outcome of examining one candidate PDF during a scan."""
    # File that was examined.
    path: Path
    # SHA-256 of the file; ``None`` when stat/hashing failed.
    sha256: str | None
    # Matching or newly created ``Document`` id; ``None`` for unreadable files
    # and for files whose original could not be stored.
    document_id: uuid.UUID | None
    # True when a ``Document`` with the same hash already existed.
    duplicate: bool
    # True when the file could not be hashed or the original could not be stored.
    invalid: bool = False
 def iter_pdf_files(root: Path, recursive: bool = True) -> Iterator[Path]:
    """Lazily yield every path under ``root`` that ``is_pdf`` accepts.

    When ``root`` is a file it is the only candidate. Otherwise the directory
    is walked with ``os.walk`` (``recursive=True``) or listed one level deep
    with ``Path.iterdir`` (``recursive=False``).
    """
    if root.is_file():
        if is_pdf(root):
            yield root
        return
    if recursive:
        candidates = (
            Path(dirpath) / name
            for dirpath, _dirnames, filenames in os.walk(root)
            for name in filenames
        )
    else:
        candidates = root.iterdir()
    for candidate in candidates:
        if is_pdf(candidate):
            yield candidate
 def discover_documents(
    root: Path, recursive: bool = True, force: bool = False
 ) -> Iterator[DiscoveryRecord]:
    """Discover PDFs under ``root`` and yield one ``DiscoveryRecord`` per file.

    For every PDF the file is hashed, a ``Document`` row is created or reused
    (matched by SHA-256), the original is uploaded to the originals bucket if
    it is not there yet, and the matching events are stored.

    Each record is yielded only after the ``session_scope`` block for that
    file has exited, so the consumer never receives a ``document_id`` while
    its transaction is still open, and abandoning the generator cannot unwind
    a half-finished unit of work.
    NOTE(review): this assumes ``session_scope`` commits on a clean exit -
    confirm in ``app.db.session``.

    Args:
        root: A PDF file or a directory to scan.
        recursive: Walk sub-directories when ``root`` is a directory.
        force: Re-run the store step for documents whose hash already exists
            instead of reporting them as duplicates straight away.

    Yields:
        ``DiscoveryRecord`` with ``invalid=True`` for files that could not be
        hashed or stored, ``duplicate=True`` for already-known hashes.
    """
    storage = get_storage()
    storage.ensure_buckets()
    for path in iter_pdf_files(root, recursive=recursive):
        try:
            stat = path.stat()
            sha = sha256_file(path)
        except Exception as exc:  # noqa: BLE001
            logger.warning("scan.invalid_file", path=str(path), error=str(exc))
            yield DiscoveryRecord(path=path, sha256=None, document_id=None, duplicate=False, invalid=True)
            continue
        with session_scope() as db:
            record = _discover_one(db, storage, path, sha, stat.st_size, force)
        yield record
 def _discover_one(db, storage, path: Path, sha: str, size: int, force: bool) -> DiscoveryRecord:
    """Create/reuse the ``Document`` for one hashed file and store its original."""
    existing = db.execute(
        select(Document).where(Document.sha256 == sha)
    ).scalar_one_or_none()
    if existing and not force:
        logger.debug("scan.duplicate", path=str(path), sha256=sha, document_id=str(existing.id))
        return DiscoveryRecord(path=path, sha256=sha, document_id=existing.id, duplicate=True)
    doc = existing or Document(
        id=uuid.uuid4(),
        source_path=str(path),
        original_file_name=path.name,
        sha256=sha,
        file_size_bytes=size,
        mime_type="application/pdf",
        status=DocumentStatus.DISCOVERED,
    )
    if not existing:
        db.add(doc)
        db.flush()
        db.add(
            ProcessingEvent(
                document_id=doc.id,
                stage=DocumentStatus.DISCOVERED,
                level="INFO",
                message="Document discovered",
                data={"sha256": sha, "size": size, "path": str(path)},
            )
        )
    # Upload original (idempotent) and record artifact if missing.
    key = key_original_pdf(doc.id, sha)
    try:
        if not storage.exists(storage.originals_bucket, key):
            storage.put_file(
                bucket=storage.originals_bucket,
                key=key,
                path=path,
                content_type="application/pdf",
                metadata={"sha256": sha, "original-name": path.name[:255]},
            )
        _ensure_artifact(
            db,
            doc.id,
            ArtifactType.ORIGINAL_PDF,
            storage.originals_bucket,
            key,
            sha,
        )
        # Only advance documents that are still freshly discovered; later
        # pipeline statuses are left alone.
        if doc.status == DocumentStatus.DISCOVERED:
            doc.status = DocumentStatus.STORED_ORIGINAL
            db.add(
                ProcessingEvent(
                    document_id=doc.id,
                    stage=DocumentStatus.STORED_ORIGINAL,
                    level="INFO",
                    message="Original stored to MinIO",
                    data={"bucket": storage.originals_bucket, "key": key},
                )
            )
    except Exception as exc:  # noqa: BLE001
        logger.error("scan.store_failed", path=str(path), error=str(exc))
        doc.status = DocumentStatus.FAILED
        doc.error_message = f"store_original: {exc}"
        db.add(
            ProcessingEvent(
                document_id=doc.id,
                stage="STORE_FAILED",
                level="ERROR",
                message=str(exc),
                data={"path": str(path)},
            )
        )
        return DiscoveryRecord(path=path, sha256=sha, document_id=None, duplicate=False, invalid=True)
    return DiscoveryRecord(
        path=path, sha256=sha, document_id=doc.id, duplicate=bool(existing)
    )
 def _ensure_artifact(
    db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, checksum: str | None
 ) -> None:
    """Add a ``DocumentArtifact`` row unless an identical one is already recorded.

    A row counts as already recorded when ``document_id``, ``artifact_type``
    and ``storage_key`` all match.
    """
    matching_row = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.artifact_type == artifact_type,
        DocumentArtifact.storage_key == key,
    )
    if db.execute(matching_row).scalar_one_or_none():
        return
    new_artifact = DocumentArtifact(
        document_id=document_id,
        artifact_type=artifact_type,
        storage_bucket=bucket,
        storage_key=key,
        checksum=checksum,
    )
    db.add(new_artifact)
--- a/app/ingestion/table_processor.py
+++ b/app/ingestion/table_processor.py
@@ -0,0 +1,84 @@
 """Persists Docling tables to PostgreSQL + MinIO."""
 from __future__ import annotations
 import json
 import uuid
 from sqlalchemy import select
 from app.db.models import ArtifactType, DocumentArtifact, Table
 from app.ingestion.docling_extractor import ExtractedTable
 from app.logging_config import get_logger
 from app.storage.local_paths import key_table_json
 from app.storage.minio_client import MinioStorage
 logger = get_logger(__name__)
 def persist_tables(
    db,
    storage: MinioStorage,
    document_id: uuid.UUID,
    tables: list[ExtractedTable],
    page_id_by_number: dict[int, uuid.UUID],
 ) -> int:
    """Upsert one ``Table`` row per extracted table and mirror its JSON to storage.

    Rows are matched by ``(document_id, table_index)``. The content columns
    are overwritten on every call; ``page_id`` and ``page_number`` are set
    only when the row is first created. Tables with ``json_data`` also get a
    JSON blob in the derived bucket plus a ``TABLE_JSON`` artifact row.

    Returns:
        Number of tables processed.
    """
    persisted = 0
    for extracted in tables:
        lookup = select(Table).where(
            Table.document_id == document_id, Table.table_index == extracted.table_index
        )
        row = db.execute(lookup).scalar_one_or_none()
        if row is None:
            row = Table(
                document_id=document_id,
                page_id=page_id_by_number.get(extracted.page_number),
                page_number=extracted.page_number,
                table_index=extracted.table_index,
            )
            db.add(row)
        row.markdown = extracted.markdown or ""
        row.csv_text = extracted.csv_text
        row.json_data = extracted.json_data
        row.summary = _summary(extracted)
        db.flush()
        # Keep a copy of the JSON in object storage for large/inspectable tables.
        if extracted.json_data:
            blob_key = key_table_json(document_id, extracted.table_index)
            blob = json.dumps(extracted.json_data, ensure_ascii=False).encode("utf-8")
            storage.put_bytes(
                bucket=storage.derived_bucket,
                key=blob_key,
                data=blob,
                content_type="application/json",
            )
            _ensure_artifact(
                db,
                document_id,
                ArtifactType.TABLE_JSON,
                storage.derived_bucket,
                blob_key,
                extracted.page_number,
            )
        persisted += 1
    return persisted
 def _summary(t: ExtractedTable) -> str:
    md = t.markdown or ""
    n_rows = max(0, sum(1 for ln in md.splitlines() if ln.startswith("|")) - 2)
    return f"Table {t.table_index} on page {t.page_number} ({n_rows} rows)."
 def _ensure_artifact(db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, page: int | None) -> None:
    """Register the stored object as a ``DocumentArtifact`` exactly once.

    A row with the same ``document_id`` and ``storage_key`` short-circuits the
    call; ``artifact_type`` is not part of that check.
    """
    prior = db.execute(
        select(DocumentArtifact).where(
            DocumentArtifact.document_id == document_id,
            DocumentArtifact.storage_key == key,
        )
    ).scalar_one_or_none()
    if not prior:
        record = DocumentArtifact(
            document_id=document_id,
            artifact_type=artifact_type,
            storage_bucket=bucket,
            storage_key=key,
            page_number=page,
        )
        db.add(record)
--- a/app/logging_config.py
+++ b/app/logging_config.py
@@ -0,0 +1,61 @@
 """Structured logging via structlog with stdlib bridge.
 All modules use ``get_logger(__name__)`` and emit key/value pairs.
 """
 from __future__ import annotations
 import logging
 import sys
 from typing import Any
 import structlog
 from app.config import settings
 def configure_logging() -> None:
    """Route structlog and stdlib logging through a single JSON handler on stdout.

    The level comes from ``settings.app_log_level`` and falls back to INFO
    when the name is not an attribute of ``logging``. Existing root handlers
    are removed, and a fixed set of chatty third-party loggers is capped at
    WARNING.
    """
    log_level = getattr(logging, settings.app_log_level.upper(), logging.INFO)
    shared_processors: list[Any] = [
        structlog.contextvars.merge_contextvars,
        structlog.stdlib.add_log_level,
        structlog.stdlib.add_logger_name,
        structlog.processors.TimeStamper(fmt="iso", utc=True),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
    ]
    structlog.configure(
        processors=[*shared_processors, structlog.stdlib.ProcessorFormatter.wrap_for_formatter],
        logger_factory=structlog.stdlib.LoggerFactory(),
        wrapper_class=structlog.stdlib.BoundLogger,
        cache_logger_on_first_use=True,
    )

    # Records from plain stdlib loggers run through the same processors first.
    json_formatter = structlog.stdlib.ProcessorFormatter(
        foreign_pre_chain=shared_processors,
        processors=[
            structlog.stdlib.ProcessorFormatter.remove_processors_meta,
            structlog.processors.JSONRenderer(),
        ],
    )
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(json_formatter)

    root_logger = logging.getLogger()
    root_logger.handlers.clear()
    root_logger.addHandler(stdout_handler)
    root_logger.setLevel(log_level)

    # Quiet down noisy libs
    noisy_loggers = ("urllib3", "botocore", "s3transfer", "elasticsearch", "opensearch", "httpx")
    for logger_name in noisy_loggers:
        logging.getLogger(logger_name).setLevel(logging.WARNING)
 def get_logger(name: str | None = None) -> structlog.stdlib.BoundLogger:
    """Return the structlog logger registered under ``name``."""
    bound_logger = structlog.get_logger(name)
    return bound_logger
--- a/app/main.py
+++ b/app/main.py
@@ -0,0 +1,52 @@
 """FastAPI entrypoint."""
 from __future__ import annotations
 from contextlib import asynccontextmanager
 from typing import AsyncIterator
 from fastapi import FastAPI
 from app import __version__
 from app.api import routes_health, routes_ingestion, routes_search
 from app.config import settings
 from app.logging_config import configure_logging, get_logger
 configure_logging()
 logger = get_logger(__name__)
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
    """Log startup/shutdown and try to create the MinIO buckets on startup.

    A bucket bootstrap failure is logged as a warning and does not stop the
    application from starting.
    """
    logger.info("api.startup", version=__version__, prefix=settings.app_api_prefix)
    try:
        # Imported here rather than at module level, as in the original code.
        from app.storage.minio_client import get_storage

        storage = get_storage()
        storage.ensure_buckets()
    except Exception as exc:  # noqa: BLE001
        logger.warning("api.startup.minio_bootstrap_failed", error=str(exc))
    yield
    logger.info("api.shutdown")
 # ASGI application object; startup/shutdown work is delegated to ``lifespan``.
 app = FastAPI(
    title="LegacyHUB",
    description="Hybrid lexical + semantic search over legacy PDF archives",
    version=__version__,
    lifespan=lifespan,
 )
 # Every router is mounted under the same configurable API prefix.
 app.include_router(routes_health.router, prefix=settings.app_api_prefix)
 app.include_router(routes_ingestion.router, prefix=settings.app_api_prefix)
 app.include_router(routes_search.router, prefix=settings.app_api_prefix)
@app.get("/")
def root() -> dict[str, str]:
    """Return service name, version, API prefix and docs path for the bare root URL."""
    service_info: dict[str, str] = dict(
        service="LegacyHUB",
        version=__version__,
        api=settings.app_api_prefix,
        docs="/docs",
    )
    return service_info
--- a/app/storage/__init__.py
+++ b/app/storage/__init__.py
@@ -0,0 +1,3 @@
 from app.storage.minio_client import MinioStorage, get_storage
 __all__ = ["MinioStorage", "get_storage"]
--- a/app/storage/local_paths.py
+++ b/app/storage/local_paths.py
@@ -0,0 +1,42 @@
 """Storage key conventions for MinIO and local working paths."""
 from __future__ import annotations
 import uuid
 from pathlib import Path
 from app.config import settings
 def work_dir_for(document_id: uuid.UUID | str) -> Path:
    """Return the per-document working directory, creating it when missing.

    The directory is ``settings.app_work_dir`` joined with the string form of
    ``document_id``.
    """
    work_dir = Path(settings.app_work_dir, str(document_id))
    work_dir.mkdir(parents=True, exist_ok=True)
    return work_dir
 def key_original_pdf(document_id: uuid.UUID | str, sha256: str) -> str:
    """Object key of the original PDF, named after its SHA-256."""
    return "docs/{}/original/{}.pdf".format(document_id, sha256)
 def key_ocr_pdf(document_id: uuid.UUID | str) -> str:
    """Object key of the OCR output PDF for the document."""
    return "docs/{}/ocr/ocr.pdf".format(document_id)
 def key_docling_json(document_id: uuid.UUID | str) -> str:
    """Object key of the Docling JSON export for the document."""
    return "docs/{}/docling/document.json".format(document_id)
 def key_markdown(document_id: uuid.UUID | str) -> str:
    """Object key of the Markdown export for the document."""
    return "docs/{}/docling/document.md".format(document_id)
 def key_page_image(document_id: uuid.UUID | str, page_number: int) -> str:
    """Object key of a page image; the page number is zero-padded to five digits."""
    return "docs/{}/pages/p{:05d}.png".format(document_id, page_number)
 def key_figure_crop(document_id: uuid.UUID | str, page_number: int, figure_index: int) -> str:
    """Object key of a figure crop (page padded to five digits, index to three)."""
    return "docs/{}/figures/p{:05d}_f{:03d}.png".format(document_id, page_number, figure_index)
 def key_table_json(document_id: uuid.UUID | str, table_index: int) -> str:
    """Object key of a table's JSON blob; the index is zero-padded to four digits."""
    return "docs/{}/tables/t{:04d}.json".format(document_id, table_index)
--- a/app/storage/minio_client.py
+++ b/app/storage/minio_client.py
@@ -0,0 +1,110 @@
 """Thin wrapper around the MinIO Python SDK with bucket bootstrap and retries."""
 from __future__ import annotations
 import io
 from functools import lru_cache
 from pathlib import Path
 from typing import Any
 from minio import Minio
 from minio.error import S3Error
 from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
 from app.config import settings
 from app.logging_config import get_logger
 logger = get_logger(__name__)
 class MinioStorage:
    """Small facade over a ``Minio`` client bound to the configured buckets.

    Holds the client plus the names of the "originals" and "derived" buckets
    read from ``settings``. Uploads (``put_file`` / ``put_bytes``) are attempted
    up to three times on ``S3Error`` with an exponential wait between attempts.
    """

    def __init__(self, client: Minio | None = None) -> None:
        """Adopt ``client`` when one is passed, otherwise build one from ``settings``."""
        if not client:
            client = Minio(
                endpoint=settings.minio_endpoint,
                access_key=settings.minio_access_key,
                secret_key=settings.minio_secret_key,
                secure=settings.minio_secure,
                region=settings.minio_region,
            )
        self.client = client
        self.originals_bucket = settings.minio_bucket_originals
        self.derived_bucket = settings.minio_bucket_derived

    def ensure_buckets(self) -> None:
        """Create the originals and derived buckets if they do not exist yet."""
        for name in (self.originals_bucket, self.derived_bucket):
            if self.client.bucket_exists(name):
                continue
            logger.info("minio.create_bucket", bucket=name)
            self.client.make_bucket(name)

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=10),
        retry=retry_if_exception_type(S3Error),
        reraise=True,
    )
    def put_file(
        self,
        bucket: str,
        key: str,
        path: Path,
        content_type: str = "application/octet-stream",
        metadata: dict[str, str] | None = None,
    ) -> None:
        """Upload the file at ``path`` to ``bucket``/``key``.

        The whole method is retried, so the file is re-opened on every attempt.
        """
        # The object length is taken from the file's size on disk before opening it.
        byte_count = path.stat().st_size
        extra_metadata = metadata if metadata else {}
        with path.open("rb") as stream:
            self.client.put_object(
                bucket_name=bucket,
                object_name=key,
                data=stream,
                length=byte_count,
                content_type=content_type,
                metadata=extra_metadata,
            )

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=10),
        retry=retry_if_exception_type(S3Error),
        reraise=True,
    )
    def put_bytes(
        self,
        bucket: str,
        key: str,
        data: bytes,
        content_type: str = "application/octet-stream",
        metadata: dict[str, str] | None = None,
    ) -> None:
        """Upload an in-memory ``bytes`` payload to ``bucket``/``key``."""
        extra_metadata = metadata if metadata else {}
        payload = io.BytesIO(data)
        self.client.put_object(
            bucket_name=bucket,
            object_name=key,
            data=payload,
            length=len(data),
            content_type=content_type,
            metadata=extra_metadata,
        )

    def get_to_path(self, bucket: str, key: str, dest: Path) -> Path:
        """Download ``bucket``/``key`` to ``dest`` (creating its parent dirs); return ``dest``."""
        dest.parent.mkdir(parents=True, exist_ok=True)
        self.client.fget_object(bucket, key, str(dest))
        return dest

    def exists(self, bucket: str, key: str) -> bool:
        """Report whether the object exists.

        ``S3Error`` codes ``NoSuchKey`` / ``NoSuchObject`` mean "absent"; any
        other ``S3Error`` is re-raised.
        """
        try:
            self.client.stat_object(bucket, key)
        except S3Error as exc:
            if exc.code not in {"NoSuchKey", "NoSuchObject"}:
                raise
            return False
        return True

    def health(self) -> dict[str, Any]:
        """Return ``status`` plus the bucket names, or the error text if listing fails."""
        try:
            names = [b.name for b in self.client.list_buckets()]
        except Exception as exc:
            return {"status": "error", "error": str(exc)}
        return {"status": "ok", "buckets": names}
@lru_cache(maxsize=1)
 def get_storage() -> MinioStorage:
    return MinioStorage()
--- a/app/utils/init.py
+++ b/app/utils/init.py
--- a/app/utils/hashing.py
+++ b/app/utils/hashing.py
@@ -0,0 +1,21 @@
 """Streaming SHA256 hashing utilities for large files."""
 from __future__ import annotations
 import hashlib
 from pathlib import Path
 _CHUNK = 1024 * 1024  # 1 MiB

 def sha256_file(path: Path | str) -> str:
    """Return the hex SHA256 digest of a file, reading it in ``_CHUNK``-sized pieces."""
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            piece = stream.read(_CHUNK)
            if not piece:
                # An empty read marks end of file.
                break
            digest.update(piece)
    return digest.hexdigest()
 def sha256_bytes(data: bytes) -> str:
    """Return the hex SHA256 digest of an in-memory byte string."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()
--- a/app/utils/language.py
+++ b/app/utils/language.py
@@ -0,0 +1,24 @@
 """Language detection helper - tolerant to short / mixed text."""
 from __future__ import annotations
 from langdetect import DetectorFactory, LangDetectException, detect_langs
 # Pin langdetect's seed so repeated detection of the same text gives the same result.
 DetectorFactory.seed = 42
 def detect_language(text: str, min_chars: int = 40) -> str | None:
    """Return the top-ranked language code for ``text``, or ``None``.

    ``None`` is returned when the text is empty, when its stripped length is
    below ``min_chars``, when langdetect raises ``LangDetectException``, or when
    it returns no candidates.
    """
    long_enough = bool(text) and len(text.strip()) >= min_chars
    if not long_enough:
        return None
    try:
        candidates = detect_langs(text)
    except LangDetectException:
        return None
    return candidates[0].lang if candidates else None
 def has_cyrillic(text: str) -> bool:
    """Return True if any character of ``text`` lies between "Ѐ" and "ӿ" inclusive."""
    for ch in text:
        if "Ѐ" <= ch <= "ӿ":
            return True
    return False
--- a/app/utils/pdf.py
+++ b/app/utils/pdf.py
@@ -0,0 +1,36 @@
 """PDF inspection helpers - decide whether OCR is required."""
 from __future__ import annotations
 from pathlib import Path
 import pikepdf
 from pdfminer.high_level import extract_text
 def page_count(path: Path | str) -> int:
    """Return the number of pages pikepdf reports for the PDF at ``path``."""
    with pikepdf.open(str(path)) as document:
        total_pages = len(document.pages)
    return total_pages
 def has_searchable_text(path: Path | str, sample_pages: int = 3, min_chars: int = 80) -> bool:
    """Cheap probe for an existing text layer.

    Extracts text from at most ``sample_pages`` pages and reports whether the
    stripped result has at least ``min_chars`` characters. Any extraction error
    yields False - safer to OCR than to skip.
    """
    try:
        sample = extract_text(str(path), maxpages=sample_pages)
    except Exception:
        return False
    stripped = (sample or "").strip()
    return len(stripped) >= min_chars
 def is_pdf(path: Path | str) -> bool:
    """Return True for an existing ``.pdf`` file that starts with the ``%PDF-`` magic bytes.

    The suffix check is case-insensitive; an unreadable file counts as not-a-PDF.
    """
    candidate = Path(path)
    if not candidate.is_file():
        return False
    if candidate.suffix.lower() != ".pdf":
        return False
    try:
        with candidate.open("rb") as stream:
            header = stream.read(5)
    except OSError:
        return False
    return header == b"%PDF-"
--- a/app/utils/text_cleaning.py
+++ b/app/utils/text_cleaning.py
@@ -0,0 +1,69 @@
 """Conservative OCR text cleaning.
 Goals:
 - Drop hyphenation across line breaks (``инвен-\\nтарный`` -> ``инвентарный``).
 - Collapse runs of whitespace.
 - Strip control chars.
 - Preserve all non-letter characters that may carry meaning in legacy/technical
  documents: digits, punctuation, slashes, dashes, dots, parentheses, etc.
 We do NOT lowercase, transliterate, or strip punctuation here. ``normalize_for_search``
 produces a more aggressive form for indexing, but the original ``text`` is always
 kept untouched for citation/display.
 """
 from __future__ import annotations
 import re
 import unicodedata
 _CONTROL_CHARS = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")
 # U+00AD (soft hyphen) and U+00A0 (no-break space) are written as escapes:
 # the literal characters are invisible and are easily lost by editors and tools,
 # which would silently turn these into an empty string / a duplicated space.
 _SOFT_HYPHEN = "\u00ad"
 _MULTI_WS = re.compile(r"[ \t\u00a0]+")
 _MULTI_NL = re.compile(r"\n{3,}")
 _HYPHEN_LINEBREAK = re.compile(r"(\w)[-‐‑‒–]\n(\w)")
 _TRAILING_WS = re.compile(r"[ \t]+\n")

 def clean_ocr_text(text: str) -> str:
    """Conservatively clean OCR output without touching meaningful characters.

    Steps, in order: NFC-normalize, drop soft hyphens, drop control characters
    (tab, newline and carriage return are kept), join words hyphenated across a
    line break, strip spaces/tabs before newlines, collapse runs of spaces, tabs
    and no-break spaces to one space, and cap blank-line runs at one blank line.

    Returns the cleaned text with leading/trailing whitespace stripped, or ``""``
    for empty input.
    """
    if not text:
        return ""
    # Normalize unicode (NFC) to merge combining marks.
    text = unicodedata.normalize("NFC", text)
    text = text.replace(_SOFT_HYPHEN, "")
    text = _CONTROL_CHARS.sub("", text)
    text = _HYPHEN_LINEBREAK.sub(r"\1\2", text)
    text = _TRAILING_WS.sub("\n", text)
    text = _MULTI_WS.sub(" ", text)
    text = _MULTI_NL.sub("\n\n", text)
    return text.strip()
 _PUNCT_RUN = re.compile(r"[^\w\s/\-.,№#:()\[\]]+", flags=re.UNICODE)
 _WS_RUN = re.compile(r"\s+")

 def normalize_for_search(text: str) -> str:
    """Produce the lowercase, lightly normalized form used for full-text indexing.

    The text is cleaned with ``clean_ocr_text`` and lowercased; then every run of
    characters outside the kept set (word characters, whitespace and
    ``/ - . , № # : ( ) [ ]``) becomes a single space, and whitespace runs are
    collapsed. Empty input returns ``""``.
    """
    if not text:
        return ""
    lowered = clean_ocr_text(text).lower()
    without_punct = _PUNCT_RUN.sub(" ", lowered)
    collapsed = _WS_RUN.sub(" ", without_punct)
    return collapsed.strip()
 def looks_garbled(text: str, threshold: float = 0.35) -> bool:
    """Heuristic garbage detector based on the share of unexpected symbols.

    A character counts as noise when it is not alphanumeric, not whitespace and
    not one of ``.,;:!?-/()[]№#``. Returns True when the noise share of the text
    exceeds ``threshold``; texts shorter than 20 characters are never flagged.
    """
    if not text or len(text) < 20:
        return False
    allowed_punct = ".,;:!?-/()[]№#"
    noise = 0
    for ch in text:
        if ch.isalnum() or ch.isspace() or ch in allowed_punct:
            continue
        noise += 1
    return (noise / len(text)) > threshold
--- a/app/workers/init.py
+++ b/app/workers/init.py
--- a/app/workers/celery_app.py
+++ b/app/workers/celery_app.py
@@ -0,0 +1,28 @@
 """Celery application instance."""
 from __future__ import annotations
 from celery import Celery
 from app.config import settings
 from app.logging_config import configure_logging
 # Configure application logging at import time, before the Celery app is created.
 configure_logging()
 # The same Redis URL serves as both the message broker and the result backend.
 celery_app = Celery(
    "legacyhub",
    broker=settings.redis_url,
    backend=settings.redis_url,
    include=["app.workers.tasks"],
 )
 celery_app.conf.update(
    # Acknowledge a task only after it finishes, and reject it if the worker is lost.
    task_acks_late=True,
    task_reject_on_worker_lost=True,
    task_track_started=True,
    # Each worker process reserves one task at a time.
    worker_prefetch_multiplier=1,
    # Hard limit is 4x the per-document timeout; the soft limit (3x) fires first.
    task_time_limit=settings.max_document_timeout_seconds * 4,
    task_soft_time_limit=settings.max_document_timeout_seconds * 3,
    timezone="UTC",
    enable_utc=True,
 )
--- a/app/workers/tasks.py
+++ b/app/workers/tasks.py
@@ -0,0 +1,22 @@
 """Celery tasks - thin wrappers over pipeline functions."""
 from __future__ import annotations
 import uuid
 from celery.utils.log import get_task_logger
 from app.workers.celery_app import celery_app
 logger = get_task_logger(__name__)
@celery_app.task(name="legacyhub.process_document", bind=True, max_retries=2, default_retry_delay=30)
 def process_document(self, document_id: str, run_id: str | None = None) -> dict:
    from app.ingestion.pipeline import process_document_id
    try:
        return process_document_id(uuid.UUID(document_id), uuid.UUID(run_id) if run_id else None)
    except Exception as exc:  # noqa: BLE001
        logger.exception("worker.process_failed", extra={"document_id": document_id})
        raise self.retry(exc=exc) from exc
--- a/data/input/.gitkeep
+++ b/data/input/.gitkeep
--- a/data/work/.gitkeep
+++ b/data/work/.gitkeep
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,186 @@
 name: legacyhub
 # Shared environment for the api and worker services (merged in through
 # `<<: *common-env`). Each `${VAR:-default}` reads VAR from the Compose
 # environment and falls back to the value after `:-` when VAR is unset or empty.
 x-common-env: &common-env
  POSTGRES_HOST: ${POSTGRES_HOST:-postgres}
  POSTGRES_PORT: ${POSTGRES_PORT:-5432}
  POSTGRES_DB: ${POSTGRES_DB:-legacyhub}
  POSTGRES_USER: ${POSTGRES_USER:-legacyhub}
  POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-legacyhub}
  MINIO_ENDPOINT: ${MINIO_ENDPOINT:-minio:9000}
  MINIO_ACCESS_KEY: ${MINIO_ACCESS_KEY:-legacyhub}
  MINIO_SECRET_KEY: ${MINIO_SECRET_KEY:-legacyhub-secret}
  MINIO_BUCKET_ORIGINALS: ${MINIO_BUCKET_ORIGINALS:-legacyhub-originals}
  MINIO_BUCKET_DERIVED: ${MINIO_BUCKET_DERIVED:-legacyhub-derived}
  MINIO_SECURE: ${MINIO_SECURE:-false}
  OPENSEARCH_HOST: ${OPENSEARCH_HOST:-opensearch}
  OPENSEARCH_PORT: ${OPENSEARCH_PORT:-9200}
  OPENSEARCH_USE_SSL: ${OPENSEARCH_USE_SSL:-false}
  OPENSEARCH_VERIFY_CERTS: ${OPENSEARCH_VERIFY_CERTS:-false}
  OPENSEARCH_INDEX_CHUNKS: ${OPENSEARCH_INDEX_CHUNKS:-legacy_chunks}
  QDRANT_HOST: ${QDRANT_HOST:-qdrant}
  QDRANT_PORT: ${QDRANT_PORT:-6333}
  QDRANT_COLLECTION_CHUNKS: ${QDRANT_COLLECTION_CHUNKS:-legacy_chunks}
  REDIS_URL: ${REDIS_URL:-redis://redis:6379/0}
  OCR_LANGUAGES: ${OCR_LANGUAGES:-rus+eng}
  OCR_ENABLED: ${OCR_ENABLED:-true}
  DOCLING_OCR_ENABLED: ${DOCLING_OCR_ENABLED:-false}
  MAX_DOCUMENT_TIMEOUT_SECONDS: ${MAX_DOCUMENT_TIMEOUT_SECONDS:-180}
  EMBEDDING_MODEL: ${EMBEDDING_MODEL:-BAAI/bge-m3}
  EMBEDDING_DEVICE: ${EMBEDDING_DEVICE:-cpu}
  RERANKER_MODEL: ${RERANKER_MODEL:-BAAI/bge-reranker-v2-m3}
  RERANKER_DEVICE: ${RERANKER_DEVICE:-cpu}
  APP_LOG_LEVEL: ${APP_LOG_LEVEL:-INFO}
  # Fixed container paths; they match the bind mounts of the api and worker services.
  APP_INPUT_DIR: /data/input
  APP_WORK_DIR: /data/work
 services:
  # PostgreSQL 16 (Alpine image). Data is kept in the postgres_data volume;
  # the service is healthy once pg_isready succeeds for the configured user/db.
  postgres:
    image: postgres:16-alpine
    restart: unless-stopped
    environment:
      POSTGRES_DB: ${POSTGRES_DB:-legacyhub}
      POSTGRES_USER: ${POSTGRES_USER:-legacyhub}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-legacyhub}
    ports:
      - "5432:5432"
    volumes:
      - postgres_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-legacyhub} -d ${POSTGRES_DB:-legacyhub}"]
      interval: 10s
      timeout: 5s
      retries: 10
  # MinIO object storage (pinned release). Port 9000 is the server port used by
  # MINIO_ENDPOINT; the console listens on 9001. The root credentials reuse
  # MINIO_ACCESS_KEY / MINIO_SECRET_KEY, so the application connects as root.
  minio:
    image: minio/minio:RELEASE.2024-08-29T01-40-52Z
    restart: unless-stopped
    command: server /data --console-address ":9001"
    environment:
      MINIO_ROOT_USER: ${MINIO_ACCESS_KEY:-legacyhub}
      MINIO_ROOT_PASSWORD: ${MINIO_SECRET_KEY:-legacyhub-secret}
    ports:
      - "9000:9000"
      - "9001:9001"
    volumes:
      - minio_data:/data
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 10s
      timeout: 5s
      retries: 10
  # Single-node OpenSearch 2.15 with the security plugin disabled and a 1 GiB
  # JVM heap. Healthy once the cluster health endpoint reports green or yellow.
  opensearch:
    image: opensearchproject/opensearch:2.15.0
    restart: unless-stopped
    environment:
      - discovery.type=single-node
      - bootstrap.memory_lock=true
      - "OPENSEARCH_JAVA_OPTS=-Xms1g -Xmx1g"
      - DISABLE_SECURITY_PLUGIN=true
      - DISABLE_INSTALL_DEMO_CONFIG=true
    ulimits:
      # Unlimited memlock goes together with bootstrap.memory_lock=true above.
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 65536
        hard: 65536
    ports:
      - "9200:9200"
      - "9600:9600"
    volumes:
      - opensearch_data:/usr/share/opensearch/data
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:9200/_cluster/health | grep -q '\"status\":\"\\(green\\|yellow\\)\"'"]
      interval: 15s
      timeout: 10s
      retries: 20
  # Qdrant vector store (pinned v1.11.3); storage lives in the qdrant_data volume.
  # The healthcheck only verifies that TCP port 6333 accepts connections.
  qdrant:
    image: qdrant/qdrant:v1.11.3
    restart: unless-stopped
    ports:
      - "6333:6333"
      - "6334:6334"
    volumes:
      - qdrant_data:/qdrant/storage
    healthcheck:
      test: ["CMD-SHELL", "bash -c '</dev/tcp/127.0.0.1/6333'"]
      interval: 15s
      timeout: 5s
      retries: 10
  # Redis 7 (Alpine image); the default REDIS_URL in the shared environment
  # points at this service. Healthy once `redis-cli ping` succeeds.
  redis:
    image: redis:7-alpine
    restart: unless-stopped
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 10
  # HTTP API served by uvicorn on port 8000. Built from docker/Dockerfile and
  # started only after every backing service reports healthy.
  api:
    build:
      context: .
      dockerfile: docker/Dockerfile
    image: legacyhub/api:latest
    restart: unless-stopped
    environment:
      <<: *common-env
      APP_HOST: 0.0.0.0
      APP_PORT: 8000
    command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
    ports:
      - "8000:8000"
    depends_on:
      postgres:
        condition: service_healthy
      minio:
        condition: service_healthy
      opensearch:
        condition: service_healthy
      qdrant:
        condition: service_healthy
      redis:
        condition: service_healthy
    volumes:
      # Host data directories plus a named volume for the Hugging Face cache.
      - ./data/input:/data/input
      - ./data/work:/data/work
      - hf_cache:/root/.cache/huggingface
  # Celery worker using the same image, environment and mounts as the api
  # service; runs with a concurrency of 2 and publishes no ports.
  worker:
    build:
      context: .
      dockerfile: docker/Dockerfile
    image: legacyhub/api:latest
    restart: unless-stopped
    environment:
      <<: *common-env
    command: ["celery", "-A", "app.workers.celery_app", "worker", "--loglevel=INFO", "--concurrency=2"]
    depends_on:
      postgres:
        condition: service_healthy
      minio:
        condition: service_healthy
      opensearch:
        condition: service_healthy
      qdrant:
        condition: service_healthy
      redis:
        condition: service_healthy
    volumes:
      - ./data/input:/data/input
      - ./data/work:/data/work
      - hf_cache:/root/.cache/huggingface
 # Named volumes: one per data service, plus hf_cache, which the api and worker
 # services mount at /root/.cache/huggingface.
 volumes:
  postgres_data:
  minio_data:
  opensearch_data:
  qdrant_data:
  redis_data:
  hf_cache:
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -0,0 +1,49 @@
 # Single image used by both the API and the Celery worker (see docker-compose.yml).
 FROM python:3.11-slim-bookworm
 # Unbuffered Python output, no .pyc files, no pip cache or version check,
 # and non-interactive apt.
 ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    DEBIAN_FRONTEND=noninteractive
 # System deps for OCRmyPDF + Tesseract (rus+eng) + Ghostscript + qpdf + image libs
 RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        ca-certificates \
        ghostscript \
        qpdf \
        unpaper \
        pngquant \
        jbig2dec \
        libxml2-dev \
        libxslt1-dev \
        libffi-dev \
        libjpeg-dev \
        libopenjp2-7 \
        libtiff5-dev \
        zlib1g-dev \
        poppler-utils \
        libmagic1 \
        tesseract-ocr \
        tesseract-ocr-eng \
        tesseract-ocr-rus \
        tesseract-ocr-osd \
    && rm -rf /var/lib/apt/lists/*
 WORKDIR /app
 # Python dependencies are installed from pyproject.toml before the source tree
 # is copied (presumably so this layer stays cached when only code changes).
 COPY pyproject.toml /app/pyproject.toml
 RUN pip install --upgrade pip wheel setuptools && \
    pip install -e .
 # Application code, helper scripts and the Alembic configuration.
 COPY app /app/app
 COPY scripts /app/scripts
 COPY alembic.ini /app/alembic.ini
 # Directories matching APP_INPUT_DIR / APP_WORK_DIR in docker-compose.yml.
 RUN mkdir -p /data/input /data/work
 EXPOSE 8000
 # Default command serves the API; the compose worker service overrides it.
 CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
--- a/frontend/.env.example
+++ b/frontend/.env.example
@@ -0,0 +1,4 @@
 # Frontend environment
 VITE_API_BASE_URL=/api/v1
 VITE_USE_MOCK=true
 VITE_APP_NAME=LegacyHUB
--- a/frontend/.gitignore
+++ b/frontend/.gitignore
@@ -0,0 +1,7 @@
 node_modules
 dist
 .env
 .env.local
 .vite
 .DS_Store
 *.log
--- a/frontend/README.md
+++ b/frontend/README.md
@@ -0,0 +1,140 @@
 # LegacyHUB · Frontend
 React + TypeScript + Vite frontend for **LegacyHUB**, the legacy-document
 indexing and AI search module of the **TeamHUB Suite**.
 This package ships:
 - the application shell (collapsible sidebar, top toolbar, breadcrumb nav,
  global ⌘K command palette, light/dark theme, notification center,
  user/profile menu);
 - nine pages: Dashboard, Documents, Ingestion Jobs, Search, Document Viewer,
  Tables & Figures, Quality Control, System Health, Settings;
 - a hybrid AI search workspace with semantic / lexical / hybrid modes, live
  suggestions, expandable filters, highlighted matches, reranker score
  visualization and side-by-side chunk preview;
 - typed service layer (`src/services/*`) with Axios + TanStack Query and a
  mock data backend you can toggle off when the backend is reachable.
 ## Stack
 | Concern        | Library                                |
 |----------------|-----------------------------------------|
 | Bundler        | Vite 5                                  |
 | Language       | TypeScript 5.6                          |
 | UI             | React 18                                |
 | Styling        | TailwindCSS 3 + custom design tokens    |
 | Components     | shadcn/ui primitives (Radix + cva)      |
 | Animation      | Framer Motion                           |
 | Charts         | Recharts                                |
 | Server state   | TanStack Query                          |
 | Client state   | Zustand                                 |
 | Routing        | React Router v6                         |
 | HTTP           | Axios                                   |
 | Icons          | lucide-react                            |
 | Toasts         | sonner                                  |
 | Virtualization | @tanstack/react-virtual                 |
 ## Quick start
 ```bash
 cd frontend
 cp .env.example .env       # VITE_USE_MOCK=true for offline UI development
 npm install
 npm run dev                # http://localhost:5173
 ```
 When the FastAPI backend is running, set `VITE_USE_MOCK=false` and keep
 `VITE_API_BASE_URL=/api/v1`, so the Vite dev server proxies `/api` requests to
 the backend on port 8000. All API calls are isolated through `src/services/*.ts`.
 ## Architecture
 ```
 frontend/src/
  app/         RouterProvider, QueryClient, TooltipProvider, theme bootstrap
  pages/       One file per route — composed of widgets + primitives
  layouts/     AppShell, Sidebar (collapsible), Topbar, Breadcrumbs, ⌘K palette
  widgets/     Domain-specific composite components (KpiCard, Charts, Result cards,
               PdfPreviewPane, ChunkPreview, ServiceHealthCard, Timeline)
  components/
    ui/        shadcn-style primitives — Button, Card, Tabs, Dialog, Select,
               Tooltip, Popover, ScrollArea, Command, Skeleton, Progress, …
    common/    Domain primitives — Logo, StatusChip, ConfidenceMeter,
               QualityFlag, BlockTypeIcon, Highlight, EmptyState, PageHeader,
               ThemeToggle
  services/    Typed API layer (Axios) + TanStack hooks (one file per resource)
    mock/      Deterministic mock data + simulated latency
  hooks/       Wrappers around services exposing TanStack Query hooks
  stores/      Zustand stores: uiStore (theme, sidebar, palette), searchStore
  styles/      Tailwind layer + design tokens (HSL CSS variables)
  lib/         cn(), formatBytes/Number/Percent/Duration, relativeTime, etc.
 ```
 ### Design system
 - **Palette** — white / light-gray surfaces with a single restrained green
  accent (`--primary: 158 64% 32%`) matching QMS Hub.
 - **Surfaces** — three tiers: sunken (page background), default card, raised
  (popovers / dialogs). Glass surfaces via `backdrop-blur` for the topbar.
 - **Corners** — `--radius: 14px` produces soft, premium edges across every
  component.
 - **Shadows** — `shadow-soft` and `shadow-elevated` only. No harsh drop
  shadows.
 - **Typography** — Inter variable, optical sizes, tabular numbers for data
  cells, JetBrains Mono for IDs / paths / hashes.
 - **Motion** — Framer Motion `layoutId` for the active sidebar pill,
  `fade-in-up` for KPI cards, animated tabs and result expansion.
 - **States** — skeleton shimmer instead of spinners wherever possible.
 ### Key flows
 - **Hybrid search (`/search`)** — Debounced query → TanStack hook hits the
  backend (or mock). Results are virtualized, scored, optionally reranked.
  Picking a result hydrates a side-by-side ChunkPreview with the highlighted
  excerpt, a page thumbnail, citation metadata, and quality flags.
 - **Documents (`/documents`)** — Virtualized table (TanStack Virtual)
  supports thousands of rows. Filters: status, OCR threshold, "needs review",
  free-text search. Clicking a row opens the viewer.
 - **Document Viewer (`/viewer/:id`)** — Split layout. Left pane: PDF page
  thumbnails + synchronized large page preview with highlighted OCR blocks.
  Right pane: extracted chunks / tables / figures / metadata, kept in lock-step
  with the active page. Below: full pipeline timeline.
 - **Ingestion (`/ingestion`)** — Submit a folder path with `recursive` /
  `force` toggles → optimistic queue, run history table with live progress
  bars.
 - **Quality control (`/quality`)** — Three review queues (low confidence,
  handwriting, failed extraction) with reviewer actions and an audit log.
 ### Mock vs real backend
 `src/services/apiClient.ts` exports a constant `USE_MOCK`. When `true`, every
 service module short-circuits to `src/services/mock/mockData.ts` which
 generates deterministic, seeded data: 280 documents, dashboards, ingestion
 runs, search results, health and queue snapshots, and per-document detail
 (pages, chunks, tables, figures, timeline events).
 This lets the frontend be developed and demoed without the Python services
 running.
 ### Accessibility
 - All interactive elements use `ring-focus` (visible 2px primary ring).
 - Sidebar nav exposes tooltips when collapsed.
 - Keyboard: `Ctrl/Cmd + K` opens the global command palette.
 ### Responsive layout
 - ≥ 1280 px (xl, ultrawide) — three-column dashboards, side-by-side search.
 - 1024–1279 px (laptop) — two-column dashboards, stacked search.
 - < 1024 px — single column; sidebar collapses to icons only.
 ## Scripts
 ```bash
 npm run dev        # Vite dev server with /api proxy → :8000
 npm run build      # type-check + production bundle
 npm run preview    # preview build
 npm run lint
 npm run format
 ```
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -0,0 +1,19 @@
 <!doctype html>
 <html lang="en">
  <head>
    <meta charset="UTF-8" />
    <link rel="icon" type="image/svg+xml" href="/favicon.svg" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <meta name="theme-color" content="#059669" />
    <title>LegacyHUB · TeamHUB Suite</title>
    <link
      rel="stylesheet"
      href="https://rsms.me/inter/inter.css"
      crossorigin="anonymous"
    />
  </head>
  <body class="bg-background text-foreground antialiased">
    <div id="root"></div>
    <script type="module" src="/src/main.tsx"></script>
  </body>
 </html>
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -0,0 +1,56 @@
 {
  "name": "legacyhub-frontend",
  "private": true,
  "version": "0.1.0",
  "type": "module",
  "scripts": {
    "dev": "vite",
    "build": "tsc -b && vite build",
    "preview": "vite preview --port 4173",
    "lint": "eslint . --ext .ts,.tsx",
    "format": "prettier --write \"src/**/*.{ts,tsx,css}\""
  },
  "dependencies": {
    "@radix-ui/react-dialog": "^1.1.2",
    "@radix-ui/react-dropdown-menu": "^2.1.2",
    "@radix-ui/react-popover": "^1.1.2",
    "@radix-ui/react-progress": "^1.1.0",
    "@radix-ui/react-scroll-area": "^1.2.0",
    "@radix-ui/react-select": "^2.1.2",
    "@radix-ui/react-separator": "^1.1.0",
    "@radix-ui/react-slot": "^1.1.0",
    "@radix-ui/react-switch": "^1.1.1",
    "@radix-ui/react-tabs": "^1.1.1",
    "@radix-ui/react-tooltip": "^1.1.3",
    "@tanstack/react-query": "^5.51.0",
    "@tanstack/react-virtual": "^3.10.6",
    "axios": "^1.7.7",
    "class-variance-authority": "^0.7.0",
    "clsx": "^2.1.1",
    "cmdk": "^1.0.0",
    "date-fns": "^3.6.0",
    "framer-motion": "^11.5.4",
    "lucide-react": "^0.451.0",
    "react": "^18.3.1",
    "react-dom": "^18.3.1",
    "react-router-dom": "^6.26.2",
    "recharts": "^2.13.0",
    "sonner": "^1.5.0",
    "tailwind-merge": "^2.5.2",
    "tailwindcss-animate": "^1.0.7",
    "zustand": "^4.5.5"
  },
  "devDependencies": {
    "@types/node": "^22.7.4",
    "@types/react": "^18.3.11",
    "@types/react-dom": "^18.3.0",
    "@vitejs/plugin-react": "^4.3.2",
    "autoprefixer": "^10.4.20",
    "eslint": "^9.11.1",
    "postcss": "^8.4.47",
    "prettier": "^3.3.3",
    "tailwindcss": "^3.4.13",
    "typescript": "^5.6.2",
    "vite": "^5.4.8"
  }
 }
--- a/frontend/postcss.config.js
+++ b/frontend/postcss.config.js
@@ -0,0 +1,6 @@
 // PostCSS pipeline: Tailwind CSS first, then Autoprefixer, both with default options.
 const config = {
  plugins: { tailwindcss: {}, autoprefixer: {} },
 };

 export default config;
--- a/frontend/public/favicon.svg
+++ b/frontend/public/favicon.svg
@@ -0,0 +1,10 @@
 <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 32 32">
  <defs>
    <linearGradient id="g" x1="0" x2="32" y1="0" y2="32" gradientUnits="userSpaceOnUse">
      <stop offset="0" stop-color="#10b981"/>
      <stop offset="1" stop-color="#047857"/>
    </linearGradient>
  </defs>
  <rect width="32" height="32" rx="8" fill="url(#g)"/>
  <path d="M9 9.5h6.2c2.9 0 4.6 1.5 4.6 4 0 2-1.1 3.3-3 3.8l3.6 5.2h-3l-3.3-5h-2.5v5H9V9.5zm5.9 5.7c1.5 0 2.4-.7 2.4-2 0-1.2-.9-1.9-2.4-1.9h-3.2v3.9h3.2z" fill="#fff"/>
 </svg>
--- a/frontend/src/app/App.tsx
+++ b/frontend/src/app/App.tsx
@@ -0,0 +1,11 @@
 import { RouterProvider } from "react-router-dom";
 import { AppProviders } from "@/app/providers";
 import { router } from "@/app/router";
 /** Root component: mounts the router inside the application-wide providers. */
 export function App() {
  const routes = <RouterProvider router={router} />;
  return <AppProviders>{routes}</AppProviders>;
 }
--- a/frontend/src/app/providers.tsx
+++ b/frontend/src/app/providers.tsx
@@ -0,0 +1,34 @@
 import type { ReactNode } from "react";
 import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
 import { TooltipProvider } from "@/components/ui/tooltip";
 import { Toaster } from "sonner";
 /**
  * Module-level TanStack Query client passed to the provider below.
  * Query defaults: one retry, no refetch on window focus, 30 s stale time.
  */
 const queryClient = new QueryClient({
  defaultOptions: {
    queries: {
      retry: 1,
      refetchOnWindowFocus: false,
      staleTime: 30_000,
    },
  },
 });
 /**
  * Wraps the app in the query-client and tooltip providers and mounts the
  * toast container (bottom-right) next to the children.
  */
 export function AppProviders({ children }: { children: ReactNode }) {
  const toastOptions = {
    classNames: {
      toast:
        "rounded-xl border border-border/70 bg-card text-foreground shadow-elevated",
    },
  };

  return (
    <QueryClientProvider client={queryClient}>
      <TooltipProvider delayDuration={150}>
        {children}
        <Toaster position="bottom-right" richColors toastOptions={toastOptions} />
      </TooltipProvider>
    </QueryClientProvider>
  );
 }
--- a/frontend/src/app/router.tsx
+++ b/frontend/src/app/router.tsx
@@ -0,0 +1,31 @@
 import { createBrowserRouter, Navigate } from "react-router-dom";
 import { AppShell } from "@/layouts/AppShell";
 import { DashboardPage } from "@/pages/DashboardPage";
 import { DocumentsPage } from "@/pages/DocumentsPage";
 import { IngestionJobsPage } from "@/pages/IngestionJobsPage";
 import { SearchPage } from "@/pages/SearchPage";
 import { DocumentViewerPage } from "@/pages/DocumentViewerPage";
 import { TablesFiguresPage } from "@/pages/TablesFiguresPage";
 import { QualityControlPage } from "@/pages/QualityControlPage";
 import { SystemHealthPage } from "@/pages/SystemHealthPage";
 import { SettingsPage } from "@/pages/SettingsPage";
 /**
  * Browser router for the application. Every page renders inside `AppShell`;
  * the viewer is reachable both with and without a document id, and any
  * unknown path redirects to the dashboard at "/".
  */
 export const router = createBrowserRouter([
  {
    element: <AppShell />,
    children: [
      { path: "/", element: <DashboardPage /> },
      { path: "/documents", element: <DocumentsPage /> },
      { path: "/ingestion", element: <IngestionJobsPage /> },
      { path: "/search", element: <SearchPage /> },
      { path: "/viewer", element: <DocumentViewerPage /> },
      { path: "/viewer/:id", element: <DocumentViewerPage /> },
      { path: "/tables-figures", element: <TablesFiguresPage /> },
      { path: "/quality", element: <QualityControlPage /> },
      { path: "/health", element: <SystemHealthPage /> },
      { path: "/settings", element: <SettingsPage /> },
      // Catch-all: replace the history entry instead of pushing a new one.
      { path: "*", element: <Navigate to="/" replace /> },
    ],
  },
 ]);
--- a/frontend/src/components/common/BlockTypeIcon.tsx
+++ b/frontend/src/components/common/BlockTypeIcon.tsx
@@ -0,0 +1,44 @@
 import {
  AlignLeft,
  Heading,
  List,
  Table as TableIcon,
  Image as ImageIcon,
  PenLine,
  Hash,
  HelpCircle,
 } from "lucide-react";
 import { cn } from "@/lib/utils";
 type BlockTypeStyle = { icon: typeof AlignLeft; tone: string };

 /** Styling used for block types that have no dedicated entry. */
 const UNKNOWN_STYLE: BlockTypeStyle = {
  icon: HelpCircle,
  tone: "text-muted-foreground",
 };

 /** Icon and text-colour class for each known block type. */
 const MAP: Record<string, BlockTypeStyle> = {
  title: { icon: Hash, tone: "text-primary" },
  heading: { icon: Heading, tone: "text-primary" },
  paragraph: { icon: AlignLeft, tone: "text-muted-foreground" },
  list: { icon: List, tone: "text-muted-foreground" },
  table: { icon: TableIcon, tone: "text-warning" },
  figure_caption: { icon: ImageIcon, tone: "text-primary-600" },
  figure_description: { icon: ImageIcon, tone: "text-primary-600" },
  handwriting: { icon: PenLine, tone: "text-destructive" },
  unknown: UNKNOWN_STYLE,
 };

 /**
  * Renders the small icon associated with a block type.
  *
  * @param type - Block type key; anything without an entry in `MAP` falls back
  *   to the "unknown" styling.
  * @param className - Extra classes merged after the size and tone classes.
  */
 export function BlockTypeIcon({
  type,
  className,
 }: {
  type: string;
  className?: string;
 }) {
  // Own-property check: a bare `MAP[type]` also resolves keys inherited from
  // Object.prototype ("constructor", "toString", ...), which are not styles and
  // would leave `icon` undefined at render time.
  const known = Object.prototype.hasOwnProperty.call(MAP, type)
    ? MAP[type]
    : undefined;
  const { icon: Icon, tone } = known ?? UNKNOWN_STYLE;
  return <Icon className={cn("h-3.5 w-3.5", tone, className)} aria-hidden />;
 }
 /** Pill showing the block-type icon followed by the type name, underscores shown as spaces. */
 export function BlockTypeLabel({ type }: { type: string }) {
  const label = type.replace(/_/g, " ");
  return (
    <span className="inline-flex items-center gap-1 rounded-md border border-border/70 bg-muted/30 px-1.5 py-0.5 text-[10px] font-medium uppercase tracking-wide text-muted-foreground">
      <BlockTypeIcon type={type} />
      {label}
    </span>
  );
 }
--- a/frontend/src/components/common/ConfidenceMeter.tsx
+++ b/frontend/src/components/common/ConfidenceMeter.tsx
@@ -0,0 +1,38 @@
 import { cn } from "@/lib/utils";
 /**
  * Compact confidence bar with an optional percentage label.
  *
  * `value` is multiplied by 100 and rounded. A missing value renders a full-width
  * neutral bar and an em dash instead of a percentage.
  */
 export function ConfidenceMeter({
  value,
  showLabel = true,
  className,
 }: {
  value: number | null | undefined;
  showLabel?: boolean;
  className?: string;
 }) {
  const pct = value == null ? null : Math.round(value * 100);

  // Tone thresholds: >= 85 success, >= 65 primary, >= 45 warning, else destructive.
  let tone: string;
  if (pct == null) {
    tone = "bg-muted-foreground/30";
  } else if (pct >= 85) {
    tone = "bg-success";
  } else if (pct >= 65) {
    tone = "bg-primary";
  } else if (pct >= 45) {
    tone = "bg-warning";
  } else {
    tone = "bg-destructive";
  }

  const barWidth = pct == null ? "100%" : `${pct}%`;
  const labelText = pct == null ? "—" : `${pct}%`;

  return (
    <div className={cn("flex items-center gap-2", className)}>
      <div className="h-1.5 w-16 overflow-hidden rounded-full bg-muted">
        <div className={cn("h-full transition-all", tone)} style={{ width: barWidth }} />
      </div>
      {showLabel && (
        <span className="font-mono text-xs tabular-nums text-muted-foreground">
          {labelText}
        </span>
      )}
    </div>
  );
 }
--- a/frontend/src/components/common/EmptyState.tsx
+++ b/frontend/src/components/common/EmptyState.tsx
@@ -0,0 +1,38 @@
 import type { ReactNode } from "react";
 import { cn } from "@/lib/utils";
 /**
  * Centered placeholder panel with an optional icon badge, a title, an
  * optional description and an optional action node.
  *
  * @param icon - Optional node rendered inside a rounded badge above the title.
  * @param title - Main line of text.
  * @param description - Optional secondary text under the title.
  * @param action - Optional node rendered last (e.g. a button).
  * @param className - Extra classes merged onto the panel.
  */
 export function EmptyState(props: {
  icon?: ReactNode;
  title: string;
  description?: string;
  action?: ReactNode;
  className?: string;
 }) {
  const { icon, title, description, action, className } = props;
  const panelClasses = cn(
    "panel flex flex-col items-center justify-center gap-3 px-8 py-14 text-center",
    className
  );
  const iconBadge = icon && (
    <div className="rounded-2xl border border-border/70 bg-accent/40 p-3 text-primary">
      {icon}
    </div>
  );
  const descriptionLine = description && (
    <div className="max-w-sm text-xs text-muted-foreground">{description}</div>
  );
  return (
    <div className={panelClasses}>
      {iconBadge}
      <div className="space-y-1">
        <div className="text-sm font-semibold">{title}</div>
        {descriptionLine}
      </div>
      {action}
    </div>
  );
 }
--- a/frontend/src/components/common/Highlight.tsx
+++ b/frontend/src/components/common/Highlight.tsx
@@ -0,0 +1,45 @@
 import { useMemo } from "react";
 /**
  * Renders `text` with the parts that match `query` wrapped in `<mark>`;
  * the rest is wrapped in plain `<span>` elements. The split is memoised on
  * `text` and `query`.
  *
  * @param text - Text to display.
  * @param query - Search query passed to `splitHighlight`.
  */
 export function Highlight({ text, query }: { text: string; query: string }) {
  const segments = useMemo(() => splitHighlight(text, query), [text, query]);
  const rendered = segments.map((segment, index) => {
    if (!segment.match) {
      return <span key={index}>{segment.text}</span>;
    }
    return (
      <mark
        key={index}
        className="rounded-[3px] bg-primary/20 px-0.5 text-primary-700 dark:text-primary-100"
      >
        {segment.text}
      </mark>
    );
  });
  return <>{rendered}</>;
 }
 /**
  * Splits `text` into consecutive segments, flagging those that match a
  * token of `query` (case-insensitive).
  *
  * The query is split on whitespace; tokens shorter than two characters are
  * ignored and duplicates are removed. If no usable token remains the whole
  * text is returned as a single non-matching segment.
  *
  * @param text - Text to split.
  * @param query - Raw search query.
  * @returns Segments in original order; concatenating their `text` yields
  *   the input. An empty `text` with at least one usable token yields `[]`.
  */
 function splitHighlight(text: string, query: string): { text: string; match: boolean }[] {
  const tokens = Array.from(
    new Set(
      query
        .trim()
        .split(/\s+/)
        .filter((t) => t.length >= 2)
    )
  );
  if (tokens.length === 0) return [{ text, match: false }];
  // Regex alternation picks the first alternative that matches, so try the
  // longest tokens first; otherwise "cat" would shadow "catalog".
  const pattern = tokens
    .sort((a, b) => b.length - a.length)
    // Escape regex metacharacters so tokens are matched literally.
    .map((t) => t.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
    .join("|");
  const re = new RegExp(pattern, "gi");
  const out: { text: string; match: boolean }[] = [];
  let last = 0;
  for (const m of text.matchAll(re)) {
    const start = m.index ?? last;
    if (start > last) out.push({ text: text.slice(last, start), match: false });
    out.push({ text: m[0], match: true });
    last = start + m[0].length;
  }
  if (last < text.length) out.push({ text: text.slice(last), match: false });
  return out;
 }
--- a/frontend/src/components/common/Logo.tsx
+++ b/frontend/src/components/common/Logo.tsx
@@ -0,0 +1,23 @@
 import { cn } from "@/lib/utils";
 /**
  * Product logo: a gradient tile with the letter "L", optionally followed by
  * the product name and suite caption.
  *
  * @param className - Extra classes merged onto the wrapper.
  * @param compact - When true only the tile is rendered (default false).
  */
 export function Logo(props: { className?: string; compact?: boolean }) {
  const { className, compact = false } = props;
  const tile = (
    <div className="relative h-8 w-8 shrink-0 overflow-hidden rounded-lg shadow-soft">
      <div className="absolute inset-0 bg-gradient-to-br from-primary-500 to-primary-700" />
      <div className="absolute inset-0 grid place-items-center text-[15px] font-semibold tracking-tight text-white">
        L
      </div>
      <div className="pointer-events-none absolute inset-0 ring-1 ring-inset ring-white/15" />
    </div>
  );
  const wordmark = compact ? null : (
    <div className="leading-tight">
      <div className="text-sm font-semibold tracking-tight text-foreground">LegacyHUB</div>
      <div className="text-[10px] uppercase tracking-[0.18em] text-muted-foreground">
        TeamHUB Suite
      </div>
    </div>
  );
  return (
    <div className={cn("flex items-center gap-2.5", className)}>
      {tile}
      {wordmark}
    </div>
  );
 }
--- a/frontend/src/components/common/PageHeader.tsx
+++ b/frontend/src/components/common/PageHeader.tsx
@@ -0,0 +1,28 @@
 import type { ReactNode } from "react";
 import { cn } from "@/lib/utils";
 /**
  * Page-level header with a title, an optional description and an optional
  * group of action nodes.
  *
  * @param title - Text rendered in the `<h1>`.
  * @param description - Optional paragraph under the title.
  * @param actions - Optional nodes rendered in a wrapping flex row.
  * @param className - Extra classes merged onto the `<header>`.
  */
 export function PageHeader(props: {
  title: string;
  description?: string;
  actions?: ReactNode;
  className?: string;
 }) {
  const { title, description, actions, className } = props;
  const headerClasses = cn(
    "flex flex-col gap-2 sm:flex-row sm:items-end sm:justify-between",
    className
  );
  const blurb = description && (
    <p className="max-w-2xl text-sm text-muted-foreground">{description}</p>
  );
  const actionRow = actions && (
    <div className="flex flex-wrap items-center gap-2">{actions}</div>
  );
  return (
    <header className={headerClasses}>
      <div className="space-y-1">
        <h1 className="text-2xl font-semibold tracking-tight text-foreground text-balance">
          {title}
        </h1>
        {blurb}
      </div>
      {actionRow}
    </header>
  );
 }
--- a/frontend/src/components/common/QualityFlag.tsx
+++ b/frontend/src/components/common/QualityFlag.tsx
@@ -0,0 +1,74 @@
 import { AlertTriangle, CheckCircle2, FileWarning, Hash, Image, PenLine, Table } from "lucide-react";
 import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
 import { cn } from "@/lib/utils";
 /**
  * Presentation metadata for each known quality flag, keyed by flag name:
  * the human-readable label, the lucide icon component and the text-colour
  * class applied to that icon. Flag names without an entry here are ignored
  * by `QualityFlags`.
  */
 const FLAGS: Record<
  string,
  { label: string; icon: typeof AlertTriangle; tone: string }
 > = {
  low_ocr_confidence: { label: "Low OCR confidence", icon: AlertTriangle, tone: "text-warning" },
  very_short_text: { label: "Very short text", icon: Hash, tone: "text-muted-foreground" },
  possible_garbled_text: { label: "Possible garbled text", icon: FileWarning, tone: "text-destructive" },
  table_detected: { label: "Table detected", icon: Table, tone: "text-primary-600" },
  figure_detected: { label: "Figure detected", icon: Image, tone: "text-primary-600" },
  handwriting_detected: { label: "Handwriting detected", icon: PenLine, tone: "text-destructive" },
  needs_manual_review: { label: "Needs manual review", icon: AlertTriangle, tone: "text-warning" },
 };
 /**
  * Lists the active quality flags of an item, or a green "Clean" marker when
  * none are active.
  *
  * A flag is active when its value is truthy and its name has an entry in
  * `FLAGS`.
  *
  * @param flags - Map of flag name to on/off; `null`/`undefined` means none.
  * @param compact - When true each flag is an icon-only circle with the label
  *   in a tooltip; otherwise an icon + label chip (default false).
  * @param className - Extra classes merged onto the outer element.
  */
 export function QualityFlags(props: {
  flags: Record<string, boolean | undefined> | null | undefined;
  compact?: boolean;
  className?: string;
 }) {
  const { flags, compact = false, className } = props;
  const active: string[] = [];
  for (const [name, enabled] of Object.entries(flags ?? {})) {
    if (enabled && FLAGS[name]) active.push(name);
  }
  if (active.length === 0) {
    return (
      <span className={cn("inline-flex items-center gap-1 text-xs text-success", className)}>
        <CheckCircle2 className="h-3.5 w-3.5" />
        Clean
      </span>
    );
  }
  const chips = active.map((name) => {
    const { icon: Icon, label, tone } = FLAGS[name];
    if (!compact) {
      return (
        <span
          key={name}
          className="inline-flex items-center gap-1 rounded-md border border-border/60 bg-muted/40 px-2 py-0.5 text-[11px] font-medium"
        >
          <Icon className={cn("h-3 w-3", tone)} />
          <span className="text-muted-foreground">{label}</span>
        </span>
      );
    }
    return (
      <Tooltip key={name}>
        <TooltipTrigger asChild>
          <span
            className={cn(
              "inline-flex h-6 w-6 items-center justify-center rounded-full border border-border/60 bg-card",
              tone
            )}
          >
            <Icon className="h-3.5 w-3.5" />
          </span>
        </TooltipTrigger>
        <TooltipContent>{label}</TooltipContent>
      </Tooltip>
    );
  });
  return <div className={cn("flex flex-wrap items-center gap-1.5", className)}>{chips}</div>;
 }
--- a/frontend/src/components/common/StatusChip.tsx
+++ b/frontend/src/components/common/StatusChip.tsx
@@ -0,0 +1,48 @@
 import { cn } from "@/lib/utils";
 /**
  * Class names for each status tone: `dot` colours the leading dot, `text`
  * colours the label and `bg` tints the chip background.
  */
 const TONE: Record<string, { dot: string; text: string; bg: string }> = {
  ok: { dot: "bg-success", text: "text-success", bg: "bg-success/10" },
  active: { dot: "bg-primary", text: "text-primary-700 dark:text-primary-100", bg: "bg-primary/10" },
  warning: { dot: "bg-warning", text: "text-warning", bg: "bg-warning/10" },
  error: { dot: "bg-destructive", text: "text-destructive", bg: "bg-destructive/10" },
  muted: { dot: "bg-muted-foreground", text: "text-muted-foreground", bg: "bg-muted/60" },
 };
 /**
  * Key type of `TONE`. NOTE(review): because `TONE` is annotated as
  * `Record<string, …>`, this resolves to the index-signature key type rather
  * than a union of the five literal keys above, so the compiler does not
  * reject unknown tone names.
  */
 export type StatusTone = keyof typeof TONE;
 /**
  * Rounded status pill: a coloured dot followed by a label.
  *
  * @param tone - Key into `TONE` (default `"muted"`). Names without an entry
  *   fall back to the `muted` styling instead of throwing.
  * @param label - Text shown next to the dot.
  * @param className - Extra classes merged onto the pill.
  */
 export function StatusChip({
  tone = "muted",
  label,
  className,
 }: {
  tone?: StatusTone;
  label: string;
  className?: string;
 }) {
  // `StatusTone` is not limited to the known keys, so an unexpected tone
  // would otherwise make `t` undefined and crash on `t.bg`.
  const t = TONE[tone] ?? TONE.muted;
  return (
    <span
      className={cn(
        "inline-flex items-center gap-1.5 rounded-full px-2.5 py-0.5 text-xs font-medium",
        t.bg,
        t.text,
        className
      )}
    >
      <span className={cn("h-1.5 w-1.5 rounded-full", t.dot)} />
      {label}
    </span>
  );
 }
 /**
  * Maps a status string to a chip tone (case-insensitive).
  *
  * - empty / missing status → `"muted"`
  * - contains `FAILED` or equals `ERROR` → `"error"`
  * - `INDEXING_COMPLETED`, `OK` → `"ok"`
  * - `DISCOVERED`, `PENDING`, `*_STARTED`, `OCR_COMPLETED`,
  *   `EXTRACTION_COMPLETED`, `CHUNKING_COMPLETED` → `"active"`
  * - `DEGRADED` → `"warning"`
  * - anything else → `"muted"`
  *
  * @param status - Status string to classify.
  * @returns The tone name to pass to `StatusChip`.
  */
 export function statusToTone(status: string): StatusTone {
  const upper = status?.toUpperCase();
  if (!upper) {
    return "muted";
  }
  // Failure wins over every other rule.
  if (upper === "ERROR" || upper.includes("FAILED")) {
    return "error";
  }
  switch (upper) {
    case "INDEXING_COMPLETED":
    case "OK":
      return "ok";
    case "DISCOVERED":
    case "PENDING":
    case "OCR_COMPLETED":
    case "EXTRACTION_COMPLETED":
    case "CHUNKING_COMPLETED":
      return "active";
    case "DEGRADED":
      return "warning";
    default:
      return upper.endsWith("_STARTED") ? "active" : "muted";
  }
 }
--- a/frontend/src/components/common/ThemeToggle.tsx
+++ b/frontend/src/components/common/ThemeToggle.tsx
@@ -0,0 +1,28 @@
 import { Moon, Sun, MonitorSmartphone } from "lucide-react";
 import { Button } from "@/components/ui/button";
 import { useUiStore } from "@/stores/uiStore";
 import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
 /**
  * Icon button that cycles the UI theme light → dark → system → light and
  * shows the current theme in a tooltip. Theme state is read from and
  * written to `useUiStore`.
  */
 export function ThemeToggle() {
  const theme = useUiStore((state) => state.theme);
  const setTheme = useUiStore((state) => state.setTheme);
  const isLight = theme === "light";
  const isDark = theme === "dark";
  // Any value other than "light"/"dark" is treated as the system setting.
  const next = isLight ? "dark" : isDark ? "system" : "light";
  const Icon = isLight ? Sun : isDark ? Moon : MonitorSmartphone;
  const handleClick = () => setTheme(next);
  return (
    <Tooltip>
      <TooltipTrigger asChild>
        <Button variant="ghost" size="icon-sm" aria-label="Toggle theme" onClick={handleClick}>
          <Icon className="h-4 w-4" />
        </Button>
      </TooltipTrigger>
      <TooltipContent>Theme: {theme}</TooltipContent>
    </Tooltip>
  );
 }
--- a/frontend/src/components/ui/badge.tsx
+++ b/frontend/src/components/ui/badge.tsx
@@ -0,0 +1,29 @@
 import * as React from "react";
 import { cva, type VariantProps } from "class-variance-authority";
 import { cn } from "@/lib/utils";
 /**
  * Class-variance-authority recipe for badges: a shared base class string
  * plus one class set per `variant`. `default` is used when no variant is
  * given.
  */
 const badgeVariants = cva(
  "inline-flex items-center gap-1 rounded-full border px-2.5 py-0.5 text-xs font-medium transition-colors",
  {
    variants: {
      variant: {
        default: "border-transparent bg-primary/12 text-primary-700 dark:text-primary-100",
        outline: "border-border bg-transparent text-foreground",
        muted: "border-transparent bg-muted text-muted-foreground",
        success: "border-transparent bg-success/15 text-success",
        warning: "border-transparent bg-warning/15 text-warning",
        destructive: "border-transparent bg-destructive/15 text-destructive",
        accent: "border-transparent bg-accent text-accent-foreground",
      },
    },
    defaultVariants: { variant: "default" },
  }
 );
 /** Props for `Badge`: standard `div` attributes plus the `variant` option of `badgeVariants`. */
 export interface BadgeProps
  extends React.HTMLAttributes<HTMLDivElement>,
    VariantProps<typeof badgeVariants> {}
 /**
  * Pill-shaped badge rendered as a `div`. The variant classes are merged
  * with `className`; all other props are spread onto the element.
  */
 export function Badge(props: BadgeProps) {
  const { className, variant, ...rest } = props;
  const classes = cn(badgeVariants({ variant }), className);
  return <div className={classes} {...rest} />;
 }
--- a/frontend/src/components/ui/button.tsx
+++ b/frontend/src/components/ui/button.tsx
@@ -0,0 +1,56 @@
 import * as React from "react";
 import { Slot } from "@radix-ui/react-slot";
 import { cva, type VariantProps } from "class-variance-authority";
 import { cn } from "@/lib/utils";
 /**
  * Class-variance-authority recipe for buttons: a shared base class string
  * plus two independent axes, `variant` (colour treatment) and `size`
  * (height / padding). Both axes default to `"default"`.
  */
 const buttonVariants = cva(
  "inline-flex items-center justify-center gap-2 whitespace-nowrap rounded-lg text-sm font-medium ring-focus transition-all disabled:pointer-events-none disabled:opacity-50 [&_svg]:pointer-events-none [&_svg]:size-4 [&_svg]:shrink-0",
  {
    variants: {
      variant: {
        default:
          "bg-primary text-primary-foreground shadow-soft hover:bg-primary-700 active:translate-y-[0.5px]",
        secondary:
          "bg-secondary text-secondary-foreground hover:bg-secondary/80 border border-border/70",
        outline:
          "border border-border bg-transparent hover:bg-muted text-foreground",
        ghost:
          "hover:bg-muted text-foreground",
        subtle:
          "bg-accent text-accent-foreground hover:bg-accent/70",
        destructive:
          "bg-destructive text-destructive-foreground hover:bg-destructive/90",
        link: "text-primary underline-offset-4 hover:underline",
      },
      size: {
        sm: "h-8 px-3 text-xs",
        default: "h-9 px-4",
        lg: "h-11 px-6 text-base rounded-xl",
        icon: "h-9 w-9",
        "icon-sm": "h-8 w-8",
      },
    },
    defaultVariants: {
      variant: "default",
      size: "default",
    },
  }
 );
 /**
  * Props for `Button`: native button attributes plus the `variant` / `size`
  * options of `buttonVariants`.
  */
 export interface ButtonProps
  extends React.ButtonHTMLAttributes<HTMLButtonElement>,
    VariantProps<typeof buttonVariants> {
  /** When true, `Button` renders a Radix `Slot` instead of a `<button>` element. */
  asChild?: boolean;
 }
 /**
  * Styled button. Renders a native `<button>` by default, or a Radix `Slot`
  * when `asChild` is set. Variant/size classes are merged with `className`;
  * the ref and remaining props are forwarded to the rendered element.
  */
 export const Button = React.forwardRef<HTMLButtonElement, ButtonProps>((props, ref) => {
  const { className, variant, size, asChild = false, ...rest } = props;
  const Root = asChild ? Slot : "button";
  const classes = cn(buttonVariants({ variant, size }), className);
  return <Root ref={ref} className={classes} {...rest} />;
 });
 Button.displayName = "Button";
 export { buttonVariants };
--- a/frontend/src/components/ui/card.tsx
+++ b/frontend/src/components/ui/card.tsx
@@ -0,0 +1,53 @@
 import * as React from "react";
 import { cn } from "@/lib/utils";
 /** Card surface: a `div` carrying the `panel` class; forwards its ref and remaining props. */
 export const Card = React.forwardRef<HTMLDivElement, React.HTMLAttributes<HTMLDivElement>>(
  (props, ref) => {
    const { className, ...rest } = props;
    return <div ref={ref} className={cn("panel", className)} {...rest} />;
  }
 );
 Card.displayName = "Card";
 /** Padded vertical stack for a card's heading area; forwards its ref and remaining props. */
 export const CardHeader = React.forwardRef<HTMLDivElement, React.HTMLAttributes<HTMLDivElement>>(
  (props, ref) => {
    const { className, ...rest } = props;
    return <div ref={ref} className={cn("flex flex-col gap-1 p-5", className)} {...rest} />;
  }
 );
 CardHeader.displayName = "CardHeader";
 /** Card title text, rendered as a `div`; forwards its ref and remaining props. */
 export const CardTitle = React.forwardRef<HTMLDivElement, React.HTMLAttributes<HTMLDivElement>>(
  (props, ref) => {
    const { className, ...rest } = props;
    const classes = cn("text-base font-semibold tracking-tight text-foreground", className);
    return <div ref={ref} className={classes} {...rest} />;
  }
 );
 CardTitle.displayName = "CardTitle";
 /** Muted secondary text for a card, rendered as a `p`; forwards its ref and remaining props. */
 export const CardDescription = React.forwardRef<
  HTMLParagraphElement,
  React.HTMLAttributes<HTMLParagraphElement>
 >((props, ref) => {
  const { className, ...rest } = props;
  const classes = cn("text-sm text-muted-foreground", className);
  return <p ref={ref} className={classes} {...rest} />;
 });
 CardDescription.displayName = "CardDescription";
 /** Card body container with horizontal and bottom padding; forwards its ref and remaining props. */
 export const CardContent = React.forwardRef<HTMLDivElement, React.HTMLAttributes<HTMLDivElement>>(
  (props, ref) => {
    const { className, ...rest } = props;
    return <div ref={ref} className={cn("px-5 pb-5", className)} {...rest} />;
  }
 );
 CardContent.displayName = "CardContent";
 /** Card footer row with a top border; forwards its ref and remaining props. */
 export const CardFooter = React.forwardRef<HTMLDivElement, React.HTMLAttributes<HTMLDivElement>>(
  (props, ref) => {
    const { className, ...rest } = props;
    const classes = cn(
      "flex items-center justify-between gap-2 border-t border-border/60 px-5 py-3",
      className
    );
    return <div ref={ref} className={classes} {...rest} />;
  }
 );
 CardFooter.displayName = "CardFooter";
--- a/frontend/src/components/ui/command.tsx
+++ b/frontend/src/components/ui/command.tsx
@@ -0,0 +1,90 @@
 import * as React from "react";
 import { Command as CommandPrimitive } from "cmdk";
 import { Search } from "lucide-react";
 import { cn } from "@/lib/utils";
 /** Styled wrapper around the cmdk `Command` root; forwards its ref and remaining props. */
 export const Command = React.forwardRef<
  React.ElementRef<typeof CommandPrimitive>,
  React.ComponentPropsWithoutRef<typeof CommandPrimitive>
 >((props, ref) => {
  const { className, ...rest } = props;
  const classes = cn(
    "flex h-full w-full flex-col overflow-hidden rounded-xl bg-popover",
    className
  );
  return <CommandPrimitive ref={ref} className={classes} {...rest} />;
 });
 Command.displayName = "Command";
 /**
  * cmdk input preceded by a search icon, inside a bordered wrapper row.
  * The ref and remaining props go to the input, not the wrapper.
  */
 export const CommandInput = React.forwardRef<
  React.ElementRef<typeof CommandPrimitive.Input>,
  React.ComponentPropsWithoutRef<typeof CommandPrimitive.Input>
 >((props, ref) => {
  const { className, ...rest } = props;
  const inputClasses = cn(
    "flex h-10 w-full bg-transparent text-sm outline-none placeholder:text-muted-foreground/70",
    className
  );
  return (
    <div className="flex items-center gap-2 border-b border-border/70 px-3" cmdk-input-wrapper="">
      <Search className="h-4 w-4 text-muted-foreground" />
      <CommandPrimitive.Input ref={ref} className={inputClasses} {...rest} />
    </div>
  );
 });
 CommandInput.displayName = "CommandInput";
 /** Scrollable cmdk result list capped at 320px height; forwards its ref and remaining props. */
 export const CommandList = React.forwardRef<
  React.ElementRef<typeof CommandPrimitive.List>,
  React.ComponentPropsWithoutRef<typeof CommandPrimitive.List>
 >((props, ref) => {
  const { className, ...rest } = props;
  const classes = cn(
    "max-h-[320px] overflow-y-auto overflow-x-hidden p-1 scrollbar-thin",
    className
  );
  return <CommandPrimitive.List ref={ref} className={classes} {...rest} />;
 });
 CommandList.displayName = "CommandList";
 /**
  * Styled cmdk `Empty` slot (centered muted text); forwards its ref and
  * remaining props.
  *
  * `className` is merged with the default classes via `cn`, like every other
  * wrapper in this file, rather than replacing them.
  */
 export const CommandEmpty = React.forwardRef<
  React.ElementRef<typeof CommandPrimitive.Empty>,
  React.ComponentPropsWithoutRef<typeof CommandPrimitive.Empty>
 >(({ className, ...props }, ref) => (
  <CommandPrimitive.Empty
    ref={ref}
    className={cn("py-6 text-center text-sm text-muted-foreground", className)}
    {...props}
  />
 ));
 CommandEmpty.displayName = "CommandEmpty";
 /**
  * Styled cmdk group. The heading is styled through `[cmdk-group-heading]`
  * descendant selectors; forwards its ref and remaining props.
  */
 export const CommandGroup = React.forwardRef<
  React.ElementRef<typeof CommandPrimitive.Group>,
  React.ComponentPropsWithoutRef<typeof CommandPrimitive.Group>
 >((props, ref) => {
  const { className, ...rest } = props;
  const classes = cn(
    "overflow-hidden p-1 text-foreground [&_[cmdk-group-heading]]:px-2 [&_[cmdk-group-heading]]:py-1 [&_[cmdk-group-heading]]:text-xs [&_[cmdk-group-heading]]:font-medium [&_[cmdk-group-heading]]:uppercase [&_[cmdk-group-heading]]:tracking-wide [&_[cmdk-group-heading]]:text-muted-foreground",
    className
  );
  return <CommandPrimitive.Group ref={ref} className={classes} {...rest} />;
 });
 CommandGroup.displayName = "CommandGroup";
 /**
  * Styled cmdk item with classes keyed on `data-selected` and
  * `aria-disabled`; forwards its ref and remaining props.
  */
 export const CommandItem = React.forwardRef<
  React.ElementRef<typeof CommandPrimitive.Item>,
  React.ComponentPropsWithoutRef<typeof CommandPrimitive.Item>
 >((props, ref) => {
  const { className, ...rest } = props;
  const classes = cn(
    "relative flex cursor-pointer select-none items-center gap-2 rounded-md px-2.5 py-1.5 text-sm outline-none transition-colors",
    "data-[selected=true]:bg-muted data-[selected=true]:text-foreground",
    "aria-disabled:pointer-events-none aria-disabled:opacity-50",
    className
  );
  return <CommandPrimitive.Item ref={ref} className={classes} {...rest} />;
 });
 CommandItem.displayName = "CommandItem";
 /** Right-aligned muted `span` for a keyboard-shortcut hint inside a command item. */
 export const CommandShortcut = (props: React.HTMLAttributes<HTMLSpanElement>) => {
  const { className, ...rest } = props;
  const classes = cn("ml-auto text-xs tracking-widest text-muted-foreground", className);
  return <span className={classes} {...rest} />;
 };
--- a/frontend/src/components/ui/dialog.tsx
+++ b/frontend/src/components/ui/dialog.tsx
@@ -0,0 +1,76 @@
 import * as React from "react";
 import * as DialogPrimitive from "@radix-ui/react-dialog";
 import { X } from "lucide-react";
 import { cn } from "@/lib/utils";
 // Radix Dialog parts re-exported unchanged under local names.
 export const Dialog = DialogPrimitive.Root;
 export const DialogTrigger = DialogPrimitive.Trigger;
 export const DialogPortal = DialogPrimitive.Portal;
 export const DialogClose = DialogPrimitive.Close;
 /**
  * Full-screen dimmed, blurred backdrop for dialogs with fade classes keyed
  * on `data-state`; forwards its ref and remaining props.
  */
 export const DialogOverlay = React.forwardRef<
  React.ElementRef<typeof DialogPrimitive.Overlay>,
  React.ComponentPropsWithoutRef<typeof DialogPrimitive.Overlay>
 >((props, ref) => {
  const { className, ...rest } = props;
  const classes = cn(
    "fixed inset-0 z-50 bg-black/40 backdrop-blur-sm",
    "data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=closed]:animate-out data-[state=closed]:fade-out-0",
    className
  );
  return <DialogPrimitive.Overlay ref={ref} className={classes} {...rest} />;
 });
 DialogOverlay.displayName = DialogPrimitive.Overlay.displayName;
 /**
  * Centered dialog panel rendered in a portal together with `DialogOverlay`.
  * A close button (X icon with screen-reader text "Close") is always
  * appended after `children`. The ref and remaining props go to the Radix
  * `Content` element.
  */
 export const DialogContent = React.forwardRef<
  React.ElementRef<typeof DialogPrimitive.Content>,
  React.ComponentPropsWithoutRef<typeof DialogPrimitive.Content>
 >((props, ref) => {
  const { className, children, ...rest } = props;
  const panelClasses = cn(
    "fixed left-1/2 top-1/2 z-50 grid w-full max-w-lg -translate-x-1/2 -translate-y-1/2 gap-4 panel-raised p-6",
    "data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95",
    className
  );
  const closeButton = (
    <DialogPrimitive.Close className="absolute right-4 top-4 rounded-md p-1 text-muted-foreground hover:bg-muted ring-focus">
      <X className="h-4 w-4" />
      <span className="sr-only">Close</span>
    </DialogPrimitive.Close>
  );
  return (
    <DialogPortal>
      <DialogOverlay />
      <DialogPrimitive.Content ref={ref} className={panelClasses} {...rest}>
        {children}
        {closeButton}
      </DialogPrimitive.Content>
    </DialogPortal>
  );
 });
 DialogContent.displayName = DialogPrimitive.Content.displayName;
 /** Vertical stack for a dialog's title and description. */
 export const DialogHeader = (props: React.HTMLAttributes<HTMLDivElement>) => {
  const { className, ...rest } = props;
  return <div className={cn("flex flex-col gap-1.5", className)} {...rest} />;
 };
 /** Styled Radix dialog title; forwards its ref and remaining props. */
 export const DialogTitle = React.forwardRef<
  React.ElementRef<typeof DialogPrimitive.Title>,
  React.ComponentPropsWithoutRef<typeof DialogPrimitive.Title>
 >((props, ref) => {
  const { className, ...rest } = props;
  const classes = cn("text-base font-semibold tracking-tight", className);
  return <DialogPrimitive.Title ref={ref} className={classes} {...rest} />;
 });
 DialogTitle.displayName = DialogPrimitive.Title.displayName;
 /** Styled Radix dialog description in muted text; forwards its ref and remaining props. */
 export const DialogDescription = React.forwardRef<
  React.ElementRef<typeof DialogPrimitive.Description>,
  React.ComponentPropsWithoutRef<typeof DialogPrimitive.Description>
 >((props, ref) => {
  const { className, ...rest } = props;
  const classes = cn("text-sm text-muted-foreground", className);
  return <DialogPrimitive.Description ref={ref} className={classes} {...rest} />;
 });
 DialogDescription.displayName = DialogPrimitive.Description.displayName;
--- a/frontend/src/components/ui/dropdown-menu.tsx
+++ b/frontend/src/components/ui/dropdown-menu.tsx
@@ -0,0 +1,113 @@
 import * as React from "react";
 import * as DropdownMenuPrimitive from "@radix-ui/react-dropdown-menu";
 import { Check, ChevronRight } from "lucide-react";
 import { cn } from "@/lib/utils";
 // Radix DropdownMenu parts re-exported unchanged under local names.
 export const DropdownMenu = DropdownMenuPrimitive.Root;
 export const DropdownMenuTrigger = DropdownMenuPrimitive.Trigger;
 export const DropdownMenuGroup = DropdownMenuPrimitive.Group;
 export const DropdownMenuPortal = DropdownMenuPrimitive.Portal;
 /** Hairline divider between dropdown sections; forwards its ref and remaining props. */
 export const DropdownMenuSeparator = React.forwardRef<
  React.ElementRef<typeof DropdownMenuPrimitive.Separator>,
  React.ComponentPropsWithoutRef<typeof DropdownMenuPrimitive.Separator>
 >((props, ref) => {
  const { className, ...rest } = props;
  const classes = cn("-mx-1 my-1 h-px bg-border/70", className);
  return <DropdownMenuPrimitive.Separator ref={ref} className={classes} {...rest} />;
 });
 DropdownMenuSeparator.displayName = DropdownMenuPrimitive.Separator.displayName;
 /**
  * Dropdown panel rendered inside a Radix portal. `sideOffset` defaults to 6;
  * the ref and remaining props go to the Radix `Content` element.
  */
 export const DropdownMenuContent = React.forwardRef<
  React.ElementRef<typeof DropdownMenuPrimitive.Content>,
  React.ComponentPropsWithoutRef<typeof DropdownMenuPrimitive.Content>
 >((props, ref) => {
  const { className, sideOffset = 6, ...rest } = props;
  const classes = cn(
    "z-50 min-w-[12rem] overflow-hidden rounded-xl border border-border/70 bg-popover p-1 text-popover-foreground shadow-elevated",
    "data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95",
    className
  );
  return (
    <DropdownMenuPrimitive.Portal>
      <DropdownMenuPrimitive.Content
        ref={ref}
        sideOffset={sideOffset}
        className={classes}
        {...rest}
      />
    </DropdownMenuPrimitive.Portal>
  );
 });
 DropdownMenuContent.displayName = DropdownMenuPrimitive.Content.displayName;
 /**
  * Styled dropdown row highlighted on hover and focus. `inset` adds left
  * padding (`pl-8`); the ref and remaining props are forwarded.
  */
 export const DropdownMenuItem = React.forwardRef<
  React.ElementRef<typeof DropdownMenuPrimitive.Item>,
  React.ComponentPropsWithoutRef<typeof DropdownMenuPrimitive.Item> & { inset?: boolean }
 >((props, ref) => {
  const { className, inset, ...rest } = props;
  const classes = cn(
    "relative flex cursor-pointer select-none items-center gap-2 rounded-md px-2.5 py-1.5 text-sm outline-none transition-colors",
    "hover:bg-muted focus:bg-muted",
    inset && "pl-8",
    className
  );
  return <DropdownMenuPrimitive.Item ref={ref} className={classes} {...rest} />;
 });
 DropdownMenuItem.displayName = DropdownMenuPrimitive.Item.displayName;
 /** Small uppercase caption for a group of dropdown items; forwards its ref and remaining props. */
 export const DropdownMenuLabel = React.forwardRef<
  React.ElementRef<typeof DropdownMenuPrimitive.Label>,
  React.ComponentPropsWithoutRef<typeof DropdownMenuPrimitive.Label>
 >((props, ref) => {
  const { className, ...rest } = props;
  const classes = cn(
    "px-2.5 py-1.5 text-xs font-medium uppercase tracking-wide text-muted-foreground",
    className
  );
  return <DropdownMenuPrimitive.Label ref={ref} className={classes} {...rest} />;
 });
 DropdownMenuLabel.displayName = DropdownMenuPrimitive.Label.displayName;
 /**
  * Dropdown row with a check indicator in a fixed left slot. The check icon
  * is rendered through Radix `ItemIndicator`. The ref and remaining props
  * go to the Radix `CheckboxItem`.
  */
 export const DropdownMenuCheckboxItem = React.forwardRef<
  React.ElementRef<typeof DropdownMenuPrimitive.CheckboxItem>,
  React.ComponentPropsWithoutRef<typeof DropdownMenuPrimitive.CheckboxItem>
 >(({ className, children, checked, ...props }, ref) => (
  <DropdownMenuPrimitive.CheckboxItem
    ref={ref}
    checked={checked}
    className={cn(
      "relative flex cursor-pointer select-none items-center rounded-md py-1.5 pl-8 pr-2 text-sm outline-none transition-colors hover:bg-muted",
      // The outline is removed above, so keyboard focus needs its own
      // highlight (same treatment as DropdownMenuItem).
      "focus:bg-muted",
      className
    )}
    {...props}
  >
    <span className="absolute left-2 flex h-3.5 w-3.5 items-center justify-center">
      <DropdownMenuPrimitive.ItemIndicator>
        <Check className="h-3.5 w-3.5" />
      </DropdownMenuPrimitive.ItemIndicator>
    </span>
    {children}
  </DropdownMenuPrimitive.CheckboxItem>
 ));
 DropdownMenuCheckboxItem.displayName = DropdownMenuPrimitive.CheckboxItem.displayName;
 /** Right-aligned muted `span` for a keyboard-shortcut hint inside a dropdown item. */
 export const DropdownMenuShortcut = (props: React.HTMLAttributes<HTMLSpanElement>) => {
  const { className, ...rest } = props;
  const classes = cn("ml-auto text-xs tracking-widest text-muted-foreground", className);
  return <span className={classes} {...rest} />;
 };
 /**
  * Dropdown row that opens a submenu; a right-pointing chevron is appended
  * after `children`. The ref and remaining props go to the Radix
  * `SubTrigger`.
  */
 export const DropdownMenuSubTrigger = React.forwardRef<
  React.ElementRef<typeof DropdownMenuPrimitive.SubTrigger>,
  React.ComponentPropsWithoutRef<typeof DropdownMenuPrimitive.SubTrigger>
 >(({ className, children, ...props }, ref) => (
  <DropdownMenuPrimitive.SubTrigger
    ref={ref}
    className={cn(
      "flex cursor-pointer select-none items-center rounded-md px-2.5 py-1.5 text-sm outline-none hover:bg-muted",
      // The outline is removed above, so keyboard focus needs its own
      // highlight (same treatment as DropdownMenuItem); also keep the row
      // highlighted while its submenu is open.
      "focus:bg-muted data-[state=open]:bg-muted",
      className
    )}
    {...props}
  >
    {children}
    <ChevronRight className="ml-auto h-4 w-4" />
  </DropdownMenuPrimitive.SubTrigger>
 ));
 DropdownMenuSubTrigger.displayName = DropdownMenuPrimitive.SubTrigger.displayName;
--- a/frontend/src/components/ui/input.tsx
+++ b/frontend/src/components/ui/input.tsx
@@ -0,0 +1,21 @@
 import * as React from "react";
 import { cn } from "@/lib/utils";
 /** Props accepted by `Input`: exactly the native `<input>` attributes. */
 export type InputProps = React.InputHTMLAttributes<HTMLInputElement>;
 /**
  * Styled native `<input>`. `type` defaults to `"text"`; the ref and
  * remaining props are forwarded to the element.
  */
 export const Input = React.forwardRef<HTMLInputElement, InputProps>((props, ref) => {
  const { className, type = "text", ...rest } = props;
  const classes = cn(
    "flex h-9 w-full rounded-lg border border-input bg-surface px-3 py-1.5 text-sm shadow-sm transition-colors",
    "placeholder:text-muted-foreground/70",
    "ring-focus disabled:cursor-not-allowed disabled:opacity-50",
    className
  );
  return <input ref={ref} type={type} className={classes} {...rest} />;
 });
 Input.displayName = "Input";
--- a/frontend/src/components/ui/popover.tsx
+++ b/frontend/src/components/ui/popover.tsx
@@ -0,0 +1,27 @@
 import * as React from "react";
 import * as PopoverPrimitive from "@radix-ui/react-popover";
 import { cn } from "@/lib/utils";
 // Radix Popover parts re-exported unchanged under local names.
 export const Popover = PopoverPrimitive.Root;
 export const PopoverTrigger = PopoverPrimitive.Trigger;
 export const PopoverAnchor = PopoverPrimitive.Anchor;
 /**
  * Popover panel rendered inside a Radix portal. `align` defaults to
  * `"center"` and `sideOffset` to 6; the ref and remaining props go to the
  * Radix `Content` element.
  */
 export const PopoverContent = React.forwardRef<
  React.ElementRef<typeof PopoverPrimitive.Content>,
  React.ComponentPropsWithoutRef<typeof PopoverPrimitive.Content>
 >((props, ref) => {
  const { className, align = "center", sideOffset = 6, ...rest } = props;
  const classes = cn(
    "z-50 w-72 rounded-xl border border-border/70 bg-popover p-3 text-popover-foreground shadow-elevated",
    "data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95",
    className
  );
  return (
    <PopoverPrimitive.Portal>
      <PopoverPrimitive.Content
        ref={ref}
        align={align}
        sideOffset={sideOffset}
        className={classes}
        {...rest}
      />
    </PopoverPrimitive.Portal>
  );
 });
 PopoverContent.displayName = PopoverPrimitive.Content.displayName;
--- a/frontend/src/components/ui/progress.tsx
+++ b/frontend/src/components/ui/progress.tsx
@@ -0,0 +1,22 @@
 import * as React from "react";
 import * as ProgressPrimitive from "@radix-ui/react-progress";
 import { cn } from "@/lib/utils";
 /**
  * Horizontal progress bar built on Radix `Progress`.
  *
  * @param value - Progress as a percentage. Finite numbers are clamped to
  *   0–100; `null`, `undefined` or non-finite input draws an empty bar.
  * @param indicatorClassName - Extra classes merged onto the filled part.
  * @param className - Extra classes merged onto the track.
  *
  * The ref and remaining props go to the Radix root. The value is also
  * forwarded to the root; a missing value is forwarded as `null`.
  */
 export const Progress = React.forwardRef<
  React.ElementRef<typeof ProgressPrimitive.Root>,
  React.ComponentPropsWithoutRef<typeof ProgressPrimitive.Root> & {
    indicatorClassName?: string;
  }
 >(({ className, value, indicatorClassName, ...props }, ref) => {
  // Clamp so the transform is always valid CSS: an unclamped 120 would
  // produce `translateX(--20%)`, and NaN would produce `translateX(-NaN%)`.
  const pct =
    typeof value === "number" && Number.isFinite(value)
      ? Math.min(100, Math.max(0, value))
      : 0;
  return (
    <ProgressPrimitive.Root
      ref={ref}
      value={value == null ? null : pct}
      className={cn("relative h-2 w-full overflow-hidden rounded-full bg-muted", className)}
      {...props}
    >
      <ProgressPrimitive.Indicator
        className={cn("h-full w-full flex-1 bg-primary transition-all", indicatorClassName)}
        style={{ transform: `translateX(-${100 - pct}%)` }}
      />
    </ProgressPrimitive.Root>
  );
 });
 Progress.displayName = "Progress";
--- a/frontend/src/components/ui/scroll-area.tsx
+++ b/frontend/src/components/ui/scroll-area.tsx
@@ -0,0 +1,36 @@
 import * as React from "react";
 import * as ScrollAreaPrimitive from "@radix-ui/react-scroll-area";
 import { cn } from "@/lib/utils";
 /**
  * Radix scroll area: `children` go inside the viewport, followed by a
  * `ScrollBar` and the corner element. The ref and remaining props go to
  * the Radix root.
  */
 export const ScrollArea = React.forwardRef<
  React.ElementRef<typeof ScrollAreaPrimitive.Root>,
  React.ComponentPropsWithoutRef<typeof ScrollAreaPrimitive.Root>
 >((props, ref) => {
  const { className, children, ...rest } = props;
  const rootClasses = cn("relative overflow-hidden", className);
  return (
    <ScrollAreaPrimitive.Root ref={ref} className={rootClasses} {...rest}>
      <ScrollAreaPrimitive.Viewport className="h-full w-full rounded-[inherit]">
        {children}
      </ScrollAreaPrimitive.Viewport>
      <ScrollBar />
      <ScrollAreaPrimitive.Corner />
    </ScrollAreaPrimitive.Root>
  );
 });
 ScrollArea.displayName = "ScrollArea";
 /**
  * Thin scrollbar with a rounded thumb for `ScrollArea`. `orientation`
  * defaults to `"vertical"` and selects the track sizing classes; the ref
  * and remaining props go to the Radix scrollbar element.
  */
 const ScrollBar = React.forwardRef<
  React.ElementRef<typeof ScrollAreaPrimitive.ScrollAreaScrollbar>,
  React.ComponentPropsWithoutRef<typeof ScrollAreaPrimitive.ScrollAreaScrollbar>
 >((props, ref) => {
  const { className, orientation = "vertical", ...rest } = props;
  const axisClasses =
    orientation === "vertical" ? "h-full w-2 p-0.5" : "h-2 flex-col p-0.5";
  const trackClasses = cn("flex touch-none select-none transition-colors", axisClasses, className);
  return (
    <ScrollAreaPrimitive.ScrollAreaScrollbar
      ref={ref}
      orientation={orientation}
      className={trackClasses}
      {...rest}
    >
      <ScrollAreaPrimitive.ScrollAreaThumb className="relative flex-1 rounded-full bg-muted-foreground/30 hover:bg-muted-foreground/50 transition-colors" />
    </ScrollAreaPrimitive.ScrollAreaScrollbar>
  );
 });
 ScrollBar.displayName = "ScrollBar";
--- a/frontend/src/components/ui/select.tsx
+++ b/frontend/src/components/ui/select.tsx
@@ -0,0 +1,89 @@
 import * as React from "react";
 import * as SelectPrimitive from "@radix-ui/react-select";
 import { Check, ChevronDown } from "lucide-react";
 import { cn } from "@/lib/utils";
 // Direct re-exports of the Radix Select primitives (Root, Group, Value)
 // under local names; no styling or behavior is added here.
 export const Select = SelectPrimitive.Root;
 export const SelectGroup = SelectPrimitive.Group;
 export const SelectValue = SelectPrimitive.Value;
 /**
  * Styled wrapper around the Radix Select Trigger.
  *
  * Renders the caller's children followed by a `ChevronDown` icon placed in
  * the primitive's Icon slot. The ref and remaining props are forwarded to
  * the Trigger; a caller-supplied `className` is passed last to `cn`.
  */
 export const SelectTrigger = React.forwardRef<
  React.ElementRef<typeof SelectPrimitive.Trigger>,
  React.ComponentPropsWithoutRef<typeof SelectPrimitive.Trigger>
 >(({ className, children, ...rest }, forwardedRef) => {
  const triggerClasses = cn(
    "flex h-9 w-full items-center justify-between gap-2 rounded-lg border border-input bg-surface px-3 text-sm shadow-sm transition-colors",
    "ring-focus disabled:cursor-not-allowed disabled:opacity-50",
    "[&>span]:truncate",
    className
  );
  return (
    <SelectPrimitive.Trigger ref={forwardedRef} className={triggerClasses} {...rest}>
      {children}
      <SelectPrimitive.Icon asChild>
        <ChevronDown className="h-4 w-4 text-muted-foreground" />
      </SelectPrimitive.Icon>
    </SelectPrimitive.Trigger>
  );
 });
 SelectTrigger.displayName = SelectPrimitive.Trigger.displayName;
 /**
  * Styled dropdown panel for `Select`, rendered through the primitive's
  * Portal.
  *
  * `position` defaults to "popper"; in that mode an extra `translate-y-1`
  * class is applied. Children are placed inside the primitive's Viewport.
  * The ref and remaining props are forwarded to the Radix Content element.
  */
 export const SelectContent = React.forwardRef<
  React.ElementRef<typeof SelectPrimitive.Content>,
  React.ComponentPropsWithoutRef<typeof SelectPrimitive.Content>
 >(({ className, children, position = "popper", ...rest }, forwardedRef) => {
  const panelClasses = cn(
    "relative z-50 max-h-72 min-w-[8rem] overflow-hidden rounded-xl border border-border/70 bg-popover text-popover-foreground shadow-elevated",
    "data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95",
    position === "popper" && "translate-y-1",
    className
  );
  return (
    <SelectPrimitive.Portal>
      <SelectPrimitive.Content ref={forwardedRef} position={position} className={panelClasses} {...rest}>
        <SelectPrimitive.Viewport className="p-1 max-h-72">{children}</SelectPrimitive.Viewport>
      </SelectPrimitive.Content>
    </SelectPrimitive.Portal>
  );
 });
 SelectContent.displayName = SelectPrimitive.Content.displayName;
 /**
  * Styled option row for `Select`.
  *
  * A `Check` icon is rendered inside the primitive's ItemIndicator in an
  * absolutely positioned slot on the left; the caller's children become
  * the ItemText. The ref and remaining props are forwarded to the Radix
  * Item element.
  */
 export const SelectItem = React.forwardRef<
  React.ElementRef<typeof SelectPrimitive.Item>,
  React.ComponentPropsWithoutRef<typeof SelectPrimitive.Item>
 >(({ className, children, ...rest }, forwardedRef) => {
  const rowClasses = cn(
    "relative flex w-full cursor-pointer select-none items-center rounded-md py-1.5 pl-8 pr-2 text-sm outline-none transition-colors",
    "data-[highlighted]:bg-muted data-[disabled]:pointer-events-none data-[disabled]:opacity-50",
    className
  );
  return (
    <SelectPrimitive.Item ref={forwardedRef} className={rowClasses} {...rest}>
      <span className="absolute left-2 flex h-3.5 w-3.5 items-center justify-center">
        <SelectPrimitive.ItemIndicator>
          <Check className="h-3.5 w-3.5" />
        </SelectPrimitive.ItemIndicator>
      </span>
      <SelectPrimitive.ItemText>{children}</SelectPrimitive.ItemText>
    </SelectPrimitive.Item>
  );
 });
 SelectItem.displayName = SelectPrimitive.Item.displayName;
 /**
  * Thin divider line for use between `Select` items.
  *
  * Forwards the ref and remaining props to the Radix Separator element; a
  * caller-supplied `className` is passed last to `cn`.
  */
 export const SelectSeparator = React.forwardRef<
  React.ElementRef<typeof SelectPrimitive.Separator>,
  React.ComponentPropsWithoutRef<typeof SelectPrimitive.Separator>
 >(({ className, ...rest }, forwardedRef) => {
  const lineClasses = cn("-mx-1 my-1 h-px bg-border/70", className);
  return <SelectPrimitive.Separator ref={forwardedRef} className={lineClasses} {...rest} />;
 });
 SelectSeparator.displayName = SelectPrimitive.Separator.displayName;
--- a/frontend/src/components/ui/separator.tsx
+++ b/frontend/src/components/ui/separator.tsx
@@ -0,0 +1,21 @@
 import * as React from "react";
 import * as SeparatorPrimitive from "@radix-ui/react-separator";
 import { cn } from "@/lib/utils";
 /**
  * Horizontal or vertical rule built on the Radix Separator primitive.
  *
  * `orientation` defaults to "horizontal" and `decorative` defaults to
  * `true`; both are passed through to the primitive. The orientation also
  * selects the sizing classes (full-width 1px line vs. full-height 1px line).
  */
 export const Separator = React.forwardRef<
  React.ElementRef<typeof SeparatorPrimitive.Root>,
  React.ComponentPropsWithoutRef<typeof SeparatorPrimitive.Root>
 >(({ className, orientation = "horizontal", decorative = true, ...rest }, forwardedRef) => {
  const axisClasses = orientation === "horizontal" ? "h-px w-full" : "h-full w-px";
  const ruleClasses = cn("shrink-0 bg-border/70", axisClasses, className);
  return (
    <SeparatorPrimitive.Root
      ref={forwardedRef}
      decorative={decorative}
      orientation={orientation}
      className={ruleClasses}
      {...rest}
    />
  );
 });
 Separator.displayName = "Separator";
--- a/frontend/src/components/ui/skeleton.tsx
+++ b/frontend/src/components/ui/skeleton.tsx
@@ -0,0 +1,5 @@
 import { cn } from "@/lib/utils";
 /**
  * Placeholder `<div>` carrying the `skeleton-shimmer` class.
  *
  * Accepts any standard div attributes; a caller-supplied `className` is
  * passed to `cn` after the base classes.
  */
 export function Skeleton(props: React.HTMLAttributes<HTMLDivElement>) {
  const { className, ...rest } = props;
  const shimmerClasses = cn("skeleton-shimmer rounded-lg", className);
  return <div className={shimmerClasses} {...rest} />;
 }
--- a/frontend/src/components/ui/switch.tsx
+++ b/frontend/src/components/ui/switch.tsx
@@ -0,0 +1,27 @@
 import * as React from "react";
 import * as SwitchPrimitives from "@radix-ui/react-switch";
 import { cn } from "@/lib/utils";
 /**
  * Styled toggle built on the Radix Switch primitive.
  *
  * The track colour and the thumb offset are driven by the primitive's
  * `data-state` attribute (checked / unchecked) via the class names below.
  * The ref and remaining props are forwarded to the Radix Root element.
  */
 export const Switch = React.forwardRef<
  React.ElementRef<typeof SwitchPrimitives.Root>,
  React.ComponentPropsWithoutRef<typeof SwitchPrimitives.Root>
 >(({ className, ...rest }, forwardedRef) => {
  const trackClasses = cn(
    "peer inline-flex h-5 w-9 shrink-0 cursor-pointer items-center rounded-full border border-transparent transition-colors",
    "data-[state=checked]:bg-primary data-[state=unchecked]:bg-muted",
    "ring-focus disabled:cursor-not-allowed disabled:opacity-50",
    className
  );
  const thumbClasses = cn(
    "pointer-events-none block h-4 w-4 rounded-full bg-white shadow-soft transition-transform",
    "data-[state=checked]:translate-x-4 data-[state=unchecked]:translate-x-0.5"
  );
  return (
    <SwitchPrimitives.Root ref={forwardedRef} className={trackClasses} {...rest}>
      <SwitchPrimitives.Thumb className={thumbClasses} />
    </SwitchPrimitives.Root>
  );
 });
 Switch.displayName = "Switch";
--- a/frontend/src/components/ui/tabs.tsx
+++ b/frontend/src/components/ui/tabs.tsx
@@ -0,0 +1,45 @@
 import * as React from "react";
 import * as TabsPrimitive from "@radix-ui/react-tabs";
 import { cn } from "@/lib/utils";
 // Direct re-export of the Radix Tabs Root under a local name; no styling added.
 export const Tabs = TabsPrimitive.Root;
 /**
  * Styled container for tab triggers.
  *
  * Forwards the ref and remaining props to the Radix Tabs List element; a
  * caller-supplied `className` is passed last to `cn`.
  */
 export const TabsList = React.forwardRef<
  React.ElementRef<typeof TabsPrimitive.List>,
  React.ComponentPropsWithoutRef<typeof TabsPrimitive.List>
 >(({ className, ...rest }, forwardedRef) => {
  const listClasses = cn(
    "inline-flex h-9 items-center justify-center gap-1 rounded-xl border border-border/70 bg-muted/40 p-1 text-muted-foreground",
    className
  );
  return <TabsPrimitive.List ref={forwardedRef} className={listClasses} {...rest} />;
 });
 TabsList.displayName = TabsPrimitive.List.displayName;
 /**
  * Styled tab button.
  *
  * The active tab is highlighted through the primitive's
  * `data-state=active` attribute. The ref and remaining props are
  * forwarded to the Radix Tabs Trigger element.
  */
 export const TabsTrigger = React.forwardRef<
  React.ElementRef<typeof TabsPrimitive.Trigger>,
  React.ComponentPropsWithoutRef<typeof TabsPrimitive.Trigger>
 >(({ className, ...rest }, forwardedRef) => {
  const triggerClasses = cn(
    "inline-flex items-center justify-center gap-1.5 whitespace-nowrap rounded-lg px-3 py-1 text-xs font-medium transition-all",
    "ring-focus disabled:pointer-events-none disabled:opacity-50",
    "data-[state=active]:bg-card data-[state=active]:text-foreground data-[state=active]:shadow-soft",
    className
  );
  return <TabsPrimitive.Trigger ref={forwardedRef} className={triggerClasses} {...rest} />;
 });
 TabsTrigger.displayName = TabsPrimitive.Trigger.displayName;
 /**
  * Styled panel for a tab's content.
  *
  * Adds a top margin and the `ring-focus` class, then forwards the ref and
  * remaining props to the Radix Tabs Content element.
  */
 export const TabsContent = React.forwardRef<
  React.ElementRef<typeof TabsPrimitive.Content>,
  React.ComponentPropsWithoutRef<typeof TabsPrimitive.Content>
 >(({ className, ...rest }, forwardedRef) => {
  const panelClasses = cn("mt-3 ring-focus", className);
  return <TabsPrimitive.Content ref={forwardedRef} className={panelClasses} {...rest} />;
 });
 TabsContent.displayName = TabsPrimitive.Content.displayName;
--- a/frontend/src/components/ui/tooltip.tsx
+++ b/frontend/src/components/ui/tooltip.tsx
@@ -0,0 +1,24 @@
 import * as React from "react";
 import * as TooltipPrimitive from "@radix-ui/react-tooltip";
 import { cn } from "@/lib/utils";
 // Direct re-exports of the Radix Tooltip primitives (Provider, Root,
 // Trigger) under local names; no styling or behavior is added here.
 export const TooltipProvider = TooltipPrimitive.Provider;
 export const Tooltip = TooltipPrimitive.Root;
 export const TooltipTrigger = TooltipPrimitive.Trigger;
 /**
  * Styled tooltip bubble.
  *
  * `sideOffset` defaults to 6 and is passed to the primitive. The ref and
  * remaining props are forwarded to the Radix Tooltip Content element.
  *
  * NOTE(review): the content is rendered directly, not inside
  * `TooltipPrimitive.Portal` — confirm that clipping by `overflow-hidden`
  * ancestors is not a concern where tooltips are used.
  */
 export const TooltipContent = React.forwardRef<
  React.ElementRef<typeof TooltipPrimitive.Content>,
  React.ComponentPropsWithoutRef<typeof TooltipPrimitive.Content>
 >(({ className, sideOffset = 6, ...rest }, forwardedRef) => {
  const bubbleClasses = cn(
    "z-50 overflow-hidden rounded-md border border-border/70 bg-popover px-2.5 py-1.5 text-xs text-popover-foreground shadow-elevated",
    "animate-in fade-in-0 zoom-in-95 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95",
    className
  );
  return (
    <TooltipPrimitive.Content ref={forwardedRef} sideOffset={sideOffset} className={bubbleClasses} {...rest} />
  );
 });
 TooltipContent.displayName = TooltipPrimitive.Content.displayName;
--- a/frontend/src/hooks/useDebounce.ts
+++ b/frontend/src/hooks/useDebounce.ts
@@ -0,0 +1,10 @@
 import { useEffect, useState } from "react";
 /**
  * Returns `value` delayed until it has stayed unchanged for `delay` ms.
  *
  * Every change to `value` or `delay` cancels the pending timer and starts
  * a new one; the pending timer is also cleared when the component unmounts.
  *
  * @param value - The rapidly changing input value.
  * @param delay - Quiet period in milliseconds (default 250).
  * @returns The latest value that survived a full `delay` window; on the
  *   first render this is the initial `value`.
  */
 export function useDebounce<T>(value: T, delay = 250): T {
  // T is unconstrained, so `value` may itself be a function. React calls a
  // bare function handed to useState (lazy initializer) or to a state setter
  // (updater), which would store its return value instead of the function.
  // Wrapping in a thunk stores `value` verbatim for every T.
  const [debounced, setDebounced] = useState<T>(() => value);
  useEffect(() => {
    const t = setTimeout(() => setDebounced(() => value), delay);
    return () => clearTimeout(t);
  }, [value, delay]);
  return debounced;
 }
--- a/frontend/src/hooks/useDocuments.ts
+++ b/frontend/src/hooks/useDocuments.ts
@@ -0,0 +1,27 @@
 import { useQuery, keepPreviousData } from "@tanstack/react-query";
 import { getDashboardStats, getDocument, listDocuments, type DocumentListParams } from "@/services/documents";
 /**
  * Query hook for the document list.
  *
  * `params` is part of the query key, so a change in params is a different
  * query; `keepPreviousData` is used as placeholder data while the new one
  * loads. Results are treated as fresh for 20 seconds.
  */
 export function useDocuments(params: DocumentListParams) {
  const freshForMs = 20_000;
  const fetchList = () => listDocuments(params);
  return useQuery({
    queryKey: ["documents", params],
    queryFn: fetchList,
    placeholderData: keepPreviousData,
    staleTime: freshForMs,
  });
 }
 /**
  * Query hook for a single document.
  *
  * The query is disabled while `id` is falsy (undefined or empty string),
  * which is what makes the non-null assertion inside the fetcher safe.
  */
 export function useDocument(id: string | undefined) {
  const hasId = Boolean(id);
  const fetchOne = () => getDocument(id!);
  return useQuery({
    queryKey: ["document", id],
    queryFn: fetchOne,
    enabled: hasId,
  });
 }
 /**
  * Query hook for dashboard statistics, re-fetched every 30 seconds.
  *
  * `getDashboardStats` is passed by reference, so TanStack Query invokes it
  * with its query-function context as the first argument.
  */
 export function useDashboardStats() {
  const pollEveryMs = 30_000;
  return useQuery({
    queryKey: ["dashboard", "stats"],
    queryFn: getDashboardStats,
    refetchInterval: pollEveryMs,
  });
 }
--- a/frontend/src/hooks/useHealth.ts
+++ b/frontend/src/hooks/useHealth.ts
@@ -0,0 +1,19 @@
 import { useQuery } from "@tanstack/react-query";
 import { getHealth, getQueueState } from "@/services/health";
 /**
  * Query hook for service health.
  *
  * Re-fetches every 15 seconds and treats a result as fresh for 10 seconds.
  * `getHealth` is passed by reference, so TanStack Query invokes it with
  * its query-function context as the first argument.
  */
 export function useHealth() {
  const pollEveryMs = 15_000;
  const freshForMs = 10_000;
  return useQuery({
    queryKey: ["health"],
    queryFn: getHealth,
    refetchInterval: pollEveryMs,
    staleTime: freshForMs,
  });
 }
 /**
  * Query hook for the queue state, re-fetched every 10 seconds.
  *
  * `getQueueState` is passed by reference, so TanStack Query invokes it
  * with its query-function context as the first argument.
  */
 export function useQueue() {
  const pollEveryMs = 10_000;
  return useQuery({
    queryKey: ["queue"],
    queryFn: getQueueState,
    refetchInterval: pollEveryMs,
  });
 }
--- a/frontend/src/hooks/useIngestion.ts
+++ b/frontend/src/hooks/useIngestion.ts
@@ -0,0 +1,23 @@
 import { useMutation, useQuery, useQueryClient } from "@tanstack/react-query";
 import { ingestFolder, listRuns } from "@/services/ingestion";
 import type { IngestFolderRequest } from "@/services/types";
 /**
  * Query hook for the list of ingestion runs, re-fetched every 15 seconds.
  *
  * NOTE(review): `listRuns` is passed by reference, so TanStack Query calls
  * it with its query-function context as the first argument — confirm that
  * `listRuns` does not interpret a first parameter.
  */
 export function useIngestionRuns() {
  const pollEveryMs = 15_000;
  return useQuery({
    queryKey: ["ingestion-runs"],
    queryFn: listRuns,
    refetchInterval: pollEveryMs,
  });
 }
 /**
  * Mutation hook that starts a folder ingestion via `ingestFolder`.
  *
  * On success it invalidates the "ingestion-runs", "documents" and
  * "dashboard" query keys (in that order) so dependent views re-fetch. The
  * invalidation promises are not awaited.
  */
 export function useStartIngestion() {
  const queryClient = useQueryClient();
  return useMutation({
    mutationFn: (req: IngestFolderRequest) => ingestFolder(req),
    onSuccess: () => {
      // Key prefixes whose cached data is stale once a run has been started.
      for (const rootKey of ["ingestion-runs", "documents", "dashboard"]) {
        queryClient.invalidateQueries({ queryKey: [rootKey] });
      }
    },
  });
 }
--- a/frontend/src/hooks/useQuality.ts
+++ b/frontend/src/hooks/useQuality.ts
@@ -0,0 +1,10 @@
 import { useQuery } from "@tanstack/react-query";
 import { getQualityQueue, type QualityQueueKind } from "@/services/quality";
 /**
  * Query hook for one quality queue, keyed by `kind`.
  *
  * Results are treated as fresh for 30 seconds.
  */
 export function useQualityQueue(kind: QualityQueueKind) {
  const freshForMs = 30_000;
  const fetchQueue = () => getQualityQueue(kind);
  return useQuery({
    queryKey: ["quality", kind],
    queryFn: fetchQueue,
    staleTime: freshForMs,
  });
 }
--- a/frontend/src/hooks/useSearch.ts
+++ b/frontend/src/hooks/useSearch.ts
@@ -0,0 +1,32 @@
 import { useQuery } from "@tanstack/react-query";
 import { search, suggest } from "@/services/search";
 import type { SearchFilters, SearchMode } from "@/services/types";
 /**
  * Query hook that runs a search.
  *
  * The query key contains the query text, mode, filters and limit. The
  * query only runs when `opts.enabled` is not explicitly `false` AND the
  * trimmed query text is non-empty. Results are treated as fresh for 60
  * seconds.
  */
 export function useSearchResults(opts: {
  query: string;
  mode: SearchMode;
  filters: SearchFilters;
  limit: number;
  enabled?: boolean;
 }) {
  const { query, mode, filters, limit, enabled } = opts;
  const hasText = query.trim().length > 0;
  const runSearch = () =>
    search({
      query,
      limit,
      filters,
      search_mode: mode,
    });
  return useQuery({
    queryKey: ["search", query, mode, filters, limit],
    queryFn: runSearch,
    enabled: enabled !== false && hasText,
    staleTime: 60_000,
  });
 }
 /**
  * Query hook that fetches search suggestions for `query`.
  *
  * The query is disabled while `query` is empty or whitespace-only — the
  * same blank-input guard `useSearchResults` applies — so no request is
  * issued before the user has typed anything. Results are treated as fresh
  * for 30 seconds.
  *
  * @param query - Current (typically debounced) search input.
  */
 export function useSuggestions(query: string) {
  return useQuery({
    queryKey: ["search-suggest", query],
    queryFn: () => suggest(query),
    // Do not hit the suggest endpoint for blank input.
    enabled: query.trim().length > 0,
    staleTime: 30_000,
  });
 }
--- a/frontend/src/hooks/useTheme.ts
+++ b/frontend/src/hooks/useTheme.ts
@@ -0,0 +1,15 @@
 import { useEffect } from "react";
 import { applyTheme, useUiStore } from "@/stores/uiStore";
 /**
  * Applies the theme held in the UI store whenever it changes.
  *
  * While the theme is "system", it also listens for changes of the
  * `(prefers-color-scheme: dark)` media query and re-applies the "system"
  * theme on each change; the listener is removed when the theme changes or
  * the component unmounts.
  */
 export function useThemeBootstrap() {
  const theme = useUiStore((state) => state.theme);
  useEffect(() => {
    applyTheme(theme);
    if (theme === "system") {
      const darkSchemeQuery = window.matchMedia("(prefers-color-scheme: dark)");
      const reapplySystemTheme = () => applyTheme("system");
      darkSchemeQuery.addEventListener("change", reapplySystemTheme);
      return () => darkSchemeQuery.removeEventListener("change", reapplySystemTheme);
    }
    // Explicit themes need no OS-level listener and therefore no cleanup.
    return undefined;
  }, [theme]);
 }
--- a/frontend/src/layouts/AppShell.tsx
+++ b/frontend/src/layouts/AppShell.tsx
@@ -0,0 +1,30 @@
 import { Outlet } from "react-router-dom";
 import { Sidebar } from "@/layouts/Sidebar";
 import { Topbar } from "@/layouts/Topbar";
 import { CommandPalette } from "@/layouts/CommandPalette";
 import { useThemeBootstrap } from "@/hooks/useTheme";
 /**
  * Top-level application layout.
  *
  * Calls `useThemeBootstrap()` and renders, inside a full-height flex
  * container: a fixed, non-interactive gradient backdrop, the `Sidebar`, a
  * column holding the `Topbar` and the routed page (`<Outlet />`) inside
  * `<main>`, and the `CommandPalette`.
  */
 export function AppShell() {
  useThemeBootstrap();
  // Two radial gradients tinted with the theme's --primary colour.
  const backdropStyle = {
    background:
      "radial-gradient(60% 50% at 18% 14%, hsl(var(--primary) / 0.16), transparent 70%), radial-gradient(40% 30% at 90% 0%, hsl(var(--primary) / 0.10), transparent 60%)",
  };
  return (
    <div className="relative flex min-h-screen bg-background">
      {/* Soft ambient backdrop */}
      <div
        aria-hidden
        className="pointer-events-none fixed inset-0 -z-10 opacity-[0.45] dark:opacity-30"
        style={backdropStyle}
      />
      <Sidebar />
      <div className="flex min-w-0 flex-1 flex-col">
        <Topbar />
        <main className="relative flex min-w-0 flex-1 flex-col gap-6 px-4 py-6 lg:px-8 lg:py-8 2xl:px-12">
          <Outlet />
        </main>
      </div>
      <CommandPalette />
    </div>
  );
 }
--- a/Show More
+++ b/Show More
		`@@ -0,0 +1,3 @@`
							`"""LegacyHUB - knowledge indexing and hybrid search over legacy PDF archives."""`

							`__version__ = "0.1.0"`
		`@@ -0,0 +1,3 @@`
							`from app.db.models import Base`

							`__all__ = ["Base"]`
		`@@ -0,0 +1,3 @@`
							`from app.storage.minio_client import MinioStorage, get_storage`

							`__all__ = ["MinioStorage", "get_storage"]`