chore: bootstrap repository with governance docs

Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Vadim Malanov
2026-05-13 16:41:50 +03:00
commit 7f72171572
157 changed files with 11298 additions and 0 deletions

3
app/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
"""LegacyHUB - knowledge indexing and hybrid search over legacy PDF archives."""
__version__ = "0.1.0"

0
app/api/__init__.py Normal file
View File

96
app/api/routes_health.py Normal file
View File

@@ -0,0 +1,96 @@
"""Health endpoint - probes Postgres, MinIO, OpenSearch, Qdrant, Redis."""
from __future__ import annotations
from typing import Any
from fastapi import APIRouter
from sqlalchemy import text
from app import __version__
from app.api.schemas import ComponentHealth, HealthResponse
from app.config import settings
from app.db.session import get_engine
from app.logging_config import get_logger
from app.storage.minio_client import get_storage
logger = get_logger(__name__)
router = APIRouter(tags=["health"])
def _check_postgres() -> ComponentHealth:
    """Probe Postgres by running ``SELECT 1`` on a pooled connection.

    Returns:
        A ``ComponentHealth`` with status ``"ok"`` when the query succeeds,
        otherwise ``"error"`` with the exception text under ``detail["error"]``.
    """
    try:
        with get_engine().connect() as connection:
            connection.execute(text("SELECT 1"))
    except Exception as err:  # noqa: BLE001
        # A health probe must never raise; report the failure instead.
        failure = {"error": str(err)}
        return ComponentHealth(name="postgres", status="error", detail=failure)
    return ComponentHealth(name="postgres", status="ok")
def _check_minio() -> ComponentHealth:
    """Probe MinIO through the storage client's own ``health()`` report.

    Returns:
        A ``ComponentHealth`` whose status is ``"ok"`` only when the storage
        client reports ``status == "ok"``; the raw report is passed through as
        ``detail``. Any exception raised while obtaining the client or calling
        ``health()`` is reported as ``"error"`` with the exception text under
        ``detail["error"]``, matching the other probes in this module.
    """
    try:
        info: dict[str, Any] = get_storage().health()
    except Exception as exc:  # noqa: BLE001
        # Without this guard a storage failure would turn /health into a 500.
        return ComponentHealth(name="minio", status="error", detail={"error": str(exc)})
    if info.get("status") == "ok":
        return ComponentHealth(name="minio", status="ok", detail=info)
    return ComponentHealth(name="minio", status="error", detail=info)
def _check_opensearch() -> ComponentHealth:
    """Probe OpenSearch via the cluster health API.

    Returns:
        ``"ok"`` when the cluster status is green or yellow, ``"degraded"`` for
        any other reported status, and ``"error"`` when the call itself fails.
        ``detail`` carries the cluster status and node count on success.
    """
    try:
        # Imported lazily so a broken client module is reported as a probe error.
        from app.indexing.opensearch_client import get_opensearch

        report = get_opensearch().cluster.health()
        colour = report.get("status")
        if colour in ("green", "yellow"):
            verdict = "ok"
        else:
            verdict = "degraded"
        summary = {"cluster_status": colour, "nodes": report.get("number_of_nodes")}
        return ComponentHealth(
            name="opensearch",
            status=verdict,  # type: ignore[arg-type]
            detail=summary,
        )
    except Exception as err:  # noqa: BLE001
        return ComponentHealth(name="opensearch", status="error", detail={"error": str(err)})
def _check_qdrant() -> ComponentHealth:
    """Probe Qdrant by listing its collections.

    Returns:
        ``"ok"`` with the collection names under ``detail["collections"]`` when
        the listing succeeds, otherwise ``"error"`` with the exception text.
    """
    try:
        # Imported lazily so a broken client module is reported as a probe error.
        from app.indexing.qdrant_client import get_qdrant

        listing = get_qdrant().get_collections()
        names = [collection.name for collection in listing.collections]
        return ComponentHealth(name="qdrant", status="ok", detail={"collections": names})
    except Exception as err:  # noqa: BLE001
        return ComponentHealth(name="qdrant", status="error", detail={"error": str(err)})
def _check_redis() -> ComponentHealth:
    """Probe Redis with a ``PING`` on a short-lived client.

    Both the connect and the read/write phases are bounded to two seconds so a
    stalled Redis cannot hang the health endpoint, and the client is closed
    after the probe so repeated health checks do not accumulate connections.

    Returns:
        ``"ok"`` when the ping succeeds, otherwise ``"error"`` with the
        exception text under ``detail["error"]``.
    """
    try:
        import redis

        client = redis.Redis.from_url(
            settings.redis_url,
            socket_connect_timeout=2,
            socket_timeout=2,
        )
        try:
            client.ping()
        finally:
            # Release the connection created for this probe.
            client.close()
        return ComponentHealth(name="redis", status="ok")
    except Exception as exc:  # noqa: BLE001
        return ComponentHealth(name="redis", status="error", detail={"error": str(exc)})
@router.get("/health", response_model=HealthResponse)
def health() -> HealthResponse:
    """Run every component probe and fold the results into one overall status.

    The overall status is the worst one observed: ``"error"`` beats
    ``"degraded"``, which beats ``"ok"``.
    """
    # Probes run sequentially, in this fixed order.
    probes = (
        _check_postgres,
        _check_minio,
        _check_opensearch,
        _check_qdrant,
        _check_redis,
    )
    components = [probe() for probe in probes]
    observed = {component.status for component in components}
    if "error" in observed:
        overall = "error"
    elif "degraded" in observed:
        overall = "degraded"
    else:
        overall = "ok"
    return HealthResponse(status=overall, version=__version__, components=components)  # type: ignore[arg-type]

View File

@@ -0,0 +1,63 @@
"""Ingestion endpoints."""
from __future__ import annotations
import uuid
from pathlib import Path
from fastapi import APIRouter, HTTPException
from app.api.schemas import IngestFolderRequest, IngestFolderResponse
from app.logging_config import get_logger
logger = get_logger(__name__)
router = APIRouter(prefix="/ingest", tags=["ingestion"])
@router.post("/folder", response_model=IngestFolderResponse)
def ingest_folder(req: IngestFolderRequest) -> IngestFolderResponse:
    """Discover all PDFs under ``path`` and queue them for processing.

    The request returns immediately after the discovery pass. Per-document
    OCR / extraction / indexing happens asynchronously in Celery workers.

    Raises:
        HTTPException: 400 when ``path`` does not exist or is not a directory.
    """
    # NOTE(review): the path is caller-supplied and not confined to a base
    # directory here - confirm this endpoint is only reachable by trusted clients.
    folder = Path(req.path)
    if not (folder.exists() and folder.is_dir()):
        raise HTTPException(status_code=400, detail=f"Folder not found: {req.path}")

    # Lazy import - keeps module load light.
    from app.ingestion.scanner import discover_documents
    from app.workers.tasks import process_document

    run_id = uuid.uuid4()
    run_label = str(run_id)
    discovered = queued = dups = invalid = 0

    records = discover_documents(folder, recursive=req.recursive, force=req.force)
    for record in records:
        discovered += 1
        if record.duplicate and not req.force:
            # Already known content; only re-queued when force is set.
            dups += 1
        elif not record.document_id:
            # No document id to hand to a worker, so count it as invalid.
            invalid += 1
        else:
            process_document.delay(str(record.document_id), run_label)
            queued += 1

    logger.info(
        "ingest.folder.queued",
        path=str(folder),
        discovered=discovered,
        queued=queued,
        skipped_duplicates=dups,
        invalid=invalid,
        run_id=run_label,
    )
    return IngestFolderResponse(
        run_id=run_id,
        discovered=discovered,
        queued=queued,
        skipped_duplicates=dups,
        invalid_files=invalid,
    )

16
app/api/routes_search.py Normal file
View File

@@ -0,0 +1,16 @@
"""Search endpoint - lexical / semantic / hybrid."""
from __future__ import annotations
from fastapi import APIRouter
from app.api.schemas import SearchRequest, SearchResponse
router = APIRouter(prefix="/search", tags=["search"])
@router.post("", response_model=SearchResponse)
def search(req: SearchRequest) -> SearchResponse:
    """Execute a lexical, semantic, or hybrid search and return the ranked hits."""
    # Imported at call time rather than module load; the search stack is
    # presumably heavy to import - confirm against app.indexing.hybrid_search.
    from app.indexing.hybrid_search import run_search as execute_search

    response = execute_search(req)
    return response

99
app/api/schemas.py Normal file
View File

@@ -0,0 +1,99 @@
"""Pydantic request/response schemas for the LegacyHUB API."""
from __future__ import annotations
import uuid
from datetime import datetime
from typing import Any, Literal
from pydantic import BaseModel, Field
# ---------------- Health ----------------
class ComponentHealth(BaseModel):
    # Result of probing one backing service (e.g. "postgres", "minio", "redis").
    name: str
    # "degraded" marks a service that answered but is not fully healthy.
    status: Literal["ok", "error", "degraded"]
    # Free-form diagnostics: error text on failure, service info on success.
    detail: dict[str, Any] = Field(default_factory=dict)
class HealthResponse(BaseModel):
    # Body of the /health endpoint.
    # Overall status: the worst status found among the components.
    status: Literal["ok", "error", "degraded"]
    # Application version string (app.__version__).
    version: str
    # One entry per probed backing service.
    components: list[ComponentHealth]
# ---------------- Ingestion ----------------
class IngestFolderRequest(BaseModel):
    # Request body for POST /ingest/folder.
    path: str = Field(..., description="Absolute path inside the API container")
    # Forwarded to the document discovery pass.
    recursive: bool = True
    # When true, documents flagged as duplicates are queued instead of skipped.
    force: bool = False
class IngestFolderResponse(BaseModel):
    # Summary of one discovery pass returned by POST /ingest/folder.
    # Identifier generated for this request and passed to every queued task.
    run_id: uuid.UUID
    # Total records yielded by discovery (queued + skipped + invalid).
    discovered: int
    # Records handed to the background worker.
    queued: int
    # Records skipped because they were flagged as duplicates and force was off.
    skipped_duplicates: int
    # Records that had no document id and therefore could not be queued.
    invalid_files: int
class DocumentSummary(BaseModel):
    # Compact view of a document record.
    # NOTE(review): field names mirror columns of the documents table - confirm
    # against the endpoint that returns this model (not visible here).
    id: uuid.UUID
    original_file_name: str
    source_path: str
    sha256: str
    status: str
    file_size_bytes: int
    created_at: datetime
# ---------------- Search ----------------
SearchMode = Literal["lexical", "semantic", "hybrid"]
class SearchFilters(BaseModel):
    # Optional narrowing criteria for a search; every field defaults to None.
    document_id: uuid.UUID | None = None
    source_path: str | None = None
    block_type: str | None = None
    # Validated to lie within [0.0, 1.0] when provided.
    min_ocr_confidence: float | None = Field(None, ge=0.0, le=1.0)
class SearchRequest(BaseModel):
    # Request body for POST /search.
    # Must contain at least one character.
    query: str = Field(..., min_length=1)
    # Number of results requested; validated to 1..100, default 10.
    limit: int = Field(10, ge=1, le=100)
    # Defaults to an empty filter set.
    filters: SearchFilters = Field(default_factory=SearchFilters)
    # One of "lexical", "semantic", "hybrid"; hybrid unless stated otherwise.
    search_mode: SearchMode = "hybrid"
class Citation(BaseModel):
    # Pointer back into a source PDF for a search hit.
    # NOTE(review): whether `pdf` holds a file name or a path is not visible
    # here - confirm against the code that builds citations.
    pdf: str
    page: int
    # Optional finer-grained locators; each is None when not applicable.
    block_id: str | None = None
    table_id: str | None = None
    figure_id: str | None = None
class SearchHit(BaseModel):
    # A single ranked search result.
    rank: int
    score: float
    document_id: uuid.UUID
    chunk_id: uuid.UUID
    original_file_name: str
    source_path: str
    page_number: int
    block_type: str
    text: str
    citation: Citation
    # Both default to an empty dict when the producer supplies nothing.
    quality_flags: dict[str, Any] = Field(default_factory=dict)
    metadata: dict[str, Any] = Field(default_factory=dict)
class SearchResponse(BaseModel):
    # Response body for POST /search.
    query: str
    # One of "lexical", "semantic", "hybrid".
    mode: SearchMode
    total_candidates: int
    # NOTE(review): presumably true when a reranker reordered the candidates -
    # confirm against the search implementation.
    reranked: bool
    results: list[SearchHit]

111
app/config.py Normal file
View File

@@ -0,0 +1,111 @@
"""Centralized typed configuration loaded from environment variables.
All other modules import :data:`settings` and never touch ``os.environ`` directly.
"""
from __future__ import annotations
from functools import lru_cache
from typing import Literal
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Typed application configuration read from the environment and ``.env``.

    Every field is populated from the environment variable named by its
    ``alias``; matching is case-insensitive and unknown variables are ignored.
    The defaults are development values; production deployments are expected
    to override credentials through the environment.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # ---------------- App ----------------
    app_log_level: str = Field("INFO", alias="APP_LOG_LEVEL")
    app_host: str = Field("0.0.0.0", alias="APP_HOST")
    app_port: int = Field(8000, alias="APP_PORT")
    app_input_dir: str = Field("/data/input", alias="APP_INPUT_DIR")
    app_work_dir: str = Field("/data/work", alias="APP_WORK_DIR")
    app_api_prefix: str = Field("/api/v1", alias="APP_API_PREFIX")

    # ---------------- Postgres ----------------
    postgres_host: str = Field("postgres", alias="POSTGRES_HOST")
    postgres_port: int = Field(5432, alias="POSTGRES_PORT")
    postgres_db: str = Field("legacyhub", alias="POSTGRES_DB")
    postgres_user: str = Field("legacyhub", alias="POSTGRES_USER")
    postgres_password: str = Field("legacyhub", alias="POSTGRES_PASSWORD")

    @property
    def database_url(self) -> str:
        """Return the SQLAlchemy URL for the configured Postgres database.

        The user name and password are percent-encoded so that credentials
        containing URL-reserved characters (``@``, ``:``, ``/``, ``%``, ...)
        still yield a parseable URL. Values made only of unreserved characters
        are emitted unchanged. Callers that feed the result through
        ConfigParser-style interpolation must escape ``%`` as ``%%``.
        """
        # Local import keeps this fix self-contained within the class.
        from urllib.parse import quote

        user = quote(self.postgres_user, safe="")
        password = quote(self.postgres_password, safe="")
        return (
            f"postgresql+psycopg://{user}:{password}"
            f"@{self.postgres_host}:{self.postgres_port}/{self.postgres_db}"
        )

    # ---------------- MinIO ----------------
    minio_endpoint: str = Field("minio:9000", alias="MINIO_ENDPOINT")
    minio_access_key: str = Field("legacyhub", alias="MINIO_ACCESS_KEY")
    minio_secret_key: str = Field("legacyhub-secret", alias="MINIO_SECRET_KEY")
    minio_bucket_originals: str = Field("legacyhub-originals", alias="MINIO_BUCKET_ORIGINALS")
    minio_bucket_derived: str = Field("legacyhub-derived", alias="MINIO_BUCKET_DERIVED")
    minio_secure: bool = Field(False, alias="MINIO_SECURE")
    minio_region: str = Field("us-east-1", alias="MINIO_REGION")

    # ---------------- OpenSearch ----------------
    opensearch_host: str = Field("opensearch", alias="OPENSEARCH_HOST")
    opensearch_port: int = Field(9200, alias="OPENSEARCH_PORT")
    opensearch_use_ssl: bool = Field(False, alias="OPENSEARCH_USE_SSL")
    opensearch_verify_certs: bool = Field(False, alias="OPENSEARCH_VERIFY_CERTS")
    opensearch_user: str = Field("", alias="OPENSEARCH_USER")
    opensearch_password: str = Field("", alias="OPENSEARCH_PASSWORD")
    opensearch_index_chunks: str = Field("legacy_chunks", alias="OPENSEARCH_INDEX_CHUNKS")

    # ---------------- Qdrant ----------------
    qdrant_host: str = Field("qdrant", alias="QDRANT_HOST")
    qdrant_port: int = Field(6333, alias="QDRANT_PORT")
    qdrant_api_key: str = Field("", alias="QDRANT_API_KEY")
    qdrant_collection_chunks: str = Field("legacy_chunks", alias="QDRANT_COLLECTION_CHUNKS")

    # ---------------- Redis ----------------
    redis_url: str = Field("redis://redis:6379/0", alias="REDIS_URL")

    # ---------------- OCR ----------------
    ocr_languages: str = Field("rus+eng", alias="OCR_LANGUAGES")
    ocr_enabled: bool = Field(True, alias="OCR_ENABLED")
    docling_ocr_enabled: bool = Field(False, alias="DOCLING_OCR_ENABLED")
    max_document_timeout_seconds: int = Field(180, alias="MAX_DOCUMENT_TIMEOUT_SECONDS")
    ocr_deskew: bool = Field(True, alias="OCR_DESKEW")
    ocr_clean: bool = Field(True, alias="OCR_CLEAN")
    ocr_optimize: int = Field(1, alias="OCR_OPTIMIZE")

    # ---------------- Embeddings / Reranker ----------------
    embedding_model: str = Field("BAAI/bge-m3", alias="EMBEDDING_MODEL")
    embedding_dim: int = Field(1024, alias="EMBEDDING_DIM")
    embedding_device: Literal["cpu", "cuda", "mps"] = Field("cpu", alias="EMBEDDING_DEVICE")
    embedding_batch_size: int = Field(8, alias="EMBEDDING_BATCH_SIZE")
    embedding_normalize: bool = Field(True, alias="EMBEDDING_NORMALIZE")
    reranker_model: str = Field("BAAI/bge-reranker-v2-m3", alias="RERANKER_MODEL")
    reranker_device: Literal["cpu", "cuda", "mps"] = Field("cpu", alias="RERANKER_DEVICE")
    reranker_enabled: bool = Field(True, alias="RERANKER_ENABLED")
    reranker_batch_size: int = Field(8, alias="RERANKER_BATCH_SIZE")

    # ---------------- Chunking ----------------
    chunk_target_tokens: int = Field(700, alias="CHUNK_TARGET_TOKENS")
    chunk_min_tokens: int = Field(120, alias="CHUNK_MIN_TOKENS")
    chunk_max_tokens: int = Field(900, alias="CHUNK_MAX_TOKENS")
    chunk_overlap_tokens: int = Field(100, alias="CHUNK_OVERLAP_TOKENS")

    # ---------------- Hybrid search ----------------
    hybrid_opensearch_top_k: int = Field(50, alias="HYBRID_OPENSEARCH_TOP_K")
    hybrid_qdrant_top_k: int = Field(50, alias="HYBRID_QDRANT_TOP_K")
    hybrid_rrf_k: int = Field(60, alias="HYBRID_RRF_K")
    rerank_candidates: int = Field(40, alias="RERANK_CANDIDATES")
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Build the ``Settings`` object once and return the cached instance afterwards."""
    loaded = Settings()  # type: ignore[call-arg]
    return loaded


# Module-level instance that other modules import as ``app.config.settings``.
settings = get_settings()

3
app/db/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
from app.db.models import Base
__all__ = ["Base"]

55
app/db/migrations/env.py Normal file
View File

@@ -0,0 +1,55 @@
"""Alembic environment - online & offline migrations using app config."""
from __future__ import annotations
from logging.config import fileConfig
from alembic import context
from sqlalchemy import engine_from_config, pool
from app.config import settings
from app.db.models import Base
# Alembic Config object giving access to the values of the .ini file in use.
config = context.config

# Alembic stores options through ConfigParser, which treats "%" as
# interpolation syntax. A database URL containing "%" (for example from
# percent-encoded credentials) must therefore have it doubled, otherwise
# set_main_option raises on the invalid interpolation.
config.set_main_option("sqlalchemy.url", settings.database_url.replace("%", "%%"))

# Configure Python logging from the .ini file when one is in use.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

# Metadata that autogenerate compares the live database against.
target_metadata = Base.metadata
def run_migrations_offline() -> None:
    """Run migrations in offline mode.

    The context is configured with the database URL only (no connection is
    passed) and with literal parameter binding enabled.
    """
    offline_options = {
        "url": settings.database_url,
        "target_metadata": target_metadata,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
        "compare_type": True,
    }
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()
def run_migrations_online() -> None:
    """Run migrations against a live database connection.

    The engine is built from the Alembic ini section with the URL overridden
    by the application settings, and uses ``NullPool``.
    """
    ini_section = config.get_section(config.config_ini_section, {})
    # The application settings are the single source of truth for the URL.
    ini_section["sqlalchemy.url"] = settings.database_url

    engine = engine_from_config(ini_section, prefix="sqlalchemy.", poolclass=pool.NullPool)

    with engine.connect() as live_connection:
        context.configure(
            connection=live_connection,
            target_metadata=target_metadata,
            compare_type=True,
        )
        with context.begin_transaction():
            context.run_migrations()
# Runs on import of this module: pick the migration mode from the Alembic context.
if not context.is_offline_mode():
    run_migrations_online()
else:
    run_migrations_offline()

View File

@@ -0,0 +1,27 @@
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from __future__ import annotations
from collections.abc import Sequence
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
revision: str = ${repr(up_revision)}
down_revision: str | None = ${repr(down_revision)}
branch_labels: str | Sequence[str] | None = ${repr(branch_labels)}
depends_on: str | Sequence[str] | None = ${repr(depends_on)}
def upgrade() -> None:
${upgrades if upgrades else "pass"}
def downgrade() -> None:
${downgrades if downgrades else "pass"}

View File

@@ -0,0 +1,171 @@
"""initial schema
Revision ID: 0001_initial
Revises:
Create Date: 2026-05-10
"""
from __future__ import annotations
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
revision: str = "0001_initial"
down_revision: str | None = None
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Create the initial LegacyHUB schema.

    Tables are created parents-first so foreign keys resolve: ``documents``,
    then ``document_artifacts`` and ``pages``, then ``chunks``, ``tables`` and
    ``figures`` (which reference both), and finally the standalone
    ``ingestion_runs`` and ``processing_events`` tables.
    """
    # ---- documents ----
    op.create_table(
        "documents",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("source_path", sa.Text, nullable=False),
        sa.Column("original_file_name", sa.Text, nullable=False),
        # Uniqueness is enforced by the unique index created below.
        sa.Column("sha256", sa.String(64), nullable=False),
        sa.Column("file_size_bytes", sa.BigInteger, nullable=False),
        sa.Column("mime_type", sa.Text, nullable=False, server_default="application/pdf"),
        sa.Column("language_hint", sa.Text, nullable=True),
        sa.Column("status", sa.String(64), nullable=False, server_default="DISCOVERED"),
        sa.Column("error_message", sa.Text, nullable=True),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
        sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
    )
    op.create_index("ix_documents_status", "documents", ["status"])
    # One unique index instead of a unique constraint plus a second plain index
    # on the same column. This is also what the ORM column (unique=True,
    # index=True) describes, so autogenerate sees no drift.
    op.create_index("ix_documents_sha256", "documents", ["sha256"], unique=True)

    # ---- document_artifacts ----
    op.create_table(
        "document_artifacts",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("document_id", postgresql.UUID(as_uuid=True),
                  sa.ForeignKey("documents.id", ondelete="CASCADE"), nullable=False),
        sa.Column("artifact_type", sa.String(64), nullable=False),
        sa.Column("storage_bucket", sa.Text, nullable=False),
        sa.Column("storage_key", sa.Text, nullable=False),
        sa.Column("page_number", sa.Integer, nullable=True),
        sa.Column("checksum", sa.String(64), nullable=True),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
    )
    op.create_index("ix_artifacts_doc_type", "document_artifacts", ["document_id", "artifact_type"])

    # ---- pages ----
    op.create_table(
        "pages",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("document_id", postgresql.UUID(as_uuid=True),
                  sa.ForeignKey("documents.id", ondelete="CASCADE"), nullable=False),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("text", sa.Text, nullable=False, server_default=""),
        sa.Column("ocr_confidence", sa.Float, nullable=True),
        sa.Column("has_tables", sa.Boolean, nullable=False, server_default=sa.false()),
        sa.Column("has_figures", sa.Boolean, nullable=False, server_default=sa.false()),
        sa.Column("has_handwriting", sa.Boolean, nullable=False, server_default=sa.false()),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
        sa.UniqueConstraint("document_id", "page_number", name="uq_pages_doc_page"),
    )

    # ---- chunks ----
    # page_id uses SET NULL so a chunk survives deletion of its page row.
    op.create_table(
        "chunks",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("document_id", postgresql.UUID(as_uuid=True),
                  sa.ForeignKey("documents.id", ondelete="CASCADE"), nullable=False),
        sa.Column("page_id", postgresql.UUID(as_uuid=True),
                  sa.ForeignKey("pages.id", ondelete="SET NULL"), nullable=True),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("block_id", sa.Text, nullable=True),
        sa.Column("chunk_index", sa.Integer, nullable=False),
        sa.Column("block_type", sa.String(32), nullable=False, server_default="paragraph"),
        sa.Column("text", sa.Text, nullable=False),
        sa.Column("normalized_text", sa.Text, nullable=False, server_default=""),
        sa.Column("token_count", sa.Integer, nullable=True),
        sa.Column("ocr_confidence", sa.Float, nullable=True),
        sa.Column("quality_flags", postgresql.JSONB, nullable=False, server_default=sa.text("'{}'::jsonb")),
        sa.Column("metadata", postgresql.JSONB, nullable=False, server_default=sa.text("'{}'::jsonb")),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
        sa.UniqueConstraint("document_id", "chunk_index", name="uq_chunks_doc_idx"),
    )
    op.create_index("ix_chunks_doc_page", "chunks", ["document_id", "page_number"])
    op.create_index("ix_chunks_block_type", "chunks", ["block_type"])

    # ---- tables ----
    op.create_table(
        "tables",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("document_id", postgresql.UUID(as_uuid=True),
                  sa.ForeignKey("documents.id", ondelete="CASCADE"), nullable=False),
        sa.Column("page_id", postgresql.UUID(as_uuid=True),
                  sa.ForeignKey("pages.id", ondelete="SET NULL"), nullable=True),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("table_index", sa.Integer, nullable=False),
        sa.Column("markdown", sa.Text, nullable=False, server_default=""),
        sa.Column("csv_text", sa.Text, nullable=True),
        sa.Column("json_data", postgresql.JSONB, nullable=True),
        sa.Column("summary", sa.Text, nullable=True),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
        sa.UniqueConstraint("document_id", "table_index", name="uq_tables_doc_idx"),
    )

    # ---- figures ----
    op.create_table(
        "figures",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("document_id", postgresql.UUID(as_uuid=True),
                  sa.ForeignKey("documents.id", ondelete="CASCADE"), nullable=False),
        sa.Column("page_id", postgresql.UUID(as_uuid=True),
                  sa.ForeignKey("pages.id", ondelete="SET NULL"), nullable=True),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("figure_index", sa.Integer, nullable=False),
        sa.Column("caption", sa.Text, nullable=True),
        sa.Column("description", sa.Text, nullable=True),
        sa.Column("storage_bucket", sa.Text, nullable=True),
        sa.Column("storage_key", sa.Text, nullable=True),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
        sa.UniqueConstraint("document_id", "figure_index", name="uq_figures_doc_idx"),
    )

    # ---- ingestion_runs ----
    op.create_table(
        "ingestion_runs",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("started_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
        sa.Column("finished_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column("status", sa.String(32), nullable=False, server_default="RUNNING"),
        sa.Column("source_folder", sa.Text, nullable=False),
        sa.Column("total_files", sa.Integer, nullable=False, server_default="0"),
        sa.Column("processed_files", sa.Integer, nullable=False, server_default="0"),
        sa.Column("failed_files", sa.Integer, nullable=False, server_default="0"),
        sa.Column("metadata", postgresql.JSONB, nullable=False, server_default=sa.text("'{}'::jsonb")),
    )

    # ---- processing_events ----
    # run_id and document_id are plain UUID columns here, without foreign keys.
    op.create_table(
        "processing_events",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("run_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("document_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("stage", sa.String(64), nullable=False),
        sa.Column("level", sa.String(16), nullable=False, server_default="INFO"),
        sa.Column("message", sa.Text, nullable=False),
        sa.Column("data", postgresql.JSONB, nullable=False, server_default=sa.text("'{}'::jsonb")),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
    )
    op.create_index("ix_events_doc", "processing_events", ["document_id"])
    op.create_index("ix_events_run", "processing_events", ["run_id"])
    op.create_index("ix_events_stage", "processing_events", ["stage"])
def downgrade() -> None:
    """Drop every table and index of the initial schema.

    Tables are removed children-first (the reverse of creation) and each
    table's explicitly created indexes are dropped just before the table.
    """
    # (table name, indexes to drop before the table), in teardown order.
    teardown_plan = (
        ("processing_events", ("ix_events_stage", "ix_events_run", "ix_events_doc")),
        ("ingestion_runs", ()),
        ("figures", ()),
        ("tables", ()),
        ("chunks", ("ix_chunks_block_type", "ix_chunks_doc_page")),
        ("pages", ()),
        ("document_artifacts", ("ix_artifacts_doc_type",)),
        ("documents", ("ix_documents_sha256", "ix_documents_status")),
    )
    for table, index_names in teardown_plan:
        for index_name in index_names:
            op.drop_index(index_name, table_name=table)
        op.drop_table(table)

266
app/db/models.py Normal file
View File

@@ -0,0 +1,266 @@
"""SQLAlchemy ORM models for LegacyHUB."""
from __future__ import annotations
import uuid
from datetime import datetime
from typing import Any
from sqlalchemy import (
BigInteger,
Boolean,
DateTime,
Float,
ForeignKey,
Index,
Integer,
String,
Text,
UniqueConstraint,
func,
)
from sqlalchemy.dialects.postgresql import JSONB, UUID
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
class Base(DeclarativeBase):
    """Declarative base shared by every ORM model in this module."""

    pass
# ---- Status / type literals (kept as plain strings to avoid PG enum churn) ----
class DocumentStatus:
    """String constants for ``Document.status``.

    Each constant's value equals its name. ``DISCOVERED`` is the default
    status of a new ``Document``.
    """

    DISCOVERED = "DISCOVERED"
    STORED_ORIGINAL = "STORED_ORIGINAL"
    OCR_STARTED = "OCR_STARTED"
    OCR_COMPLETED = "OCR_COMPLETED"
    OCR_FAILED = "OCR_FAILED"
    EXTRACTION_STARTED = "EXTRACTION_STARTED"
    EXTRACTION_COMPLETED = "EXTRACTION_COMPLETED"
    EXTRACTION_FAILED = "EXTRACTION_FAILED"
    CHUNKING_COMPLETED = "CHUNKING_COMPLETED"
    INDEXING_COMPLETED = "INDEXING_COMPLETED"
    FAILED = "FAILED"
class ArtifactType:
    """String constants naming the kinds of stored artifacts.

    NOTE(review): presumably the allowed values of
    ``DocumentArtifact.artifact_type`` - confirm against the code that writes
    artifacts.
    """

    ORIGINAL_PDF = "original_pdf"
    OCR_PDF = "ocr_pdf"
    DOCLING_JSON = "docling_json"
    MARKDOWN = "markdown"
    PAGE_IMAGE = "page_image"
    FIGURE_CROP = "figure_crop"
    TABLE_JSON = "table_json"
class BlockType:
    """String constants for ``Chunk.block_type``.

    ``PARAGRAPH`` is the default block type of a new ``Chunk``.
    """

    TITLE = "title"
    HEADING = "heading"
    PARAGRAPH = "paragraph"
    LIST = "list"
    TABLE = "table"
    FIGURE_CAPTION = "figure_caption"
    FIGURE_DESCRIPTION = "figure_description"
    HANDWRITING = "handwriting"
    UNKNOWN = "unknown"
# ---- Tables ----
class Document(Base):
    """A source file tracked by the system, unique by SHA-256 content hash.

    ``status`` defaults to ``DocumentStatus.DISCOVERED``. Artifacts, pages and
    chunks are owned by the document: the ORM relationships cascade deletes
    and remove orphans.
    """

    __tablename__ = "documents"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    source_path: Mapped[str] = mapped_column(Text, nullable=False)
    original_file_name: Mapped[str] = mapped_column(Text, nullable=False)
    # unique=True together with index=True yields a single unique index.
    sha256: Mapped[str] = mapped_column(String(64), nullable=False, unique=True, index=True)
    file_size_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False)
    mime_type: Mapped[str] = mapped_column(Text, nullable=False, default="application/pdf")
    language_hint: Mapped[str | None] = mapped_column(Text, nullable=True)
    status: Mapped[str] = mapped_column(
        String(64), nullable=False, default=DocumentStatus.DISCOVERED, index=True
    )
    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
    # Refreshed with now() on every ORM-issued UPDATE of the row.
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False
    )

    artifacts: Mapped[list[DocumentArtifact]] = relationship(
        back_populates="document", cascade="all, delete-orphan"
    )
    pages: Mapped[list[Page]] = relationship(
        back_populates="document", cascade="all, delete-orphan"
    )
    chunks: Mapped[list[Chunk]] = relationship(
        back_populates="document", cascade="all, delete-orphan"
    )
class DocumentArtifact(Base):
    """A stored object belonging to a document, located by bucket and key.

    Rows are deleted with their parent document (``ON DELETE CASCADE``).
    """

    __tablename__ = "document_artifacts"
    __table_args__ = (
        # Composite index over (document_id, artifact_type).
        Index("ix_artifacts_doc_type", "document_id", "artifact_type"),
    )

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False
    )
    artifact_type: Mapped[str] = mapped_column(String(64), nullable=False)
    storage_bucket: Mapped[str] = mapped_column(Text, nullable=False)
    storage_key: Mapped[str] = mapped_column(Text, nullable=False)
    # Nullable: None when the artifact is not tied to one page.
    page_number: Mapped[int | None] = mapped_column(Integer, nullable=True)
    checksum: Mapped[str | None] = mapped_column(String(64), nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )

    document: Mapped[Document] = relationship(back_populates="artifacts")
class Page(Base):
    """One page of a document, with its text and per-page content flags.

    ``(document_id, page_number)`` is unique. Rows are deleted with their
    parent document (``ON DELETE CASCADE``).
    """

    __tablename__ = "pages"
    __table_args__ = (
        UniqueConstraint("document_id", "page_number", name="uq_pages_doc_page"),
    )

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False
    )
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    text: Mapped[str] = mapped_column(Text, nullable=False, default="")
    ocr_confidence: Mapped[float | None] = mapped_column(Float, nullable=True)
    has_tables: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    has_figures: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    has_handwriting: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )

    document: Mapped[Document] = relationship(back_populates="pages")
    # No delete cascade here: the chunk foreign key uses ON DELETE SET NULL.
    chunks: Mapped[list[Chunk]] = relationship(back_populates="page")
class Chunk(Base):
    """A piece of document text with its position, type and quality data.

    ``(document_id, chunk_index)`` is unique. Rows are deleted with their
    parent document; ``page_id`` is set to NULL if the page row is deleted.
    """

    __tablename__ = "chunks"
    __table_args__ = (
        UniqueConstraint("document_id", "chunk_index", name="uq_chunks_doc_idx"),
        Index("ix_chunks_doc_page", "document_id", "page_number"),
        Index("ix_chunks_block_type", "block_type"),
    )

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False
    )
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True), ForeignKey("pages.id", ondelete="SET NULL"), nullable=True
    )
    # Stored on the chunk itself, so it stays available when page_id is NULL.
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    block_id: Mapped[str | None] = mapped_column(Text, nullable=True)
    chunk_index: Mapped[int] = mapped_column(Integer, nullable=False)
    block_type: Mapped[str] = mapped_column(String(32), nullable=False, default=BlockType.PARAGRAPH)
    text: Mapped[str] = mapped_column(Text, nullable=False)
    normalized_text: Mapped[str] = mapped_column(Text, nullable=False, default="")
    token_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    ocr_confidence: Mapped[float | None] = mapped_column(Float, nullable=True)
    quality_flags: Mapped[dict[str, Any]] = mapped_column(JSONB, nullable=False, default=dict)
    # The column is named "metadata"; the attribute is not, because
    # ``metadata`` is reserved on SQLAlchemy declarative classes.
    chunk_metadata: Mapped[dict[str, Any]] = mapped_column(
        "metadata", JSONB, nullable=False, default=dict
    )
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )

    document: Mapped[Document] = relationship(back_populates="chunks")
    page: Mapped[Page | None] = relationship(back_populates="chunks")
class Table(Base):
    """A table found in a document, stored as markdown plus optional CSV/JSON.

    ``(document_id, table_index)`` is unique. Rows are deleted with their
    parent document; ``page_id`` is set to NULL if the page row is deleted.
    """

    __tablename__ = "tables"
    __table_args__ = (
        UniqueConstraint("document_id", "table_index", name="uq_tables_doc_idx"),
    )

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False
    )
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True), ForeignKey("pages.id", ondelete="SET NULL"), nullable=True
    )
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    table_index: Mapped[int] = mapped_column(Integer, nullable=False)
    markdown: Mapped[str] = mapped_column(Text, nullable=False, default="")
    csv_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    json_data: Mapped[dict[str, Any] | None] = mapped_column(JSONB, nullable=True)
    summary: Mapped[str | None] = mapped_column(Text, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
class Figure(Base):
    """A figure found in a document, with optional caption, description and storage location.

    ``(document_id, figure_index)`` is unique. Rows are deleted with their
    parent document; ``page_id`` is set to NULL if the page row is deleted.
    """

    __tablename__ = "figures"
    __table_args__ = (
        UniqueConstraint("document_id", "figure_index", name="uq_figures_doc_idx"),
    )

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False
    )
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True), ForeignKey("pages.id", ondelete="SET NULL"), nullable=True
    )
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    figure_index: Mapped[int] = mapped_column(Integer, nullable=False)
    caption: Mapped[str | None] = mapped_column(Text, nullable=True)
    description: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Both nullable: a figure row may exist without a stored object.
    storage_bucket: Mapped[str | None] = mapped_column(Text, nullable=True)
    storage_key: Mapped[str | None] = mapped_column(Text, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
class IngestionRun(Base):
    """Bookkeeping row for one ingestion run over a source folder.

    ``status`` defaults to ``"RUNNING"`` and the file counters default to 0.
    """

    __tablename__ = "ingestion_runs"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    started_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
    # Nullable: no value is stored until one is set explicitly.
    finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
    status: Mapped[str] = mapped_column(String(32), nullable=False, default="RUNNING")
    source_folder: Mapped[str] = mapped_column(Text, nullable=False)
    total_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    processed_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    failed_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    # The column is named "metadata"; the attribute is not, because
    # ``metadata`` is reserved on SQLAlchemy declarative classes.
    run_metadata: Mapped[dict[str, Any]] = mapped_column(
        "metadata", JSONB, nullable=False, default=dict
    )
class ProcessingEvent(Base):
    """ORM mapping for the ``processing_events`` table: one row per logged event."""

    __tablename__ = "processing_events"
    __table_args__ = (
        # Lookup indexes for filtering events by document, run or stage.
        Index("ix_events_doc", "document_id"),
        Index("ix_events_run", "run_id"),
        Index("ix_events_stage", "stage"),
    )
    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # NOTE(review): run_id and document_id carry no ForeignKey here, so the database
    # does not enforce that they reference existing rows - confirm this is intended.
    run_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), nullable=True)
    document_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), nullable=True)
    stage: Mapped[str] = mapped_column(String(64), nullable=False)
    # Severity label; defaults to "INFO".
    level: Mapped[str] = mapped_column(String(16), nullable=False, default="INFO")
    message: Mapped[str] = mapped_column(Text, nullable=False)
    # Free-form JSON payload; defaults to an empty dict.
    data: Mapped[dict[str, Any]] = mapped_column(JSONB, nullable=False, default=dict)
    # Timestamp is filled in by the database (server-side now()).
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )

66
app/db/session.py Normal file
View File

@@ -0,0 +1,66 @@
"""SQLAlchemy engine and session factory."""
from __future__ import annotations
from collections.abc import Iterator
from contextlib import contextmanager
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine
from sqlalchemy.orm import Session, sessionmaker
from app.config import settings
_engine: Engine | None = None
_SessionFactory: sessionmaker[Session] | None = None
def get_engine() -> Engine:
    """Return the module-wide SQLAlchemy engine, building it on first call.

    The engine is created from ``settings.database_url`` and cached in the
    module-level ``_engine`` variable, so later calls reuse the same object.
    """
    global _engine
    if _engine is not None:
        return _engine
    engine_options = {
        "pool_pre_ping": True,
        "pool_size": 10,
        "max_overflow": 20,
        "future": True,
    }
    _engine = create_engine(settings.database_url, **engine_options)
    return _engine
def get_session_factory() -> sessionmaker[Session]:
    """Return the module-wide ``sessionmaker``, building it on first call.

    The factory is bound to ``get_engine()`` and cached in ``_SessionFactory``.
    Sessions it creates do not autoflush and keep attribute values loaded
    after commit (``expire_on_commit=False``).
    """
    global _SessionFactory
    if _SessionFactory is not None:
        return _SessionFactory
    factory_options = {
        "autoflush": False,
        "autocommit": False,
        "expire_on_commit": False,
        "future": True,
    }
    _SessionFactory = sessionmaker(bind=get_engine(), **factory_options)
    return _SessionFactory
@contextmanager
def session_scope() -> Iterator[Session]:
    """Yield a session inside a transaction.

    The transaction is committed when the ``with`` body finishes normally and
    rolled back (with the exception re-raised) when the body or the commit
    fails. The session is closed in every case.
    """
    db = get_session_factory()()
    try:
        yield db
        # Commit stays inside the ``try`` so a failing commit is rolled back too.
        db.commit()
    except Exception:
        db.rollback()
        raise
    finally:
        db.close()
def get_db() -> Iterator[Session]:
    """FastAPI dependency: yield a session and always close it afterwards.

    Unlike ``session_scope`` this generator neither commits nor rolls back.
    """
    db = get_session_factory()()
    try:
        yield db
    finally:
        db.close()

0
app/indexing/__init__.py Normal file
View File

View File

@@ -0,0 +1,90 @@
"""BGE-M3 dense embedder with batching and CPU/GPU support.
We prefer FlagEmbedding's ``BGEM3FlagModel`` because it is the canonical
implementation and supports dense + sparse output. We fall back to
``sentence-transformers`` for portability.
"""
from __future__ import annotations
from functools import lru_cache
from typing import Sequence
import numpy as np
from app.config import settings
from app.logging_config import get_logger
logger = get_logger(__name__)
class Embedder:
    """Dense text embedder using FlagEmbedding, with a sentence-transformers fallback.

    The backend is chosen once in ``_load``; ``_impl`` records which one is
    active and ``encode`` dispatches on it.
    """

    def __init__(self, model_name: str, device: str, normalize: bool, batch_size: int) -> None:
        """Store the configuration and load the model immediately."""
        self.model_name = model_name
        self.device = device
        self.normalize = normalize
        self.batch_size = batch_size
        self._impl = "flagembedding"
        self._model = None
        self._st_model = None
        self._load()

    def _load(self) -> None:
        """Load ``BGEM3FlagModel``; on any failure fall back to ``SentenceTransformer``.

        A failure of the fallback itself is not caught and propagates.
        """
        try:
            from FlagEmbedding import BGEM3FlagModel  # type: ignore

            # Half precision is requested on every device except the CPU.
            half_precision = self.device != "cpu"
            self._model = BGEM3FlagModel(
                self.model_name, use_fp16=half_precision, devices=self.device
            )
            self._impl = "flagembedding"
            logger.info(
                "embedder.loaded", impl="flagembedding", model=self.model_name, device=self.device
            )
            return
        except Exception as exc:  # noqa: BLE001
            logger.warning("embedder.flagembedding_failed", error=str(exc))

        from sentence_transformers import SentenceTransformer

        self._st_model = SentenceTransformer(self.model_name, device=self.device)
        self._impl = "sentence-transformers"
        logger.info(
            "embedder.loaded",
            impl="sentence-transformers",
            model=self.model_name,
            device=self.device,
        )

    def encode(self, texts: Sequence[str]) -> list[list[float]]:
        """Embed ``texts`` and return one float32-derived vector (as a list) per text.

        An empty input returns an empty list without touching the model.
        """
        if not texts:
            return []
        batch = list(texts)

        if self._impl != "flagembedding":
            # sentence-transformers normalizes by itself when asked to.
            vectors = self._st_model.encode(  # type: ignore[union-attr]
                batch,
                batch_size=self.batch_size,
                normalize_embeddings=self.normalize,
                convert_to_numpy=True,
                show_progress_bar=False,
            )
            return vectors.astype(np.float32).tolist()

        raw = self._model.encode(  # type: ignore[union-attr]
            batch,
            batch_size=self.batch_size,
            max_length=8192,
            return_dense=True,
            return_sparse=False,
            return_colbert_vecs=False,
        )
        dense = raw["dense_vecs"] if isinstance(raw, dict) else raw
        vectors = np.asarray(dense, dtype=np.float32)
        if self.normalize:
            lengths = np.linalg.norm(vectors, axis=1, keepdims=True)
            # Leave all-zero rows untouched instead of dividing by zero.
            lengths[lengths == 0] = 1.0
            vectors = vectors / lengths
        return vectors.tolist()

    def encode_one(self, text: str) -> list[float]:
        """Embed a single text and return its vector."""
        (vector,) = self.encode([text])
        return vector
@lru_cache(maxsize=1)
def get_embedder() -> Embedder:
    """Return the cached ``Embedder`` built from the ``settings.embedding_*`` values."""
    embedder_options = {
        "model_name": settings.embedding_model,
        "device": settings.embedding_device,
        "normalize": settings.embedding_normalize,
        "batch_size": settings.embedding_batch_size,
    }
    return Embedder(**embedder_options)

View File

@@ -0,0 +1,327 @@
"""Hybrid search: lexical (OpenSearch BM25) + semantic (Qdrant) + RRF + reranker.
Always returns ``SearchResponse`` (never throws on missing index/collection -
empty results are valid).
"""
from __future__ import annotations
import uuid
from collections import defaultdict
from dataclasses import dataclass
from typing import Any
from qdrant_client.http import models as qm
from app.api.schemas import (
Citation,
SearchFilters,
SearchHit,
SearchMode,
SearchRequest,
SearchResponse,
)
from app.config import settings
from app.indexing.embeddings import get_embedder
from app.indexing.opensearch_client import get_opensearch
from app.indexing.qdrant_client import DENSE_VECTOR_NAME, get_qdrant
from app.indexing.reranker import get_reranker
from app.logging_config import get_logger
from app.utils.text_cleaning import normalize_for_search
logger = get_logger(__name__)
@dataclass
class _Candidate:
    """One retrieved chunk, carrying the scores and ranks of each retriever.

    ``bm25_*`` fields are set by the lexical search and ``dense_*`` fields by
    the semantic search; both stay ``None`` for a retriever that did not
    return the chunk.
    """

    # Identifiers are kept as strings and converted to UUID when hits are built.
    chunk_id: str
    document_id: str
    page_number: int
    block_type: str
    block_id: str | None
    # Chunk text; the semantic search fills this from the ``text_preview`` payload.
    text: str
    source_path: str
    original_file_name: str
    quality_flags: dict[str, Any]
    metadata: dict[str, Any]
    bm25_score: float | None = None
    # 1-based position in the lexical result list.
    bm25_rank: int | None = None
    # NOTE: ``run_search`` overwrites this with the reranker score when reranking runs.
    dense_score: float | None = None
    # 1-based position in the semantic result list.
    dense_rank: int | None = None
def run_search(req: SearchRequest) -> SearchResponse:
    """Execute a lexical, semantic or hybrid search and build the API response.

    Retriever failures (OpenSearch / Qdrant) and reranker failures are logged
    and degrade to fewer or un-reranked results instead of raising. Hits whose
    chunk or document id is not a valid UUID are logged and skipped so one
    malformed index entry cannot fail the whole request.

    Args:
        req: Search request; ``query``, ``search_mode``, ``filters`` and
            ``limit`` are read.

    Returns:
        A ``SearchResponse``. ``total_candidates`` is the size of the merged
        candidate list before truncation; ``reranked`` is True only when
        reranker scores were actually applied.
    """
    mode: SearchMode = req.search_mode
    filters = req.filters
    lexical: list[_Candidate] = []
    semantic: list[_Candidate] = []

    if mode in ("lexical", "hybrid"):
        try:
            lexical = _lexical_search(req.query, filters, settings.hybrid_opensearch_top_k)
        except Exception as exc:  # noqa: BLE001
            logger.warning("search.lexical_failed", error=str(exc))
    if mode in ("semantic", "hybrid"):
        try:
            semantic = _semantic_search(req.query, filters, settings.hybrid_qdrant_top_k)
        except Exception as exc:  # noqa: BLE001
            logger.warning("search.semantic_failed", error=str(exc))

    merged = _merge(lexical, semantic, mode)
    candidates = merged[: settings.rerank_candidates]

    reranked_flag = False
    # The reranker is only instantiated when it is enabled and there is something
    # to score: constructing it loads the model, which is wasted work otherwise.
    if settings.reranker_enabled and candidates:
        try:
            reranker = get_reranker()
            if reranker.available:
                scores = reranker.score(req.query, [c.text for c in candidates])
                # Validate before mutating so a short result cannot leave the
                # candidates half-updated.
                if len(scores) != len(candidates):
                    raise ValueError(
                        f"reranker returned {len(scores)} scores for {len(candidates)} candidates"
                    )
                for c, s in zip(candidates, scores, strict=True):
                    c.dense_score = s
                candidates.sort(key=lambda c: (c.dense_score or 0.0), reverse=True)
                reranked_flag = True
        except Exception as exc:  # noqa: BLE001
            # Degrade to the un-reranked order, like the retrievers above.
            logger.warning("search.rerank_failed", error=str(exc))

    hits: list[SearchHit] = []
    for c in candidates[: req.limit]:
        try:
            document_id = uuid.UUID(c.document_id)
            chunk_id = uuid.UUID(c.chunk_id)
        except ValueError:
            logger.warning(
                "search.hit_skipped_bad_id", chunk_id=c.chunk_id, document_id=c.document_id
            )
            continue

        if reranked_flag:
            score = c.dense_score
        else:
            score = c.dense_score if mode == "semantic" else c.bm25_score
        table_index = c.metadata.get("table_index")
        figure_index = c.metadata.get("figure_index")
        hits.append(
            SearchHit(
                # Ranks stay contiguous even when a malformed hit was skipped.
                rank=len(hits) + 1,
                score=float(score or 0.0),
                document_id=document_id,
                chunk_id=chunk_id,
                original_file_name=c.original_file_name,
                source_path=c.source_path,
                page_number=c.page_number,
                block_type=c.block_type,
                text=c.text,
                citation=Citation(
                    pdf=c.original_file_name,
                    page=c.page_number,
                    block_id=c.block_id,
                    table_id=str(table_index) if table_index is not None else None,
                    figure_id=str(figure_index) if figure_index is not None else None,
                ),
                quality_flags=c.quality_flags,
                metadata=c.metadata,
            )
        )

    return SearchResponse(
        query=req.query,
        mode=mode,
        total_candidates=len(merged),
        reranked=reranked_flag,
        results=hits,
    )
# ---------------- lexical ----------------
def _lexical_search(query: str, filters: SearchFilters, top_k: int) -> list[_Candidate]:
    """Run the BM25 query against the chunk index and return ranked candidates.

    Returns an empty list when the index does not exist. Ranks are 1-based and
    follow the order of the OpenSearch hits.
    """
    index_name = settings.opensearch_index_chunks
    client = get_opensearch()
    if not client.indices.exists(index=index_name):
        return []

    must_clauses = [
        {
            "multi_match": {
                "query": query,
                "fields": ["text^1.0", "text.ru^1.5", "text.en^1.5", "normalized_text^0.7"],
                "type": "best_fields",
                "operator": "or",
            }
        }
    ]
    # Add a second, lower-weight clause only when normalization changed the query.
    normalized = normalize_for_search(query)
    if normalized and normalized != query.lower():
        must_clauses.append({"match": {"normalized_text": {"query": normalized, "boost": 0.5}}})

    request_body = {
        "size": top_k,
        "query": {"bool": {"must": must_clauses, "filter": _opensearch_filters(filters)}},
        "_source": [
            "chunk_id",
            "document_id",
            "source_path",
            "original_file_name",
            "page_number",
            "block_type",
            "block_id",
            "text",
            "quality_flags",
            "metadata",
        ],
    }
    response = client.search(index=index_name, body=request_body, request_timeout=30)

    def to_candidate(position: int, hit: dict[str, Any]) -> _Candidate:
        source = hit.get("_source", {})
        return _Candidate(
            chunk_id=source["chunk_id"],
            document_id=source["document_id"],
            page_number=int(source.get("page_number", 0)),
            block_type=source.get("block_type", "paragraph"),
            block_id=source.get("block_id"),
            text=source.get("text", ""),
            source_path=source.get("source_path", ""),
            original_file_name=source.get("original_file_name", ""),
            quality_flags=source.get("quality_flags") or {},
            metadata=source.get("metadata") or {},
            bm25_score=float(hit.get("_score") or 0.0),
            bm25_rank=position,
        )

    raw_hits = response.get("hits", {}).get("hits", [])
    return [to_candidate(position, hit) for position, hit in enumerate(raw_hits, start=1)]
def _opensearch_filters(filters: SearchFilters) -> list[dict[str, Any]]:
clauses: list[dict[str, Any]] = []
if filters.document_id:
clauses.append({"term": {"document_id": str(filters.document_id)}})
if filters.source_path:
clauses.append({"term": {"source_path": filters.source_path}})
if filters.block_type:
clauses.append({"term": {"block_type": filters.block_type}})
if filters.min_ocr_confidence is not None:
clauses.append({"range": {"ocr_confidence": {"gte": filters.min_ocr_confidence}}})
return clauses
# ---------------- semantic ----------------
def _semantic_search(query: str, filters: SearchFilters, top_k: int) -> list[_Candidate]:
    """Embed the query and return the nearest chunks from Qdrant as candidates.

    ``query_points`` is tried first; if it raises for any reason the call is
    repeated through ``client.search``. Candidate text comes from the
    ``text_preview`` payload field. Ranks are 1-based.
    """
    query_vector = get_embedder().encode_one(query)
    payload_filter = _qdrant_filter(filters)
    client = get_qdrant()
    collection = settings.qdrant_collection_chunks

    try:
        points = client.query_points(
            collection_name=collection,
            query=query_vector,
            using=DENSE_VECTOR_NAME,
            limit=top_k,
            with_payload=True,
            query_filter=payload_filter,
        ).points
    except Exception as exc:  # noqa: BLE001
        logger.debug("qdrant.query_points_fallback", error=str(exc))
        points = client.search(
            collection_name=collection,
            query_vector=(DENSE_VECTOR_NAME, query_vector),
            query_filter=payload_filter,
            limit=top_k,
            with_payload=True,
        )

    def to_candidate(position: int, point: Any) -> _Candidate:
        payload = point.payload or {}
        # Fall back to the point id when the payload carries no chunk_id.
        chunk_id = payload.get("chunk_id") or str(point.id)
        return _Candidate(
            chunk_id=str(chunk_id),
            document_id=str(payload.get("document_id", "")),
            page_number=int(payload.get("page_number") or 0),
            block_type=payload.get("block_type", "paragraph"),
            block_id=payload.get("block_id"),
            text=payload.get("text_preview", ""),
            source_path=payload.get("source_path", ""),
            original_file_name=payload.get("original_file_name", ""),
            quality_flags=payload.get("quality_flags") or {},
            metadata=payload.get("metadata") or {},
            dense_score=float(point.score or 0.0),
            dense_rank=position,
        )

    return [to_candidate(position, point) for position, point in enumerate(points, start=1)]
def _qdrant_filter(filters: SearchFilters) -> qm.Filter | None:
must: list[qm.FieldCondition | qm.Range] = []
if filters.document_id:
must.append(qm.FieldCondition(key="document_id", match=qm.MatchValue(value=str(filters.document_id))))
if filters.source_path:
must.append(qm.FieldCondition(key="source_path", match=qm.MatchValue(value=filters.source_path)))
if filters.block_type:
must.append(qm.FieldCondition(key="block_type", match=qm.MatchValue(value=filters.block_type)))
if filters.min_ocr_confidence is not None:
must.append(qm.FieldCondition(key="ocr_confidence", range=qm.Range(gte=filters.min_ocr_confidence)))
if not must:
return None
return qm.Filter(must=must)
# ---------------- merge ----------------
def _merge(lexical: list[_Candidate], semantic: list[_Candidate], mode: SearchMode) -> list[_Candidate]:
if mode == "lexical":
return lexical
if mode == "semantic":
return _hydrate_semantic_text(semantic)
by_id: dict[str, _Candidate] = {}
for c in lexical:
by_id[c.chunk_id] = c
for c in semantic:
if c.chunk_id in by_id:
by_id[c.chunk_id].dense_score = c.dense_score
by_id[c.chunk_id].dense_rank = c.dense_rank
if not by_id[c.chunk_id].text:
by_id[c.chunk_id].text = c.text
else:
by_id[c.chunk_id] = c
rrf: dict[str, float] = defaultdict(float)
k = settings.hybrid_rrf_k
for c in lexical:
if c.bm25_rank is not None:
rrf[c.chunk_id] += 1.0 / (k + c.bm25_rank)
for c in semantic:
if c.dense_rank is not None:
rrf[c.chunk_id] += 1.0 / (k + c.dense_rank)
items = sorted(by_id.values(), key=lambda c: rrf.get(c.chunk_id, 0.0), reverse=True)
return _hydrate_full_text(items)
def _hydrate_full_text(candidates: list[_Candidate]) -> list[_Candidate]:
"""For candidates whose text came only from Qdrant payload (preview), pull
the full chunk text from OpenSearch by id so the reranker sees full content.
"""
missing = [c for c in candidates if len(c.text) <= 512]
if not missing:
return candidates
client = get_opensearch()
ids = [c.chunk_id for c in missing]
try:
res = client.mget(index=settings.opensearch_index_chunks, body={"ids": ids})
except Exception:
return candidates
by_id = {d["_id"]: d.get("_source", {}) for d in res.get("docs", []) if d.get("found")}
for c in missing:
s = by_id.get(c.chunk_id)
if s and s.get("text"):
c.text = s["text"]
if not c.original_file_name:
c.original_file_name = s.get("original_file_name", "")
if not c.source_path:
c.source_path = s.get("source_path", "")
if not c.metadata:
c.metadata = s.get("metadata") or {}
if not c.quality_flags:
c.quality_flags = s.get("quality_flags") or {}
return candidates
def _hydrate_semantic_text(candidates: list[_Candidate]) -> list[_Candidate]:
    """Hydrate semantic-only results; currently a plain alias for ``_hydrate_full_text``."""
    return _hydrate_full_text(candidates)

View File

@@ -0,0 +1,142 @@
"""OpenSearch client + index bootstrap + chunk indexing helpers."""
from __future__ import annotations
from functools import lru_cache
from typing import Any, Iterable
from opensearchpy import OpenSearch, RequestsHttpConnection
from opensearchpy.helpers import bulk
from app.config import settings
from app.logging_config import get_logger
logger = get_logger(__name__)
# Index settings: 3 custom analyzers (ru_analyzer, en_analyzer, code_analyzer).
# ``text`` is indexed with ``code_analyzer`` (standard tokenizer + lowercase only) and
# has two multi-fields, ``text.ru`` and ``text.en``, so queries can boost per language.
#
# The mapping is ``dynamic: strict`` at the root, and inner objects inherit that
# setting. ``metadata`` and ``quality_flags`` hold free-form keys and declare no
# properties, so they opt back into dynamic mapping explicitly; without that any
# document carrying keys in those objects is rejected by the index.
# NOTE: mappings only apply at index creation; an existing index keeps its old mapping.
INDEX_SETTINGS: dict[str, Any] = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": {
            "filter": {
                "ru_stop": {"type": "stop", "stopwords": "_russian_"},
                "ru_stemmer": {"type": "stemmer", "language": "russian"},
                "en_stop": {"type": "stop", "stopwords": "_english_"},
                "en_stemmer": {"type": "stemmer", "language": "english"},
            },
            "analyzer": {
                "ru_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "ru_stop", "ru_stemmer"],
                },
                "en_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "en_stop", "en_stemmer"],
                },
                "code_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase"],
                },
            },
        },
    },
    "mappings": {
        "dynamic": "strict",
        "properties": {
            "chunk_id": {"type": "keyword"},
            "document_id": {"type": "keyword"},
            "source_path": {"type": "keyword"},
            "original_file_name": {
                "type": "text",
                "fields": {"keyword": {"type": "keyword", "ignore_above": 512}},
            },
            "page_number": {"type": "integer"},
            "block_type": {"type": "keyword"},
            "block_id": {"type": "keyword"},
            "text": {
                "type": "text",
                "analyzer": "code_analyzer",
                "fields": {
                    "ru": {"type": "text", "analyzer": "ru_analyzer"},
                    "en": {"type": "text", "analyzer": "en_analyzer"},
                },
            },
            "normalized_text": {
                "type": "text",
                "analyzer": "code_analyzer",
            },
            "ocr_confidence": {"type": "float"},
            "language_hint": {"type": "keyword"},
            # Free-form objects: override the inherited ``strict`` setting.
            "metadata": {"type": "object", "enabled": True, "dynamic": True},
            "quality_flags": {"type": "object", "enabled": True, "dynamic": True},
            "created_at": {"type": "date"},
        },
    },
}
@lru_cache(maxsize=1)
def get_opensearch() -> OpenSearch:
    """Return the cached OpenSearch client built from ``settings.opensearch_*``.

    Basic auth is only sent when both a user and a password are configured.
    """
    has_credentials = bool(settings.opensearch_user and settings.opensearch_password)
    http_auth = (
        (settings.opensearch_user, settings.opensearch_password) if has_credentials else None
    )
    node = {"host": settings.opensearch_host, "port": settings.opensearch_port}
    return OpenSearch(
        hosts=[node],
        http_auth=http_auth,
        use_ssl=settings.opensearch_use_ssl,
        verify_certs=settings.opensearch_verify_certs,
        ssl_show_warn=False,
        connection_class=RequestsHttpConnection,
        timeout=30,
        max_retries=3,
        retry_on_timeout=True,
    )
def ensure_index(index: str | None = None) -> None:
    """Create the chunk index with ``INDEX_SETTINGS`` unless it already exists.

    Args:
        index: Index name; defaults to ``settings.opensearch_index_chunks``.
    """
    target = index or settings.opensearch_index_chunks
    client = get_opensearch()
    if not client.indices.exists(index=target):
        logger.info("opensearch.index.create", index=target)
        client.indices.create(index=target, body=INDEX_SETTINGS)
        return
    logger.debug("opensearch.index.exists", index=target)
def index_chunks(docs: Iterable[dict[str, Any]], index: str | None = None) -> tuple[int, int]:
    """Bulk-upsert chunk documents, using each ``chunk_id`` as the document id.

    Individual document failures do not raise (``raise_on_error=False``); they
    are counted, and a small sample is logged so rejected documents (for
    example mapping violations) are visible.

    Args:
        docs: Chunk documents; each must contain a ``chunk_id`` key.
        index: Index name; defaults to ``settings.opensearch_index_chunks``.

    Returns:
        ``(success, errors)``: number of documents indexed and number that failed.

    Raises:
        KeyError: If a document has no ``chunk_id``.
    """
    name = index or settings.opensearch_index_chunks
    actions: list[dict[str, Any]] = [
        {
            "_op_type": "index",
            "_index": name,
            "_id": d["chunk_id"],
            "_source": d,
        }
        for d in docs
    ]
    if not actions:
        return 0, 0

    success, errors = bulk(get_opensearch(), actions, raise_on_error=False, request_timeout=120)
    # ``bulk`` returns a list of error items by default and a plain count in
    # stats-only mode; handle both the same way for the log and the return value.
    if isinstance(errors, list):
        error_count = len(errors)
        sample = errors[:3]
    else:
        error_count = int(errors or 0)
        sample = None
    if error_count:
        logger.warning("opensearch.bulk.errors", count=error_count, sample=sample)
    return success, error_count
def delete_by_document(document_id: str, index: str | None = None) -> int:
    """Delete every chunk of ``document_id`` from the index and refresh it.

    Args:
        document_id: Value matched against the ``document_id`` keyword field.
        index: Index name; defaults to ``settings.opensearch_index_chunks``.

    Returns:
        The ``deleted`` count reported by OpenSearch, or 0 when the index
        does not exist.
    """
    target = index or settings.opensearch_index_chunks
    client = get_opensearch()
    if not client.indices.exists(index=target):
        return 0
    by_document = {"query": {"term": {"document_id": document_id}}}
    outcome = client.delete_by_query(index=target, body=by_document, refresh=True)
    return int(outcome.get("deleted", 0))

View File

@@ -0,0 +1,103 @@
"""Qdrant client + collection bootstrap + chunk upsert."""
from __future__ import annotations
from functools import lru_cache
from typing import Any, Sequence
from qdrant_client import QdrantClient
from qdrant_client.http import models as qm
from app.config import settings
from app.logging_config import get_logger
logger = get_logger(__name__)
DENSE_VECTOR_NAME = "dense"
@lru_cache(maxsize=1)
def get_qdrant() -> QdrantClient:
    """Return the cached Qdrant client built from ``settings.qdrant_*``.

    An empty API key is passed as ``None``.
    """
    connection = {
        "host": settings.qdrant_host,
        "port": settings.qdrant_port,
        "api_key": settings.qdrant_api_key or None,
        "timeout": 60,
    }
    return QdrantClient(**connection)
def ensure_collection(collection: str | None = None, dim: int | None = None) -> None:
    """Create the chunk collection and its payload indexes unless it exists.

    Args:
        collection: Collection name; defaults to ``settings.qdrant_collection_chunks``.
        dim: Vector size; defaults to ``settings.embedding_dim``.
    """
    name = collection or settings.qdrant_collection_chunks
    vector_size = dim or settings.embedding_dim
    client = get_qdrant()

    if any(existing.name == name for existing in client.get_collections().collections):
        logger.debug("qdrant.collection.exists", collection=name)
        return

    logger.info("qdrant.collection.create", collection=name, dim=vector_size)
    dense_params = qm.VectorParams(size=vector_size, distance=qm.Distance.COSINE)
    client.create_collection(
        collection_name=name,
        vectors_config={DENSE_VECTOR_NAME: dense_params},
        optimizers_config=qm.OptimizersConfigDiff(default_segment_number=2),
    )

    # Payload indexes for the fields used in search filters.
    payload_indexes = (
        ("document_id", qm.PayloadSchemaType.KEYWORD),
        ("source_path", qm.PayloadSchemaType.KEYWORD),
        ("block_type", qm.PayloadSchemaType.KEYWORD),
        ("page_number", qm.PayloadSchemaType.INTEGER),
        ("ocr_confidence", qm.PayloadSchemaType.FLOAT),
    )
    for field_name, schema in payload_indexes:
        client.create_payload_index(
            collection_name=name,
            field_name=field_name,
            field_schema=schema,
        )
def upsert_chunks(
    points: Sequence[tuple[str, list[float], dict[str, Any]]],
    collection: str | None = None,
) -> int:
    """Upsert ``(chunk_id, vector, payload)`` triples into Qdrant.

    The chunk id is also written into each payload under ``chunk_id``. The
    upsert is sent with ``wait=False``, so the returned number is the count of
    points submitted, not a confirmation from the server.

    Args:
        points: Triples to upsert; an empty sequence is a no-op.
        collection: Collection name; defaults to ``settings.qdrant_collection_chunks``.

    Returns:
        Number of points submitted.
    """
    name = collection or settings.qdrant_collection_chunks
    if not points:
        return 0

    structs = []
    for chunk_id, vector, payload in points:
        stored_payload = {**payload, "chunk_id": chunk_id}
        structs.append(
            qm.PointStruct(
                id=_qid(chunk_id),
                vector={DENSE_VECTOR_NAME: vector},
                payload=stored_payload,
            )
        )
    get_qdrant().upsert(collection_name=name, points=structs, wait=False)
    return len(structs)
def delete_by_document(document_id: str, collection: str | None = None) -> int:
    """Delete every point whose ``document_id`` payload matches.

    Args:
        document_id: Value matched against the ``document_id`` payload field.
        collection: Collection name; defaults to ``settings.qdrant_collection_chunks``.

    Returns:
        Always 1; the number of deleted points is not reported.
    """
    target = collection or settings.qdrant_collection_chunks
    same_document = qm.FieldCondition(key="document_id", match=qm.MatchValue(value=document_id))
    selector = qm.FilterSelector(filter=qm.Filter(must=[same_document]))
    get_qdrant().delete(collection_name=target, points_selector=selector)
    return 1
def _qid(chunk_id: str) -> str:
    """Map a chunk id to a Qdrant point id; currently the identity.

    Qdrant accepts UUID strings or unsigned ints. Chunks are UUIDs already,
    so the id is returned unchanged.
    """
    return chunk_id

75
app/indexing/reranker.py Normal file
View File

@@ -0,0 +1,75 @@
"""BGE reranker - cross-encoder style scoring of (query, passage) pairs.
Designed to degrade gracefully:
- If the model fails to load, ``available`` is False and ``score`` returns
zeros, so callers can skip reranking and report ``reranked=False`` to clients.
"""
from __future__ import annotations
from functools import lru_cache
from typing import Sequence
from app.config import settings
from app.logging_config import get_logger
logger = get_logger(__name__)
class Reranker:
    """Scores (query, passage) pairs with FlagEmbedding or a CrossEncoder fallback.

    Loading never raises: when both backends fail, ``available`` is False and
    ``score`` returns zeros.
    """

    def __init__(self, model_name: str, device: str, batch_size: int) -> None:
        """Store the configuration and try to load a backend immediately."""
        self.model_name = model_name
        self.device = device
        self.batch_size = batch_size
        self._impl: str | None = None
        self._model = None
        self._load()

    def _load(self) -> None:
        """Try ``FlagReranker`` first, then ``CrossEncoder``; disable on double failure."""
        try:
            from FlagEmbedding import FlagReranker  # type: ignore

            # Half precision is requested on every device except the CPU.
            half_precision = self.device != "cpu"
            self._model = FlagReranker(
                self.model_name, use_fp16=half_precision, devices=self.device
            )
            self._impl = "flagembedding"
            logger.info(
                "reranker.loaded", impl="flagembedding", model=self.model_name, device=self.device
            )
            return
        except Exception as exc:  # noqa: BLE001
            logger.warning("reranker.flagembedding_failed", error=str(exc))

        try:
            from sentence_transformers import CrossEncoder

            self._model = CrossEncoder(self.model_name, device=self.device)
            self._impl = "sentence-transformers"
            logger.info("reranker.loaded", impl="sentence-transformers", model=self.model_name)
        except Exception as exc:  # noqa: BLE001
            logger.error("reranker.disabled", error=str(exc))
            self._impl = None
            self._model = None

    @property
    def available(self) -> bool:
        """True when a backend was loaded successfully."""
        return not (self._impl is None or self._model is None)

    def score(self, query: str, passages: Sequence[str]) -> list[float]:
        """Return one relevance score per passage, in input order.

        Returns a list of zeros when no backend is available or ``passages``
        is empty. A scalar backend result is wrapped into a one-element list.
        """
        if not passages or not self.available:
            return [0.0] * len(passages)

        pairs = [(query, passage) for passage in passages]
        if self._impl == "flagembedding":
            raw = self._model.compute_score(pairs, batch_size=self.batch_size, normalize=True)  # type: ignore[union-attr]
        else:
            raw = self._model.predict(pairs, batch_size=self.batch_size)  # type: ignore[union-attr]

        if isinstance(raw, list):
            values = raw
        else:
            try:
                values = list(raw)
            except TypeError:
                # Not iterable: the backend returned a single number.
                values = [float(raw)]
        return [float(value) for value in values]
@lru_cache(maxsize=1)
def get_reranker() -> Reranker:
    """Return the cached ``Reranker`` built from the ``settings.reranker_*`` values."""
    reranker_options = {
        "model_name": settings.reranker_model,
        "device": settings.reranker_device,
        "batch_size": settings.reranker_batch_size,
    }
    return Reranker(**reranker_options)

View File

317
app/ingestion/chunker.py Normal file
View File

@@ -0,0 +1,317 @@
"""Structure-aware chunking.
Rules (per spec):
- Chunk by document structure first, fixed-size second.
- Hierarchy: title > heading > paragraph > list > table > figure caption.
- Target 500-900 tokens (configurable).
- Overlap 80-120 tokens for long narrative text only.
- Never split tables - one table = one chunk (or one chunk per row group if huge).
- Every chunk carries citation metadata.
We use a deliberately simple ``len(text.split())`` token estimator. The downstream
embedding model has its own tokenizer; this estimator is only a budget proxy.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
from app.config import settings
from app.ingestion.docling_extractor import (
ExtractedBlock,
ExtractedFigure,
ExtractedTable,
ExtractionResult,
)
from app.ingestion.normalizer import normalize_block
from app.ingestion.quality import compute_quality_flags
@dataclass
class ChunkRecord:
    """One chunk produced by ``chunk_extraction``, ready for indexing."""

    # 0-based position of the chunk in the list returned by ``chunk_extraction``.
    chunk_index: int
    page_number: int
    block_type: str
    # Display text and search-normalized text, both as returned by ``normalize_block``.
    text: str
    normalized_text: str
    # Whitespace-split word count of ``text`` (a budget proxy, not a model token count).
    token_count: int
    block_id: str | None = None
    quality_flags: dict[str, Any] = field(default_factory=dict)
    # Citation extras such as ``table_index``, ``figure_index`` or ``section_heading``.
    metadata: dict[str, Any] = field(default_factory=dict)
def _estimate_tokens(text: str) -> int:
return max(1, len(text.split()))
def _build_chunk(
    *,
    chunk_index: int,
    page_number: int,
    block_type: str,
    text: str,
    block_id: str | None,
    metadata: dict[str, Any],
    ocr_confidence: float | None,
) -> ChunkRecord:
    """Normalize ``text``, compute its quality flags and wrap it in a ``ChunkRecord``."""
    display, norm = normalize_block(text)
    flags = compute_quality_flags(
        text=display,
        block_type=block_type,
        ocr_confidence=ocr_confidence,
    )
    return ChunkRecord(
        chunk_index=chunk_index,
        page_number=page_number,
        block_type=block_type,
        text=display,
        normalized_text=norm,
        token_count=_estimate_tokens(display),
        block_id=block_id,
        quality_flags=flags,
        metadata=metadata,
    )


def chunk_extraction(
    extraction: ExtractionResult,
    *,
    document_ocr_confidence: float | None = None,
) -> list[ChunkRecord]:
    """Turn an extraction result into an ordered list of chunks.

    Chunks are emitted in three passes and numbered consecutively from 0:

    1. one chunk per non-empty table (never split), prefixed with a summary;
    2. one chunk per figure (caption plus a placeholder sentence);
    3. narrative blocks, grouped per page and per section, packed to the
       configured token budget.

    No overlap chunks are emitted; ``settings.chunk_overlap_tokens`` is
    currently not used by this function.

    Args:
        extraction: Blocks, tables and figures produced by the extractor.
        document_ocr_confidence: Document-level OCR confidence forwarded to
            the quality-flag computation of every chunk.

    Returns:
        The chunks in emission order; ``chunk_index`` equals the list position.
    """
    target = settings.chunk_target_tokens
    minimum = settings.chunk_min_tokens
    maximum = settings.chunk_max_tokens
    chunks: list[ChunkRecord] = []

    # 1) Tables first - one chunk per table, never split.
    for t in extraction.tables:
        body = (t.markdown or "").strip()
        if not body:
            continue
        summary = _summarize_table(t)
        chunks.append(
            _build_chunk(
                chunk_index=len(chunks),
                page_number=t.page_number,
                block_type="table",
                text=f"{summary}\n\n{body}" if summary else body,
                block_id=t.block_id or f"table:{t.table_index}",
                metadata={"table_index": t.table_index, "summary": summary or ""},
                ocr_confidence=document_ocr_confidence,
            )
        )

    # 2) Figures - caption + placeholder description.
    for f in extraction.figures:
        text_parts: list[str] = []
        if f.caption:
            text_parts.append(f"Caption: {f.caption}")
        text_parts.append(f"Figure detected on page {f.page_number}.")
        chunks.append(
            _build_chunk(
                chunk_index=len(chunks),
                page_number=f.page_number,
                block_type="figure_caption" if f.caption else "figure_description",
                text="\n".join(text_parts),
                block_id=f.block_id or f"figure:{f.figure_index}",
                metadata={"figure_index": f.figure_index},
                ocr_confidence=document_ocr_confidence,
            )
        )

    # 3) Narrative blocks grouped per page, packed by structure.
    by_page: dict[int, list[ExtractedBlock]] = {}
    for b in extraction.blocks:
        by_page.setdefault(b.page_number, []).append(b)

    for page_no in sorted(by_page):
        for group in _group_by_section(by_page[page_no]):
            for piece in _pack_group(group, target=target, maximum=maximum, minimum=minimum):
                chunks.append(
                    _build_chunk(
                        chunk_index=len(chunks),
                        page_number=page_no,
                        block_type=piece["block_type"],
                        text=piece["text"],
                        block_id=piece.get("block_id"),
                        metadata={"section_heading": piece.get("section") or ""},
                        ocr_confidence=document_ocr_confidence,
                    )
                )
            # NOTE: duplicate overlap chunks are intentionally not emitted, to
            # avoid index bloat. The previous code computed an overlap tail
            # here and then discarded it; that dead computation was removed.
    return chunks
# ---------------- Helpers ----------------
def _group_by_section(blocks: list[ExtractedBlock]) -> list[list[ExtractedBlock]]:
groups: list[list[ExtractedBlock]] = []
current: list[ExtractedBlock] = []
for b in blocks:
if b.block_type in ("title", "heading") and current:
groups.append(current)
current = [b]
else:
current.append(b)
if current:
groups.append(current)
return groups
def _pack_group(
group: list[ExtractedBlock], *, target: int, maximum: int, minimum: int
) -> list[dict[str, Any]]:
"""Pack a section's blocks into chunks at most ``maximum`` tokens.
Headings / titles attach to the next chunk as a section anchor.
"""
if not group:
return []
section_heading = ""
body_blocks: list[ExtractedBlock] = []
for b in group:
if b.block_type in ("title", "heading"):
section_heading = (section_heading + " > " + b.text).strip(" >") if section_heading else b.text
else:
body_blocks.append(b)
if not body_blocks:
# Heading-only group: emit as a single ``heading`` chunk so the title is searchable.
text = section_heading or group[0].text
return [
{
"text": text,
"block_type": "heading",
"block_id": group[0].block_id,
"section": section_heading,
}
]
out: list[dict[str, Any]] = []
buffer: list[str] = []
buffer_block_ids: list[str] = []
buffer_block_type = "paragraph"
buffer_tokens = 0
def flush():
nonlocal buffer, buffer_block_ids, buffer_block_type, buffer_tokens
if not buffer:
return
text = "\n\n".join(buffer).strip()
if not text:
buffer = []
buffer_block_ids = []
buffer_tokens = 0
return
# Prepend section heading for context (kept short).
if section_heading and len(section_heading) < 200:
text = f"# {section_heading}\n\n{text}"
out.append(
{
"text": text,
"block_type": buffer_block_type,
"block_id": buffer_block_ids[0] if buffer_block_ids else None,
"section": section_heading,
}
)
buffer = []
buffer_block_ids = []
buffer_tokens = 0
for b in body_blocks:
tokens = _estimate_tokens(b.text)
if tokens >= maximum:
# Hard split a giant block into sub-chunks of ~target tokens.
flush()
for sub in _split_long_text(b.text, target=target, maximum=maximum):
out.append(
{
"text": sub,
"block_type": b.block_type if b.block_type != "list" else "list",
"block_id": b.block_id,
"section": section_heading,
}
)
continue
if buffer_tokens + tokens > maximum and buffer_tokens >= minimum:
flush()
if not buffer:
buffer_block_type = b.block_type if b.block_type != "list" else "list"
buffer.append(b.text)
if b.block_id:
buffer_block_ids.append(b.block_id)
buffer_tokens += tokens
if buffer_tokens >= target:
flush()
flush()
return out
def _split_long_text(text: str, *, target: int, maximum: int) -> list[str]:
words = text.split()
if not words:
return []
pieces: list[str] = []
step = target
if step <= 0:
step = 500
i = 0
while i < len(words):
end = min(len(words), i + maximum)
# Aim for ``target`` words but extend up to ``maximum`` to reach a sentence boundary.
piece = " ".join(words[i : i + step])
pieces.append(piece)
i += step
if end - i < target // 4 and end - i > 0:
pieces[-1] = " ".join(words[i - step : end])
break
return pieces
def _tail_tokens(text: str, n: int) -> str:
words = text.split()
if len(words) <= n:
return text
return " ".join(words[-n:])
def _summarize_table(t: ExtractedTable) -> str:
"""Heuristic one-line summary for index recall."""
md = t.markdown or ""
first = next((line for line in md.splitlines() if line.startswith("|")), "")
header_cells = [c.strip() for c in first.strip("|").split("|") if c.strip()]
n_cols = len(header_cells)
n_rows = max(0, sum(1 for ln in md.splitlines() if ln.startswith("|")) - 2)
header_preview = ", ".join(header_cells[:6])
return (
f"Table on page {t.page_number}: {n_rows} rows x {n_cols} cols. "
f"Columns: {header_preview}." if header_cells else
f"Table on page {t.page_number}."
)

View File

@@ -0,0 +1,384 @@
"""Docling structured extraction.
Docling produces a hierarchical document model with reading order, layout, tables
and figures. We export both Markdown and a JSON representation, then walk the
JSON to emit normalized blocks (title, heading, paragraph, list, table caption,
figure caption) for downstream chunking.
The extractor is intentionally defensive: Docling's exact Python API has
shifted across releases. We probe for the safest exporter methods and fall
back to ``str(document)`` only as a last resort.
"""
from __future__ import annotations
import json
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from app.config import settings
from app.logging_config import get_logger
logger = get_logger(__name__)
@dataclass
class ExtractedBlock:
    """One normalized text block derived from a Docling text item."""

    # Page the block belongs to (``_page_of`` falls back to 1 when unknown).
    page_number: int
    # Normalized type, e.g. "title", "heading", "paragraph", "list".
    block_type: str
    # Stripped text content of the item.
    text: str
    # Docling reference (``self_ref``) or ``id`` of the item, when present.
    block_id: str | None = None
    # Additional metadata; ``_walk_blocks`` stores the raw Docling label here.
    extra: dict[str, Any] = field(default_factory=dict)
@dataclass
class ExtractedTable:
    """One table found in the document, in several renderings."""

    # Page the table belongs to (``_page_of`` falls back to 1 when unknown).
    page_number: int
    # Position of the table in the exported ``tables`` list.
    table_index: int
    # Markdown rendering; empty string when no rendering could be produced.
    markdown: str
    # CSV rendering, or None when no row/column grid was available.
    csv_text: str | None = None
    # The raw table dict taken from the exported payload.
    json_data: dict[str, Any] | None = None
    # Docling reference (``self_ref``) or ``id`` of the table, when present.
    block_id: str | None = None
@dataclass
class ExtractedFigure:
    """One figure/picture found in the document."""

    # Page the figure belongs to (``_page_of`` falls back to 1 when unknown).
    page_number: int
    # Position of the figure in the exported pictures/figures list.
    figure_index: int
    # Stripped caption text, or None when missing or empty.
    caption: str | None
    # Docling reference (``self_ref``) or ``id`` of the figure, when present.
    block_id: str | None = None
    # Optional image crop; ``_walk_figures`` leaves this unset.
    image_bytes: bytes | None = None
    # File extension / image subtype for ``image_bytes``.
    image_ext: str = "png"
@dataclass
class ExtractedPage:
    """Per-page aggregate built by ``_walk_pages``."""

    page_number: int
    # Texts of the page's blocks joined with blank lines.
    text: str
    # True when at least one extracted table is on this page.
    has_tables: bool = False
    # True when at least one extracted figure is on this page.
    has_figures: bool = False
    # Not populated by ``_walk_pages``; keeps its default there.
    has_handwriting: bool = False
    # Not populated by ``_walk_pages``; keeps its default there.
    ocr_confidence: float | None = None
@dataclass
class ExtractionResult:
    """Everything ``extract`` returns for one PDF."""

    # Whole-document Markdown export.
    markdown: str
    # Whole-document dict export; ``{}`` when no exporter succeeded.
    json_payload: dict[str, Any]
    # Flattened text blocks derived from ``json_payload``.
    blocks: list[ExtractedBlock]
    tables: list[ExtractedTable]
    figures: list[ExtractedFigure]
    # One entry per known page number, sorted ascending.
    pages: list[ExtractedPage]
def extract(pdf_path: Path) -> ExtractionResult:
    """Convert ``pdf_path`` with Docling and return the normalized extraction.

    Docling is imported lazily inside the function. Table structure recognition
    is always enabled; Docling's own OCR follows ``settings.docling_ocr_enabled``.
    """
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    options = PdfPipelineOptions()
    # We let OCRmyPDF do the heavy OCR; Docling OCR is opt-in.
    options.do_ocr = settings.docling_ocr_enabled
    options.do_table_structure = True
    # Both options below are best-effort: older docling versions may lack them.
    try:
        options.table_structure_options.do_cell_matching = True
    except Exception:  # noqa: BLE001
        pass
    try:
        options.generate_page_images = True
    except Exception:  # noqa: BLE001
        pass

    pdf_format = PdfFormatOption(pipeline_options=options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    logger.info("docling.start", input=str(pdf_path))
    document = converter.convert(str(pdf_path)).document

    markdown = _safe_export_markdown(document)
    payload = _safe_export_dict(document)
    blocks = _walk_blocks(payload)
    tables = _walk_tables(document, payload)
    figures = _walk_figures(document, payload)
    result = ExtractionResult(
        markdown=markdown,
        json_payload=payload,
        blocks=blocks,
        tables=tables,
        figures=figures,
        pages=_walk_pages(payload, blocks, tables, figures),
    )
    logger.info(
        "docling.done",
        pages=len(result.pages),
        blocks=len(result.blocks),
        tables=len(result.tables),
        figures=len(result.figures),
    )
    return result
# ---------------- Internal helpers ----------------
def _safe_export_markdown(doc: Any) -> str:
    """Return the document as Markdown using the first exporter that works.

    Tries ``export_to_markdown`` then ``to_markdown``; an exporter that raises
    is skipped. Falls back to ``str(doc)`` when none succeeds.
    """
    exporters = (getattr(doc, name, None) for name in ("export_to_markdown", "to_markdown"))
    for exporter in exporters:
        if not callable(exporter):
            continue
        try:
            return exporter()
        except Exception:  # noqa: BLE001
            pass
    return str(doc)
def _safe_export_dict(doc: Any) -> dict[str, Any]:
    """Return the document as a dict using the first exporter that works.

    Tries ``export_to_dict``, ``model_dump`` and ``dict`` in that order, skipping
    any that raise or return a non-dict. As a last resort the output of
    ``model_dump_json`` is parsed; ``{}`` is returned when that fails too.
    """
    for name in ("export_to_dict", "model_dump", "dict"):
        exporter = getattr(doc, name, None)
        if not callable(exporter):
            continue
        try:
            exported = exporter()
        except Exception:  # noqa: BLE001
            continue
        if isinstance(exported, dict):
            return exported
    # Last resort: serialize via JSON round-trip
    try:
        dump_json = getattr(doc, "model_dump_json", lambda: "{}")
        return json.loads(dump_json())
    except Exception:  # noqa: BLE001
        return {}
# Maps lower-cased Docling item labels to the normalized block types emitted
# by ``_walk_blocks``. Labels missing from this table fall back to "paragraph".
# Both underscore and hyphen spellings of a label are listed.
_DOCLING_LABEL_TO_BLOCK = {
    "title": "title",
    "section_header": "heading",
    "section-header": "heading",
    "subtitle": "heading",
    "page_header": "heading",
    "header": "heading",
    "list_item": "list",
    "list-item": "list",
    "list": "list",
    "paragraph": "paragraph",
    "text": "paragraph",
    "caption": "figure_caption",
    "figure": "figure_caption",
    "table": "table",
    "footnote": "paragraph",
}
def _walk_blocks(payload: dict[str, Any]) -> list[ExtractedBlock]:
    """Turn the payload's text items into a flat, ordered list of blocks.

    Items are read from ``texts``, ``text_items`` or ``body.text_items``
    (first non-empty wins). Non-dict items and items without text are skipped.
    """
    items = (
        payload.get("texts")
        or payload.get("text_items")
        or payload.get("body", {}).get("text_items", [])
        or []
    )
    if not isinstance(items, list):
        return []

    collected: list[ExtractedBlock] = []
    for entry in items:
        if not isinstance(entry, dict):
            continue
        raw_label = entry.get("label") or entry.get("category") or "paragraph"
        label = raw_label.lower()
        content = (entry.get("text") or "").strip()
        if content:
            collected.append(
                ExtractedBlock(
                    page_number=_page_of(entry),
                    # Unknown labels are treated as plain paragraphs.
                    block_type=_DOCLING_LABEL_TO_BLOCK.get(label, "paragraph"),
                    text=content,
                    block_id=entry.get("self_ref") or entry.get("id"),
                    extra={"label": label},
                )
            )
    return collected
def _walk_tables(doc: Any, payload: dict[str, Any]) -> list[ExtractedTable]:
    """Build one ``ExtractedTable`` per dict entry of ``payload["tables"]``.

    ``table_index`` is the entry's position in the raw list, so skipped
    (non-dict) entries leave gaps in the numbering.
    """
    return [
        ExtractedTable(
            page_number=_page_of(entry),
            table_index=position,
            markdown=_table_markdown(doc, entry, position),
            csv_text=_table_csv(entry),
            json_data=entry,
            block_id=entry.get("self_ref") or entry.get("id"),
        )
        for position, entry in enumerate(payload.get("tables") or [])
        if isinstance(entry, dict)
    ]
def _walk_figures(doc: Any, payload: dict[str, Any]) -> list[ExtractedFigure]:
    """Build one ``ExtractedFigure`` per dict entry of the payload's pictures.

    Entries come from ``pictures`` or, failing that, ``figures``. The ``doc``
    argument is accepted but not used here. Empty captions become ``None``.
    """
    raw_figures = payload.get("pictures") or payload.get("figures") or []
    return [
        ExtractedFigure(
            page_number=_page_of(entry),
            figure_index=position,
            caption=(entry.get("caption") or "").strip() or None,
            block_id=entry.get("self_ref") or entry.get("id"),
        )
        for position, entry in enumerate(raw_figures)
        if isinstance(entry, dict)
    ]
def _walk_pages(
    payload: dict[str, Any],
    blocks: list[ExtractedBlock],
    tables: list[ExtractedTable],
    figures: list[ExtractedFigure],
) -> list[ExtractedPage]:
    """Assemble per-page records from page metadata and extracted items.

    Page numbers are collected from ``payload["pages"]`` (dict keys or list
    entries) and from every block, table and figure. Page 0 is discarded, and
    a single page 1 is assumed when nothing else is known.
    """
    numbers: set[int] = set()
    meta = payload.get("pages") or {}
    if isinstance(meta, dict):
        for key in meta:
            try:
                numbers.add(int(key))
            except (ValueError, TypeError):
                continue
    elif isinstance(meta, list):
        for entry in meta:
            if not isinstance(entry, dict):
                continue
            candidate = entry.get("page_no") or entry.get("page") or entry.get("number")
            if isinstance(candidate, int):
                numbers.add(candidate)

    table_pages = {t.page_number for t in tables}
    figure_pages = {f.page_number for f in figures}
    numbers.update(b.page_number for b in blocks)
    numbers |= table_pages
    numbers |= figure_pages
    numbers.discard(0)
    if not numbers:
        numbers = {1}

    texts: dict[int, list[str]] = {pn: [] for pn in numbers}
    for b in blocks:
        texts.setdefault(b.page_number, []).append(b.text)

    pages: list[ExtractedPage] = []
    for pn in sorted(numbers):
        pages.append(
            ExtractedPage(
                page_number=pn,
                text="\n\n".join(texts.get(pn, [])),
                has_tables=pn in table_pages,
                has_figures=pn in figure_pages,
            )
        )
    return pages
def _page_of(item: dict[str, Any]) -> int:
    """Best-effort page number of a Docling item; 1 when none can be found.

    The first provenance entry (``prov`` / ``provenance``) is consulted before
    the item itself; in each, ``page_no``, ``page`` and ``page_number`` are tried.
    """
    provenance = item.get("prov") or item.get("provenance")
    sources: list[dict[str, Any]] = []
    if isinstance(provenance, list) and provenance and isinstance(provenance[0], dict):
        sources.append(provenance[0])
    sources.append(item)
    for source in sources:
        number = source.get("page_no") or source.get("page") or source.get("page_number")
        if isinstance(number, int):
            return number
    return 1
def _table_markdown(doc: Any, raw: dict[str, Any], idx: int) -> str:
    """Render one table as Markdown, preferring Docling's own exporter.

    Args:
        doc: The Docling document; ``export_table_to_markdown(idx)`` is used
            when the document offers it.
        raw: The table's dict from the exported payload.
        idx: Index of the table within the document.

    Returns:
        Markdown for the table, or ``""`` when no usable cell data is found.
    """
    # Try Docling's own export first (per-table).
    try:
        export = getattr(doc, "export_table_to_markdown", None)
        if callable(export):
            return export(idx)
    except Exception:  # noqa: BLE001
        pass

    data = raw.get("data")
    # Docling serializes a table's ``data`` as a dict that itself holds
    # ``grid`` / ``table_cells``, so look one level down as well as at the
    # top level (previously the nested shape always produced "").
    nested = data if isinstance(data, dict) else {}

    for grid in (data, raw.get("table_cells"), raw.get("grid"), nested.get("grid")):
        if isinstance(grid, list) and grid and isinstance(grid[0], list):
            return _grid_to_markdown(grid)
    for cells in (raw.get("table_cells"), nested.get("table_cells")):
        if isinstance(cells, list) and cells:
            return _cells_to_markdown(cells)
    return ""
def _grid_to_markdown(grid: list[list[Any]]) -> str:
    """Render a row-major grid as a Markdown table; the first row is the header.

    Cells may be plain values or dicts carrying ``text`` / ``value``. Pipes are
    escaped, short rows are padded and long rows are cut to the header width.
    """
    if not grid:
        return ""

    def _render(cell: Any) -> str:
        raw = (cell.get("text") or cell.get("value") or "") if isinstance(cell, dict) else cell
        return str(raw).replace("|", "\\|").strip()

    header, *body = grid
    width = len(header)
    lines = [
        "| " + " | ".join(_render(c) for c in header) + " |",
        "| " + " | ".join(["---"] * width) + " |",
    ]
    for row in body:
        rendered = [_render(c) for c in row][:width]
        rendered.extend([""] * (width - len(rendered)))
        lines.append("| " + " | ".join(rendered) + " |")
    return "\n".join(lines)
def _cells_to_markdown(cells: list[Any]) -> str:
    """Render a flat list of positioned cell dicts as a Markdown table.

    Each cell's position is read from ``start_row_offset_idx`` /
    ``start_col_offset_idx`` (falling back to ``row`` / ``col``, then 0).
    Non-dict entries are ignored. Rows that have no cells are omitted, and
    missing cells inside a row render as empty strings.

    Returns:
        The Markdown table, or ``""`` when there are no usable cells.
    """
    by_row: dict[int, dict[int, str]] = {}
    for cell in cells:
        if not isinstance(cell, dict):
            continue
        row_idx = cell.get("start_row_offset_idx", cell.get("row", 0)) or 0
        col_idx = cell.get("start_col_offset_idx", cell.get("col", 0)) or 0
        # Pipes are escaped by ``_grid_to_markdown``; escaping here as well
        # would double-escape them into ``\\|``.
        by_row.setdefault(row_idx, {})[col_idx] = (cell.get("text") or "").strip()
    if not by_row:
        return ""
    width = max(max(columns) for columns in by_row.values()) + 1
    grid = [
        [by_row[row_idx].get(col, "") for col in range(width)]
        for row_idx in sorted(by_row)
    ]
    return _grid_to_markdown(grid)
def _table_csv(raw: dict[str, Any]) -> str | None:
    """Render a table's cell grid as CSV text.

    The grid (a non-empty list of rows) is looked up in ``raw["data"]``,
    ``raw["grid"]`` and ``raw["data"]["grid"]``. The last form covers payloads
    where ``data`` is a dict wrapping the grid, which previously always
    yielded ``None``.

    Args:
        raw: The table's dict from the exported payload.

    Returns:
        CSV text (``csv.writer`` defaults), or ``None`` when no grid is found.
    """
    import csv
    import io

    data = raw.get("data")
    nested_grid = data.get("grid") if isinstance(data, dict) else None
    grid = None
    for candidate in (data, raw.get("grid"), nested_grid):
        if isinstance(candidate, list) and candidate and isinstance(candidate[0], list):
            grid = candidate
            break
    if grid is None:
        return None

    buf = io.StringIO()
    writer = csv.writer(buf)
    for row in grid:
        # Dict cells contribute their ``text``; falsy values become "".
        writer.writerow([(c.get("text") if isinstance(c, dict) else c) or "" for c in row])
    return buf.getvalue()

View File

@@ -0,0 +1,78 @@
"""Persists Docling figures to PostgreSQL + MinIO (caption + optional crop)."""
from __future__ import annotations
import uuid
from sqlalchemy import select
from app.db.models import ArtifactType, DocumentArtifact, Figure
from app.ingestion.docling_extractor import ExtractedFigure
from app.logging_config import get_logger
from app.storage.local_paths import key_figure_crop
from app.storage.minio_client import MinioStorage
logger = get_logger(__name__)
def persist_figures(
    db,
    storage: MinioStorage,
    document_id: uuid.UUID,
    figures: list[ExtractedFigure],
    page_id_by_number: dict[int, uuid.UUID],
) -> int:
    """Create or update one ``Figure`` row per extracted figure.

    Rows are matched on ``(document_id, figure_index)``. When a figure carries
    image bytes, the crop is uploaded to the derived bucket and registered as a
    ``FIGURE_CROP`` artifact. Returns the number of figures processed.
    """
    processed = 0
    for fig in figures:
        lookup = select(Figure).where(
            Figure.document_id == document_id, Figure.figure_index == fig.figure_index
        )
        row = db.execute(lookup).scalar_one_or_none()
        if row is None:
            row = Figure(
                document_id=document_id,
                page_id=page_id_by_number.get(fig.page_number),
                page_number=fig.page_number,
                figure_index=fig.figure_index,
            )
            db.add(row)

        row.caption = fig.caption
        if fig.caption:
            row.description = f"Figure on page {fig.page_number}. Caption: {fig.caption}"
        else:
            row.description = f"Figure detected on page {fig.page_number}."

        if fig.image_bytes:
            bucket = storage.derived_bucket
            crop_key = key_figure_crop(document_id, fig.page_number, fig.figure_index)
            storage.put_bytes(
                bucket=bucket,
                key=crop_key,
                data=fig.image_bytes,
                content_type=f"image/{fig.image_ext}",
            )
            row.storage_bucket = storage.derived_bucket
            row.storage_key = crop_key
            _ensure_artifact(
                db, document_id, ArtifactType.FIGURE_CROP, storage.derived_bucket, crop_key, fig.page_number
            )
        processed += 1
    return processed
def _ensure_artifact(db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, page: int | None) -> None:
    """Register a ``DocumentArtifact`` unless one with this key already exists.

    Existence is checked on ``(document_id, storage_key)`` only.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    if db.execute(lookup).scalar_one_or_none():
        return
    artifact = DocumentArtifact(
        document_id=document_id,
        artifact_type=artifact_type,
        storage_bucket=bucket,
        storage_key=key,
        page_number=page,
    )
    db.add(artifact)

View File

@@ -0,0 +1,12 @@
"""Block-level normalization wrappers around utils.text_cleaning."""
from __future__ import annotations
from app.utils.text_cleaning import clean_ocr_text, normalize_for_search
def normalize_block(text: str) -> tuple[str, str]:
    """Clean ``text`` and derive its search form.

    Returns ``(display_text, normalized_text)``; the normalized text is
    computed from the cleaned display text, not from the raw input.
    """
    cleaned = clean_ocr_text(text)
    return cleaned, normalize_for_search(cleaned)

87
app/ingestion/ocr.py Normal file
View File

@@ -0,0 +1,87 @@
"""OCRmyPDF integration with Tesseract.
We treat OCR as best-effort: if the input PDF already has a text layer (or OCR is
disabled by config), we skip OCR and use the original PDF. On failure, the
caller is expected to mark the document ``OCR_FAILED`` and continue without it.
"""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
import ocrmypdf
from app.config import settings
from app.logging_config import get_logger
from app.utils.pdf import has_searchable_text
logger = get_logger(__name__)
@dataclass
class OcrResult:
    """Outcome of ``run_ocr``."""

    # PDF that downstream steps should read (OCR output or a copy of the input).
    output_path: Path
    # True when OCR was not performed and the input was copied instead.
    skipped: bool
    # Machine-readable outcome, e.g. "ocr_completed", "ocr_disabled",
    # "already_searchable", "prior_ocr_found", "digitally_signed".
    reason: str
    # Language string that was (or would have been) passed to OCRmyPDF.
    languages: str
def run_ocr(input_pdf: Path, output_pdf: Path, languages: str | None = None) -> OcrResult:
    """Run OCRmyPDF on ``input_pdf``, writing the result to ``output_pdf``.

    - If OCR is disabled in settings, or the input already has searchable
      text, OCR is skipped and the input is copied to ``output_pdf`` so that
      downstream code always finds a file there.
    - A prior-OCR or digitally-signed PDF is likewise treated as a skip.
    - Encrypted PDFs, missing dependencies and any other error are re-raised
      (the caller handles the status update).
    """
    langs = languages or settings.ocr_languages

    skip_reason: str | None = None
    if not settings.ocr_enabled:
        skip_reason = "ocr_disabled"
    elif has_searchable_text(input_pdf):
        skip_reason = "already_searchable"
    if skip_reason is not None:
        return _skip(input_pdf, output_pdf, langs, skip_reason)

    output_pdf.parent.mkdir(parents=True, exist_ok=True)
    logger.info("ocr.start", input=str(input_pdf), output=str(output_pdf), languages=langs)

    ocr_options = {
        "input_file": str(input_pdf),
        "output_file": str(output_pdf),
        "language": langs,
        "skip_text": False,
        "redo_ocr": False,
        "force_ocr": False,
        "deskew": settings.ocr_deskew,
        "clean": settings.ocr_clean,
        "optimize": settings.ocr_optimize,
        "progress_bar": False,
        "jobs": 1,
        "output_type": "pdf",
        # OCRmyPDF's skip_big: do not OCR pages larger than this many megapixels.
        "skip_big": 200.0,
    }
    try:
        ocrmypdf.ocr(**ocr_options)
    except ocrmypdf.exceptions.PriorOcrFoundError:
        logger.info("ocr.skip.prior_ocr", input=str(input_pdf))
        return _skip(input_pdf, output_pdf, langs, "prior_ocr_found")
    except ocrmypdf.exceptions.DigitalSignatureError:
        logger.warning("ocr.skip.signed_pdf", input=str(input_pdf))
        return _skip(input_pdf, output_pdf, langs, "digitally_signed")
    except ocrmypdf.exceptions.EncryptedPdfError as exc:
        logger.warning("ocr.encrypted", input=str(input_pdf), error=str(exc))
        raise
    except ocrmypdf.exceptions.MissingDependencyError as exc:
        logger.error("ocr.missing_dependency", error=str(exc))
        raise

    logger.info("ocr.done", output=str(output_pdf))
    return OcrResult(output_path=output_pdf, skipped=False, reason="ocr_completed", languages=langs)
def _skip(input_pdf: Path, output_pdf: Path, langs: str, reason: str) -> OcrResult:
    """Produce ``output_pdf`` as a copy of ``input_pdf`` without running OCR.

    Args:
        input_pdf: Source PDF.
        output_pdf: Destination path; parent directories are created.
        langs: Language string recorded in the result.
        reason: Why OCR was skipped; recorded in the result.

    Returns:
        An ``OcrResult`` with ``skipped=True`` pointing at ``output_pdf``.
    """
    import shutil

    output_pdf.parent.mkdir(parents=True, exist_ok=True)
    # Nothing to copy when the output already exists and is the input itself.
    if not output_pdf.exists() or output_pdf.resolve() != input_pdf.resolve():
        # Stream the copy instead of loading the whole PDF into memory.
        shutil.copyfile(input_pdf, output_pdf)
    return OcrResult(output_path=output_pdf, skipped=True, reason=reason, languages=langs)

384
app/ingestion/pipeline.py Normal file
View File

@@ -0,0 +1,384 @@
"""Per-document end-to-end pipeline: OCR -> Docling -> chunk -> persist -> index.
Called by the Celery worker. Idempotent: re-running on the same document deletes
existing chunks for that document and re-creates them, then re-indexes in
OpenSearch and Qdrant.
"""
from __future__ import annotations
import json
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from sqlalchemy import delete, select
from app.config import settings
from app.db.models import (
ArtifactType,
Chunk,
Document,
DocumentArtifact,
DocumentStatus,
Page,
ProcessingEvent,
)
from app.db.session import session_scope
from app.indexing import opensearch_client, qdrant_client
from app.indexing.embeddings import get_embedder
from app.ingestion.chunker import ChunkRecord, chunk_extraction
from app.ingestion.docling_extractor import ExtractionResult, extract
from app.ingestion.figure_processor import persist_figures
from app.ingestion.ocr import run_ocr
from app.ingestion.table_processor import persist_tables
from app.logging_config import get_logger
from app.storage.local_paths import (
key_docling_json,
key_markdown,
key_ocr_pdf,
work_dir_for,
)
from app.storage.minio_client import get_storage
from app.utils.language import detect_language
logger = get_logger(__name__)
def process_document_id(document_id: uuid.UUID, run_id: uuid.UUID | None = None) -> dict[str, Any]:
    """Run the full ingestion pipeline for one document (Celery entry point).

    Expected stage failures (OCR, Docling, indexing) are recorded through
    ``_fail`` and reported in the returned dict. Any other exception, such as
    a storage upload or database error, also marks the document ``FAILED``
    before being re-raised, so a crashed run does not leave the document in an
    in-progress status.

    Args:
        document_id: Primary key of the ``Document`` to process.
        run_id: Optional run identifier attached to emitted processing events.

    Returns:
        ``{"status": ..., "chunks": n}`` on success, ``{"status": "missing"}``
        when the document row does not exist, or the dict produced by ``_fail``.

    Raises:
        Exception: Any unexpected pipeline error, re-raised unchanged after the
            document has been marked ``FAILED``.
    """
    try:
        return _run_pipeline(document_id, run_id)
    except Exception as exc:  # noqa: BLE001 - last-resort boundary for the worker
        logger.exception("pipeline.unhandled_error", document_id=str(document_id))
        try:
            _fail(document_id, run_id, DocumentStatus.FAILED, f"Pipeline failed: {exc}")
        except Exception:  # noqa: BLE001 - never mask the original error
            logger.exception("pipeline.mark_failed_error", document_id=str(document_id))
        raise


def _run_pipeline(document_id: uuid.UUID, run_id: uuid.UUID | None) -> dict[str, Any]:
    """Execute OCR -> Docling -> chunk -> persist -> index for one document."""
    import shutil

    storage = get_storage()
    storage.ensure_buckets()

    with session_scope() as db:
        doc = db.get(Document, document_id)
        if doc is None:
            logger.warning("pipeline.document_missing", document_id=str(document_id))
            return {"status": "missing"}
        source_path = Path(doc.source_path)
        sha = doc.sha256
        original_artifact = db.execute(
            select(DocumentArtifact).where(
                DocumentArtifact.document_id == doc.id,
                DocumentArtifact.artifact_type == ArtifactType.ORIGINAL_PDF,
            )
        ).scalar_one_or_none()
        # Copy plain values out while the session is open so nothing below
        # depends on an ORM instance that may be detached or expired.
        original_location = (
            (original_artifact.storage_bucket, original_artifact.storage_key)
            if original_artifact is not None
            else None
        )

    work_dir = work_dir_for(document_id)
    local_pdf = work_dir / f"{sha}.pdf"
    if not local_pdf.exists():
        if source_path.exists():
            # Stream the copy rather than reading the whole PDF into memory.
            shutil.copyfile(source_path, local_pdf)
        elif original_location is not None:
            bucket, key = original_location
            storage.get_to_path(bucket, key, local_pdf)
        else:
            return _fail(
                document_id,
                run_id,
                DocumentStatus.OCR_FAILED,
                "Original PDF not available locally or in MinIO",
            )

    # ---------------- OCR ----------------
    ocr_pdf = work_dir / "ocr.pdf"
    try:
        _emit_event(document_id, run_id, DocumentStatus.OCR_STARTED, "OCR started")
        ocr_result = run_ocr(local_pdf, ocr_pdf, languages=settings.ocr_languages)
    except Exception as exc:  # noqa: BLE001
        logger.exception("pipeline.ocr_failed", document_id=str(document_id))
        return _fail(document_id, run_id, DocumentStatus.OCR_FAILED, f"OCR failed: {exc}")

    # Upload OCR PDF (even if we 'skipped' it - OCR PDF is the canonical input to Docling).
    ocr_key = key_ocr_pdf(document_id)
    storage.put_file(
        bucket=storage.derived_bucket,
        key=ocr_key,
        path=ocr_result.output_path,
        content_type="application/pdf",
    )
    with session_scope() as db:
        _ensure_artifact(db, document_id, ArtifactType.OCR_PDF, storage.derived_bucket, ocr_key)
        doc = db.get(Document, document_id)
        if doc is not None:
            doc.status = DocumentStatus.OCR_COMPLETED
        db.add(
            ProcessingEvent(
                run_id=run_id,
                document_id=document_id,
                stage=DocumentStatus.OCR_COMPLETED,
                level="INFO",
                message=f"OCR finished ({ocr_result.reason})",
                data={"skipped": ocr_result.skipped, "languages": ocr_result.languages},
            )
        )

    # ---------------- Docling ----------------
    try:
        _emit_event(document_id, run_id, DocumentStatus.EXTRACTION_STARTED, "Docling extraction started")
        extraction = extract(ocr_result.output_path)
    except Exception as exc:  # noqa: BLE001
        logger.exception("pipeline.docling_failed", document_id=str(document_id))
        return _fail(document_id, run_id, DocumentStatus.EXTRACTION_FAILED, f"Docling failed: {exc}")

    # Persist Markdown + JSON to MinIO.
    md_key = key_markdown(document_id)
    json_key = key_docling_json(document_id)
    storage.put_bytes(
        bucket=storage.derived_bucket,
        key=md_key,
        data=extraction.markdown.encode("utf-8"),
        content_type="text/markdown",
    )
    storage.put_bytes(
        bucket=storage.derived_bucket,
        key=json_key,
        data=json.dumps(extraction.json_payload, ensure_ascii=False).encode("utf-8"),
        content_type="application/json",
    )

    # ---------------- Persist pages, chunks, tables, figures ----------------
    chunk_records = chunk_extraction(extraction)
    # Language is guessed from the text of the first three pages only.
    sample_text = "\n".join(p.text for p in extraction.pages[:3] if p.text)
    lang = detect_language(sample_text)

    with session_scope() as db:
        _ensure_artifact(db, document_id, ArtifactType.MARKDOWN, storage.derived_bucket, md_key)
        _ensure_artifact(db, document_id, ArtifactType.DOCLING_JSON, storage.derived_bucket, json_key)
        doc = db.get(Document, document_id)
        if doc is None:
            return {"status": "missing"}
        doc.status = DocumentStatus.EXTRACTION_COMPLETED
        # An existing language hint is never overwritten.
        if lang and not doc.language_hint:
            doc.language_hint = lang

        page_id_by_number = _upsert_pages(db, document_id, extraction)
        persist_tables(db, storage, document_id, extraction.tables, page_id_by_number)
        persist_figures(db, storage, document_id, extraction.figures, page_id_by_number)

        # Replace chunks idempotently: drop all and re-insert.
        db.execute(delete(Chunk).where(Chunk.document_id == document_id))
        for cr in chunk_records:
            db.add(_to_chunk_row(document_id, page_id_by_number, cr))

        doc.status = DocumentStatus.CHUNKING_COMPLETED
        db.add(
            ProcessingEvent(
                run_id=run_id,
                document_id=document_id,
                stage=DocumentStatus.CHUNKING_COMPLETED,
                level="INFO",
                message="Chunking complete",
                data={"chunks": len(chunk_records)},
            )
        )

    # ---------------- Indexing (OpenSearch + Qdrant) ----------------
    try:
        opensearch_client.ensure_index()
        qdrant_client.ensure_collection()
        # Remove entries from a previous run before re-indexing.
        opensearch_client.delete_by_document(str(document_id))
        qdrant_client.delete_by_document(str(document_id))

        os_docs, qdrant_points = _build_index_payloads(document_id, chunk_records, extraction, lang)
        if os_docs:
            opensearch_client.index_chunks(os_docs)
        if qdrant_points:
            embedder = get_embedder()
            texts_to_embed = [text for _, text, _ in qdrant_points]
            vectors = embedder.encode(texts_to_embed)
            triples = [
                (chunk_id, vec, payload)
                for (chunk_id, _text, payload), vec in zip(qdrant_points, vectors, strict=True)
            ]
            qdrant_client.upsert_chunks(triples)
    except Exception as exc:  # noqa: BLE001
        logger.exception("pipeline.indexing_failed", document_id=str(document_id))
        return _fail(document_id, run_id, DocumentStatus.FAILED, f"Indexing failed: {exc}")

    with session_scope() as db:
        doc = db.get(Document, document_id)
        if doc is not None:
            doc.status = DocumentStatus.INDEXING_COMPLETED
            doc.error_message = None
        db.add(
            ProcessingEvent(
                run_id=run_id,
                document_id=document_id,
                stage=DocumentStatus.INDEXING_COMPLETED,
                level="INFO",
                message="Indexing complete",
                data={"chunks": len(chunk_records)},
            )
        )
    return {"status": DocumentStatus.INDEXING_COMPLETED, "chunks": len(chunk_records)}
# ---------------- helpers ----------------
def _to_chunk_row(
    document_id: uuid.UUID, page_id_by_number: dict[int, uuid.UUID], cr: ChunkRecord
) -> Chunk:
    """Map a ``ChunkRecord`` onto a new (unsaved) ``Chunk`` ORM row.

    The page id is looked up by page number and is ``None`` when the page is
    unknown. ``ocr_confidence`` is always stored as ``None`` here.
    """
    fields = {
        "document_id": document_id,
        "page_id": page_id_by_number.get(cr.page_number),
        "page_number": cr.page_number,
        "block_id": cr.block_id,
        "chunk_index": cr.chunk_index,
        "block_type": cr.block_type,
        "text": cr.text,
        "normalized_text": cr.normalized_text,
        "token_count": cr.token_count,
        "ocr_confidence": None,
        "quality_flags": cr.quality_flags,
        "chunk_metadata": cr.metadata,
    }
    return Chunk(**fields)
def _upsert_pages(db, document_id: uuid.UUID, extraction: ExtractionResult) -> dict[int, uuid.UUID]:
    """Insert or refresh ``Page`` rows and return ``{page_number: page_id}``.

    New pages are flushed immediately so their ids are available. Existing
    pages get their text and content flags refreshed; their stored
    ``ocr_confidence`` is left untouched.
    """
    known: dict[int, Page] = {}
    for row in db.execute(select(Page).where(Page.document_id == document_id)).scalars():
        known[row.page_number] = row

    ids: dict[int, uuid.UUID] = {}
    for ep in extraction.pages:
        page = known.get(ep.page_number)
        if page is not None:
            page.text = ep.text
            page.has_tables = ep.has_tables
            page.has_figures = ep.has_figures
            page.has_handwriting = ep.has_handwriting
        else:
            page = Page(
                document_id=document_id,
                page_number=ep.page_number,
                text=ep.text,
                ocr_confidence=ep.ocr_confidence,
                has_tables=ep.has_tables,
                has_figures=ep.has_figures,
                has_handwriting=ep.has_handwriting,
            )
            db.add(page)
            db.flush()
        ids[ep.page_number] = page.id
    return ids
def _build_index_payloads(
    document_id: uuid.UUID,
    chunks: list[ChunkRecord],
    extraction: ExtractionResult,
    language_hint: str | None,
) -> tuple[list[dict[str, Any]], list[tuple[str, str, dict[str, Any]]]]:
    """Build the OpenSearch documents and Qdrant points for a document.

    Payloads are built from the ``Chunk`` rows stored in the database (so they
    carry the persisted chunk ids); ``chunks`` and ``extraction`` are accepted
    but not read. Returns ``(os_docs, qdrant_points)`` where each Qdrant point
    is ``(chunk_id, text_to_embed, payload)``; both are empty when the document
    row is missing.
    """
    os_docs: list[dict[str, Any]] = []
    qdrant: list[tuple[str, str, dict[str, Any]]] = []

    with session_scope() as db:
        doc = db.get(Document, document_id)
        if doc is None:
            return [], []
        shared = {
            "document_id": str(document_id),
            "source_path": doc.source_path,
            "original_file_name": doc.original_file_name,
        }
        stored_chunks = db.execute(select(Chunk).where(Chunk.document_id == document_id)).scalars().all()

        for row in stored_chunks:
            chunk_id = str(row.id)
            body = row.text or ""
            created = row.created_at or datetime.now(tz=timezone.utc)
            os_docs.append(
                {
                    "chunk_id": chunk_id,
                    **shared,
                    "page_number": row.page_number,
                    "block_type": row.block_type,
                    "block_id": row.block_id,
                    "text": body,
                    "normalized_text": row.normalized_text,
                    "ocr_confidence": row.ocr_confidence,
                    "language_hint": language_hint,
                    "metadata": row.chunk_metadata or {},
                    "quality_flags": row.quality_flags or {},
                    "created_at": created.isoformat(),
                }
            )
            point_payload = {
                **shared,
                "page_number": row.page_number,
                "block_type": row.block_type,
                "block_id": row.block_id,
                # Only a 512-character preview is stored with the vector.
                "text_preview": body[:512],
                "ocr_confidence": row.ocr_confidence,
                "quality_flags": row.quality_flags or {},
                "metadata": row.chunk_metadata or {},
            }
            qdrant.append((chunk_id, body, point_payload))
    return os_docs, qdrant
def _ensure_artifact(db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str) -> None:
    """Register a ``DocumentArtifact`` unless one with this key already exists.

    Existence is checked on ``(document_id, storage_key)`` only.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    if db.execute(lookup).scalar_one_or_none():
        return
    artifact = DocumentArtifact(
        document_id=document_id,
        artifact_type=artifact_type,
        storage_bucket=bucket,
        storage_key=key,
    )
    db.add(artifact)
def _emit_event(document_id: uuid.UUID, run_id: uuid.UUID | None, stage: str, message: str) -> None:
    """Record an INFO-level ``ProcessingEvent`` in its own session."""
    event = ProcessingEvent(
        run_id=run_id,
        document_id=document_id,
        stage=stage,
        level="INFO",
        message=message,
        data={},
    )
    with session_scope() as db:
        db.add(event)
def _fail(
    document_id: uuid.UUID, run_id: uuid.UUID | None, stage: str, message: str
) -> dict[str, Any]:
    """Mark the document as failed at ``stage`` and record an ERROR event.

    The document's ``error_message`` keeps only the first 2000 characters of
    ``message``; the event and the returned dict carry it in full.
    """
    error_event = ProcessingEvent(
        run_id=run_id,
        document_id=document_id,
        stage=stage,
        level="ERROR",
        message=message,
        data={},
    )
    with session_scope() as db:
        doc = db.get(Document, document_id)
        if doc is not None:
            doc.status = stage
            doc.error_message = message[:2000]
        db.add(error_event)
    logger.error("pipeline.failed", document_id=str(document_id), stage=stage, message=message)
    return {"status": stage, "error": message}

41
app/ingestion/quality.py Normal file
View File

@@ -0,0 +1,41 @@
"""Quality flag computation for chunks."""
from __future__ import annotations
from typing import Any
from app.utils.text_cleaning import looks_garbled
# OCR confidence strictly below this value sets ``low_ocr_confidence``.
LOW_OCR_CONFIDENCE_THRESHOLD = 0.6
# Non-empty text whose stripped length is below this sets ``very_short_text``.
SHORT_TEXT_THRESHOLD = 24
def compute_quality_flags(
    *,
    text: str,
    block_type: str,
    ocr_confidence: float | None,
    has_handwriting: bool = False,
) -> dict[str, Any]:
    """Compute the quality flags stored alongside a chunk.

    ``needs_manual_review`` is raised when the chunk has low OCR confidence,
    looks garbled, or involves handwriting. Empty text is not flagged as
    very short.
    """
    low_confidence = bool(
        ocr_confidence is not None and ocr_confidence < LOW_OCR_CONFIDENCE_THRESHOLD
    )
    very_short = bool(text) and len(text.strip()) < SHORT_TEXT_THRESHOLD
    garbled = bool(looks_garbled(text))
    handwriting = has_handwriting or block_type == "handwriting"
    return {
        "low_ocr_confidence": low_confidence,
        "very_short_text": very_short,
        "possible_garbled_text": garbled,
        "table_detected": block_type == "table",
        "figure_detected": block_type in ("figure_caption", "figure_description"),
        "handwriting_detected": handwriting,
        "needs_manual_review": bool(low_confidence or garbled or handwriting),
    }

184
app/ingestion/scanner.py Normal file
View File

@@ -0,0 +1,184 @@
"""Folder scanner: discovers PDFs, deduplicates by SHA256, persists discovery rows.
The scanner does NOT trigger OCR or extraction. It only:
- enumerates PDF files,
- hashes each file,
- creates / reuses a ``Document`` row,
- uploads the original PDF to MinIO,
- emits ``DISCOVERED`` / ``STORED_ORIGINAL`` events.
Heavy work (OCR, Docling, indexing) is performed by the Celery worker pipeline.
"""
from __future__ import annotations
import os
import uuid
from collections.abc import Iterator
from dataclasses import dataclass
from pathlib import Path
from sqlalchemy import select
from app.db.models import (
ArtifactType,
Document,
DocumentArtifact,
DocumentStatus,
ProcessingEvent,
)
from app.db.session import session_scope
from app.logging_config import get_logger
from app.storage.local_paths import key_original_pdf
from app.storage.minio_client import get_storage
from app.utils.hashing import sha256_file
from app.utils.pdf import is_pdf
logger = get_logger(__name__)
@dataclass
class DiscoveryRecord:
    """Result of scanning one candidate PDF file."""

    # Path of the scanned file.
    path: Path
    # SHA-256 of the file; None when the file could not be read or hashed.
    sha256: str | None
    # Id of the matching Document row; None for invalid or failed entries.
    document_id: uuid.UUID | None
    # True when a Document with the same hash already existed.
    duplicate: bool
    # True when hashing the file or storing the original failed.
    invalid: bool = False
def iter_pdf_files(root: Path, recursive: bool = True) -> Iterator[Path]:
    """Yield the PDF files found at or under ``root``.

    ``root`` may be a single file, which is yielded only if it is a PDF.
    For a directory, ``recursive`` selects a full tree walk or a scan of
    its direct entries only.
    """
    if root.is_file():
        if is_pdf(root):
            yield root
        return

    if not recursive:
        yield from (entry for entry in root.iterdir() if is_pdf(entry))
        return

    for dirpath, _dirnames, filenames in os.walk(root):
        base = Path(dirpath)
        for name in filenames:
            candidate = base / name
            if is_pdf(candidate):
                yield candidate
def discover_documents(
    root: Path, recursive: bool = True, force: bool = False
) -> Iterator[DiscoveryRecord]:
    """Scan ``root`` for PDFs, register them and upload the originals.

    Each file is hashed and matched against existing documents by SHA-256.
    Every record is yielded only after its database session has been closed,
    so consumers never see a document id from an unfinished transaction and
    stopping the generator early cannot interrupt a session mid-way.

    Args:
        root: File or directory to scan.
        recursive: Descend into sub-directories when ``root`` is a directory.
        force: Re-process files whose hash is already known instead of
            reporting them as duplicates only.

    Yields:
        One ``DiscoveryRecord`` per candidate file, including invalid ones.
    """
    storage = get_storage()
    storage.ensure_buckets()

    for path in iter_pdf_files(root, recursive=recursive):
        try:
            stat = path.stat()
            sha = sha256_file(path)
        except Exception as exc:  # noqa: BLE001
            logger.warning("scan.invalid_file", path=str(path), error=str(exc))
            yield DiscoveryRecord(path=path, sha256=None, document_id=None, duplicate=False, invalid=True)
            continue

        with session_scope() as db:
            record = _discover_one(db, storage, path, sha, stat.st_size, force)
        yield record


def _discover_one(db, storage, path: Path, sha: str, size: int, force: bool) -> DiscoveryRecord:
    """Register one hashed PDF inside the open session ``db`` and build its record."""
    existing = db.execute(select(Document).where(Document.sha256 == sha)).scalar_one_or_none()
    if existing and not force:
        logger.debug("scan.duplicate", path=str(path), sha256=sha, document_id=str(existing.id))
        return DiscoveryRecord(path=path, sha256=sha, document_id=existing.id, duplicate=True)

    doc = existing or Document(
        id=uuid.uuid4(),
        source_path=str(path),
        original_file_name=path.name,
        sha256=sha,
        file_size_bytes=size,
        mime_type="application/pdf",
        status=DocumentStatus.DISCOVERED,
    )
    if not existing:
        db.add(doc)
        db.flush()
    db.add(
        ProcessingEvent(
            document_id=doc.id,
            stage=DocumentStatus.DISCOVERED,
            level="INFO",
            message="Document discovered",
            data={"sha256": sha, "size": size, "path": str(path)},
        )
    )

    # Upload original (idempotent) and record artifact if missing.
    key = key_original_pdf(doc.id, sha)
    try:
        if not storage.exists(storage.originals_bucket, key):
            storage.put_file(
                bucket=storage.originals_bucket,
                key=key,
                path=path,
                content_type="application/pdf",
                metadata={"sha256": sha, "original-name": path.name[:255]},
            )
        _ensure_artifact(
            db,
            doc.id,
            ArtifactType.ORIGINAL_PDF,
            storage.originals_bucket,
            key,
            sha,
        )
        # Only advance the status of freshly discovered documents.
        if doc.status == DocumentStatus.DISCOVERED:
            doc.status = DocumentStatus.STORED_ORIGINAL
        db.add(
            ProcessingEvent(
                document_id=doc.id,
                stage=DocumentStatus.STORED_ORIGINAL,
                level="INFO",
                message="Original stored to MinIO",
                data={"bucket": storage.originals_bucket, "key": key},
            )
        )
    except Exception as exc:  # noqa: BLE001
        logger.error("scan.store_failed", path=str(path), error=str(exc))
        doc.status = DocumentStatus.FAILED
        doc.error_message = f"store_original: {exc}"
        db.add(
            ProcessingEvent(
                document_id=doc.id,
                stage="STORE_FAILED",
                level="ERROR",
                message=str(exc),
                data={"path": str(path)},
            )
        )
        return DiscoveryRecord(path=path, sha256=sha, document_id=None, duplicate=False, invalid=True)

    # Read the id while the session is still open.
    return DiscoveryRecord(path=path, sha256=sha, document_id=doc.id, duplicate=bool(existing))
def _ensure_artifact(
    db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, checksum: str | None
) -> None:
    """Register a ``DocumentArtifact`` with its checksum unless it already exists.

    Existence is checked on ``(document_id, artifact_type, storage_key)``.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.artifact_type == artifact_type,
        DocumentArtifact.storage_key == key,
    )
    if db.execute(lookup).scalar_one_or_none():
        return
    artifact = DocumentArtifact(
        document_id=document_id,
        artifact_type=artifact_type,
        storage_bucket=bucket,
        storage_key=key,
        checksum=checksum,
    )
    db.add(artifact)

View File

@@ -0,0 +1,84 @@
"""Persists Docling tables to PostgreSQL + MinIO."""
from __future__ import annotations
import json
import uuid
from sqlalchemy import select
from app.db.models import ArtifactType, DocumentArtifact, Table
from app.ingestion.docling_extractor import ExtractedTable
from app.logging_config import get_logger
from app.storage.local_paths import key_table_json
from app.storage.minio_client import MinioStorage
logger = get_logger(__name__)
def persist_tables(
    db,
    storage: MinioStorage,
    document_id: uuid.UUID,
    tables: list[ExtractedTable],
    page_id_by_number: dict[int, uuid.UUID],
) -> int:
    """Create or update one ``Table`` row per extracted table.

    Rows are matched on ``(document_id, table_index)`` and flushed after each
    update. When a table carries JSON data, a copy is uploaded to the derived
    bucket and registered as a ``TABLE_JSON`` artifact. Returns the number of
    tables processed.
    """
    processed = 0
    for tbl in tables:
        lookup = select(Table).where(
            Table.document_id == document_id, Table.table_index == tbl.table_index
        )
        row = db.execute(lookup).scalar_one_or_none()
        if row is None:
            row = Table(
                document_id=document_id,
                page_id=page_id_by_number.get(tbl.page_number),
                page_number=tbl.page_number,
                table_index=tbl.table_index,
            )
            db.add(row)

        row.markdown = tbl.markdown or ""
        row.csv_text = tbl.csv_text
        row.json_data = tbl.json_data
        row.summary = _summary(tbl)
        db.flush()

        # Persist json blob to MinIO for large/inspectable copies.
        if tbl.json_data:
            blob_key = key_table_json(document_id, tbl.table_index)
            blob = json.dumps(tbl.json_data, ensure_ascii=False).encode("utf-8")
            storage.put_bytes(
                bucket=storage.derived_bucket,
                key=blob_key,
                data=blob,
                content_type="application/json",
            )
            _ensure_artifact(
                db, document_id, ArtifactType.TABLE_JSON, storage.derived_bucket, blob_key, tbl.page_number
            )
        processed += 1
    return processed
def _summary(t: ExtractedTable) -> str:
    """Build the one-line description stored alongside a table."""
    pipe_lines = [ln for ln in (t.markdown or "").splitlines() if ln.startswith("|")]
    # Two pipe lines are discounted - presumably the markdown header and
    # separator rows - and the result never goes below zero.
    n_rows = len(pipe_lines) - 2
    if n_rows < 0:
        n_rows = 0
    return f"Table {t.table_index} on page {t.page_number} ({n_rows} rows)."
def _ensure_artifact(db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, page: int | None) -> None:
    """Register a ``DocumentArtifact`` for ``key`` unless the document already has one.

    The lookup matches on ``document_id`` and ``storage_key`` only;
    ``artifact_type`` is not part of the match. An existing row is left
    untouched. The new row is added to the session without flushing.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    already_registered = db.execute(lookup).scalar_one_or_none()
    if not already_registered:
        artifact = DocumentArtifact(
            document_id=document_id,
            artifact_type=artifact_type,
            storage_bucket=bucket,
            storage_key=key,
            page_number=page,
        )
        db.add(artifact)

61
app/logging_config.py Normal file
View File

@@ -0,0 +1,61 @@
"""Structured logging via structlog with stdlib bridge.
All modules use ``get_logger(__name__)`` and emit key/value pairs.
"""
from __future__ import annotations
import logging
import sys
from typing import Any
import structlog
from app.config import settings
def configure_logging() -> None:
    """Send structlog and stdlib log records through one JSON handler on stdout.

    The root logger's existing handlers are replaced, so calling this more
    than once does not stack handlers. The level comes from
    ``settings.app_log_level`` and falls back to INFO for unknown names.
    """
    level = getattr(logging, settings.app_log_level.upper(), logging.INFO)

    # Processors shared by structlog-native events and foreign stdlib records.
    shared_processors: list[Any] = [
        structlog.contextvars.merge_contextvars,
        structlog.stdlib.add_log_level,
        structlog.stdlib.add_logger_name,
        structlog.processors.TimeStamper(fmt="iso", utc=True),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
    ]

    structlog.configure(
        processors=[*shared_processors, structlog.stdlib.ProcessorFormatter.wrap_for_formatter],
        logger_factory=structlog.stdlib.LoggerFactory(),
        wrapper_class=structlog.stdlib.BoundLogger,
        cache_logger_on_first_use=True,
    )

    json_formatter = structlog.stdlib.ProcessorFormatter(
        foreign_pre_chain=shared_processors,
        processors=[
            structlog.stdlib.ProcessorFormatter.remove_processors_meta,
            structlog.processors.JSONRenderer(),
        ],
    )
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(json_formatter)

    root_logger = logging.getLogger()
    root_logger.handlers.clear()
    root_logger.addHandler(stdout_handler)
    root_logger.setLevel(level)

    # Chatty third-party loggers are capped at WARNING.
    for library in ("urllib3", "botocore", "s3transfer", "elasticsearch", "opensearch", "httpx"):
        logging.getLogger(library).setLevel(logging.WARNING)
def get_logger(name: str | None = None) -> structlog.stdlib.BoundLogger:
    """Return the structlog logger obtained for ``name``."""
    bound_logger = structlog.get_logger(name)
    return bound_logger

52
app/main.py Normal file
View File

@@ -0,0 +1,52 @@
"""FastAPI entrypoint."""
from __future__ import annotations
from contextlib import asynccontextmanager
from typing import AsyncIterator
from fastapi import FastAPI
from app import __version__
from app.api import routes_health, routes_ingestion, routes_search
from app.config import settings
from app.logging_config import configure_logging, get_logger
configure_logging()
logger = get_logger(__name__)
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
    """Log startup and shutdown, and try to create the MinIO buckets on startup.

    Bucket bootstrap is best-effort: any failure is logged as a warning and
    startup continues.
    """
    logger.info("api.startup", version=__version__, prefix=settings.app_api_prefix)
    try:
        # Imported inside the guarded block so an import failure is also non-fatal.
        from app.storage.minio_client import get_storage

        storage = get_storage()
        storage.ensure_buckets()
    except Exception as exc:  # noqa: BLE001
        logger.warning("api.startup.minio_bootstrap_failed", error=str(exc))

    yield

    logger.info("api.shutdown")
# ASGI application object; ``lifespan`` supplies the startup/shutdown hooks.
app = FastAPI(
    title="LegacyHUB",
    description="Hybrid lexical + semantic search over legacy PDF archives",
    version=__version__,
    lifespan=lifespan,
)
# Mount the health, ingestion and search routers under the configured API prefix.
app.include_router(routes_health.router, prefix=settings.app_api_prefix)
app.include_router(routes_ingestion.router, prefix=settings.app_api_prefix)
app.include_router(routes_search.router, prefix=settings.app_api_prefix)
@app.get("/")
def root() -> dict[str, str]:
    """Return a small service descriptor: name, version, API prefix and docs path."""
    return dict(
        service="LegacyHUB",
        version=__version__,
        api=settings.app_api_prefix,
        docs="/docs",
    )

3
app/storage/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
from app.storage.minio_client import MinioStorage, get_storage
__all__ = ["MinioStorage", "get_storage"]

View File

@@ -0,0 +1,42 @@
"""Storage key conventions for MinIO and local working paths."""
from __future__ import annotations
import uuid
from pathlib import Path
from app.config import settings
def work_dir_for(document_id: uuid.UUID | str) -> Path:
    """Return the per-document working directory, creating it if necessary.

    The directory is ``settings.app_work_dir / <document_id>``; missing
    parents are created and an existing directory is accepted.
    """
    target = Path(settings.app_work_dir).joinpath(str(document_id))
    target.mkdir(parents=True, exist_ok=True)
    return target
def key_original_pdf(document_id: uuid.UUID | str, sha256: str) -> str:
    """Object key of the uploaded PDF, named after its SHA-256 digest."""
    filename = f"{sha256}.pdf"
    return f"docs/{document_id}/original/{filename}"
def key_ocr_pdf(document_id: uuid.UUID | str) -> str:
    """Object key of the document's OCR-processed PDF."""
    document_prefix = f"docs/{document_id}"
    return f"{document_prefix}/ocr/ocr.pdf"
def key_docling_json(document_id: uuid.UUID | str) -> str:
    """Object key of the document's Docling JSON export."""
    document_prefix = f"docs/{document_id}"
    return f"{document_prefix}/docling/document.json"
def key_markdown(document_id: uuid.UUID | str) -> str:
    """Object key of the document's Markdown export."""
    document_prefix = f"docs/{document_id}"
    return f"{document_prefix}/docling/document.md"
def key_page_image(document_id: uuid.UUID | str, page_number: int) -> str:
    """Object key of a rendered page image; the page number is zero-padded to 5 digits."""
    filename = f"p{page_number:05d}.png"
    return f"docs/{document_id}/pages/{filename}"
def key_figure_crop(document_id: uuid.UUID | str, page_number: int, figure_index: int) -> str:
    """Object key of a cropped figure: 5-digit page number plus 3-digit figure index."""
    filename = f"p{page_number:05d}_f{figure_index:03d}.png"
    return f"docs/{document_id}/figures/{filename}"
def key_table_json(document_id: uuid.UUID | str, table_index: int) -> str:
    """Object key of a table's JSON blob; the table index is zero-padded to 4 digits."""
    filename = f"t{table_index:04d}.json"
    return f"docs/{document_id}/tables/{filename}"

110
app/storage/minio_client.py Normal file
View File

@@ -0,0 +1,110 @@
"""Thin wrapper around the MinIO Python SDK with bucket bootstrap and retries."""
from __future__ import annotations
import io
from functools import lru_cache
from pathlib import Path
from typing import Any
from minio import Minio
from minio.error import S3Error
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
from app.config import settings
from app.logging_config import get_logger
logger = get_logger(__name__)
class MinioStorage:
    """Small facade over a ``Minio`` client plus the two configured bucket names.

    Uploads retry on ``S3Error`` (three attempts, exponential back-off between
    one and ten seconds) and re-raise the final error; all other calls go
    straight to the SDK.
    """

    def __init__(self, client: Minio | None = None) -> None:
        """Adopt ``client`` when supplied, otherwise build one from ``settings``."""
        if not client:
            client = Minio(
                endpoint=settings.minio_endpoint,
                access_key=settings.minio_access_key,
                secret_key=settings.minio_secret_key,
                secure=settings.minio_secure,
                region=settings.minio_region,
            )
        self.client = client
        self.originals_bucket = settings.minio_bucket_originals
        self.derived_bucket = settings.minio_bucket_derived

    def ensure_buckets(self) -> None:
        """Create the originals and derived buckets if they do not exist yet."""
        for name in (self.originals_bucket, self.derived_bucket):
            if self.client.bucket_exists(name):
                continue
            logger.info("minio.create_bucket", bucket=name)
            self.client.make_bucket(name)

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=10),
        retry=retry_if_exception_type(S3Error),
        reraise=True,
    )
    def put_file(
        self,
        bucket: str,
        key: str,
        path: Path,
        content_type: str = "application/octet-stream",
        metadata: dict[str, str] | None = None,
    ) -> None:
        """Upload the file at ``path`` to ``bucket``/``key``.

        The file is stat'ed and reopened on every attempt, so a retry always
        streams from the start.
        """
        size = path.stat().st_size
        with path.open("rb") as stream:
            self.client.put_object(
                bucket_name=bucket,
                object_name=key,
                data=stream,
                length=size,
                content_type=content_type,
                metadata=metadata if metadata else {},
            )

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=10),
        retry=retry_if_exception_type(S3Error),
        reraise=True,
    )
    def put_bytes(
        self,
        bucket: str,
        key: str,
        data: bytes,
        content_type: str = "application/octet-stream",
        metadata: dict[str, str] | None = None,
    ) -> None:
        """Upload an in-memory payload to ``bucket``/``key``."""
        stream = io.BytesIO(data)
        self.client.put_object(
            bucket_name=bucket,
            object_name=key,
            data=stream,
            length=len(data),
            content_type=content_type,
            metadata=metadata if metadata else {},
        )

    def get_to_path(self, bucket: str, key: str, dest: Path) -> Path:
        """Download ``bucket``/``key`` to ``dest`` (creating parent dirs) and return ``dest``."""
        dest.parent.mkdir(parents=True, exist_ok=True)
        self.client.fget_object(bucket, key, str(dest))
        return dest

    def exists(self, bucket: str, key: str) -> bool:
        """Tell whether the object exists.

        Only the "no such key/object" error codes map to ``False``; any other
        ``S3Error`` is re-raised.
        """
        try:
            self.client.stat_object(bucket, key)
        except S3Error as exc:
            if exc.code not in {"NoSuchKey", "NoSuchObject"}:
                raise
            return False
        return True

    def health(self) -> dict[str, Any]:
        """Probe the server by listing buckets; never raises.

        Returns ``{"status": "ok", "buckets": [...]}`` on success, otherwise
        ``{"status": "error", "error": "<message>"}``.
        """
        try:
            names = [b.name for b in self.client.list_buckets()]
        except Exception as exc:
            return {"status": "error", "error": str(exc)}
        return {"status": "ok", "buckets": names}
@lru_cache(maxsize=1)
def get_storage() -> MinioStorage:
    """Return the cached ``MinioStorage``, constructing it on the first call."""
    storage = MinioStorage()
    return storage

0
app/utils/__init__.py Normal file
View File

21
app/utils/hashing.py Normal file
View File

@@ -0,0 +1,21 @@
"""Streaming SHA256 hashing utilities for large files."""
from __future__ import annotations
import hashlib
from pathlib import Path
_CHUNK = 1024 * 1024 # 1 MiB
def sha256_file(path: Path | str) -> str:
    """Return the hex SHA-256 of a file, reading it in ``_CHUNK``-sized pieces."""
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            piece = stream.read(_CHUNK)
            if piece == b"":
                break
            digest.update(piece)
    return digest.hexdigest()
def sha256_bytes(data: bytes) -> str:
    """Return the hex SHA-256 digest of an in-memory byte string."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()

24
app/utils/language.py Normal file
View File

@@ -0,0 +1,24 @@
"""Language detection helper - tolerant to short / mixed text."""
from __future__ import annotations
from langdetect import DetectorFactory, LangDetectException, detect_langs
DetectorFactory.seed = 42
def detect_language(text: str, min_chars: int = 40) -> str | None:
    """Return the top-ranked language code for ``text``, or ``None``.

    ``None`` is returned when the stripped text is shorter than ``min_chars``,
    when langdetect raises ``LangDetectException``, or when it yields no
    candidates.
    """
    too_short = not text or len(text.strip()) < min_chars
    if too_short:
        return None
    try:
        candidates = detect_langs(text)
    except LangDetectException:
        return None
    return candidates[0].lang if candidates else None
def has_cyrillic(text: str) -> bool:
    """Report whether any character falls in the U+0400-U+04FF range."""
    for ch in text:
        if "Ѐ" <= ch <= "ӿ":
            return True
    return False

36
app/utils/pdf.py Normal file
View File

@@ -0,0 +1,36 @@
"""PDF inspection helpers - decide whether OCR is required."""
from __future__ import annotations
from pathlib import Path
import pikepdf
from pdfminer.high_level import extract_text
def page_count(path: Path | str) -> int:
    """Return the number of pages pikepdf reports for the PDF at ``path``."""
    with pikepdf.open(str(path)) as document:
        total = len(document.pages)
    return total
def has_searchable_text(path: Path | str, sample_pages: int = 3, min_chars: int = 80) -> bool:
    """Cheaply decide whether the PDF already carries a usable text layer.

    Text is extracted from at most ``sample_pages`` pages and must contain at
    least ``min_chars`` non-padding characters. Any extraction error yields
    ``False`` - running OCR needlessly is safer than skipping it.
    """
    try:
        extracted = extract_text(str(path), maxpages=sample_pages)
    except Exception:
        return False
    stripped = (extracted or "").strip()
    return len(stripped) >= min_chars
def is_pdf(path: Path | str) -> bool:
    """Check that ``path`` is a regular ``.pdf`` file starting with the ``%PDF-`` magic.

    The suffix comparison is case-insensitive. Unreadable files count as
    non-PDF.
    """
    candidate = Path(path)
    if not (candidate.is_file() and candidate.suffix.lower() == ".pdf"):
        return False
    try:
        with open(candidate, "rb") as handle:
            magic = handle.read(5)
    except OSError:
        return False
    return magic == b"%PDF-"

View File

@@ -0,0 +1,69 @@
"""Conservative OCR text cleaning.
Goals:
- Drop hyphenation across line breaks (``инвен-\\nтарный`` -> ``инвентарный``).
- Collapse runs of whitespace.
- Strip control chars.
- Preserve all non-letter characters that may carry meaning in legacy/technical
documents: digits, punctuation, slashes, dashes, dots, parentheses, etc.
We do NOT lowercase, transliterate, or strip punctuation here. ``normalize_for_search``
produces a more aggressive form for indexing, but the original ``text`` is always
kept untouched for citation/display.
"""
from __future__ import annotations
import re
import unicodedata
_CONTROL_CHARS = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")
_SOFT_HYPHEN = "­"
_MULTI_WS = re.compile(r"[ \t ]+")
_MULTI_NL = re.compile(r"\n{3,}")
_HYPHEN_LINEBREAK = re.compile(r"(\w)[-]\n(\w)")
_TRAILING_WS = re.compile(r"[ \t]+\n")
def clean_ocr_text(text: str) -> str:
    """Conservatively clean OCR output without dropping meaningful characters.

    Steps, in order:

    1. NFC-normalize so combining marks merge with their base characters.
    2. Remove soft hyphens and control characters (tab, LF and CR are kept).
    3. Strip spaces/tabs at the end of each line.
    4. Rejoin words hyphenated across a line break.
    5. Collapse horizontal whitespace runs to one space and cap consecutive
       newlines at two.

    Returns the cleaned text with leading/trailing whitespace removed, or
    ``""`` for empty input.
    """
    if not text:
        return ""
    text = unicodedata.normalize("NFC", text)
    text = text.replace(_SOFT_HYPHEN, "")
    text = _CONTROL_CHARS.sub("", text)
    # Trailing blanks must go *before* de-hyphenation: OCR output frequently
    # has "word- " followed by the line break, and the hyphen pattern only
    # matches a hyphen that sits directly against the newline.
    text = _TRAILING_WS.sub("\n", text)
    text = _HYPHEN_LINEBREAK.sub(r"\1\2", text)
    text = _MULTI_WS.sub(" ", text)
    text = _MULTI_NL.sub("\n\n", text)
    return text.strip()
_PUNCT_RUN = re.compile(r"[^\w\s/\-.,№#:()\[\]]+", flags=re.UNICODE)
_WS_RUN = re.compile(r"\s+")
def normalize_for_search(text: str) -> str:
    """Produce the lowercase, lightly normalized form used for full-text indexing.

    The text is first passed through ``clean_ocr_text`` and lowercased. Runs
    of characters outside the kept set (word characters, whitespace, slashes,
    dashes, dots, commas, ``№``, ``#``, colons, parentheses and square
    brackets) become a single space, and all whitespace runs then collapse to
    one space.
    """
    if not text:
        return ""
    lowered = clean_ocr_text(text).lower()
    without_punct = _PUNCT_RUN.sub(" ", lowered)
    return _WS_RUN.sub(" ", without_punct).strip()
def looks_garbled(text: str, threshold: float = 0.35) -> bool:
    """Flag text whose share of unexpected symbols exceeds ``threshold``.

    A character is unexpected when it is neither alphanumeric, nor
    whitespace, nor one of a small set of common punctuation marks. Text
    shorter than 20 characters is never flagged.
    """
    if not text or len(text) < 20:
        return False
    allowed_punct = ".,;:!?-/()[]№#"
    suspicious = 0
    for ch in text:
        if ch.isalnum() or ch.isspace() or ch in allowed_punct:
            continue
        suspicious += 1
    return (suspicious / len(text)) > threshold

0
app/workers/__init__.py Normal file
View File

28
app/workers/celery_app.py Normal file
View File

@@ -0,0 +1,28 @@
"""Celery application instance."""
from __future__ import annotations
from celery import Celery
from app.config import settings
from app.logging_config import configure_logging
configure_logging()
# Celery application; ``settings.redis_url`` is used as both the broker and the
# result backend.
celery_app = Celery(
    "legacyhub",
    broker=settings.redis_url,
    backend=settings.redis_url,
    include=["app.workers.tasks"],  # task module registered with the app
)
celery_app.conf.update(
    # Per the Celery configuration docs: acknowledge a task only after it has
    # run, and reject (rather than ack) it if the worker process is lost.
    task_acks_late=True,
    task_reject_on_worker_lost=True,
    task_track_started=True,
    # Prefetch multiplier of 1 - presumably so long-running documents are not
    # queued up behind one another on a single worker; confirm intent.
    worker_prefetch_multiplier=1,
    # Hard limit is 4x and soft limit 3x the per-document timeout from settings.
    task_time_limit=settings.max_document_timeout_seconds * 4,
    task_soft_time_limit=settings.max_document_timeout_seconds * 3,
    timezone="UTC",
    enable_utc=True,
)

22
app/workers/tasks.py Normal file
View File

@@ -0,0 +1,22 @@
"""Celery tasks - thin wrappers over pipeline functions."""
from __future__ import annotations
import uuid
from celery.utils.log import get_task_logger
from app.workers.celery_app import celery_app
logger = get_task_logger(__name__)
@celery_app.task(name="legacyhub.process_document", bind=True, max_retries=2, default_retry_delay=30)
def process_document(self, document_id: str, run_id: str | None = None) -> dict:
    """Run the ingestion pipeline for one document inside a Celery worker.

    ``document_id`` and the optional ``run_id`` arrive as strings and are
    parsed into UUIDs. Any exception is logged and handed to ``self.retry``;
    the decorator allows up to 2 retries with a 30-second default delay.
    """
    # Imported lazily, as in the rest of the worker wiring, so the pipeline is
    # only loaded when a task actually runs.
    from app.ingestion.pipeline import process_document_id

    try:
        doc_uuid = uuid.UUID(document_id)
        run_uuid = uuid.UUID(run_id) if run_id else None
        return process_document_id(doc_uuid, run_uuid)
    except Exception as exc:  # noqa: BLE001
        logger.exception("worker.process_failed", extra={"document_id": document_id})
        raise self.retry(exc=exc) from exc