chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
3
app/__init__.py
Normal file
3
app/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
"""LegacyHUB - knowledge indexing and hybrid search over legacy PDF archives."""
|
||||
|
||||
__version__ = "0.1.0"
|
||||
0
app/api/__init__.py
Normal file
0
app/api/__init__.py
Normal file
96
app/api/routes_health.py
Normal file
96
app/api/routes_health.py
Normal file
@@ -0,0 +1,96 @@
|
||||
"""Health endpoint - probes Postgres, MinIO, OpenSearch, Qdrant, Redis."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter
|
||||
from sqlalchemy import text
|
||||
|
||||
from app import __version__
|
||||
from app.api.schemas import ComponentHealth, HealthResponse
|
||||
from app.config import settings
|
||||
from app.db.session import get_engine
|
||||
from app.logging_config import get_logger
|
||||
from app.storage.minio_client import get_storage
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
router = APIRouter(tags=["health"])
|
||||
|
||||
|
||||
def _check_postgres() -> ComponentHealth:
    """Probe Postgres by running ``SELECT 1`` on a connection from the engine.

    Returns:
        A ``postgres`` component with status ``ok``, or ``error`` with the
        exception text under ``detail["error"]`` when anything fails.
    """
    try:
        engine = get_engine()
        with engine.connect() as connection:
            connection.execute(text("SELECT 1"))
    except Exception as exc:  # noqa: BLE001
        # A health probe must report failures rather than propagate them.
        failure = {"error": str(exc)}
        return ComponentHealth(name="postgres", status="error", detail=failure)
    return ComponentHealth(name="postgres", status="ok")
|
||||
|
||||
|
||||
def _check_minio() -> ComponentHealth:
    """Probe MinIO through the storage client's own ``health()`` report.

    Returns:
        A ``minio`` component whose status is ``ok`` when the report says so
        and ``error`` otherwise; the report itself is returned as ``detail``.
        If obtaining the client or the report raises, the component is
        ``error`` with the exception text under ``detail["error"]``.
    """
    try:
        info: dict[str, Any] = get_storage().health()
    except Exception as exc:  # noqa: BLE001
        # Same contract as the other probes: a broken dependency is reported
        # as a failed component instead of failing the whole /health request.
        return ComponentHealth(name="minio", status="error", detail={"error": str(exc)})

    status = "ok" if info.get("status") == "ok" else "error"
    return ComponentHealth(name="minio", status=status, detail=info)  # type: ignore[arg-type]
|
||||
|
||||
|
||||
def _check_opensearch() -> ComponentHealth:
    """Probe OpenSearch via the cluster health API.

    Returns:
        An ``opensearch`` component. Cluster status ``green`` or ``yellow``
        maps to ``ok``; any other cluster status maps to ``degraded``. The
        detail carries the raw cluster status and node count. Any exception
        (including a failed import of the client module) yields ``error``.
    """
    try:
        # Imported lazily so a missing/broken client is reported, not fatal.
        from app.indexing.opensearch_client import get_opensearch

        report = get_opensearch().cluster.health()
        cluster_status = report.get("status")
        if cluster_status in ("green", "yellow"):
            status = "ok"
        else:
            status = "degraded"
        detail = {"cluster_status": cluster_status, "nodes": report.get("number_of_nodes")}
        return ComponentHealth(name="opensearch", status=status, detail=detail)  # type: ignore[arg-type]
    except Exception as exc:  # noqa: BLE001
        return ComponentHealth(name="opensearch", status="error", detail={"error": str(exc)})
|
||||
|
||||
|
||||
def _check_qdrant() -> ComponentHealth:
    """Probe Qdrant by listing its collections.

    Returns:
        A ``qdrant`` component with status ``ok`` and the collection names in
        ``detail["collections"]``, or ``error`` with the exception text when
        the import, connection or listing fails.
    """
    try:
        # Imported lazily so a missing/broken client is reported, not fatal.
        from app.indexing.qdrant_client import get_qdrant

        listing = get_qdrant().get_collections()
        names = [collection.name for collection in listing.collections]
        return ComponentHealth(name="qdrant", status="ok", detail={"collections": names})
    except Exception as exc:  # noqa: BLE001
        return ComponentHealth(name="qdrant", status="error", detail={"error": str(exc)})
|
||||
|
||||
|
||||
def _check_redis() -> ComponentHealth:
    """Probe Redis with a ``PING`` against ``settings.redis_url``.

    Returns:
        A ``redis`` component with status ``ok``, or ``error`` with the
        exception text under ``detail["error"]`` when the import, connection
        or ping fails.
    """
    try:
        # Imported lazily so a missing redis package is reported, not fatal.
        import redis

        client = redis.Redis.from_url(
            settings.redis_url,
            socket_connect_timeout=2,
            # Bound the PING itself too; without a read timeout a server that
            # accepts the connection but never answers would hang the probe.
            socket_timeout=2,
        )
        try:
            client.ping()
        finally:
            # A new client (and connection pool) is built on every call, so
            # release it explicitly instead of leaking one per health check.
            client.close()
        return ComponentHealth(name="redis", status="ok")
    except Exception as exc:  # noqa: BLE001
        return ComponentHealth(name="redis", status="error", detail={"error": str(exc)})
|
||||
|
||||
|
||||
@router.get("/health", response_model=HealthResponse)
def health() -> HealthResponse:
    """Run every dependency probe and fold the results into one status.

    The probes run one after another in a fixed order (Postgres, MinIO,
    OpenSearch, Qdrant, Redis). The overall status is ``error`` if any
    component reports ``error``, otherwise ``degraded`` if any reports
    ``degraded``, otherwise ``ok``.
    """
    probes = (
        _check_postgres,
        _check_minio,
        _check_opensearch,
        _check_qdrant,
        _check_redis,
    )
    components = [probe() for probe in probes]

    seen = {component.status for component in components}
    # Worst status wins: error outranks degraded, which outranks ok.
    if "error" in seen:
        overall = "error"
    elif "degraded" in seen:
        overall = "degraded"
    else:
        overall = "ok"
    return HealthResponse(status=overall, version=__version__, components=components)  # type: ignore[arg-type]
|
||||
63
app/api/routes_ingestion.py
Normal file
63
app/api/routes_ingestion.py
Normal file
@@ -0,0 +1,63 @@
|
||||
"""Ingestion endpoints."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
|
||||
from app.api.schemas import IngestFolderRequest, IngestFolderResponse
|
||||
from app.logging_config import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/ingest", tags=["ingestion"])
|
||||
|
||||
|
||||
@router.post("/folder", response_model=IngestFolderResponse)
def ingest_folder(req: IngestFolderRequest) -> IngestFolderResponse:
    """Scan ``req.path`` for documents and enqueue each new one for processing.

    Only the discovery pass runs inside the request; every accepted document
    is handed to the ``process_document`` task via ``.delay`` and the response
    is returned as soon as discovery finishes.

    Raises:
        HTTPException: 400 when ``req.path`` is missing or is not a directory.
    """
    folder = Path(req.path)
    if not (folder.exists() and folder.is_dir()):
        raise HTTPException(status_code=400, detail=f"Folder not found: {req.path}")

    # Deferred imports keep importing this module cheap.
    from app.ingestion.scanner import discover_documents
    from app.workers.tasks import process_document

    run_id = uuid.uuid4()
    discovered = queued = dups = invalid = 0

    records = discover_documents(folder, recursive=req.recursive, force=req.force)
    for discovered, record in enumerate(records, start=1):
        if record.duplicate and not req.force:
            # Already known and the caller did not ask to reprocess.
            dups += 1
        elif not record.document_id:
            # Records without a document id are counted as invalid files.
            invalid += 1
        else:
            process_document.delay(str(record.document_id), str(run_id))
            queued += 1

    logger.info(
        "ingest.folder.queued",
        path=str(folder),
        discovered=discovered,
        queued=queued,
        skipped_duplicates=dups,
        invalid=invalid,
        run_id=str(run_id),
    )

    return IngestFolderResponse(
        run_id=run_id,
        discovered=discovered,
        queued=queued,
        skipped_duplicates=dups,
        invalid_files=invalid,
    )
|
||||
16
app/api/routes_search.py
Normal file
16
app/api/routes_search.py
Normal file
@@ -0,0 +1,16 @@
|
||||
"""Search endpoint - lexical / semantic / hybrid."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter
|
||||
|
||||
from app.api.schemas import SearchRequest, SearchResponse
|
||||
|
||||
router = APIRouter(prefix="/search", tags=["search"])
|
||||
|
||||
|
||||
@router.post("", response_model=SearchResponse)
def search(req: SearchRequest) -> SearchResponse:
    """Delegate the search request to ``run_search`` and return its response."""
    # Deferred import: the search stack is only loaded on first use.
    from app.indexing.hybrid_search import run_search

    response = run_search(req)
    return response
|
||||
99
app/api/schemas.py
Normal file
99
app/api/schemas.py
Normal file
@@ -0,0 +1,99 @@
|
||||
"""Pydantic request/response schemas for the LegacyHUB API."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
# ---------------- Health ----------------
|
||||
|
||||
class ComponentHealth(BaseModel):
    """Health report for a single backing service."""

    # Short identifier of the probed component.
    name: str
    # Probe outcome; restricted to these three values.
    status: Literal["ok", "error", "degraded"]
    # Free-form probe details; defaults to an empty dict per instance.
    detail: dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class HealthResponse(BaseModel):
    """Aggregated health report: overall status plus per-component results."""

    # Overall outcome; restricted to these three values.
    status: Literal["ok", "error", "degraded"]
    # Application version string.
    version: str
    # One entry per probed component.
    components: list[ComponentHealth]
|
||||
|
||||
|
||||
# ---------------- Ingestion ----------------
|
||||
|
||||
class IngestFolderRequest(BaseModel):
    """Request body for a folder ingestion."""

    # Required; no default.
    path: str = Field(..., description="Absolute path inside the API container")
    # Forwarded to document discovery.
    recursive: bool = True
    # Forwarded to document discovery.
    force: bool = False
|
||||
|
||||
|
||||
class IngestFolderResponse(BaseModel):
    """Summary of a folder ingestion: the run id and per-outcome counts."""

    # Identifier of the ingestion run.
    run_id: uuid.UUID
    # Number of discovered records.
    discovered: int
    # Number of documents queued for processing.
    queued: int
    # Number of duplicates that were skipped.
    skipped_duplicates: int
    # Number of files counted as invalid.
    invalid_files: int
|
||||
|
||||
|
||||
class DocumentSummary(BaseModel):
    """Compact description of a document."""

    # Document identifier.
    id: uuid.UUID
    # File name of the original document.
    original_file_name: str
    # Path the document was read from.
    source_path: str
    # SHA-256 digest as a string.
    sha256: str
    # Processing status as a plain string.
    status: str
    # File size in bytes.
    file_size_bytes: int
    # Creation timestamp.
    created_at: datetime
|
||||
|
||||
|
||||
# ---------------- Search ----------------
|
||||
|
||||
SearchMode = Literal["lexical", "semantic", "hybrid"]
|
||||
|
||||
|
||||
class SearchFilters(BaseModel):
    """Optional search filters; every field defaults to ``None``."""

    # Filter by document id.
    document_id: uuid.UUID | None = None
    # Filter by source path.
    source_path: str | None = None
    # Filter by block type.
    block_type: str | None = None
    # OCR confidence bound; validated to lie within [0.0, 1.0].
    min_ocr_confidence: float | None = Field(default=None, ge=0.0, le=1.0)
|
||||
|
||||
|
||||
class SearchRequest(BaseModel):
    """Request body for the search endpoint."""

    # Required, at least one character long.
    query: str = Field(..., min_length=1)
    # Result limit: defaults to 10, validated to lie within 1..100.
    limit: int = Field(default=10, ge=1, le=100)
    # Defaults to a fresh SearchFilters with every field unset.
    filters: SearchFilters = Field(default_factory=SearchFilters)
    # One of the SearchMode literals; hybrid unless specified.
    search_mode: SearchMode = "hybrid"
|
||||
|
||||
|
||||
class Citation(BaseModel):
    """Citation for a search hit: a PDF and page, plus optional element ids."""

    # PDF reference as a string.
    pdf: str
    # Page number.
    page: int
    # Optional block identifier.
    block_id: str | None = None
    # Optional table identifier.
    table_id: str | None = None
    # Optional figure identifier.
    figure_id: str | None = None
|
||||
|
||||
|
||||
class SearchHit(BaseModel):
    """A single search result with its citation."""

    # Rank of the hit.
    rank: int
    # Score of the hit.
    score: float
    # Identifier of the document the hit belongs to.
    document_id: uuid.UUID
    # Identifier of the matching chunk.
    chunk_id: uuid.UUID
    # File name of the original document.
    original_file_name: str
    # Path the document was read from.
    source_path: str
    # Page number of the hit.
    page_number: int
    # Block type as a plain string.
    block_type: str
    # Text of the hit.
    text: str
    # Citation for the hit.
    citation: Citation
    # Free-form quality flags; defaults to an empty dict per instance.
    quality_flags: dict[str, Any] = Field(default_factory=dict)
    # Free-form metadata; defaults to an empty dict per instance.
    metadata: dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class SearchResponse(BaseModel):
    """Response body of the search endpoint."""

    # Query string.
    query: str
    # One of the SearchMode literals.
    mode: SearchMode
    # Candidate count.
    total_candidates: int
    # Reranking flag.
    reranked: bool
    # Search hits.
    results: list[SearchHit]
|
||||
111
app/config.py
Normal file
111
app/config.py
Normal file
@@ -0,0 +1,111 @@
|
||||
"""Centralized typed configuration loaded from environment variables.
|
||||
|
||||
All other modules import :data:`settings` and never touch ``os.environ`` directly.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from functools import lru_cache
|
||||
from typing import Literal
|
||||
|
||||
from pydantic import Field
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Typed application configuration read from the environment and ``.env``.

    Every field is bound to an upper-case environment variable through its
    ``alias``. Variable names are matched case-insensitively and unknown
    variables are ignored. The defaults are the values used when a variable
    is not set.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # ---------------- App ----------------
    app_log_level: str = Field("INFO", alias="APP_LOG_LEVEL")
    app_host: str = Field("0.0.0.0", alias="APP_HOST")
    app_port: int = Field(8000, alias="APP_PORT")
    app_input_dir: str = Field("/data/input", alias="APP_INPUT_DIR")
    app_work_dir: str = Field("/data/work", alias="APP_WORK_DIR")
    app_api_prefix: str = Field("/api/v1", alias="APP_API_PREFIX")

    # ---------------- Postgres ----------------
    postgres_host: str = Field("postgres", alias="POSTGRES_HOST")
    postgres_port: int = Field(5432, alias="POSTGRES_PORT")
    postgres_db: str = Field("legacyhub", alias="POSTGRES_DB")
    postgres_user: str = Field("legacyhub", alias="POSTGRES_USER")
    postgres_password: str = Field("legacyhub", alias="POSTGRES_PASSWORD")

    @property
    def database_url(self) -> str:
        """Return the SQLAlchemy URL for Postgres using the psycopg driver.

        The user name and password are percent-encoded so that characters
        with a meaning inside a URL (``@``, ``/``, ``:``, ``%`` ...) cannot
        corrupt the URL or be misread as part of the host.
        """
        from urllib.parse import quote

        user = quote(self.postgres_user, safe="")
        password = quote(self.postgres_password, safe="")
        return (
            f"postgresql+psycopg://{user}:{password}"
            f"@{self.postgres_host}:{self.postgres_port}/{self.postgres_db}"
        )

    # ---------------- MinIO ----------------
    minio_endpoint: str = Field("minio:9000", alias="MINIO_ENDPOINT")
    minio_access_key: str = Field("legacyhub", alias="MINIO_ACCESS_KEY")
    minio_secret_key: str = Field("legacyhub-secret", alias="MINIO_SECRET_KEY")
    minio_bucket_originals: str = Field("legacyhub-originals", alias="MINIO_BUCKET_ORIGINALS")
    minio_bucket_derived: str = Field("legacyhub-derived", alias="MINIO_BUCKET_DERIVED")
    minio_secure: bool = Field(False, alias="MINIO_SECURE")
    minio_region: str = Field("us-east-1", alias="MINIO_REGION")

    # ---------------- OpenSearch ----------------
    opensearch_host: str = Field("opensearch", alias="OPENSEARCH_HOST")
    opensearch_port: int = Field(9200, alias="OPENSEARCH_PORT")
    opensearch_use_ssl: bool = Field(False, alias="OPENSEARCH_USE_SSL")
    opensearch_verify_certs: bool = Field(False, alias="OPENSEARCH_VERIFY_CERTS")
    opensearch_user: str = Field("", alias="OPENSEARCH_USER")
    opensearch_password: str = Field("", alias="OPENSEARCH_PASSWORD")
    opensearch_index_chunks: str = Field("legacy_chunks", alias="OPENSEARCH_INDEX_CHUNKS")

    # ---------------- Qdrant ----------------
    qdrant_host: str = Field("qdrant", alias="QDRANT_HOST")
    qdrant_port: int = Field(6333, alias="QDRANT_PORT")
    qdrant_api_key: str = Field("", alias="QDRANT_API_KEY")
    qdrant_collection_chunks: str = Field("legacy_chunks", alias="QDRANT_COLLECTION_CHUNKS")

    # ---------------- Redis ----------------
    redis_url: str = Field("redis://redis:6379/0", alias="REDIS_URL")

    # ---------------- OCR ----------------
    ocr_languages: str = Field("rus+eng", alias="OCR_LANGUAGES")
    ocr_enabled: bool = Field(True, alias="OCR_ENABLED")
    docling_ocr_enabled: bool = Field(False, alias="DOCLING_OCR_ENABLED")
    max_document_timeout_seconds: int = Field(180, alias="MAX_DOCUMENT_TIMEOUT_SECONDS")
    ocr_deskew: bool = Field(True, alias="OCR_DESKEW")
    ocr_clean: bool = Field(True, alias="OCR_CLEAN")
    ocr_optimize: int = Field(1, alias="OCR_OPTIMIZE")

    # ---------------- Embeddings / Reranker ----------------
    embedding_model: str = Field("BAAI/bge-m3", alias="EMBEDDING_MODEL")
    embedding_dim: int = Field(1024, alias="EMBEDDING_DIM")
    embedding_device: Literal["cpu", "cuda", "mps"] = Field("cpu", alias="EMBEDDING_DEVICE")
    embedding_batch_size: int = Field(8, alias="EMBEDDING_BATCH_SIZE")
    embedding_normalize: bool = Field(True, alias="EMBEDDING_NORMALIZE")

    reranker_model: str = Field("BAAI/bge-reranker-v2-m3", alias="RERANKER_MODEL")
    reranker_device: Literal["cpu", "cuda", "mps"] = Field("cpu", alias="RERANKER_DEVICE")
    reranker_enabled: bool = Field(True, alias="RERANKER_ENABLED")
    reranker_batch_size: int = Field(8, alias="RERANKER_BATCH_SIZE")

    # ---------------- Chunking ----------------
    chunk_target_tokens: int = Field(700, alias="CHUNK_TARGET_TOKENS")
    chunk_min_tokens: int = Field(120, alias="CHUNK_MIN_TOKENS")
    chunk_max_tokens: int = Field(900, alias="CHUNK_MAX_TOKENS")
    chunk_overlap_tokens: int = Field(100, alias="CHUNK_OVERLAP_TOKENS")

    # ---------------- Hybrid search ----------------
    hybrid_opensearch_top_k: int = Field(50, alias="HYBRID_OPENSEARCH_TOP_K")
    hybrid_qdrant_top_k: int = Field(50, alias="HYBRID_QDRANT_TOP_K")
    hybrid_rrf_k: int = Field(60, alias="HYBRID_RRF_K")
    rerank_candidates: int = Field(40, alias="RERANK_CANDIDATES")
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Build the ``Settings`` object once; later calls return the cached one."""
    loaded = Settings()  # type: ignore[call-arg]
    return loaded
|
||||
|
||||
|
||||
settings = get_settings()
|
||||
3
app/db/__init__.py
Normal file
3
app/db/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from app.db.models import Base
|
||||
|
||||
__all__ = ["Base"]
|
||||
55
app/db/migrations/env.py
Normal file
55
app/db/migrations/env.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""Alembic environment - online & offline migrations using app config."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from logging.config import fileConfig
|
||||
|
||||
from alembic import context
|
||||
from sqlalchemy import engine_from_config, pool
|
||||
|
||||
from app.config import settings
|
||||
from app.db.models import Base
|
||||
|
||||
# Alembic's Config object for the current invocation.
config = context.config

# The connection URL always comes from application settings, not the ini file.
# Config.set_main_option stores the value through ConfigParser, which treats
# "%" as interpolation syntax, so every literal "%" (e.g. from a
# percent-encoded password) must be doubled or the call raises ValueError.
config.set_main_option("sqlalchemy.url", settings.database_url.replace("%", "%%"))

# Apply the logging configuration from the ini file when one is in use.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

# Metadata that migrations and autogenerate compare the database against.
target_metadata = Base.metadata
|
||||
|
||||
|
||||
def run_migrations_offline() -> None:
    """Run migrations in offline mode, configured from a URL only.

    No engine or connection is created here; the context is configured with
    the settings URL and literal binds, then the migrations run inside
    ``context.begin_transaction()``.
    """
    offline_options = {
        "url": settings.database_url,
        "target_metadata": target_metadata,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
        "compare_type": True,
    }
    context.configure(**offline_options)
    with context.begin_transaction():
        context.run_migrations()
|
||||
|
||||
|
||||
def run_migrations_online() -> None:
    """Run migrations in online mode against a live database connection.

    The engine is built from the ini section with the URL overridden by the
    application settings, using ``NullPool`` as the pool class.
    """
    engine_options = config.get_section(config.config_ini_section, {})
    # Settings are the single source of truth for the connection URL.
    engine_options["sqlalchemy.url"] = settings.database_url

    engine = engine_from_config(engine_options, prefix="sqlalchemy.", poolclass=pool.NullPool)
    with engine.connect() as connection:
        context.configure(
            connection=connection,
            target_metadata=target_metadata,
            compare_type=True,
        )
        with context.begin_transaction():
            context.run_migrations()
|
||||
|
||||
|
||||
# Executed when this module is loaded: choose the runner that matches
# Alembic's current mode.
if not context.is_offline_mode():
    run_migrations_online()
else:
    run_migrations_offline()
|
||||
27
app/db/migrations/script.py.mako
Normal file
27
app/db/migrations/script.py.mako
Normal file
@@ -0,0 +1,27 @@
|
||||
"""${message}
|
||||
|
||||
Revision ID: ${up_revision}
|
||||
Revises: ${down_revision | comma,n}
|
||||
Create Date: ${create_date}
|
||||
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
${imports if imports else ""}
|
||||
|
||||
revision: str = ${repr(up_revision)}
|
||||
down_revision: str | None = ${repr(down_revision)}
|
||||
branch_labels: str | Sequence[str] | None = ${repr(branch_labels)}
|
||||
depends_on: str | Sequence[str] | None = ${repr(depends_on)}
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
${upgrades if upgrades else "pass"}
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
${downgrades if downgrades else "pass"}
|
||||
171
app/db/migrations/versions/0001_initial.py
Normal file
171
app/db/migrations/versions/0001_initial.py
Normal file
@@ -0,0 +1,171 @@
|
||||
"""initial schema
|
||||
|
||||
Revision ID: 0001_initial
|
||||
Revises:
|
||||
Create Date: 2026-05-10
|
||||
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
import sqlalchemy as sa
|
||||
from alembic import op
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
revision: str = "0001_initial"
|
||||
down_revision: str | None = None
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the initial schema.

    Tables are created in dependency order: ``documents`` first, then the
    tables that reference it (``document_artifacts``, ``pages``), then the
    tables that also reference ``pages`` (``chunks``, ``tables``,
    ``figures``), and finally the standalone ``ingestion_runs`` and
    ``processing_events`` tables. Each helper below returns a *new* column
    object on every call, since a column object belongs to one table.
    """

    def uuid_col(name: str, *constraints, **options) -> sa.Column:
        # UUID column, optionally carrying constraints such as a foreign key.
        return sa.Column(name, postgresql.UUID(as_uuid=True), *constraints, **options)

    def primary_key() -> sa.Column:
        return uuid_col("id", primary_key=True)

    def document_fk() -> sa.Column:
        # Rows are removed together with their parent document.
        return uuid_col(
            "document_id",
            sa.ForeignKey("documents.id", ondelete="CASCADE"),
            nullable=False,
        )

    def page_fk() -> sa.Column:
        # Deleting a page clears the reference and keeps the row.
        return uuid_col(
            "page_id",
            sa.ForeignKey("pages.id", ondelete="SET NULL"),
            nullable=True,
        )

    def timestamp(name: str) -> sa.Column:
        # Non-null timezone-aware timestamp defaulting to now() on the server.
        return sa.Column(
            name, sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False
        )

    def json_object(name: str) -> sa.Column:
        # Non-null JSONB defaulting to an empty object on the server.
        return sa.Column(
            name, postgresql.JSONB, nullable=False, server_default=sa.text("'{}'::jsonb")
        )

    def flag(name: str) -> sa.Column:
        # Non-null boolean defaulting to false on the server.
        return sa.Column(name, sa.Boolean, nullable=False, server_default=sa.false())

    def counter(name: str) -> sa.Column:
        # Non-null integer defaulting to 0 on the server.
        return sa.Column(name, sa.Integer, nullable=False, server_default="0")

    # ---- documents ----
    op.create_table(
        "documents",
        primary_key(),
        sa.Column("source_path", sa.Text, nullable=False),
        sa.Column("original_file_name", sa.Text, nullable=False),
        sa.Column("sha256", sa.String(64), nullable=False, unique=True),
        sa.Column("file_size_bytes", sa.BigInteger, nullable=False),
        sa.Column("mime_type", sa.Text, nullable=False, server_default="application/pdf"),
        sa.Column("language_hint", sa.Text, nullable=True),
        sa.Column("status", sa.String(64), nullable=False, server_default="DISCOVERED"),
        sa.Column("error_message", sa.Text, nullable=True),
        timestamp("created_at"),
        timestamp("updated_at"),
    )
    op.create_index("ix_documents_status", "documents", ["status"])
    op.create_index("ix_documents_sha256", "documents", ["sha256"])

    # ---- document_artifacts ----
    op.create_table(
        "document_artifacts",
        primary_key(),
        document_fk(),
        sa.Column("artifact_type", sa.String(64), nullable=False),
        sa.Column("storage_bucket", sa.Text, nullable=False),
        sa.Column("storage_key", sa.Text, nullable=False),
        sa.Column("page_number", sa.Integer, nullable=True),
        sa.Column("checksum", sa.String(64), nullable=True),
        timestamp("created_at"),
    )
    op.create_index("ix_artifacts_doc_type", "document_artifacts", ["document_id", "artifact_type"])

    # ---- pages ----
    op.create_table(
        "pages",
        primary_key(),
        document_fk(),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("text", sa.Text, nullable=False, server_default=""),
        sa.Column("ocr_confidence", sa.Float, nullable=True),
        flag("has_tables"),
        flag("has_figures"),
        flag("has_handwriting"),
        timestamp("created_at"),
        sa.UniqueConstraint("document_id", "page_number", name="uq_pages_doc_page"),
    )

    # ---- chunks ----
    op.create_table(
        "chunks",
        primary_key(),
        document_fk(),
        page_fk(),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("block_id", sa.Text, nullable=True),
        sa.Column("chunk_index", sa.Integer, nullable=False),
        sa.Column("block_type", sa.String(32), nullable=False, server_default="paragraph"),
        sa.Column("text", sa.Text, nullable=False),
        sa.Column("normalized_text", sa.Text, nullable=False, server_default=""),
        sa.Column("token_count", sa.Integer, nullable=True),
        sa.Column("ocr_confidence", sa.Float, nullable=True),
        json_object("quality_flags"),
        json_object("metadata"),
        timestamp("created_at"),
        sa.UniqueConstraint("document_id", "chunk_index", name="uq_chunks_doc_idx"),
    )
    op.create_index("ix_chunks_doc_page", "chunks", ["document_id", "page_number"])
    op.create_index("ix_chunks_block_type", "chunks", ["block_type"])

    # ---- tables ----
    op.create_table(
        "tables",
        primary_key(),
        document_fk(),
        page_fk(),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("table_index", sa.Integer, nullable=False),
        sa.Column("markdown", sa.Text, nullable=False, server_default=""),
        sa.Column("csv_text", sa.Text, nullable=True),
        sa.Column("json_data", postgresql.JSONB, nullable=True),
        sa.Column("summary", sa.Text, nullable=True),
        timestamp("created_at"),
        sa.UniqueConstraint("document_id", "table_index", name="uq_tables_doc_idx"),
    )

    # ---- figures ----
    op.create_table(
        "figures",
        primary_key(),
        document_fk(),
        page_fk(),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("figure_index", sa.Integer, nullable=False),
        sa.Column("caption", sa.Text, nullable=True),
        sa.Column("description", sa.Text, nullable=True),
        sa.Column("storage_bucket", sa.Text, nullable=True),
        sa.Column("storage_key", sa.Text, nullable=True),
        timestamp("created_at"),
        sa.UniqueConstraint("document_id", "figure_index", name="uq_figures_doc_idx"),
    )

    # ---- ingestion_runs ----
    op.create_table(
        "ingestion_runs",
        primary_key(),
        timestamp("started_at"),
        sa.Column("finished_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column("status", sa.String(32), nullable=False, server_default="RUNNING"),
        sa.Column("source_folder", sa.Text, nullable=False),
        counter("total_files"),
        counter("processed_files"),
        counter("failed_files"),
        json_object("metadata"),
    )

    # ---- processing_events ----
    # run_id and document_id are plain nullable UUIDs here (no foreign keys).
    op.create_table(
        "processing_events",
        primary_key(),
        uuid_col("run_id", nullable=True),
        uuid_col("document_id", nullable=True),
        sa.Column("stage", sa.String(64), nullable=False),
        sa.Column("level", sa.String(16), nullable=False, server_default="INFO"),
        sa.Column("message", sa.Text, nullable=False),
        json_object("data"),
        timestamp("created_at"),
    )
    op.create_index("ix_events_doc", "processing_events", ["document_id"])
    op.create_index("ix_events_run", "processing_events", ["run_id"])
    op.create_index("ix_events_stage", "processing_events", ["stage"])
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop every index and table of the initial schema.

    Tables are dropped in the reverse of their creation order, so that tables
    holding foreign keys go before the tables they reference. For each table
    its explicitly created indexes are dropped first.
    """
    teardown_plan = (
        ("processing_events", ("ix_events_stage", "ix_events_run", "ix_events_doc")),
        ("ingestion_runs", ()),
        ("figures", ()),
        ("tables", ()),
        ("chunks", ("ix_chunks_block_type", "ix_chunks_doc_page")),
        ("pages", ()),
        ("document_artifacts", ("ix_artifacts_doc_type",)),
        ("documents", ("ix_documents_sha256", "ix_documents_status")),
    )
    for table_name, index_names in teardown_plan:
        for index_name in index_names:
            op.drop_index(index_name, table_name=table_name)
        op.drop_table(table_name)
|
||||
266
app/db/models.py
Normal file
266
app/db/models.py
Normal file
@@ -0,0 +1,266 @@
|
||||
"""SQLAlchemy ORM models for LegacyHUB."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy import (
|
||||
BigInteger,
|
||||
Boolean,
|
||||
DateTime,
|
||||
Float,
|
||||
ForeignKey,
|
||||
Index,
|
||||
Integer,
|
||||
String,
|
||||
Text,
|
||||
UniqueConstraint,
|
||||
func,
|
||||
)
|
||||
from sqlalchemy.dialects.postgresql import JSONB, UUID
|
||||
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
|
||||
|
||||
|
||||
class Base(DeclarativeBase):
    """Declarative base class shared by all ORM models in this module."""
|
||||
|
||||
|
||||
# ---- Status / type literals (kept as plain strings to avoid PG enum churn) ----
|
||||
|
||||
class DocumentStatus:
    """String constants for a document's processing status."""

    # Discovery and storage of the original.
    DISCOVERED = "DISCOVERED"
    STORED_ORIGINAL = "STORED_ORIGINAL"

    # OCR stage.
    OCR_STARTED = "OCR_STARTED"
    OCR_COMPLETED = "OCR_COMPLETED"
    OCR_FAILED = "OCR_FAILED"

    # Extraction stage.
    EXTRACTION_STARTED = "EXTRACTION_STARTED"
    EXTRACTION_COMPLETED = "EXTRACTION_COMPLETED"
    EXTRACTION_FAILED = "EXTRACTION_FAILED"

    # Chunking and indexing.
    CHUNKING_COMPLETED = "CHUNKING_COMPLETED"
    INDEXING_COMPLETED = "INDEXING_COMPLETED"

    # Generic failure.
    FAILED = "FAILED"
|
||||
|
||||
|
||||
class ArtifactType:
    """String constants naming the kinds of stored document artifacts."""

    # Whole-document artifacts.
    ORIGINAL_PDF = "original_pdf"
    OCR_PDF = "ocr_pdf"
    DOCLING_JSON = "docling_json"
    MARKDOWN = "markdown"

    # Page, figure and table artifacts.
    PAGE_IMAGE = "page_image"
    FIGURE_CROP = "figure_crop"
    TABLE_JSON = "table_json"
|
||||
|
||||
|
||||
class BlockType:
    """String constants naming the kinds of content blocks."""

    # Text blocks.
    TITLE = "title"
    HEADING = "heading"
    PARAGRAPH = "paragraph"
    LIST = "list"

    # Tables and figures.
    TABLE = "table"
    FIGURE_CAPTION = "figure_caption"
    FIGURE_DESCRIPTION = "figure_description"

    # Everything else.
    HANDWRITING = "handwriting"
    UNKNOWN = "unknown"
|
||||
|
||||
|
||||
# ---- Tables ----
|
||||
|
||||
class Document(Base):
    """ORM model for the ``documents`` table: one row per document."""

    __tablename__ = "documents"

    # Primary key, generated client-side with uuid4.
    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
    )
    source_path: Mapped[str] = mapped_column(Text, nullable=False)
    original_file_name: Mapped[str] = mapped_column(Text, nullable=False)
    # Unique per row and indexed.
    sha256: Mapped[str] = mapped_column(
        String(64), nullable=False, unique=True, index=True
    )
    file_size_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False)
    mime_type: Mapped[str] = mapped_column(
        Text, nullable=False, default="application/pdf"
    )
    language_hint: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Status string; defaults to DocumentStatus.DISCOVERED and is indexed.
    status: Mapped[str] = mapped_column(
        String(64),
        nullable=False,
        default=DocumentStatus.DISCOVERED,
        index=True,
    )
    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )
    # Set to now() on insert by the server default and again on every update.
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        onupdate=func.now(),
        nullable=False,
    )

    # Child collections; children are deleted with the document or when
    # removed from the collection (delete-orphan).
    artifacts: Mapped[list[DocumentArtifact]] = relationship(
        back_populates="document",
        cascade="all, delete-orphan",
    )
    pages: Mapped[list[Page]] = relationship(
        back_populates="document",
        cascade="all, delete-orphan",
    )
    chunks: Mapped[list[Chunk]] = relationship(
        back_populates="document",
        cascade="all, delete-orphan",
    )
|
||||
|
||||
|
||||
class DocumentArtifact(Base):
    """ORM model for ``document_artifacts``: a stored object of a document."""

    __tablename__ = "document_artifacts"
    __table_args__ = (Index("ix_artifacts_doc_type", "document_id", "artifact_type"),)

    # Primary key, generated client-side with uuid4.
    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
    )
    # Owning document; the foreign key cascades on delete.
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    artifact_type: Mapped[str] = mapped_column(String(64), nullable=False)
    # Storage location: bucket and key.
    storage_bucket: Mapped[str] = mapped_column(Text, nullable=False)
    storage_key: Mapped[str] = mapped_column(Text, nullable=False)
    page_number: Mapped[int | None] = mapped_column(Integer, nullable=True)
    checksum: Mapped[str | None] = mapped_column(String(64), nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )

    document: Mapped[Document] = relationship(back_populates="artifacts")
|
||||
|
||||
|
||||
class Page(Base):
    """One page of a document: extracted text plus per-page content flags."""

    __tablename__ = "pages"
    # A page number may appear only once per document.
    __table_args__ = (UniqueConstraint("document_id", "page_number", name="uq_pages_doc_page"),)

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    text: Mapped[str] = mapped_column(Text, nullable=False, default="")
    ocr_confidence: Mapped[float | None] = mapped_column(Float, nullable=True)

    # Content flags, all False unless set explicitly.
    has_tables: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    has_figures: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    has_handwriting: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)

    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )

    document: Mapped[Document] = relationship(back_populates="pages")
    chunks: Mapped[list[Chunk]] = relationship(back_populates="page")
|
||||
|
||||
|
||||
class Chunk(Base):
    """Indexable text block of a document, addressed by ``(document_id, chunk_index)``."""

    __tablename__ = "chunks"
    __table_args__ = (
        UniqueConstraint("document_id", "chunk_index", name="uq_chunks_doc_idx"),
        Index("ix_chunks_doc_page", "document_id", "page_number"),
        Index("ix_chunks_block_type", "block_type"),
    )

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    # ON DELETE SET NULL: a chunk outlives its page row; page_number below stays filled.
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("pages.id", ondelete="SET NULL"),
        nullable=True,
    )
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    block_id: Mapped[str | None] = mapped_column(Text, nullable=True)
    chunk_index: Mapped[int] = mapped_column(Integer, nullable=False)
    block_type: Mapped[str] = mapped_column(
        String(32), nullable=False, default=BlockType.PARAGRAPH
    )

    text: Mapped[str] = mapped_column(Text, nullable=False)
    normalized_text: Mapped[str] = mapped_column(Text, nullable=False, default="")
    token_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    ocr_confidence: Mapped[float | None] = mapped_column(Float, nullable=True)

    quality_flags: Mapped[dict[str, Any]] = mapped_column(JSONB, nullable=False, default=dict)
    # The column is called "metadata"; the attribute cannot be, because
    # ``metadata`` is reserved on SQLAlchemy declarative classes.
    chunk_metadata: Mapped[dict[str, Any]] = mapped_column(
        "metadata", JSONB, nullable=False, default=dict
    )

    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )

    document: Mapped[Document] = relationship(back_populates="chunks")
    page: Mapped[Page | None] = relationship(back_populates="chunks")
|
||||
|
||||
|
||||
class Table(Base):
    """Table extracted from a document, kept as markdown with optional CSV/JSON forms."""

    __tablename__ = "tables"
    # table_index is unique within a document.
    __table_args__ = (UniqueConstraint("document_id", "table_index", name="uq_tables_doc_idx"),)

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("pages.id", ondelete="SET NULL"),
        nullable=True,
    )
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    table_index: Mapped[int] = mapped_column(Integer, nullable=False)

    # Only the markdown form is mandatory (defaults to an empty string).
    markdown: Mapped[str] = mapped_column(Text, nullable=False, default="")
    csv_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    json_data: Mapped[dict[str, Any] | None] = mapped_column(JSONB, nullable=True)
    summary: Mapped[str | None] = mapped_column(Text, nullable=True)

    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )
|
||||
|
||||
|
||||
class Figure(Base):
    """Figure found in a document, with optional caption, description and stored image."""

    __tablename__ = "figures"
    # figure_index is unique within a document.
    __table_args__ = (UniqueConstraint("document_id", "figure_index", name="uq_figures_doc_idx"),)

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("pages.id", ondelete="SET NULL"),
        nullable=True,
    )
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    figure_index: Mapped[int] = mapped_column(Integer, nullable=False)

    caption: Mapped[str | None] = mapped_column(Text, nullable=True)
    description: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Object-store location; both parts are nullable.
    storage_bucket: Mapped[str | None] = mapped_column(Text, nullable=True)
    storage_key: Mapped[str | None] = mapped_column(Text, nullable=True)

    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )
|
||||
|
||||
|
||||
class IngestionRun(Base):
    """Bookkeeping row for one ingestion pass over a source folder."""

    __tablename__ = "ingestion_runs"

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    started_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )
    # NULL until a finish time is recorded.
    finished_at: Mapped[datetime | None] = mapped_column(
        DateTime(timezone=True), nullable=True
    )
    status: Mapped[str] = mapped_column(String(32), nullable=False, default="RUNNING")
    source_folder: Mapped[str] = mapped_column(Text, nullable=False)

    # File counters, all starting at zero.
    total_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    processed_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    failed_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0)

    # The column is called "metadata"; the attribute cannot be, because
    # ``metadata`` is reserved on SQLAlchemy declarative classes.
    run_metadata: Mapped[dict[str, Any]] = mapped_column(
        "metadata", JSONB, nullable=False, default=dict
    )
|
||||
|
||||
|
||||
class ProcessingEvent(Base):
    """Log-style event emitted by a processing stage, optionally tied to a run/document."""

    __tablename__ = "processing_events"
    __table_args__ = (
        Index("ix_events_doc", "document_id"),
        Index("ix_events_run", "run_id"),
        Index("ix_events_stage", "stage"),
    )

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    # Plain UUID columns: no foreign key is declared for either reference.
    run_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), nullable=True)
    document_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), nullable=True)

    stage: Mapped[str] = mapped_column(String(64), nullable=False)
    level: Mapped[str] = mapped_column(String(16), nullable=False, default="INFO")
    message: Mapped[str] = mapped_column(Text, nullable=False)
    data: Mapped[dict[str, Any]] = mapped_column(JSONB, nullable=False, default=dict)

    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )
|
||||
66
app/db/session.py
Normal file
66
app/db/session.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""SQLAlchemy engine and session factory."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Iterator
|
||||
from contextlib import contextmanager
|
||||
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.engine import Engine
|
||||
from sqlalchemy.orm import Session, sessionmaker
|
||||
|
||||
from app.config import settings
|
||||
|
||||
_engine: Engine | None = None
|
||||
_SessionFactory: sessionmaker[Session] | None = None
|
||||
|
||||
|
||||
def get_engine() -> Engine:
    """Return the module-wide engine, building it from ``settings.database_url`` on first use."""
    global _engine
    if _engine is not None:
        return _engine

    pool_options = {"pool_pre_ping": True, "pool_size": 10, "max_overflow": 20}
    _engine = create_engine(settings.database_url, future=True, **pool_options)
    return _engine
|
||||
|
||||
|
||||
def get_session_factory() -> sessionmaker[Session]:
    """Return the module-wide ``sessionmaker``, creating it (bound to the engine) on first use."""
    global _SessionFactory
    if _SessionFactory is not None:
        return _SessionFactory

    _SessionFactory = sessionmaker(
        bind=get_engine(),
        future=True,
        # Explicit transaction control; objects stay usable after commit.
        autoflush=False,
        autocommit=False,
        expire_on_commit=False,
    )
    return _SessionFactory
|
||||
|
||||
|
||||
@contextmanager
def session_scope() -> Iterator[Session]:
    """Yield a session inside a transaction.

    The transaction is committed when the ``with`` body finishes normally; any
    exception (including one raised by the commit itself) triggers a rollback
    and is re-raised. The session is closed in every case.
    """
    db = get_session_factory()()
    try:
        yield db
        # Commit stays inside the try so a failing commit is rolled back too.
        db.commit()
    except Exception:
        db.rollback()
        raise
    finally:
        db.close()
|
||||
|
||||
|
||||
def get_db() -> Iterator[Session]:
    """FastAPI dependency: yield a session and close it afterwards.

    No commit or rollback is issued here; that is left to the caller.
    """
    db = get_session_factory()()
    try:
        yield db
    finally:
        db.close()
|
||||
0
app/indexing/__init__.py
Normal file
0
app/indexing/__init__.py
Normal file
90
app/indexing/embeddings.py
Normal file
90
app/indexing/embeddings.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""BGE-M3 dense embedder with batching and CPU/GPU support.
|
||||
|
||||
We prefer FlagEmbedding's ``BGEM3FlagModel`` because it is the canonical
|
||||
implementation and supports dense + sparse output. We fall back to
|
||||
``sentence-transformers`` for portability.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from functools import lru_cache
|
||||
from typing import Sequence
|
||||
|
||||
import numpy as np
|
||||
|
||||
from app.config import settings
|
||||
from app.logging_config import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class Embedder:
    """Dense text embedder: FlagEmbedding when loadable, sentence-transformers otherwise."""

    def __init__(self, model_name: str, device: str, normalize: bool, batch_size: int) -> None:
        """Store the configuration and load the model immediately."""
        self.model_name = model_name
        self.device = device
        self.normalize = normalize
        self.batch_size = batch_size
        self._impl = "flagembedding"
        self._model = None
        self._st_model = None
        self._load()

    def _load(self) -> None:
        """Try the FlagEmbedding backend first; on any failure fall back to sentence-transformers."""
        try:
            from FlagEmbedding import BGEM3FlagModel  # type: ignore

            # fp16 is requested for every device except "cpu".
            self._model = BGEM3FlagModel(
                self.model_name,
                use_fp16=self.device != "cpu",
                devices=self.device,
            )
            self._impl = "flagembedding"
            logger.info("embedder.loaded", impl="flagembedding", model=self.model_name, device=self.device)
            return
        except Exception as exc:  # noqa: BLE001
            logger.warning("embedder.flagembedding_failed", error=str(exc))

        from sentence_transformers import SentenceTransformer

        self._st_model = SentenceTransformer(self.model_name, device=self.device)
        self._impl = "sentence-transformers"
        logger.info("embedder.loaded", impl="sentence-transformers", model=self.model_name, device=self.device)

    def encode(self, texts: Sequence[str]) -> list[list[float]]:
        """Embed ``texts`` and return one float32-derived vector (as a list) per input.

        An empty input returns ``[]`` without touching the model. When
        ``normalize`` is set, vectors are scaled to unit L2 norm (zero vectors
        are left as zeros).
        """
        if not texts:
            return []

        batch = list(texts)

        if self._impl != "flagembedding":
            # sentence-transformers normalises internally when asked to.
            vectors = self._st_model.encode(  # type: ignore[union-attr]
                batch,
                batch_size=self.batch_size,
                normalize_embeddings=self.normalize,
                convert_to_numpy=True,
                show_progress_bar=False,
            )
            return vectors.astype(np.float32).tolist()

        raw = self._model.encode(  # type: ignore[union-attr]
            batch,
            batch_size=self.batch_size,
            max_length=8192,
            return_dense=True,
            return_sparse=False,
            return_colbert_vecs=False,
        )
        dense = raw["dense_vecs"] if isinstance(raw, dict) else raw
        vectors = np.asarray(dense, dtype=np.float32)

        if self.normalize:
            lengths = np.linalg.norm(vectors, axis=1, keepdims=True)
            lengths[lengths == 0] = 1.0  # avoid dividing zero vectors by zero
            vectors = vectors / lengths

        return vectors.tolist()

    def encode_one(self, text: str) -> list[float]:
        """Embed a single string and return its vector."""
        (vector,) = self.encode([text])
        return vector
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_embedder() -> Embedder:
    """Return the cached ``Embedder`` built from the ``embedding_*`` settings."""
    embedder = Embedder(
        settings.embedding_model,
        settings.embedding_device,
        settings.embedding_normalize,
        settings.embedding_batch_size,
    )
    return embedder
|
||||
327
app/indexing/hybrid_search.py
Normal file
327
app/indexing/hybrid_search.py
Normal file
@@ -0,0 +1,327 @@
|
||||
"""Hybrid search: lexical (OpenSearch BM25) + semantic (Qdrant) + RRF + reranker.
|
||||
|
||||
Always returns ``SearchResponse`` (never throws on missing index/collection -
|
||||
empty results are valid).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
from qdrant_client.http import models as qm
|
||||
|
||||
from app.api.schemas import (
|
||||
Citation,
|
||||
SearchFilters,
|
||||
SearchHit,
|
||||
SearchMode,
|
||||
SearchRequest,
|
||||
SearchResponse,
|
||||
)
|
||||
from app.config import settings
|
||||
from app.indexing.embeddings import get_embedder
|
||||
from app.indexing.opensearch_client import get_opensearch
|
||||
from app.indexing.qdrant_client import DENSE_VECTOR_NAME, get_qdrant
|
||||
from app.indexing.reranker import get_reranker
|
||||
from app.logging_config import get_logger
|
||||
from app.utils.text_cleaning import normalize_for_search
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
class _Candidate:
    """Internal search hit carrying chunk fields plus per-backend score and rank."""

    # Identity and location of the chunk.
    chunk_id: str
    document_id: str
    page_number: int
    block_type: str
    block_id: str | None
    # Content and provenance.
    text: str
    source_path: str
    original_file_name: str
    quality_flags: dict[str, Any]
    metadata: dict[str, Any]
    # Lexical (BM25) result; None when the chunk was not returned by that backend.
    bm25_score: float | None = None
    bm25_rank: int | None = None
    # Dense-vector result; None when the chunk was not returned by that backend.
    dense_score: float | None = None
    dense_rank: int | None = None
|
||||
|
||||
|
||||
def run_search(req: SearchRequest) -> SearchResponse:
    """Run a lexical, semantic or hybrid search and build the API response.

    Every stage is best-effort: a failing retrieval backend contributes no
    candidates, a failing reranker leaves the merged order untouched, and a
    candidate whose ids are not valid UUIDs is skipped instead of failing the
    whole request. Each of these cases is logged as a warning.

    Args:
        req: Search request; ``query``, ``search_mode``, ``filters`` and
            ``limit`` are read.

    Returns:
        ``SearchResponse`` where ``total_candidates`` is the size of the merged
        list before the rerank cut-off and ``reranked`` is True only when
        reranker scores were actually applied.
    """
    mode: SearchMode = req.search_mode
    filters = req.filters

    lexical: list[_Candidate] = []
    semantic: list[_Candidate] = []

    if mode in ("lexical", "hybrid"):
        try:
            lexical = _lexical_search(req.query, filters, settings.hybrid_opensearch_top_k)
        except Exception as exc:  # noqa: BLE001
            logger.warning("search.lexical_failed", error=str(exc))

    if mode in ("semantic", "hybrid"):
        try:
            semantic = _semantic_search(req.query, filters, settings.hybrid_qdrant_top_k)
        except Exception as exc:  # noqa: BLE001
            logger.warning("search.semantic_failed", error=str(exc))

    merged = _merge(lexical, semantic, mode)
    candidates = merged[: settings.rerank_candidates]

    # Reranker scores are kept in their own list so the retrieval scores on the
    # candidates are never overwritten and a mid-way failure leaves no partial state.
    rerank_scores: list[float] = []
    reranked_flag = False
    reranker = get_reranker()
    if settings.reranker_enabled and reranker.available and candidates:
        try:
            raw_scores = list(reranker.score(req.query, [c.text for c in candidates]))
            if len(raw_scores) != len(candidates):
                raise ValueError(
                    f"reranker returned {len(raw_scores)} scores for {len(candidates)} candidates"
                )
            rerank_scores = [float(s or 0.0) for s in raw_scores]
            reranked_flag = True
        except Exception as exc:  # noqa: BLE001
            logger.warning("search.rerank_failed", error=str(exc))

    ranked: list[tuple[_Candidate, float]]
    if reranked_flag:
        # Stable sort: candidates with equal scores keep their merged order.
        order = sorted(range(len(candidates)), key=rerank_scores.__getitem__, reverse=True)
        ranked = [(candidates[i], rerank_scores[i]) for i in order]
    else:
        # Without reranking the reported score is the dense score in semantic
        # mode and the BM25 score otherwise (0.0 when that score is missing).
        ranked = [
            (c, float((c.dense_score if mode == "semantic" else c.bm25_score) or 0.0))
            for c in candidates
        ]

    hits: list[SearchHit] = []
    for cand, score in ranked[: req.limit]:
        try:
            document_uuid = uuid.UUID(cand.document_id)
            chunk_uuid = uuid.UUID(cand.chunk_id)
        except (ValueError, AttributeError, TypeError):
            # e.g. a vector payload without document_id yields an empty string.
            logger.warning(
                "search.hit_skipped_invalid_id",
                chunk_id=cand.chunk_id,
                document_id=cand.document_id,
            )
            continue

        table_index = cand.metadata.get("table_index")
        figure_index = cand.metadata.get("figure_index")
        hits.append(
            SearchHit(
                rank=len(hits) + 1,
                score=score,
                document_id=document_uuid,
                chunk_id=chunk_uuid,
                original_file_name=cand.original_file_name,
                source_path=cand.source_path,
                page_number=cand.page_number,
                block_type=cand.block_type,
                text=cand.text,
                citation=Citation(
                    pdf=cand.original_file_name,
                    page=cand.page_number,
                    block_id=cand.block_id,
                    table_id=str(table_index) if table_index is not None else None,
                    figure_id=str(figure_index) if figure_index is not None else None,
                ),
                quality_flags=cand.quality_flags,
                metadata=cand.metadata,
            )
        )

    return SearchResponse(
        query=req.query,
        mode=mode,
        total_candidates=len(merged),
        reranked=reranked_flag,
        results=hits,
    )
|
||||
|
||||
|
||||
# ---------------- lexical ----------------
|
||||
|
||||
def _lexical_search(query: str, filters: SearchFilters, top_k: int) -> list[_Candidate]:
    """Query the OpenSearch chunk index and return up to ``top_k`` candidates.

    Returns ``[]`` when the index does not exist. Candidates carry the
    OpenSearch ``_score`` as ``bm25_score`` and their 1-based position as
    ``bm25_rank``.
    """
    os_client = get_opensearch()
    if not os_client.indices.exists(index=settings.opensearch_index_chunks):
        return []

    must_clauses = [
        {
            "multi_match": {
                "query": query,
                "fields": ["text^1.0", "text.ru^1.5", "text.en^1.5", "normalized_text^0.7"],
                "type": "best_fields",
                "operator": "or",
            }
        }
    ]
    # Add a low-boost match on the normalised form only when it differs from
    # the plain lower-cased query.
    normalized_query = normalize_for_search(query)
    if normalized_query and normalized_query != query.lower():
        must_clauses.append(
            {"match": {"normalized_text": {"query": normalized_query, "boost": 0.5}}}
        )

    returned_fields = [
        "chunk_id",
        "document_id",
        "source_path",
        "original_file_name",
        "page_number",
        "block_type",
        "block_id",
        "text",
        "quality_flags",
        "metadata",
    ]
    request_body = {
        "size": top_k,
        "query": {"bool": {"must": must_clauses, "filter": _opensearch_filters(filters)}},
        "_source": returned_fields,
    }
    response = os_client.search(
        index=settings.opensearch_index_chunks, body=request_body, request_timeout=30
    )

    found: list[_Candidate] = []
    raw_hits = response.get("hits", {}).get("hits", [])
    for position, raw in enumerate(raw_hits, start=1):
        src = raw.get("_source", {})
        candidate = _Candidate(
            chunk_id=src["chunk_id"],
            document_id=src["document_id"],
            page_number=int(src.get("page_number", 0)),
            block_type=src.get("block_type", "paragraph"),
            block_id=src.get("block_id"),
            text=src.get("text", ""),
            source_path=src.get("source_path", ""),
            original_file_name=src.get("original_file_name", ""),
            quality_flags=src.get("quality_flags") or {},
            metadata=src.get("metadata") or {},
            bm25_score=float(raw.get("_score") or 0.0),
            bm25_rank=position,
        )
        found.append(candidate)
    return found
|
||||
|
||||
|
||||
def _opensearch_filters(filters: SearchFilters) -> list[dict[str, Any]]:
    """Translate the set search filters into OpenSearch ``bool.filter`` clauses.

    Falsy ``document_id`` / ``source_path`` / ``block_type`` values are skipped;
    ``min_ocr_confidence`` is applied whenever it is not ``None`` (so 0 counts).
    """
    exact_matches = {
        "document_id": str(filters.document_id) if filters.document_id else None,
        "source_path": filters.source_path,
        "block_type": filters.block_type,
    }
    clauses: list[dict[str, Any]] = [
        {"term": {field: wanted}} for field, wanted in exact_matches.items() if wanted
    ]

    threshold = filters.min_ocr_confidence
    if threshold is not None:
        clauses.append({"range": {"ocr_confidence": {"gte": threshold}}})
    return clauses
|
||||
|
||||
|
||||
# ---------------- semantic ----------------
|
||||
|
||||
def _semantic_search(query: str, filters: SearchFilters, top_k: int) -> list[_Candidate]:
    """Embed ``query`` and return up to ``top_k`` nearest chunks from Qdrant.

    ``query_points`` is tried first; if it raises, the call is retried through
    ``client.search`` with the same arguments. Candidate text comes from the
    ``text_preview`` payload field.
    """
    query_vector = get_embedder().encode_one(query)
    payload_filter = _qdrant_filter(filters)
    qdrant = get_qdrant()
    collection = settings.qdrant_collection_chunks

    try:
        points = qdrant.query_points(
            collection_name=collection,
            query=query_vector,
            using=DENSE_VECTOR_NAME,
            limit=top_k,
            with_payload=True,
            query_filter=payload_filter,
        ).points
    except Exception as exc:  # noqa: BLE001
        logger.debug("qdrant.query_points_fallback", error=str(exc))
        points = qdrant.search(
            collection_name=collection,
            query_vector=(DENSE_VECTOR_NAME, query_vector),
            query_filter=payload_filter,
            limit=top_k,
            with_payload=True,
        )

    found: list[_Candidate] = []
    for position, point in enumerate(points, start=1):
        data = point.payload or {}
        # Fall back to the point id when the payload carries no chunk_id.
        identifier = data.get("chunk_id") or str(point.id)
        candidate = _Candidate(
            chunk_id=str(identifier),
            document_id=str(data.get("document_id", "")),
            page_number=int(data.get("page_number") or 0),
            block_type=data.get("block_type", "paragraph"),
            block_id=data.get("block_id"),
            text=data.get("text_preview", ""),
            source_path=data.get("source_path", ""),
            original_file_name=data.get("original_file_name", ""),
            quality_flags=data.get("quality_flags") or {},
            metadata=data.get("metadata") or {},
            dense_score=float(point.score or 0.0),
            dense_rank=position,
        )
        found.append(candidate)
    return found
|
||||
|
||||
|
||||
def _qdrant_filter(filters: SearchFilters) -> qm.Filter | None:
    """Translate the set search filters into a Qdrant ``must`` filter, or ``None`` if none apply."""
    exact_matches = (
        ("document_id", str(filters.document_id) if filters.document_id else None),
        ("source_path", filters.source_path),
        ("block_type", filters.block_type),
    )
    conditions: list[qm.FieldCondition | qm.Range] = [
        qm.FieldCondition(key=field, match=qm.MatchValue(value=wanted))
        for field, wanted in exact_matches
        if wanted
    ]

    threshold = filters.min_ocr_confidence
    if threshold is not None:
        conditions.append(qm.FieldCondition(key="ocr_confidence", range=qm.Range(gte=threshold)))

    return qm.Filter(must=conditions) if conditions else None
|
||||
|
||||
|
||||
# ---------------- merge ----------------
|
||||
|
||||
def _merge(lexical: list[_Candidate], semantic: list[_Candidate], mode: SearchMode) -> list[_Candidate]:
    """Combine both result lists according to ``mode``.

    ``lexical`` mode returns the lexical list as-is, ``semantic`` mode returns
    the hydrated semantic list. Otherwise candidates are de-duplicated by
    ``chunk_id`` (a lexical entry absorbs the dense score/rank of its semantic
    twin), ordered by reciprocal-rank fusion and hydrated.
    """
    if mode == "lexical":
        return lexical
    if mode == "semantic":
        return _hydrate_semantic_text(semantic)

    pool: dict[str, _Candidate] = {c.chunk_id: c for c in lexical}
    for sem in semantic:
        twin = pool.get(sem.chunk_id)
        if twin is None:
            pool[sem.chunk_id] = sem
            continue
        twin.dense_score = sem.dense_score
        twin.dense_rank = sem.dense_rank
        if not twin.text:
            twin.text = sem.text

    # Reciprocal-rank fusion: each list contributes 1 / (k + rank).
    rrf_k = settings.hybrid_rrf_k
    fused: dict[str, float] = defaultdict(float)
    for lex in lexical:
        if lex.bm25_rank is not None:
            fused[lex.chunk_id] += 1.0 / (rrf_k + lex.bm25_rank)
    for sem in semantic:
        if sem.dense_rank is not None:
            fused[sem.chunk_id] += 1.0 / (rrf_k + sem.dense_rank)

    ordered = list(pool.values())
    ordered.sort(key=lambda c: fused.get(c.chunk_id, 0.0), reverse=True)
    return _hydrate_full_text(ordered)
|
||||
|
||||
|
||||
def _hydrate_full_text(candidates: list[_Candidate]) -> list[_Candidate]:
    """Fill in full chunk text (and missing display fields) from OpenSearch.

    Every candidate whose text is at most 512 characters is looked up by chunk
    id in a single ``mget``; such text may be a preview taken from the Qdrant
    payload, and the reranker should see the full content. The lookup is
    best-effort: if OpenSearch cannot be reached the candidates are returned
    unchanged and a warning is logged.

    Args:
        candidates: Candidates to hydrate; updated in place.

    Returns:
        The same list object that was passed in.
    """
    short = [c for c in candidates if len(c.text) <= 512]
    if not short:
        return candidates

    try:
        res = get_opensearch().mget(
            index=settings.opensearch_index_chunks,
            body={"ids": [c.chunk_id for c in short]},
        )
    except Exception as exc:  # noqa: BLE001
        # Hydration only improves result quality; never fail the search over it.
        logger.warning("search.hydrate_failed", error=str(exc), count=len(short))
        return candidates

    sources = {d["_id"]: d.get("_source", {}) for d in res.get("docs", []) if d.get("found")}
    for c in short:
        s = sources.get(c.chunk_id)
        if not s:
            continue
        if s.get("text"):
            c.text = s["text"]
        # Only fill fields the candidate does not already have.
        if not c.original_file_name:
            c.original_file_name = s.get("original_file_name", "")
        if not c.source_path:
            c.source_path = s.get("source_path", "")
        if not c.metadata:
            c.metadata = s.get("metadata") or {}
        if not c.quality_flags:
            c.quality_flags = s.get("quality_flags") or {}
    return candidates
|
||||
|
||||
|
||||
def _hydrate_semantic_text(candidates: list[_Candidate]) -> list[_Candidate]:
    """Hydrate semantic-only results; currently a plain alias for ``_hydrate_full_text``."""
    hydrated = _hydrate_full_text(candidates)
    return hydrated
|
||||
142
app/indexing/opensearch_client.py
Normal file
142
app/indexing/opensearch_client.py
Normal file
@@ -0,0 +1,142 @@
|
||||
"""OpenSearch client + index bootstrap + chunk indexing helpers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from functools import lru_cache
|
||||
from typing import Any, Iterable
|
||||
|
||||
from opensearchpy import OpenSearch, RequestsHttpConnection
|
||||
from opensearchpy.helpers import bulk
|
||||
|
||||
from app.config import settings
|
||||
from app.logging_config import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
# Index settings: three custom analyzers (ru_analyzer, en_analyzer, code_analyzer).
# ``text`` is analysed with code_analyzer and has two sub-fields (.ru, .en) so
# each language can be boosted separately at query time.
INDEX_SETTINGS: dict[str, Any] = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": {
            "filter": {
                "ru_stop": {"type": "stop", "stopwords": "_russian_"},
                "ru_stemmer": {"type": "stemmer", "language": "russian"},
                "en_stop": {"type": "stop", "stopwords": "_english_"},
                "en_stemmer": {"type": "stemmer", "language": "english"},
            },
            "analyzer": {
                "ru_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "ru_stop", "ru_stemmer"],
                },
                "en_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "en_stop", "en_stemmer"],
                },
                # Lower-casing only: no stop-word removal, no stemming.
                "code_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase"],
                },
            },
        },
    },
    "mappings": {
        # Unknown top-level fields are rejected.
        "dynamic": "strict",
        "properties": {
            "chunk_id": {"type": "keyword"},
            "document_id": {"type": "keyword"},
            "source_path": {"type": "keyword"},
            "original_file_name": {
                "type": "text",
                "fields": {"keyword": {"type": "keyword", "ignore_above": 512}},
            },
            "page_number": {"type": "integer"},
            "block_type": {"type": "keyword"},
            "block_id": {"type": "keyword"},
            "text": {
                "type": "text",
                "analyzer": "code_analyzer",
                "fields": {
                    "ru": {"type": "text", "analyzer": "ru_analyzer"},
                    "en": {"type": "text", "analyzer": "en_analyzer"},
                },
            },
            "normalized_text": {
                "type": "text",
                "analyzer": "code_analyzer",
            },
            "ocr_confidence": {"type": "float"},
            "language_hint": {"type": "keyword"},
            # Inner objects inherit ``dynamic`` from the root. These two are
            # free-form dicts with no declared properties, so without an explicit
            # override every document carrying a key in them would be rejected
            # with a strict_dynamic_mapping_exception.
            "metadata": {"type": "object", "enabled": True, "dynamic": True},
            "quality_flags": {"type": "object", "enabled": True, "dynamic": True},
            "created_at": {"type": "date"},
        },
    },
}
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_opensearch() -> OpenSearch:
    """Return the cached OpenSearch client configured from the ``opensearch_*`` settings.

    Basic auth is sent only when both user and password are configured.
    """
    has_credentials = settings.opensearch_user and settings.opensearch_password
    credentials = (
        (settings.opensearch_user, settings.opensearch_password) if has_credentials else None
    )
    node = {"host": settings.opensearch_host, "port": settings.opensearch_port}
    return OpenSearch(
        hosts=[node],
        http_auth=credentials,
        use_ssl=settings.opensearch_use_ssl,
        verify_certs=settings.opensearch_verify_certs,
        ssl_show_warn=False,
        connection_class=RequestsHttpConnection,
        timeout=30,
        max_retries=3,
        retry_on_timeout=True,
    )
|
||||
|
||||
|
||||
def ensure_index(index: str | None = None) -> None:
    """Create the chunk index with ``INDEX_SETTINGS`` unless it already exists.

    ``index`` defaults to ``settings.opensearch_index_chunks``.
    """
    target = index or settings.opensearch_index_chunks
    os_client = get_opensearch()

    already_there = os_client.indices.exists(index=target)
    if not already_there:
        logger.info("opensearch.index.create", index=target)
        os_client.indices.create(index=target, body=INDEX_SETTINGS)
        return

    logger.debug("opensearch.index.exists", index=target)
|
||||
|
||||
|
||||
def index_chunks(docs: Iterable[dict[str, Any]], index: str | None = None) -> tuple[int, int]:
    """Bulk-index chunk documents, replacing any existing document with the same id.

    Args:
        docs: Chunk documents; each must contain ``chunk_id``, which is used as
            the OpenSearch ``_id``.
        index: Target index; defaults to ``settings.opensearch_index_chunks``.

    Returns:
        ``(success, errors)``: the number of documents indexed and the number of
        per-document failures. Failures are logged, not raised.
    """
    name = index or settings.opensearch_index_chunks
    actions: list[dict[str, Any]] = [
        {
            "_op_type": "index",
            "_index": name,
            "_id": d["chunk_id"],
            "_source": d,
        }
        for d in docs
    ]
    if not actions:
        return 0, 0

    success, errors = bulk(get_opensearch(), actions, raise_on_error=False, request_timeout=120)
    # ``bulk`` returns a list of error items here, but an int count in stats-only
    # mode. Normalise once so the log line and the return value always agree.
    error_count = len(errors) if isinstance(errors, list) else int(errors)
    if error_count:
        logger.warning("opensearch.bulk.errors", count=error_count)
    return success, error_count
|
||||
|
||||
|
||||
def delete_by_document(document_id: str, index: str | None = None) -> int:
    """Delete every indexed chunk of ``document_id`` and return how many were removed.

    Returns 0 when the index does not exist. The index is refreshed as part of
    the delete request.
    """
    target = index or settings.opensearch_index_chunks
    os_client = get_opensearch()
    if not os_client.indices.exists(index=target):
        return 0

    by_document = {"query": {"term": {"document_id": document_id}}}
    outcome = os_client.delete_by_query(index=target, body=by_document, refresh=True)
    return int(outcome.get("deleted", 0))
|
||||
103
app/indexing/qdrant_client.py
Normal file
103
app/indexing/qdrant_client.py
Normal file
@@ -0,0 +1,103 @@
|
||||
"""Qdrant client + collection bootstrap + chunk upsert."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from functools import lru_cache
|
||||
from typing import Any, Sequence
|
||||
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http import models as qm
|
||||
|
||||
from app.config import settings
|
||||
from app.logging_config import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
DENSE_VECTOR_NAME = "dense"
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_qdrant() -> QdrantClient:
    """Return the cached Qdrant client configured from the ``qdrant_*`` settings."""
    # An empty API key is passed as None.
    api_key = settings.qdrant_api_key or None
    client = QdrantClient(
        host=settings.qdrant_host,
        port=settings.qdrant_port,
        api_key=api_key,
        timeout=60,
    )
    return client
|
||||
|
||||
|
||||
def ensure_collection(collection: str | None = None, dim: int | None = None) -> None:
    """Create the chunk collection and its payload indexes unless it already exists.

    ``collection`` defaults to ``settings.qdrant_collection_chunks`` and ``dim``
    to ``settings.embedding_dim``. Payload indexes are only created together
    with the collection.
    """
    target = collection or settings.qdrant_collection_chunks
    vector_size = dim or settings.embedding_dim
    qdrant = get_qdrant()

    if any(c.name == target for c in qdrant.get_collections().collections):
        logger.debug("qdrant.collection.exists", collection=target)
        return

    logger.info("qdrant.collection.create", collection=target, dim=vector_size)
    dense_params = qm.VectorParams(size=vector_size, distance=qm.Distance.COSINE)
    qdrant.create_collection(
        collection_name=target,
        vectors_config={DENSE_VECTOR_NAME: dense_params},
        optimizers_config=qm.OptimizersConfigDiff(default_segment_number=2),
    )

    # Payload indexes for filtering, created in this order.
    payload_indexes = (
        ("document_id", qm.PayloadSchemaType.KEYWORD),
        ("source_path", qm.PayloadSchemaType.KEYWORD),
        ("block_type", qm.PayloadSchemaType.KEYWORD),
        ("page_number", qm.PayloadSchemaType.INTEGER),
        ("ocr_confidence", qm.PayloadSchemaType.FLOAT),
    )
    for field_name, schema in payload_indexes:
        qdrant.create_payload_index(
            collection_name=target,
            field_name=field_name,
            field_schema=schema,
        )
|
||||
|
||||
|
||||
def upsert_chunks(
    points: Sequence[tuple[str, list[float], dict[str, Any]]],
    collection: str | None = None,
) -> int:
    """Upsert ``(chunk_id, vector, payload)`` triples and return how many were sent.

    ``chunk_id`` is also written into each payload. The request is issued with
    ``wait=False``, so the returned count is the number of points submitted.
    """
    target = collection or settings.qdrant_collection_chunks
    if not points:
        return 0

    batch: list[qm.PointStruct] = []
    for chunk_id, vector, payload in points:
        batch.append(
            qm.PointStruct(
                id=_qid(chunk_id),
                vector={DENSE_VECTOR_NAME: vector},
                payload={**payload, "chunk_id": chunk_id},
            )
        )

    get_qdrant().upsert(collection_name=target, points=batch, wait=False)
    return len(batch)
|
||||
|
||||
|
||||
def delete_by_document(document_id: str, collection: str | None = None) -> int:
    """Delete every point whose ``document_id`` payload matches.

    Args:
        document_id: Value matched against the ``document_id`` payload key.
        collection: Collection name; falls back to
            ``settings.qdrant_collection_chunks`` when falsy.

    Returns:
        Always ``1``; the number of removed points is not read back.
    """
    target_name = collection or settings.qdrant_collection_chunks
    qdrant = get_qdrant()

    same_document = qm.FieldCondition(
        key="document_id",
        match=qm.MatchValue(value=document_id),
    )
    selector = qm.FilterSelector(filter=qm.Filter(must=[same_document]))
    qdrant.delete(collection_name=target_name, points_selector=selector)
    return 1
|
||||
|
||||
|
||||
def _qid(chunk_id: str) -> str:
    """Map a chunk id to a Qdrant point id.

    Qdrant accepts UUID strings or unsigned ints; chunk ids are UUIDs
    already, so the value is passed through unchanged.
    """
    point_id = chunk_id
    return point_id
|
||||
75
app/indexing/reranker.py
Normal file
75
app/indexing/reranker.py
Normal file
@@ -0,0 +1,75 @@
|
||||
"""BGE reranker - cross-encoder style scoring of (query, passage) pairs.
|
||||
|
||||
Designed to degrade gracefully:
|
||||
- If the model fails to load, ``rerank`` returns inputs unchanged with the
|
||||
``reranked`` flag set to False so the API can report the truth to clients.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from functools import lru_cache
|
||||
from typing import Sequence
|
||||
|
||||
from app.config import settings
|
||||
from app.logging_config import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class Reranker:
    """Scores (query, passage) pairs with a cross-encoder style model.

    Loading tries FlagEmbedding first and sentence-transformers second. If
    both fail the instance stays usable but ``available`` is False and
    ``score`` returns zeros.
    """

    def __init__(self, model_name: str, device: str, batch_size: int) -> None:
        self.model_name = model_name
        self.device = device
        self.batch_size = batch_size
        self._impl: str | None = None
        self._model = None
        self._load()

    def _load(self) -> None:
        """Populate ``_model`` / ``_impl`` from the first backend that loads."""
        try:
            from FlagEmbedding import FlagReranker  # type: ignore

            half_precision = self.device != "cpu"
            self._model = FlagReranker(
                self.model_name, use_fp16=half_precision, devices=self.device
            )
            self._impl = "flagembedding"
            logger.info(
                "reranker.loaded",
                impl="flagembedding",
                model=self.model_name,
                device=self.device,
            )
            return
        except Exception as exc:  # noqa: BLE001
            logger.warning("reranker.flagembedding_failed", error=str(exc))

        try:
            from sentence_transformers import CrossEncoder

            self._model = CrossEncoder(self.model_name, device=self.device)
            self._impl = "sentence-transformers"
            logger.info(
                "reranker.loaded", impl="sentence-transformers", model=self.model_name
            )
        except Exception as exc:  # noqa: BLE001
            # Neither backend is usable: leave the reranker disabled.
            logger.error("reranker.disabled", error=str(exc))
            self._impl = None
            self._model = None

    @property
    def available(self) -> bool:
        """True when a backend name and a model object are both set."""
        return self._impl is not None and self._model is not None

    def score(self, query: str, passages: Sequence[str]) -> list[float]:
        """Return one float score per passage, in input order.

        Returns a list of ``0.0`` (one per passage) when no model is loaded
        or ``passages`` is empty.
        """
        if not passages or not self.available:
            return [0.0] * len(passages)

        pairs = [(query, passage) for passage in passages]
        if self._impl == "flagembedding":
            raw = self._model.compute_score(pairs, batch_size=self.batch_size, normalize=True)  # type: ignore[union-attr]
        else:
            raw = self._model.predict(pairs, batch_size=self.batch_size)  # type: ignore[union-attr]

        if isinstance(raw, list):
            values = raw
        else:
            # Backends may hand back an array-like or a bare scalar.
            try:
                values = list(raw)
            except TypeError:
                values = [float(raw)]
        return [float(value) for value in values]
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_reranker() -> Reranker:
    """Build the reranker from settings once and reuse it on later calls."""
    options = {
        "model_name": settings.reranker_model,
        "device": settings.reranker_device,
        "batch_size": settings.reranker_batch_size,
    }
    return Reranker(**options)
|
||||
0
app/ingestion/__init__.py
Normal file
0
app/ingestion/__init__.py
Normal file
317
app/ingestion/chunker.py
Normal file
317
app/ingestion/chunker.py
Normal file
@@ -0,0 +1,317 @@
|
||||
"""Structure-aware chunking.
|
||||
|
||||
Rules (per spec):
|
||||
- Chunk by document structure first, fixed-size second.
|
||||
- Hierarchy: title > heading > paragraph > list > table > figure caption.
|
||||
- Target 500-900 tokens (configurable).
|
||||
- Overlap 80-120 tokens for long narrative text only.
|
||||
- Never split tables - one table = one chunk (or one chunk per row group if huge).
|
||||
- Every chunk carries citation metadata.
|
||||
|
||||
We use a deliberately simple ``len(text.split())`` token estimator. The downstream
|
||||
embedding model has its own tokenizer; this estimator is only a budget proxy.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
from app.config import settings
|
||||
from app.ingestion.docling_extractor import (
|
||||
ExtractedBlock,
|
||||
ExtractedFigure,
|
||||
ExtractedTable,
|
||||
ExtractionResult,
|
||||
)
|
||||
from app.ingestion.normalizer import normalize_block
|
||||
from app.ingestion.quality import compute_quality_flags
|
||||
|
||||
|
||||
@dataclass
class ChunkRecord:
    """One chunk emitted by ``chunk_extraction``, with its citation metadata."""

    # Running position of the chunk in the list returned by chunk_extraction.
    chunk_index: int
    # Page the chunk is attributed to.
    page_number: int
    # "table", "figure_caption", "figure_description", or the block type of a packed piece.
    block_type: str
    # Display text: first value returned by normalize_block.
    text: str
    # Search form of the text: second value returned by normalize_block.
    normalized_text: str
    # Whitespace word-count estimate of ``text`` (see _estimate_tokens).
    token_count: int
    # Source block identifier when one is known.
    block_id: str | None = None
    # Output of compute_quality_flags for this chunk.
    quality_flags: dict[str, Any] = field(default_factory=dict)
    # Per-kind extras, e.g. table_index/summary, figure_index or section_heading.
    metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
def _estimate_tokens(text: str) -> int:
    """Estimate tokens as the whitespace-separated word count, never below 1."""
    word_count = len(text.split())
    return word_count if word_count > 1 else 1
|
||||
|
||||
|
||||
def chunk_extraction(
    extraction: ExtractionResult,
    *,
    document_ocr_confidence: float | None = None,
) -> list[ChunkRecord]:
    """Turn an extraction result into a flat list of chunk records.

    Chunks are emitted in three passes and numbered in emission order:

    1. one chunk per table with non-empty markdown (never split), prefixed
       with the heuristic summary from ``_summarize_table``;
    2. one chunk per figure (caption when present, plus a placeholder line);
    3. narrative blocks, grouped per page and per section, packed by
       ``_pack_group`` using the token budgets from settings.

    Args:
        extraction: Blocks, tables and figures to chunk.
        document_ocr_confidence: Passed to ``compute_quality_flags`` as the
            ``ocr_confidence`` of every chunk.

    Returns:
        Chunk records whose ``chunk_index`` equals their list position.
    """
    target = settings.chunk_target_tokens
    minimum = settings.chunk_min_tokens
    maximum = settings.chunk_max_tokens
    # NOTE: ``settings.chunk_overlap_tokens`` is deliberately not applied here.
    # No duplicate overlap chunks are emitted, to avoid index bloat; overlap
    # ("only for long narrative text") is left for future tuning.

    chunks: list[ChunkRecord] = []

    def emit(
        *,
        text: str,
        page_number: int,
        block_type: str,
        block_id: str | None,
        metadata: dict[str, Any],
    ) -> None:
        """Normalize ``text``, compute its quality flags and append one record."""
        display, norm = normalize_block(text)
        flags = compute_quality_flags(
            text=display,
            block_type=block_type,
            ocr_confidence=document_ocr_confidence,
        )
        chunks.append(
            ChunkRecord(
                chunk_index=len(chunks),  # next free position == running index
                page_number=page_number,
                block_type=block_type,
                text=display,
                normalized_text=norm,
                token_count=_estimate_tokens(display),
                block_id=block_id,
                quality_flags=flags,
                metadata=metadata,
            )
        )

    # 1) Tables first - one chunk per table, never split.
    for t in extraction.tables:
        body = (t.markdown or "").strip()
        if not body:
            continue
        summary = _summarize_table(t)
        emit(
            text=f"{summary}\n\n{body}" if summary else body,
            page_number=t.page_number,
            block_type="table",
            block_id=t.block_id or f"table:{t.table_index}",
            metadata={"table_index": t.table_index, "summary": summary or ""},
        )

    # 2) Figures - caption + placeholder description.
    for f in extraction.figures:
        lines = [f"Caption: {f.caption}"] if f.caption else []
        lines.append(f"Figure detected on page {f.page_number}.")
        emit(
            text="\n".join(lines),
            page_number=f.page_number,
            block_type="figure_caption" if f.caption else "figure_description",
            block_id=f.block_id or f"figure:{f.figure_index}",
            metadata={"figure_index": f.figure_index},
        )

    # 3) Narrative blocks grouped per page, packed by structure.
    by_page: dict[int, list[ExtractedBlock]] = {}
    for b in extraction.blocks:
        by_page.setdefault(b.page_number, []).append(b)

    for page_no in sorted(by_page):
        for group in _group_by_section(by_page[page_no]):
            for piece in _pack_group(group, target=target, maximum=maximum, minimum=minimum):
                emit(
                    text=piece["text"],
                    page_number=page_no,
                    block_type=piece["block_type"],
                    block_id=piece.get("block_id"),
                    metadata={"section_heading": piece.get("section") or ""},
                )

    return chunks
|
||||
|
||||
|
||||
# ---------------- Helpers ----------------
|
||||
|
||||
def _group_by_section(blocks: list[ExtractedBlock]) -> list[list[ExtractedBlock]]:
    """Split ``blocks`` into runs, starting a new run at each title/heading.

    A title or heading only opens a new group when the current one already
    holds something, so consecutive headings each form their own group and
    leading non-heading blocks form a group without a heading.
    """
    sections: list[list[ExtractedBlock]] = []
    pending: list[ExtractedBlock] = []
    for block in blocks:
        opens_section = block.block_type in ("title", "heading")
        if opens_section and pending:
            sections.append(pending)
            pending = []
        pending.append(block)
    if pending:
        sections.append(pending)
    return sections
|
||||
|
||||
|
||||
def _pack_group(
    group: list[ExtractedBlock], *, target: int, maximum: int, minimum: int
) -> list[dict[str, Any]]:
    """Pack one section's blocks into chunk dicts.

    Title/heading texts are joined with ``" > "`` into a section anchor that
    is stored under ``"section"`` and, when shorter than 200 characters,
    prepended to each packed chunk as a ``# heading`` line.

    Body blocks accumulate in a buffer that is flushed once it reaches
    ``target`` tokens, or before adding a block that would push it past
    ``maximum`` (only if the buffer already holds at least ``minimum``
    tokens, so a chunk can still exceed ``maximum``). A single block of
    ``maximum`` tokens or more is split by ``_split_long_text`` and emitted
    without the heading prefix.

    Returns:
        Dicts with keys ``text``, ``block_type``, ``block_id`` and ``section``.
    """
    if not group:
        return []

    heading_types = ("title", "heading")
    section_heading = ""
    for block in group:
        if block.block_type not in heading_types:
            continue
        if section_heading:
            section_heading = (section_heading + " > " + block.text).strip(" >")
        else:
            section_heading = block.text
    body_blocks = [block for block in group if block.block_type not in heading_types]

    if not body_blocks:
        # Heading-only group: emit it as a single chunk so the title is searchable.
        anchor = group[0]
        return [
            {
                "text": section_heading or anchor.text,
                "block_type": "heading",
                "block_id": anchor.block_id,
                "section": section_heading,
            }
        ]

    pieces: list[dict[str, Any]] = []
    pending_texts: list[str] = []
    pending_ids: list[str] = []
    pending_type = "paragraph"
    pending_tokens = 0

    def flush() -> None:
        """Emit the buffered texts as one chunk (if non-blank) and reset the buffer."""
        nonlocal pending_tokens
        if not pending_texts:
            return
        joined = "\n\n".join(pending_texts).strip()
        if joined:
            # Prepend section heading for context (kept short).
            if section_heading and len(section_heading) < 200:
                joined = f"# {section_heading}\n\n{joined}"
            pieces.append(
                {
                    "text": joined,
                    "block_type": pending_type,
                    "block_id": pending_ids[0] if pending_ids else None,
                    "section": section_heading,
                }
            )
        pending_texts.clear()
        pending_ids.clear()
        pending_tokens = 0

    for block in body_blocks:
        size = _estimate_tokens(block.text)
        if size >= maximum:
            # Oversized block: close the buffer, then hard-split the block.
            flush()
            pieces.extend(
                {
                    "text": part,
                    "block_type": block.block_type,
                    "block_id": block.block_id,
                    "section": section_heading,
                }
                for part in _split_long_text(block.text, target=target, maximum=maximum)
            )
            continue

        if pending_tokens >= minimum and pending_tokens + size > maximum:
            flush()

        if not pending_texts:
            # The first block in a buffer decides the chunk's block type.
            pending_type = block.block_type
        pending_texts.append(block.text)
        if block.block_id:
            pending_ids.append(block.block_id)
        pending_tokens += size

        if pending_tokens >= target:
            flush()

    flush()
    return pieces
|
||||
|
||||
|
||||
def _split_long_text(text: str, *, target: int, maximum: int) -> list[str]:
    """Split ``text`` into consecutive word windows of about ``target`` words.

    Every word of the input ends up in exactly one piece. The window size is
    ``target`` (500 when ``target`` is not positive), capped at ``maximum``
    when that is positive and smaller. A short trailing remainder (fewer
    than ``target // 4`` words) is folded into the last window, but only if
    the merged piece still fits within ``maximum``; otherwise it is emitted
    as its own piece.

    Args:
        text: Text to split on whitespace.
        target: Desired words per piece.
        maximum: Upper bound on words per piece.

    Returns:
        Pieces joined with single spaces; ``[]`` for blank input.
    """
    words = text.split()
    if not words:
        return []

    step = target if target > 0 else 500
    if 0 < maximum < step:
        # Never emit a window above the hard limit.
        step = maximum
    merge_below = target // 4

    pieces: list[str] = []
    total = len(words)
    start = 0
    while start < total:
        stop = start + step
        leftover = total - stop
        # Absorb a small tail rather than emitting a tiny final chunk, but
        # only when that keeps the piece within ``maximum``.
        if 0 < leftover < merge_below and step + leftover <= maximum:
            stop = total
        pieces.append(" ".join(words[start:stop]))
        start = stop
    return pieces
|
||||
|
||||
|
||||
def _tail_tokens(text: str, n: int) -> str:
    """Return the last ``n`` whitespace-separated words of ``text``.

    When ``text`` has ``n`` words or fewer it is returned unchanged
    (original spacing included); otherwise the tail is re-joined with
    single spaces.
    """
    tokens = text.split()
    return text if len(tokens) <= n else " ".join(tokens[-n:])
|
||||
|
||||
|
||||
def _summarize_table(t: ExtractedTable) -> str:
    """Build a heuristic one-line summary of a Markdown table for index recall.

    Only lines starting with ``|`` are considered. The first such line is
    read as the header; the row count is the number of such lines minus two
    (header and separator), floored at zero. At most six column names are
    listed. Without any non-empty header cell only the page is reported.
    """
    md = t.markdown or ""
    pipe_lines = [line for line in md.splitlines() if line.startswith("|")]
    header_line = pipe_lines[0] if pipe_lines else ""
    header_cells = [
        cell.strip() for cell in header_line.strip("|").split("|") if cell.strip()
    ]
    if not header_cells:
        return f"Table on page {t.page_number}."

    n_cols = len(header_cells)
    n_rows = max(0, len(pipe_lines) - 2)
    header_preview = ", ".join(header_cells[:6])
    return (
        f"Table on page {t.page_number}: {n_rows} rows x {n_cols} cols. "
        f"Columns: {header_preview}."
    )
|
||||
384
app/ingestion/docling_extractor.py
Normal file
384
app/ingestion/docling_extractor.py
Normal file
@@ -0,0 +1,384 @@
|
||||
"""Docling structured extraction.
|
||||
|
||||
Docling produces a hierarchical document model with reading order, layout, tables
|
||||
and figures. We export both Markdown and a JSON representation, then walk the
|
||||
JSON to emit normalized blocks (title, heading, paragraph, list, table caption,
|
||||
figure caption) for downstream chunking.
|
||||
|
||||
The extractor is intentionally defensive: Docling's exact Python API has
|
||||
shifted across releases. We probe for the safest exporter methods and fall
|
||||
back to ``str(document)`` only as a last resort.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from app.config import settings
|
||||
from app.logging_config import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ExtractedBlock:
    """One text item from the Docling export, as produced by ``_walk_blocks``."""

    # Page resolved by _page_of (falls back to 1 when no page is recorded).
    page_number: int
    # Normalized type looked up in _DOCLING_LABEL_TO_BLOCK ("paragraph" if unknown).
    block_type: str
    # Item text with surrounding whitespace stripped; never empty.
    text: str
    # The item's "self_ref" or "id" value, when present.
    block_id: str | None = None
    # Extra details; _walk_blocks stores the lower-cased source label here.
    extra: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
class ExtractedTable:
    """One table from the Docling export, as produced by ``_walk_tables``."""

    # Page resolved by _page_of.
    page_number: int
    # Position of the table in the exported "tables" list.
    table_index: int
    # Markdown rendering from _table_markdown ("" when none could be built).
    markdown: str
    # CSV rendering from _table_csv, or None when no row grid was found.
    csv_text: str | None = None
    # The raw exported table dict.
    json_data: dict[str, Any] | None = None
    # The table's "self_ref" or "id" value, when present.
    block_id: str | None = None
|
||||
|
||||
|
||||
@dataclass
class ExtractedFigure:
    """One picture/figure from the Docling export, as produced by ``_walk_figures``."""

    # Page resolved by _page_of.
    page_number: int
    # Position of the figure in the exported "pictures"/"figures" list.
    figure_index: int
    # Stripped caption text, or None when absent or blank.
    caption: str | None
    # The figure's "self_ref" or "id" value, when present.
    block_id: str | None = None
    # Optional image crop; _walk_figures does not populate it.
    image_bytes: bytes | None = None
    # Image format suffix for the crop, e.g. "png".
    image_ext: str = "png"
|
||||
|
||||
|
||||
@dataclass
class ExtractedPage:
    """Per-page summary assembled by ``_walk_pages``."""

    page_number: int
    # Texts of the page's blocks joined with blank lines.
    text: str
    # True when at least one extracted table is on this page.
    has_tables: bool = False
    # True when at least one extracted figure is on this page.
    has_figures: bool = False
    # Not set by _walk_pages; stays at the default there.
    has_handwriting: bool = False
    # Not set by _walk_pages; stays None there.
    ocr_confidence: float | None = None
|
||||
|
||||
|
||||
@dataclass
class ExtractionResult:
    """Everything ``extract`` returns for one PDF."""

    # Markdown export of the whole document (see _safe_export_markdown).
    markdown: str
    # Dict export of the document (see _safe_export_dict); may be empty.
    json_payload: dict[str, Any]
    # Text blocks from _walk_blocks.
    blocks: list[ExtractedBlock]
    # Tables from _walk_tables.
    tables: list[ExtractedTable]
    # Figures from _walk_figures.
    figures: list[ExtractedFigure]
    # Per-page summaries from _walk_pages, sorted by page number.
    pages: list[ExtractedPage]
|
||||
|
||||
|
||||
def extract(pdf_path: Path) -> ExtractionResult:
    """Convert ``pdf_path`` with Docling and return the normalized result.

    Docling is imported lazily inside the function. Table structure
    recognition is always on; Docling's own OCR follows
    ``settings.docling_ocr_enabled``. Cell matching and page-image
    generation are switched on best-effort and silently skipped when the
    option cannot be set.
    """
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    options = PdfPipelineOptions()
    # We let OCRmyPDF do the heavy OCR; Docling OCR is opt-in.
    options.do_ocr = settings.docling_ocr_enabled
    options.do_table_structure = True
    try:
        options.table_structure_options.do_cell_matching = True
    except Exception:  # noqa: BLE001 - older docling versions lack this
        pass
    try:
        options.generate_page_images = True
    except Exception:  # noqa: BLE001
        pass

    pdf_format = PdfFormatOption(pipeline_options=options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    source = str(pdf_path)
    logger.info("docling.start", input=source)
    doc = converter.convert(source).document

    markdown = _safe_export_markdown(doc)
    json_payload = _safe_export_dict(doc)

    blocks = _walk_blocks(json_payload)
    tables = _walk_tables(doc, json_payload)
    figures = _walk_figures(doc, json_payload)
    pages = _walk_pages(json_payload, blocks, tables, figures)

    counts = {
        "pages": len(pages),
        "blocks": len(blocks),
        "tables": len(tables),
        "figures": len(figures),
    }
    logger.info("docling.done", **counts)
    return ExtractionResult(
        markdown=markdown,
        json_payload=json_payload,
        blocks=blocks,
        tables=tables,
        figures=figures,
        pages=pages,
    )
|
||||
|
||||
|
||||
# ---------------- Internal helpers ----------------
|
||||
|
||||
def _safe_export_markdown(doc: Any) -> str:
    """Export ``doc`` to Markdown using the first exporter method that works.

    Tries ``export_to_markdown`` then ``to_markdown``; an exporter that
    raises is skipped. Falls back to ``str(doc)`` when neither succeeds.
    """
    for method_name in ("export_to_markdown", "to_markdown"):
        exporter = getattr(doc, method_name, None)
        if not callable(exporter):
            continue
        try:
            return exporter()
        except Exception:  # noqa: BLE001
            pass
    return str(doc)
|
||||
|
||||
|
||||
def _safe_export_dict(doc: Any) -> dict[str, Any]:
    """Export ``doc`` to a dict using the first exporter method that yields one.

    Tries ``export_to_dict``, ``model_dump`` and ``dict`` in that order,
    skipping exporters that raise or return a non-dict. As a last resort
    the output of ``model_dump_json`` is parsed; any failure there yields
    an empty dict.
    """
    for method_name in ("export_to_dict", "model_dump", "dict"):
        exporter = getattr(doc, method_name, None)
        if not callable(exporter):
            continue
        try:
            exported = exporter()
        except Exception:  # noqa: BLE001
            continue
        if isinstance(exported, dict):
            return exported

    # Last resort: serialize via JSON round-trip
    try:
        dump_json = getattr(doc, "model_dump_json", lambda: "{}")
        return json.loads(dump_json())
    except Exception:  # noqa: BLE001
        return {}
|
||||
|
||||
|
||||
# Maps a lower-cased item label from the Docling export to the block type used
# downstream. Both underscore and hyphen spellings are listed. Labels missing
# from this table are treated as "paragraph" by _walk_blocks.
# NOTE(review): "page_header"/"header" map to "heading", so such items start a
# new section in the chunker - confirm this is intended for running headers.
_DOCLING_LABEL_TO_BLOCK = {
    "title": "title",
    "section_header": "heading",
    "section-header": "heading",
    "subtitle": "heading",
    "page_header": "heading",
    "header": "heading",
    "list_item": "list",
    "list-item": "list",
    "list": "list",
    "paragraph": "paragraph",
    "text": "paragraph",
    "caption": "figure_caption",
    "figure": "figure_caption",
    "table": "table",
    "footnote": "paragraph",
}
|
||||
|
||||
|
||||
def _walk_blocks(payload: dict[str, Any]) -> list[ExtractedBlock]:
    """Flatten the export's text items into blocks, keeping their order.

    Items are read from ``texts``, else ``text_items``, else
    ``body.text_items``. Non-dict items and items whose text is blank are
    skipped. The label comes from ``label`` or ``category`` (default
    ``"paragraph"``) and is lower-cased before the block-type lookup.
    """
    candidates = payload.get("texts") or payload.get("text_items")
    if not candidates:
        candidates = payload.get("body", {}).get("text_items", []) or []
    if not isinstance(candidates, list):
        return []

    found: list[ExtractedBlock] = []
    for entry in candidates:
        if not isinstance(entry, dict):
            continue
        label = (entry.get("label") or entry.get("category") or "paragraph").lower()
        body = (entry.get("text") or "").strip()
        if not body:
            continue
        found.append(
            ExtractedBlock(
                page_number=_page_of(entry),
                block_type=_DOCLING_LABEL_TO_BLOCK.get(label, "paragraph"),
                text=body,
                block_id=entry.get("self_ref") or entry.get("id"),
                extra={"label": label},
            )
        )
    return found
|
||||
|
||||
|
||||
def _walk_tables(doc: Any, payload: dict[str, Any]) -> list[ExtractedTable]:
    """Build one ``ExtractedTable`` per dict entry of ``payload["tables"]``.

    ``table_index`` is the entry's position in the exported list, so it is
    not renumbered when non-dict entries are skipped.
    """
    collected: list[ExtractedTable] = []
    for position, entry in enumerate(payload.get("tables") or []):
        if not isinstance(entry, dict):
            continue
        collected.append(
            ExtractedTable(
                page_number=_page_of(entry),
                table_index=position,
                markdown=_table_markdown(doc, entry, position),
                csv_text=_table_csv(entry),
                json_data=entry,
                block_id=entry.get("self_ref") or entry.get("id"),
            )
        )
    return collected
|
||||
|
||||
|
||||
def _walk_figures(doc: Any, payload: dict[str, Any]) -> list[ExtractedFigure]:
    """Build one ``ExtractedFigure`` per dict entry of ``pictures`` (else ``figures``).

    ``doc`` is accepted for signature parity with the other walkers but is
    not used. A blank or missing ``caption`` becomes ``None``.
    """
    raw_entries = payload.get("pictures") or payload.get("figures") or []
    return [
        ExtractedFigure(
            page_number=_page_of(entry),
            figure_index=position,
            caption=(entry.get("caption") or "").strip() or None,
            block_id=entry.get("self_ref") or entry.get("id"),
        )
        for position, entry in enumerate(raw_entries)
        if isinstance(entry, dict)
    ]
|
||||
|
||||
|
||||
def _walk_pages(
    payload: dict[str, Any],
    blocks: list[ExtractedBlock],
    tables: list[ExtractedTable],
    figures: list[ExtractedFigure],
) -> list[ExtractedPage]:
    """Assemble one ``ExtractedPage`` per known page number, in ascending order.

    Page numbers come from ``payload["pages"]`` (dict keys convertible to
    int, or list entries carrying an int ``page_no``/``page``/``number``)
    plus every page referenced by a block, table or figure. Page 0 is
    dropped, and page 1 is assumed when nothing else is known.
    """
    pages_meta = payload.get("pages") or {}
    numbers: set[int] = set()
    if isinstance(pages_meta, dict):
        for key in pages_meta:
            try:
                numbers.add(int(key))
            except (ValueError, TypeError):
                pass
    elif isinstance(pages_meta, list):
        for entry in pages_meta:
            if not isinstance(entry, dict):
                continue
            candidate = entry.get("page_no") or entry.get("page") or entry.get("number")
            if isinstance(candidate, int):
                numbers.add(candidate)

    numbers.update(block.page_number for block in blocks)
    numbers.update(table.page_number for table in tables)
    numbers.update(figure.page_number for figure in figures)
    numbers.discard(0)
    if not numbers:
        numbers = {1}

    texts_by_page: dict[int, list[str]] = {number: [] for number in numbers}
    for block in blocks:
        texts_by_page.setdefault(block.page_number, []).append(block.text)

    table_pages = {table.page_number for table in tables}
    figure_pages = {figure.page_number for figure in figures}

    pages: list[ExtractedPage] = []
    for number in sorted(numbers):
        pages.append(
            ExtractedPage(
                page_number=number,
                text="\n\n".join(texts_by_page.get(number, [])),
                has_tables=number in table_pages,
                has_figures=number in figure_pages,
            )
        )
    return pages
|
||||
|
||||
|
||||
def _page_of(item: dict[str, Any]) -> int:
    """Resolve the page number of an exported item, defaulting to 1.

    The first provenance entry (``prov`` or ``provenance``) is consulted
    before the item itself; in each, ``page_no``, ``page`` and
    ``page_number`` are tried in that order and only an int is accepted.
    """
    provenance = item.get("prov") or item.get("provenance")
    sources: list[dict[str, Any]] = []
    if isinstance(provenance, list) and provenance and isinstance(provenance[0], dict):
        sources.append(provenance[0])
    sources.append(item)

    for source in sources:
        number = source.get("page_no") or source.get("page") or source.get("page_number")
        if isinstance(number, int):
            return number
    return 1
|
||||
|
||||
|
||||
def _table_markdown(doc: Any, raw: dict[str, Any], idx: int) -> str:
    """Render one exported table as Markdown, or ``""`` when that is not possible.

    Resolution order:

    1. ``doc.export_table_to_markdown(idx)`` when the document offers it
       (errors from it are ignored);
    2. the table's nested ``data`` mapping, when ``raw["data"]`` is a dict
       holding ``grid`` / ``table_cells``;
    3. the top-level ``data`` / ``table_cells`` / ``grid`` keys of ``raw``.

    Within a layer, a list of rows is rendered with ``_grid_to_markdown``
    and a flat cell list with ``_cells_to_markdown``.

    Args:
        doc: Converted document object; only probed for an exporter method.
        raw: The exported table dict.
        idx: Position of the table in the exported list.
    """
    # Try Docling's own export first (per-table).
    try:
        export = getattr(doc, "export_table_to_markdown", None)
        if callable(export):
            return export(idx)
    except Exception:  # noqa: BLE001
        pass

    nested = raw.get("data")
    # A mapping under "data" carries the cells itself; look inside it first
    # instead of letting the truthy dict shadow every other key.
    layers = (nested, raw) if isinstance(nested, dict) else (raw,)
    for layer in layers:
        data = layer.get("data")
        if isinstance(data, dict):
            data = None
        grid = data or layer.get("table_cells") or layer.get("grid")
        if isinstance(grid, list) and grid and isinstance(grid[0], list):
            return _grid_to_markdown(grid)
        cells = layer.get("table_cells")
        if isinstance(cells, list):
            rendered = _cells_to_markdown(cells)
            if rendered:
                return rendered
    return ""
|
||||
|
||||
|
||||
def _grid_to_markdown(grid: list[list[Any]]) -> str:
    """Render a row-major grid as a pipe-delimited Markdown table.

    The first row is the header and fixes the column count; shorter body
    rows are padded with empty cells and longer ones are truncated. Cells
    may be dicts (``text`` or ``value`` is used) or plain values. Pipes are
    escaped, line breaks inside a cell are folded to spaces so each row
    stays on one line, and ``None`` renders as an empty cell.

    Returns:
        The Markdown table, or ``""`` for an empty grid.
    """
    if not grid:
        return ""

    def _cell(c: Any) -> str:
        if isinstance(c, dict):
            value = c.get("text") or c.get("value") or ""
        elif c is None:
            # A missing cell must render as empty, not as the literal "None".
            value = ""
        else:
            value = c
        escaped = str(value).replace("|", "\\|")
        # A Markdown table row cannot span lines: fold line breaks to spaces.
        return " ".join(escaped.splitlines()).strip()

    header = grid[0]
    body = grid[1:]
    cols = len(header)
    out = ["| " + " | ".join(_cell(c) for c in header) + " |"]
    out.append("| " + " | ".join(["---"] * cols) + " |")
    for row in body:
        cells = [_cell(c) for c in row]
        if len(cells) < cols:
            cells += [""] * (cols - len(cells))
        out.append("| " + " | ".join(cells[:cols]) + " |")
    return "\n".join(out)
|
||||
|
||||
|
||||
def _cells_to_markdown(cells: list[Any]) -> str:
    """Arrange a flat list of cell dicts into a grid and render it as Markdown.

    Each cell is placed by ``start_row_offset_idx`` / ``start_col_offset_idx``
    (falling back to ``row`` / ``col``, then 0). Positions without a cell
    stay empty. Non-dict entries are ignored.

    Returns:
        The Markdown table, or ``""`` when no usable cell was found.
    """
    rows: dict[int, dict[int, Any]] = {}
    for c in cells:
        if not isinstance(c, dict):
            continue
        r = c.get("start_row_offset_idx", c.get("row", 0)) or 0
        col = c.get("start_col_offset_idx", c.get("col", 0)) or 0
        # Keep the raw text: _grid_to_markdown escapes pipes and trims.
        # Escaping here as well would turn "|" into "\\|", which Markdown
        # reads as a literal backslash followed by a column break.
        rows.setdefault(r, {})[col] = c.get("text") or ""
    if not rows:
        return ""

    max_col = max((max(r.keys()) for r in rows.values()), default=0)
    grid = [
        [rows[r_idx].get(c, "") for c in range(max_col + 1)]
        for r_idx in sorted(rows)
    ]
    return _grid_to_markdown(grid)
|
||||
|
||||
|
||||
def _table_csv(raw: dict[str, Any]) -> str | None:
    """Render an exported table's row grid as CSV text.

    The grid is taken from ``raw["data"]["grid"]`` when ``data`` is a
    mapping, otherwise from ``raw["data"]`` itself, with ``raw["grid"]`` as
    the fallback in both cases. Cells may be dicts (their ``text`` is used)
    or plain values; falsy cells are written as empty strings.

    Returns:
        CSV text, or ``None`` when no non-empty list of rows is found.
    """
    import csv
    import io

    data = raw.get("data")
    if isinstance(data, dict):
        # Nested table data: the rows live one level down.
        grid = data.get("grid") or raw.get("grid")
    else:
        grid = data or raw.get("grid")
    if not (isinstance(grid, list) and grid and isinstance(grid[0], list)):
        return None

    buf = io.StringIO()
    writer = csv.writer(buf)
    for row in grid:
        writer.writerow([
            (c.get("text") if isinstance(c, dict) else c) or "" for c in row
        ])
    return buf.getvalue()
|
||||
78
app/ingestion/figure_processor.py
Normal file
78
app/ingestion/figure_processor.py
Normal file
@@ -0,0 +1,78 @@
|
||||
"""Persists Docling figures to PostgreSQL + MinIO (caption + optional crop)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
|
||||
from sqlalchemy import select
|
||||
|
||||
from app.db.models import ArtifactType, DocumentArtifact, Figure
|
||||
from app.ingestion.docling_extractor import ExtractedFigure
|
||||
from app.logging_config import get_logger
|
||||
from app.storage.local_paths import key_figure_crop
|
||||
from app.storage.minio_client import MinioStorage
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
def persist_figures(
    db,
    storage: MinioStorage,
    document_id: uuid.UUID,
    figures: list[ExtractedFigure],
    page_id_by_number: dict[int, uuid.UUID],
) -> int:
    """Create or update one ``Figure`` row per extracted figure.

    Rows are matched on ``(document_id, figure_index)``. Caption and
    description are always rewritten. When the figure carries image bytes
    they are uploaded to the derived bucket and registered as a
    ``FIGURE_CROP`` artifact.

    Returns:
        Number of figures processed.
    """
    processed = 0
    for fig in figures:
        lookup = select(Figure).where(
            Figure.document_id == document_id,
            Figure.figure_index == fig.figure_index,
        )
        row = db.execute(lookup).scalar_one_or_none()
        if row is None:
            row = Figure(
                document_id=document_id,
                page_id=page_id_by_number.get(fig.page_number),
                page_number=fig.page_number,
                figure_index=fig.figure_index,
            )
            db.add(row)

        row.caption = fig.caption
        if fig.caption:
            row.description = f"Figure on page {fig.page_number}. Caption: {fig.caption}"
        else:
            row.description = f"Figure detected on page {fig.page_number}."

        if fig.image_bytes:
            key = key_figure_crop(document_id, fig.page_number, fig.figure_index)
            bucket = storage.derived_bucket
            storage.put_bytes(
                bucket=bucket,
                key=key,
                data=fig.image_bytes,
                content_type=f"image/{fig.image_ext}",
            )
            row.storage_bucket = bucket
            row.storage_key = key
            _ensure_artifact(
                db, document_id, ArtifactType.FIGURE_CROP, bucket, key, fig.page_number
            )

        processed += 1
    return processed
|
||||
|
||||
|
||||
def _ensure_artifact(db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, page: int | None) -> None:
    """Add a ``DocumentArtifact`` row unless one exists for this document and key."""
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    if db.execute(lookup).scalar_one_or_none():
        return

    artifact = DocumentArtifact(
        document_id=document_id,
        artifact_type=artifact_type,
        storage_bucket=bucket,
        storage_key=key,
        page_number=page,
    )
    db.add(artifact)
|
||||
12
app/ingestion/normalizer.py
Normal file
12
app/ingestion/normalizer.py
Normal file
@@ -0,0 +1,12 @@
|
||||
"""Block-level normalization wrappers around utils.text_cleaning."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.utils.text_cleaning import clean_ocr_text, normalize_for_search
|
||||
|
||||
|
||||
def normalize_block(text: str) -> tuple[str, str]:
    """Return ``(display_text, normalized_text)`` for one block.

    The display text is ``clean_ocr_text(text)``; the normalized text is
    ``normalize_for_search`` applied to that display text.
    """
    cleaned = clean_ocr_text(text)
    return cleaned, normalize_for_search(cleaned)
|
||||
87
app/ingestion/ocr.py
Normal file
87
app/ingestion/ocr.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""OCRmyPDF integration with Tesseract.
|
||||
|
||||
We treat OCR as best-effort: if the input PDF already has a text layer (or OCR is
|
||||
disabled by config), we skip OCR and use the original PDF. On failure, the
|
||||
caller is expected to mark the document ``OCR_FAILED`` and continue without it.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import ocrmypdf
|
||||
|
||||
from app.config import settings
|
||||
from app.logging_config import get_logger
|
||||
from app.utils.pdf import has_searchable_text
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
class OcrResult:
    """Outcome of ``run_ocr`` for one PDF."""

    # Path of the produced PDF (OCR output, or a copy of the input when skipped).
    output_path: Path
    # True when OCR was not performed and the input was passed through.
    skipped: bool
    # Short reason code, e.g. "ocr_completed", "ocr_disabled", "already_searchable".
    reason: str
    # Language string that was (or would have been) handed to OCRmyPDF.
    languages: str
|
||||
|
||||
|
||||
def run_ocr(input_pdf: Path, output_pdf: Path, languages: str | None = None) -> OcrResult:
    """OCR ``input_pdf`` into ``output_pdf`` with OCRmyPDF, or pass it through.

    The input is copied to ``output_pdf`` and reported as skipped when:

    - ``settings.ocr_enabled`` is false (``"ocr_disabled"``);
    - ``has_searchable_text`` reports a text layer (``"already_searchable"``);
    - OCRmyPDF raises ``PriorOcrFoundError`` (``"prior_ocr_found"``);
    - OCRmyPDF raises ``DigitalSignatureError`` (``"digitally_signed"``).

    ``EncryptedPdfError`` and ``MissingDependencyError`` are logged and
    re-raised. Any other exception propagates untouched; the caller handles
    the status update.

    Args:
        input_pdf: Source PDF.
        output_pdf: Destination PDF; its parent directory is created.
        languages: Language string; defaults to ``settings.ocr_languages``.
    """
    langs = languages or settings.ocr_languages

    if not settings.ocr_enabled:
        return _skip(input_pdf, output_pdf, langs, "ocr_disabled")
    if has_searchable_text(input_pdf):
        return _skip(input_pdf, output_pdf, langs, "already_searchable")

    output_pdf.parent.mkdir(parents=True, exist_ok=True)
    source = str(input_pdf)
    destination = str(output_pdf)
    logger.info("ocr.start", input=source, output=destination, languages=langs)

    try:
        ocrmypdf.ocr(
            input_file=source,
            output_file=destination,
            language=langs,
            skip_text=False,
            redo_ocr=False,
            force_ocr=False,
            deskew=settings.ocr_deskew,
            clean=settings.ocr_clean,
            optimize=settings.ocr_optimize,
            progress_bar=False,
            jobs=1,
            output_type="pdf",
            # OCRmyPDF's ``skip_big``: pages above this many megapixels are not OCRed.
            skip_big=200.0,
        )
    except ocrmypdf.exceptions.PriorOcrFoundError:
        logger.info("ocr.skip.prior_ocr", input=source)
        return _skip(input_pdf, output_pdf, langs, "prior_ocr_found")
    except ocrmypdf.exceptions.DigitalSignatureError:
        logger.warning("ocr.skip.signed_pdf", input=source)
        return _skip(input_pdf, output_pdf, langs, "digitally_signed")
    except ocrmypdf.exceptions.EncryptedPdfError as exc:
        logger.warning("ocr.encrypted", input=source, error=str(exc))
        raise
    except ocrmypdf.exceptions.MissingDependencyError as exc:
        logger.error("ocr.missing_dependency", error=str(exc))
        raise

    logger.info("ocr.done", output=destination)
    return OcrResult(
        output_path=output_pdf,
        skipped=False,
        reason="ocr_completed",
        languages=langs,
    )
|
||||
|
||||
|
||||
def _skip(input_pdf: Path, output_pdf: Path, langs: str, reason: str) -> OcrResult:
    """Publish the input PDF as the OCR output without running OCR.

    Args:
        input_pdf: PDF that would otherwise have been OCR'd.
        output_pdf: Destination path; missing parent directories are created.
        langs: Language string recorded on the result.
        reason: Short code explaining why OCR was skipped.

    Returns:
        An ``OcrResult`` with ``skipped=True`` whose ``output_path`` is ``output_pdf``.
    """
    import shutil  # function-scope import: only this helper needs it

    output_pdf.parent.mkdir(parents=True, exist_ok=True)
    if not output_pdf.exists() or output_pdf.resolve() != input_pdf.resolve():
        try:
            # Stream the copy instead of loading the whole PDF into memory.
            shutil.copyfile(input_pdf, output_pdf)
        except shutil.SameFileError:
            # Both paths already refer to the same file: nothing to copy.
            pass
    return OcrResult(output_path=output_pdf, skipped=True, reason=reason, languages=langs)
|
||||
384
app/ingestion/pipeline.py
Normal file
384
app/ingestion/pipeline.py
Normal file
@@ -0,0 +1,384 @@
|
||||
"""Per-document end-to-end pipeline: OCR -> Docling -> chunk -> persist -> index.
|
||||
|
||||
Called by the Celery worker. Idempotent: re-running on the same document deletes
|
||||
existing chunks for that document and re-creates them, then re-indexes in
|
||||
OpenSearch and Qdrant.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy import delete, select
|
||||
|
||||
from app.config import settings
|
||||
from app.db.models import (
|
||||
ArtifactType,
|
||||
Chunk,
|
||||
Document,
|
||||
DocumentArtifact,
|
||||
DocumentStatus,
|
||||
Page,
|
||||
ProcessingEvent,
|
||||
)
|
||||
from app.db.session import session_scope
|
||||
from app.indexing import opensearch_client, qdrant_client
|
||||
from app.indexing.embeddings import get_embedder
|
||||
from app.ingestion.chunker import ChunkRecord, chunk_extraction
|
||||
from app.ingestion.docling_extractor import ExtractionResult, extract
|
||||
from app.ingestion.figure_processor import persist_figures
|
||||
from app.ingestion.ocr import run_ocr
|
||||
from app.ingestion.table_processor import persist_tables
|
||||
from app.logging_config import get_logger
|
||||
from app.storage.local_paths import (
|
||||
key_docling_json,
|
||||
key_markdown,
|
||||
key_ocr_pdf,
|
||||
work_dir_for,
|
||||
)
|
||||
from app.storage.minio_client import get_storage
|
||||
from app.utils.language import detect_language
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
def process_document_id(document_id: uuid.UUID, run_id: uuid.UUID | None = None) -> dict[str, Any]:
    """Run the full ingestion pipeline for one document (entry point for the Celery task).

    Stages: fetch the original PDF into the work dir, OCR, Docling extraction,
    persistence of pages/tables/figures/chunks, then OpenSearch + Qdrant indexing.

    A missing original PDF and errors raised by OCR, extraction, or indexing are
    recorded through ``_fail`` and reported in the return value. Exceptions raised
    by the upload and database steps between those stages are not caught here and
    propagate to the caller.

    Args:
        document_id: Primary key of the ``Document`` to process.
        run_id: Optional run identifier attached to every emitted event.

    Returns:
        ``{"status": "missing"}`` when the document row does not exist, the dict
        produced by ``_fail`` on a handled failure, or the final status plus the
        number of chunks on success.
    """
    import shutil  # function-scope import: only the local staging copy needs it

    storage = get_storage()
    storage.ensure_buckets()

    with session_scope() as db:
        doc = db.get(Document, document_id)
        if doc is None:
            logger.warning("pipeline.document_missing", document_id=str(document_id))
            return {"status": "missing"}

        source_path = Path(doc.source_path)
        sha = doc.sha256
        original_artifact = db.execute(
            select(DocumentArtifact).where(
                DocumentArtifact.document_id == doc.id,
                DocumentArtifact.artifact_type == ArtifactType.ORIGINAL_PDF,
            )
        ).scalar_one_or_none()
        # Copy the location out as plain values so the download below does not
        # depend on the ORM object still being attached to a session.
        artifact_location = (
            (original_artifact.storage_bucket, original_artifact.storage_key)
            if original_artifact
            else None
        )

    work_dir = work_dir_for(document_id)
    local_pdf = work_dir / f"{sha}.pdf"
    if not local_pdf.exists():
        if source_path.exists():
            # Stream the copy; source PDFs can be large.
            shutil.copyfile(source_path, local_pdf)
        elif artifact_location is not None:
            artifact_bucket, artifact_key = artifact_location
            storage.get_to_path(artifact_bucket, artifact_key, local_pdf)
        else:
            return _fail(
                document_id,
                run_id,
                DocumentStatus.OCR_FAILED,
                "Original PDF not available locally or in MinIO",
            )

    # ---------------- OCR ----------------
    ocr_pdf = work_dir / "ocr.pdf"
    try:
        _emit_event(document_id, run_id, DocumentStatus.OCR_STARTED, "OCR started")
        ocr_result = run_ocr(local_pdf, ocr_pdf, languages=settings.ocr_languages)
    except Exception as exc:  # noqa: BLE001
        logger.exception("pipeline.ocr_failed", document_id=str(document_id))
        return _fail(document_id, run_id, DocumentStatus.OCR_FAILED, f"OCR failed: {exc}")

    # Upload OCR PDF (even if we 'skipped' it - OCR PDF is the canonical input to Docling).
    ocr_key = key_ocr_pdf(document_id)
    storage.put_file(
        bucket=storage.derived_bucket,
        key=ocr_key,
        path=ocr_result.output_path,
        content_type="application/pdf",
    )
    with session_scope() as db:
        _ensure_artifact(db, document_id, ArtifactType.OCR_PDF, storage.derived_bucket, ocr_key)
        doc = db.get(Document, document_id)
        if doc is not None:
            doc.status = DocumentStatus.OCR_COMPLETED
        db.add(
            ProcessingEvent(
                run_id=run_id,
                document_id=document_id,
                stage=DocumentStatus.OCR_COMPLETED,
                level="INFO",
                message=f"OCR finished ({ocr_result.reason})",
                data={"skipped": ocr_result.skipped, "languages": ocr_result.languages},
            )
        )

    # ---------------- Docling ----------------
    try:
        _emit_event(document_id, run_id, DocumentStatus.EXTRACTION_STARTED, "Docling extraction started")
        extraction = extract(ocr_result.output_path)
    except Exception as exc:  # noqa: BLE001
        logger.exception("pipeline.docling_failed", document_id=str(document_id))
        return _fail(document_id, run_id, DocumentStatus.EXTRACTION_FAILED, f"Docling failed: {exc}")

    # Persist Markdown + JSON to MinIO.
    md_key = key_markdown(document_id)
    json_key = key_docling_json(document_id)
    storage.put_bytes(
        bucket=storage.derived_bucket,
        key=md_key,
        data=extraction.markdown.encode("utf-8"),
        content_type="text/markdown",
    )
    storage.put_bytes(
        bucket=storage.derived_bucket,
        key=json_key,
        data=json.dumps(extraction.json_payload, ensure_ascii=False).encode("utf-8"),
        content_type="application/json",
    )

    # ---------------- Persist pages, chunks, tables, figures ----------------
    chunk_records = chunk_extraction(extraction)
    # Language detection only looks at the text of the first three pages.
    sample_text = "\n".join(p.text for p in extraction.pages[:3] if p.text)
    lang = detect_language(sample_text)

    with session_scope() as db:
        _ensure_artifact(db, document_id, ArtifactType.MARKDOWN, storage.derived_bucket, md_key)
        _ensure_artifact(db, document_id, ArtifactType.DOCLING_JSON, storage.derived_bucket, json_key)

        doc = db.get(Document, document_id)
        if doc is None:
            return {"status": "missing"}
        doc.status = DocumentStatus.EXTRACTION_COMPLETED
        # Never overwrite a language hint that is already set.
        if lang and not doc.language_hint:
            doc.language_hint = lang

        page_id_by_number = _upsert_pages(db, document_id, extraction)
        persist_tables(db, storage, document_id, extraction.tables, page_id_by_number)
        persist_figures(db, storage, document_id, extraction.figures, page_id_by_number)

        # Replace chunks idempotently: drop all and re-insert.
        db.execute(delete(Chunk).where(Chunk.document_id == document_id))
        for cr in chunk_records:
            db.add(_to_chunk_row(document_id, page_id_by_number, cr))

        doc.status = DocumentStatus.CHUNKING_COMPLETED
        db.add(
            ProcessingEvent(
                run_id=run_id,
                document_id=document_id,
                stage=DocumentStatus.CHUNKING_COMPLETED,
                level="INFO",
                message="Chunking complete",
                data={"chunks": len(chunk_records)},
            )
        )

    # ---------------- Indexing (OpenSearch + Qdrant) ----------------
    try:
        opensearch_client.ensure_index()
        qdrant_client.ensure_collection()
        # Remove this document's previous entries so a re-run does not duplicate them.
        opensearch_client.delete_by_document(str(document_id))
        qdrant_client.delete_by_document(str(document_id))

        os_docs, qdrant_points = _build_index_payloads(document_id, chunk_records, extraction, lang)
        if os_docs:
            opensearch_client.index_chunks(os_docs)
        if qdrant_points:
            embedder = get_embedder()
            texts_to_embed = [text for _, text, _ in qdrant_points]
            vectors = embedder.encode(texts_to_embed)
            # strict=True: a vector-count mismatch raises and is handled as an indexing failure.
            triples = [
                (chunk_id, vec, payload)
                for (chunk_id, _text, payload), vec in zip(qdrant_points, vectors, strict=True)
            ]
            qdrant_client.upsert_chunks(triples)
    except Exception as exc:  # noqa: BLE001
        logger.exception("pipeline.indexing_failed", document_id=str(document_id))
        return _fail(document_id, run_id, DocumentStatus.FAILED, f"Indexing failed: {exc}")

    with session_scope() as db:
        doc = db.get(Document, document_id)
        if doc is not None:
            doc.status = DocumentStatus.INDEXING_COMPLETED
            doc.error_message = None
        db.add(
            ProcessingEvent(
                run_id=run_id,
                document_id=document_id,
                stage=DocumentStatus.INDEXING_COMPLETED,
                level="INFO",
                message="Indexing complete",
                data={"chunks": len(chunk_records)},
            )
        )

    return {"status": DocumentStatus.INDEXING_COMPLETED, "chunks": len(chunk_records)}
|
||||
|
||||
|
||||
# ---------------- helpers ----------------
|
||||
|
||||
def _to_chunk_row(
    document_id: uuid.UUID, page_id_by_number: dict[int, uuid.UUID], cr: ChunkRecord
) -> Chunk:
    """Convert a chunker record into an unsaved ``Chunk`` row for ``document_id``.

    ``page_id`` is looked up by page number and is ``None`` when the page is not
    in ``page_id_by_number``; ``ocr_confidence`` is always stored as ``None``.
    """
    column_values = {
        "document_id": document_id,
        "page_id": page_id_by_number.get(cr.page_number),
        "page_number": cr.page_number,
        "block_id": cr.block_id,
        "chunk_index": cr.chunk_index,
        "block_type": cr.block_type,
        "text": cr.text,
        "normalized_text": cr.normalized_text,
        "token_count": cr.token_count,
        "ocr_confidence": None,
        "quality_flags": cr.quality_flags,
        "chunk_metadata": cr.metadata,
    }
    return Chunk(**column_values)
|
||||
|
||||
|
||||
def _upsert_pages(db, document_id: uuid.UUID, extraction: ExtractionResult) -> dict[int, uuid.UUID]:
    """Create or refresh one ``Page`` row per extracted page.

    Args:
        db: Open database session; new rows are flushed so their ids are available.
        document_id: Document the pages belong to.
        extraction: Extraction result whose ``pages`` are persisted.

    Returns:
        Mapping of page number to the id of the corresponding ``Page`` row.
    """
    existing = {
        p.page_number: p
        for p in db.execute(select(Page).where(Page.document_id == document_id)).scalars()
    }
    out: dict[int, uuid.UUID] = {}
    for ep in extraction.pages:
        page = existing.get(ep.page_number)
        if page is None:
            page = Page(
                document_id=document_id,
                page_number=ep.page_number,
                text=ep.text,
                ocr_confidence=ep.ocr_confidence,
                has_tables=ep.has_tables,
                has_figures=ep.has_figures,
                has_handwriting=ep.has_handwriting,
            )
            db.add(page)
            # Flush so the new row gets its id before it is read below.
            db.flush()
        else:
            page.text = ep.text
            # Refresh the confidence too, so a re-run does not leave a stale value.
            page.ocr_confidence = ep.ocr_confidence
            page.has_tables = ep.has_tables
            page.has_figures = ep.has_figures
            page.has_handwriting = ep.has_handwriting
        out[ep.page_number] = page.id
    return out
|
||||
|
||||
|
||||
def _build_index_payloads(
    document_id: uuid.UUID,
    chunks: list[ChunkRecord],
    extraction: ExtractionResult,
    language_hint: str | None,
) -> tuple[list[dict[str, Any]], list[tuple[str, str, dict[str, Any]]]]:
    """Build the OpenSearch documents and Qdrant items for a document's stored chunks.

    The payloads are built from the ``Chunk`` rows read back from the database;
    the ``chunks`` and ``extraction`` arguments are not used here.

    Returns:
        ``(os_docs, qdrant_items)`` where each Qdrant item is
        ``(chunk_id, full_text, payload)``. Both lists are empty when the
        document row does not exist.
    """
    lexical_docs: list[dict[str, Any]] = []
    vector_items: list[tuple[str, str, dict[str, Any]]] = []

    with session_scope() as db:
        doc = db.get(Document, document_id)
        if doc is None:
            return [], []
        file_name = doc.original_file_name
        origin = doc.source_path

        stored_chunks = (
            db.execute(select(Chunk).where(Chunk.document_id == document_id))
            .scalars()
            .all()
        )

        for stored in stored_chunks:
            chunk_key = str(stored.id)
            body = stored.text or ""
            created = stored.created_at or datetime.now(tz=timezone.utc)

            lexical_doc = {
                "chunk_id": chunk_key,
                "document_id": str(document_id),
                "source_path": origin,
                "original_file_name": file_name,
                "page_number": stored.page_number,
                "block_type": stored.block_type,
                "block_id": stored.block_id,
                "text": body,
                "normalized_text": stored.normalized_text,
                "ocr_confidence": stored.ocr_confidence,
                "language_hint": language_hint,
                "metadata": stored.chunk_metadata or {},
                "quality_flags": stored.quality_flags or {},
                "created_at": created.isoformat(),
            }
            lexical_docs.append(lexical_doc)

            # The vector payload carries only a 512-character preview; the full
            # text travels alongside it for embedding.
            vector_payload = {
                "document_id": str(document_id),
                "source_path": origin,
                "original_file_name": file_name,
                "page_number": stored.page_number,
                "block_type": stored.block_type,
                "block_id": stored.block_id,
                "text_preview": body[:512],
                "ocr_confidence": stored.ocr_confidence,
                "quality_flags": stored.quality_flags or {},
                "metadata": stored.chunk_metadata or {},
            }
            vector_items.append((chunk_key, body, vector_payload))

    return lexical_docs, vector_items
|
||||
|
||||
|
||||
def _ensure_artifact(db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str) -> None:
    """Record a ``DocumentArtifact`` unless one with the same document and key exists.

    The existence check matches on ``document_id`` and ``storage_key`` only.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    already_recorded = db.execute(lookup).scalar_one_or_none()
    if not already_recorded:
        artifact = DocumentArtifact(
            document_id=document_id,
            artifact_type=artifact_type,
            storage_bucket=bucket,
            storage_key=key,
        )
        db.add(artifact)
|
||||
|
||||
|
||||
def _emit_event(document_id: uuid.UUID, run_id: uuid.UUID | None, stage: str, message: str) -> None:
    """Store an INFO-level ``ProcessingEvent`` with empty data in its own session."""
    event = ProcessingEvent(
        run_id=run_id,
        document_id=document_id,
        stage=stage,
        level="INFO",
        message=message,
        data={},
    )
    with session_scope() as db:
        db.add(event)
|
||||
|
||||
|
||||
def _fail(
    document_id: uuid.UUID, run_id: uuid.UUID | None, stage: str, message: str
) -> dict[str, Any]:
    """Record a pipeline failure and return the result dict for the caller.

    Sets the document's status to ``stage`` and its error message to the first
    2000 characters of ``message`` (when the document exists), stores an
    ERROR-level event carrying the full message, and logs the failure.

    Returns:
        ``{"status": stage, "error": message}``.
    """
    failure_event = ProcessingEvent(
        run_id=run_id,
        document_id=document_id,
        stage=stage,
        level="ERROR",
        message=message,
        data={},
    )
    with session_scope() as db:
        failed_doc = db.get(Document, document_id)
        if failed_doc is not None:
            failed_doc.status = stage
            failed_doc.error_message = message[:2000]
        db.add(failure_event)

    logger.error("pipeline.failed", document_id=str(document_id), stage=stage, message=message)
    outcome: dict[str, Any] = {"status": stage, "error": message}
    return outcome
|
||||
41
app/ingestion/quality.py
Normal file
41
app/ingestion/quality.py
Normal file
@@ -0,0 +1,41 @@
|
||||
"""Quality flag computation for chunks."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from app.utils.text_cleaning import looks_garbled
|
||||
|
||||
# OCR confidence values strictly below this set the ``low_ocr_confidence`` flag.
LOW_OCR_CONFIDENCE_THRESHOLD = 0.6
# Non-empty text whose stripped length (in characters) is below this sets ``very_short_text``.
SHORT_TEXT_THRESHOLD = 24
|
||||
|
||||
|
||||
def compute_quality_flags(
    *,
    text: str,
    block_type: str,
    ocr_confidence: float | None,
    has_handwriting: bool = False,
) -> dict[str, Any]:
    """Derive the quality flags for a chunk.

    Args:
        text: Chunk text.
        block_type: Block type label of the chunk.
        ocr_confidence: OCR confidence, or ``None`` when unknown.
        has_handwriting: Whether handwriting was detected for the chunk.

    Returns:
        Dict of flags. ``needs_manual_review`` is set when OCR confidence is low,
        the text looks garbled, or handwriting is detected.
    """
    handwriting = has_handwriting or block_type == "handwriting"
    low_confidence = ocr_confidence is not None and ocr_confidence < LOW_OCR_CONFIDENCE_THRESHOLD
    # Empty text is not treated as "very short"; only non-empty text is measured.
    too_short = bool(text) and len(text.strip()) < SHORT_TEXT_THRESHOLD
    garbled = bool(looks_garbled(text))
    review = bool(low_confidence or garbled or handwriting)

    return {
        "low_ocr_confidence": low_confidence,
        "very_short_text": too_short,
        "possible_garbled_text": garbled,
        "table_detected": block_type == "table",
        "figure_detected": block_type in ("figure_caption", "figure_description"),
        "handwriting_detected": handwriting,
        "needs_manual_review": review,
    }
|
||||
184
app/ingestion/scanner.py
Normal file
184
app/ingestion/scanner.py
Normal file
@@ -0,0 +1,184 @@
|
||||
"""Folder scanner: discovers PDFs, deduplicates by SHA256, persists discovery rows.
|
||||
|
||||
The scanner does NOT trigger OCR or extraction. It only:
|
||||
- enumerates PDF files,
|
||||
- hashes each file,
|
||||
- creates / reuses a ``Document`` row,
|
||||
- uploads the original PDF to MinIO,
|
||||
- emits ``DISCOVERED`` / ``STORED_ORIGINAL`` events.
|
||||
|
||||
Heavy work (OCR, Docling, indexing) is performed by the Celery worker pipeline.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import uuid
|
||||
from collections.abc import Iterator
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from sqlalchemy import select
|
||||
|
||||
from app.db.models import (
|
||||
ArtifactType,
|
||||
Document,
|
||||
DocumentArtifact,
|
||||
DocumentStatus,
|
||||
ProcessingEvent,
|
||||
)
|
||||
from app.db.session import session_scope
|
||||
from app.logging_config import get_logger
|
||||
from app.storage.local_paths import key_original_pdf
|
||||
from app.storage.minio_client import get_storage
|
||||
from app.utils.hashing import sha256_file
|
||||
from app.utils.pdf import is_pdf
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
class DiscoveryRecord:
    """Outcome of scanning a single file path."""

    # File that was examined.
    path: Path
    # SHA-256 of the file; None when the file could not be read or hashed.
    sha256: str | None
    # Id of the matching Document row; None for invalid files and failed uploads.
    document_id: uuid.UUID | None
    # True when a document with the same hash already existed.
    duplicate: bool
    # True when the file could not be hashed or its original could not be stored.
    invalid: bool = False
|
||||
|
||||
|
||||
def iter_pdf_files(root: Path, recursive: bool = True) -> Iterator[Path]:
    """Yield the paths under ``root`` that ``is_pdf`` accepts.

    When ``root`` is a file it is yielded only if it passes ``is_pdf``. Otherwise
    the directory tree is walked (``recursive=True``) or only the direct entries
    of ``root`` are inspected (``recursive=False``).
    """
    if root.is_file():
        if is_pdf(root):
            yield root
        return

    if not recursive:
        yield from (entry for entry in root.iterdir() if is_pdf(entry))
        return

    for folder, _subdirs, file_names in os.walk(root):
        base = Path(folder)
        for file_name in file_names:
            candidate = base / file_name
            if is_pdf(candidate):
                yield candidate
|
||||
|
||||
|
||||
def discover_documents(
    root: Path, recursive: bool = True, force: bool = False
) -> Iterator[DiscoveryRecord]:
    """Scan ``root`` for PDFs and yield one ``DiscoveryRecord`` per file.

    Each file is hashed, matched against existing documents by SHA-256, and (for
    new documents, or existing ones when ``force`` is set) its original is
    uploaded to the originals bucket.

    Every record is yielded only after the database session for that file has
    been closed. A consumer that acts on ``document_id`` therefore never sees a
    row from a still-open transaction, and stopping iteration early cannot
    interrupt a session midway.

    Args:
        root: File or directory to scan.
        recursive: Whether to descend into sub-directories.
        force: Re-process files whose hash is already known instead of
            reporting them as duplicates.

    Yields:
        A record per file; ``invalid=True`` marks files that could not be read
        or whose original could not be stored.
    """
    storage = get_storage()
    storage.ensure_buckets()

    for path in iter_pdf_files(root, recursive=recursive):
        try:
            stat = path.stat()
            sha = sha256_file(path)
        except Exception as exc:  # noqa: BLE001
            logger.warning("scan.invalid_file", path=str(path), error=str(exc))
            yield DiscoveryRecord(path=path, sha256=None, document_id=None, duplicate=False, invalid=True)
            continue

        yield _discover_one(storage, path, sha, stat.st_size, force)


def _discover_one(storage, path: Path, sha: str, size: int, force: bool) -> DiscoveryRecord:
    """Register one hashed file and store its original; return after the session closes."""
    with session_scope() as db:
        existing = db.execute(
            select(Document).where(Document.sha256 == sha)
        ).scalar_one_or_none()

        if existing and not force:
            logger.debug("scan.duplicate", path=str(path), sha256=sha, document_id=str(existing.id))
            return DiscoveryRecord(path=path, sha256=sha, document_id=existing.id, duplicate=True)

        doc = existing or Document(
            id=uuid.uuid4(),
            source_path=str(path),
            original_file_name=path.name,
            sha256=sha,
            file_size_bytes=size,
            mime_type="application/pdf",
            status=DocumentStatus.DISCOVERED,
        )
        if not existing:
            db.add(doc)
            db.flush()
            db.add(
                ProcessingEvent(
                    document_id=doc.id,
                    stage=DocumentStatus.DISCOVERED,
                    level="INFO",
                    message="Document discovered",
                    data={"sha256": sha, "size": size, "path": str(path)},
                )
            )

        # Upload original (idempotent) and record artifact if missing.
        key = key_original_pdf(doc.id, sha)
        try:
            if not storage.exists(storage.originals_bucket, key):
                storage.put_file(
                    bucket=storage.originals_bucket,
                    key=key,
                    path=path,
                    content_type="application/pdf",
                    metadata={"sha256": sha, "original-name": path.name[:255]},
                )
            _ensure_artifact(
                db,
                doc.id,
                ArtifactType.ORIGINAL_PDF,
                storage.originals_bucket,
                key,
                sha,
            )
            # Only advance documents that are still in the DISCOVERED state.
            if doc.status == DocumentStatus.DISCOVERED:
                doc.status = DocumentStatus.STORED_ORIGINAL
                db.add(
                    ProcessingEvent(
                        document_id=doc.id,
                        stage=DocumentStatus.STORED_ORIGINAL,
                        level="INFO",
                        message="Original stored to MinIO",
                        data={"bucket": storage.originals_bucket, "key": key},
                    )
                )
        except Exception as exc:  # noqa: BLE001
            logger.error("scan.store_failed", path=str(path), error=str(exc))
            doc.status = DocumentStatus.FAILED
            doc.error_message = f"store_original: {exc}"
            db.add(
                ProcessingEvent(
                    document_id=doc.id,
                    stage="STORE_FAILED",
                    level="ERROR",
                    message=str(exc),
                    data={"path": str(path)},
                )
            )
            return DiscoveryRecord(path=path, sha256=sha, document_id=None, duplicate=False, invalid=True)

        return DiscoveryRecord(
            path=path, sha256=sha, document_id=doc.id, duplicate=bool(existing)
        )
|
||||
|
||||
|
||||
def _ensure_artifact(
    db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, checksum: str | None
) -> None:
    """Record a ``DocumentArtifact`` with its checksum unless an equal one exists.

    The existence check matches on document id, artifact type, and storage key.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.artifact_type == artifact_type,
        DocumentArtifact.storage_key == key,
    )
    already_recorded = db.execute(lookup).scalar_one_or_none()
    if not already_recorded:
        artifact = DocumentArtifact(
            document_id=document_id,
            artifact_type=artifact_type,
            storage_bucket=bucket,
            storage_key=key,
            checksum=checksum,
        )
        db.add(artifact)
|
||||
84
app/ingestion/table_processor.py
Normal file
84
app/ingestion/table_processor.py
Normal file
@@ -0,0 +1,84 @@
|
||||
"""Persists Docling tables to PostgreSQL + MinIO."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import uuid
|
||||
|
||||
from sqlalchemy import select
|
||||
|
||||
from app.db.models import ArtifactType, DocumentArtifact, Table
|
||||
from app.ingestion.docling_extractor import ExtractedTable
|
||||
from app.logging_config import get_logger
|
||||
from app.storage.local_paths import key_table_json
|
||||
from app.storage.minio_client import MinioStorage
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
def persist_tables(
    db,
    storage: MinioStorage,
    document_id: uuid.UUID,
    tables: list[ExtractedTable],
    page_id_by_number: dict[int, uuid.UUID],
) -> int:
    """Upsert one ``Table`` row per extracted table and upload its JSON when present.

    Rows are matched by ``(document_id, table_index)``. Existing rows keep their
    page reference and only have their content fields overwritten.

    Returns:
        Number of tables processed.
    """
    persisted = 0
    for extracted in tables:
        lookup = select(Table).where(
            Table.document_id == document_id, Table.table_index == extracted.table_index
        )
        row = db.execute(lookup).scalar_one_or_none()
        if row is None:
            row = Table(
                document_id=document_id,
                page_id=page_id_by_number.get(extracted.page_number),
                page_number=extracted.page_number,
                table_index=extracted.table_index,
            )
            db.add(row)

        row.markdown = extracted.markdown or ""
        row.csv_text = extracted.csv_text
        row.json_data = extracted.json_data
        row.summary = _summary(extracted)
        db.flush()

        # Persist json blob to MinIO for large/inspectable copies.
        if extracted.json_data:
            blob_key = key_table_json(document_id, extracted.table_index)
            blob = json.dumps(extracted.json_data, ensure_ascii=False).encode("utf-8")
            storage.put_bytes(
                bucket=storage.derived_bucket,
                key=blob_key,
                data=blob,
                content_type="application/json",
            )
            _ensure_artifact(
                db,
                document_id,
                ArtifactType.TABLE_JSON,
                storage.derived_bucket,
                blob_key,
                extracted.page_number,
            )

        persisted += 1
    return persisted
|
||||
|
||||
|
||||
def _summary(t: ExtractedTable) -> str:
    """Return a one-line description of a table: index, page, and data-row count.

    The row count is the number of markdown lines starting with ``|`` minus two
    (header and separator lines), never below zero.
    """
    pipe_lines = [line for line in (t.markdown or "").splitlines() if line.startswith("|")]
    n_rows = len(pipe_lines) - 2
    if n_rows < 0:
        n_rows = 0
    return f"Table {t.table_index} on page {t.page_number} ({n_rows} rows)."
|
||||
|
||||
|
||||
def _ensure_artifact(db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, page: int | None) -> None:
    """Record a page-tagged ``DocumentArtifact`` unless the document already has this key.

    The existence check matches on ``document_id`` and ``storage_key`` only.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    already_recorded = db.execute(lookup).scalar_one_or_none()
    if not already_recorded:
        artifact = DocumentArtifact(
            document_id=document_id,
            artifact_type=artifact_type,
            storage_bucket=bucket,
            storage_key=key,
            page_number=page,
        )
        db.add(artifact)
|
||||
61
app/logging_config.py
Normal file
61
app/logging_config.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""Structured logging via structlog with stdlib bridge.
|
||||
|
||||
All modules use ``get_logger(__name__)`` and emit key/value pairs.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sys
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
|
||||
from app.config import settings
|
||||
|
||||
|
||||
def configure_logging() -> None:
    """Route structlog and stdlib logging through one JSON handler on stdout.

    The root level comes from ``settings.app_log_level`` (INFO when the name is
    not a ``logging`` attribute). Existing root handlers are replaced, and a few
    chatty third-party loggers are raised to WARNING.
    """
    root_level = getattr(logging, settings.app_log_level.upper(), logging.INFO)

    # Processors applied both to structlog events and to foreign stdlib records.
    shared_processors: list[Any] = [
        structlog.contextvars.merge_contextvars,
        structlog.stdlib.add_log_level,
        structlog.stdlib.add_logger_name,
        structlog.processors.TimeStamper(fmt="iso", utc=True),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
    ]

    structlog.configure(
        processors=[*shared_processors, structlog.stdlib.ProcessorFormatter.wrap_for_formatter],
        logger_factory=structlog.stdlib.LoggerFactory(),
        wrapper_class=structlog.stdlib.BoundLogger,
        cache_logger_on_first_use=True,
    )

    json_formatter = structlog.stdlib.ProcessorFormatter(
        foreign_pre_chain=shared_processors,
        processors=[
            structlog.stdlib.ProcessorFormatter.remove_processors_meta,
            structlog.processors.JSONRenderer(),
        ],
    )
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(json_formatter)

    root_logger = logging.getLogger()
    root_logger.handlers.clear()
    root_logger.addHandler(stdout_handler)
    root_logger.setLevel(root_level)

    # Quiet down noisy libs
    noisy_loggers = ("urllib3", "botocore", "s3transfer", "elasticsearch", "opensearch", "httpx")
    for logger_name in noisy_loggers:
        logging.getLogger(logger_name).setLevel(logging.WARNING)
|
||||
|
||||
|
||||
def get_logger(name: str | None = None) -> structlog.stdlib.BoundLogger:
    """Return the structlog logger for ``name``."""
    bound_logger = structlog.get_logger(name)
    return bound_logger
|
||||
52
app/main.py
Normal file
52
app/main.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""FastAPI entrypoint."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import AsyncIterator
|
||||
|
||||
from fastapi import FastAPI
|
||||
|
||||
from app import __version__
|
||||
from app.api import routes_health, routes_ingestion, routes_search
|
||||
from app.config import settings
|
||||
from app.logging_config import configure_logging, get_logger
|
||||
|
||||
configure_logging()
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
    """Log startup and shutdown, and try to create the MinIO buckets on startup.

    A failed bucket bootstrap is logged as a warning and does not stop startup.
    """
    logger.info("api.startup", version=__version__, prefix=settings.app_api_prefix)

    # Best-effort bootstrap of MinIO buckets - non-fatal if it fails (health will reflect).
    try:
        from app.storage.minio_client import get_storage

        storage = get_storage()
        storage.ensure_buckets()
    except Exception as bootstrap_error:  # noqa: BLE001
        logger.warning("api.startup.minio_bootstrap_failed", error=str(bootstrap_error))

    yield

    logger.info("api.shutdown")
|
||||
|
||||
|
||||
# Application instance; startup and shutdown work is delegated to ``lifespan``.
app = FastAPI(
    title="LegacyHUB",
    description="Hybrid lexical + semantic search over legacy PDF archives",
    version=__version__,
    lifespan=lifespan,
)

# Every router is mounted under the configured API prefix.
app.include_router(routes_health.router, prefix=settings.app_api_prefix)
app.include_router(routes_ingestion.router, prefix=settings.app_api_prefix)
app.include_router(routes_search.router, prefix=settings.app_api_prefix)
|
||||
|
||||
|
||||
@app.get("/")
def root() -> dict[str, str]:
    """Return service name, version, API prefix, and the docs path."""
    service_info: dict[str, str] = {}
    service_info["service"] = "LegacyHUB"
    service_info["version"] = __version__
    service_info["api"] = settings.app_api_prefix
    service_info["docs"] = "/docs"
    return service_info
|
||||
3
app/storage/__init__.py
Normal file
3
app/storage/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from app.storage.minio_client import MinioStorage, get_storage
|
||||
|
||||
__all__ = ["MinioStorage", "get_storage"]
|
||||
42
app/storage/local_paths.py
Normal file
42
app/storage/local_paths.py
Normal file
@@ -0,0 +1,42 @@
|
||||
"""Storage key conventions for MinIO and local working paths."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
from app.config import settings
|
||||
|
||||
|
||||
def work_dir_for(document_id: uuid.UUID | str) -> Path:
    """Return the document's working directory under ``settings.app_work_dir``, creating it if needed."""
    target = Path(settings.app_work_dir, str(document_id))
    target.mkdir(parents=True, exist_ok=True)
    return target
|
||||
|
||||
|
||||
def key_original_pdf(document_id: uuid.UUID | str, sha256: str) -> str:
    """Return the object key of a document's original PDF, named after its SHA-256."""
    file_name = f"{sha256}.pdf"
    return "/".join(("docs", f"{document_id}", "original", file_name))
|
||||
|
||||
|
||||
def key_ocr_pdf(document_id: uuid.UUID | str) -> str:
    """Return the object key of a document's OCR output PDF."""
    document_prefix = f"docs/{document_id}"
    return document_prefix + "/ocr/ocr.pdf"
|
||||
|
||||
|
||||
def key_docling_json(document_id: uuid.UUID | str) -> str:
    """Return the object key of a document's Docling JSON export."""
    document_prefix = f"docs/{document_id}"
    return f"{document_prefix}/docling/document.json"
|
||||
|
||||
|
||||
def key_markdown(document_id: uuid.UUID | str) -> str:
    """Return the object key of a document's Markdown export."""
    segments = ["docs", str(document_id), "docling", "document.md"]
    return "/".join(segments)
|
||||
|
||||
|
||||
def key_page_image(document_id: uuid.UUID | str, page_number: int) -> str:
    """Return the object key of a page image; the page number is zero-padded to five digits."""
    page_label = format(page_number, "05d")
    return f"docs/{document_id}/pages/p{page_label}.png"
|
||||
|
||||
|
||||
def key_figure_crop(document_id: uuid.UUID | str, page_number: int, figure_index: int) -> str:
    """Return the object key of a figure crop (page padded to five digits, figure to three)."""
    page_label = format(page_number, "05d")
    figure_label = format(figure_index, "03d")
    return f"docs/{document_id}/figures/p{page_label}_f{figure_label}.png"
|
||||
|
||||
|
||||
def key_table_json(document_id: uuid.UUID | str, table_index: int) -> str:
    """Return the object key of a table's JSON blob; the index is zero-padded to four digits."""
    table_label = format(table_index, "04d")
    return f"docs/{document_id}/tables/t{table_label}.json"
|
||||
110
app/storage/minio_client.py
Normal file
110
app/storage/minio_client.py
Normal file
@@ -0,0 +1,110 @@
|
||||
"""Thin wrapper around the MinIO Python SDK with bucket bootstrap and retries."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from minio import Minio
|
||||
from minio.error import S3Error
|
||||
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
|
||||
|
||||
from app.config import settings
|
||||
from app.logging_config import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class MinioStorage:
    """Small convenience layer over a ``Minio`` client for the two application buckets.

    Uploads (``put_file`` / ``put_bytes``) are retried up to three attempts with
    exponential backoff when the SDK raises ``S3Error``.
    """

    def __init__(self, client: Minio | None = None) -> None:
        """Use ``client`` when given; otherwise build one from ``settings``."""
        self.client = client or Minio(
            endpoint=settings.minio_endpoint,
            access_key=settings.minio_access_key,
            secret_key=settings.minio_secret_key,
            secure=settings.minio_secure,
            region=settings.minio_region,
        )
        self.originals_bucket = settings.minio_bucket_originals
        self.derived_bucket = settings.minio_bucket_derived

    def ensure_buckets(self) -> None:
        """Create the originals and derived buckets when they do not exist.

        Safe to call from several processes at once: if another process creates
        a bucket between the existence check and the create call, the resulting
        ``BucketAlreadyOwnedByYou`` error is ignored.

        Raises:
            S3Error: For any other failure reported by the server.
        """
        for bucket in (self.originals_bucket, self.derived_bucket):
            if self.client.bucket_exists(bucket):
                continue
            logger.info("minio.create_bucket", bucket=bucket)
            try:
                self.client.make_bucket(bucket)
            except S3Error as exc:
                # Lost the check-then-create race; the bucket exists and is ours.
                if exc.code != "BucketAlreadyOwnedByYou":
                    raise

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=10),
        retry=retry_if_exception_type(S3Error),
        reraise=True,
    )
    def put_file(
        self,
        bucket: str,
        key: str,
        path: Path,
        content_type: str = "application/octet-stream",
        metadata: dict[str, str] | None = None,
    ) -> None:
        """Upload the file at ``path`` to ``bucket``/``key``.

        The file is opened inside the call, so each retry attempt reads it from
        the beginning.
        """
        size = path.stat().st_size
        with path.open("rb") as f:
            self.client.put_object(
                bucket_name=bucket,
                object_name=key,
                data=f,
                length=size,
                content_type=content_type,
                metadata=metadata or {},
            )

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=10),
        retry=retry_if_exception_type(S3Error),
        reraise=True,
    )
    def put_bytes(
        self,
        bucket: str,
        key: str,
        data: bytes,
        content_type: str = "application/octet-stream",
        metadata: dict[str, str] | None = None,
    ) -> None:
        """Upload an in-memory payload to ``bucket``/``key``."""
        self.client.put_object(
            bucket_name=bucket,
            object_name=key,
            data=io.BytesIO(data),
            length=len(data),
            content_type=content_type,
            metadata=metadata or {},
        )

    def get_to_path(self, bucket: str, key: str, dest: Path) -> Path:
        """Download ``bucket``/``key`` to ``dest`` (creating parent directories) and return ``dest``."""
        dest.parent.mkdir(parents=True, exist_ok=True)
        self.client.fget_object(bucket, key, str(dest))
        return dest

    def exists(self, bucket: str, key: str) -> bool:
        """Return whether ``bucket``/``key`` exists.

        Raises:
            S3Error: For any error other than a missing object.
        """
        try:
            self.client.stat_object(bucket, key)
            return True
        except S3Error as exc:
            if exc.code in {"NoSuchKey", "NoSuchObject"}:
                return False
            raise

    def health(self) -> dict[str, Any]:
        """Probe the server by listing buckets; errors are reported in the result, never raised."""
        try:
            buckets = [b.name for b in self.client.list_buckets()]
            return {"status": "ok", "buckets": buckets}
        except Exception as exc:
            return {"status": "error", "error": str(exc)}
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_storage() -> MinioStorage:
    """Return the process-wide ``MinioStorage``, building it on first use.

    ``lru_cache`` memoises the zero-argument call, so later calls hand back
    the instance created by the first one.
    """
    storage = MinioStorage()
    return storage
|
||||
0
app/utils/__init__.py
Normal file
0
app/utils/__init__.py
Normal file
21
app/utils/hashing.py
Normal file
21
app/utils/hashing.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""Streaming SHA256 hashing utilities for large files."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
|
||||
_CHUNK = 1024 * 1024 # 1 MiB
|
||||
|
||||
|
||||
def sha256_file(path: Path | str) -> str:
|
||||
"""Compute SHA256 of a file in streaming mode (constant memory)."""
|
||||
h = hashlib.sha256()
|
||||
with open(path, "rb") as f:
|
||||
for block in iter(lambda: f.read(_CHUNK), b""):
|
||||
h.update(block)
|
||||
return h.hexdigest()
|
||||
|
||||
|
||||
def sha256_bytes(data: bytes) -> str:
    """Return the hex SHA256 digest of an in-memory byte string."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()
|
||||
24
app/utils/language.py
Normal file
24
app/utils/language.py
Normal file
@@ -0,0 +1,24 @@
|
||||
"""Language detection helper - tolerant to short / mixed text."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from langdetect import DetectorFactory, LangDetectException, detect_langs
|
||||
|
||||
# Pin langdetect's seed so repeated detections of the same text agree
# (per the langdetect docs, results are otherwise non-deterministic).
DetectorFactory.seed = 42
|
||||
|
||||
|
||||
def detect_language(text: str, min_chars: int = 40) -> str | None:
|
||||
"""Return ISO 639-1 language code or ``None`` if undetectable."""
|
||||
if not text or len(text.strip()) < min_chars:
|
||||
return None
|
||||
try:
|
||||
ranked = detect_langs(text)
|
||||
except LangDetectException:
|
||||
return None
|
||||
if not ranked:
|
||||
return None
|
||||
return ranked[0].lang
|
||||
|
||||
|
||||
def has_cyrillic(text: str) -> bool:
    """Tell whether ``text`` contains a character in the range ``Ѐ``..``ӿ``.

    Those bounds are U+0400 and U+04FF, i.e. the basic Cyrillic block.
    """
    first, last = "Ѐ", "ӿ"
    for ch in text:
        if first <= ch <= last:
            return True
    return False
|
||||
36
app/utils/pdf.py
Normal file
36
app/utils/pdf.py
Normal file
@@ -0,0 +1,36 @@
|
||||
"""PDF inspection helpers - decide whether OCR is required."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pikepdf
|
||||
from pdfminer.high_level import extract_text
|
||||
|
||||
|
||||
def page_count(path: Path | str) -> int:
    """Return the number of pages pikepdf reports for the PDF at ``path``."""
    with pikepdf.open(str(path)) as document:
        total = len(document.pages)
    return total
|
||||
|
||||
|
||||
def has_searchable_text(path: Path | str, sample_pages: int = 3, min_chars: int = 80) -> bool:
|
||||
"""Cheap check: extract text from first ``sample_pages`` and require ``min_chars``.
|
||||
|
||||
Returns False on any extraction error - safer to OCR than to skip.
|
||||
"""
|
||||
try:
|
||||
text = extract_text(str(path), maxpages=sample_pages) or ""
|
||||
except Exception:
|
||||
return False
|
||||
return len(text.strip()) >= min_chars
|
||||
|
||||
|
||||
def is_pdf(path: Path | str) -> bool:
|
||||
p = Path(path)
|
||||
if not p.is_file() or p.suffix.lower() != ".pdf":
|
||||
return False
|
||||
try:
|
||||
with open(p, "rb") as f:
|
||||
return f.read(5) == b"%PDF-"
|
||||
except OSError:
|
||||
return False
|
||||
69
app/utils/text_cleaning.py
Normal file
69
app/utils/text_cleaning.py
Normal file
@@ -0,0 +1,69 @@
|
||||
"""Conservative OCR text cleaning.
|
||||
|
||||
Goals:
|
||||
- Drop hyphenation across line breaks (``инвен-\\nтарный`` -> ``инвентарный``).
|
||||
- Collapse runs of whitespace.
|
||||
- Strip control chars.
|
||||
- Preserve all non-letter characters that may carry meaning in legacy/technical
|
||||
documents: digits, punctuation, slashes, dashes, dots, parentheses, etc.
|
||||
|
||||
We do NOT lowercase, transliterate, or strip punctuation here. ``normalize_for_search``
|
||||
produces a more aggressive form for indexing, but the original ``text`` is always
|
||||
kept untouched for citation/display.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
|
||||
# C0 control characters and DEL, excluding tab (\x09), LF (\x0A) and CR (\x0D).
_CONTROL_CHARS = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")
# Invisible characters are spelled as escapes so they cannot be silently lost
# by editors, diff viewers or copy/paste (an empty literal here would turn the
# soft-hyphen removal into a no-op).
_SOFT_HYPHEN = "\u00ad"
# Horizontal whitespace: space, tab and no-break space.
_MULTI_WS = re.compile(r"[ \t\u00a0]+")
_MULTI_NL = re.compile(r"\n{3,}")
# A hyphen/dash at end of line is only treated as hyphenation when a LETTER
# stands on both sides. ``[^\W\d_]`` is "word character that is neither digit
# nor underscore", so ranges and codes such as ``1990-\n1995`` keep their dash.
_HYPHEN_LINEBREAK = re.compile(r"([^\W\d_])[-‐‑‒–]\n([^\W\d_])")
_TRAILING_WS = re.compile(r"[ \t]+\n")


def clean_ocr_text(text: str) -> str:
    """Apply conservative, meaning-preserving cleanup to OCR output.

    Steps, in order:

    1. NFC-normalize so combining marks merge with their base character.
    2. Remove soft hyphens (U+00AD) and control characters.
    3. Strip spaces/tabs that precede a newline.
    4. Re-join words hyphenated across a line break (letters on both sides
       only).
    5. Collapse runs of spaces, tabs and no-break spaces into one space and
       runs of three or more newlines into a single blank line.

    Case, digits and punctuation are left untouched.

    Args:
        text: Raw text; ``None`` or ``""`` is accepted.

    Returns:
        The cleaned text with leading/trailing whitespace removed, or ``""``
        for empty input.
    """
    if not text:
        return ""
    # Normalize unicode (NFC) to merge combining marks.
    text = unicodedata.normalize("NFC", text)
    text = text.replace(_SOFT_HYPHEN, "")
    text = _CONTROL_CHARS.sub("", text)
    # Trailing blanks go first: OCR often emits "word- \nrest", and the
    # hyphenation pattern needs the hyphen directly before the newline.
    text = _TRAILING_WS.sub("\n", text)
    text = _HYPHEN_LINEBREAK.sub(r"\1\2", text)
    text = _MULTI_WS.sub(" ", text)
    text = _MULTI_NL.sub("\n\n", text)
    return text.strip()
|
||||
|
||||
|
||||
_PUNCT_RUN = re.compile(r"[^\w\s/\-.,№#:()\[\]]+", flags=re.UNICODE)
_WS_RUN = re.compile(r"\s+")


def normalize_for_search(text: str) -> str:
    """Produce the lowercase, lightly normalized form used for full-text indexing.

    The text is first passed through ``clean_ocr_text`` and lowercased. Every
    run of characters outside the kept set is then replaced by one space.
    The kept set is word characters, whitespace and ``/ - . , № # : ( ) [ ]``,
    all of which occur in document, serial and standard codes. Finally all
    whitespace runs, newlines included, collapse to a single space.
    """
    if not text:
        return ""
    lowered = clean_ocr_text(text).lower()
    without_symbols = _PUNCT_RUN.sub(" ", lowered)
    single_spaced = _WS_RUN.sub(" ", without_symbols)
    return single_spaced.strip()
|
||||
|
||||
|
||||
def looks_garbled(text: str, threshold: float = 0.35) -> bool:
    """Heuristically flag text dominated by unexpected symbols.

    A character is "bad" when it is neither alphanumeric, nor whitespace, nor
    one of ``.,;:!?-/()[]№#``. The text is garbled when the share of bad
    characters exceeds ``threshold``. Empty input and texts shorter than 20
    characters are never flagged.
    """
    if not text or len(text) < 20:
        return False
    tolerated = ".,;:!?-/()[]№#"
    total = len(text)
    bad = 0
    for ch in text:
        if ch.isalnum() or ch.isspace() or ch in tolerated:
            continue
        bad += 1
    return (bad / total) > threshold
|
||||
0
app/workers/__init__.py
Normal file
0
app/workers/__init__.py
Normal file
28
app/workers/celery_app.py
Normal file
28
app/workers/celery_app.py
Normal file
@@ -0,0 +1,28 @@
|
||||
"""Celery application instance."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from celery import Celery
|
||||
|
||||
from app.config import settings
|
||||
from app.logging_config import configure_logging
|
||||
|
||||
# Logging is set up as an import side effect, before the Celery app exists.
configure_logging()

# One Redis URL serves as both the broker and the result backend.
celery_app = Celery(
    "legacyhub",
    include=["app.workers.tasks"],
    backend=settings.redis_url,
    broker=settings.redis_url,
)

celery_app.conf.update(
    # Clock handling.
    enable_utc=True,
    timezone="UTC",
    # Time limits are multiples of the per-document timeout: the soft limit
    # (3x) comes before the hard limit (4x).
    task_soft_time_limit=settings.max_document_timeout_seconds * 3,
    task_time_limit=settings.max_document_timeout_seconds * 4,
    # Delivery and reliability settings (see the Celery configuration reference).
    worker_prefetch_multiplier=1,
    task_track_started=True,
    task_reject_on_worker_lost=True,
    task_acks_late=True,
)
|
||||
22
app/workers/tasks.py
Normal file
22
app/workers/tasks.py
Normal file
@@ -0,0 +1,22 @@
|
||||
"""Celery tasks - thin wrappers over pipeline functions."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
|
||||
from celery.utils.log import get_task_logger
|
||||
|
||||
from app.workers.celery_app import celery_app
|
||||
|
||||
logger = get_task_logger(__name__)
|
||||
|
||||
|
||||
@celery_app.task(name="legacyhub.process_document", bind=True, max_retries=2, default_retry_delay=30)
def process_document(self, document_id: str, run_id: str | None = None) -> dict:
    """Run the ingestion pipeline for one document inside a Celery worker.

    Args:
        document_id: Document UUID in string form.
        run_id: Optional run UUID in string form; a falsy value is passed to
            the pipeline as ``None``.

    Returns:
        Whatever ``process_document_id`` returns.

    Raises:
        ValueError: If ``document_id`` or ``run_id`` is not a valid UUID.
            This is raised immediately and never retried.
        celery.exceptions.Retry: Raised via ``self.retry`` when the pipeline
            fails, which reschedules the task (``max_retries=2``,
            ``default_retry_delay=30``).
    """
    # Imported at call time rather than module import time.
    # NOTE(review): presumably to avoid an import cycle or heavy worker start-up - confirm.
    from app.ingestion.pipeline import process_document_id

    # Parse the identifiers before entering the retry scope: a malformed UUID
    # is a permanent input error, so retrying it would only burn the retry
    # budget and delay the failure.
    doc_uuid = uuid.UUID(document_id)
    run_uuid = uuid.UUID(run_id) if run_id else None

    try:
        return process_document_id(doc_uuid, run_uuid)
    except Exception as exc:  # noqa: BLE001
        logger.exception("worker.process_failed", extra={"document_id": document_id})
        raise self.retry(exc=exc) from exc
|
||||
Reference in New Issue
Block a user