"""SQLAlchemy ORM models for LegacyHUB.""" from __future__ import annotations import uuid from datetime import datetime from typing import Any from sqlalchemy import ( BigInteger, Boolean, DateTime, Float, ForeignKey, Index, Integer, String, Text, UniqueConstraint, func, ) from sqlalchemy.dialects.postgresql import JSONB, UUID from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship class Base(DeclarativeBase): pass # ---- Status / type literals (kept as plain strings to avoid PG enum churn) ---- class DocumentStatus: DISCOVERED = "DISCOVERED" STORED_ORIGINAL = "STORED_ORIGINAL" OCR_STARTED = "OCR_STARTED" OCR_COMPLETED = "OCR_COMPLETED" OCR_FAILED = "OCR_FAILED" EXTRACTION_STARTED = "EXTRACTION_STARTED" EXTRACTION_COMPLETED = "EXTRACTION_COMPLETED" EXTRACTION_FAILED = "EXTRACTION_FAILED" CHUNKING_COMPLETED = "CHUNKING_COMPLETED" INDEXING_COMPLETED = "INDEXING_COMPLETED" FAILED = "FAILED" class ArtifactType: ORIGINAL_PDF = "original_pdf" OCR_PDF = "ocr_pdf" DOCLING_JSON = "docling_json" MARKDOWN = "markdown" PAGE_IMAGE = "page_image" FIGURE_CROP = "figure_crop" TABLE_JSON = "table_json" class BlockType: TITLE = "title" HEADING = "heading" PARAGRAPH = "paragraph" LIST = "list" TABLE = "table" FIGURE_CAPTION = "figure_caption" FIGURE_DESCRIPTION = "figure_description" HANDWRITING = "handwriting" UNKNOWN = "unknown" # ---- Tables ---- class Document(Base): __tablename__ = "documents" id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) source_path: Mapped[str] = mapped_column(Text, nullable=False) original_file_name: Mapped[str] = mapped_column(Text, nullable=False) sha256: Mapped[str] = mapped_column(String(64), nullable=False, unique=True, index=True) file_size_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False) mime_type: Mapped[str] = mapped_column(Text, nullable=False, default="application/pdf") language_hint: Mapped[str | None] = mapped_column(Text, nullable=True) status: Mapped[str] = mapped_column( String(64), nullable=False, default=DocumentStatus.DISCOVERED, index=True ) error_message: Mapped[str | None] = mapped_column(Text, nullable=True) created_at: Mapped[datetime] = mapped_column( DateTime(timezone=True), server_default=func.now(), nullable=False ) updated_at: Mapped[datetime] = mapped_column( DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False ) artifacts: Mapped[list[DocumentArtifact]] = relationship( back_populates="document", cascade="all, delete-orphan" ) pages: Mapped[list[Page]] = relationship( back_populates="document", cascade="all, delete-orphan" ) chunks: Mapped[list[Chunk]] = relationship( back_populates="document", cascade="all, delete-orphan" ) class DocumentArtifact(Base): __tablename__ = "document_artifacts" __table_args__ = ( Index("ix_artifacts_doc_type", "document_id", "artifact_type"), ) id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) document_id: Mapped[uuid.UUID] = mapped_column( UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False ) artifact_type: Mapped[str] = mapped_column(String(64), nullable=False) storage_bucket: Mapped[str] = mapped_column(Text, nullable=False) storage_key: Mapped[str] = mapped_column(Text, nullable=False) page_number: Mapped[int | None] = mapped_column(Integer, nullable=True) checksum: Mapped[str | None] = mapped_column(String(64), nullable=True) created_at: Mapped[datetime] = mapped_column( DateTime(timezone=True), server_default=func.now(), nullable=False ) document: Mapped[Document] = relationship(back_populates="artifacts") class Page(Base): __tablename__ = "pages" __table_args__ = ( UniqueConstraint("document_id", "page_number", name="uq_pages_doc_page"), ) id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) document_id: Mapped[uuid.UUID] = mapped_column( UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False ) page_number: Mapped[int] = mapped_column(Integer, nullable=False) text: Mapped[str] = mapped_column(Text, nullable=False, default="") ocr_confidence: Mapped[float | None] = mapped_column(Float, nullable=True) has_tables: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) has_figures: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) has_handwriting: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) created_at: Mapped[datetime] = mapped_column( DateTime(timezone=True), server_default=func.now(), nullable=False ) document: Mapped[Document] = relationship(back_populates="pages") chunks: Mapped[list[Chunk]] = relationship(back_populates="page") class Chunk(Base): __tablename__ = "chunks" __table_args__ = ( UniqueConstraint("document_id", "chunk_index", name="uq_chunks_doc_idx"), Index("ix_chunks_doc_page", "document_id", "page_number"), Index("ix_chunks_block_type", "block_type"), ) id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) document_id: Mapped[uuid.UUID] = mapped_column( UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False ) page_id: Mapped[uuid.UUID | None] = mapped_column( UUID(as_uuid=True), ForeignKey("pages.id", ondelete="SET NULL"), nullable=True ) page_number: Mapped[int] = mapped_column(Integer, nullable=False) block_id: Mapped[str | None] = mapped_column(Text, nullable=True) chunk_index: Mapped[int] = mapped_column(Integer, nullable=False) block_type: Mapped[str] = mapped_column(String(32), nullable=False, default=BlockType.PARAGRAPH) text: Mapped[str] = mapped_column(Text, nullable=False) normalized_text: Mapped[str] = mapped_column(Text, nullable=False, default="") token_count: Mapped[int | None] = mapped_column(Integer, nullable=True) ocr_confidence: Mapped[float | None] = mapped_column(Float, nullable=True) quality_flags: Mapped[dict[str, Any]] = mapped_column(JSONB, nullable=False, default=dict) chunk_metadata: Mapped[dict[str, Any]] = mapped_column( "metadata", JSONB, nullable=False, default=dict ) created_at: Mapped[datetime] = mapped_column( DateTime(timezone=True), server_default=func.now(), nullable=False ) document: Mapped[Document] = relationship(back_populates="chunks") page: Mapped[Page | None] = relationship(back_populates="chunks") class Table(Base): __tablename__ = "tables" __table_args__ = ( UniqueConstraint("document_id", "table_index", name="uq_tables_doc_idx"), ) id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) document_id: Mapped[uuid.UUID] = mapped_column( UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False ) page_id: Mapped[uuid.UUID | None] = mapped_column( UUID(as_uuid=True), ForeignKey("pages.id", ondelete="SET NULL"), nullable=True ) page_number: Mapped[int] = mapped_column(Integer, nullable=False) table_index: Mapped[int] = mapped_column(Integer, nullable=False) markdown: Mapped[str] = mapped_column(Text, nullable=False, default="") csv_text: Mapped[str | None] = mapped_column(Text, nullable=True) json_data: Mapped[dict[str, Any] | None] = mapped_column(JSONB, nullable=True) summary: Mapped[str | None] = mapped_column(Text, nullable=True) created_at: Mapped[datetime] = mapped_column( DateTime(timezone=True), server_default=func.now(), nullable=False ) class Figure(Base): __tablename__ = "figures" __table_args__ = ( UniqueConstraint("document_id", "figure_index", name="uq_figures_doc_idx"), ) id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) document_id: Mapped[uuid.UUID] = mapped_column( UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False ) page_id: Mapped[uuid.UUID | None] = mapped_column( UUID(as_uuid=True), ForeignKey("pages.id", ondelete="SET NULL"), nullable=True ) page_number: Mapped[int] = mapped_column(Integer, nullable=False) figure_index: Mapped[int] = mapped_column(Integer, nullable=False) caption: Mapped[str | None] = mapped_column(Text, nullable=True) description: Mapped[str | None] = mapped_column(Text, nullable=True) storage_bucket: Mapped[str | None] = mapped_column(Text, nullable=True) storage_key: Mapped[str | None] = mapped_column(Text, nullable=True) created_at: Mapped[datetime] = mapped_column( DateTime(timezone=True), server_default=func.now(), nullable=False ) class IngestionRun(Base): __tablename__ = "ingestion_runs" id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) started_at: Mapped[datetime] = mapped_column( DateTime(timezone=True), server_default=func.now(), nullable=False ) finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) status: Mapped[str] = mapped_column(String(32), nullable=False, default="RUNNING") source_folder: Mapped[str] = mapped_column(Text, nullable=False) total_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0) processed_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0) failed_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0) run_metadata: Mapped[dict[str, Any]] = mapped_column( "metadata", JSONB, nullable=False, default=dict ) class ProcessingEvent(Base): __tablename__ = "processing_events" __table_args__ = ( Index("ix_events_doc", "document_id"), Index("ix_events_run", "run_id"), Index("ix_events_stage", "stage"), ) id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) run_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), nullable=True) document_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), nullable=True) stage: Mapped[str] = mapped_column(String(64), nullable=False) level: Mapped[str] = mapped_column(String(16), nullable=False, default="INFO") message: Mapped[str] = mapped_column(Text, nullable=False) data: Mapped[dict[str, Any]] = mapped_column(JSONB, nullable=False, default=dict) created_at: Mapped[datetime] = mapped_column( DateTime(timezone=True), server_default=func.now(), nullable=False )