"""initial schema

Revision ID: 0001_initial
Revises:
Create Date: 2026-05-10

"""

from __future__ import annotations

from collections.abc import Sequence

import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql

revision: str = "0001_initial"
down_revision: str | None = None
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None


# --- column factories -------------------------------------------------------
# Each call builds a fresh Column object, so the same definition can be reused
# across several tables without sharing a Column instance between them.


def _uuid_pk() -> sa.Column:
    """Return the ``id`` UUID primary-key column used by every table."""
    return sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True)


def _document_fk() -> sa.Column:
    """Return a required ``document_id`` FK to ``documents.id`` (ON DELETE CASCADE)."""
    return sa.Column(
        "document_id",
        postgresql.UUID(as_uuid=True),
        sa.ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )


def _page_fk() -> sa.Column:
    """Return an optional ``page_id`` FK to ``pages.id`` (ON DELETE SET NULL)."""
    return sa.Column(
        "page_id",
        postgresql.UUID(as_uuid=True),
        sa.ForeignKey("pages.id", ondelete="SET NULL"),
        nullable=True,
    )


def _timestamp(name: str) -> sa.Column:
    """Return a required timezone-aware timestamp column defaulting to ``now()``."""
    return sa.Column(
        name,
        sa.DateTime(timezone=True),
        server_default=sa.func.now(),
        nullable=False,
    )


def _jsonb_object(name: str) -> sa.Column:
    """Return a required JSONB column whose server default is ``'{}'::jsonb``."""
    return sa.Column(
        name,
        postgresql.JSONB,
        nullable=False,
        server_default=sa.text("'{}'::jsonb"),
    )


def upgrade() -> None:
    """Create all tables and indexes of the initial schema.

    ``documents`` is created first, then ``pages``, because later tables carry
    foreign keys to ``documents.id`` and ``pages.id``.
    """
    # documents
    op.create_table(
        "documents",
        _uuid_pk(),
        sa.Column("source_path", sa.Text, nullable=False),
        sa.Column("original_file_name", sa.Text, nullable=False),
        sa.Column("sha256", sa.String(64), nullable=False, unique=True),
        sa.Column("file_size_bytes", sa.BigInteger, nullable=False),
        sa.Column(
            "mime_type",
            sa.Text,
            nullable=False,
            server_default="application/pdf",
        ),
        sa.Column("language_hint", sa.Text, nullable=True),
        sa.Column(
            "status",
            sa.String(64),
            nullable=False,
            server_default="DISCOVERED",
        ),
        sa.Column("error_message", sa.Text, nullable=True),
        _timestamp("created_at"),
        _timestamp("updated_at"),
    )
    op.create_index("ix_documents_status", "documents", ["status"])
    # NOTE(review): sha256 is already declared unique=True above, which on
    # PostgreSQL is normally backed by its own index -- this explicit index
    # looks redundant. Left in place; confirm before removing.
    op.create_index("ix_documents_sha256", "documents", ["sha256"])

    # document_artifacts
    op.create_table(
        "document_artifacts",
        _uuid_pk(),
        _document_fk(),
        sa.Column("artifact_type", sa.String(64), nullable=False),
        sa.Column("storage_bucket", sa.Text, nullable=False),
        sa.Column("storage_key", sa.Text, nullable=False),
        sa.Column("page_number", sa.Integer, nullable=True),
        sa.Column("checksum", sa.String(64), nullable=True),
        _timestamp("created_at"),
    )
    op.create_index(
        "ix_artifacts_doc_type",
        "document_artifacts",
        ["document_id", "artifact_type"],
    )

    # pages
    op.create_table(
        "pages",
        _uuid_pk(),
        _document_fk(),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("text", sa.Text, nullable=False, server_default=""),
        sa.Column("ocr_confidence", sa.Float, nullable=True),
        sa.Column(
            "has_tables",
            sa.Boolean,
            nullable=False,
            server_default=sa.false(),
        ),
        sa.Column(
            "has_figures",
            sa.Boolean,
            nullable=False,
            server_default=sa.false(),
        ),
        sa.Column(
            "has_handwriting",
            sa.Boolean,
            nullable=False,
            server_default=sa.false(),
        ),
        _timestamp("created_at"),
        sa.UniqueConstraint(
            "document_id", "page_number", name="uq_pages_doc_page"
        ),
    )

    # chunks
    op.create_table(
        "chunks",
        _uuid_pk(),
        _document_fk(),
        _page_fk(),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("block_id", sa.Text, nullable=True),
        sa.Column("chunk_index", sa.Integer, nullable=False),
        sa.Column(
            "block_type",
            sa.String(32),
            nullable=False,
            server_default="paragraph",
        ),
        sa.Column("text", sa.Text, nullable=False),
        sa.Column(
            "normalized_text", sa.Text, nullable=False, server_default=""
        ),
        sa.Column("token_count", sa.Integer, nullable=True),
        sa.Column("ocr_confidence", sa.Float, nullable=True),
        _jsonb_object("quality_flags"),
        _jsonb_object("metadata"),
        _timestamp("created_at"),
        sa.UniqueConstraint(
            "document_id", "chunk_index", name="uq_chunks_doc_idx"
        ),
    )
    op.create_index(
        "ix_chunks_doc_page", "chunks", ["document_id", "page_number"]
    )
    op.create_index("ix_chunks_block_type", "chunks", ["block_type"])

    # tables
    op.create_table(
        "tables",
        _uuid_pk(),
        _document_fk(),
        _page_fk(),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("table_index", sa.Integer, nullable=False),
        sa.Column("markdown", sa.Text, nullable=False, server_default=""),
        sa.Column("csv_text", sa.Text, nullable=True),
        sa.Column("json_data", postgresql.JSONB, nullable=True),
        sa.Column("summary", sa.Text, nullable=True),
        _timestamp("created_at"),
        sa.UniqueConstraint(
            "document_id", "table_index", name="uq_tables_doc_idx"
        ),
    )

    # figures
    op.create_table(
        "figures",
        _uuid_pk(),
        _document_fk(),
        _page_fk(),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("figure_index", sa.Integer, nullable=False),
        sa.Column("caption", sa.Text, nullable=True),
        sa.Column("description", sa.Text, nullable=True),
        sa.Column("storage_bucket", sa.Text, nullable=True),
        sa.Column("storage_key", sa.Text, nullable=True),
        _timestamp("created_at"),
        sa.UniqueConstraint(
            "document_id", "figure_index", name="uq_figures_doc_idx"
        ),
    )

    # ingestion_runs
    op.create_table(
        "ingestion_runs",
        _uuid_pk(),
        _timestamp("started_at"),
        sa.Column("finished_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column(
            "status",
            sa.String(32),
            nullable=False,
            server_default="RUNNING",
        ),
        sa.Column("source_folder", sa.Text, nullable=False),
        sa.Column(
            "total_files", sa.Integer, nullable=False, server_default="0"
        ),
        sa.Column(
            "processed_files", sa.Integer, nullable=False, server_default="0"
        ),
        sa.Column(
            "failed_files", sa.Integer, nullable=False, server_default="0"
        ),
        _jsonb_object("metadata"),
    )

    # processing_events
    # run_id and document_id are plain nullable UUID columns here: no foreign
    # key constraint is declared on either of them.
    op.create_table(
        "processing_events",
        _uuid_pk(),
        sa.Column("run_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("document_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("stage", sa.String(64), nullable=False),
        sa.Column(
            "level", sa.String(16), nullable=False, server_default="INFO"
        ),
        sa.Column("message", sa.Text, nullable=False),
        _jsonb_object("data"),
        _timestamp("created_at"),
    )
    op.create_index("ix_events_doc", "processing_events", ["document_id"])
    op.create_index("ix_events_run", "processing_events", ["run_id"])
    op.create_index("ix_events_stage", "processing_events", ["stage"])


def downgrade() -> None:
    """Drop every index and table created by :func:`upgrade`.

    Objects are removed in the reverse of their creation order, so tables
    holding foreign keys go before the tables they reference.
    """
    for index_name in ("ix_events_stage", "ix_events_run", "ix_events_doc"):
        op.drop_index(index_name, table_name="processing_events")
    op.drop_table("processing_events")

    op.drop_table("ingestion_runs")
    op.drop_table("figures")
    op.drop_table("tables")

    for index_name in ("ix_chunks_block_type", "ix_chunks_doc_page"):
        op.drop_index(index_name, table_name="chunks")
    op.drop_table("chunks")

    op.drop_table("pages")

    op.drop_index("ix_artifacts_doc_type", table_name="document_artifacts")
    op.drop_table("document_artifacts")

    for index_name in ("ix_documents_sha256", "ix_documents_status"):
        op.drop_index(index_name, table_name="documents")
    op.drop_table("documents")