Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
172 lines
9.0 KiB
Python
172 lines
9.0 KiB
Python
"""initial schema
|
|
|
|
Revision ID: 0001_initial
|
|
Revises:
|
|
Create Date: 2026-05-10
|
|
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from collections.abc import Sequence
|
|
|
|
import sqlalchemy as sa
|
|
from alembic import op
|
|
from sqlalchemy.dialects import postgresql
|
|
|
|
revision: str = "0001_initial"
|
|
down_revision: str | None = None
|
|
branch_labels: str | Sequence[str] | None = None
|
|
depends_on: str | Sequence[str] | None = None
|
|
|
|
|
|
def upgrade() -> None:
    """Create the initial schema: eight tables and their secondary indexes.

    Tables are created in dependency order. ``documents`` comes first, then
    ``document_artifacts`` and ``pages`` (both reference ``documents.id`` with
    ``ON DELETE CASCADE``), then ``chunks``, ``tables`` and ``figures`` (which
    additionally reference ``pages.id`` with ``ON DELETE SET NULL``), and
    finally ``ingestion_runs`` and ``processing_events``.

    ``processing_events.run_id`` and ``processing_events.document_id`` are
    plain nullable UUID columns: no foreign-key constraint is declared on
    them here.
    """

    # --- Column factories -------------------------------------------------
    # Each call builds a new Column so every table gets its own instance.

    def pk_column():
        # UUID primary key; no server-side default is declared for it.
        return sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True)

    def document_ref():
        # Required link to the owning document; rows go away with the document.
        return sa.Column(
            "document_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("documents.id", ondelete="CASCADE"),
            nullable=False,
        )

    def page_ref():
        # Optional link to a page; nulled out when the page row is deleted.
        return sa.Column(
            "page_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("pages.id", ondelete="SET NULL"),
            nullable=True,
        )

    def stamped_now(column_name):
        # Timezone-aware timestamp that the server fills with now().
        return sa.Column(
            column_name,
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        )

    def json_object(column_name):
        # Required JSONB column whose server default is an empty JSON object.
        return sa.Column(
            column_name,
            postgresql.JSONB,
            nullable=False,
            server_default=sa.text("'{}'::jsonb"),
        )

    # --- documents ----------------------------------------------------------
    op.create_table(
        "documents",
        pk_column(),
        sa.Column("source_path", sa.Text, nullable=False),
        sa.Column("original_file_name", sa.Text, nullable=False),
        sa.Column("sha256", sa.String(64), nullable=False, unique=True),
        sa.Column("file_size_bytes", sa.BigInteger, nullable=False),
        sa.Column(
            "mime_type",
            sa.Text,
            nullable=False,
            server_default="application/pdf",
        ),
        sa.Column("language_hint", sa.Text, nullable=True),
        sa.Column(
            "status",
            sa.String(64),
            nullable=False,
            server_default="DISCOVERED",
        ),
        sa.Column("error_message", sa.Text, nullable=True),
        stamped_now("created_at"),
        stamped_now("updated_at"),
    )
    op.create_index("ix_documents_status", "documents", ["status"])
    # NOTE(review): sha256 is also declared unique=True above; on PostgreSQL a
    # unique constraint is normally backed by its own index, so this explicit
    # index may be redundant. Confirm before changing - downgrade() drops it
    # by name.
    op.create_index("ix_documents_sha256", "documents", ["sha256"])

    # --- document_artifacts -------------------------------------------------
    op.create_table(
        "document_artifacts",
        pk_column(),
        document_ref(),
        sa.Column("artifact_type", sa.String(64), nullable=False),
        sa.Column("storage_bucket", sa.Text, nullable=False),
        sa.Column("storage_key", sa.Text, nullable=False),
        sa.Column("page_number", sa.Integer, nullable=True),
        sa.Column("checksum", sa.String(64), nullable=True),
        stamped_now("created_at"),
    )
    op.create_index(
        "ix_artifacts_doc_type",
        "document_artifacts",
        ["document_id", "artifact_type"],
    )

    # --- pages --------------------------------------------------------------
    op.create_table(
        "pages",
        pk_column(),
        document_ref(),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("text", sa.Text, nullable=False, server_default=""),
        sa.Column("ocr_confidence", sa.Float, nullable=True),
        sa.Column(
            "has_tables",
            sa.Boolean,
            nullable=False,
            server_default=sa.false(),
        ),
        sa.Column(
            "has_figures",
            sa.Boolean,
            nullable=False,
            server_default=sa.false(),
        ),
        sa.Column(
            "has_handwriting",
            sa.Boolean,
            nullable=False,
            server_default=sa.false(),
        ),
        stamped_now("created_at"),
        # One row per (document, page number).
        sa.UniqueConstraint(
            "document_id", "page_number", name="uq_pages_doc_page"
        ),
    )

    # --- chunks -------------------------------------------------------------
    op.create_table(
        "chunks",
        pk_column(),
        document_ref(),
        page_ref(),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("block_id", sa.Text, nullable=True),
        sa.Column("chunk_index", sa.Integer, nullable=False),
        sa.Column(
            "block_type",
            sa.String(32),
            nullable=False,
            server_default="paragraph",
        ),
        sa.Column("text", sa.Text, nullable=False),
        sa.Column(
            "normalized_text", sa.Text, nullable=False, server_default=""
        ),
        sa.Column("token_count", sa.Integer, nullable=True),
        sa.Column("ocr_confidence", sa.Float, nullable=True),
        json_object("quality_flags"),
        json_object("metadata"),
        stamped_now("created_at"),
        # chunk_index is unique within a document.
        sa.UniqueConstraint(
            "document_id", "chunk_index", name="uq_chunks_doc_idx"
        ),
    )
    op.create_index(
        "ix_chunks_doc_page", "chunks", ["document_id", "page_number"]
    )
    op.create_index("ix_chunks_block_type", "chunks", ["block_type"])

    # --- tables -------------------------------------------------------------
    op.create_table(
        "tables",
        pk_column(),
        document_ref(),
        page_ref(),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("table_index", sa.Integer, nullable=False),
        sa.Column("markdown", sa.Text, nullable=False, server_default=""),
        sa.Column("csv_text", sa.Text, nullable=True),
        sa.Column("json_data", postgresql.JSONB, nullable=True),
        sa.Column("summary", sa.Text, nullable=True),
        stamped_now("created_at"),
        # table_index is unique within a document.
        sa.UniqueConstraint(
            "document_id", "table_index", name="uq_tables_doc_idx"
        ),
    )

    # --- figures ------------------------------------------------------------
    op.create_table(
        "figures",
        pk_column(),
        document_ref(),
        page_ref(),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("figure_index", sa.Integer, nullable=False),
        sa.Column("caption", sa.Text, nullable=True),
        sa.Column("description", sa.Text, nullable=True),
        sa.Column("storage_bucket", sa.Text, nullable=True),
        sa.Column("storage_key", sa.Text, nullable=True),
        stamped_now("created_at"),
        # figure_index is unique within a document.
        sa.UniqueConstraint(
            "document_id", "figure_index", name="uq_figures_doc_idx"
        ),
    )

    # --- ingestion_runs -----------------------------------------------------
    op.create_table(
        "ingestion_runs",
        pk_column(),
        stamped_now("started_at"),
        sa.Column("finished_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column(
            "status", sa.String(32), nullable=False, server_default="RUNNING"
        ),
        sa.Column("source_folder", sa.Text, nullable=False),
        sa.Column(
            "total_files", sa.Integer, nullable=False, server_default="0"
        ),
        sa.Column(
            "processed_files", sa.Integer, nullable=False, server_default="0"
        ),
        sa.Column(
            "failed_files", sa.Integer, nullable=False, server_default="0"
        ),
        json_object("metadata"),
    )

    # --- processing_events --------------------------------------------------
    op.create_table(
        "processing_events",
        pk_column(),
        # Both references are unconstrained, nullable UUIDs (no ForeignKey).
        sa.Column("run_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("document_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("stage", sa.String(64), nullable=False),
        sa.Column(
            "level", sa.String(16), nullable=False, server_default="INFO"
        ),
        sa.Column("message", sa.Text, nullable=False),
        json_object("data"),
        stamped_now("created_at"),
    )
    op.create_index("ix_events_doc", "processing_events", ["document_id"])
    op.create_index("ix_events_run", "processing_events", ["run_id"])
    op.create_index("ix_events_stage", "processing_events", ["stage"])
|
|
|
|
|
|
def downgrade() -> None:
    """Drop everything ``upgrade()`` created, in reverse creation order.

    For each table, the indexes that were created explicitly with
    ``op.create_index`` are dropped by name first, then the table itself.
    Tables that reference ``documents`` / ``pages`` are removed before the
    tables they point at.
    """
    # (table, explicitly-created indexes in the order they must be dropped)
    teardown_plan = (
        (
            "processing_events",
            ("ix_events_stage", "ix_events_run", "ix_events_doc"),
        ),
        ("ingestion_runs", ()),
        ("figures", ()),
        ("tables", ()),
        ("chunks", ("ix_chunks_block_type", "ix_chunks_doc_page")),
        ("pages", ()),
        ("document_artifacts", ("ix_artifacts_doc_type",)),
        ("documents", ("ix_documents_sha256", "ix_documents_status")),
    )

    for table_name, index_names in teardown_plan:
        for index_name in index_names:
            op.drop_index(index_name, table_name=table_name)
        op.drop_table(table_name)
|