chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
55
app/db/migrations/env.py
Normal file
55
app/db/migrations/env.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""Alembic environment - online & offline migrations using app config."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from logging.config import fileConfig
|
||||
|
||||
from alembic import context
|
||||
from sqlalchemy import engine_from_config, pool
|
||||
|
||||
from app.config import settings
|
||||
from app.db.models import Base
|
||||
|
||||
# Alembic Config object for the current run; gives access to the .ini values.
config = context.config

# Use the application's database URL for migrations. Options stored on the
# Config go through ConfigParser interpolation, so a literal "%" (for example
# a URL-encoded password such as "p%40ss") must be doubled; otherwise
# set_main_option() raises "invalid interpolation syntax". Reading the option
# back yields the original single-"%" URL.
config.set_main_option(
    "sqlalchemy.url",
    settings.database_url.replace("%", "%%"),
)

# Set up Python logging from the .ini file when one is in use.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

# Model metadata handed to Alembic in both offline and online mode.
target_metadata = Base.metadata
|
||||
|
||||
|
||||
def run_migrations_offline() -> None:
    """Run migrations in offline mode.

    The Alembic context is configured with the application's database URL
    instead of a live connection, with literal bind rendering and column
    type comparison enabled. The migrations then run inside
    ``context.begin_transaction()``.
    """
    offline_options = {
        "url": settings.database_url,
        "target_metadata": target_metadata,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
        "compare_type": True,
    }
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()
|
||||
|
||||
|
||||
def run_migrations_online() -> None:
    """Run migrations against a live database connection.

    An engine is built from the ini section in use, with the URL overridden
    by the application's database URL and connection pooling disabled
    (``NullPool``). The migrations run on a single connection inside
    ``context.begin_transaction()``.
    """
    # The application setting always wins over any URL found in the ini file.
    engine_options = {
        **config.get_section(config.config_ini_section, {}),
        "sqlalchemy.url": settings.database_url,
    }
    engine = engine_from_config(
        engine_options,
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with engine.connect() as connection:
        context.configure(
            connection=connection,
            target_metadata=target_metadata,
            compare_type=True,
        )
        with context.begin_transaction():
            context.run_migrations()
|
||||
|
||||
|
||||
# Module-level entry point: run the migration path matching the current mode.
if not context.is_offline_mode():
    run_migrations_online()
else:
    run_migrations_offline()
|
||||
27
app/db/migrations/script.py.mako
Normal file
27
app/db/migrations/script.py.mako
Normal file
@@ -0,0 +1,27 @@
|
||||
"""${message}

Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}

"""
from __future__ import annotations

from collections.abc import Sequence

from alembic import op
import sqlalchemy as sa
${imports if imports else ""}

## Revision identifiers read by Alembic. A merge revision has several parents,
## so down_revision can be a sequence of revision ids rather than one string.
revision: str = ${repr(up_revision)}
down_revision: str | Sequence[str] | None = ${repr(down_revision)}
branch_labels: str | Sequence[str] | None = ${repr(branch_labels)}
depends_on: str | Sequence[str] | None = ${repr(depends_on)}


## Rendered with the generated operations, or "pass" when there are none.
def upgrade() -> None:
    ${upgrades if upgrades else "pass"}


def downgrade() -> None:
    ${downgrades if downgrades else "pass"}
|
||||
171
app/db/migrations/versions/0001_initial.py
Normal file
171
app/db/migrations/versions/0001_initial.py
Normal file
@@ -0,0 +1,171 @@
|
||||
"""initial schema
|
||||
|
||||
Revision ID: 0001_initial
|
||||
Revises:
|
||||
Create Date: 2026-05-10
|
||||
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
import sqlalchemy as sa
|
||||
from alembic import op
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# Revision identifiers for this migration, as listed in the module docstring.
revision: str = "0001_initial"
|
||||
# None: this migration revises nothing, i.e. it is the first in the chain.
down_revision: str | None = None
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the initial schema.

    Tables are created so that foreign-key targets exist first:
    ``documents``, ``document_artifacts``, ``pages``, ``chunks``,
    ``tables``, ``figures``, ``ingestion_runs`` and ``processing_events``.
    Column definitions repeated across tables are produced by the local
    builder helpers below; each call returns a fresh ``Column`` object.
    """

    def uuid_pk() -> sa.Column:
        # UUID primary key used by every table.
        return sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True)

    def document_fk() -> sa.Column:
        # Required link to the owning document; rows are removed with it.
        return sa.Column(
            "document_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("documents.id", ondelete="CASCADE"),
            nullable=False,
        )

    def page_fk() -> sa.Column:
        # Optional link to a page; set to NULL when the page is deleted.
        return sa.Column(
            "page_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("pages.id", ondelete="SET NULL"),
            nullable=True,
        )

    def timestamp(name: str) -> sa.Column:
        # Timezone-aware, non-null timestamp defaulting to now() on the server.
        return sa.Column(
            name,
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        )

    def json_object(name: str) -> sa.Column:
        # Non-null JSONB column defaulting to an empty JSON object.
        return sa.Column(
            name,
            postgresql.JSONB,
            nullable=False,
            server_default=sa.text("'{}'::jsonb"),
        )

    def flag(name: str) -> sa.Column:
        # Non-null boolean defaulting to false.
        return sa.Column(name, sa.Boolean, nullable=False, server_default=sa.false())

    def counter(name: str) -> sa.Column:
        # Non-null integer defaulting to 0.
        return sa.Column(name, sa.Integer, nullable=False, server_default="0")

    # --- documents -------------------------------------------------------
    op.create_table(
        "documents",
        uuid_pk(),
        sa.Column("source_path", sa.Text, nullable=False),
        sa.Column("original_file_name", sa.Text, nullable=False),
        sa.Column("sha256", sa.String(64), nullable=False, unique=True),
        sa.Column("file_size_bytes", sa.BigInteger, nullable=False),
        sa.Column("mime_type", sa.Text, nullable=False, server_default="application/pdf"),
        sa.Column("language_hint", sa.Text, nullable=True),
        sa.Column("status", sa.String(64), nullable=False, server_default="DISCOVERED"),
        sa.Column("error_message", sa.Text, nullable=True),
        timestamp("created_at"),
        timestamp("updated_at"),
    )
    op.create_index("ix_documents_status", "documents", ["status"])
    # NOTE(review): sha256 is already declared unique=True above, so this
    # extra index looks redundant - confirm against the models before
    # removing it (downgrade() drops it by name).
    op.create_index("ix_documents_sha256", "documents", ["sha256"])

    # --- document_artifacts ----------------------------------------------
    op.create_table(
        "document_artifacts",
        uuid_pk(),
        document_fk(),
        sa.Column("artifact_type", sa.String(64), nullable=False),
        sa.Column("storage_bucket", sa.Text, nullable=False),
        sa.Column("storage_key", sa.Text, nullable=False),
        sa.Column("page_number", sa.Integer, nullable=True),
        sa.Column("checksum", sa.String(64), nullable=True),
        timestamp("created_at"),
    )
    op.create_index(
        "ix_artifacts_doc_type",
        "document_artifacts",
        ["document_id", "artifact_type"],
    )

    # --- pages -----------------------------------------------------------
    op.create_table(
        "pages",
        uuid_pk(),
        document_fk(),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("text", sa.Text, nullable=False, server_default=""),
        sa.Column("ocr_confidence", sa.Float, nullable=True),
        flag("has_tables"),
        flag("has_figures"),
        flag("has_handwriting"),
        timestamp("created_at"),
        sa.UniqueConstraint("document_id", "page_number", name="uq_pages_doc_page"),
    )

    # --- chunks ----------------------------------------------------------
    op.create_table(
        "chunks",
        uuid_pk(),
        document_fk(),
        page_fk(),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("block_id", sa.Text, nullable=True),
        sa.Column("chunk_index", sa.Integer, nullable=False),
        sa.Column("block_type", sa.String(32), nullable=False, server_default="paragraph"),
        sa.Column("text", sa.Text, nullable=False),
        sa.Column("normalized_text", sa.Text, nullable=False, server_default=""),
        sa.Column("token_count", sa.Integer, nullable=True),
        sa.Column("ocr_confidence", sa.Float, nullable=True),
        json_object("quality_flags"),
        json_object("metadata"),
        timestamp("created_at"),
        sa.UniqueConstraint("document_id", "chunk_index", name="uq_chunks_doc_idx"),
    )
    op.create_index("ix_chunks_doc_page", "chunks", ["document_id", "page_number"])
    op.create_index("ix_chunks_block_type", "chunks", ["block_type"])

    # --- tables ----------------------------------------------------------
    op.create_table(
        "tables",
        uuid_pk(),
        document_fk(),
        page_fk(),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("table_index", sa.Integer, nullable=False),
        sa.Column("markdown", sa.Text, nullable=False, server_default=""),
        sa.Column("csv_text", sa.Text, nullable=True),
        sa.Column("json_data", postgresql.JSONB, nullable=True),
        sa.Column("summary", sa.Text, nullable=True),
        timestamp("created_at"),
        sa.UniqueConstraint("document_id", "table_index", name="uq_tables_doc_idx"),
    )

    # --- figures ---------------------------------------------------------
    op.create_table(
        "figures",
        uuid_pk(),
        document_fk(),
        page_fk(),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("figure_index", sa.Integer, nullable=False),
        sa.Column("caption", sa.Text, nullable=True),
        sa.Column("description", sa.Text, nullable=True),
        sa.Column("storage_bucket", sa.Text, nullable=True),
        sa.Column("storage_key", sa.Text, nullable=True),
        timestamp("created_at"),
        sa.UniqueConstraint("document_id", "figure_index", name="uq_figures_doc_idx"),
    )

    # --- ingestion_runs --------------------------------------------------
    op.create_table(
        "ingestion_runs",
        uuid_pk(),
        timestamp("started_at"),
        sa.Column("finished_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column("status", sa.String(32), nullable=False, server_default="RUNNING"),
        sa.Column("source_folder", sa.Text, nullable=False),
        counter("total_files"),
        counter("processed_files"),
        counter("failed_files"),
        json_object("metadata"),
    )

    # --- processing_events -----------------------------------------------
    # run_id and document_id are plain UUID columns here (no foreign keys).
    op.create_table(
        "processing_events",
        uuid_pk(),
        sa.Column("run_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("document_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("stage", sa.String(64), nullable=False),
        sa.Column("level", sa.String(16), nullable=False, server_default="INFO"),
        sa.Column("message", sa.Text, nullable=False),
        json_object("data"),
        timestamp("created_at"),
    )
    op.create_index("ix_events_doc", "processing_events", ["document_id"])
    op.create_index("ix_events_run", "processing_events", ["run_id"])
    op.create_index("ix_events_stage", "processing_events", ["stage"])
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop every index and table created by ``upgrade()``.

    Objects are removed in the reverse order of their creation, and each
    table's explicitly created indexes are dropped before the table itself.
    """
    # (table name, indexes to drop first), listed in drop order.
    teardown_plan: tuple[tuple[str, tuple[str, ...]], ...] = (
        ("processing_events", ("ix_events_stage", "ix_events_run", "ix_events_doc")),
        ("ingestion_runs", ()),
        ("figures", ()),
        ("tables", ()),
        ("chunks", ("ix_chunks_block_type", "ix_chunks_doc_page")),
        ("pages", ()),
        ("document_artifacts", ("ix_artifacts_doc_type",)),
        ("documents", ("ix_documents_sha256", "ix_documents_status")),
    )

    for table, index_names in teardown_plan:
        for index_name in index_names:
            op.drop_index(index_name, table_name=table)
        op.drop_table(table)
|
||||
Reference in New Issue
Block a user