chore: bootstrap repository with governance docs

Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Vadim Malanov
2026-05-13 16:41:50 +03:00
commit 7f72171572
157 changed files with 11298 additions and 0 deletions

55
app/db/migrations/env.py Normal file
View File

@@ -0,0 +1,55 @@
"""Alembic environment - online & offline migrations using app config."""
from __future__ import annotations
from logging.config import fileConfig
from alembic import context
from sqlalchemy import engine_from_config, pool
from app.config import settings
from app.db.models import Base
# Alembic Config object for the current invocation; it exposes the values
# of the .ini file in use.
config = context.config

# Override the .ini "sqlalchemy.url" with the application's database URL.
# Config.set_main_option() stores the value through ConfigParser, which
# treats "%" as interpolation syntax and rejects a bare one. A URL carrying
# percent-encoded characters (for example in the password) therefore needs
# every "%" doubled; ConfigParser turns "%%" back into "%" on read.
config.set_main_option(
    "sqlalchemy.url",
    settings.database_url.replace("%", "%%"),
)

# Set up Python logging from the .ini file, when one was supplied.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

# Model metadata handed to context.configure() by both migration runners.
target_metadata = Base.metadata
def run_migrations_offline() -> None:
    """Run migrations in offline mode.

    The migration context is configured from the database URL alone; this
    function creates no engine and opens no connection. Bound parameters
    are rendered inline (``literal_binds``) and column type changes are
    included in comparisons (``compare_type``).
    """
    offline_options = {
        "url": settings.database_url,
        "target_metadata": target_metadata,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
        "compare_type": True,
    }
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()
def run_migrations_online() -> None:
    """Run migrations in online mode against a live connection.

    An engine is built from the main .ini section, with the URL replaced
    by the application's ``settings.database_url``. ``NullPool`` is used so
    no connections are pooled. The migrations then run inside a transaction
    on a single connection.
    """
    # Start from the .ini section (or an empty mapping) and force the URL
    # to the application's value.
    engine_options = {
        **config.get_section(config.config_ini_section, {}),
        "sqlalchemy.url": settings.database_url,
    }
    engine = engine_from_config(
        engine_options,
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with engine.connect() as connection:
        context.configure(
            connection=connection,
            target_metadata=target_metadata,
            compare_type=True,
        )
        with context.begin_transaction():
            context.run_migrations()
# Pick the runner that matches how Alembic was invoked, then execute it.
_run_migrations = (
    run_migrations_offline
    if context.is_offline_mode()
    else run_migrations_online
)
_run_migrations()

View File

@@ -0,0 +1,27 @@
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from __future__ import annotations
from collections.abc import Sequence
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision: str = ${repr(up_revision)}
# A merge revision has several parents, rendered here as a tuple of ids,
# so the annotation must allow a sequence as well as a single id or None.
down_revision: str | Sequence[str] | None = ${repr(down_revision)}
branch_labels: str | Sequence[str] | None = ${repr(branch_labels)}
depends_on: str | Sequence[str] | None = ${repr(depends_on)}
def upgrade() -> None:
    """Upgrade schema."""
    ${upgrades if upgrades else "pass"}
def downgrade() -> None:
    """Downgrade schema."""
    ${downgrades if downgrades else "pass"}

View File

@@ -0,0 +1,171 @@
"""initial schema
Revision ID: 0001_initial
Revises:
Create Date: 2026-05-10
"""
from __future__ import annotations
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
# Revision identifiers, used by Alembic.
revision: str = "0001_initial"
# No parent revision: the "Revises:" field of the module docstring is empty.
down_revision: str | None = None
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def _uuid_pk() -> sa.Column:
    """Return the UUID primary-key column named ``id``."""
    return sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True)


def _document_fk() -> sa.Column:
    """Return the required ``document_id`` FK; rows cascade on document delete."""
    return sa.Column(
        "document_id",
        postgresql.UUID(as_uuid=True),
        sa.ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )


def _page_fk() -> sa.Column:
    """Return the optional ``page_id`` FK; it is set to NULL on page delete."""
    return sa.Column(
        "page_id",
        postgresql.UUID(as_uuid=True),
        sa.ForeignKey("pages.id", ondelete="SET NULL"),
        nullable=True,
    )


def _timestamp_now(name: str) -> sa.Column:
    """Return a required timezone-aware timestamp defaulting to ``now()``."""
    return sa.Column(
        name,
        sa.DateTime(timezone=True),
        server_default=sa.func.now(),
        nullable=False,
    )


def _flag(name: str) -> sa.Column:
    """Return a required boolean column with a server default of false."""
    return sa.Column(name, sa.Boolean, nullable=False, server_default=sa.false())


def _counter(name: str) -> sa.Column:
    """Return a required integer column with a server default of ``0``."""
    return sa.Column(name, sa.Integer, nullable=False, server_default="0")


def _json_object(name: str) -> sa.Column:
    """Return a required JSONB column defaulting to an empty JSON object."""
    return sa.Column(
        name,
        postgresql.JSONB,
        nullable=False,
        server_default=sa.text("'{}'::jsonb"),
    )


def upgrade() -> None:
    """Create the initial tables and their secondary indexes.

    Tables are created parents first (``documents``, then ``pages``, then
    the tables that reference them) so every foreign key target exists
    when it is referenced.
    """
    # --- documents ---------------------------------------------------------
    op.create_table(
        "documents",
        _uuid_pk(),
        sa.Column("source_path", sa.Text, nullable=False),
        sa.Column("original_file_name", sa.Text, nullable=False),
        sa.Column("sha256", sa.String(64), nullable=False, unique=True),
        sa.Column("file_size_bytes", sa.BigInteger, nullable=False),
        sa.Column("mime_type", sa.Text, nullable=False, server_default="application/pdf"),
        sa.Column("language_hint", sa.Text, nullable=True),
        sa.Column("status", sa.String(64), nullable=False, server_default="DISCOVERED"),
        sa.Column("error_message", sa.Text, nullable=True),
        _timestamp_now("created_at"),
        _timestamp_now("updated_at"),
    )
    op.create_index("ix_documents_status", "documents", ["status"])
    # NOTE(review): sha256 is already declared unique=True above, so this
    # extra index on the same column looks redundant - confirm against the
    # models before removing it (downgrade() drops it by name).
    op.create_index("ix_documents_sha256", "documents", ["sha256"])

    # --- document_artifacts ------------------------------------------------
    op.create_table(
        "document_artifacts",
        _uuid_pk(),
        _document_fk(),
        sa.Column("artifact_type", sa.String(64), nullable=False),
        sa.Column("storage_bucket", sa.Text, nullable=False),
        sa.Column("storage_key", sa.Text, nullable=False),
        sa.Column("page_number", sa.Integer, nullable=True),
        sa.Column("checksum", sa.String(64), nullable=True),
        _timestamp_now("created_at"),
    )
    op.create_index(
        "ix_artifacts_doc_type",
        "document_artifacts",
        ["document_id", "artifact_type"],
    )

    # --- pages -------------------------------------------------------------
    op.create_table(
        "pages",
        _uuid_pk(),
        _document_fk(),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("text", sa.Text, nullable=False, server_default=""),
        sa.Column("ocr_confidence", sa.Float, nullable=True),
        _flag("has_tables"),
        _flag("has_figures"),
        _flag("has_handwriting"),
        _timestamp_now("created_at"),
        sa.UniqueConstraint("document_id", "page_number", name="uq_pages_doc_page"),
    )

    # --- chunks ------------------------------------------------------------
    op.create_table(
        "chunks",
        _uuid_pk(),
        _document_fk(),
        _page_fk(),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("block_id", sa.Text, nullable=True),
        sa.Column("chunk_index", sa.Integer, nullable=False),
        sa.Column("block_type", sa.String(32), nullable=False, server_default="paragraph"),
        sa.Column("text", sa.Text, nullable=False),
        sa.Column("normalized_text", sa.Text, nullable=False, server_default=""),
        sa.Column("token_count", sa.Integer, nullable=True),
        sa.Column("ocr_confidence", sa.Float, nullable=True),
        _json_object("quality_flags"),
        _json_object("metadata"),
        _timestamp_now("created_at"),
        sa.UniqueConstraint("document_id", "chunk_index", name="uq_chunks_doc_idx"),
    )
    op.create_index("ix_chunks_doc_page", "chunks", ["document_id", "page_number"])
    op.create_index("ix_chunks_block_type", "chunks", ["block_type"])

    # --- tables ------------------------------------------------------------
    op.create_table(
        "tables",
        _uuid_pk(),
        _document_fk(),
        _page_fk(),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("table_index", sa.Integer, nullable=False),
        sa.Column("markdown", sa.Text, nullable=False, server_default=""),
        sa.Column("csv_text", sa.Text, nullable=True),
        sa.Column("json_data", postgresql.JSONB, nullable=True),
        sa.Column("summary", sa.Text, nullable=True),
        _timestamp_now("created_at"),
        sa.UniqueConstraint("document_id", "table_index", name="uq_tables_doc_idx"),
    )

    # --- figures -----------------------------------------------------------
    op.create_table(
        "figures",
        _uuid_pk(),
        _document_fk(),
        _page_fk(),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("figure_index", sa.Integer, nullable=False),
        sa.Column("caption", sa.Text, nullable=True),
        sa.Column("description", sa.Text, nullable=True),
        sa.Column("storage_bucket", sa.Text, nullable=True),
        sa.Column("storage_key", sa.Text, nullable=True),
        _timestamp_now("created_at"),
        sa.UniqueConstraint("document_id", "figure_index", name="uq_figures_doc_idx"),
    )

    # --- ingestion_runs ----------------------------------------------------
    op.create_table(
        "ingestion_runs",
        _uuid_pk(),
        _timestamp_now("started_at"),
        sa.Column("finished_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column("status", sa.String(32), nullable=False, server_default="RUNNING"),
        sa.Column("source_folder", sa.Text, nullable=False),
        _counter("total_files"),
        _counter("processed_files"),
        _counter("failed_files"),
        _json_object("metadata"),
    )

    # --- processing_events -------------------------------------------------
    # run_id and document_id are plain nullable UUID columns here; no
    # foreign key constraints are declared on them.
    op.create_table(
        "processing_events",
        _uuid_pk(),
        sa.Column("run_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("document_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("stage", sa.String(64), nullable=False),
        sa.Column("level", sa.String(16), nullable=False, server_default="INFO"),
        sa.Column("message", sa.Text, nullable=False),
        _json_object("data"),
        _timestamp_now("created_at"),
    )
    op.create_index("ix_events_doc", "processing_events", ["document_id"])
    op.create_index("ix_events_run", "processing_events", ["run_id"])
    op.create_index("ix_events_stage", "processing_events", ["stage"])
def downgrade() -> None:
    """Drop every index and table created by ``upgrade()``.

    Tables are removed in the reverse of their creation order, and each
    table's explicitly created indexes are dropped just before the table.
    """
    # (table, indexes to drop first), listed in drop order.
    teardown_plan = (
        ("processing_events", ("ix_events_stage", "ix_events_run", "ix_events_doc")),
        ("ingestion_runs", ()),
        ("figures", ()),
        ("tables", ()),
        ("chunks", ("ix_chunks_block_type", "ix_chunks_doc_page")),
        ("pages", ()),
        ("document_artifacts", ("ix_artifacts_doc_type",)),
        ("documents", ("ix_documents_sha256", "ix_documents_status")),
    )
    for table, index_names in teardown_plan:
        for index_name in index_names:
            op.drop_index(index_name, table_name=table)
        op.drop_table(table)