chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
3
app/db/__init__.py
Normal file
3
app/db/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
# Re-export the declarative base so callers can write ``from app.db import Base``.
from app.db.models import Base

__all__ = [
    "Base",
]
|
||||
55
app/db/migrations/env.py
Normal file
55
app/db/migrations/env.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""Alembic environment - online & offline migrations using app config."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from logging.config import fileConfig
|
||||
|
||||
from alembic import context
|
||||
from sqlalchemy import engine_from_config, pool
|
||||
|
||||
from app.config import settings
|
||||
from app.db.models import Base
|
||||
|
||||
# Alembic Config object for the current invocation (wraps the .ini file in use).
config = context.config

# Alembic keeps its options in a ConfigParser, which treats "%" as the start of
# an interpolation token and raises ValueError on a bare one. Database URLs
# routinely contain "%" (URL-encoded credentials such as "%40"), so escape it;
# reading the option back yields the original, unescaped URL.
config.set_main_option("sqlalchemy.url", settings.database_url.replace("%", "%%"))

# Set up Python logging from the .ini file, when one was supplied.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

# Model metadata handed to Alembic in both the offline and online runners.
target_metadata = Base.metadata
|
||||
|
||||
|
||||
def run_migrations_offline() -> None:
    """Run migrations in Alembic's offline mode.

    The migration context is configured from the application's database URL
    only; no Engine or connection is created here. ``literal_binds=True`` asks
    for parameters to be rendered inline in the emitted statements.
    """
    offline_options = {
        "url": settings.database_url,
        "target_metadata": target_metadata,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
        "compare_type": True,
    }
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()
|
||||
|
||||
|
||||
def run_migrations_online() -> None:
    """Run migrations in online mode against a live database connection.

    Engine options come from the Alembic ini section, with ``sqlalchemy.url``
    always overridden by the application's configured database URL. The engine
    uses ``NullPool``, so no connections are pooled by it.
    """
    engine_options = {
        **config.get_section(config.config_ini_section, {}),
        "sqlalchemy.url": settings.database_url,
    }
    engine = engine_from_config(
        engine_options,
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with engine.connect() as conn:
        context.configure(
            connection=conn,
            target_metadata=target_metadata,
            compare_type=True,
        )
        with context.begin_transaction():
            context.run_migrations()
|
||||
|
||||
|
||||
# Entry point: Alembic executes this module, so dispatch on the requested mode.
if not context.is_offline_mode():
    run_migrations_online()
else:
    run_migrations_offline()
|
||||
27
app/db/migrations/script.py.mako
Normal file
27
app/db/migrations/script.py.mako
Normal file
@@ -0,0 +1,27 @@
|
||||
"""${message}

Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}

"""
from __future__ import annotations

from collections.abc import Sequence

from alembic import op
import sqlalchemy as sa
${imports if imports else ""}

# revision identifiers, used by Alembic.
revision: str = ${repr(up_revision)}
# A merge revision has several parents, in which case this is rendered as a tuple.
down_revision: str | Sequence[str] | None = ${repr(down_revision)}
branch_labels: str | Sequence[str] | None = ${repr(branch_labels)}
depends_on: str | Sequence[str] | None = ${repr(depends_on)}


def upgrade() -> None:
    """Upgrade schema."""
    ${upgrades if upgrades else "pass"}


def downgrade() -> None:
    """Downgrade schema."""
    ${downgrades if downgrades else "pass"}
|
||||
171
app/db/migrations/versions/0001_initial.py
Normal file
171
app/db/migrations/versions/0001_initial.py
Normal file
@@ -0,0 +1,171 @@
|
||||
"""initial schema
|
||||
|
||||
Revision ID: 0001_initial
|
||||
Revises:
|
||||
Create Date: 2026-05-10
|
||||
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
import sqlalchemy as sa
|
||||
from alembic import op
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# Revision identifiers, used by Alembic. This is the root revision: it has no
# parent, no branch labels and no dependencies.
revision: str = "0001_initial"
down_revision: str | None = None
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the initial schema.

    Tables are created parents-first so foreign keys resolve: ``documents``,
    then ``document_artifacts`` and ``pages``, then ``chunks``, ``tables`` and
    ``figures`` (which reference both ``documents`` and ``pages``), and finally
    the standalone ``ingestion_runs`` and ``processing_events`` tables.
    """
    op.create_table(
        "documents",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("source_path", sa.Text, nullable=False),
        sa.Column("original_file_name", sa.Text, nullable=False),
        # Uniqueness is enforced by the unique index created below, not by a
        # column-level UNIQUE constraint (see the note on ix_documents_sha256).
        sa.Column("sha256", sa.String(64), nullable=False),
        sa.Column("file_size_bytes", sa.BigInteger, nullable=False),
        sa.Column("mime_type", sa.Text, nullable=False, server_default="application/pdf"),
        sa.Column("language_hint", sa.Text, nullable=True),
        sa.Column("status", sa.String(64), nullable=False, server_default="DISCOVERED"),
        sa.Column("error_message", sa.Text, nullable=True),
        sa.Column(
            "created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False
        ),
        sa.Column(
            "updated_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False
        ),
    )
    op.create_index("ix_documents_status", "documents", ["status"])
    # The ORM column is declared with unique=True and index=True, which
    # SQLAlchemy renders as ONE unique index with this name. Creating a UNIQUE
    # constraint plus a separate plain index would duplicate the index and make
    # autogenerate report drift against the model.
    op.create_index("ix_documents_sha256", "documents", ["sha256"], unique=True)

    op.create_table(
        "document_artifacts",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "document_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("documents.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("artifact_type", sa.String(64), nullable=False),
        sa.Column("storage_bucket", sa.Text, nullable=False),
        sa.Column("storage_key", sa.Text, nullable=False),
        sa.Column("page_number", sa.Integer, nullable=True),
        sa.Column("checksum", sa.String(64), nullable=True),
        sa.Column(
            "created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False
        ),
    )
    op.create_index(
        "ix_artifacts_doc_type", "document_artifacts", ["document_id", "artifact_type"]
    )

    op.create_table(
        "pages",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "document_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("documents.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("text", sa.Text, nullable=False, server_default=""),
        sa.Column("ocr_confidence", sa.Float, nullable=True),
        sa.Column("has_tables", sa.Boolean, nullable=False, server_default=sa.false()),
        sa.Column("has_figures", sa.Boolean, nullable=False, server_default=sa.false()),
        sa.Column("has_handwriting", sa.Boolean, nullable=False, server_default=sa.false()),
        sa.Column(
            "created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False
        ),
        sa.UniqueConstraint("document_id", "page_number", name="uq_pages_doc_page"),
    )

    op.create_table(
        "chunks",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "document_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("documents.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column(
            "page_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("pages.id", ondelete="SET NULL"),
            nullable=True,
        ),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("block_id", sa.Text, nullable=True),
        sa.Column("chunk_index", sa.Integer, nullable=False),
        sa.Column("block_type", sa.String(32), nullable=False, server_default="paragraph"),
        sa.Column("text", sa.Text, nullable=False),
        sa.Column("normalized_text", sa.Text, nullable=False, server_default=""),
        sa.Column("token_count", sa.Integer, nullable=True),
        sa.Column("ocr_confidence", sa.Float, nullable=True),
        sa.Column(
            "quality_flags",
            postgresql.JSONB,
            nullable=False,
            server_default=sa.text("'{}'::jsonb"),
        ),
        sa.Column(
            "metadata",
            postgresql.JSONB,
            nullable=False,
            server_default=sa.text("'{}'::jsonb"),
        ),
        sa.Column(
            "created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False
        ),
        sa.UniqueConstraint("document_id", "chunk_index", name="uq_chunks_doc_idx"),
    )
    op.create_index("ix_chunks_doc_page", "chunks", ["document_id", "page_number"])
    op.create_index("ix_chunks_block_type", "chunks", ["block_type"])

    op.create_table(
        "tables",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "document_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("documents.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column(
            "page_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("pages.id", ondelete="SET NULL"),
            nullable=True,
        ),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("table_index", sa.Integer, nullable=False),
        sa.Column("markdown", sa.Text, nullable=False, server_default=""),
        sa.Column("csv_text", sa.Text, nullable=True),
        sa.Column("json_data", postgresql.JSONB, nullable=True),
        sa.Column("summary", sa.Text, nullable=True),
        sa.Column(
            "created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False
        ),
        sa.UniqueConstraint("document_id", "table_index", name="uq_tables_doc_idx"),
    )

    op.create_table(
        "figures",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "document_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("documents.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column(
            "page_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("pages.id", ondelete="SET NULL"),
            nullable=True,
        ),
        sa.Column("page_number", sa.Integer, nullable=False),
        sa.Column("figure_index", sa.Integer, nullable=False),
        sa.Column("caption", sa.Text, nullable=True),
        sa.Column("description", sa.Text, nullable=True),
        sa.Column("storage_bucket", sa.Text, nullable=True),
        sa.Column("storage_key", sa.Text, nullable=True),
        sa.Column(
            "created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False
        ),
        sa.UniqueConstraint("document_id", "figure_index", name="uq_figures_doc_idx"),
    )

    op.create_table(
        "ingestion_runs",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "started_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False
        ),
        sa.Column("finished_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column("status", sa.String(32), nullable=False, server_default="RUNNING"),
        sa.Column("source_folder", sa.Text, nullable=False),
        sa.Column("total_files", sa.Integer, nullable=False, server_default="0"),
        sa.Column("processed_files", sa.Integer, nullable=False, server_default="0"),
        sa.Column("failed_files", sa.Integer, nullable=False, server_default="0"),
        sa.Column(
            "metadata",
            postgresql.JSONB,
            nullable=False,
            server_default=sa.text("'{}'::jsonb"),
        ),
    )

    # run_id / document_id are plain UUID columns here: no foreign keys are declared.
    op.create_table(
        "processing_events",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("run_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("document_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("stage", sa.String(64), nullable=False),
        sa.Column("level", sa.String(16), nullable=False, server_default="INFO"),
        sa.Column("message", sa.Text, nullable=False),
        sa.Column(
            "data",
            postgresql.JSONB,
            nullable=False,
            server_default=sa.text("'{}'::jsonb"),
        ),
        sa.Column(
            "created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False
        ),
    )
    op.create_index("ix_events_doc", "processing_events", ["document_id"])
    op.create_index("ix_events_run", "processing_events", ["run_id"])
    op.create_index("ix_events_stage", "processing_events", ["stage"])
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop every table and index created by ``upgrade``.

    Tables are removed children-first; for each table its explicitly created
    indexes are dropped before the table itself.
    """
    # (table, indexes to drop first) in the exact order the drops must run.
    teardown_plan = (
        ("processing_events", ("ix_events_stage", "ix_events_run", "ix_events_doc")),
        ("ingestion_runs", ()),
        ("figures", ()),
        ("tables", ()),
        ("chunks", ("ix_chunks_block_type", "ix_chunks_doc_page")),
        ("pages", ()),
        ("document_artifacts", ("ix_artifacts_doc_type",)),
        ("documents", ("ix_documents_sha256", "ix_documents_status")),
    )
    for table, index_names in teardown_plan:
        for index_name in index_names:
            op.drop_index(index_name, table_name=table)
        op.drop_table(table)
|
||||
266
app/db/models.py
Normal file
266
app/db/models.py
Normal file
@@ -0,0 +1,266 @@
|
||||
"""SQLAlchemy ORM models for LegacyHUB."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy import (
|
||||
BigInteger,
|
||||
Boolean,
|
||||
DateTime,
|
||||
Float,
|
||||
ForeignKey,
|
||||
Index,
|
||||
Integer,
|
||||
String,
|
||||
Text,
|
||||
UniqueConstraint,
|
||||
func,
|
||||
)
|
||||
from sqlalchemy.dialects.postgresql import JSONB, UUID
|
||||
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
|
||||
|
||||
|
||||
class Base(DeclarativeBase):
    """Declarative base class shared by every ORM model in this module."""
|
||||
|
||||
|
||||
# ---- Status / type literals (kept as plain strings to avoid PG enum churn) ----
|
||||
|
||||
class DocumentStatus:
    """Plain-string status values for ``Document.status``.

    ``DISCOVERED`` is the default assigned to a new ``Document``.
    """

    # Intake
    DISCOVERED = "DISCOVERED"
    STORED_ORIGINAL = "STORED_ORIGINAL"

    # OCR
    OCR_STARTED = "OCR_STARTED"
    OCR_COMPLETED = "OCR_COMPLETED"
    OCR_FAILED = "OCR_FAILED"

    # Extraction
    EXTRACTION_STARTED = "EXTRACTION_STARTED"
    EXTRACTION_COMPLETED = "EXTRACTION_COMPLETED"
    EXTRACTION_FAILED = "EXTRACTION_FAILED"

    # Chunking / indexing
    CHUNKING_COMPLETED = "CHUNKING_COMPLETED"
    INDEXING_COMPLETED = "INDEXING_COMPLETED"

    # Generic failure
    FAILED = "FAILED"
|
||||
|
||||
|
||||
class ArtifactType:
    """Plain-string artifact kinds.

    NOTE(review): presumably the values stored in
    ``DocumentArtifact.artifact_type`` - confirm against the writers.
    """

    # Whole-document artifacts
    ORIGINAL_PDF = "original_pdf"
    OCR_PDF = "ocr_pdf"
    DOCLING_JSON = "docling_json"
    MARKDOWN = "markdown"

    # Finer-grained artifacts
    PAGE_IMAGE = "page_image"
    FIGURE_CROP = "figure_crop"
    TABLE_JSON = "table_json"
|
||||
|
||||
|
||||
class BlockType:
    """Plain-string block kinds for ``Chunk.block_type``.

    ``PARAGRAPH`` is the default assigned to a new ``Chunk``.
    """

    # Textual structure
    TITLE = "title"
    HEADING = "heading"
    PARAGRAPH = "paragraph"
    LIST = "list"

    # Tables and figures
    TABLE = "table"
    FIGURE_CAPTION = "figure_caption"
    FIGURE_DESCRIPTION = "figure_description"

    # Everything else
    HANDWRITING = "handwriting"
    UNKNOWN = "unknown"
|
||||
|
||||
|
||||
# ---- Tables ----
|
||||
|
||||
class Document(Base):
    """Row in ``documents``: one source file and its processing status.

    ``sha256`` is unique and indexed, ``status`` is indexed. Artifacts, pages
    and chunks are owned collections: the ORM cascades deletes to them and
    removes orphans.
    """

    __tablename__ = "documents"

    # Primary key is generated client-side.
    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
    )

    # File identity
    source_path: Mapped[str] = mapped_column(Text, nullable=False)
    original_file_name: Mapped[str] = mapped_column(Text, nullable=False)
    sha256: Mapped[str] = mapped_column(
        String(64), nullable=False, unique=True, index=True
    )
    file_size_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False)
    mime_type: Mapped[str] = mapped_column(
        Text, nullable=False, default="application/pdf"
    )
    language_hint: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Processing state
    status: Mapped[str] = mapped_column(
        String(64), nullable=False, default=DocumentStatus.DISCOVERED, index=True
    )
    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Timestamps; created_at is filled by the database, updated_at is also
    # refreshed on every ORM UPDATE via onupdate.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        onupdate=func.now(),
        nullable=False,
    )

    # Owned child collections
    artifacts: Mapped[list[DocumentArtifact]] = relationship(
        back_populates="document", cascade="all, delete-orphan"
    )
    pages: Mapped[list[Page]] = relationship(
        back_populates="document", cascade="all, delete-orphan"
    )
    chunks: Mapped[list[Chunk]] = relationship(
        back_populates="document", cascade="all, delete-orphan"
    )
|
||||
|
||||
|
||||
class DocumentArtifact(Base):
    """Row in ``document_artifacts``: a stored object belonging to a document.

    Each row records an artifact type and a bucket/key location, optionally
    with a page number and checksum. Rows are removed by the database when the
    parent document is deleted (``ON DELETE CASCADE``).
    """

    __tablename__ = "document_artifacts"
    __table_args__ = (
        Index("ix_artifacts_doc_type", "document_id", "artifact_type"),
    )

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
    )
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    artifact_type: Mapped[str] = mapped_column(String(64), nullable=False)

    # Storage location
    storage_bucket: Mapped[str] = mapped_column(Text, nullable=False)
    storage_key: Mapped[str] = mapped_column(Text, nullable=False)

    # Optional details
    page_number: Mapped[int | None] = mapped_column(Integer, nullable=True)
    checksum: Mapped[str | None] = mapped_column(String(64), nullable=True)

    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )

    document: Mapped[Document] = relationship(back_populates="artifacts")
|
||||
|
||||
|
||||
class Page(Base):
    """Row in ``pages``: one page of a document.

    ``(document_id, page_number)`` is unique. Holds the page text, an optional
    OCR confidence, and boolean flags for tables, figures and handwriting.
    """

    __tablename__ = "pages"
    __table_args__ = (
        UniqueConstraint("document_id", "page_number", name="uq_pages_doc_page"),
    )

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
    )
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)

    # Content
    text: Mapped[str] = mapped_column(Text, nullable=False, default="")
    ocr_confidence: Mapped[float | None] = mapped_column(Float, nullable=True)

    # Content flags, all False unless set
    has_tables: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    has_figures: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    has_handwriting: Mapped[bool] = mapped_column(
        Boolean, nullable=False, default=False
    )

    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )

    document: Mapped[Document] = relationship(back_populates="pages")
    chunks: Mapped[list[Chunk]] = relationship(back_populates="page")
|
||||
|
||||
|
||||
class Chunk(Base):
    """Row in ``chunks``: one text chunk of a document.

    ``(document_id, chunk_index)`` is unique. The link to ``pages`` is
    optional and is set to NULL by the database if the page row is deleted,
    while ``page_number`` is always stored on the chunk itself.
    """

    __tablename__ = "chunks"
    __table_args__ = (
        UniqueConstraint("document_id", "chunk_index", name="uq_chunks_doc_idx"),
        Index("ix_chunks_doc_page", "document_id", "page_number"),
        Index("ix_chunks_block_type", "block_type"),
    )

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
    )
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("pages.id", ondelete="SET NULL"),
        nullable=True,
    )

    # Position within the document
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    block_id: Mapped[str | None] = mapped_column(Text, nullable=True)
    chunk_index: Mapped[int] = mapped_column(Integer, nullable=False)
    block_type: Mapped[str] = mapped_column(
        String(32), nullable=False, default=BlockType.PARAGRAPH
    )

    # Content
    text: Mapped[str] = mapped_column(Text, nullable=False)
    normalized_text: Mapped[str] = mapped_column(Text, nullable=False, default="")
    token_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    ocr_confidence: Mapped[float | None] = mapped_column(Float, nullable=True)

    # JSONB payloads; a fresh dict is created per row via default=dict.
    quality_flags: Mapped[dict[str, Any]] = mapped_column(
        JSONB, nullable=False, default=dict
    )
    # The column is called "metadata"; the attribute uses another name because
    # ``metadata`` is reserved on Declarative classes.
    chunk_metadata: Mapped[dict[str, Any]] = mapped_column(
        "metadata", JSONB, nullable=False, default=dict
    )

    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )

    document: Mapped[Document] = relationship(back_populates="chunks")
    page: Mapped[Page | None] = relationship(back_populates="chunks")
|
||||
|
||||
|
||||
class Table(Base):
    """Row in ``tables``: a table extracted from a document.

    ``(document_id, table_index)`` is unique. The table content can be stored
    as markdown, CSV text and/or JSON, with an optional summary.
    """

    __tablename__ = "tables"
    __table_args__ = (
        UniqueConstraint("document_id", "table_index", name="uq_tables_doc_idx"),
    )

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
    )
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("pages.id", ondelete="SET NULL"),
        nullable=True,
    )

    # Position within the document
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    table_index: Mapped[int] = mapped_column(Integer, nullable=False)

    # Representations of the table content
    markdown: Mapped[str] = mapped_column(Text, nullable=False, default="")
    csv_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    json_data: Mapped[dict[str, Any] | None] = mapped_column(JSONB, nullable=True)
    summary: Mapped[str | None] = mapped_column(Text, nullable=True)

    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
|
||||
|
||||
|
||||
class Figure(Base):
    """Row in ``figures``: a figure found in a document.

    ``(document_id, figure_index)`` is unique. Caption, description and the
    bucket/key storage location are all optional.
    """

    __tablename__ = "figures"
    __table_args__ = (
        UniqueConstraint("document_id", "figure_index", name="uq_figures_doc_idx"),
    )

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
    )
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("pages.id", ondelete="SET NULL"),
        nullable=True,
    )

    # Position within the document
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    figure_index: Mapped[int] = mapped_column(Integer, nullable=False)

    # Descriptive text
    caption: Mapped[str | None] = mapped_column(Text, nullable=True)
    description: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Storage location
    storage_bucket: Mapped[str | None] = mapped_column(Text, nullable=True)
    storage_key: Mapped[str | None] = mapped_column(Text, nullable=True)

    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
|
||||
|
||||
|
||||
class IngestionRun(Base):
    """Row in ``ingestion_runs``: bookkeeping for one ingestion run.

    Stores start/finish timestamps, a status string (default ``"RUNNING"``),
    the source folder and total/processed/failed file counters.
    """

    __tablename__ = "ingestion_runs"

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
    )

    # Lifecycle; finished_at stays NULL until it is explicitly set.
    started_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
    finished_at: Mapped[datetime | None] = mapped_column(
        DateTime(timezone=True), nullable=True
    )
    status: Mapped[str] = mapped_column(String(32), nullable=False, default="RUNNING")

    source_folder: Mapped[str] = mapped_column(Text, nullable=False)

    # Counters, all starting at zero
    total_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    processed_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    failed_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0)

    # The column is called "metadata"; the attribute uses another name because
    # ``metadata`` is reserved on Declarative classes.
    run_metadata: Mapped[dict[str, Any]] = mapped_column(
        "metadata", JSONB, nullable=False, default=dict
    )
|
||||
|
||||
|
||||
class ProcessingEvent(Base):
    """Row in ``processing_events``: one log-style event from processing.

    ``run_id`` and ``document_id`` are optional plain UUID columns; no foreign
    keys are declared on them. Both, along with ``stage``, are indexed.
    """

    __tablename__ = "processing_events"
    __table_args__ = (
        Index("ix_events_doc", "document_id"),
        Index("ix_events_run", "run_id"),
        Index("ix_events_stage", "stage"),
    )

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
    )

    # Optional correlation ids
    run_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True), nullable=True
    )
    document_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True), nullable=True
    )

    # Event payload
    stage: Mapped[str] = mapped_column(String(64), nullable=False)
    level: Mapped[str] = mapped_column(String(16), nullable=False, default="INFO")
    message: Mapped[str] = mapped_column(Text, nullable=False)
    data: Mapped[dict[str, Any]] = mapped_column(JSONB, nullable=False, default=dict)

    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
|
||||
66
app/db/session.py
Normal file
66
app/db/session.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""SQLAlchemy engine and session factory."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Iterator
|
||||
from contextlib import contextmanager
|
||||
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.engine import Engine
|
||||
from sqlalchemy.orm import Session, sessionmaker
|
||||
|
||||
from app.config import settings
|
||||
|
||||
# Process-wide singletons, created lazily on first use.
_engine: Engine | None = None
_SessionFactory: sessionmaker[Session] | None = None


def get_engine() -> Engine:
    """Return the shared Engine, creating it on the first call.

    The engine is built from ``settings.database_url`` with pre-ping enabled,
    a pool size of 10 and up to 20 overflow connections, then cached in the
    module-level ``_engine``.
    """
    global _engine
    if _engine is not None:
        return _engine

    engine_options = {
        "pool_pre_ping": True,
        "pool_size": 10,
        "max_overflow": 20,
        "future": True,
    }
    _engine = create_engine(settings.database_url, **engine_options)
    return _engine
|
||||
|
||||
|
||||
def get_session_factory() -> sessionmaker[Session]:
    """Return the shared ``sessionmaker``, creating it on the first call.

    The factory is bound to the engine from ``get_engine()`` and produces
    sessions with autoflush off and ``expire_on_commit`` disabled.
    """
    global _SessionFactory
    if _SessionFactory is not None:
        return _SessionFactory

    session_options = {
        "autoflush": False,
        "autocommit": False,
        "expire_on_commit": False,
        "future": True,
    }
    _SessionFactory = sessionmaker(bind=get_engine(), **session_options)
    return _SessionFactory
|
||||
|
||||
|
||||
@contextmanager
def session_scope() -> Iterator[Session]:
    """Yield a session inside a commit-or-rollback scope.

    The session is committed when the ``with`` body finishes normally. If the
    body (or the commit) raises an ``Exception``, the session is rolled back
    and the exception is re-raised. The session is closed in every case.
    """
    db = get_session_factory()()
    try:
        yield db
        # Commit stays inside the try so a failing commit is rolled back too.
        db.commit()
    except Exception:
        db.rollback()
        raise
    finally:
        db.close()
|
||||
|
||||
|
||||
def get_db() -> Iterator[Session]:
    """FastAPI dependency.

    Yields a new session and closes it once the consumer is done. No commit is
    issued here; callers commit explicitly if they need to persist changes.
    """
    # Session is a context manager whose exit closes the session.
    with get_session_factory()() as db:
        yield db
|
||||
Reference in New Issue
Block a user