chore: bootstrap repository with governance docs

Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Vadim Malanov
2026-05-13 16:41:50 +03:00
commit 7f72171572
157 changed files with 11298 additions and 0 deletions

266
app/db/models.py Normal file
View File

@@ -0,0 +1,266 @@
"""SQLAlchemy ORM models for LegacyHUB."""
from __future__ import annotations
import uuid
from datetime import datetime
from typing import Any
from sqlalchemy import (
BigInteger,
Boolean,
DateTime,
Float,
ForeignKey,
Index,
Integer,
String,
Text,
UniqueConstraint,
func,
)
from sqlalchemy.dialects.postgresql import JSONB, UUID
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
class Base(DeclarativeBase):
    """Declarative base class that every ORM model in this module subclasses."""
# ---- Status / type literals (kept as plain strings to avoid PostgreSQL enum churn) ----
class DocumentStatus:
    """String constants for the ``status`` column of ``Document``.

    Plain class attributes (not an ``Enum``): each value is an ordinary
    ``str`` equal to its own attribute name.
    """

    # Intake
    DISCOVERED = "DISCOVERED"
    STORED_ORIGINAL = "STORED_ORIGINAL"

    # OCR
    OCR_STARTED = "OCR_STARTED"
    OCR_COMPLETED = "OCR_COMPLETED"
    OCR_FAILED = "OCR_FAILED"

    # Extraction
    EXTRACTION_STARTED = "EXTRACTION_STARTED"
    EXTRACTION_COMPLETED = "EXTRACTION_COMPLETED"
    EXTRACTION_FAILED = "EXTRACTION_FAILED"

    # Chunking / indexing
    CHUNKING_COMPLETED = "CHUNKING_COMPLETED"
    INDEXING_COMPLETED = "INDEXING_COMPLETED"

    # Generic failure
    FAILED = "FAILED"
class ArtifactType:
    """Lower-case string constants naming kinds of stored artifact.

    NOTE(review): presumably the values written to
    ``DocumentArtifact.artifact_type`` -- confirm against callers.
    """

    # Whole-document artifacts
    ORIGINAL_PDF = "original_pdf"
    OCR_PDF = "ocr_pdf"
    DOCLING_JSON = "docling_json"
    MARKDOWN = "markdown"

    # Finer-grained artifacts
    PAGE_IMAGE = "page_image"
    FIGURE_CROP = "figure_crop"
    TABLE_JSON = "table_json"
class BlockType:
    """Lower-case string constants for ``Chunk.block_type``.

    ``PARAGRAPH`` is the Python-side default of that column.
    """

    # Textual blocks
    TITLE = "title"
    HEADING = "heading"
    PARAGRAPH = "paragraph"
    LIST = "list"

    # Tables and figures
    TABLE = "table"
    FIGURE_CAPTION = "figure_caption"
    FIGURE_DESCRIPTION = "figure_description"

    # Everything else
    HANDWRITING = "handwriting"
    UNKNOWN = "unknown"
# ---- Tables ----
class Document(Base):
    """Row of the ``documents`` table: file identity plus processing status.

    The ``artifacts``, ``pages`` and ``chunks`` collections all cascade
    ``all, delete-orphan``, so the ORM removes those children together with
    their parent document.
    """

    __tablename__ = "documents"

    # --- identity ---
    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        default=uuid.uuid4,  # generated client-side when no id is supplied
        primary_key=True,
    )
    source_path: Mapped[str] = mapped_column(Text, nullable=False)
    original_file_name: Mapped[str] = mapped_column(Text, nullable=False)
    # 64 characters wide; declared both unique and indexed.
    sha256: Mapped[str] = mapped_column(
        String(64),
        nullable=False,
        unique=True,
        index=True,
    )
    file_size_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False)
    mime_type: Mapped[str] = mapped_column(
        Text,
        default="application/pdf",
        nullable=False,
    )
    language_hint: Mapped[str | None] = mapped_column(Text, nullable=True)

    # --- processing state ---
    status: Mapped[str] = mapped_column(
        String(64),
        default=DocumentStatus.DISCOVERED,
        index=True,
        nullable=False,
    )
    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)

    # --- timestamps (timezone-aware, filled in by the database) ---
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
        onupdate=func.now(),  # re-evaluated on every UPDATE issued by the ORM
    )

    # --- owned children ---
    artifacts: Mapped[list[DocumentArtifact]] = relationship(
        cascade="all, delete-orphan",
        back_populates="document",
    )
    pages: Mapped[list[Page]] = relationship(
        cascade="all, delete-orphan",
        back_populates="document",
    )
    chunks: Mapped[list[Chunk]] = relationship(
        cascade="all, delete-orphan",
        back_populates="document",
    )
class DocumentArtifact(Base):
    """Row of ``document_artifacts``: a bucket/key storage reference for a document.

    Rows are removed by the database when the parent document row is deleted
    (``ondelete="CASCADE"``).
    """

    __tablename__ = "document_artifacts"
    # Composite index over (document_id, artifact_type).
    __table_args__ = (Index("ix_artifacts_doc_type", "document_id", "artifact_type"),)

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        default=uuid.uuid4,
        primary_key=True,
    )
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    artifact_type: Mapped[str] = mapped_column(String(64), nullable=False)

    # Storage location.
    storage_bucket: Mapped[str] = mapped_column(Text, nullable=False)
    storage_key: Mapped[str] = mapped_column(Text, nullable=False)

    # Optional details; both columns are nullable.
    page_number: Mapped[int | None] = mapped_column(Integer, nullable=True)
    checksum: Mapped[str | None] = mapped_column(String(64), nullable=True)

    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
    )

    document: Mapped[Document] = relationship(back_populates="artifacts")
class Page(Base):
    """Row of ``pages``: text and content flags for one page of a document.

    ``(document_id, page_number)`` is unique, so a document has at most one
    row per page number.
    """

    __tablename__ = "pages"
    __table_args__ = (UniqueConstraint("document_id", "page_number", name="uq_pages_doc_page"),)

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        default=uuid.uuid4,
        primary_key=True,
    )
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)

    # Page text; defaults to the empty string rather than NULL.
    text: Mapped[str] = mapped_column(Text, default="", nullable=False)
    ocr_confidence: Mapped[float | None] = mapped_column(Float, nullable=True)

    # Content flags, all False unless set explicitly.
    has_tables: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)
    has_figures: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)
    has_handwriting: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)

    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
    )

    document: Mapped[Document] = relationship(back_populates="pages")
    # No cascade configured here, unlike the collections on Document.
    chunks: Mapped[list[Chunk]] = relationship(back_populates="page")
class Chunk(Base):
    """Row of ``chunks``: one text chunk belonging to a document.

    ``(document_id, chunk_index)`` is unique. The link to ``pages`` is
    optional and is nulled by the database if the page row is deleted
    (``ondelete="SET NULL"``); ``page_number`` is stored on the chunk itself
    and is always required.
    """

    __tablename__ = "chunks"
    __table_args__ = (
        UniqueConstraint("document_id", "chunk_index", name="uq_chunks_doc_idx"),
        # Secondary lookup indexes.
        Index("ix_chunks_doc_page", "document_id", "page_number"),
        Index("ix_chunks_block_type", "block_type"),
    )

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        default=uuid.uuid4,
        primary_key=True,
    )

    # --- parents ---
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("pages.id", ondelete="SET NULL"),
        nullable=True,
    )

    # --- position ---
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    block_id: Mapped[str | None] = mapped_column(Text, nullable=True)
    chunk_index: Mapped[int] = mapped_column(Integer, nullable=False)
    block_type: Mapped[str] = mapped_column(
        String(32),
        default=BlockType.PARAGRAPH,
        nullable=False,
    )

    # --- content ---
    text: Mapped[str] = mapped_column(Text, nullable=False)
    normalized_text: Mapped[str] = mapped_column(Text, default="", nullable=False)
    token_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    ocr_confidence: Mapped[float | None] = mapped_column(Float, nullable=True)

    # --- JSONB payloads; ``default=dict`` gives each new row its own empty dict ---
    quality_flags: Mapped[dict[str, Any]] = mapped_column(
        JSONB,
        default=dict,
        nullable=False,
    )
    # The database column is called "metadata"; the Python attribute uses a
    # different name because ``metadata`` is reserved on declarative classes.
    chunk_metadata: Mapped[dict[str, Any]] = mapped_column(
        "metadata",
        JSONB,
        default=dict,
        nullable=False,
    )

    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
    )

    document: Mapped[Document] = relationship(back_populates="chunks")
    page: Mapped[Page | None] = relationship(back_populates="chunks")
class Table(Base):
    """Row of ``tables``: one table of a document in several representations.

    ``(document_id, table_index)`` is unique. Only the Markdown form is
    required (defaulting to ``""``); the CSV, JSON and summary columns are
    nullable. No ORM relationships are declared on this model.
    """

    __tablename__ = "tables"
    __table_args__ = (UniqueConstraint("document_id", "table_index", name="uq_tables_doc_idx"),)

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        default=uuid.uuid4,
        primary_key=True,
    )
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    # Optional page link; nulled by the database if the page row is deleted.
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("pages.id", ondelete="SET NULL"),
        nullable=True,
    )
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    table_index: Mapped[int] = mapped_column(Integer, nullable=False)

    # Representations of the table content.
    markdown: Mapped[str] = mapped_column(Text, default="", nullable=False)
    csv_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    json_data: Mapped[dict[str, Any] | None] = mapped_column(JSONB, nullable=True)
    summary: Mapped[str | None] = mapped_column(Text, nullable=True)

    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
    )
class Figure(Base):
    """Row of ``figures``: one figure of a document.

    ``(document_id, figure_index)`` is unique. Caption, description and the
    bucket/key storage reference are all nullable. No ORM relationships are
    declared on this model.
    """

    __tablename__ = "figures"
    __table_args__ = (UniqueConstraint("document_id", "figure_index", name="uq_figures_doc_idx"),)

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        default=uuid.uuid4,
        primary_key=True,
    )
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    # Optional page link; nulled by the database if the page row is deleted.
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("pages.id", ondelete="SET NULL"),
        nullable=True,
    )
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    figure_index: Mapped[int] = mapped_column(Integer, nullable=False)

    # Descriptive text.
    caption: Mapped[str | None] = mapped_column(Text, nullable=True)
    description: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Storage location; nullable here, unlike on DocumentArtifact.
    storage_bucket: Mapped[str | None] = mapped_column(Text, nullable=True)
    storage_key: Mapped[str | None] = mapped_column(Text, nullable=True)

    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
    )
class IngestionRun(Base):
    """Row of ``ingestion_runs``: timing, status and file counters for one run.

    ``started_at`` is filled in by the database; ``finished_at`` is nullable.
    The status defaults to ``"RUNNING"`` and the three counters to ``0``.
    """

    __tablename__ = "ingestion_runs"

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        default=uuid.uuid4,
        primary_key=True,
    )

    # --- lifecycle ---
    started_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
    )
    finished_at: Mapped[datetime | None] = mapped_column(
        DateTime(timezone=True),
        nullable=True,
    )
    status: Mapped[str] = mapped_column(String(32), default="RUNNING", nullable=False)

    # --- scope and counters ---
    source_folder: Mapped[str] = mapped_column(Text, nullable=False)
    total_files: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    processed_files: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    failed_files: Mapped[int] = mapped_column(Integer, default=0, nullable=False)

    # The database column is called "metadata"; the Python attribute uses a
    # different name because ``metadata`` is reserved on declarative classes.
    run_metadata: Mapped[dict[str, Any]] = mapped_column(
        "metadata",
        JSONB,
        default=dict,
        nullable=False,
    )
class ProcessingEvent(Base):
    """Row of ``processing_events``: one log-style event with a JSONB payload.

    ``run_id`` and ``document_id`` are nullable, indexed UUID columns with no
    ``ForeignKey``, so the database does not enforce that they reference an
    existing run or document.
    NOTE(review): presumably deliberate so events can outlive the rows they
    mention -- confirm.
    """

    __tablename__ = "processing_events"
    # One single-column index per lookup key.
    __table_args__ = (
        Index("ix_events_doc", "document_id"),
        Index("ix_events_run", "run_id"),
        Index("ix_events_stage", "stage"),
    )

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        default=uuid.uuid4,
        primary_key=True,
    )

    # --- loose references (no FK constraint) ---
    run_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        nullable=True,
    )
    document_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        nullable=True,
    )

    # --- event body ---
    stage: Mapped[str] = mapped_column(String(64), nullable=False)
    level: Mapped[str] = mapped_column(String(16), default="INFO", nullable=False)
    message: Mapped[str] = mapped_column(Text, nullable=False)
    data: Mapped[dict[str, Any]] = mapped_column(JSONB, default=dict, nullable=False)

    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
    )