chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
266
app/db/models.py
Normal file
266
app/db/models.py
Normal file
@@ -0,0 +1,266 @@
|
||||
"""SQLAlchemy ORM models for LegacyHUB."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy import (
|
||||
BigInteger,
|
||||
Boolean,
|
||||
DateTime,
|
||||
Float,
|
||||
ForeignKey,
|
||||
Index,
|
||||
Integer,
|
||||
String,
|
||||
Text,
|
||||
UniqueConstraint,
|
||||
func,
|
||||
)
|
||||
from sqlalchemy.dialects.postgresql import JSONB, UUID
|
||||
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
|
||||
|
||||
|
||||
class Base(DeclarativeBase):
    """Declarative base class that every ORM model in this module inherits from."""
|
||||
|
||||
|
||||
# ---- Status / type literals (kept as plain strings to avoid PG enum churn) ----
|
||||
|
||||
class DocumentStatus:
    """String constants used as values of ``Document.status``.

    Each constant's value is identical to its name.  They are plain
    strings rather than a database enum type.
    """

    # Intake
    DISCOVERED = "DISCOVERED"
    STORED_ORIGINAL = "STORED_ORIGINAL"

    # OCR
    OCR_STARTED = "OCR_STARTED"
    OCR_COMPLETED = "OCR_COMPLETED"
    OCR_FAILED = "OCR_FAILED"

    # Extraction
    EXTRACTION_STARTED = "EXTRACTION_STARTED"
    EXTRACTION_COMPLETED = "EXTRACTION_COMPLETED"
    EXTRACTION_FAILED = "EXTRACTION_FAILED"

    # Chunking / indexing
    CHUNKING_COMPLETED = "CHUNKING_COMPLETED"
    INDEXING_COMPLETED = "INDEXING_COMPLETED"

    # Generic failure, not tied to a specific stage name
    FAILED = "FAILED"
|
||||
|
||||
|
||||
class ArtifactType:
    """String constants for ``DocumentArtifact.artifact_type``.

    Values are the lower-case form of the constant names.
    """

    # PDF files
    ORIGINAL_PDF = "original_pdf"
    OCR_PDF = "ocr_pdf"

    # Text / structured exports
    DOCLING_JSON = "docling_json"
    MARKDOWN = "markdown"
    TABLE_JSON = "table_json"

    # Images
    PAGE_IMAGE = "page_image"
    FIGURE_CROP = "figure_crop"
|
||||
|
||||
|
||||
class BlockType:
    """String constants for ``Chunk.block_type``.

    ``PARAGRAPH`` is the column default on ``Chunk``.
    """

    # Headings
    TITLE = "title"
    HEADING = "heading"

    # Body content
    PARAGRAPH = "paragraph"
    LIST = "list"
    TABLE = "table"

    # Figure-related text
    FIGURE_CAPTION = "figure_caption"
    FIGURE_DESCRIPTION = "figure_description"

    # Everything else
    HANDWRITING = "handwriting"
    UNKNOWN = "unknown"
|
||||
|
||||
|
||||
# ---- Tables ----
|
||||
|
||||
class Document(Base):
    """ORM model for the ``documents`` table.

    One row per source file, unique by ``sha256``.  ``status`` starts at
    ``DocumentStatus.DISCOVERED``; ``error_message`` is an optional free-text
    field.  Artifacts, pages and chunks hang off a document and are removed
    together with it.
    """

    __tablename__ = "documents"

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    source_path: Mapped[str] = mapped_column(Text, nullable=False)
    original_file_name: Mapped[str] = mapped_column(Text, nullable=False)
    # 64 characters: length of a hex-encoded SHA-256 digest.
    sha256: Mapped[str] = mapped_column(
        String(64),
        nullable=False,
        unique=True,
        index=True,
    )
    file_size_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False)
    mime_type: Mapped[str] = mapped_column(
        Text,
        nullable=False,
        default="application/pdf",
    )
    language_hint: Mapped[str | None] = mapped_column(Text, nullable=True)
    status: Mapped[str] = mapped_column(
        String(64),
        nullable=False,
        default=DocumentStatus.DISCOVERED,
        index=True,
    )
    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Timestamps are timezone-aware; created_at is filled in by the database.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )
    # onupdate refreshes the value on ORM-issued UPDATE statements.
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        onupdate=func.now(),
        nullable=False,
    )

    # The child tables declare ON DELETE CASCADE on their document_id foreign
    # key.  passive_deletes=True lets the database remove children that are
    # not loaded in the session, instead of the ORM SELECTing every child row
    # and deleting them one by one when a document is deleted.
    artifacts: Mapped[list[DocumentArtifact]] = relationship(
        back_populates="document",
        cascade="all, delete-orphan",
        passive_deletes=True,
    )
    pages: Mapped[list[Page]] = relationship(
        back_populates="document",
        cascade="all, delete-orphan",
        passive_deletes=True,
    )
    chunks: Mapped[list[Chunk]] = relationship(
        back_populates="document",
        cascade="all, delete-orphan",
        passive_deletes=True,
    )
|
||||
|
||||
|
||||
class DocumentArtifact(Base):
    """ORM model for the ``document_artifacts`` table.

    Each row points at a stored object (``storage_bucket`` + ``storage_key``)
    that belongs to one document.  ``page_number`` and ``checksum`` are
    optional.
    """

    __tablename__ = "document_artifacts"
    # Composite index for lookups by document and artifact type.
    __table_args__ = (Index("ix_artifacts_doc_type", "document_id", "artifact_type"),)

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    # Rows are removed by the database when the parent document is deleted.
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    # Presumably one of the ArtifactType constants; not enforced by the schema.
    artifact_type: Mapped[str] = mapped_column(String(64), nullable=False)
    storage_bucket: Mapped[str] = mapped_column(Text, nullable=False)
    storage_key: Mapped[str] = mapped_column(Text, nullable=False)
    page_number: Mapped[int | None] = mapped_column(Integer, nullable=True)
    checksum: Mapped[str | None] = mapped_column(String(64), nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )

    document: Mapped[Document] = relationship(back_populates="artifacts")
|
||||
|
||||
|
||||
class Page(Base):
    """ORM model for the ``pages`` table.

    A page belongs to exactly one document; the pair
    ``(document_id, page_number)`` is unique.
    """

    __tablename__ = "pages"
    __table_args__ = (
        UniqueConstraint("document_id", "page_number", name="uq_pages_doc_page"),
    )

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    # Rows are removed by the database when the parent document is deleted.
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    text: Mapped[str] = mapped_column(Text, nullable=False, default="")
    ocr_confidence: Mapped[float | None] = mapped_column(Float, nullable=True)

    # Content flags; all default to False.
    has_tables: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    has_figures: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    has_handwriting: Mapped[bool] = mapped_column(
        Boolean,
        nullable=False,
        default=False,
    )

    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )

    document: Mapped[Document] = relationship(back_populates="pages")
    chunks: Mapped[list[Chunk]] = relationship(back_populates="page")
|
||||
|
||||
|
||||
class Chunk(Base):
    """ORM model for the ``chunks`` table.

    A chunk always belongs to a document and may optionally be linked to a
    page row.  ``(document_id, chunk_index)`` is unique.
    """

    __tablename__ = "chunks"
    __table_args__ = (
        UniqueConstraint("document_id", "chunk_index", name="uq_chunks_doc_idx"),
        Index("ix_chunks_doc_page", "document_id", "page_number"),
        Index("ix_chunks_block_type", "block_type"),
    )

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    # Rows are removed by the database when the parent document is deleted.
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    # Deleting the page keeps the chunk and only clears this link.
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("pages.id", ondelete="SET NULL"),
        nullable=True,
    )
    # Stored on the chunk itself, independent of the nullable page_id link.
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    block_id: Mapped[str | None] = mapped_column(Text, nullable=True)
    chunk_index: Mapped[int] = mapped_column(Integer, nullable=False)
    block_type: Mapped[str] = mapped_column(
        String(32),
        nullable=False,
        default=BlockType.PARAGRAPH,
    )
    text: Mapped[str] = mapped_column(Text, nullable=False)
    normalized_text: Mapped[str] = mapped_column(Text, nullable=False, default="")
    token_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    ocr_confidence: Mapped[float | None] = mapped_column(Float, nullable=True)
    # default=dict is a callable, so every new row gets its own empty dict.
    quality_flags: Mapped[dict[str, Any]] = mapped_column(
        JSONB,
        nullable=False,
        default=dict,
    )
    # The database column is called "metadata"; the Python attribute uses a
    # different name because ``metadata`` is reserved on declarative classes.
    chunk_metadata: Mapped[dict[str, Any]] = mapped_column(
        "metadata",
        JSONB,
        nullable=False,
        default=dict,
    )
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )

    document: Mapped[Document] = relationship(back_populates="chunks")
    page: Mapped[Page | None] = relationship(back_populates="chunks")
|
||||
|
||||
|
||||
class Table(Base):
    """ORM model for the ``tables`` table.

    ``(document_id, table_index)`` is unique.  The table content can be held
    in several forms side by side: ``markdown`` (required, defaults to an
    empty string) plus optional ``csv_text``, ``json_data`` and ``summary``.
    """

    __tablename__ = "tables"
    __table_args__ = (
        UniqueConstraint("document_id", "table_index", name="uq_tables_doc_idx"),
    )

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    # Rows are removed by the database when the parent document is deleted.
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    # Deleting the page keeps the row and only clears this link.
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("pages.id", ondelete="SET NULL"),
        nullable=True,
    )
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    table_index: Mapped[int] = mapped_column(Integer, nullable=False)
    markdown: Mapped[str] = mapped_column(Text, nullable=False, default="")
    csv_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    json_data: Mapped[dict[str, Any] | None] = mapped_column(JSONB, nullable=True)
    summary: Mapped[str | None] = mapped_column(Text, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )
|
||||
|
||||
|
||||
class Figure(Base):
    """ORM model for the ``figures`` table.

    ``(document_id, figure_index)`` is unique.  Caption, description and the
    storage location (``storage_bucket`` / ``storage_key``) are all optional.
    """

    __tablename__ = "figures"
    __table_args__ = (
        UniqueConstraint("document_id", "figure_index", name="uq_figures_doc_idx"),
    )

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    # Rows are removed by the database when the parent document is deleted.
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    # Deleting the page keeps the row and only clears this link.
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("pages.id", ondelete="SET NULL"),
        nullable=True,
    )
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    figure_index: Mapped[int] = mapped_column(Integer, nullable=False)
    caption: Mapped[str | None] = mapped_column(Text, nullable=True)
    description: Mapped[str | None] = mapped_column(Text, nullable=True)
    storage_bucket: Mapped[str | None] = mapped_column(Text, nullable=True)
    storage_key: Mapped[str | None] = mapped_column(Text, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )
|
||||
|
||||
|
||||
class IngestionRun(Base):
    """ORM model for the ``ingestion_runs`` table.

    Holds the source folder, start/finish timestamps, a status string
    (default ``"RUNNING"``) and three file counters that default to zero.
    """

    __tablename__ = "ingestion_runs"

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    started_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )
    # Nullable: no value is set by default.
    finished_at: Mapped[datetime | None] = mapped_column(
        DateTime(timezone=True),
        nullable=True,
    )
    status: Mapped[str] = mapped_column(
        String(32),
        nullable=False,
        default="RUNNING",
    )
    source_folder: Mapped[str] = mapped_column(Text, nullable=False)

    # Counters
    total_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    processed_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    failed_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0)

    # The database column is called "metadata"; the Python attribute uses a
    # different name because ``metadata`` is reserved on declarative classes.
    run_metadata: Mapped[dict[str, Any]] = mapped_column(
        "metadata",
        JSONB,
        nullable=False,
        default=dict,
    )
|
||||
|
||||
|
||||
class ProcessingEvent(Base):
    """ORM model for the ``processing_events`` table.

    Each row carries a stage name, a level (default ``"INFO"``), a message
    and a JSONB ``data`` payload, optionally tagged with a run id and/or a
    document id.
    """

    __tablename__ = "processing_events"
    __table_args__ = (
        Index("ix_events_doc", "document_id"),
        Index("ix_events_run", "run_id"),
        Index("ix_events_stage", "stage"),
    )

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    # NOTE(review): run_id and document_id are plain UUID columns with no
    # ForeignKey, so the database does not enforce that they reference an
    # existing row -- confirm this is intentional.
    run_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        nullable=True,
    )
    document_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        nullable=True,
    )
    stage: Mapped[str] = mapped_column(String(64), nullable=False)
    level: Mapped[str] = mapped_column(String(16), nullable=False, default="INFO")
    message: Mapped[str] = mapped_column(Text, nullable=False)
    # default=dict is a callable, so every new row gets its own empty dict.
    data: Mapped[dict[str, Any]] = mapped_column(
        JSONB,
        nullable=False,
        default=dict,
    )
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )
|
||||
Reference in New Issue
Block a user