Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
267 lines
11 KiB
Python
267 lines
11 KiB
Python
"""SQLAlchemy ORM models for LegacyHUB."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import uuid
|
|
from datetime import datetime
|
|
from typing import Any
|
|
|
|
from sqlalchemy import (
|
|
BigInteger,
|
|
Boolean,
|
|
DateTime,
|
|
Float,
|
|
ForeignKey,
|
|
Index,
|
|
Integer,
|
|
String,
|
|
Text,
|
|
UniqueConstraint,
|
|
func,
|
|
)
|
|
from sqlalchemy.dialects.postgresql import JSONB, UUID
|
|
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
|
|
|
|
|
|
class Base(DeclarativeBase):
    """Declarative base class that every ORM model in this module inherits from."""
|
|
|
|
|
|
# ---- Status / type literals (kept as plain strings to avoid PG enum churn) ----
|
|
|
|
class DocumentStatus:
    """Plain-string status values for ``Document.status``.

    These are ordinary ``str`` class attributes rather than an ``enum.Enum``;
    each attribute's value is identical to its name. ``DISCOVERED`` is the
    default assigned to a new ``Document``.
    """

    DISCOVERED = "DISCOVERED"
    STORED_ORIGINAL = "STORED_ORIGINAL"

    # OCR stage.
    OCR_STARTED = "OCR_STARTED"
    OCR_COMPLETED = "OCR_COMPLETED"
    OCR_FAILED = "OCR_FAILED"

    # Extraction stage.
    EXTRACTION_STARTED = "EXTRACTION_STARTED"
    EXTRACTION_COMPLETED = "EXTRACTION_COMPLETED"
    EXTRACTION_FAILED = "EXTRACTION_FAILED"

    CHUNKING_COMPLETED = "CHUNKING_COMPLETED"
    INDEXING_COMPLETED = "INDEXING_COMPLETED"

    # Generic failure value, distinct from the stage-specific *_FAILED values.
    FAILED = "FAILED"
|
|
|
|
|
|
class ArtifactType:
    """Plain-string identifiers for kinds of stored artifact.

    NOTE(review): presumably these are the values written to
    ``DocumentArtifact.artifact_type``; that column has no default, so confirm
    against the code that creates artifacts.
    """

    ORIGINAL_PDF = "original_pdf"
    OCR_PDF = "ocr_pdf"
    DOCLING_JSON = "docling_json"
    MARKDOWN = "markdown"
    PAGE_IMAGE = "page_image"
    FIGURE_CROP = "figure_crop"
    TABLE_JSON = "table_json"
|
|
|
|
|
|
class BlockType:
    """Plain-string block categories for ``Chunk.block_type``.

    ``PARAGRAPH`` is the default assigned to a new ``Chunk``.
    """

    TITLE = "title"
    HEADING = "heading"
    PARAGRAPH = "paragraph"
    LIST = "list"
    TABLE = "table"
    FIGURE_CAPTION = "figure_caption"
    FIGURE_DESCRIPTION = "figure_description"
    HANDWRITING = "handwriting"
    UNKNOWN = "unknown"
|
|
|
|
|
|
# ---- Tables ----
|
|
|
|
class Document(Base):
    """ORM model for the ``documents`` table: one row per ingested file.

    Owns its artifacts, pages and chunks: the ORM cascades deletes to them and
    removes children that are detached from the collection (``delete-orphan``).
    """

    __tablename__ = "documents"

    # Primary key is generated client-side with uuid4.
    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    source_path: Mapped[str] = mapped_column(Text, nullable=False)
    original_file_name: Mapped[str] = mapped_column(Text, nullable=False)
    # Unique per row, so the same digest cannot be stored twice.
    sha256: Mapped[str] = mapped_column(String(64), nullable=False, unique=True, index=True)
    file_size_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False)
    mime_type: Mapped[str] = mapped_column(Text, nullable=False, default="application/pdf")
    language_hint: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Stored as a plain string (see DocumentStatus); indexed for filtering by status.
    status: Mapped[str] = mapped_column(
        String(64), nullable=False, default=DocumentStatus.DISCOVERED, index=True
    )
    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Timestamps are timezone-aware and filled in by the database via now().
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
    # onupdate makes SQLAlchemy set now() on every UPDATE it emits for this row.
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False
    )

    # Child collections; each is paired with a ``document`` attribute on the child.
    artifacts: Mapped[list[DocumentArtifact]] = relationship(
        back_populates="document", cascade="all, delete-orphan"
    )
    pages: Mapped[list[Page]] = relationship(
        back_populates="document", cascade="all, delete-orphan"
    )
    chunks: Mapped[list[Chunk]] = relationship(
        back_populates="document", cascade="all, delete-orphan"
    )
|
|
|
|
|
|
class DocumentArtifact(Base):
    """ORM model for the ``document_artifacts`` table.

    Each row points at one stored object (bucket + key) belonging to a
    document, optionally tied to a page number.
    """

    __tablename__ = "document_artifacts"
    # Composite index on (document_id, artifact_type). The trailing comma keeps
    # this a one-element tuple, which __table_args__ requires.
    __table_args__ = (
        Index("ix_artifacts_doc_type", "document_id", "artifact_type"),
    )

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Rows are removed by the database when the parent document is deleted.
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False
    )
    # NOTE(review): presumably one of the ArtifactType values; not enforced here.
    artifact_type: Mapped[str] = mapped_column(String(64), nullable=False)
    storage_bucket: Mapped[str] = mapped_column(Text, nullable=False)
    storage_key: Mapped[str] = mapped_column(Text, nullable=False)
    page_number: Mapped[int | None] = mapped_column(Integer, nullable=True)
    checksum: Mapped[str | None] = mapped_column(String(64), nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )

    # Paired with Document.artifacts.
    document: Mapped[Document] = relationship(back_populates="artifacts")
|
|
|
|
|
|
class Page(Base):
    """ORM model for the ``pages`` table: one row per page of a document.

    ``(document_id, page_number)`` is unique, so a document cannot hold two
    rows for the same page number.
    """

    __tablename__ = "pages"
    __table_args__ = (
        UniqueConstraint("document_id", "page_number", name="uq_pages_doc_page"),
    )

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Rows are removed by the database when the parent document is deleted.
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False
    )
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    # Never NULL; defaults to the empty string when no text is supplied.
    text: Mapped[str] = mapped_column(Text, nullable=False, default="")
    ocr_confidence: Mapped[float | None] = mapped_column(Float, nullable=True)
    # Content flags, all defaulting to False.
    has_tables: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    has_figures: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    has_handwriting: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )

    # Paired with Document.pages.
    document: Mapped[Document] = relationship(back_populates="pages")
    # Paired with Chunk.page; no delete cascade is configured on this side.
    chunks: Mapped[list[Chunk]] = relationship(back_populates="page")
|
|
|
|
|
|
class Chunk(Base):
    """ORM model for the ``chunks`` table: one text chunk of a document.

    ``(document_id, chunk_index)`` is unique. Additional indexes cover
    ``(document_id, page_number)`` and ``block_type``.
    """

    __tablename__ = "chunks"
    __table_args__ = (
        UniqueConstraint("document_id", "chunk_index", name="uq_chunks_doc_idx"),
        Index("ix_chunks_doc_page", "document_id", "page_number"),
        Index("ix_chunks_block_type", "block_type"),
    )

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Rows are removed by the database when the parent document is deleted.
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False
    )
    # Optional link to a Page row; the database nulls it if that page is deleted.
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True), ForeignKey("pages.id", ondelete="SET NULL"), nullable=True
    )
    # Stored directly on the chunk, so it remains available when page_id is NULL.
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    block_id: Mapped[str | None] = mapped_column(Text, nullable=True)
    chunk_index: Mapped[int] = mapped_column(Integer, nullable=False)
    block_type: Mapped[str] = mapped_column(String(32), nullable=False, default=BlockType.PARAGRAPH)
    text: Mapped[str] = mapped_column(Text, nullable=False)
    normalized_text: Mapped[str] = mapped_column(Text, nullable=False, default="")
    token_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    ocr_confidence: Mapped[float | None] = mapped_column(Float, nullable=True)
    # default=dict is a callable, so every new row gets its own empty dict.
    quality_flags: Mapped[dict[str, Any]] = mapped_column(JSONB, nullable=False, default=dict)
    # The database column is named "metadata"; the Python attribute must differ
    # because ``metadata`` is reserved on declarative classes.
    chunk_metadata: Mapped[dict[str, Any]] = mapped_column(
        "metadata", JSONB, nullable=False, default=dict
    )
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )

    # Paired with Document.chunks and Page.chunks respectively.
    document: Mapped[Document] = relationship(back_populates="chunks")
    page: Mapped[Page | None] = relationship(back_populates="chunks")
|
|
|
|
|
|
class Table(Base):
    """ORM model for the ``tables`` table: one table found in a document.

    ``(document_id, table_index)`` is unique. The table content can be held in
    up to three forms: markdown, CSV text and JSON. No ORM relationships are
    declared on this model; only the foreign-key columns link it to its
    document and page.
    """

    __tablename__ = "tables"
    __table_args__ = (
        UniqueConstraint("document_id", "table_index", name="uq_tables_doc_idx"),
    )

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Rows are removed by the database when the parent document is deleted.
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False
    )
    # Optional link to a Page row; the database nulls it if that page is deleted.
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True), ForeignKey("pages.id", ondelete="SET NULL"), nullable=True
    )
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    table_index: Mapped[int] = mapped_column(Integer, nullable=False)
    # Markdown is always present (empty string by default); the rest are optional.
    markdown: Mapped[str] = mapped_column(Text, nullable=False, default="")
    csv_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    json_data: Mapped[dict[str, Any] | None] = mapped_column(JSONB, nullable=True)
    summary: Mapped[str | None] = mapped_column(Text, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
|
|
|
|
|
|
class Figure(Base):
    """ORM model for the ``figures`` table: one figure found in a document.

    ``(document_id, figure_index)`` is unique. No ORM relationships are
    declared on this model; only the foreign-key columns link it to its
    document and page.
    """

    __tablename__ = "figures"
    __table_args__ = (
        UniqueConstraint("document_id", "figure_index", name="uq_figures_doc_idx"),
    )

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Rows are removed by the database when the parent document is deleted.
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False
    )
    # Optional link to a Page row; the database nulls it if that page is deleted.
    page_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True), ForeignKey("pages.id", ondelete="SET NULL"), nullable=True
    )
    page_number: Mapped[int] = mapped_column(Integer, nullable=False)
    figure_index: Mapped[int] = mapped_column(Integer, nullable=False)
    caption: Mapped[str | None] = mapped_column(Text, nullable=True)
    description: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Storage location is optional here, unlike on DocumentArtifact.
    storage_bucket: Mapped[str | None] = mapped_column(Text, nullable=True)
    storage_key: Mapped[str | None] = mapped_column(Text, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
|
|
|
|
|
|
class IngestionRun(Base):
    """ORM model for the ``ingestion_runs`` table: bookkeeping for one run.

    Records the source folder, start and finish timestamps, a status string
    and per-run file counters.
    """

    __tablename__ = "ingestion_runs"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Filled in by the database via now() when the row is inserted.
    started_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
    # NULL until a value is written; nothing in this model sets it automatically.
    finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
    status: Mapped[str] = mapped_column(String(32), nullable=False, default="RUNNING")
    source_folder: Mapped[str] = mapped_column(Text, nullable=False)
    # Counters all start at zero.
    total_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    processed_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    failed_files: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    # The database column is named "metadata"; the Python attribute must differ
    # because ``metadata`` is reserved on declarative classes.
    run_metadata: Mapped[dict[str, Any]] = mapped_column(
        "metadata", JSONB, nullable=False, default=dict
    )
|
|
|
|
|
|
class ProcessingEvent(Base):
    """ORM model for the ``processing_events`` table: one log-style event row.

    Each event carries a stage, a level, a message and a JSONB payload.
    ``document_id``, ``run_id`` and ``stage`` each have a single-column index.
    """

    __tablename__ = "processing_events"
    __table_args__ = (
        Index("ix_events_doc", "document_id"),
        Index("ix_events_run", "run_id"),
        Index("ix_events_stage", "stage"),
    )

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Plain nullable UUID columns: no ForeignKey is declared, so the database
    # does not enforce that these ids reference existing rows.
    run_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), nullable=True)
    document_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), nullable=True)
    stage: Mapped[str] = mapped_column(String(64), nullable=False)
    level: Mapped[str] = mapped_column(String(16), nullable=False, default="INFO")
    message: Mapped[str] = mapped_column(Text, nullable=False)
    # default=dict is a callable, so every new row gets its own empty dict.
    data: Mapped[dict[str, Any]] = mapped_column(JSONB, nullable=False, default=dict)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
|