LegacyHUB/app/storage/artifacts.py

"""Shared ``document_artifacts`` get-or-create helper.

Single source of truth used by the scanner, the per-document pipeline, and the
table / figure processors. Each caller previously carried its own copy with
slightly different signatures; this module replaces them so that the artifact
row schema is enforced in one place.
"""

from __future__ import annotations

import uuid

from sqlalchemy import select
from sqlalchemy.orm import Session

from app.db.models import DocumentArtifact


def ensure_artifact(
    db: Session,
    *,
    document_id: uuid.UUID,
    artifact_type: str,
    bucket: str,
    key: str,
    page_number: int | None = None,
    checksum: str | None = None,
) -> DocumentArtifact:
    """Return the ``DocumentArtifact`` for a document/key pair, creating it if absent.

    A row is identified by ``(document_id, storage_key)``. When a matching row
    is found it is returned untouched: ``artifact_type``, ``bucket``,
    ``page_number`` and ``checksum`` are only used when a new row is built, so
    repeated pipeline runs neither duplicate rows nor rewrite metadata.

    Args:
        db: Session used for the lookup and, if needed, to register the new row.
        document_id: Identifier of the owning document.
        artifact_type: Value stored in ``artifact_type`` on a newly created row.
        bucket: Value stored in ``storage_bucket`` on a newly created row.
        key: Storage key; together with ``document_id`` it identifies the row.
        page_number: Optional page number stored on a newly created row.
        checksum: Optional checksum stored on a newly created row.

    Returns:
        The existing row, or a new instance that has been added to ``db``.
        This function does not flush or commit; that is left to the caller.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    row = db.execute(lookup).scalar_one_or_none()

    if row is None:
        # Nothing stored under this key for the document yet: build the row
        # and hand it to the session so it is persisted with the caller's
        # transaction.
        row = DocumentArtifact(
            document_id=document_id,
            artifact_type=artifact_type,
            storage_bucket=bucket,
            storage_key=key,
            page_number=page_number,
            checksum=checksum,
        )
        db.add(row)

    return row