"""Shared ``document_artifacts`` upsert helper. Single source of truth used by the scanner, the per-document pipeline, and the table / figure processors. Each caller previously carried its own copy with slightly different signatures; this module replaces them so that the artifact row schema is enforced in one place. """ from __future__ import annotations import uuid from sqlalchemy import select from sqlalchemy.orm import Session from app.db.models import DocumentArtifact def ensure_artifact( db: Session, *, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, page_number: int | None = None, checksum: str | None = None, ) -> DocumentArtifact: """Insert a ``DocumentArtifact`` row if none exists with the same key. Identity is ``(document_id, storage_key)``. Re-running the pipeline never duplicates artifact rows; metadata fields are not updated in place because derived bytes are versioned by their storage key. """ existing = db.execute( select(DocumentArtifact).where( DocumentArtifact.document_id == document_id, DocumentArtifact.storage_key == key, ) ).scalar_one_or_none() if existing is not None: return existing artifact = DocumentArtifact( document_id=document_id, artifact_type=artifact_type, storage_bucket=bucket, storage_key=key, page_number=page_number, checksum=checksum, ) db.add(artifact) return artifact