refactor: extract ensure_artifact into app/storage/artifacts.py
The artifact-upsert helper was duplicated four times (scanner.py, table_processor.py, figure_processor.py, pipeline.py) with slightly different signatures. Consolidates into a single keyword-only function keyed on (document_id, storage_key) - the identity the schema already enforces - so re-running the pipeline never creates duplicate rows. scanner / table_processor / figure_processor now import the shared helper directly. pipeline.py keeps a thin local wrapper to preserve the positional call sites at three artifact upsert points (OCR_PDF, MARKDOWN, DOCLING_JSON). Tests: 24 passed (5 health + 19 original). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
53
app/storage/artifacts.py
Normal file
53
app/storage/artifacts.py
Normal file
@@ -0,0 +1,53 @@
|
||||
"""Shared ``document_artifacts`` upsert helper.
|
||||
|
||||
Single source of truth used by the scanner, the per-document pipeline, and the
|
||||
table / figure processors. Each caller previously carried its own copy with
|
||||
slightly different signatures; this module replaces them so that the artifact
|
||||
row schema is enforced in one place.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.db.models import DocumentArtifact
|
||||
|
||||
|
||||
def ensure_artifact(
    db: Session,
    *,
    document_id: uuid.UUID,
    artifact_type: str,
    bucket: str,
    key: str,
    page_number: int | None = None,
    checksum: str | None = None,
) -> DocumentArtifact:
    """Fetch or create the artifact row for ``(document_id, key)``.

    The lookup matches on ``document_id`` and ``storage_key`` only. When a
    matching row is found it is returned untouched: ``artifact_type``,
    ``bucket``, ``page_number`` and ``checksum`` are used solely to populate
    a row that does not exist yet.

    A newly built row is added to ``db``; this function itself does not
    flush or commit, so persisting the row is left to the caller.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    row = db.execute(lookup).scalar_one_or_none()

    if row is None:
        # Nothing stored for this document/key pair yet: stage a fresh row.
        row = DocumentArtifact(
            document_id=document_id,
            artifact_type=artifact_type,
            storage_bucket=bucket,
            storage_key=key,
            page_number=page_number,
            checksum=checksum,
        )
        db.add(row)

    return row
|
||||
Reference in New Issue
Block a user