The artifact-upsert helper was duplicated four times (scanner.py, table_processor.py, figure_processor.py, pipeline.py) with slightly different signatures. This change consolidates the copies into a single keyword-only function keyed on (document_id, storage_key) - the identity the schema already enforces - so re-running the pipeline never creates duplicate rows. scanner / table_processor / figure_processor now import the shared helper directly. pipeline.py keeps a thin local wrapper to preserve the positional call sites at its three artifact upsert points (OCR_PDF, MARKDOWN, DOCLING_JSON). Tests: 24 passed (5 health + 19 original). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
54 lines
1.5 KiB
Python
54 lines
1.5 KiB
Python
"""Shared ``document_artifacts`` upsert helper.

Single source of truth used by the scanner, the per-document pipeline, and the
table / figure processors. Each caller previously carried its own copy with
slightly different signatures; this module replaces them so that the artifact
row schema is enforced in one place.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import uuid
|
|
|
|
from sqlalchemy import select
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.db.models import DocumentArtifact
|
|
|
|
|
|
def ensure_artifact(
    db: Session,
    *,
    document_id: uuid.UUID,
    artifact_type: str,
    bucket: str,
    key: str,
    page_number: int | None = None,
    checksum: str | None = None,
) -> DocumentArtifact:
    """Return the ``DocumentArtifact`` for ``(document_id, key)``, creating it if absent.

    A row is identified by its ``document_id`` together with its
    ``storage_key``. When such a row already exists it is returned untouched:
    ``artifact_type``, ``bucket``, ``page_number`` and ``checksum`` are ignored
    in that case, because derived bytes are versioned by their storage key and
    metadata is therefore never rewritten in place. This is what keeps repeated
    pipeline runs from producing duplicate artifact rows.

    Args:
        db: Session used for the lookup and, if needed, to register the new row.
        document_id: Owning document's identifier.
        artifact_type: Stored as ``artifact_type`` on a newly created row.
        bucket: Stored as ``storage_bucket`` on a newly created row.
        key: Storage key; matched against ``storage_key`` and stored on a new row.
        page_number: Optional page number recorded on a newly created row.
        checksum: Optional checksum recorded on a newly created row.

    Returns:
        The existing row when one matches, otherwise the newly constructed row
        after it has been added to ``db``. This function does not flush or
        commit; that is left to the caller.

    Raises:
        sqlalchemy.exc.MultipleResultsFound: Raised by ``scalar_one_or_none``
            if more than one row matches the ``(document_id, key)`` pair.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    row = db.execute(lookup).scalar_one_or_none()

    if row is None:
        # Nothing stored under this key for the document yet: build the row
        # and hand it to the session so the caller's flush/commit persists it.
        row = DocumentArtifact(
            document_id=document_id,
            artifact_type=artifact_type,
            storage_bucket=bucket,
            storage_key=key,
            page_number=page_number,
            checksum=checksum,
        )
        db.add(row)

    return row
|