refactor: extract ensure_artifact into app/storage/artifacts.py
The artifact-upsert helper was duplicated four times (scanner.py, table_processor.py, figure_processor.py, pipeline.py) with slightly different signatures. Consolidates into a single keyword-only function keyed on (document_id, storage_key) - the identity the schema already enforces - so re-running the pipeline never creates duplicate rows. scanner / table_processor / figure_processor now import the shared helper directly. pipeline.py keeps a thin local wrapper to preserve the positional call sites at three artifact upsert points (OCR_PDF, MARKDOWN, DOCLING_JSON). Tests: 24 passed (5 health + 19 original). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -23,12 +23,12 @@ from sqlalchemy import select
|
||||
from app.db.models import (
|
||||
ArtifactType,
|
||||
Document,
|
||||
DocumentArtifact,
|
||||
DocumentStatus,
|
||||
ProcessingEvent,
|
||||
)
|
||||
from app.db.session import session_scope
|
||||
from app.logging_config import get_logger
|
||||
from app.storage.artifacts import ensure_artifact
|
||||
from app.storage.local_paths import key_original_pdf
|
||||
from app.storage.minio_client import get_storage
|
||||
from app.utils.hashing import sha256_file
|
||||
@@ -121,13 +121,13 @@ def discover_documents(
|
||||
content_type="application/pdf",
|
||||
metadata={"sha256": sha, "original-name": path.name[:255]},
|
||||
)
|
||||
_ensure_artifact(
|
||||
ensure_artifact(
|
||||
db,
|
||||
doc.id,
|
||||
ArtifactType.ORIGINAL_PDF,
|
||||
storage.originals_bucket,
|
||||
key,
|
||||
sha,
|
||||
document_id=doc.id,
|
||||
artifact_type=ArtifactType.ORIGINAL_PDF,
|
||||
bucket=storage.originals_bucket,
|
||||
key=key,
|
||||
checksum=sha,
|
||||
)
|
||||
if doc.status == DocumentStatus.DISCOVERED:
|
||||
doc.status = DocumentStatus.STORED_ORIGINAL
|
||||
@@ -161,24 +161,3 @@ def discover_documents(
|
||||
)
|
||||
|
||||
|
||||
def _ensure_artifact(
    db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, checksum: str | None
) -> None:
    """Add a DocumentArtifact row unless a matching one is already stored.

    A row counts as matching when its ``document_id``, ``artifact_type`` and
    ``storage_key`` all equal the given arguments. ``bucket`` and ``checksum``
    take no part in the lookup; they are only written when a new row is added.

    Args:
        db: Object exposing ``execute`` and ``add`` (presumably a SQLAlchemy
            session -- confirm against the caller).
        document_id: Identifier of the owning document.
        artifact_type: Artifact kind to look up and, if needed, record.
        bucket: Value stored as ``storage_bucket`` on a newly added row.
        key: Value matched against, and stored as, ``storage_key``.
        checksum: Value stored as ``checksum`` on a newly added row; may be None.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.artifact_type == artifact_type,
        DocumentArtifact.storage_key == key,
    )
    match = db.execute(lookup).scalar_one_or_none()

    # Only add a row when the lookup came back empty, so repeated calls with
    # the same identity do not queue a second artifact.
    if not match:
        artifact = DocumentArtifact(
            document_id=document_id,
            artifact_type=artifact_type,
            storage_bucket=bucket,
            storage_key=key,
            checksum=checksum,
        )
        db.add(artifact)
|
||||
|
||||
Reference in New Issue
Block a user