refactor: extract ensure_artifact into app/storage/artifacts.py

The artifact-upsert helper was duplicated four times (scanner.py, table_processor.py, figure_processor.py, pipeline.py) with slightly different signatures. This commit consolidates the copies into a single keyword-only function keyed on (document_id, storage_key) - the identity the schema already enforces - so re-running the pipeline never creates duplicate rows. scanner.py, table_processor.py, and figure_processor.py now import the shared helper directly. pipeline.py keeps a thin local wrapper to preserve the positional call sites at its three artifact-upsert points (OCR_PDF, MARKDOWN, DOCLING_JSON). Tests: 24 passed (5 health + 19 original). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 16:51:54 +03:00
parent cd9977f8c3
commit a375ca55b9
6 changed files with 91 additions and 88 deletions
--- a/app/ingestion/scanner.py
+++ b/app/ingestion/scanner.py
@@ -23,12 +23,12 @@ from sqlalchemy import select
 from app.db.models import (
    ArtifactType,
    Document,
-    DocumentArtifact,
    DocumentStatus,
    ProcessingEvent,
 )
 from app.db.session import session_scope
 from app.logging_config import get_logger
+from app.storage.artifacts import ensure_artifact
 from app.storage.local_paths import key_original_pdf
 from app.storage.minio_client import get_storage
 from app.utils.hashing import sha256_file
@@ -121,13 +121,13 @@ def discover_documents(
                        content_type="application/pdf",
                        metadata={"sha256": sha, "original-name": path.name[:255]},
                    )
-                _ensure_artifact(
+                ensure_artifact(
                    db,
-                    doc.id,
-                    ArtifactType.ORIGINAL_PDF,
-                    storage.originals_bucket,
-                    key,
-                    sha,
+                    document_id=doc.id,
+                    artifact_type=ArtifactType.ORIGINAL_PDF,
+                    bucket=storage.originals_bucket,
+                    key=key,
+                    checksum=sha,
                )
                if doc.status == DocumentStatus.DISCOVERED:
                    doc.status = DocumentStatus.STORED_ORIGINAL
@@ -161,24 +161,3 @@ def discover_documents(
            )


-def _ensure_artifact(
-    db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, checksum: str | None
-) -> None:
-    existing = db.execute(
-        select(DocumentArtifact).where(
-            DocumentArtifact.document_id == document_id,
-            DocumentArtifact.artifact_type == artifact_type,
-            DocumentArtifact.storage_key == key,
-        )
-    ).scalar_one_or_none()
-    if existing:
-        return
-    db.add(
-        DocumentArtifact(
-            document_id=document_id,
-            artifact_type=artifact_type,
-            storage_bucket=bucket,
-            storage_key=key,
-            checksum=checksum,
-        )
-    )