The artifact-upsert helper was duplicated four times (scanner.py, table_processor.py, figure_processor.py, pipeline.py) with slightly different signatures. Consolidates into a single keyword-only function keyed on (document_id, storage_key) - the identity the schema already enforces - so re-running the pipeline never creates duplicate rows. scanner / table_processor / figure_processor now import the shared helper directly. pipeline.py keeps a thin local wrapper to preserve the positional call sites at three artifact upsert points (OCR_PDF, MARKDOWN, DOCLING_JSON). Tests: 24 passed (5 health + 19 original). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
73 lines
2.2 KiB
Python
73 lines
2.2 KiB
Python
"""Persists Docling tables to PostgreSQL + MinIO."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import uuid
|
|
|
|
from sqlalchemy import select
|
|
|
|
from app.db.models import ArtifactType, Table
|
|
from app.ingestion.docling_extractor import ExtractedTable
|
|
from app.logging_config import get_logger
|
|
from app.storage.artifacts import ensure_artifact
|
|
from app.storage.local_paths import key_table_json
|
|
from app.storage.minio_client import MinioStorage
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
def persist_tables(
    db,
    storage: MinioStorage,
    document_id: uuid.UUID,
    tables: list[ExtractedTable],
    page_id_by_number: dict[int, uuid.UUID],
) -> int:
    """Upsert every extracted table of a document and archive its JSON payload.

    Each table is matched to a ``Table`` row by ``(document_id, table_index)``;
    a new row is added when no match exists. The markdown, CSV, JSON and
    summary fields are overwritten on every call, whereas ``page_id`` and
    ``page_number`` are assigned only when the row is first created.

    When a table has a truthy ``json_data``, it is serialised as UTF-8 JSON,
    written to ``storage.derived_bucket`` under the key returned by
    ``key_table_json`` and registered through ``ensure_artifact`` as
    ``ArtifactType.TABLE_JSON``.

    Changes are flushed after each table; nothing is committed here.

    Args:
        db: Session-like object; only ``execute``, ``add`` and ``flush`` are
            used by this function.
        storage: Object store providing ``put_bytes`` and ``derived_bucket``.
        document_id: Identifier of the document the tables belong to.
        tables: Extracted tables to persist.
        page_id_by_number: Page number to page row id; a page number missing
            from the mapping yields ``page_id=None`` on a newly created row.

    Returns:
        The number of tables processed.
    """
    processed = 0
    for processed, table in enumerate(tables, start=1):
        lookup = select(Table).where(
            Table.document_id == document_id,
            Table.table_index == table.table_index,
        )
        row = db.execute(lookup).scalar_one_or_none()
        if row is None:
            # NOTE(review): page_id / page_number are set on insert only; an
            # existing row keeps its stored values even if the table moved
            # pages between runs - confirm this is intended.
            row = Table(
                document_id=document_id,
                page_id=page_id_by_number.get(table.page_number),
                page_number=table.page_number,
                table_index=table.table_index,
            )
            db.add(row)

        # Content fields are refreshed whether the row is new or pre-existing.
        row.markdown = table.markdown or ""
        row.csv_text = table.csv_text
        row.json_data = table.json_data
        row.summary = _summary(table)
        db.flush()

        if not table.json_data:
            # Nothing to archive for tables without a JSON payload.
            continue

        # Keep a standalone JSON copy in object storage next to the DB row.
        object_key = key_table_json(document_id, table.table_index)
        bucket = storage.derived_bucket
        payload = json.dumps(table.json_data, ensure_ascii=False).encode("utf-8")
        storage.put_bytes(
            bucket=bucket,
            key=object_key,
            data=payload,
            content_type="application/json",
        )
        ensure_artifact(
            db,
            document_id=document_id,
            artifact_type=ArtifactType.TABLE_JSON,
            bucket=bucket,
            key=object_key,
            page_number=table.page_number,
        )

    return processed
|
|
|
|
|
|
def _summary(t: ExtractedTable) -> str:
|
|
md = t.markdown or ""
|
|
n_rows = max(0, sum(1 for ln in md.splitlines() if ln.startswith("|")) - 2)
|
|
return f"Table {t.table_index} on page {t.page_number} ({n_rows} rows)."
|