Files
LegacyHUB/app/ingestion/table_processor.py
Vadim Malanov a375ca55b9 refactor: extract ensure_artifact into app/storage/artifacts.py
The artifact-upsert helper was duplicated four times (scanner.py,
table_processor.py, figure_processor.py, pipeline.py) with slightly
different signatures. Consolidates into a single keyword-only function
keyed on (document_id, storage_key) - the identity the schema already
enforces - so re-running the pipeline never creates duplicate rows.

scanner / table_processor / figure_processor now import the shared
helper directly. pipeline.py keeps a thin local wrapper to preserve
the positional call sites at three artifact upsert points (OCR_PDF,
MARKDOWN, DOCLING_JSON).

Tests: 24 passed (5 health + 19 original).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 16:51:54 +03:00

73 lines
2.2 KiB
Python

"""Persists Docling tables to PostgreSQL + MinIO."""
from __future__ import annotations
import json
import uuid
from sqlalchemy import select
from app.db.models import ArtifactType, Table
from app.ingestion.docling_extractor import ExtractedTable
from app.logging_config import get_logger
from app.storage.artifacts import ensure_artifact
from app.storage.local_paths import key_table_json
from app.storage.minio_client import MinioStorage
logger = get_logger(__name__)
def persist_tables(
    db,
    storage: MinioStorage,
    document_id: uuid.UUID,
    tables: list[ExtractedTable],
    page_id_by_number: dict[int, uuid.UUID],
) -> int:
    """Upsert the extracted tables of one document and mirror their JSON to storage.

    For every entry in ``tables`` the matching ``Table`` row, looked up by
    ``(document_id, table_index)``, is created if missing and then has its
    ``markdown``, ``csv_text``, ``json_data`` and ``summary`` overwritten.
    When the table carries a truthy ``json_data`` payload, that payload is
    also uploaded as UTF-8 JSON to ``storage.derived_bucket`` and registered
    through ``ensure_artifact`` as a ``TABLE_JSON`` artifact.

    The session is flushed after each table but never committed here.

    Args:
        db: Database session used for the lookup, ``add`` and ``flush``
            (presumably a SQLAlchemy ``Session`` - confirm against callers).
        storage: Object-storage client that receives the JSON blobs.
        document_id: Identifier of the document the tables belong to.
        tables: Tables extracted from the document.
        page_id_by_number: Maps a page number to its page row id; a page
            number that is absent yields ``page_id=None`` on new rows.

    Returns:
        The number of tables processed (one per element of ``tables``).
    """
    count = 0
    for t in tables:
        # Reuse the row for this (document_id, table_index) if there is one,
        # so processing the same table again updates it in place.
        existing = db.execute(
            select(Table).where(
                Table.document_id == document_id,
                Table.table_index == t.table_index,
            )
        ).scalar_one_or_none()
        if existing is None:
            # NOTE(review): page_id / page_number are only set on creation and
            # are not refreshed for an existing row - confirm this is intended.
            existing = Table(
                document_id=document_id,
                page_id=page_id_by_number.get(t.page_number),
                page_number=t.page_number,
                table_index=t.table_index,
            )
            db.add(existing)

        # Content fields are overwritten on both new and existing rows.
        existing.markdown = t.markdown or ""
        existing.csv_text = t.csv_text
        existing.json_data = t.json_data
        existing.summary = _summary(t)
        db.flush()

        # Persist json blob to MinIO for large/inspectable copies.
        if t.json_data:
            key = key_table_json(document_id, t.table_index)
            storage.put_bytes(
                bucket=storage.derived_bucket,
                key=key,
                data=json.dumps(t.json_data, ensure_ascii=False).encode("utf-8"),
                content_type="application/json",
            )
            ensure_artifact(
                db,
                document_id=document_id,
                artifact_type=ArtifactType.TABLE_JSON,
                bucket=storage.derived_bucket,
                key=key,
                page_number=t.page_number,
            )
        count += 1
    return count
def _summary(t: ExtractedTable) -> str:
    """Build a one-line, human-readable description of an extracted table.

    The row count is derived from the table's markdown: lines that start
    with ``|`` are counted and two are subtracted (presumably the header
    row and the header/body separator row), never going below zero.
    A missing markdown value is treated as an empty string.

    Args:
        t: Table providing ``markdown``, ``table_index`` and ``page_number``.

    Returns:
        A string of the form ``"Table <index> on page <page> (<n> rows)."``.
    """
    md = t.markdown or ""
    pipe_lines = sum(1 for ln in md.splitlines() if ln.startswith("|"))
    # Clamp at zero so empty or header-only markdown never reports a negative count.
    n_rows = max(0, pipe_lines - 2)
    return f"Table {t.table_index} on page {t.page_number} ({n_rows} rows)."