"""Persists Docling tables to PostgreSQL + MinIO.""" from __future__ import annotations import json import uuid from sqlalchemy import select from app.db.models import ArtifactType, DocumentArtifact, Table from app.ingestion.docling_extractor import ExtractedTable from app.logging_config import get_logger from app.storage.local_paths import key_table_json from app.storage.minio_client import MinioStorage logger = get_logger(__name__) def persist_tables( db, storage: MinioStorage, document_id: uuid.UUID, tables: list[ExtractedTable], page_id_by_number: dict[int, uuid.UUID], ) -> int: count = 0 for t in tables: existing = db.execute( select(Table).where(Table.document_id == document_id, Table.table_index == t.table_index) ).scalar_one_or_none() if existing is None: existing = Table( document_id=document_id, page_id=page_id_by_number.get(t.page_number), page_number=t.page_number, table_index=t.table_index, ) db.add(existing) existing.markdown = t.markdown or "" existing.csv_text = t.csv_text existing.json_data = t.json_data existing.summary = _summary(t) db.flush() # Persist json blob to MinIO for large/inspectable copies. if t.json_data: key = key_table_json(document_id, t.table_index) storage.put_bytes( bucket=storage.derived_bucket, key=key, data=json.dumps(t.json_data, ensure_ascii=False).encode("utf-8"), content_type="application/json", ) _ensure_artifact(db, document_id, ArtifactType.TABLE_JSON, storage.derived_bucket, key, t.page_number) count += 1 return count def _summary(t: ExtractedTable) -> str: md = t.markdown or "" n_rows = max(0, sum(1 for ln in md.splitlines() if ln.startswith("|")) - 2) return f"Table {t.table_index} on page {t.page_number} ({n_rows} rows)." 
def _ensure_artifact(
    db,
    document_id: uuid.UUID,
    artifact_type: str,
    bucket: str,
    key: str,
    page: int | None,
) -> None:
    """Add a ``DocumentArtifact`` row for ``key`` unless one is already present.

    The lookup matches on document id and storage key only. When a matching
    row is found it is left untouched and nothing is added.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    already_recorded = db.execute(lookup).scalar_one_or_none()
    if not already_recorded:
        artifact = DocumentArtifact(
            document_id=document_id,
            artifact_type=artifact_type,
            storage_bucket=bucket,
            storage_key=key,
            page_number=page,
        )
        db.add(artifact)