Files
LegacyHUB/app/ingestion/table_processor.py
Vadim Malanov 7f72171572 chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 16:41:50 +03:00

85 lines
2.6 KiB
Python

"""Persists Docling tables to PostgreSQL + MinIO."""
from __future__ import annotations
import json
import uuid
from sqlalchemy import select
from app.db.models import ArtifactType, DocumentArtifact, Table
from app.ingestion.docling_extractor import ExtractedTable
from app.logging_config import get_logger
from app.storage.local_paths import key_table_json
from app.storage.minio_client import MinioStorage
logger = get_logger(__name__)
def persist_tables(
    db,
    storage: MinioStorage,
    document_id: uuid.UUID,
    tables: list[ExtractedTable],
    page_id_by_number: dict[int, uuid.UUID],
) -> int:
    """Upsert extracted tables for one document and mirror their JSON to object storage.

    For each entry in ``tables`` the matching ``Table`` row is looked up by
    ``(document_id, table_index)``. A missing row is created; in both cases
    the markdown, CSV, JSON and summary fields are overwritten with the
    freshly extracted values and the session is flushed.

    When a table carries truthy ``json_data``, that payload is additionally
    serialized as UTF-8 JSON, uploaded via ``storage.put_bytes`` into
    ``storage.derived_bucket`` and registered as a ``TABLE_JSON`` artifact.

    Args:
        db: Session-like object exposing ``execute``, ``add`` and ``flush``.
        storage: Object-storage client used for the JSON copy.
        document_id: Identifier of the document that owns the tables.
        tables: Extracted tables to persist.
        page_id_by_number: Maps a page number to its page row id; a page
            number absent from the mapping yields ``page_id=None``.

    Returns:
        The number of tables processed (one per element of ``tables``).
    """
    persisted = 0

    for table in tables:
        lookup = select(Table).where(
            Table.document_id == document_id,
            Table.table_index == table.table_index,
        )
        row = db.execute(lookup).scalar_one_or_none()

        if row is None:
            # Page linkage is only assigned on first insert; a row that
            # already exists keeps whatever page_id/page_number it had.
            row = Table(
                document_id=document_id,
                page_id=page_id_by_number.get(table.page_number),
                page_number=table.page_number,
                table_index=table.table_index,
            )
            db.add(row)

        # Content fields are refreshed on every run, new row or not.
        row.markdown = table.markdown or ""
        row.csv_text = table.csv_text
        row.json_data = table.json_data
        row.summary = _summary(table)
        db.flush()

        # Keep a standalone JSON copy in object storage for large/inspectable access.
        if table.json_data:
            bucket = storage.derived_bucket
            object_key = key_table_json(document_id, table.table_index)
            payload = json.dumps(table.json_data, ensure_ascii=False).encode("utf-8")
            storage.put_bytes(
                bucket=bucket,
                key=object_key,
                data=payload,
                content_type="application/json",
            )
            _ensure_artifact(
                db,
                document_id,
                ArtifactType.TABLE_JSON,
                storage.derived_bucket,
                object_key,
                table.page_number,
            )

        persisted += 1

    return persisted
def _summary(t: ExtractedTable) -> str:
    """Build the one-line human-readable summary stored with a table.

    The row count is estimated from the markdown rendering: every line that
    starts with ``|`` is counted and two are subtracted (presumably the
    header and delimiter rows of a pipe table), never going below zero.
    A missing markdown value is treated as an empty string.
    """
    pipe_lines = [line for line in (t.markdown or "").splitlines() if line.startswith("|")]
    body_rows = len(pipe_lines) - 2
    if body_rows < 0:
        body_rows = 0
    return f"Table {t.table_index} on page {t.page_number} ({body_rows} rows)."
def _ensure_artifact(db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, page: int | None) -> None:
    """Register a ``DocumentArtifact`` for a stored object unless one already exists.

    An artifact counts as already registered when a row with the same
    ``document_id`` and ``storage_key`` is found; the artifact type and
    bucket are not part of that lookup. The new row is only added to the
    session here; this function does not flush or commit.

    Args:
        db: Session-like object exposing ``execute`` and ``add``.
        document_id: Identifier of the owning document.
        artifact_type: Artifact type value stored on the new row.
        bucket: Storage bucket holding the object.
        key: Storage key of the object; used for the existence check.
        page: Page number associated with the artifact, if any.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    already_registered = db.execute(lookup).scalar_one_or_none()

    if not already_registered:
        artifact = DocumentArtifact(
            document_id=document_id,
            artifact_type=artifact_type,
            storage_bucket=bucket,
            storage_key=key,
            page_number=page,
        )
        db.add(artifact)