chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
84
app/ingestion/table_processor.py
Normal file
84
app/ingestion/table_processor.py
Normal file
@@ -0,0 +1,84 @@
|
||||
"""Persists Docling tables to PostgreSQL + MinIO."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import uuid
|
||||
|
||||
from sqlalchemy import select
|
||||
|
||||
from app.db.models import ArtifactType, DocumentArtifact, Table
|
||||
from app.ingestion.docling_extractor import ExtractedTable
|
||||
from app.logging_config import get_logger
|
||||
from app.storage.local_paths import key_table_json
|
||||
from app.storage.minio_client import MinioStorage
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
def persist_tables(
    db,
    storage: MinioStorage,
    document_id: uuid.UUID,
    tables: list[ExtractedTable],
    page_id_by_number: dict[int, uuid.UUID],
) -> int:
    """Upsert extracted tables and mirror their JSON payloads to object storage.

    For every entry in ``tables`` a ``Table`` row is looked up by
    ``(document_id, table_index)`` and created when none is found. The row's
    markdown, CSV text, JSON data and summary are then overwritten and the
    session is flushed (no commit is issued here). When the table carries a
    truthy ``json_data``, it is serialised as UTF-8 JSON, uploaded to
    ``storage.derived_bucket`` and registered through ``_ensure_artifact``.

    Args:
        db: Session-like object exposing ``execute``, ``add`` and ``flush``.
        storage: Object-store client providing ``put_bytes`` and
            ``derived_bucket``.
        document_id: Identifier of the document that owns the tables.
        tables: Extracted tables to persist.
        page_id_by_number: Maps a page number to its page row id; a page
            number missing from the mapping yields a ``None`` ``page_id``.

    Returns:
        The number of tables processed.
    """
    processed = 0
    for processed, extracted in enumerate(tables, start=1):
        lookup = select(Table).where(
            Table.document_id == document_id,
            Table.table_index == extracted.table_index,
        )
        row = db.execute(lookup).scalar_one_or_none()
        if row is None:
            # NOTE(review): page_id / page_number are only set on creation;
            # rows that already exist keep whatever they had. Confirm this is
            # intended for re-ingestion.
            row = Table(
                document_id=document_id,
                page_id=page_id_by_number.get(extracted.page_number),
                page_number=extracted.page_number,
                table_index=extracted.table_index,
            )
            db.add(row)

        # Content fields are refreshed on every run, new row or not.
        row.markdown = extracted.markdown or ""
        row.csv_text = extracted.csv_text
        row.json_data = extracted.json_data
        row.summary = _summary(extracted)
        db.flush()

        # Tables without a JSON payload have nothing to mirror.
        if not extracted.json_data:
            continue

        # Persist json blob to MinIO for large/inspectable copies.
        key = key_table_json(document_id, extracted.table_index)
        payload = json.dumps(extracted.json_data, ensure_ascii=False).encode("utf-8")
        storage.put_bytes(
            bucket=storage.derived_bucket,
            key=key,
            data=payload,
            content_type="application/json",
        )
        _ensure_artifact(
            db,
            document_id,
            ArtifactType.TABLE_JSON,
            storage.derived_bucket,
            key,
            extracted.page_number,
        )

    return processed
|
||||
|
||||
|
||||
def _summary(t: ExtractedTable) -> str:
|
||||
md = t.markdown or ""
|
||||
n_rows = max(0, sum(1 for ln in md.splitlines() if ln.startswith("|")) - 2)
|
||||
return f"Table {t.table_index} on page {t.page_number} ({n_rows} rows)."
|
||||
|
||||
|
||||
def _ensure_artifact(db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, page: int | None) -> None:
    """Record a stored object as a ``DocumentArtifact`` unless one already exists.

    An artifact counts as present when a row with the same ``document_id``
    and ``storage_key`` is found; in that case nothing is modified. Otherwise
    a new row is added to the session. No flush or commit is performed here.

    Args:
        db: Session-like object exposing ``execute`` and ``add``.
        document_id: Identifier of the owning document.
        artifact_type: Value stored in the row's ``artifact_type`` column.
        bucket: Storage bucket holding the object.
        key: Storage key of the object; used for the existence check.
        page: Page number associated with the artifact, or ``None``.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    already_recorded = db.execute(lookup).scalar_one_or_none()
    if not already_recorded:
        artifact = DocumentArtifact(
            document_id=document_id,
            artifact_type=artifact_type,
            storage_bucket=bucket,
            storage_key=key,
            page_number=page,
        )
        db.add(artifact)
|
||||
Reference in New Issue
Block a user