chore: bootstrap repository with governance docs

Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Vadim Malanov
2026-05-13 16:41:50 +03:00
commit 7f72171572
157 changed files with 11298 additions and 0 deletions

View File

@@ -0,0 +1,78 @@
"""Persists Docling figures to PostgreSQL + MinIO (caption + optional crop)."""
from __future__ import annotations
import uuid
from sqlalchemy import select
from app.db.models import ArtifactType, DocumentArtifact, Figure
from app.ingestion.docling_extractor import ExtractedFigure
from app.logging_config import get_logger
from app.storage.local_paths import key_figure_crop
from app.storage.minio_client import MinioStorage
logger = get_logger(__name__)
def persist_figures(
    db,
    storage: MinioStorage,
    document_id: uuid.UUID,
    figures: list[ExtractedFigure],
    page_id_by_number: dict[int, uuid.UUID],
) -> int:
    """Upsert one ``Figure`` row per extracted figure and upload its crop.

    Each figure is matched to an existing row by ``(document_id,
    figure_index)``; a new row is added to the session when none is found.
    Caption and description are always rewritten. When the figure carries
    image bytes they are uploaded to ``storage.derived_bucket`` under the key
    produced by ``key_figure_crop``, the location is recorded on the row, and
    a ``FIGURE_CROP`` artifact is registered for it.

    No commit is issued here; rows are only added to / modified in ``db``.

    Args:
        db: Session-like object exposing ``execute`` and ``add``.
        storage: Object store used for the optional crop upload.
        document_id: Identifier of the document the figures belong to.
        figures: Figures produced by the extractor.
        page_id_by_number: Maps a page number to the id of its page row;
            pages missing from the mapping yield ``None``.

    Returns:
        The number of figures processed.
    """
    count = 0
    for f in figures:
        page_id = page_id_by_number.get(f.page_number)
        existing = db.execute(
            select(Figure).where(
                Figure.document_id == document_id,
                Figure.figure_index == f.figure_index,
            )
        ).scalar_one_or_none()
        if existing is None:
            existing = Figure(
                document_id=document_id,
                page_id=page_id,
                page_number=f.page_number,
                figure_index=f.figure_index,
            )
            db.add(existing)
        elif existing.page_number != f.page_number or page_id is not None:
            # Re-ingestion: the description and storage key below are rebuilt
            # from the new page number, so the page columns must follow or the
            # row becomes self-contradictory. A known link is kept only when
            # the page is unchanged and the mapping has nothing to offer.
            existing.page_number = f.page_number
            existing.page_id = page_id

        existing.caption = f.caption
        if f.caption:
            existing.description = f"Figure on page {f.page_number}. Caption: {f.caption}"
        else:
            existing.description = f"Figure detected on page {f.page_number}."

        if f.image_bytes:
            key = key_figure_crop(document_id, f.page_number, f.figure_index)
            storage.put_bytes(
                bucket=storage.derived_bucket,
                key=key,
                data=f.image_bytes,
                content_type=f"image/{f.image_ext}",
            )
            existing.storage_bucket = storage.derived_bucket
            existing.storage_key = key
            _ensure_artifact(
                db,
                document_id,
                ArtifactType.FIGURE_CROP,
                storage.derived_bucket,
                key,
                f.page_number,
            )
        count += 1
    return count
def _ensure_artifact(
    db,
    document_id: uuid.UUID,
    artifact_type: str,
    bucket: str,
    key: str,
    page: int | None,
) -> None:
    """Add a ``DocumentArtifact`` for ``key`` unless the document already has one.

    The existence check matches on ``document_id`` and ``storage_key`` only;
    ``artifact_type``, ``bucket`` and ``page`` are used solely to populate a
    newly created row. Nothing is committed here.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    already_registered = db.execute(lookup).scalar_one_or_none()
    if not already_registered:
        artifact = DocumentArtifact(
            document_id=document_id,
            artifact_type=artifact_type,
            storage_bucket=bucket,
            storage_key=key,
            page_number=page,
        )
        db.add(artifact)