chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
78
app/ingestion/figure_processor.py
Normal file
78
app/ingestion/figure_processor.py
Normal file
@@ -0,0 +1,78 @@
|
||||
"""Persists Docling figures to PostgreSQL + MinIO (caption + optional crop)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
|
||||
from sqlalchemy import select
|
||||
|
||||
from app.db.models import ArtifactType, DocumentArtifact, Figure
|
||||
from app.ingestion.docling_extractor import ExtractedFigure
|
||||
from app.logging_config import get_logger
|
||||
from app.storage.local_paths import key_figure_crop
|
||||
from app.storage.minio_client import MinioStorage
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
# File-extension spellings whose registered image subtype differs from the
# extension itself (e.g. a ".jpg" file is served as "image/jpeg").
_IMAGE_SUBTYPE_ALIASES = {
    "jpg": "jpeg",
    "jpe": "jpeg",
    "tif": "tiff",
    "svg": "svg+xml",
}


def _figure_content_type(image_ext: str | None) -> str:
    """Return the MIME type to store with a figure crop for ``image_ext``.

    The extension is normalised (surrounding whitespace and leading dots
    removed, lower-cased) and mapped to its registered image subtype where the
    two differ. A missing or empty extension yields
    ``application/octet-stream`` instead of a malformed ``image/None``.
    """
    ext = str(image_ext or "").strip().lstrip(".").lower()
    if not ext:
        return "application/octet-stream"
    return f"image/{_IMAGE_SUBTYPE_ALIASES.get(ext, ext)}"


def persist_figures(
    db,
    storage: MinioStorage,
    document_id: uuid.UUID,
    figures: list[ExtractedFigure],
    page_id_by_number: dict[int, uuid.UUID],
) -> int:
    """Upsert one ``Figure`` row per extracted figure and upload its crop.

    For each figure the row is looked up by ``(document_id, figure_index)``.
    A missing row is created and added to ``db``; ``page_id`` and
    ``page_number`` are only set at creation time. ``caption`` and
    ``description`` are overwritten on every call. When the figure carries
    image bytes they are written to ``storage.derived_bucket`` and a
    ``FIGURE_CROP`` artifact is registered for the resulting key.

    This function does not flush or commit ``db``; presumably the caller owns
    the transaction.

    Args:
        db: Session-like object exposing ``execute`` and ``add``.
        storage: Object storage client; ``put_bytes`` and ``derived_bucket``
            are used.
        document_id: Identifier of the document the figures belong to.
        figures: Figures to persist.
        page_id_by_number: Maps a page number to its page row id; pages that
            are absent from the mapping give ``page_id=None``.

    Returns:
        The number of figures processed (one per item in ``figures``).
    """
    count = 0
    for f in figures:
        existing = db.execute(
            select(Figure).where(Figure.document_id == document_id, Figure.figure_index == f.figure_index)
        ).scalar_one_or_none()
        if existing is None:
            existing = Figure(
                document_id=document_id,
                page_id=page_id_by_number.get(f.page_number),
                page_number=f.page_number,
                figure_index=f.figure_index,
            )
            db.add(existing)

        existing.caption = f.caption
        existing.description = (
            f"Figure detected on page {f.page_number}." if not f.caption else
            f"Figure on page {f.page_number}. Caption: {f.caption}"
        )

        if f.image_bytes:
            key = key_figure_crop(document_id, f.page_number, f.figure_index)
            storage.put_bytes(
                bucket=storage.derived_bucket,
                key=key,
                data=f.image_bytes,
                # Normalised so that e.g. "jpg" is stored as "image/jpeg".
                content_type=_figure_content_type(f.image_ext),
            )
            existing.storage_bucket = storage.derived_bucket
            existing.storage_key = key
            _ensure_artifact(db, document_id, ArtifactType.FIGURE_CROP, storage.derived_bucket, key, f.page_number)

        count += 1
    return count
|
||||
|
||||
|
||||
def _ensure_artifact(db, document_id: uuid.UUID, artifact_type: str, bucket: str, key: str, page: int | None) -> None:
    """Add a ``DocumentArtifact`` row for ``key`` unless the document has one.

    An artifact is considered present when a row with the same
    ``document_id`` and ``storage_key`` exists; ``artifact_type`` and
    ``bucket`` are not part of that lookup. When nothing is found, a new row
    is added to ``db`` (no flush or commit happens here).

    Args:
        db: Session-like object exposing ``execute`` and ``add``.
        document_id: Owning document identifier.
        artifact_type: Value stored in ``DocumentArtifact.artifact_type``.
        bucket: Storage bucket holding the object.
        key: Storage key of the object; used for the existence check.
        page: Page number recorded on the artifact, or ``None``.
    """
    lookup = select(DocumentArtifact).where(
        DocumentArtifact.document_id == document_id,
        DocumentArtifact.storage_key == key,
    )
    already_registered = db.execute(lookup).scalar_one_or_none()

    if not already_registered:
        artifact = DocumentArtifact(
            document_id=document_id,
            artifact_type=artifact_type,
            storage_bucket=bucket,
            storage_key=key,
            page_number=page,
        )
        db.add(artifact)
|
||||
Reference in New Issue
Block a user