Files
LegacyHUB/app/api/routes_ingestion.py
Vadim Malanov 7f72171572 chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 16:41:50 +03:00

64 lines
1.8 KiB
Python

"""Ingestion endpoints."""
from __future__ import annotations
import uuid
from pathlib import Path
from fastapi import APIRouter, HTTPException
from app.api.schemas import IngestFolderRequest, IngestFolderResponse
from app.logging_config import get_logger
# Module-level logger, obtained from the project's logging configuration helper.
logger = get_logger(__name__)
# Router holding the ingestion endpoints; every route is mounted under "/ingest"
# and tagged "ingestion".
router = APIRouter(prefix="/ingest", tags=["ingestion"])
@router.post("/folder", response_model=IngestFolderResponse)
def ingest_folder(req: IngestFolderRequest) -> IngestFolderResponse:
    """Scan the folder at ``req.path`` and enqueue each eligible document.

    Only the discovery pass runs inside the request. Every accepted record is
    handed to the ``process_document`` task together with a fresh run id, so
    the per-document OCR / extraction / indexing work is carried out
    asynchronously by Celery workers.

    Records flagged as duplicates are skipped unless ``req.force`` is set, and
    records without a ``document_id`` are counted as invalid.

    Raises:
        HTTPException: status 400 when ``req.path`` does not exist or is not
            a directory.
    """
    root = Path(req.path)
    if not (root.exists() and root.is_dir()):
        raise HTTPException(status_code=400, detail=f"Folder not found: {req.path}")

    # Deferred imports: keeps module load light.
    from app.ingestion.scanner import discover_documents
    from app.workers.tasks import process_document

    run_id = uuid.uuid4()
    total = 0
    enqueued = 0
    duplicates = 0
    rejected = 0

    records = discover_documents(root, recursive=req.recursive, force=req.force)
    for entry in records:
        total += 1
        if entry.duplicate and not req.force:
            # Already known and no forced re-ingest requested.
            duplicates += 1
        elif not entry.document_id:
            # Nothing to hand to a worker without a document id.
            rejected += 1
        else:
            process_document.delay(str(entry.document_id), str(run_id))
            enqueued += 1

    # One summary event per request, emitted after the whole discovery pass.
    logger.info(
        "ingest.folder.queued",
        path=str(root),
        discovered=total,
        queued=enqueued,
        skipped_duplicates=duplicates,
        invalid=rejected,
        run_id=str(run_id),
    )

    return IngestFolderResponse(
        run_id=run_id,
        discovered=total,
        queued=enqueued,
        skipped_duplicates=duplicates,
        invalid_files=rejected,
    )