"""Ingestion endpoints."""

from __future__ import annotations

import uuid
from pathlib import Path

from fastapi import APIRouter, HTTPException

from app.api.schemas import IngestFolderRequest, IngestFolderResponse
from app.logging_config import get_logger

logger = get_logger(__name__)

router = APIRouter(prefix="/ingest", tags=["ingestion"])


@router.post("/folder", response_model=IngestFolderResponse)
def ingest_folder(req: IngestFolderRequest) -> IngestFolderResponse:
    """Scan ``req.path`` for documents and hand each eligible one to a worker.

    Only the discovery pass runs inside the request; the per-document work
    (OCR / extraction / indexing) is dispatched via ``process_document.delay``
    and is carried out asynchronously by Celery workers.

    Each discovered record is counted in exactly one bucket:

    * duplicate - ``record.duplicate`` is truthy and ``req.force`` is falsy;
    * invalid   - the record carries no ``document_id``;
    * queued    - everything else; a task is enqueued for it.

    Args:
        req: Request body providing ``path``, ``recursive`` and ``force``.

    Returns:
        An ``IngestFolderResponse`` holding the freshly generated ``run_id``
        together with the discovered / queued / duplicate / invalid tallies.

    Raises:
        HTTPException: 400 when ``req.path`` is missing or is not a directory.
    """
    folder = Path(req.path)
    if not (folder.exists() and folder.is_dir()):
        raise HTTPException(status_code=400, detail=f"Folder not found: {req.path}")

    # Lazy import - keeps module load light.
    from app.ingestion.scanner import discover_documents
    from app.workers.tasks import process_document

    # One id per request; every task enqueued below is tagged with it.
    run_id = uuid.uuid4()
    run_id_text = str(run_id)

    n_seen = n_queued = n_duplicate = n_invalid = 0

    records = discover_documents(folder, recursive=req.recursive, force=req.force)
    for record in records:
        n_seen += 1
        if record.duplicate and not req.force:
            # Duplicates are skipped unless the caller forces re-ingestion.
            n_duplicate += 1
        elif not record.document_id:
            # Without a document id there is nothing to pass to the worker.
            n_invalid += 1
        else:
            process_document.delay(str(record.document_id), run_id_text)
            n_queued += 1

    logger.info(
        "ingest.folder.queued",
        path=str(folder),
        discovered=n_seen,
        queued=n_queued,
        skipped_duplicates=n_duplicate,
        invalid=n_invalid,
        run_id=run_id_text,
    )

    return IngestFolderResponse(
        run_id=run_id,
        discovered=n_seen,
        queued=n_queued,
        skipped_duplicates=n_duplicate,
        invalid_files=n_invalid,
    )