Files
LegacyHUB/scripts/ingest_folder.py
Vadim Malanov 7f72171572 chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 16:41:50 +03:00

61 lines
2.1 KiB
Python

"""Synchronous CLI ingestion: discover -> queue -> process inline.
Use ``--mode celery`` to push tasks to Celery instead of running inline (the
default ``inline`` mode is convenient for ad-hoc runs without a worker container).
"""
from __future__ import annotations
import sys
import uuid
from pathlib import Path
import click
from app.ingestion.scanner import discover_documents
from app.logging_config import configure_logging, get_logger
# Configure logging once, at import time, before the command below runs.
configure_logging()
# Module-level logger; the command calls it with an event name plus keyword fields.
logger = get_logger(__name__)
@click.command()
@click.option("--path", required=True, type=click.Path(exists=True, file_okay=True, dir_okay=True, path_type=Path))
@click.option("--recursive/--no-recursive", default=True)
@click.option("--force", is_flag=True, default=False, help="Re-process even if SHA already exists")
@click.option("--mode", type=click.Choice(["inline", "celery"]), default="inline")
def main(path: Path, recursive: bool, force: bool, mode: str) -> None:
    # NOTE: Click renders this docstring as the ``--help`` text; everything
    # after the form-feed marker is hidden from the help output.
    """Discover documents under --path and process them inline or via Celery.

    Prints a one-line summary of the run's counters when finished.
    \f

    Args:
        path: Existing file or directory handed to ``discover_documents``.
        recursive: Forwarded to ``discover_documents`` as ``recursive``.
        force: Forwarded to ``discover_documents``; also makes records flagged
            as duplicates get processed instead of skipped.
        mode: ``"inline"`` processes each document in this process;
            ``"celery"`` enqueues it with ``process_document.delay``.
    """
    run_id = uuid.uuid4()

    # Resolve the mode-specific callable once, before the scan, instead of
    # re-running the import statement for every record. The imports stay lazy
    # so an inline run never imports the Celery task module (and vice versa).
    if mode == "celery":
        from app.workers.tasks import process_document
    else:
        from app.ingestion.pipeline import process_document_id

    discovered = queued = dups = invalid = 0
    for record in discover_documents(path, recursive=recursive, force=force):
        discovered += 1
        if record.duplicate and not force:
            dups += 1
            continue
        if not record.document_id:
            # Without a document id there is nothing to hand to the pipeline.
            invalid += 1
            continue
        if mode == "celery":
            # Ids are passed to the task as strings.
            process_document.delay(str(record.document_id), str(run_id))
        else:
            try:
                result = process_document_id(record.document_id, run_id)
                logger.info("ingest.cli.processed", path=str(record.path), result=result)
            except Exception as exc:  # noqa: BLE001
                # Best effort: one failing document must not abort the batch;
                # it is logged and counted under ``invalid``.
                logger.exception("ingest.cli.failed", path=str(record.path), error=str(exc))
                invalid += 1
                continue
        # Counts both enqueued (celery) and successfully processed (inline) documents.
        queued += 1
    click.echo(
        f"discovered={discovered} queued={queued} duplicates={dups} invalid={invalid} run={run_id}"
    )
if __name__ == "__main__":
    # Script entry point: run the Click command and exit with its return
    # value, falling back to status 0 when that value is falsy.
    # NOTE(review): per Click's docs, standalone mode normally exits the
    # interpreter itself, so this fallback is presumably a safeguard - confirm.
    exit_status = main(standalone_mode=True) or 0
    sys.exit(exit_status)