# LegacyHUB/app/utils/pdf.py

"""PDF inspection helpers - decide whether OCR is required."""

from __future__ import annotations

from pathlib import Path

import pikepdf
from pdfminer.high_level import extract_text


def page_count(path: Path | str) -> int:
    """Return the number of pages in the PDF located at ``path``.

    The document is opened with pikepdf and closed again before returning.
    Any error raised while opening the file propagates to the caller.
    """
    with pikepdf.open(str(path)) as document:
        total_pages = len(document.pages)
    return total_pages


def has_searchable_text(path: Path | str, sample_pages: int = 3, min_chars: int = 80) -> bool:
    """Report whether the PDF already carries an extractable text layer.

    Text is pulled from the file with ``maxpages=sample_pages`` and the
    result, stripped of surrounding whitespace, must be at least
    ``min_chars`` characters long to count as searchable.

    Any exception raised during extraction yields ``False``: running OCR on
    a document that did not need it is safer than skipping one that did.
    """
    try:
        raw = extract_text(str(path), maxpages=sample_pages)
    except Exception:
        # Deliberate best-effort: an unreadable file is treated as "needs OCR".
        return False
    # A falsy extraction result is treated as no text at all.
    visible = (raw or "").strip()
    return len(visible) >= min_chars


def is_pdf(path: Path | str) -> bool:
    """Return True when ``path`` is an existing file that looks like a PDF.

    Three conditions must all hold: the path is a regular file, its suffix
    is ``.pdf`` (compared case-insensitively), and its first five bytes are
    the ``%PDF-`` signature. An ``OSError`` while opening or reading the
    file yields ``False``.
    """
    candidate = Path(path)
    if not candidate.is_file():
        return False
    if candidate.suffix.lower() != ".pdf":
        return False
    try:
        with candidate.open("rb") as handle:
            header = handle.read(5)
    except OSError:
        return False
    return header == b"%PDF-"