"""PDF inspection helpers - decide whether OCR is required.""" from __future__ import annotations from pathlib import Path import pikepdf from pdfminer.high_level import extract_text def page_count(path: Path | str) -> int: with pikepdf.open(str(path)) as pdf: return len(pdf.pages) def has_searchable_text(path: Path | str, sample_pages: int = 3, min_chars: int = 80) -> bool: """Cheap check: extract text from first ``sample_pages`` and require ``min_chars``. Returns False on any extraction error - safer to OCR than to skip. """ try: text = extract_text(str(path), maxpages=sample_pages) or "" except Exception: return False return len(text.strip()) >= min_chars def is_pdf(path: Path | str) -> bool: p = Path(path) if not p.is_file() or p.suffix.lower() != ".pdf": return False try: with open(p, "rb") as f: return f.read(5) == b"%PDF-" except OSError: return False