"""OCRmyPDF integration with Tesseract.

We treat OCR as best-effort: if the input PDF already has a text layer (or OCR
is disabled by config), we skip OCR and use the original PDF. On failure, the
caller is expected to mark the document ``OCR_FAILED`` and continue without it.
"""

from __future__ import annotations

import shutil
from dataclasses import dataclass
from pathlib import Path

import ocrmypdf

from app.config import settings
from app.logging_config import get_logger
from app.utils.pdf import has_searchable_text

logger = get_logger(__name__)


@dataclass
class OcrResult:
    """Outcome of a :func:`run_ocr` call.

    Attributes:
        output_path: The ``output_pdf`` path that was requested; it holds
            either the OCRed PDF or a copy of the input.
        skipped: ``True`` when OCR was not performed and the input was passed
            through unchanged.
        reason: Short tag explaining the outcome. Values produced by this
            module: ``"ocr_completed"``, ``"ocr_disabled"``,
            ``"already_searchable"``, ``"prior_ocr_found"``,
            ``"digitally_signed"``.
        languages: The language setting that was (or would have been) handed
            to OCRmyPDF.
    """

    output_path: Path
    skipped: bool
    reason: str
    languages: str


def run_ocr(input_pdf: Path, output_pdf: Path, languages: str | None = None) -> OcrResult:
    """Run OCRmyPDF on ``input_pdf`` and write the result to ``output_pdf``.

    - If ``OCR_ENABLED`` is false: copy the input as the output and skip.
    - If the input already has searchable text: skip OCR but still produce
      ``output_pdf`` (a copy of the input, to keep downstream code simple).
    - If OCRmyPDF reports prior OCR or a digital signature: skip in the same
      way, passing the input through unchanged.
    - On any other exception: re-raise (caller handles status update).

    Args:
        input_pdf: PDF to process.
        output_pdf: Destination path; parent directories are created.
        languages: Language setting for OCRmyPDF. Falls back to
            ``settings.ocr_languages`` when ``None`` or empty.

    Returns:
        An :class:`OcrResult` describing whether OCR ran and why.

    Raises:
        ocrmypdf.exceptions.EncryptedPdfError: Logged, then re-raised.
        ocrmypdf.exceptions.MissingDependencyError: Logged, then re-raised.
        Exception: Anything else raised by OCRmyPDF propagates untouched.
    """
    langs = languages or settings.ocr_languages

    if not settings.ocr_enabled:
        return _skip(input_pdf, output_pdf, langs, "ocr_disabled")
    if has_searchable_text(input_pdf):
        return _skip(input_pdf, output_pdf, langs, "already_searchable")

    output_pdf.parent.mkdir(parents=True, exist_ok=True)
    logger.info("ocr.start", input=str(input_pdf), output=str(output_pdf), languages=langs)
    try:
        ocrmypdf.ocr(
            input_file=str(input_pdf),
            output_file=str(output_pdf),
            language=langs,
            # None of the "what to do with existing text" modes is enabled, so
            # OCRmyPDF reports pre-existing text via PriorOcrFoundError, which
            # is converted into a skip below.
            skip_text=False,
            redo_ocr=False,
            force_ocr=False,
            deskew=settings.ocr_deskew,
            clean=settings.ocr_clean,
            optimize=settings.ocr_optimize,
            progress_bar=False,
            jobs=1,
            output_type="pdf",
            # Pages larger than this many megapixels are not OCRed but are
            # still included in the output.
            skip_big=200.0,
        )
    except ocrmypdf.exceptions.PriorOcrFoundError:
        logger.info("ocr.skip.prior_ocr", input=str(input_pdf))
        return _skip(input_pdf, output_pdf, langs, "prior_ocr_found")
    except ocrmypdf.exceptions.DigitalSignatureError:
        logger.warning("ocr.skip.signed_pdf", input=str(input_pdf))
        return _skip(input_pdf, output_pdf, langs, "digitally_signed")
    except ocrmypdf.exceptions.EncryptedPdfError as exc:
        logger.warning("ocr.encrypted", input=str(input_pdf), error=str(exc))
        raise
    except ocrmypdf.exceptions.MissingDependencyError as exc:
        logger.error("ocr.missing_dependency", error=str(exc))
        raise

    logger.info("ocr.done", output=str(output_pdf))
    return OcrResult(output_path=output_pdf, skipped=False, reason="ocr_completed", languages=langs)


def _skip(input_pdf: Path, output_pdf: Path, langs: str, reason: str) -> OcrResult:
    """Pass the input through as the output and report a skipped result.

    The input is copied to ``output_pdf`` unless both paths already refer to
    the same file (same path, symlink or hard link). The copy is streamed so
    large PDFs are not loaded into memory in one piece.

    Args:
        input_pdf: Source PDF.
        output_pdf: Destination path; parent directories are created.
        langs: Language setting to record on the result.
        reason: Tag stored in :attr:`OcrResult.reason`.

    Returns:
        An :class:`OcrResult` with ``skipped=True``.
    """
    output_pdf.parent.mkdir(parents=True, exist_ok=True)
    # ``exists()`` is checked first because ``samefile`` stats both paths and
    # would raise if the output has not been created yet.
    already_in_place = output_pdf.exists() and output_pdf.samefile(input_pdf)
    if not already_in_place:
        shutil.copyfile(input_pdf, output_pdf)
    return OcrResult(output_path=output_pdf, skipped=True, reason=reason, languages=langs)