chore: bootstrap repository with governance docs

Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 16:41:50 +03:00
commit 7f72171572
157 changed files with 11298 additions and 0 deletions
--- a/app/ingestion/ocr.py
+++ b/app/ingestion/ocr.py
@@ -0,0 +1,87 @@
+"""OCRmyPDF integration with Tesseract.
+
+We treat OCR as best-effort: if the input PDF already has a text layer (or OCR is
+disabled by config), we skip OCR and use the original PDF. On failure, the
+caller is expected to mark the document ``OCR_FAILED`` and continue without it.
+"""
+
+from __future__ import annotations
+
+import shutil
+from dataclasses import dataclass
+from pathlib import Path
+
+import ocrmypdf
+
+from app.config import settings
+from app.logging_config import get_logger
+from app.utils.pdf import has_searchable_text
+
+logger = get_logger(__name__)
+
+
+@dataclass
+class OcrResult:
+    """Outcome of one ``run_ocr`` call, whether OCR ran or was skipped."""
+
+    # Location of the resulting PDF; ``run_ocr`` always sets this to the
+    # ``output_pdf`` it was given.
+    output_path: Path
+    # True when the result came from the skip path (no OCR output was used).
+    skipped: bool
+    # Short machine-readable cause. Values set in this module: "ocr_completed",
+    # "ocr_disabled", "already_searchable", "prior_ocr_found", "digitally_signed".
+    reason: str
+    # Language spec in effect for the call (explicit argument or configured default).
+    languages: str
+
+
+def run_ocr(input_pdf: Path, output_pdf: Path, languages: str | None = None) -> OcrResult:
+    """Produce ``output_pdf`` from ``input_pdf``, running OCRmyPDF only when needed.
+
+    Args:
+        input_pdf: Source PDF.
+        output_pdf: Destination path; its parent directory is created on demand.
+        languages: Language spec handed to OCRmyPDF. Falls back to
+            ``settings.ocr_languages`` when omitted or empty.
+
+    Returns:
+        An ``OcrResult`` pointing at ``output_pdf``. ``skipped`` is true, and the
+        output is materialised by ``_skip`` instead of OCRmyPDF, when OCR is
+        disabled by config, the input already has searchable text, OCRmyPDF
+        reports prior OCR, or the PDF is digitally signed.
+
+    Raises:
+        ocrmypdf.exceptions.EncryptedPdfError: Logged as a warning, then re-raised.
+        ocrmypdf.exceptions.MissingDependencyError: Logged as an error, then re-raised.
+        Exception: Any other failure propagates untouched; the caller is
+            responsible for the status update.
+    """
+    lang_spec = languages if languages else settings.ocr_languages
+
+    # Decide up front whether OCR can be bypassed; the config switch is checked
+    # first so a disabled pipeline never inspects the PDF.
+    skip_reason: str | None = None
+    if not settings.ocr_enabled:
+        skip_reason = "ocr_disabled"
+    elif has_searchable_text(input_pdf):
+        skip_reason = "already_searchable"
+    if skip_reason is not None:
+        return _skip(input_pdf, output_pdf, lang_spec, skip_reason)
+
+    output_pdf.parent.mkdir(parents=True, exist_ok=True)
+    source, target = str(input_pdf), str(output_pdf)
+    logger.info("ocr.start", input=source, output=target, languages=lang_spec)
+
+    ocr_options = {
+        "language": lang_spec,
+        # None of the text-handling modes is enabled, so a PDF that already
+        # carries text is reported via PriorOcrFoundError (handled below)
+        # rather than being re-processed.
+        "skip_text": False,
+        "redo_ocr": False,
+        "force_ocr": False,
+        "deskew": settings.ocr_deskew,
+        "clean": settings.ocr_clean,
+        "optimize": settings.ocr_optimize,
+        "progress_bar": False,
+        "jobs": 1,
+        "output_type": "pdf",
+        # OCRmyPDF's ``skip_big`` threshold is in megapixels: pages larger
+        # than this are passed through without OCR.
+        "skip_big": 200.0,
+    }
+
+    try:
+        ocrmypdf.ocr(input_file=source, output_file=target, **ocr_options)
+    except ocrmypdf.exceptions.PriorOcrFoundError:
+        # Benign: fall back to the original document.
+        logger.info("ocr.skip.prior_ocr", input=source)
+        return _skip(input_pdf, output_pdf, lang_spec, "prior_ocr_found")
+    except ocrmypdf.exceptions.DigitalSignatureError:
+        # Signed PDFs are kept as they are and treated as a skip.
+        logger.warning("ocr.skip.signed_pdf", input=source)
+        return _skip(input_pdf, output_pdf, lang_spec, "digitally_signed")
+    except ocrmypdf.exceptions.EncryptedPdfError as exc:
+        logger.warning("ocr.encrypted", input=source, error=str(exc))
+        raise
+    except ocrmypdf.exceptions.MissingDependencyError as exc:
+        logger.error("ocr.missing_dependency", error=str(exc))
+        raise
+    else:
+        logger.info("ocr.done", output=target)
+        return OcrResult(
+            output_path=output_pdf,
+            skipped=False,
+            reason="ocr_completed",
+            languages=lang_spec,
+        )
+
+
+def _skip(input_pdf: Path, output_pdf: Path, langs: str, reason: str) -> OcrResult:
+    """Materialise ``output_pdf`` as a copy of ``input_pdf`` without running OCR.
+
+    Args:
+        input_pdf: Source PDF to pass through unchanged.
+        output_pdf: Destination path; its parent directory is created on demand.
+        langs: Language spec recorded on the result (no OCR is performed).
+        reason: Short cause string recorded on the result.
+
+    Returns:
+        An ``OcrResult`` with ``skipped=True`` pointing at ``output_pdf``.
+
+    Raises:
+        OSError: If the input cannot be read or the output cannot be written
+            (for example ``FileNotFoundError`` when ``input_pdf`` is missing).
+    """
+    output_pdf.parent.mkdir(parents=True, exist_ok=True)
+    try:
+        # Stream the copy instead of loading the whole PDF into memory; scanned
+        # documents can be very large.
+        shutil.copyfile(input_pdf, output_pdf)
+    except shutil.SameFileError:
+        # Input and output are already the same file (same path, symlink or
+        # hard link). Nothing to copy, and rewriting it in place would only
+        # risk truncating the source.
+        pass
+    return OcrResult(output_path=output_pdf, skipped=True, reason=reason, languages=langs)