chore: bootstrap repository with governance docs

Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Vadim Malanov
2026-05-13 16:41:50 +03:00
commit 7f72171572
157 changed files with 11298 additions and 0 deletions

87
app/ingestion/ocr.py Normal file
View File

@@ -0,0 +1,87 @@
"""OCRmyPDF integration with Tesseract.
We treat OCR as best-effort: if the input PDF already has a text layer (or OCR is
disabled by config), we skip OCR and use the original PDF. On failure, the
caller is expected to mark the document ``OCR_FAILED`` and continue without it.
"""
from __future__ import annotations

import shutil
from dataclasses import dataclass
from pathlib import Path

import ocrmypdf

from app.config import settings
from app.logging_config import get_logger
from app.utils.pdf import has_searchable_text
logger = get_logger(__name__)
@dataclass
class OcrResult:
    """Outcome of one ``run_ocr`` call, whether OCR ran or was skipped."""

    # PDF that downstream steps should read (the ``output_pdf`` given to run_ocr).
    output_path: Path
    # True when OCR was not performed and the input was passed through instead.
    skipped: bool
    # Short machine-readable tag: "ocr_completed", "ocr_disabled",
    # "already_searchable", "prior_ocr_found" or "digitally_signed".
    reason: str
    # Language specification that was resolved for this call.
    languages: str
def run_ocr(input_pdf: Path, output_pdf: Path, languages: str | None = None) -> OcrResult:
    """Produce ``output_pdf`` from ``input_pdf``, running OCRmyPDF when needed.

    OCR is bypassed and the input is passed through to ``output_pdf`` when:

    - ``settings.ocr_enabled`` is false (reason ``ocr_disabled``);
    - ``has_searchable_text`` reports text in the input (``already_searchable``);
    - OCRmyPDF raises ``PriorOcrFoundError`` (``prior_ocr_found``);
    - OCRmyPDF raises ``DigitalSignatureError`` (``digitally_signed``).

    Args:
        input_pdf: PDF to process.
        output_pdf: Destination path; its parent directory is created if absent.
        languages: Language specification handed to OCRmyPDF. Falls back to
            ``settings.ocr_languages`` when ``None`` or empty.

    Returns:
        An ``OcrResult`` pointing at ``output_pdf``; ``skipped`` tells whether
        OCR actually ran and ``reason`` carries one of the tags above or
        ``ocr_completed``.

    Raises:
        ocrmypdf.exceptions.EncryptedPdfError: Logged as a warning, re-raised.
        ocrmypdf.exceptions.MissingDependencyError: Logged as an error, re-raised.
        Exception: Anything else raised by OCRmyPDF propagates unchanged; the
            caller is expected to handle the status update.
    """
    lang_spec = languages if languages else settings.ocr_languages

    # Cheap exits first: neither of these needs to touch OCRmyPDF.
    if not settings.ocr_enabled:
        return _skip(input_pdf, output_pdf, lang_spec, "ocr_disabled")
    if has_searchable_text(input_pdf):
        return _skip(input_pdf, output_pdf, lang_spec, "already_searchable")

    output_pdf.parent.mkdir(parents=True, exist_ok=True)
    source, target = str(input_pdf), str(output_pdf)
    logger.info("ocr.start", input=source, output=target, languages=lang_spec)

    ocr_options = {
        "input_file": source,
        "output_file": target,
        "language": lang_spec,
        # All three text-handling modes are off, so a PDF that already
        # carries text surfaces as PriorOcrFoundError (handled below).
        "skip_text": False,
        "redo_ocr": False,
        "force_ocr": False,
        "deskew": settings.ocr_deskew,
        "clean": settings.ocr_clean,
        "optimize": settings.ocr_optimize,
        "progress_bar": False,
        "jobs": 1,
        "output_type": "pdf",
        # Size cut-off in megapixels: per the OCRmyPDF docs, pages larger
        # than this are not OCRed but are still kept in the output.
        "skip_big": 200.0,
    }

    try:
        ocrmypdf.ocr(**ocr_options)
    except ocrmypdf.exceptions.PriorOcrFoundError:
        # The document already has OCR text: fall back to the original file.
        logger.info("ocr.skip.prior_ocr", input=source)
        return _skip(input_pdf, output_pdf, lang_spec, "prior_ocr_found")
    except ocrmypdf.exceptions.DigitalSignatureError:
        # Signed PDF: pass the original through untouched instead of OCRing.
        logger.warning("ocr.skip.signed_pdf", input=source)
        return _skip(input_pdf, output_pdf, lang_spec, "digitally_signed")
    except ocrmypdf.exceptions.EncryptedPdfError as exc:
        logger.warning("ocr.encrypted", input=source, error=str(exc))
        raise
    except ocrmypdf.exceptions.MissingDependencyError as exc:
        logger.error("ocr.missing_dependency", error=str(exc))
        raise
    else:
        logger.info("ocr.done", output=target)
        return OcrResult(
            output_path=output_pdf,
            skipped=False,
            reason="ocr_completed",
            languages=lang_spec,
        )
def _skip(input_pdf: Path, output_pdf: Path, langs: str, reason: str) -> OcrResult:
    """Pass ``input_pdf`` through to ``output_pdf`` without OCR.

    The parent directory of ``output_pdf`` is created if needed, then the
    input is copied to the output unless the output already is the very same
    file as the input.

    Args:
        input_pdf: Existing PDF to pass through.
        output_pdf: Destination path for the copy.
        langs: Language specification recorded on the result.
        reason: Tag describing why OCR was skipped.

    Returns:
        An ``OcrResult`` with ``skipped=True`` pointing at ``output_pdf``.

    Raises:
        FileNotFoundError: If ``input_pdf`` does not exist.
    """
    output_pdf.parent.mkdir(parents=True, exist_ok=True)

    # samefile() compares the underlying files, so a symlink or hard link back
    # to the input counts as "already in place" and is never rewritten.
    already_in_place = output_pdf.exists() and output_pdf.samefile(input_pdf)
    if not already_in_place:
        # Stream the copy instead of loading the whole PDF into memory.
        shutil.copyfile(input_pdf, output_pdf)

    return OcrResult(output_path=output_pdf, skipped=True, reason=reason, languages=langs)