chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
87
app/ingestion/ocr.py
Normal file
87
app/ingestion/ocr.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""OCRmyPDF integration with Tesseract.
|
||||
|
||||
We treat OCR as best-effort: if the input PDF already has a text layer (or OCR is
|
||||
disabled by config), we skip OCR and use the original PDF. On failure, the
|
||||
caller is expected to mark the document ``OCR_FAILED`` and continue without it.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import ocrmypdf
|
||||
|
||||
from app.config import settings
|
||||
from app.logging_config import get_logger
|
||||
from app.utils.pdf import has_searchable_text
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
class OcrResult:
    """Outcome of one ``run_ocr`` call, whether OCR ran or was skipped."""

    # Path of the PDF that was produced for downstream consumers.
    output_path: Path
    # True when OCR did not run and the input was passed through instead.
    skipped: bool
    # Short machine-readable outcome code, e.g. "ocr_completed" or "ocr_disabled".
    reason: str
    # Language setting that was in effect for this call.
    languages: str
|
||||
|
||||
|
||||
def run_ocr(input_pdf: Path, output_pdf: Path, languages: str | None = None) -> OcrResult:
    """Produce ``output_pdf`` from ``input_pdf``, running OCRmyPDF when needed.

    The language setting is ``languages`` when truthy, otherwise
    ``settings.ocr_languages``.

    OCR is bypassed and the input is passed through via ``_skip`` when:

    - ``settings.ocr_enabled`` is false (reason ``ocr_disabled``);
    - ``has_searchable_text`` is true for the input (reason ``already_searchable``);
    - OCRmyPDF raises ``PriorOcrFoundError`` (reason ``prior_ocr_found``);
    - OCRmyPDF raises ``DigitalSignatureError`` (reason ``digitally_signed``).

    ``EncryptedPdfError`` and ``MissingDependencyError`` are logged and
    re-raised. Any other exception propagates untouched; the caller is
    expected to handle the resulting status update.
    """
    lang_spec = languages or settings.ocr_languages

    # Check the config flag first so the PDF is only inspected when OCR is enabled.
    skip_reason = None
    if not settings.ocr_enabled:
        skip_reason = "ocr_disabled"
    elif has_searchable_text(input_pdf):
        skip_reason = "already_searchable"
    if skip_reason is not None:
        return _skip(input_pdf, output_pdf, lang_spec, skip_reason)

    src = str(input_pdf)
    dst = str(output_pdf)
    output_pdf.parent.mkdir(parents=True, exist_ok=True)
    logger.info("ocr.start", input=src, output=dst, languages=lang_spec)

    ocr_options = {
        "input_file": src,
        "output_file": dst,
        "language": lang_spec,
        # None of the skip/redo/force modes is selected; a PDF that already
        # carries text is expected to surface as PriorOcrFoundError below.
        "skip_text": False,
        "redo_ocr": False,
        "force_ocr": False,
        "deskew": settings.ocr_deskew,
        "clean": settings.ocr_clean,
        "optimize": settings.ocr_optimize,
        "progress_bar": False,
        "jobs": 1,
        "output_type": "pdf",
        # NOTE(review): per the OCRmyPDF docs, skip_big is a per-page
        # megapixel ceiling above which OCR is skipped for that page; it does
        # not govern pages that already contain text -- confirm 200.0 is intended.
        "skip_big": 200.0,
    }

    try:
        ocrmypdf.ocr(**ocr_options)
    except ocrmypdf.exceptions.PriorOcrFoundError:
        logger.info("ocr.skip.prior_ocr", input=src)
        return _skip(input_pdf, output_pdf, lang_spec, "prior_ocr_found")
    except ocrmypdf.exceptions.DigitalSignatureError:
        logger.warning("ocr.skip.signed_pdf", input=src)
        return _skip(input_pdf, output_pdf, lang_spec, "digitally_signed")
    except ocrmypdf.exceptions.EncryptedPdfError as err:
        logger.warning("ocr.encrypted", input=src, error=str(err))
        raise
    except ocrmypdf.exceptions.MissingDependencyError as err:
        logger.error("ocr.missing_dependency", error=str(err))
        raise
    else:
        logger.info("ocr.done", output=dst)
        return OcrResult(
            output_path=output_pdf,
            skipped=False,
            reason="ocr_completed",
            languages=lang_spec,
        )
|
||||
|
||||
|
||||
def _skip(input_pdf: Path, output_pdf: Path, langs: str, reason: str) -> OcrResult:
    """Pass the input PDF through unchanged and report OCR as skipped.

    Ensures ``output_pdf``'s parent directory exists, then copies
    ``input_pdf`` to ``output_pdf`` unless both already refer to the same
    file on disk.

    Args:
        input_pdf: Source PDF to pass through.
        output_pdf: Destination path that downstream code will read.
        langs: Language setting recorded on the result.
        reason: Machine-readable code explaining why OCR was skipped.

    Returns:
        An ``OcrResult`` with ``skipped=True`` pointing at ``output_pdf``.

    Raises:
        OSError: If the input cannot be read or the output cannot be written
            (including ``FileNotFoundError`` when ``input_pdf`` is missing).
    """
    # Function-scope import keeps this block self-contained.
    import shutil

    output_pdf.parent.mkdir(parents=True, exist_ok=True)

    # samefile() compares the underlying file, so symlinks and hard links to
    # the input are recognised and copyfile() never sees source == destination.
    already_in_place = output_pdf.exists() and output_pdf.samefile(input_pdf)
    if not already_in_place:
        # Stream the copy instead of loading the whole PDF into memory.
        shutil.copyfile(input_pdf, output_pdf)

    return OcrResult(output_path=output_pdf, skipped=True, reason=reason, languages=langs)
|
||||
Reference in New Issue
Block a user