# LegacyHUB/app/utils/text_cleaning.py

"""Conservative OCR text cleaning.

Goals:
- Drop hyphenation across line breaks (``инвен-\\nтарный`` -> ``инвентарный``).
- Collapse runs of whitespace.
- Strip control chars.
- Preserve all non-letter characters that may carry meaning in legacy/technical
  documents: digits, punctuation, slashes, dashes, dots, parentheses, etc.

We do NOT lowercase, transliterate, or strip punctuation here. ``normalize_for_search``
produces a more aggressive form for indexing, but the original ``text`` is always
kept untouched for citation/display.
"""

from __future__ import annotations

import re
import unicodedata

# C0 control characters and DEL. Tab (\x09), LF (\x0A) and CR (\x0D) are left out of the
# class on purpose: they are treated as whitespace / line endings further down.
_CONTROL_CHARS = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")
# U+00AD SOFT HYPHEN, written as an escape. The character is invisible, so a literal is
# easily lost in editing, and an empty literal silently turns ``str.replace`` into a no-op.
_SOFT_HYPHEN = "\u00ad"
# CRLF and lone CR line endings; folded to LF so the ``\n``-based patterns below apply.
_LINE_ENDINGS = re.compile(r"\r\n?")
# Runs of horizontal whitespace: space, tab and U+00A0 NO-BREAK SPACE (explicit escape).
_MULTI_WS = re.compile(r"[ \t\u00a0]+")
_MULTI_NL = re.compile(r"\n{3,}")
# A hyphen-like character plus the line break, but only between two *letters*
# (``[^\W\d_]`` is ``\w`` minus digits and underscore). Lookarounds keep the neighbouring
# letters unconsumed, so consecutive breaks such as ``a-\nb-\nc`` are all joined.
_HYPHEN_LINEBREAK = re.compile(r"(?<=[^\W\d_])[-‐‑‒–]\n(?=[^\W\d_])")
_TRAILING_WS = re.compile(r"[ \t\u00a0]+\n")


def clean_ocr_text(text: str) -> str:
    """Return a conservatively cleaned copy of OCR output.

    Steps, in order:

    1. NFC-normalize so combining marks merge with their base characters.
    2. Fold ``\\r\\n`` / ``\\r`` line endings to ``\\n``.
    3. Remove soft hyphens (U+00AD) and control characters.
    4. Strip horizontal whitespace that precedes a newline.
    5. Join words hyphenated across a line break (letter, hyphen, newline, letter).
    6. Collapse runs of spaces/tabs/no-break spaces to one space and runs of three
       or more newlines to a blank line.
    7. Strip leading and trailing whitespace.

    A hyphen at a line break is kept when either neighbour is a digit or underscore
    (``123-\\n456``, ``М-\\n16``): numbers are not hyphenated for line wrapping, so in
    that position the dash is part of a code or range.

    Args:
        text: Raw text; any falsy value (``""``, ``None``) yields ``""``.

    Returns:
        The cleaned text. Case and punctuation are left as they are.
    """
    if not text:
        return ""
    # Normalize unicode (NFC) to merge combining marks.
    text = unicodedata.normalize("NFC", text)
    text = _LINE_ENDINGS.sub("\n", text)
    text = text.replace(_SOFT_HYPHEN, "")
    text = _CONTROL_CHARS.sub("", text)
    # Trailing whitespace must go first: OCR often emits "word- \nrest", and the
    # de-hyphenation pattern only matches a hyphen directly followed by the newline.
    text = _TRAILING_WS.sub("\n", text)
    text = _HYPHEN_LINEBREAK.sub("", text)
    text = _MULTI_WS.sub(" ", text)
    text = _MULTI_NL.sub("\n\n", text)
    return text.strip()


# Anything that is not a word character, whitespace, or one of  / - . , № # : ( ) [ ]
_PUNCT_RUN = re.compile(r"[^\w\s/\-.,№#:()\[\]]+", flags=re.UNICODE)
# Any run of whitespace, newlines included.
_WS_RUN = re.compile(r"\s+")


def normalize_for_search(text: str) -> str:
    """Build the lowercase, single-line form of ``text`` used for indexing.

    The input is passed through ``clean_ocr_text`` and lowercased. Every run of
    characters outside the kept set is then replaced by a space. The kept set is
    word characters, whitespace, slashes, dashes, dots, commas, ``№``, ``#``,
    colons, parentheses and square brackets. Finally all whitespace runs are
    collapsed to a single space and the ends are trimmed.

    Returns an empty string for falsy input.
    """
    if not text:
        return ""
    lowered = clean_ocr_text(text).lower()
    without_noise = _PUNCT_RUN.sub(" ", lowered)
    return _WS_RUN.sub(" ", without_noise).strip()


def looks_garbled(text: str, threshold: float = 0.35) -> bool:
    """Heuristically flag text dominated by unexpected symbols.

    A character counts as suspicious when it is not alphanumeric, not whitespace
    and not one of ``.,;:!?-/()[]№#``. The text is reported as garbled when the
    share of suspicious characters in the whole string (whitespace included in
    the total) is strictly greater than ``threshold``.

    Empty input and input shorter than 20 characters are never flagged.
    """
    if not text or len(text) < 20:
        return False
    benign_punct = ".,;:!?-/()[]№#"
    suspicious = 0
    for ch in text:
        if ch.isalnum() or ch.isspace() or ch in benign_punct:
            continue
        suspicious += 1
    return suspicious / len(text) > threshold