"""Conservative OCR text cleaning. Goals: - Drop hyphenation across line breaks (``инвен-\\nтарный`` -> ``инвентарный``). - Collapse runs of whitespace. - Strip control chars. - Preserve all non-letter characters that may carry meaning in legacy/technical documents: digits, punctuation, slashes, dashes, dots, parentheses, etc. We do NOT lowercase, transliterate, or strip punctuation here. ``normalize_for_search`` produces a more aggressive form for indexing, but the original ``text`` is always kept untouched for citation/display. """ from __future__ import annotations import re import unicodedata _CONTROL_CHARS = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]") _SOFT_HYPHEN = "­" _MULTI_WS = re.compile(r"[ \t ]+") _MULTI_NL = re.compile(r"\n{3,}") _HYPHEN_LINEBREAK = re.compile(r"(\w)[-‐‑‒–]\n(\w)") _TRAILING_WS = re.compile(r"[ \t]+\n") def clean_ocr_text(text: str) -> str: if not text: return "" # Normalize unicode (NFC) to merge combining marks. text = unicodedata.normalize("NFC", text) text = text.replace(_SOFT_HYPHEN, "") text = _CONTROL_CHARS.sub("", text) text = _HYPHEN_LINEBREAK.sub(r"\1\2", text) text = _TRAILING_WS.sub("\n", text) text = _MULTI_WS.sub(" ", text) text = _MULTI_NL.sub("\n\n", text) return text.strip() _PUNCT_RUN = re.compile(r"[^\w\s/\-.,№#:()\[\]]+", flags=re.UNICODE) _WS_RUN = re.compile(r"\s+") def normalize_for_search(text: str) -> str: """Lowercase + light normalization for full-text indexing. Preserves digits, alphanumerics, slashes, dashes, dots, commas, ``№``, ``#``, colons and brackets - all of which appear in document/serial/standard codes. """ if not text: return "" text = clean_ocr_text(text) text = text.lower() text = _PUNCT_RUN.sub(" ", text) text = _WS_RUN.sub(" ", text) return text.strip() def looks_garbled(text: str, threshold: float = 0.35) -> bool: """Heuristic: ratio of non-alphanumeric, non-whitespace chars.""" if not text: return False total = len(text) if total < 20: return False bad = sum(1 for c in text if not (c.isalnum() or c.isspace() or c in ".,;:!?-/()[]№#")) return (bad / total) > threshold