chore: bootstrap repository with governance docs

Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line endings), AGENTS.md (entry points, stack, discovery order, baseline checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion, failures, rollback, scaling notes), .env.prod.example with rotated credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 16:41:50 +03:00
commit 7f72171572
157 changed files with 11298 additions and 0 deletions
--- a/tests/test_chunker.py
+++ b/tests/test_chunker.py
@@ -0,0 +1,81 @@
+from __future__ import annotations
+
+from app.ingestion.chunker import chunk_extraction
+from app.ingestion.docling_extractor import (
+    ExtractedBlock,
+    ExtractedFigure,
+    ExtractedPage,
+    ExtractedTable,
+    ExtractionResult,
+)
+
+
+def _extraction(blocks=None, tables=None, figures=None, pages=None) -> ExtractionResult:
+    """Build an ``ExtractionResult`` fixture with empty markdown and JSON payload.
+
+    Any falsy argument is replaced by a default: an empty list for ``blocks``,
+    ``tables`` and ``figures``, and a single blank page numbered 1 for ``pages``.
+    """
+    if not pages:
+        pages = [ExtractedPage(page_number=1, text="")]
+    return ExtractionResult(
+        markdown="",
+        json_payload={},
+        blocks=blocks if blocks else [],
+        tables=tables if tables else [],
+        figures=figures if figures else [],
+        pages=pages,
+    )
+
+
+def test_chunker_emits_table_unsplit():
+    """A table on page 2 must come back as exactly one ``table`` chunk."""
+    table_md = "| a | b |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |"
+    table = ExtractedTable(page_number=2, table_index=0, markdown=table_md)
+    produced = chunk_extraction(_extraction(tables=[table]))
+    tables_out = [chunk for chunk in produced if chunk.block_type == "table"]
+    assert len(tables_out) == 1
+    only = tables_out[0]
+    # The row content, source page and detection flag must all survive chunking.
+    assert "| 1 | 2 |" in only.text
+    assert only.page_number == 2
+    assert only.quality_flags["table_detected"] is True
+
+
+def test_chunker_handles_paragraphs_with_section_heading():
+    """A heading plus two long paragraphs on page 1 must yield narrative chunks.
+
+    At least one chunk has to contain the heading text, and every narrative
+    chunk has to report page 1 and expose ``quality_flags`` with a
+    ``needs_manual_review`` entry.
+    """
+    heading = ExtractedBlock(page_number=1, block_type="heading", text="Глава 1. Введение")
+    first_paragraph = ExtractedBlock(
+        page_number=1,
+        block_type="paragraph",
+        text="Первый параграф документа " * 30,
+    )
+    second_paragraph = ExtractedBlock(
+        page_number=1,
+        block_type="paragraph",
+        text="Второй параграф продолжает тему " * 30,
+    )
+    source = _extraction(blocks=[heading, first_paragraph, second_paragraph])
+    produced = chunk_extraction(source)
+    narrative = [chunk for chunk in produced if chunk.block_type in ("paragraph", "heading")]
+    assert narrative, "expected at least one narrative chunk"
+    # The heading text must surface as context in at least one chunk.
+    assert any("Глава 1" in chunk.text for chunk in narrative)
+    # Citation metadata: page number and quality flags on every narrative chunk.
+    for chunk in narrative:
+        assert chunk.page_number == 1
+        assert chunk.quality_flags is not None
+        assert "needs_manual_review" in chunk.quality_flags
+
+
+def test_chunker_emits_figure_caption_chunks():
+    """A captioned figure on page 4 must yield a chunk whose type starts with ``figure``."""
+    figure = ExtractedFigure(page_number=4, figure_index=0, caption="Схема ремонта")
+    produced = chunk_extraction(_extraction(figures=[figure]))
+    figure_chunks = [chunk for chunk in produced if chunk.block_type.startswith("figure")]
+    assert figure_chunks
+    leading = figure_chunks[0]
+    # The first figure chunk keeps both the source page and the caption text.
+    assert leading.page_number == 4
+    assert "Схема ремонта" in leading.text
+
+
+def test_chunker_splits_giant_block():
+    """A single 5000-word paragraph must come back as two or more paragraph chunks."""
+    words = [f"word{i}" for i in range(5000)]
+    giant = ExtractedBlock(page_number=1, block_type="paragraph", text=" ".join(words))
+    produced = chunk_extraction(_extraction(blocks=[giant]))
+    paragraph_count = sum(1 for chunk in produced if chunk.block_type == "paragraph")
+    assert paragraph_count >= 2  # the giant block must be split