Files
LegacyHUB/tests/test_chunker.py
Vadim Malanov 7f72171572 chore: bootstrap repository with governance docs
Initialize git, add Apache-2.0 LICENSE, .gitattributes (LF line
endings), AGENTS.md (entry points, stack, discovery order, baseline
checks), RUNBOOK.md (dev boot, prod deploy with overlay, ingestion,
failures, rollback, scaling notes), .env.prod.example with rotated
credential placeholders, and dev-only warnings on .env.example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 16:41:50 +03:00

82 lines
3.0 KiB
Python

from __future__ import annotations
from app.ingestion.chunker import chunk_extraction
from app.ingestion.docling_extractor import (
ExtractedBlock,
ExtractedFigure,
ExtractedPage,
ExtractedTable,
ExtractionResult,
)
def _extraction(blocks=None, tables=None, figures=None, pages=None) -> ExtractionResult:
    """Build a minimal ``ExtractionResult`` fixture for the chunker tests.

    Every argument is optional. A falsy ``blocks``, ``tables`` or ``figures``
    becomes an empty list; a falsy ``pages`` becomes a single empty page
    numbered 1. The markdown and JSON payload are always empty.
    """
    if not pages:
        # Fall back to one blank first page when the caller supplies none.
        pages = [ExtractedPage(page_number=1, text="")]
    return ExtractionResult(
        markdown="",
        json_payload={},
        blocks=blocks if blocks else [],
        tables=tables if tables else [],
        figures=figures if figures else [],
        pages=pages,
    )
def test_chunker_emits_table_unsplit():
    """A single extracted table yields exactly one chunk of type ``table``."""
    table_md = (
        "| a | b |\n"
        "| --- | --- |\n"
        "| 1 | 2 |\n"
        "| 3 | 4 |"
    )
    table = ExtractedTable(page_number=2, table_index=0, markdown=table_md)

    produced = chunk_extraction(_extraction(tables=[table]))
    only_tables = [chunk for chunk in produced if chunk.block_type == "table"]

    # Exactly one table chunk is expected for the single input table.
    assert len(only_tables) == 1
    table_chunk = only_tables[0]
    assert "| 1 | 2 |" in table_chunk.text
    assert table_chunk.page_number == 2
    assert table_chunk.quality_flags["table_detected"] is True
def test_chunker_handles_paragraphs_with_section_heading():
    """A heading followed by two long paragraphs yields narrative chunks.

    At least one chunk must contain the heading text, and every narrative
    chunk must carry its page number and quality flags.
    """
    heading = ExtractedBlock(
        page_number=1, block_type="heading", text="Глава 1. Введение"
    )
    first_paragraph = ExtractedBlock(
        page_number=1,
        block_type="paragraph",
        text="Первый параграф документа " * 30,
    )
    second_paragraph = ExtractedBlock(
        page_number=1,
        block_type="paragraph",
        text="Второй параграф продолжает тему " * 30,
    )

    produced = chunk_extraction(
        _extraction(blocks=[heading, first_paragraph, second_paragraph])
    )
    narrative_types = ("paragraph", "heading")
    narrative = [chunk for chunk in produced if chunk.block_type in narrative_types]

    assert narrative, "expected at least one narrative chunk"
    # The section heading should show up as context in at least one chunk.
    assert any("Глава 1" in chunk.text for chunk in narrative)
    # Citation metadata must be present on every narrative chunk.
    for chunk in narrative:
        assert chunk.page_number == 1
        flags = chunk.quality_flags
        assert flags is not None
        assert "needs_manual_review" in flags
def test_chunker_emits_figure_caption_chunks():
    """A captioned figure yields a ``figure*`` chunk with the caption text."""
    figure = ExtractedFigure(page_number=4, figure_index=0, caption="Схема ремонта")

    produced = chunk_extraction(_extraction(figures=[figure]))
    figure_chunks = [
        chunk for chunk in produced if chunk.block_type.startswith("figure")
    ]

    assert figure_chunks
    leading = figure_chunks[0]
    assert leading.page_number == 4
    assert "Схема ремонта" in leading.text
def test_chunker_splits_giant_block():
    """A single 5000-word paragraph must yield at least two paragraph chunks."""
    words = [f"word{i}" for i in range(5000)]
    oversized = ExtractedBlock(
        page_number=1, block_type="paragraph", text=" ".join(words)
    )

    produced = chunk_extraction(_extraction(blocks=[oversized]))
    paragraph_count = sum(1 for chunk in produced if chunk.block_type == "paragraph")

    # The giant block must be split into more than one chunk.
    assert paragraph_count >= 2