"""Tests for ``chunk_extraction`` driven by hand-built ``ExtractionResult`` fixtures."""

from __future__ import annotations

from app.ingestion.chunker import chunk_extraction
from app.ingestion.docling_extractor import (
    ExtractedBlock,
    ExtractedFigure,
    ExtractedPage,
    ExtractedTable,
    ExtractionResult,
)


def _extraction(blocks=None, tables=None, figures=None, pages=None) -> ExtractionResult:
    """Build an ``ExtractionResult`` fixture with empty markdown and JSON payload.

    A falsy ``blocks``, ``tables`` or ``figures`` argument is replaced by an
    empty list; a falsy ``pages`` argument is replaced by a single page
    numbered 1 with empty text.
    """
    if not pages:
        pages = [ExtractedPage(page_number=1, text="")]
    return ExtractionResult(
        markdown="",
        json_payload={},
        blocks=blocks if blocks else [],
        tables=tables if tables else [],
        figures=figures if figures else [],
        pages=pages,
    )


def test_chunker_emits_table_unsplit():
    """One extracted table must come back as exactly one ``table`` chunk."""
    md = "| a | b |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |"
    table = ExtractedTable(page_number=2, table_index=0, markdown=md)

    produced = chunk_extraction(_extraction(tables=[table]))

    table_chunks = [chunk for chunk in produced if chunk.block_type == "table"]
    assert len(table_chunks) == 1
    table_chunk = table_chunks[0]
    assert "| 1 | 2 |" in table_chunk.text
    assert table_chunk.page_number == 2
    assert table_chunk.quality_flags["table_detected"] is True


def test_chunker_handles_paragraphs_with_section_heading():
    """A heading followed by two paragraphs yields narrative chunks with metadata."""
    heading = ExtractedBlock(page_number=1, block_type="heading", text="Глава 1. Введение")
    first_paragraph = ExtractedBlock(
        page_number=1,
        block_type="paragraph",
        text="Первый параграф документа " * 30,
    )
    second_paragraph = ExtractedBlock(
        page_number=1,
        block_type="paragraph",
        text="Второй параграф продолжает тему " * 30,
    )

    produced = chunk_extraction(
        _extraction(blocks=[heading, first_paragraph, second_paragraph])
    )

    text_chunks = [
        chunk for chunk in produced if chunk.block_type in ("paragraph", "heading")
    ]
    assert text_chunks, "expected at least one narrative chunk"

    # The heading text has to surface as context in at least one chunk.
    assert any("Глава 1" in chunk.text for chunk in text_chunks)

    # Every narrative chunk must carry its page number and quality flags.
    for chunk in text_chunks:
        assert chunk.page_number == 1
        assert chunk.quality_flags is not None
        assert "needs_manual_review" in chunk.quality_flags


def test_chunker_emits_figure_caption_chunks():
    """A captioned figure yields a ``figure*`` chunk holding the caption text."""
    figure = ExtractedFigure(page_number=4, figure_index=0, caption="Схема ремонта")

    produced = chunk_extraction(_extraction(figures=[figure]))

    fig_chunks = [chunk for chunk in produced if chunk.block_type.startswith("figure")]
    assert fig_chunks
    first_figure_chunk = fig_chunks[0]
    assert first_figure_chunk.page_number == 4
    assert "Схема ремонта" in first_figure_chunk.text


def test_chunker_splits_giant_block():
    """A single 5000-word paragraph must be split into at least two chunks."""
    huge = " ".join([f"word{i}" for i in range(5000)])
    giant_block = ExtractedBlock(page_number=1, block_type="paragraph", text=huge)

    produced = chunk_extraction(_extraction(blocks=[giant_block]))

    narrative = [chunk for chunk in produced if chunk.block_type == "paragraph"]
    assert len(narrative) > 1  # the giant block must be split