from __future__ import annotations

from app.ingestion.chunker import chunk_extraction
from app.ingestion.docling_extractor import (
    ExtractedBlock,
    ExtractedFigure,
    ExtractedPage,
    ExtractedTable,
    ExtractionResult,
)


def _extraction(blocks=None, tables=None, figures=None, pages=None) -> ExtractionResult:
    """Build an ``ExtractionResult`` fixture with empty markdown and JSON payload.

    Any falsy argument (``None`` or an empty list) falls back to its default:
    an empty list for ``blocks``, ``tables`` and ``figures``, and a single
    blank page numbered 1 for ``pages``.
    """
    block_list = blocks if blocks else []
    table_list = tables if tables else []
    figure_list = figures if figures else []
    # The default page is only constructed when the caller supplied none.
    page_list = pages if pages else [ExtractedPage(page_number=1, text="")]
    return ExtractionResult(
        markdown="",
        json_payload={},
        blocks=block_list,
        tables=table_list,
        figures=figure_list,
        pages=page_list,
    )


def test_chunker_emits_table_unsplit():
    """A single extracted table must come out as exactly one ``table`` chunk.

    The chunk is expected to contain the table's data row, keep the table's
    page number, and carry ``quality_flags["table_detected"] is True``.
    """
    md = "| a | b |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |"
    table = ExtractedTable(page_number=2, table_index=0, markdown=md)

    produced = chunk_extraction(_extraction(tables=[table]))
    table_chunks = [chunk for chunk in produced if chunk.block_type == "table"]

    assert len(table_chunks) == 1
    only_chunk = table_chunks[0]
    assert "| 1 | 2 |" in only_chunk.text
    assert only_chunk.page_number == 2
    assert only_chunk.quality_flags["table_detected"] is True


def test_chunker_handles_paragraphs_with_section_heading():
    """A heading plus two long paragraphs must yield narrative chunks with metadata.

    At least one resulting chunk has to mention the section heading, and every
    narrative chunk has to report page 1 and expose a ``needs_manual_review``
    entry in its quality flags.
    """
    heading = ExtractedBlock(page_number=1, block_type="heading", text="Глава 1. Введение")
    paragraphs = [
        ExtractedBlock(page_number=1, block_type="paragraph", text=sentence * 30)
        for sentence in (
            "Первый параграф документа ",
            "Второй параграф продолжает тему ",
        )
    ]

    produced = chunk_extraction(_extraction(blocks=[heading, *paragraphs]))
    narrative = [
        chunk for chunk in produced if chunk.block_type in ("paragraph", "heading")
    ]

    assert narrative, "expected at least one narrative chunk"

    # The section heading should show up as context in at least one chunk.
    heading_seen = any("Глава 1" in chunk.text for chunk in narrative)
    assert heading_seen

    # Every narrative chunk should carry citation metadata.
    for chunk in narrative:
        assert chunk.page_number == 1
        flags = chunk.quality_flags
        assert flags is not None
        assert "needs_manual_review" in flags


def test_chunker_emits_figure_caption_chunks():
    """A captioned figure must produce a chunk whose type starts with ``figure``.

    The first such chunk is expected to keep the figure's page number and to
    include the caption text.
    """
    figure = ExtractedFigure(page_number=4, figure_index=0, caption="Схема ремонта")

    produced = chunk_extraction(_extraction(figures=[figure]))
    figure_chunks = [
        chunk for chunk in produced if chunk.block_type.startswith("figure")
    ]

    assert figure_chunks
    first_chunk = figure_chunks[0]
    assert first_chunk.page_number == 4
    assert "Схема ремонта" in first_chunk.text


def test_chunker_splits_giant_block():
    """A single 5000-word paragraph must be split into two or more chunks."""
    words = [f"word{i}" for i in range(5000)]
    oversized = ExtractedBlock(
        page_number=1,
        block_type="paragraph",
        text=" ".join(words),
    )

    produced = chunk_extraction(_extraction(blocks=[oversized]))
    paragraph_count = sum(1 for chunk in produced if chunk.block_type == "paragraph")

    # The giant block must not survive as one chunk.
    assert paragraph_count >= 2