Files
dify/api/tests/unit_tests/services/test_summary_index_service.py
2026-03-12 15:34:20 +08:00

1330 lines
49 KiB
Python

"""Unit tests for services.summary_index_service."""
from __future__ import annotations
import sys
from dataclasses import dataclass
from datetime import UTC, datetime
from types import SimpleNamespace
from unittest.mock import MagicMock
import pytest
import services.summary_index_service as summary_module
from services.summary_index_service import SummaryIndexService
@dataclass(frozen=True)
class _SessionContext:
session: MagicMock
def __enter__(self) -> MagicMock:
return self.session
def __exit__(self, exc_type, exc, tb) -> None:
return None
def _dataset(*, indexing_technique: str = "high_quality") -> MagicMock:
dataset = MagicMock(name="dataset")
dataset.id = "dataset-1"
dataset.tenant_id = "tenant-1"
dataset.indexing_technique = indexing_technique
dataset.embedding_model_provider = "openai"
dataset.embedding_model = "text-embedding"
return dataset
def _segment(*, has_document: bool = True) -> MagicMock:
segment = MagicMock(name="segment")
segment.id = "seg-1"
segment.document_id = "doc-1"
segment.dataset_id = "dataset-1"
segment.content = "hello world"
segment.enabled = True
segment.status = "completed"
segment.position = 1
if has_document:
doc = MagicMock(name="document")
doc.doc_language = "en"
doc.doc_form = "text_model"
segment.document = doc
else:
segment.document = None
return segment
def _summary_record(*, summary_content: str = "summary", node_id: str | None = None) -> MagicMock:
record = MagicMock(spec=summary_module.DocumentSegmentSummary, name="summary_record")
record.id = "sum-1"
record.dataset_id = "dataset-1"
record.document_id = "doc-1"
record.chunk_id = "seg-1"
record.summary_content = summary_content
record.summary_index_node_id = node_id
record.summary_index_node_hash = None
record.tokens = None
record.status = "generating"
record.error = None
record.enabled = True
record.created_at = datetime(2024, 1, 1, tzinfo=UTC)
record.updated_at = datetime(2024, 1, 1, tzinfo=UTC)
record.disabled_at = None
record.disabled_by = None
return record
def test_generate_summary_for_segment_passes_document_language(monkeypatch: pytest.MonkeyPatch) -> None:
usage = MagicMock()
usage.total_tokens = 10
usage.prompt_tokens = 3
usage.completion_tokens = 7
paragraph_module = SimpleNamespace(
ParagraphIndexProcessor=SimpleNamespace(generate_summary=MagicMock(return_value=("sum", usage)))
)
monkeypatch.setitem(
sys.modules,
"core.rag.index_processor.processor.paragraph_index_processor",
paragraph_module,
)
segment = _segment(has_document=True)
dataset = _dataset()
content, got_usage = SummaryIndexService.generate_summary_for_segment(segment, dataset, {"a": 1})
assert content == "sum"
assert got_usage is usage
paragraph_module.ParagraphIndexProcessor.generate_summary.assert_called_once()
_, kwargs = paragraph_module.ParagraphIndexProcessor.generate_summary.call_args
assert kwargs["document_language"] == "en"
def test_generate_summary_for_segment_raises_when_empty(monkeypatch: pytest.MonkeyPatch) -> None:
paragraph_module = SimpleNamespace(
ParagraphIndexProcessor=SimpleNamespace(generate_summary=MagicMock(return_value=("", MagicMock())))
)
monkeypatch.setitem(
sys.modules,
"core.rag.index_processor.processor.paragraph_index_processor",
paragraph_module,
)
with pytest.raises(ValueError, match="Generated summary is empty"):
SummaryIndexService.generate_summary_for_segment(_segment(), _dataset(), {"a": 1})
def test_create_summary_record_updates_existing_and_reenables(monkeypatch: pytest.MonkeyPatch) -> None:
existing = _summary_record(summary_content="old", node_id="n1")
existing.enabled = False
existing.disabled_at = datetime(2024, 1, 1)
existing.disabled_by = "u"
session = MagicMock(name="session")
query = MagicMock()
query.filter_by.return_value = query
query.first.return_value = existing
session.query.return_value = query
create_session_mock = MagicMock(return_value=_SessionContext(session))
monkeypatch.setattr(summary_module, "session_factory", SimpleNamespace(create_session=create_session_mock))
segment = _segment()
dataset = _dataset()
result = SummaryIndexService.create_summary_record(segment, dataset, "new", status="generating")
assert result is existing
assert existing.summary_content == "new"
assert existing.status == "generating"
assert existing.enabled is True
assert existing.disabled_at is None
assert existing.disabled_by is None
assert existing.error is None
session.add.assert_called_once_with(existing)
session.flush.assert_called_once()
def test_create_summary_record_creates_new(monkeypatch: pytest.MonkeyPatch) -> None:
session = MagicMock(name="session")
query = MagicMock()
query.filter_by.return_value = query
query.first.return_value = None
session.query.return_value = query
create_session_mock = MagicMock(return_value=_SessionContext(session))
monkeypatch.setattr(summary_module, "session_factory", SimpleNamespace(create_session=create_session_mock))
record = SummaryIndexService.create_summary_record(_segment(), _dataset(), "new", status="generating")
assert record.dataset_id == "dataset-1"
assert record.chunk_id == "seg-1"
assert record.summary_content == "new"
assert record.enabled is True
session.add.assert_called_once()
session.flush.assert_called_once()
def test_vectorize_summary_skips_non_high_quality(monkeypatch: pytest.MonkeyPatch) -> None:
vector_cls = MagicMock()
monkeypatch.setattr(summary_module, "Vector", vector_cls)
SummaryIndexService.vectorize_summary(_summary_record(), _segment(), _dataset(indexing_technique="economy"))
vector_cls.assert_not_called()
def test_vectorize_summary_raises_for_blank_content() -> None:
with pytest.raises(ValueError, match="Summary content is empty"):
SummaryIndexService.vectorize_summary(_summary_record(summary_content=" "), _segment(), _dataset())
def test_vectorize_summary_retries_connection_errors_then_succeeds(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
segment = _segment()
summary = _summary_record(summary_content="sum", node_id=None)
monkeypatch.setattr(summary_module.uuid, "uuid4", MagicMock(return_value="uuid-1"))
monkeypatch.setattr(summary_module.helper, "generate_text_hash", MagicMock(return_value="hash-1"))
embedding_model = MagicMock()
embedding_model.get_text_embedding_num_tokens.return_value = [5]
model_manager = MagicMock()
model_manager.get_model_instance.return_value = embedding_model
monkeypatch.setattr(summary_module, "ModelManager", MagicMock(return_value=model_manager))
vector_instance = MagicMock()
vector_instance.add_texts.side_effect = [RuntimeError("connection timeout"), None]
monkeypatch.setattr(summary_module, "Vector", MagicMock(return_value=vector_instance))
session = MagicMock(name="provided_session")
merged = _summary_record(summary_content="sum")
session.merge.return_value = merged
monkeypatch.setattr(summary_module.time, "sleep", MagicMock())
SummaryIndexService.vectorize_summary(summary, segment, dataset, session=session)
assert vector_instance.add_texts.call_count == 2
summary_module.time.sleep.assert_called_once() # type: ignore[attr-defined]
session.flush.assert_called_once()
assert summary.status == "completed"
assert summary.summary_index_node_id == "uuid-1"
assert summary.summary_index_node_hash == "hash-1"
assert summary.tokens == 5
def test_vectorize_summary_without_session_creates_record_when_missing(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
segment = _segment()
summary = _summary_record(summary_content="sum", node_id="old-node")
monkeypatch.setattr(summary_module.helper, "generate_text_hash", MagicMock(return_value="hash-1"))
# Force deletion branch to run and swallow delete failures.
vector_for_delete = MagicMock()
vector_for_delete.delete_by_ids.side_effect = RuntimeError("delete failed")
vector_for_add = MagicMock()
vector_for_add.add_texts.return_value = None
vector_cls = MagicMock(side_effect=[vector_for_delete, vector_for_add])
monkeypatch.setattr(summary_module, "Vector", vector_cls)
model_manager = MagicMock()
model_manager.get_model_instance.side_effect = RuntimeError("no model")
monkeypatch.setattr(summary_module, "ModelManager", MagicMock(return_value=model_manager))
# New session used after vectorization succeeds (record not found by id nor chunk_id).
session = MagicMock(name="session")
q1 = MagicMock()
q1.filter_by.return_value = q1
q1.first.side_effect = [None, None]
session.query.return_value = q1
create_session_mock = MagicMock(return_value=_SessionContext(session))
monkeypatch.setattr(summary_module, "session_factory", SimpleNamespace(create_session=create_session_mock))
SummaryIndexService.vectorize_summary(summary, segment, dataset, session=None)
# One context for success path, no error handler session.
create_session_mock.assert_called()
session.add.assert_called()
session.commit.assert_called_once()
assert summary.status == "completed"
assert summary.summary_index_node_id == "old-node" # reused
def test_vectorize_summary_final_failure_updates_error_status(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
segment = _segment()
summary = _summary_record(summary_content="sum", node_id=None)
monkeypatch.setattr(summary_module.uuid, "uuid4", MagicMock(return_value="uuid-1"))
monkeypatch.setattr(summary_module.helper, "generate_text_hash", MagicMock(return_value="hash-1"))
monkeypatch.setattr(summary_module.time, "sleep", MagicMock())
vector_instance = MagicMock()
vector_instance.add_texts.side_effect = RuntimeError("boom")
monkeypatch.setattr(summary_module, "Vector", MagicMock(return_value=vector_instance))
# error_session should find record and commit status update
error_session = MagicMock(name="error_session")
q = MagicMock()
q.filter_by.return_value = q
q.first.return_value = summary
error_session.query.return_value = q
create_session_mock = MagicMock(return_value=_SessionContext(error_session))
monkeypatch.setattr(summary_module, "session_factory", SimpleNamespace(create_session=create_session_mock))
with pytest.raises(RuntimeError, match="boom"):
SummaryIndexService.vectorize_summary(summary, segment, dataset, session=None)
assert summary.status == "error"
assert "Vectorization failed" in (summary.error or "")
error_session.commit.assert_called_once()
def test_batch_create_summary_records_no_segments_noop(monkeypatch: pytest.MonkeyPatch) -> None:
create_session_mock = MagicMock()
monkeypatch.setattr(summary_module, "session_factory", SimpleNamespace(create_session=create_session_mock))
SummaryIndexService.batch_create_summary_records([], _dataset())
create_session_mock.assert_not_called()
def test_batch_create_summary_records_creates_and_updates(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
s1 = _segment()
s2 = _segment()
s2.id = "seg-2"
s2.document_id = "doc-2"
existing = _summary_record()
existing.chunk_id = "seg-2"
existing.enabled = False
session = MagicMock()
query = MagicMock()
query.filter.return_value = query
query.all.return_value = [existing]
session.query.return_value = query
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
SummaryIndexService.batch_create_summary_records([s1, s2], dataset, status="not_started")
session.commit.assert_called_once()
assert existing.enabled is True
def test_update_summary_record_error_updates_when_exists(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
segment = _segment()
record = _summary_record()
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.first.return_value = record
session.query.return_value = query
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
SummaryIndexService.update_summary_record_error(segment, dataset, "err")
assert record.status == "error"
assert record.error == "err"
session.commit.assert_called_once()
def test_generate_and_vectorize_summary_success(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
segment = _segment()
record = _summary_record(summary_content="")
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.first.return_value = record
session.query.return_value = query
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
monkeypatch.setattr(
SummaryIndexService, "generate_summary_for_segment", MagicMock(return_value=("sum", MagicMock(total_tokens=0)))
)
monkeypatch.setattr(SummaryIndexService, "vectorize_summary", MagicMock(return_value=None))
out = SummaryIndexService.generate_and_vectorize_summary(segment, dataset, {"enable": True})
assert out is record
session.refresh.assert_called_once_with(record)
session.commit.assert_called()
def test_generate_and_vectorize_summary_vectorize_failure_sets_error(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
segment = _segment()
record = _summary_record(summary_content="")
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.first.return_value = record
session.query.return_value = query
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
monkeypatch.setattr(
SummaryIndexService, "generate_summary_for_segment", MagicMock(return_value=("sum", MagicMock(total_tokens=0)))
)
monkeypatch.setattr(SummaryIndexService, "vectorize_summary", MagicMock(side_effect=RuntimeError("boom")))
with pytest.raises(RuntimeError, match="boom"):
SummaryIndexService.generate_and_vectorize_summary(segment, dataset, {"enable": True})
assert record.status == "error"
# Outer exception handler overwrites the error with the raw exception message.
assert record.error == "boom"
def test_vectorize_summary_updates_existing_record_found_by_chunk_id(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
segment = _segment()
summary = _summary_record(summary_content="sum", node_id=None)
monkeypatch.setattr(summary_module.uuid, "uuid4", MagicMock(return_value="uuid-1"))
monkeypatch.setattr(summary_module.helper, "generate_text_hash", MagicMock(return_value="hash-1"))
vector_instance = MagicMock()
vector_instance.add_texts.return_value = None
monkeypatch.setattr(summary_module, "Vector", MagicMock(return_value=vector_instance))
monkeypatch.setattr(
summary_module,
"ModelManager",
MagicMock(return_value=MagicMock(get_model_instance=MagicMock(return_value=None))),
)
existing = _summary_record(summary_content="old", node_id="old-node")
existing.id = "other-id"
session = MagicMock(name="session")
q = MagicMock()
q.filter_by.return_value = q
q.first.side_effect = [None, existing] # miss by id, hit by chunk_id
session.query.return_value = q
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
SummaryIndexService.vectorize_summary(summary, segment, dataset, session=None)
session.commit.assert_called_once()
assert existing.summary_index_node_id == "uuid-1"
def test_vectorize_summary_updates_existing_record_found_by_id(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
segment = _segment()
summary = _summary_record(summary_content="sum", node_id=None)
monkeypatch.setattr(summary_module.uuid, "uuid4", MagicMock(return_value="uuid-1"))
monkeypatch.setattr(summary_module.helper, "generate_text_hash", MagicMock(return_value="hash-1"))
monkeypatch.setattr(
summary_module, "Vector", MagicMock(return_value=MagicMock(add_texts=MagicMock(return_value=None)))
)
monkeypatch.setattr(
summary_module,
"ModelManager",
MagicMock(return_value=MagicMock(get_model_instance=MagicMock(return_value=None))),
)
existing = _summary_record(summary_content="old", node_id="old-node")
session = MagicMock(name="session")
q = MagicMock()
q.filter_by.return_value = q
q.first.return_value = existing # hit by id
session.query.return_value = q
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
SummaryIndexService.vectorize_summary(summary, segment, dataset, session=None)
session.commit.assert_called_once()
assert existing.summary_index_node_hash == "hash-1"
def test_vectorize_summary_session_enter_returns_none_triggers_runtime_error(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
segment = _segment()
summary = _summary_record(summary_content="sum", node_id=None)
monkeypatch.setattr(summary_module.uuid, "uuid4", MagicMock(return_value="uuid-1"))
monkeypatch.setattr(summary_module.helper, "generate_text_hash", MagicMock(return_value="hash-1"))
monkeypatch.setattr(
summary_module, "Vector", MagicMock(return_value=MagicMock(add_texts=MagicMock(return_value=None)))
)
monkeypatch.setattr(
summary_module,
"ModelManager",
MagicMock(return_value=MagicMock(get_model_instance=MagicMock(return_value=None))),
)
class _BadContext:
def __enter__(self):
return None
def __exit__(self, exc_type, exc, tb) -> None:
return None
error_session = MagicMock()
q = MagicMock()
q.filter_by.return_value = q
q.first.return_value = summary
error_session.query.return_value = q
create_session_mock = MagicMock(side_effect=[_BadContext(), _SessionContext(error_session)])
monkeypatch.setattr(summary_module, "session_factory", SimpleNamespace(create_session=create_session_mock))
with pytest.raises(RuntimeError, match="Session should not be None"):
SummaryIndexService.vectorize_summary(summary, segment, dataset, session=None)
def test_vectorize_summary_created_record_becomes_none_triggers_guard(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
segment = _segment()
summary = _summary_record(summary_content="sum", node_id=None)
monkeypatch.setattr(summary_module.uuid, "uuid4", MagicMock(return_value="uuid-1"))
monkeypatch.setattr(summary_module.helper, "generate_text_hash", MagicMock(return_value="hash-1"))
monkeypatch.setattr(
summary_module, "Vector", MagicMock(return_value=MagicMock(add_texts=MagicMock(return_value=None)))
)
monkeypatch.setattr(
summary_module,
"ModelManager",
MagicMock(return_value=MagicMock(get_model_instance=MagicMock(return_value=None))),
)
session = MagicMock()
q = MagicMock()
q.filter_by.return_value = q
q.first.side_effect = [None, None] # miss by id and chunk_id
session.query.return_value = q
error_session = MagicMock()
eq = MagicMock()
eq.filter_by.return_value = eq
eq.first.return_value = summary
error_session.query.return_value = eq
create_session_mock = MagicMock(side_effect=[_SessionContext(session), _SessionContext(error_session)])
monkeypatch.setattr(summary_module, "session_factory", SimpleNamespace(create_session=create_session_mock))
# Force the created record to be None so the "should not be None" guard triggers.
monkeypatch.setattr(summary_module, "DocumentSegmentSummary", MagicMock(return_value=None))
with pytest.raises(RuntimeError, match="summary_record_in_session should not be None"):
SummaryIndexService.vectorize_summary(summary, segment, dataset, session=None)
def test_vectorize_summary_error_handler_tries_chunk_id_lookup_and_can_warn_not_found(
monkeypatch: pytest.MonkeyPatch,
) -> None:
dataset = _dataset()
segment = _segment()
summary = _summary_record(summary_content="sum", node_id=None)
monkeypatch.setattr(summary_module.uuid, "uuid4", MagicMock(return_value="uuid-1"))
monkeypatch.setattr(summary_module.helper, "generate_text_hash", MagicMock(return_value="hash-1"))
monkeypatch.setattr(summary_module.time, "sleep", MagicMock())
monkeypatch.setattr(
summary_module,
"Vector",
MagicMock(return_value=MagicMock(add_texts=MagicMock(side_effect=RuntimeError("boom")))),
)
error_session = MagicMock(name="error_session")
q = MagicMock()
q.filter_by.return_value = q
q.first.side_effect = [None, None] # not found by id, not found by chunk_id
error_session.query.return_value = q
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(error_session))),
)
with pytest.raises(RuntimeError, match="boom"):
SummaryIndexService.vectorize_summary(summary, segment, dataset, session=None)
# No record -> no commit in error session.
error_session.commit.assert_not_called()
def test_update_summary_record_error_warns_when_missing(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
segment = _segment()
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.first.return_value = None
session.query.return_value = query
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
logger_mock = MagicMock()
monkeypatch.setattr(summary_module, "logger", logger_mock)
SummaryIndexService.update_summary_record_error(segment, dataset, "err")
logger_mock.warning.assert_called_once()
def test_generate_and_vectorize_summary_creates_missing_record_and_logs_usage(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
segment = _segment()
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.first.return_value = None
session.query.return_value = query
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
usage = MagicMock(total_tokens=4, prompt_tokens=1, completion_tokens=3)
monkeypatch.setattr(SummaryIndexService, "generate_summary_for_segment", MagicMock(return_value=("sum", usage)))
monkeypatch.setattr(SummaryIndexService, "vectorize_summary", MagicMock(return_value=None))
logger_mock = MagicMock()
monkeypatch.setattr(summary_module, "logger", logger_mock)
result = SummaryIndexService.generate_and_vectorize_summary(segment, dataset, {"enable": True})
assert result.status in {"generating", "completed"}
logger_mock.info.assert_called()
def test_generate_summaries_for_document_skip_conditions(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset(indexing_technique="economy")
document = MagicMock(spec=summary_module.DatasetDocument)
document.id = "doc-1"
document.doc_form = "text_model"
assert SummaryIndexService.generate_summaries_for_document(dataset, document, {"enable": True}) == []
dataset = _dataset()
assert SummaryIndexService.generate_summaries_for_document(dataset, document, {"enable": False}) == []
document.doc_form = "qa_model"
assert SummaryIndexService.generate_summaries_for_document(dataset, document, {"enable": True}) == []
def test_generate_summaries_for_document_runs_and_handles_errors(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
document = MagicMock(spec=summary_module.DatasetDocument)
document.id = "doc-1"
document.doc_form = "text_model"
seg1 = _segment()
seg2 = _segment()
seg2.id = "seg-2"
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.filter.return_value = query
query.all.return_value = [seg1, seg2]
session.query.return_value = query
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
monkeypatch.setattr(SummaryIndexService, "batch_create_summary_records", MagicMock())
monkeypatch.setattr(
SummaryIndexService,
"generate_and_vectorize_summary",
MagicMock(side_effect=[MagicMock(), RuntimeError("boom")]),
)
update_err_mock = MagicMock()
monkeypatch.setattr(SummaryIndexService, "update_summary_record_error", update_err_mock)
records = SummaryIndexService.generate_summaries_for_document(dataset, document, {"enable": True})
assert len(records) == 1
update_err_mock.assert_called_once()
def test_generate_summaries_for_document_no_segments_returns_empty(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
document = MagicMock(spec=summary_module.DatasetDocument)
document.id = "doc-1"
document.doc_form = "text_model"
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.filter.return_value = query
query.all.return_value = []
session.query.return_value = query
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
assert SummaryIndexService.generate_summaries_for_document(dataset, document, {"enable": True}) == []
def test_generate_summaries_for_document_applies_segment_ids_and_only_parent_chunks(
monkeypatch: pytest.MonkeyPatch,
) -> None:
dataset = _dataset()
document = MagicMock(spec=summary_module.DatasetDocument)
document.id = "doc-1"
document.doc_form = "text_model"
seg = _segment()
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.filter.return_value = query
query.all.return_value = [seg]
session.query.return_value = query
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
monkeypatch.setattr(SummaryIndexService, "batch_create_summary_records", MagicMock())
monkeypatch.setattr(SummaryIndexService, "generate_and_vectorize_summary", MagicMock(return_value=MagicMock()))
SummaryIndexService.generate_summaries_for_document(
dataset,
document,
{"enable": True},
segment_ids=[seg.id],
only_parent_chunks=True,
)
query.filter.assert_called()
def test_disable_summaries_for_segments_handles_vector_delete_error(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
summary1 = _summary_record(summary_content="s", node_id="n1")
summary2 = _summary_record(summary_content="s", node_id=None)
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.filter.return_value = query
query.all.return_value = [summary1, summary2]
session.query.return_value = query
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
monkeypatch.setattr(
summary_module,
"Vector",
MagicMock(return_value=MagicMock(delete_by_ids=MagicMock(side_effect=RuntimeError("boom")))),
)
monkeypatch.setitem(
sys.modules, "libs.datetime_utils", SimpleNamespace(naive_utc_now=MagicMock(return_value=datetime(2024, 1, 1)))
)
SummaryIndexService.disable_summaries_for_segments(dataset, segment_ids=["seg-1"], disabled_by="u")
assert summary1.enabled is False
assert summary1.disabled_by == "u"
session.commit.assert_called_once()
def test_disable_summaries_for_segments_no_summaries_noop(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.filter.return_value = query
query.all.return_value = []
session.query.return_value = query
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
monkeypatch.setitem(
sys.modules, "libs.datetime_utils", SimpleNamespace(naive_utc_now=MagicMock(return_value=datetime(2024, 1, 1)))
)
SummaryIndexService.disable_summaries_for_segments(dataset)
session.commit.assert_not_called()
def test_enable_summaries_for_segments_skips_non_high_quality() -> None:
SummaryIndexService.enable_summaries_for_segments(_dataset(indexing_technique="economy"))
def test_enable_summaries_for_segments_revectorizes_and_enables(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
summary = _summary_record(summary_content="sum", node_id="n1")
summary.enabled = False
segment = _segment()
segment.id = summary.chunk_id
segment.enabled = True
segment.status = "completed"
session = MagicMock()
summary_query = MagicMock()
summary_query.filter_by.return_value = summary_query
summary_query.filter.return_value = summary_query
summary_query.all.return_value = [summary]
seg_query = MagicMock()
seg_query.filter_by.return_value = seg_query
seg_query.first.return_value = segment
def query_side_effect(model: object) -> MagicMock:
if model is summary_module.DocumentSegmentSummary:
return summary_query
return seg_query
session.query.side_effect = query_side_effect
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
vec_mock = MagicMock()
monkeypatch.setattr(SummaryIndexService, "vectorize_summary", vec_mock)
SummaryIndexService.enable_summaries_for_segments(dataset, segment_ids=[summary.chunk_id])
vec_mock.assert_called_once()
assert summary.enabled is True
session.commit.assert_called_once()
def test_enable_summaries_for_segments_no_summaries_noop(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.filter.return_value = query
query.all.return_value = []
session.query.return_value = query
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
SummaryIndexService.enable_summaries_for_segments(dataset)
session.commit.assert_not_called()
def test_enable_summaries_for_segments_skips_segment_or_content_and_handles_vectorize_error(
monkeypatch: pytest.MonkeyPatch,
) -> None:
dataset = _dataset()
summary1 = _summary_record(summary_content="sum", node_id="n1")
summary1.enabled = False
summary2 = _summary_record(summary_content="", node_id="n2")
summary2.enabled = False
summary3 = _summary_record(summary_content="sum3", node_id="n3")
summary3.enabled = False
bad_segment = _segment()
bad_segment.enabled = False
bad_segment.status = "completed"
good_segment = _segment()
good_segment.enabled = True
good_segment.status = "completed"
session = MagicMock()
summary_query = MagicMock()
summary_query.filter_by.return_value = summary_query
summary_query.filter.return_value = summary_query
summary_query.all.return_value = [summary1, summary2, summary3]
seg_query = MagicMock()
seg_query.filter_by.return_value = seg_query
seg_query.first.side_effect = [bad_segment, good_segment, good_segment]
def query_side_effect(model: object) -> MagicMock:
if model is summary_module.DocumentSegmentSummary:
return summary_query
return seg_query
session.query.side_effect = query_side_effect
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
logger_mock = MagicMock()
monkeypatch.setattr(summary_module, "logger", logger_mock)
monkeypatch.setattr(SummaryIndexService, "vectorize_summary", MagicMock(side_effect=RuntimeError("boom")))
SummaryIndexService.enable_summaries_for_segments(dataset)
logger_mock.exception.assert_called_once()
session.commit.assert_called_once()
def test_delete_summaries_for_segments_deletes_vectors_and_records(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
summary = _summary_record(summary_content="sum", node_id="n1")
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.filter.return_value = query
query.all.return_value = [summary]
session.query.return_value = query
vector_instance = MagicMock()
monkeypatch.setattr(summary_module, "Vector", MagicMock(return_value=vector_instance))
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
SummaryIndexService.delete_summaries_for_segments(dataset, segment_ids=[summary.chunk_id])
vector_instance.delete_by_ids.assert_called_once_with(["n1"])
session.delete.assert_called_once_with(summary)
session.commit.assert_called_once()
def test_delete_summaries_for_segments_no_summaries_noop(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.filter.return_value = query
query.all.return_value = []
session.query.return_value = query
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
SummaryIndexService.delete_summaries_for_segments(dataset)
session.commit.assert_not_called()
def test_update_summary_for_segment_skip_conditions() -> None:
assert (
SummaryIndexService.update_summary_for_segment(_segment(), _dataset(indexing_technique="economy"), "x") is None
)
seg = _segment(has_document=True)
seg.document.doc_form = "qa_model"
assert SummaryIndexService.update_summary_for_segment(seg, _dataset(), "x") is None
def test_update_summary_for_segment_empty_content_deletes_existing(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
segment = _segment()
record = _summary_record(summary_content="old", node_id="n1")
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.first.return_value = record
session.query.return_value = query
vector_instance = MagicMock()
monkeypatch.setattr(summary_module, "Vector", MagicMock(return_value=vector_instance))
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
assert SummaryIndexService.update_summary_for_segment(segment, dataset, " ") is None
vector_instance.delete_by_ids.assert_called_once_with(["n1"])
session.delete.assert_called_once_with(record)
session.commit.assert_called_once()
def test_update_summary_for_segment_empty_content_delete_vector_warns(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
segment = _segment()
record = _summary_record(summary_content="old", node_id="n1")
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.first.return_value = record
session.query.return_value = query
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
vector_instance = MagicMock()
vector_instance.delete_by_ids.side_effect = RuntimeError("boom")
monkeypatch.setattr(summary_module, "Vector", MagicMock(return_value=vector_instance))
logger_mock = MagicMock()
monkeypatch.setattr(summary_module, "logger", logger_mock)
assert SummaryIndexService.update_summary_for_segment(segment, dataset, "") is None
logger_mock.warning.assert_called()
def test_update_summary_for_segment_empty_content_no_record_noop(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
segment = _segment()
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.first.return_value = None
session.query.return_value = query
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
assert SummaryIndexService.update_summary_for_segment(segment, dataset, " ") is None
def test_update_summary_for_segment_updates_existing_and_vectorizes(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
segment = _segment()
record = _summary_record(summary_content="old", node_id="n1")
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.first.return_value = record
session.query.return_value = query
vector_instance = MagicMock()
monkeypatch.setattr(summary_module, "Vector", MagicMock(return_value=vector_instance))
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
vectorize_mock = MagicMock()
monkeypatch.setattr(SummaryIndexService, "vectorize_summary", vectorize_mock)
out = SummaryIndexService.update_summary_for_segment(segment, dataset, "new summary")
assert out is record
vectorize_mock.assert_called_once()
session.refresh.assert_called_once_with(record)
session.commit.assert_called()
def test_update_summary_for_segment_existing_vector_delete_warns(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
segment = _segment()
record = _summary_record(summary_content="old", node_id="n1")
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.first.return_value = record
session.query.return_value = query
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
vector_instance = MagicMock()
vector_instance.delete_by_ids.side_effect = RuntimeError("boom")
monkeypatch.setattr(summary_module, "Vector", MagicMock(return_value=vector_instance))
monkeypatch.setattr(SummaryIndexService, "vectorize_summary", MagicMock(return_value=None))
logger_mock = MagicMock()
monkeypatch.setattr(summary_module, "logger", logger_mock)
SummaryIndexService.update_summary_for_segment(segment, dataset, "new")
logger_mock.warning.assert_called()
def test_update_summary_for_segment_existing_vectorize_failure_returns_error_record(
monkeypatch: pytest.MonkeyPatch,
) -> None:
dataset = _dataset()
segment = _segment()
record = _summary_record(summary_content="old", node_id="n1")
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.first.return_value = record
session.query.return_value = query
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
monkeypatch.setattr(SummaryIndexService, "vectorize_summary", MagicMock(side_effect=RuntimeError("boom")))
out = SummaryIndexService.update_summary_for_segment(segment, dataset, "new")
assert out is record
assert out.status == "error"
assert "Vectorization failed" in (out.error or "")
def test_update_summary_for_segment_new_record_success(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
segment = _segment()
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.first.return_value = None
session.query.return_value = query
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
created = _summary_record(summary_content="new", node_id=None)
monkeypatch.setattr(SummaryIndexService, "create_summary_record", MagicMock(return_value=created))
session.merge.return_value = created
monkeypatch.setattr(SummaryIndexService, "vectorize_summary", MagicMock(return_value=None))
out = SummaryIndexService.update_summary_for_segment(segment, dataset, "new")
assert out is created
session.refresh.assert_called()
session.commit.assert_called()
def test_update_summary_for_segment_outer_exception_sets_error_and_reraises(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = _dataset()
segment = _segment()
record = _summary_record(summary_content="old", node_id="n1")
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.first.return_value = record
session.query.return_value = query
session.flush.side_effect = RuntimeError("flush boom")
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
with pytest.raises(RuntimeError, match="flush boom"):
SummaryIndexService.update_summary_for_segment(segment, dataset, "new")
assert record.status == "error"
assert record.error == "flush boom"
session.commit.assert_called()
def test_get_segment_summary_and_document_summaries(monkeypatch: pytest.MonkeyPatch) -> None:
record = _summary_record(summary_content="sum", node_id="n1")
session = MagicMock()
q1 = MagicMock()
q1.where.return_value = q1
q1.first.return_value = record
q2 = MagicMock()
q2.filter.return_value = q2
q2.all.return_value = [record]
def query_side_effect(model: object) -> MagicMock:
if model is summary_module.DocumentSegmentSummary:
# first call used by get_segment_summary, second by get_document_summaries
if not hasattr(query_side_effect, "_called"):
query_side_effect._called = True # type: ignore[attr-defined]
return q1
return q2
return MagicMock()
session.query.side_effect = query_side_effect
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
assert SummaryIndexService.get_segment_summary("seg-1", "dataset-1") is record
assert SummaryIndexService.get_document_summaries("doc-1", "dataset-1", segment_ids=["seg-1"]) == [record]
def test_get_segments_summaries_non_empty(monkeypatch: pytest.MonkeyPatch) -> None:
record1 = _summary_record()
record1.chunk_id = "seg-1"
record2 = _summary_record()
record2.chunk_id = "seg-2"
session = MagicMock()
q = MagicMock()
q.where.return_value = q
q.all.return_value = [record1, record2]
session.query.return_value = q
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
out = SummaryIndexService.get_segments_summaries(["seg-1", "seg-2"], "dataset-1")
assert set(out.keys()) == {"seg-1", "seg-2"}
def test_get_document_summary_index_status_no_segments_returns_none(monkeypatch: pytest.MonkeyPatch) -> None:
session = MagicMock()
q = MagicMock()
q.where.return_value = q
q.all.return_value = []
session.query.return_value = q
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
assert SummaryIndexService.get_document_summary_index_status("doc-1", "dataset-1", "tenant-1") is None
def test_get_documents_summary_index_status_empty_input(monkeypatch: pytest.MonkeyPatch) -> None:
assert SummaryIndexService.get_documents_summary_index_status([], "dataset-1", "tenant-1") == {}
def test_get_documents_summary_index_status_no_pending_sets_none(monkeypatch: pytest.MonkeyPatch) -> None:
session = MagicMock()
q = MagicMock()
q.where.return_value = q
q.all.return_value = [SimpleNamespace(id="seg-1", document_id="doc-1")]
session.query.return_value = q
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
monkeypatch.setattr(
SummaryIndexService,
"get_segments_summaries",
MagicMock(return_value={"seg-1": SimpleNamespace(status="completed")}),
)
result = SummaryIndexService.get_documents_summary_index_status(["doc-1"], "dataset-1", "tenant-1")
assert result["doc-1"] is None
def test_update_summary_for_segment_creates_new_and_vectorize_fails_returns_error_record(
monkeypatch: pytest.MonkeyPatch,
) -> None:
dataset = _dataset()
segment = _segment()
session = MagicMock()
query = MagicMock()
query.filter_by.return_value = query
query.first.return_value = None
session.query.return_value = query
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session))),
)
created = _summary_record(summary_content="new", node_id=None)
monkeypatch.setattr(SummaryIndexService, "create_summary_record", MagicMock(return_value=created))
session.merge.return_value = created
vectorize_mock = MagicMock(side_effect=RuntimeError("boom"))
monkeypatch.setattr(SummaryIndexService, "vectorize_summary", vectorize_mock)
out = SummaryIndexService.update_summary_for_segment(segment, dataset, "new")
assert out.status == "error"
assert "Vectorization failed" in (out.error or "")
def test_get_segments_summaries_empty_list() -> None:
assert SummaryIndexService.get_segments_summaries([], "dataset-1") == {}
def test_get_document_summary_index_status_and_documents_status(monkeypatch: pytest.MonkeyPatch) -> None:
seg_row = SimpleNamespace(id="seg-1", document_id="doc-1")
session = MagicMock()
query = MagicMock()
query.where.return_value = query
query.all.return_value = [SimpleNamespace(id="seg-1")]
session.query.return_value = query
create_session_mock = MagicMock(return_value=_SessionContext(session))
monkeypatch.setattr(summary_module, "session_factory", SimpleNamespace(create_session=create_session_mock))
monkeypatch.setattr(
SummaryIndexService,
"get_segments_summaries",
MagicMock(return_value={"seg-1": SimpleNamespace(status="generating")}),
)
assert SummaryIndexService.get_document_summary_index_status("doc-1", "dataset-1", "tenant-1") == "SUMMARIZING"
# Multiple docs
query2 = MagicMock()
query2.where.return_value = query2
query2.all.return_value = [seg_row]
session2 = MagicMock()
session2.query.return_value = query2
monkeypatch.setattr(
summary_module,
"session_factory",
SimpleNamespace(create_session=MagicMock(return_value=_SessionContext(session2))),
)
monkeypatch.setattr(
SummaryIndexService,
"get_segments_summaries",
MagicMock(return_value={"seg-1": SimpleNamespace(status="not_started")}),
)
result = SummaryIndexService.get_documents_summary_index_status(["doc-1", "doc-2"], "dataset-1", "tenant-1")
assert result["doc-1"] == "SUMMARIZING"
assert result["doc-2"] is None
def test_get_document_summary_status_detail_counts_and_previews(monkeypatch: pytest.MonkeyPatch) -> None:
segment1 = _segment()
segment1.id = "seg-1"
segment1.position = 1
segment2 = _segment()
segment2.id = "seg-2"
segment2.position = 2
summary1 = _summary_record(summary_content="x" * 150, node_id="n1")
summary1.chunk_id = "seg-1"
summary1.status = "completed"
summary1.error = None
summary1.created_at = datetime(2024, 1, 1, tzinfo=UTC)
summary1.updated_at = datetime(2024, 1, 2, tzinfo=UTC)
segment_service = SimpleNamespace(get_segments_by_document_and_dataset=MagicMock(return_value=[segment1, segment2]))
monkeypatch.setitem(sys.modules, "services.dataset_service", SimpleNamespace(SegmentService=segment_service))
monkeypatch.setattr(SummaryIndexService, "get_document_summaries", MagicMock(return_value=[summary1]))
detail = SummaryIndexService.get_document_summary_status_detail("doc-1", "dataset-1")
assert detail["total_segments"] == 2
assert detail["summary_status"]["completed"] == 1
assert detail["summary_status"]["not_started"] == 1
assert detail["summaries"][0]["summary_preview"].endswith("...")
assert detail["summaries"][1]["status"] == "not_started"