# Files
# dify/api/tests/unit_tests/services/test_dataset_service_document.py
#
# 2079 lines
# 94 KiB
# Python
"""Unit tests for DocumentService behaviors in dataset_service."""
from .dataset_service_test_helpers import (
Account,
BuiltInField,
CloudPlan,
DatasetProcessRule,
DatasetService,
DatasetServiceUnitDataFactory,
DataSource,
DocumentIndexingError,
DocumentService,
FileInfo,
FileNotExistsError,
Forbidden,
IndexStructureType,
InfoList,
KnowledgeConfig,
MagicMock,
NoPermissionError,
NotFound,
NotionIcon,
NotionInfo,
NotionPage,
PreProcessingRule,
ProcessRule,
RerankingModel,
RetrievalMethod,
RetrievalModel,
Rule,
Segmentation,
SimpleNamespace,
WebsiteInfo,
_make_dataset,
_make_document,
_make_features,
_make_lock_context,
_make_session_context,
_make_upload_knowledge_config,
create_autospec,
json,
patch,
pytest,
)
class TestDocumentServiceDisplayStatus:
    """Unit tests for DocumentService display-status helpers."""

    @pytest.mark.parametrize(
        ("raw_status", "expected"),
        [
            ("enabled", "available"),
            ("AVAILABLE", "available"),
            ("paused", "paused"),
            ("unknown", None),
            (None, None),
        ],
    )
    def test_normalize_display_status(self, raw_status, expected):
        """Raw statuses map onto canonical display statuses (None when unknown)."""
        assert DocumentService.normalize_display_status(raw_status) == expected

    def test_build_display_status_filters_returns_empty_tuple_for_unknown_status(self):
        """An unrecognized status yields no filter clauses at all."""
        assert DocumentService.build_display_status_filters("missing") == ()

    def test_apply_display_status_filter_returns_original_query_for_unknown_status(self):
        """An unrecognized status leaves the query object untouched."""
        mock_query = MagicMock()
        returned = DocumentService.apply_display_status_filter(mock_query, "missing")
        assert returned is mock_query
        mock_query.where.assert_not_called()

    def test_apply_display_status_filter_applies_where_for_known_status(self):
        """A recognized status narrows the query through exactly one where() call."""
        narrowed = MagicMock()
        mock_query = MagicMock()
        mock_query.where.return_value = narrowed
        returned = DocumentService.apply_display_status_filter(mock_query, "enabled")
        mock_query.where.assert_called_once()
        assert returned is narrowed
class TestDocumentServiceQueryAndDownloadHelpers:
    """Unit tests for DocumentService query helpers and download flows."""

    def test_get_document_returns_none_when_document_id_is_missing(self):
        """A falsy document id short-circuits without touching the database."""
        with patch("services.dataset_service.db") as mock_db:
            result = DocumentService.get_document("dataset-1", None)
            assert result is None
            mock_db.session.query.assert_not_called()

    def test_get_document_queries_by_dataset_and_document_id(self):
        """The lookup returns the first row matched by dataset and document id."""
        document = DatasetServiceUnitDataFactory.create_document_mock()
        with patch("services.dataset_service.db") as mock_db:
            mock_db.session.query.return_value.where.return_value.first.return_value = document
            result = DocumentService.get_document("dataset-1", "doc-1")
            assert result is document

    def test_get_documents_by_ids_returns_empty_for_empty_input(self):
        """An empty id list returns [] without issuing any query."""
        with patch("services.dataset_service.db") as mock_db:
            result = DocumentService.get_documents_by_ids("dataset-1", [])
            assert result == []
            mock_db.session.scalars.assert_not_called()

    def test_get_documents_by_ids_uses_single_batch_query(self):
        """All requested documents come back from one scalars() call."""
        document = DatasetServiceUnitDataFactory.create_document_mock()
        with patch("services.dataset_service.db") as mock_db:
            mock_db.session.scalars.return_value.all.return_value = [document]
            result = DocumentService.get_documents_by_ids("dataset-1", ["doc-1"])
            assert result == [document]
            mock_db.session.scalars.assert_called_once()

    def test_update_documents_need_summary_returns_zero_for_empty_input(self):
        """An empty id list updates nothing and never opens a session."""
        with patch("services.dataset_service.session_factory") as session_factory_mock:
            result = DocumentService.update_documents_need_summary("dataset-1", [])
            assert result == 0
            session_factory_mock.create_session.assert_not_called()

    def test_update_documents_need_summary_updates_matching_documents_and_commits(self):
        """Matching rows are bulk-updated and the session committed exactly once."""
        session = MagicMock()
        session.query.return_value.filter.return_value.update.return_value = 2
        with patch("services.dataset_service.session_factory") as session_factory_mock:
            session_factory_mock.create_session.return_value = _make_session_context(session)
            result = DocumentService.update_documents_need_summary(
                "dataset-1",
                ["doc-1", "doc-2"],
                need_summary=False,
            )
            assert result == 2
            session.commit.assert_called_once()

    def test_get_document_download_url_uses_upload_file_lookup_and_signed_url_helper(self):
        """The download URL comes from the signed-URL helper for the resolved upload file."""
        upload_file = DatasetServiceUnitDataFactory.create_upload_file_mock(file_id="file-1")
        document = DatasetServiceUnitDataFactory.create_document_mock()
        with (
            patch.object(DocumentService, "_get_upload_file_for_upload_file_document", return_value=upload_file),
            patch("services.dataset_service.file_helpers.get_signed_file_url", return_value="signed-url") as get_url,
        ):
            result = DocumentService.get_document_download_url(document)
            assert result == "signed-url"
            get_url.assert_called_once_with(upload_file_id="file-1", as_attachment=True)

    def test_get_upload_file_id_for_upload_file_document_rejects_invalid_source_type(self):
        """A non-upload-file data source raises NotFound with the invalid-source message."""
        document = DatasetServiceUnitDataFactory.create_document_mock(data_source_type="not-upload-file")
        with pytest.raises(NotFound, match="invalid source"):
            DocumentService._get_upload_file_id_for_upload_file_document(
                document,
                invalid_source_message="invalid source",
                missing_file_message="missing file",
            )

    def test_get_upload_file_id_for_upload_file_document_rejects_missing_upload_file_id(self):
        """A data-source dict without an upload_file_id raises the missing-file message."""
        document = DatasetServiceUnitDataFactory.create_document_mock(data_source_info_dict={})
        with pytest.raises(NotFound, match="missing file"):
            DocumentService._get_upload_file_id_for_upload_file_document(
                document,
                invalid_source_message="invalid source",
                missing_file_message="missing file",
            )

    def test_get_upload_file_id_for_upload_file_document_returns_string_id(self):
        """A numeric upload-file id is coerced to its string form."""
        document = DatasetServiceUnitDataFactory.create_document_mock(data_source_info_dict={"upload_file_id": 99})
        result = DocumentService._get_upload_file_id_for_upload_file_document(
            document,
            invalid_source_message="invalid source",
            missing_file_message="missing file",
        )
        assert result == "99"

    def test_get_upload_file_for_upload_file_document_raises_when_file_service_returns_nothing(self):
        """An empty FileService lookup result raises NotFound."""
        document = DatasetServiceUnitDataFactory.create_document_mock(
            tenant_id="tenant-1",
            data_source_info_dict={"upload_file_id": "file-1"},
        )
        with patch("services.dataset_service.FileService.get_upload_files_by_ids", return_value={}):
            with pytest.raises(NotFound, match="Uploaded file not found"):
                DocumentService._get_upload_file_for_upload_file_document(document)

    def test_get_upload_file_for_upload_file_document_returns_upload_file(self):
        """The helper returns the upload file resolved by FileService."""
        document = DatasetServiceUnitDataFactory.create_document_mock(
            tenant_id="tenant-1",
            data_source_info_dict={"upload_file_id": "file-1"},
        )
        upload_file = DatasetServiceUnitDataFactory.create_upload_file_mock(file_id="file-1")
        with patch(
            "services.dataset_service.FileService.get_upload_files_by_ids", return_value={"file-1": upload_file}
        ):
            result = DocumentService._get_upload_file_for_upload_file_document(document)
            assert result is upload_file

    def test_enrich_documents_with_summary_index_status_skips_lookup_when_summary_is_disabled(self):
        """With summary indexing disabled, no document gets a summary status."""
        dataset = DatasetServiceUnitDataFactory.create_dataset_mock(summary_index_setting={"enable": False})
        documents = [
            DatasetServiceUnitDataFactory.create_document_mock(document_id="doc-1", need_summary=True),
            DatasetServiceUnitDataFactory.create_document_mock(document_id="doc-2", need_summary=False),
        ]
        DocumentService.enrich_documents_with_summary_index_status(documents, dataset, tenant_id="tenant-1")
        assert documents[0].summary_index_status is None
        assert documents[1].summary_index_status is None

    def test_enrich_documents_with_summary_index_status_applies_summary_status_map(self):
        """Only need_summary documents are looked up; statuses come from the map."""
        dataset = DatasetServiceUnitDataFactory.create_dataset_mock(
            dataset_id="dataset-1",
            summary_index_setting={"enable": True},
        )
        documents = [
            DatasetServiceUnitDataFactory.create_document_mock(document_id="doc-1", need_summary=True),
            DatasetServiceUnitDataFactory.create_document_mock(document_id="doc-2", need_summary=True),
            DatasetServiceUnitDataFactory.create_document_mock(document_id="doc-3", need_summary=False),
        ]
        with patch(
            "services.summary_index_service.SummaryIndexService.get_documents_summary_index_status",
            return_value={"doc-1": "completed", "doc-2": None},
        ) as get_status_map:
            DocumentService.enrich_documents_with_summary_index_status(documents, dataset, tenant_id="tenant-1")
            get_status_map.assert_called_once_with(
                document_ids=["doc-1", "doc-2"],
                dataset_id="dataset-1",
                tenant_id="tenant-1",
            )
            assert documents[0].summary_index_status == "completed"
            assert documents[1].summary_index_status is None
            assert documents[2].summary_index_status is None

    def test_generate_document_batch_download_zip_filename_uses_zip_extension(self):
        """The generated archive name is the uuid4 hex plus a .zip suffix."""
        fake_uuid = SimpleNamespace(hex="archive-id")
        with patch("services.dataset_service.uuid.uuid4", return_value=fake_uuid):
            result = DocumentService._generate_document_batch_download_zip_filename()
            assert result == "archive-id.zip"

    def test_get_upload_files_by_document_id_for_zip_download_raises_for_missing_documents(self):
        """No matching documents raises NotFound."""
        with patch.object(DocumentService, "get_documents_by_ids", return_value=[]):
            with pytest.raises(NotFound, match="Document not found"):
                DocumentService._get_upload_files_by_document_id_for_zip_download(
                    dataset_id="dataset-1",
                    document_ids=["doc-1"],
                    tenant_id="tenant-1",
                )

    def test_get_upload_files_by_document_id_for_zip_download_rejects_cross_tenant_access(self):
        """A document belonging to another tenant raises Forbidden."""
        document = DatasetServiceUnitDataFactory.create_document_mock(
            document_id="doc-1",
            tenant_id="tenant-other",
            data_source_info_dict={"upload_file_id": "file-1"},
        )
        with patch.object(DocumentService, "get_documents_by_ids", return_value=[document]):
            with pytest.raises(Forbidden, match="No permission"):
                DocumentService._get_upload_files_by_document_id_for_zip_download(
                    dataset_id="dataset-1",
                    document_ids=["doc-1"],
                    tenant_id="tenant-1",
                )

    def test_get_upload_files_by_document_id_for_zip_download_rejects_missing_upload_files(self):
        """Documents whose upload files cannot be resolved raise NotFound."""
        document = DatasetServiceUnitDataFactory.create_document_mock(
            document_id="doc-1",
            tenant_id="tenant-1",
            data_source_info_dict={"upload_file_id": "file-1"},
        )
        with (
            patch.object(DocumentService, "get_documents_by_ids", return_value=[document]),
            patch("services.dataset_service.FileService.get_upload_files_by_ids", return_value={}),
        ):
            with pytest.raises(NotFound, match="Only uploaded-file documents can be downloaded as ZIP"):
                DocumentService._get_upload_files_by_document_id_for_zip_download(
                    dataset_id="dataset-1",
                    document_ids=["doc-1"],
                    tenant_id="tenant-1",
                )

    def test_get_upload_files_by_document_id_for_zip_download_returns_document_keyed_mapping(self):
        """The helper returns a dict keyed by document id, valued by upload file."""
        document_a = DatasetServiceUnitDataFactory.create_document_mock(
            document_id="doc-1",
            tenant_id="tenant-1",
            data_source_info_dict={"upload_file_id": "file-1"},
        )
        document_b = DatasetServiceUnitDataFactory.create_document_mock(
            document_id="doc-2",
            tenant_id="tenant-1",
            data_source_info_dict={"upload_file_id": "file-2"},
        )
        upload_file_a = DatasetServiceUnitDataFactory.create_upload_file_mock(file_id="file-1")
        upload_file_b = DatasetServiceUnitDataFactory.create_upload_file_mock(file_id="file-2")
        with (
            patch.object(DocumentService, "get_documents_by_ids", return_value=[document_a, document_b]),
            patch(
                "services.dataset_service.FileService.get_upload_files_by_ids",
                return_value={"file-1": upload_file_a, "file-2": upload_file_b},
            ),
        ):
            result = DocumentService._get_upload_files_by_document_id_for_zip_download(
                dataset_id="dataset-1",
                document_ids=["doc-1", "doc-2"],
                tenant_id="tenant-1",
            )
            assert result == {"doc-1": upload_file_a, "doc-2": upload_file_b}

    def test_prepare_document_batch_download_zip_raises_not_found_for_missing_dataset(self):
        """A missing dataset raises NotFound before any document work."""
        user = DatasetServiceUnitDataFactory.create_user_mock()
        with patch.object(DatasetService, "get_dataset", return_value=None):
            with pytest.raises(NotFound, match="Dataset not found"):
                DocumentService.prepare_document_batch_download_zip(
                    dataset_id="dataset-1",
                    document_ids=["doc-1"],
                    tenant_id="tenant-1",
                    current_user=user,
                )

    def test_prepare_document_batch_download_zip_translates_permission_error_to_forbidden(self):
        """A NoPermissionError from the permission check surfaces as Forbidden."""
        dataset = DatasetServiceUnitDataFactory.create_dataset_mock()
        user = DatasetServiceUnitDataFactory.create_user_mock()
        with (
            patch.object(DatasetService, "get_dataset", return_value=dataset),
            patch.object(DatasetService, "check_dataset_permission", side_effect=NoPermissionError("blocked")),
        ):
            with pytest.raises(Forbidden, match="blocked"):
                DocumentService.prepare_document_batch_download_zip(
                    dataset_id=dataset.id,
                    document_ids=["doc-1"],
                    tenant_id="tenant-1",
                    current_user=user,
                )

    def test_prepare_document_batch_download_zip_returns_upload_files_in_requested_order(self):
        """Upload files come back ordered by the requested document ids."""
        dataset = DatasetServiceUnitDataFactory.create_dataset_mock()
        user = DatasetServiceUnitDataFactory.create_user_mock()
        upload_file_a = DatasetServiceUnitDataFactory.create_upload_file_mock(file_id="file-a")
        upload_file_b = DatasetServiceUnitDataFactory.create_upload_file_mock(file_id="file-b")
        with (
            patch.object(DatasetService, "get_dataset", return_value=dataset),
            patch.object(DatasetService, "check_dataset_permission"),
            patch.object(
                DocumentService,
                "_get_upload_files_by_document_id_for_zip_download",
                return_value={"doc-1": upload_file_a, "doc-2": upload_file_b},
            ),
            patch.object(DocumentService, "_generate_document_batch_download_zip_filename", return_value="archive.zip"),
        ):
            upload_files, download_name = DocumentService.prepare_document_batch_download_zip(
                dataset_id=dataset.id,
                document_ids=["doc-2", "doc-1"],
                tenant_id="tenant-1",
                current_user=user,
            )
            assert upload_files == [upload_file_b, upload_file_a]
            assert download_name == "archive.zip"

    def test_get_document_by_dataset_id_returns_enabled_documents(self):
        """Enabled documents of the dataset are returned from the scalars result."""
        document = DatasetServiceUnitDataFactory.create_document_mock(enabled=True)
        with patch("services.dataset_service.db") as mock_db:
            mock_db.session.scalars.return_value.all.return_value = [document]
            result = DocumentService.get_document_by_dataset_id("dataset-1")
            assert result == [document]

    def test_get_working_documents_by_dataset_id_returns_scalars_result(self):
        """Completed, non-archived documents are returned from the scalars result."""
        document = DatasetServiceUnitDataFactory.create_document_mock(indexing_status="completed", archived=False)
        with patch("services.dataset_service.db") as mock_db:
            mock_db.session.scalars.return_value.all.return_value = [document]
            result = DocumentService.get_working_documents_by_dataset_id("dataset-1")
            assert result == [document]

    def test_get_error_documents_by_dataset_id_returns_scalars_result(self):
        """Errored documents are returned from the scalars result."""
        document = DatasetServiceUnitDataFactory.create_document_mock(indexing_status="error")
        with patch("services.dataset_service.db") as mock_db:
            mock_db.session.scalars.return_value.all.return_value = [document]
            result = DocumentService.get_error_documents_by_dataset_id("dataset-1")
            assert result == [document]

    def test_get_batch_documents_filters_by_current_user_tenant(self):
        """Batch lookup runs against the patched current user's tenant."""

        class FakeAccount:
            pass

        current_user = FakeAccount()
        current_user.current_tenant_id = "tenant-1"
        document = DatasetServiceUnitDataFactory.create_document_mock()
        with (
            patch("services.dataset_service.Account", FakeAccount),
            patch("services.dataset_service.current_user", current_user),
            patch("services.dataset_service.db") as mock_db,
        ):
            mock_db.session.scalars.return_value.all.return_value = [document]
            result = DocumentService.get_batch_documents("dataset-1", "batch-1")
            assert result == [document]

    def test_get_document_file_detail_returns_one_or_none(self):
        """File-detail lookup uses one_or_none on the filtered query."""
        upload_file = DatasetServiceUnitDataFactory.create_upload_file_mock()
        with patch("services.dataset_service.db") as mock_db:
            mock_db.session.query.return_value.where.return_value.one_or_none.return_value = upload_file
            result = DocumentService.get_document_file_detail(upload_file.id)
            assert result is upload_file
class TestDocumentServiceMutations:
    """Unit tests for DocumentService mutation and orchestration helpers."""

    @pytest.fixture
    def rename_account_context(self):
        """Patch Account and current_user with a fake bound to tenant-123."""

        class FakeAccount:
            pass

        current_user = FakeAccount()
        current_user.id = "user-123"
        current_user.current_tenant_id = "tenant-123"
        with (
            patch("services.dataset_service.Account", FakeAccount),
            patch("services.dataset_service.current_user", current_user),
        ):
            yield current_user

    @pytest.mark.parametrize(("archived", "expected"), [(True, True), (False, False)])
    def test_check_archived_returns_boolean_status(self, archived, expected):
        """check_archived mirrors the document's archived flag."""
        document = DatasetServiceUnitDataFactory.create_document_mock(archived=archived)
        assert DocumentService.check_archived(document) is expected

    def test_delete_document_emits_signal_and_commits(self):
        """Deletion sends the document_was_deleted signal, deletes the row, and commits."""
        document = DatasetServiceUnitDataFactory.create_document_mock(
            data_source_type="upload_file",
            data_source_info='{"upload_file_id": "file-1"}',
            data_source_info_dict={"upload_file_id": "file-1"},
        )
        with (
            patch("services.dataset_service.document_was_deleted.send") as send_deleted_signal,
            patch("services.dataset_service.db") as mock_db,
        ):
            DocumentService.delete_document(document)
            send_deleted_signal.assert_called_once_with(
                document.id,
                dataset_id=document.dataset_id,
                doc_form=document.doc_form,
                file_id="file-1",
            )
            mock_db.session.delete.assert_called_once_with(document)
            mock_db.session.commit.assert_called_once()

    def test_delete_documents_ignores_empty_input(self):
        """An empty id list performs no query at all."""
        dataset = DatasetServiceUnitDataFactory.create_dataset_mock()
        with patch("services.dataset_service.db") as mock_db:
            DocumentService.delete_documents(dataset, [])
            mock_db.session.scalars.assert_not_called()

    def test_delete_documents_deletes_rows_and_dispatches_cleanup_task(self):
        """Bulk delete removes each row, commits once, and queues the cleanup task."""
        dataset = DatasetServiceUnitDataFactory.create_dataset_mock(doc_form="text_model")
        document_a = DatasetServiceUnitDataFactory.create_document_mock(
            document_id="doc-1",
            data_source_type="upload_file",
            data_source_info_dict={"upload_file_id": "file-1"},
        )
        document_b = DatasetServiceUnitDataFactory.create_document_mock(
            document_id="doc-2",
            data_source_type="upload_file",
            data_source_info_dict={"upload_file_id": "file-2"},
        )
        with (
            patch("services.dataset_service.db") as mock_db,
            patch("services.dataset_service.batch_clean_document_task") as clean_task,
        ):
            mock_db.session.scalars.return_value.all.return_value = [document_a, document_b]
            DocumentService.delete_documents(dataset, ["doc-1", "doc-2"])
            assert mock_db.session.delete.call_count == 2
            mock_db.session.commit.assert_called_once()
            clean_task.delay.assert_called_once_with(["doc-1", "doc-2"], dataset.id, dataset.doc_form, ["file-1", "file-2"])

    def test_rename_document_raises_when_dataset_is_missing(self, rename_account_context):
        """Renaming against a missing dataset raises ValueError."""
        with patch.object(DatasetService, "get_dataset", return_value=None):
            with pytest.raises(ValueError, match="Dataset not found"):
                DocumentService.rename_document("dataset-1", "doc-1", "New Name")

    def test_rename_document_raises_when_document_is_missing(self, rename_account_context):
        """Renaming a missing document raises ValueError."""
        dataset = DatasetServiceUnitDataFactory.create_dataset_mock()
        with (
            patch.object(DatasetService, "get_dataset", return_value=dataset),
            patch.object(DocumentService, "get_document", return_value=None),
        ):
            with pytest.raises(ValueError, match="Document not found"):
                DocumentService.rename_document(dataset.id, "doc-1", "New Name")

    def test_rename_document_rejects_cross_tenant_access(self, rename_account_context):
        """Renaming a document owned by another tenant raises ValueError."""
        dataset = DatasetServiceUnitDataFactory.create_dataset_mock()
        document = DatasetServiceUnitDataFactory.create_document_mock(tenant_id="tenant-other")
        with (
            patch.object(DatasetService, "get_dataset", return_value=dataset),
            patch.object(DocumentService, "get_document", return_value=document),
        ):
            with pytest.raises(ValueError, match="No permission"):
                DocumentService.rename_document(dataset.id, document.id, "New Name")

    def test_rename_document_updates_document_metadata_and_upload_file_name(self, rename_account_context):
        """Renaming updates the document, its built-in metadata, and the upload file name."""
        dataset = DatasetServiceUnitDataFactory.create_dataset_mock(
            built_in_field_enabled=True,
            tenant_id="tenant-1",
        )
        document = DatasetServiceUnitDataFactory.create_document_mock(
            tenant_id="tenant-1",
            doc_metadata={"title": "Old"},
            data_source_info_dict={"upload_file_id": "file-1"},
        )
        # Align the fixture's tenant with the dataset/document tenant for this test.
        rename_account_context.current_tenant_id = "tenant-1"
        with (
            patch.object(DatasetService, "get_dataset", return_value=dataset),
            patch.object(DocumentService, "get_document", return_value=document),
            patch("services.dataset_service.db") as mock_db,
        ):
            result = DocumentService.rename_document(dataset.id, document.id, "New Name")
            assert result is document
            assert document.name == "New Name"
            assert document.doc_metadata[BuiltInField.document_name] == "New Name"
            mock_db.session.add.assert_called_once_with(document)
            mock_db.session.query.return_value.where.return_value.update.assert_called_once()
            mock_db.session.commit.assert_called_once()

    def test_recover_document_raises_when_document_is_not_paused(self):
        """Recovering a non-paused document raises DocumentIndexingError."""
        document = DatasetServiceUnitDataFactory.create_document_mock(is_paused=False)
        with pytest.raises(DocumentIndexingError):
            DocumentService.recover_document(document)

    def test_retry_document_raises_when_retry_flag_is_already_set(self):
        """A set retry cache flag blocks a second retry attempt."""
        document = DatasetServiceUnitDataFactory.create_document_mock(document_id="doc-1")
        with patch("services.dataset_service.redis_client") as mock_redis:
            mock_redis.get.return_value = "1"
            with pytest.raises(ValueError, match="being retried"):
                DocumentService.retry_document("dataset-1", [document])

    def test_sync_website_document_raises_when_sync_flag_exists(self):
        """A set sync cache flag blocks a concurrent website sync."""
        document = DatasetServiceUnitDataFactory.create_document_mock(document_id="doc-1")
        with patch("services.dataset_service.redis_client") as mock_redis:
            mock_redis.get.return_value = "1"
            with pytest.raises(ValueError, match="being synced"):
                DocumentService.sync_website_document("dataset-1", document)

    def test_sync_website_document_updates_status_sets_cache_and_dispatches_task(self):
        """Sync resets status to waiting, rewrites mode to scrape, sets the flag, queues the task."""
        document = DatasetServiceUnitDataFactory.create_document_mock(
            document_id="doc-1",
            data_source_info_dict={"mode": "crawl"},
        )
        document.data_source_info = "{}"
        with (
            patch("services.dataset_service.redis_client") as mock_redis,
            patch("services.dataset_service.db") as mock_db,
            patch("services.dataset_service.sync_website_document_indexing_task") as sync_task,
        ):
            mock_redis.get.return_value = None
            DocumentService.sync_website_document("dataset-1", document)
            assert document.indexing_status == "waiting"
            assert '"mode": "scrape"' in document.data_source_info
            mock_db.session.add.assert_called_once_with(document)
            mock_db.session.commit.assert_called_once()
            mock_redis.setex.assert_called_once_with("document_doc-1_is_sync", 600, 1)
            sync_task.delay.assert_called_once_with("dataset-1", "doc-1")

    def test_get_documents_position_returns_next_position_when_documents_exist(self):
        """The next position is the highest existing position plus one."""
        document = DatasetServiceUnitDataFactory.create_document_mock(position=7)
        with patch("services.dataset_service.db") as mock_db:
            mock_db.session.query.return_value.filter_by.return_value.order_by.return_value.first.return_value = (
                document
            )
            result = DocumentService.get_documents_position("dataset-1")
            assert result == 8

    def test_get_documents_position_defaults_to_one_when_dataset_is_empty(self):
        """An empty dataset starts positions at one."""
        with patch("services.dataset_service.db") as mock_db:
            mock_db.session.query.return_value.filter_by.return_value.order_by.return_value.first.return_value = None
            result = DocumentService.get_documents_position("dataset-1")
            assert result == 1
class TestDocumentServiceSaveDocumentWithoutDatasetId:
    """Unit tests for dataset creation around save_document_without_dataset_id."""

    @pytest.fixture
    def account_context(self):
        """Patch current_user with an autospecced Account bound to tenant-1."""
        account = create_autospec(Account, instance=True)
        account.id = "user-1"
        account.current_tenant_id = "tenant-1"
        with patch("services.dataset_service.current_user", account):
            yield account

    def test_save_document_without_dataset_id_creates_high_quality_dataset_with_default_retrieval_model(
        self, account_context
    ):
        """High-quality datasets get a binding, default retrieval model, and a name from the first document."""
        knowledge_config = KnowledgeConfig(
            indexing_technique="high_quality",
            data_source=DataSource(
                info_list=InfoList(
                    data_source_type="upload_file",
                    file_info_list=FileInfo(file_ids=["file-1"]),
                )
            ),
            embedding_model="embedding-model",
            embedding_model_provider="provider",
            summary_index_setting={"enable": True},
            is_multimodal=True,
        )
        created_dataset = SimpleNamespace(
            id="dataset-1",
            tenant_id="tenant-1",
            name="",
            description=None,
        )
        first_document = SimpleNamespace(name="VeryLongDocumentNameForDataset.txt")
        with (
            patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=False)),
            patch(
                "services.dataset_service.DatasetCollectionBindingService.get_dataset_collection_binding",
                return_value=SimpleNamespace(id="binding-1"),
            ),
            # The Dataset constructor is faked to capture the kwargs onto created_dataset.
            patch(
                "services.dataset_service.Dataset",
                side_effect=lambda **kwargs: created_dataset.__dict__.update(kwargs) or created_dataset,
            ) as dataset_cls,
            patch.object(
                DocumentService, "save_document_with_dataset_id", return_value=([first_document], "batch-1")
            ) as save_document,
            patch("services.dataset_service.db") as mock_db,
        ):
            dataset, documents, batch = DocumentService.save_document_without_dataset_id(
                tenant_id="tenant-1",
                knowledge_config=knowledge_config,
                account=account_context,
            )
            assert dataset is created_dataset
            assert documents == [first_document]
            assert batch == "batch-1"
            assert created_dataset.collection_binding_id == "binding-1"
            assert created_dataset.retrieval_model["search_method"] == RetrievalMethod.SEMANTIC_SEARCH
            assert created_dataset.retrieval_model["top_k"] == 4
            assert created_dataset.summary_index_setting == {"enable": True}
            assert created_dataset.is_multimodal is True
            # Dataset name is truncated to 18 characters of the first document's name.
            assert created_dataset.name == first_document.name[:18] + "..."
            assert (
                created_dataset.description
                == "useful for when you want to answer queries about the VeryLongDocumentNameForDataset.txt"
            )
            dataset_cls.assert_called_once()
            save_document.assert_called_once_with(created_dataset, knowledge_config, account_context)
            assert mock_db.session.commit.call_count == 1

    def test_save_document_without_dataset_id_uses_provided_retrieval_model(self, account_context):
        """An explicit retrieval model is stored verbatim and no binding is attached."""
        retrieval_model = RetrievalModel(
            search_method=RetrievalMethod.SEMANTIC_SEARCH,
            reranking_enable=True,
            reranking_model=RerankingModel(
                reranking_provider_name="rerank-provider",
                reranking_model_name="rerank-model",
            ),
            top_k=9,
            score_threshold_enabled=True,
            score_threshold=0.6,
        )
        knowledge_config = KnowledgeConfig(
            indexing_technique="economy",
            data_source=DataSource(
                info_list=InfoList(
                    data_source_type="upload_file",
                    file_info_list=FileInfo(file_ids=["file-1"]),
                )
            ),
            retrieval_model=retrieval_model,
        )
        created_dataset = SimpleNamespace(id="dataset-1", tenant_id="tenant-1", name="", description=None)
        with (
            patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=False)),
            patch(
                "services.dataset_service.Dataset",
                side_effect=lambda **kwargs: created_dataset.__dict__.update(kwargs) or created_dataset,
            ),
            patch.object(
                DocumentService,
                "save_document_with_dataset_id",
                return_value=([SimpleNamespace(name="Doc")], "batch-1"),
            ),
            patch("services.dataset_service.db"),
        ):
            DocumentService.save_document_without_dataset_id("tenant-1", knowledge_config, account_context)
            assert created_dataset.retrieval_model == retrieval_model.model_dump()
            assert created_dataset.collection_binding_id is None

    def test_save_document_without_dataset_id_rejects_sandbox_batch_upload(self, account_context):
        """Sandbox-plan tenants may not batch-upload multiple files."""
        knowledge_config = KnowledgeConfig(
            indexing_technique="economy",
            data_source=DataSource(
                info_list=InfoList(
                    data_source_type="upload_file",
                    file_info_list=FileInfo(file_ids=["file-1", "file-2"]),
                )
            ),
        )
        with (
            patch(
                "services.dataset_service.FeatureService.get_features",
                return_value=_make_features(enabled=True, plan=CloudPlan.SANDBOX),
            ),
            patch.object(DocumentService, "check_documents_upload_quota") as check_quota,
        ):
            with pytest.raises(ValueError, match="does not support batch upload"):
                DocumentService.save_document_without_dataset_id("tenant-1", knowledge_config, account_context)
            check_quota.assert_not_called()
class TestDocumentServiceUpdateDocumentWithDatasetId:
"""Unit tests for the document-update orchestration path."""
    @pytest.fixture
    def account_context(self):
        """Patch current_user with an autospecced Account bound to tenant-1."""
        account = create_autospec(Account, instance=True)
        account.id = "user-1"
        account.current_tenant_id = "tenant-1"
        with patch("services.dataset_service.current_user", account):
            yield account
    def test_update_document_with_dataset_id_raises_when_document_is_missing(self, account_context):
        """A missing original document raises NotFound after the model-setting check."""
        dataset = SimpleNamespace(id="dataset-1", tenant_id="tenant-1")
        document_data = KnowledgeConfig(
            original_document_id="doc-1",
            indexing_technique="economy",
            data_source=DataSource(
                info_list=InfoList(
                    data_source_type="upload_file",
                    file_info_list=FileInfo(file_ids=["file-1"]),
                )
            ),
        )
        with (
            patch.object(DocumentService, "get_document", return_value=None),
            patch.object(DatasetService, "check_dataset_model_setting") as check_model_setting,
        ):
            with pytest.raises(NotFound, match="Document not found"):
                DocumentService.update_document_with_dataset_id(dataset, document_data, account_context)
            check_model_setting.assert_called_once_with(dataset)
    def test_update_document_with_dataset_id_rejects_non_available_documents(self, account_context):
        """Updating a document whose display status is not available raises ValueError."""
        dataset = SimpleNamespace(id="dataset-1", tenant_id="tenant-1")
        document = SimpleNamespace(display_status="indexing")
        document_data = KnowledgeConfig(
            original_document_id="doc-1",
            indexing_technique="economy",
            data_source=DataSource(
                info_list=InfoList(
                    data_source_type="upload_file",
                    file_info_list=FileInfo(file_ids=["file-1"]),
                )
            ),
        )
        with (
            patch.object(DocumentService, "get_document", return_value=document),
            patch.object(DatasetService, "check_dataset_model_setting"),
        ):
            with pytest.raises(ValueError, match="Document is not available"):
                DocumentService.update_document_with_dataset_id(dataset, document_data, account_context)
    def test_update_document_with_dataset_id_upload_file_process_rule_and_name_override(self, account_context):
        """An upload-file update installs a new process rule, resets indexing state, and re-queues the task."""
        dataset = SimpleNamespace(id="dataset-1", tenant_id="tenant-1")
        document = _make_document()
        document.dataset_process_rule_id = "old-rule"
        document_data = KnowledgeConfig(
            original_document_id="doc-1",
            indexing_technique="economy",
            data_source=DataSource(
                info_list=InfoList(
                    data_source_type="upload_file",
                    file_info_list=FileInfo(file_ids=["file-1"]),
                )
            ),
            process_rule=ProcessRule(
                mode="custom",
                rules=Rule(
                    pre_processing_rules=[PreProcessingRule(id="remove_stopwords", enabled=True)],
                    segmentation=Segmentation(separator="\n", max_tokens=128),
                ),
            ),
            name="Renamed document",
            doc_form=IndexStructureType.QA_INDEX,
        )
        created_process_rule = SimpleNamespace(id="rule-2")
        with (
            patch.object(DocumentService, "get_document", return_value=document),
            patch.object(DatasetService, "check_dataset_model_setting"),
            patch("services.dataset_service.DatasetProcessRule", return_value=created_process_rule),
            patch("services.dataset_service.db") as mock_db,
            patch("services.dataset_service.naive_utc_now", return_value="now"),
            patch("services.dataset_service.document_indexing_update_task") as update_task,
        ):
            # First db query resolves the upload file, second bulk-updates segments.
            upload_query = MagicMock()
            upload_query.where.return_value.first.return_value = SimpleNamespace(id="file-1", name="upload.txt")
            segment_query = MagicMock()
            segment_query.filter_by.return_value.update.return_value = 3
            mock_db.session.query.side_effect = [upload_query, segment_query]
            result = DocumentService.update_document_with_dataset_id(dataset, document_data, account_context)
            assert result is document
            assert document.dataset_process_rule_id == "rule-2"
            assert document.data_source_type == "upload_file"
            assert document.data_source_info == '{"upload_file_id": "file-1"}'
            assert document.name == "Renamed document"
            assert document.indexing_status == "waiting"
            assert document.completed_at is None
            assert document.processing_started_at is None
            assert document.parsing_completed_at is None
            assert document.cleaning_completed_at is None
            assert document.splitting_completed_at is None
            assert document.updated_at == "now"
            assert document.created_from == "web"
            assert document.doc_form == IndexStructureType.QA_INDEX
            assert mock_db.session.commit.call_count == 3
            segment_query.filter_by.return_value.update.assert_called_once()
            update_task.delay.assert_called_once_with(document.dataset_id, document.id)
def test_update_document_with_dataset_id_notion_import_requires_binding(self, account_context):
    """Switching a document to a notion_import source must fail when no data source binding exists.

    The service resolves the Notion workspace binding via ``db.session.query``;
    here the query chain is mocked to return ``None`` so the lookup fails.
    """
    dataset = SimpleNamespace(id="dataset-1", tenant_id="tenant-1")
    document = SimpleNamespace(display_status="available", id="doc-1", dataset_id="dataset-1")
    document_data = KnowledgeConfig(
        original_document_id="doc-1",
        indexing_technique="economy",
        data_source=DataSource(
            info_list=InfoList(
                data_source_type="notion_import",
                notion_info_list=[
                    NotionInfo(
                        credential_id="credential-1",
                        workspace_id="workspace-1",
                        pages=[NotionPage(page_id="page-1", page_name="Page 1", page_icon=None, type="page")],
                    )
                ],
            )
        ),
    )
    with (
        patch.object(DocumentService, "get_document", return_value=document),
        patch.object(DatasetService, "check_dataset_model_setting"),
        patch("services.dataset_service.db") as mock_db,
    ):
        # Simulate a missing binding: the where(...).first() chain yields None.
        binding_query = MagicMock()
        binding_query.where.return_value.first.return_value = None
        mock_db.session.query.return_value = binding_query
        with pytest.raises(ValueError, match="Data source binding not found"):
            DocumentService.update_document_with_dataset_id(dataset, document_data, account_context)
def test_update_document_with_dataset_id_website_crawl_updates_segments_and_dispatches_task(self, account_context):
    """A website_crawl update rewrites the data source info, re-enables segments, and re-dispatches indexing.

    Verifies the serialized crawl payload, the segment bulk update, and that the
    async update task is queued with the document's dataset/document ids.
    """
    dataset = SimpleNamespace(id="dataset-1", tenant_id="tenant-1")
    document = _make_document()
    document_data = KnowledgeConfig(
        original_document_id="doc-1",
        indexing_technique="economy",
        data_source=DataSource(
            info_list=InfoList(
                data_source_type="website_crawl",
                website_info_list=WebsiteInfo(
                    provider="firecrawl",
                    job_id="job-1",
                    urls=["https://example.com"],
                    only_main_content=False,
                ),
            )
        ),
        doc_form=IndexStructureType.PARENT_CHILD_INDEX,
    )
    with (
        patch.object(DocumentService, "get_document", return_value=document),
        patch.object(DatasetService, "check_dataset_model_setting"),
        patch("services.dataset_service.db") as mock_db,
        patch("services.dataset_service.naive_utc_now", return_value="now"),
        patch("services.dataset_service.document_indexing_update_task") as update_task,
    ):
        # Segment query reports two rows updated by the bulk re-enable.
        segment_query = MagicMock()
        segment_query.filter_by.return_value.update.return_value = 2
        mock_db.session.query.return_value = segment_query
        result = DocumentService.update_document_with_dataset_id(dataset, document_data, account_context)
        assert result is document
        assert document.data_source_type == "website_crawl"
        # The crawl payload is serialized as JSON with a fixed key order.
        assert document.data_source_info == (
            '{"url": "https://example.com", "provider": "firecrawl", "job_id": "job-1", '
            '"only_main_content": false, "mode": "crawl"}'
        )
        # NOTE(review): the service appears to blank the name for crawl updates — confirm intended.
        assert document.name == ""
        assert document.doc_form == IndexStructureType.PARENT_CHILD_INDEX
        segment_query.filter_by.return_value.update.assert_called_once()
        update_task.delay.assert_called_once_with("dataset-1", "doc-1")
class TestDocumentServiceCreateValidation:
    """Unit tests for document creation validation helpers."""

    def test_document_create_args_validate_requires_data_source_or_process_rule(self):
        """A knowledge config with neither data source nor process rule is rejected."""
        config = SimpleNamespace(data_source=None, process_rule=None)
        with pytest.raises(ValueError, match="Data source or Process rule is required"):
            DocumentService.document_create_args_validate(config)

    def test_document_create_args_validate_delegates_to_sub_validators(self):
        """Top-level validation forwards the config to both sub-validators exactly once."""
        config = SimpleNamespace(data_source=object(), process_rule=object())
        with (
            patch.object(DocumentService, "data_source_args_validate") as data_source_validator,
            patch.object(DocumentService, "process_rule_args_validate") as process_rule_validator,
        ):
            DocumentService.document_create_args_validate(config)
            data_source_validator.assert_called_once_with(config)
            process_rule_validator.assert_called_once_with(config)

    def test_data_source_args_validate_rejects_invalid_type(self):
        """An unrecognized data source type raises a ValueError."""
        info_list = SimpleNamespace(
            data_source_type="bad-source",
            file_info_list=None,
            notion_info_list=None,
            website_info_list=None,
        )
        config = SimpleNamespace(data_source=SimpleNamespace(info_list=info_list))
        with pytest.raises(ValueError, match="Data source type is invalid"):
            DocumentService.data_source_args_validate(config)

    @pytest.mark.parametrize(
        ("data_source_type", "field_name", "message"),
        [
            ("upload_file", "file_info_list", "File source info is required"),
            ("notion_import", "notion_info_list", "Notion source info is required"),
            ("website_crawl", "website_info_list", "Website source info is required"),
        ],
    )
    def test_data_source_args_validate_requires_source_specific_info(self, data_source_type, field_name, message):
        """Each data source type demands its matching info payload; clearing it must raise."""
        # Populate every info field, then blank out only the one under test.
        info_fields = {name: object() for name in ("file_info_list", "notion_info_list", "website_info_list")}
        info_fields[field_name] = None
        info_list = SimpleNamespace(data_source_type=data_source_type, **info_fields)
        config = SimpleNamespace(data_source=SimpleNamespace(info_list=info_list))
        with pytest.raises(ValueError, match=message):
            DocumentService.data_source_args_validate(config)

    def test_process_rule_args_validate_clears_rules_for_automatic_mode(self):
        """Automatic mode discards any caller-supplied rules."""
        config = KnowledgeConfig(
            indexing_technique="economy",
            data_source=DataSource(
                info_list=InfoList(
                    data_source_type="upload_file",
                    file_info_list=FileInfo(file_ids=["file-1"]),
                )
            ),
            process_rule=ProcessRule(
                mode="automatic",
                rules=Rule(
                    pre_processing_rules=[PreProcessingRule(id="remove_stopwords", enabled=True)],
                    segmentation=Segmentation(separator="\n", max_tokens=128),
                ),
            ),
        )
        DocumentService.process_rule_args_validate(config)
        assert config.process_rule is not None
        assert config.process_rule.rules is None

    def test_process_rule_args_validate_deduplicates_rules_and_skips_max_tokens_for_full_doc_hierarchical(self):
        """Duplicate pre-processing rules collapse to one entry; full-doc hierarchical mode tolerates max_tokens=0."""
        config = KnowledgeConfig(
            indexing_technique="economy",
            data_source=DataSource(
                info_list=InfoList(
                    data_source_type="upload_file",
                    file_info_list=FileInfo(file_ids=["file-1"]),
                )
            ),
            process_rule=ProcessRule(
                mode="hierarchical",
                rules=Rule(
                    pre_processing_rules=[
                        PreProcessingRule(id="remove_stopwords", enabled=True),
                        PreProcessingRule(id="remove_stopwords", enabled=False),
                    ],
                    # max_tokens=0 would normally be invalid; full-doc parent mode skips that check.
                    segmentation=Segmentation(separator="\n", max_tokens=0),
                    parent_mode="full-doc",
                ),
            ),
        )
        DocumentService.process_rule_args_validate(config)
        process_rule = config.process_rule
        assert process_rule is not None
        assert process_rule.rules is not None
        deduped = process_rule.rules.pre_processing_rules
        assert len(deduped) == 1
        # The later duplicate (enabled=False) is the survivor.
        assert deduped[0].enabled is False
class TestDocumentServiceSaveDocumentWithDatasetId:
    """Unit tests for non-SQL validation branches in save_document_with_dataset_id."""

    @pytest.fixture
    def account_context(self):
        """Provide a mocked current user and neutralize doc-form checking for each test."""
        account = create_autospec(Account, instance=True)
        account.id = "user-1"
        account.current_tenant_id = "tenant-1"
        with (
            patch("services.dataset_service.current_user", account),
            patch.object(DatasetService, "check_doc_form"),
        ):
            yield account

    def test_save_document_with_dataset_id_requires_file_info_for_upload_source(self, account_context):
        """Upload-file sources without a file info list are rejected."""
        dataset = _make_dataset()
        knowledge_config = _make_upload_knowledge_config(file_ids=None)
        with patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=True)):
            with pytest.raises(ValueError, match="File source info is required"):
                DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account_context)

    def test_save_document_with_dataset_id_blocks_batch_upload_for_sandbox_plan(self, account_context):
        """Sandbox-plan tenants may not batch-upload; the quota check must never run."""
        dataset = _make_dataset()
        knowledge_config = _make_upload_knowledge_config(file_ids=["file-1", "file-2"])
        with (
            patch(
                "services.dataset_service.FeatureService.get_features",
                return_value=_make_features(enabled=True, plan=CloudPlan.SANDBOX),
            ),
            patch.object(DocumentService, "check_documents_upload_quota") as check_quota,
        ):
            with pytest.raises(ValueError, match="does not support batch upload"):
                DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account_context)
            check_quota.assert_not_called()

    def test_save_document_with_dataset_id_enforces_batch_upload_limit(self, account_context):
        """Uploads exceeding BATCH_UPLOAD_LIMIT fail before any quota check."""
        dataset = _make_dataset()
        knowledge_config = _make_upload_knowledge_config(file_ids=["file-1", "file-2"])
        with (
            patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=True)),
            patch("services.dataset_service.dify_config.BATCH_UPLOAD_LIMIT", 1),
            patch.object(DocumentService, "check_documents_upload_quota") as check_quota,
        ):
            with pytest.raises(ValueError, match="batch upload limit of 1"):
                DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account_context)
            check_quota.assert_not_called()

    def test_save_document_with_dataset_id_updates_existing_document_and_data_source_type(self, account_context):
        """An original_document_id routes to update_document_with_dataset_id and backfills the dataset source type."""
        dataset = _make_dataset(data_source_type=None)
        knowledge_config = _make_upload_knowledge_config(original_document_id="doc-1", file_ids=["file-1"])
        updated_document = _make_document(document_id="doc-1", batch="batch-existing")
        with (
            patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=False)),
            patch.object(
                DocumentService, "update_document_with_dataset_id", return_value=updated_document
            ) as update_document,
        ):
            documents, batch = DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account_context)
            assert dataset.data_source_type == "upload_file"
            assert documents == [updated_document]
            # The batch of the existing document is reused rather than regenerated.
            assert batch == "batch-existing"
            update_document.assert_called_once_with(dataset, knowledge_config, account_context)

    def test_save_document_with_dataset_id_requires_data_source_for_new_documents(self, account_context):
        """Creating new documents (no original id) without a data source is an error."""
        dataset = _make_dataset()
        knowledge_config = _make_upload_knowledge_config(data_source=None)
        with patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=False)):
            with pytest.raises(ValueError, match="Data source is required when creating new documents"):
                DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account_context)

    def test_save_document_with_dataset_id_requires_existing_process_rule_for_custom_mode(self, account_context):
        """Custom mode with no latest process rule on the dataset must fail."""
        dataset = _make_dataset(latest_process_rule=None)
        knowledge_config = _make_upload_knowledge_config(
            file_ids=["file-1"],
            process_rule=ProcessRule(mode="custom"),
        )
        with patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=False)):
            with pytest.raises(ValueError, match="No process rule found"):
                DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account_context)

    def test_save_document_with_dataset_id_rejects_invalid_indexing_technique(self, account_context):
        """An unknown indexing technique on a dataset without one is rejected."""
        dataset = _make_dataset(indexing_technique=None)
        knowledge_config = SimpleNamespace(
            doc_form=IndexStructureType.PARAGRAPH_INDEX,
            original_document_id=None,
            data_source=None,
            indexing_technique="broken-technique",
        )
        with patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=False)):
            with pytest.raises(ValueError, match="Indexing technique is invalid"):
                DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account_context)

    def test_save_document_with_dataset_id_returns_empty_for_invalid_process_rule_mode(self, account_context):
        """An unsupported process-rule mode yields no documents and an empty batch rather than raising."""
        dataset = _make_dataset()
        knowledge_config = _make_upload_knowledge_config(file_ids=["file-1"])
        knowledge_config.process_rule = SimpleNamespace(mode="unsupported-mode", rules=None)
        with patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=False)):
            documents, batch = DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account_context)
            assert documents == []
            assert batch == ""

    def test_save_document_with_dataset_id_upload_file_creates_and_reindexes_documents(self, account_context):
        """Upload-file saves create new documents and re-queue duplicates for indexing.

        ``db.session.query`` is consumed twice in order: first the upload-file
        lookup, then the existing-documents (duplicate) lookup.
        """
        dataset = _make_dataset()
        dataset_process_rule = SimpleNamespace(id="rule-1")
        knowledge_config = _make_upload_knowledge_config(file_ids=["file-1", "file-2"])
        duplicate_document = _make_document(document_id="doc-duplicate", name="existing.txt")
        created_document = _make_document(document_id="doc-created", name="new.txt")
        upload_file_a = SimpleNamespace(id="file-1", name="existing.txt")
        upload_file_b = SimpleNamespace(id="file-2", name="new.txt")
        with (
            patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=False)),
            patch("services.dataset_service.redis_client") as mock_redis,
            patch("services.dataset_service.db") as mock_db,
            patch.object(DocumentService, "get_documents_position", return_value=4),
            patch.object(DocumentService, "build_document", return_value=created_document) as build_document,
            patch("services.dataset_service.DocumentIndexingTaskProxy") as document_proxy_cls,
            patch("services.dataset_service.DuplicateDocumentIndexingTaskProxy") as duplicate_proxy_cls,
            patch("services.dataset_service.naive_utc_now", return_value="now"),
            # Frozen time + randbelow make the generated batch id deterministic.
            patch("services.dataset_service.time.strftime", return_value="20260101010101"),
            patch("services.dataset_service.secrets.randbelow", return_value=23),
        ):
            mock_redis.lock.return_value = _make_lock_context()
            upload_query = MagicMock()
            upload_query.where.return_value.all.return_value = [upload_file_a, upload_file_b]
            existing_documents_query = MagicMock()
            existing_documents_query.where.return_value.all.return_value = [duplicate_document]
            mock_db.session.query.side_effect = [upload_query, existing_documents_query]
            documents, batch = DocumentService.save_document_with_dataset_id(
                dataset,
                knowledge_config,
                account_context,
                dataset_process_rule=dataset_process_rule,
            )
            assert documents == [duplicate_document, created_document]
            # batch = strftime result + (randbelow(...) + 100000) per the mocked values.
            assert batch == "20260101010101100023"
            assert duplicate_document.dataset_process_rule_id == "rule-1"
            assert duplicate_document.updated_at == "now"
            assert duplicate_document.batch == batch
            assert duplicate_document.indexing_status == "waiting"
            build_document.assert_called_once_with(
                dataset,
                "rule-1",
                "upload_file",
                IndexStructureType.PARAGRAPH_INDEX,
                "English",
                {"upload_file_id": "file-2"},
                "web",
                4,
                account_context,
                "new.txt",
                batch,
            )
            # New documents go to the standard indexing proxy, duplicates to the duplicate proxy.
            document_proxy_cls.assert_called_once_with(dataset.tenant_id, dataset.id, ["doc-created"])
            document_proxy_cls.return_value.delay.assert_called_once()
            duplicate_proxy_cls.assert_called_once_with(dataset.tenant_id, dataset.id, ["doc-duplicate"])
            duplicate_proxy_cls.return_value.delay.assert_called_once()

    def test_save_document_with_dataset_id_notion_import_truncates_names_and_cleans_removed_pages(
        self, account_context
    ):
        """Notion imports truncate over-long page names to 255 chars and clean up documents for removed pages."""
        dataset = _make_dataset()
        dataset_process_rule = SimpleNamespace(id="rule-1")
        # 300-char name: longer than the 255-char document-name limit.
        notion_page_name = "a" * 300
        knowledge_config = KnowledgeConfig(
            indexing_technique="economy",
            data_source=DataSource(
                info_list=InfoList(
                    data_source_type="notion_import",
                    notion_info_list=[
                        NotionInfo(
                            credential_id="credential-1",
                            workspace_id="workspace-1",
                            pages=[
                                NotionPage(page_id="page-keep", page_name="Keep page", type="page"),
                                NotionPage(
                                    page_id="page-new",
                                    page_name=notion_page_name,
                                    page_icon=NotionIcon(type="emoji", emoji="page"),
                                    type="page",
                                ),
                            ],
                        )
                    ],
                )
            ),
            doc_form=IndexStructureType.PARAGRAPH_INDEX,
            doc_language="English",
        )
        # One existing doc stays (page-keep), one is orphaned (page-remove) and must be cleaned.
        existing_keep = _make_document(document_id="doc-keep")
        existing_keep.data_source_info = json.dumps({"notion_page_id": "page-keep"})
        existing_remove = _make_document(document_id="doc-remove")
        existing_remove.data_source_info = json.dumps({"notion_page_id": "page-remove"})
        created_document = _make_document(document_id="doc-new")
        with (
            patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=False)),
            patch("services.dataset_service.redis_client") as mock_redis,
            patch("services.dataset_service.db") as mock_db,
            patch.object(DocumentService, "get_documents_position", return_value=1),
            patch.object(DocumentService, "build_document", return_value=created_document) as build_document,
            patch("services.dataset_service.clean_notion_document_task") as clean_task,
            patch("services.dataset_service.DocumentIndexingTaskProxy") as document_proxy_cls,
        ):
            mock_redis.lock.return_value = _make_lock_context()
            notion_documents_query = MagicMock()
            notion_documents_query.filter_by.return_value.all.return_value = [existing_keep, existing_remove]
            mock_db.session.query.return_value = notion_documents_query
            documents, _ = DocumentService.save_document_with_dataset_id(
                dataset,
                knowledge_config,
                account_context,
                dataset_process_rule=dataset_process_rule,
            )
            assert created_document in documents
            # args[9] is the document name positional argument of build_document.
            assert len(build_document.call_args.args[9]) == 255
            clean_task.delay.assert_called_once_with(["doc-remove"], dataset.id)
            document_proxy_cls.assert_called_once_with(dataset.tenant_id, dataset.id, ["doc-new"])
            document_proxy_cls.return_value.delay.assert_called_once()

    def test_save_document_with_dataset_id_website_crawl_truncates_long_urls(self, account_context):
        """Website crawls truncate URLs used as document names to 200 chars plus an ellipsis."""
        dataset = _make_dataset()
        dataset_process_rule = SimpleNamespace(id="rule-1")
        long_url = "https://example.com/" + ("a" * 260)
        short_url = "https://example.com/short"
        knowledge_config = KnowledgeConfig(
            indexing_technique="economy",
            data_source=DataSource(
                info_list=InfoList(
                    data_source_type="website_crawl",
                    website_info_list=WebsiteInfo(
                        provider="firecrawl",
                        job_id="job-1",
                        urls=[long_url, short_url],
                        only_main_content=True,
                    ),
                )
            ),
            doc_form=IndexStructureType.PARAGRAPH_INDEX,
            doc_language="English",
        )
        first_document = _make_document(document_id="doc-1")
        second_document = _make_document(document_id="doc-2")
        with (
            patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=False)),
            patch("services.dataset_service.redis_client") as mock_redis,
            patch("services.dataset_service.db") as mock_db,
            patch.object(DocumentService, "get_documents_position", return_value=2),
            patch.object(
                DocumentService,
                "build_document",
                side_effect=[first_document, second_document],
            ) as build_document,
            patch("services.dataset_service.DocumentIndexingTaskProxy") as document_proxy_cls,
        ):
            mock_redis.lock.return_value = _make_lock_context()
            documents, _ = DocumentService.save_document_with_dataset_id(
                dataset,
                knowledge_config,
                account_context,
                dataset_process_rule=dataset_process_rule,
            )
            assert documents == [first_document, second_document]
            # args[9] is the document name: long URLs are cut at 200 chars + "...", short ones pass through.
            assert build_document.call_args_list[0].args[9] == long_url[:200] + "..."
            assert build_document.call_args_list[1].args[9] == short_url
            document_proxy_cls.assert_called_once_with(dataset.tenant_id, dataset.id, ["doc-1", "doc-2"])
            document_proxy_cls.return_value.delay.assert_called_once()
class TestDocumentServiceBatchUpdateStatus:
    """Unit tests for batch_update_document_status orchestration and helper branches."""

    def test_prepare_disable_update_requires_completed_document(self):
        """Disabling is only valid for documents whose indexing has completed."""
        document = _make_document(indexing_status="waiting")
        document.completed_at = None
        with pytest.raises(DocumentIndexingError, match="is not completed"):
            DocumentService._prepare_disable_update(document, user=SimpleNamespace(id="user-1"), now="now")

    def test_prepare_archive_update_sets_async_task_for_enabled_document(self):
        """Archiving an enabled document schedules a removal task and flags the cache."""
        document = _make_document(enabled=True, archived=False)
        result = DocumentService._prepare_archive_update(document, user=SimpleNamespace(id="user-1"), now="now")
        assert result is not None
        assert result["updates"]["archived"] is True
        assert result["set_cache"] is True
        # The async task operates on the document id.
        assert result["async_task"]["args"] == [document.id]

    def test_prepare_unarchive_update_sets_async_task_for_enabled_document(self):
        """Unarchiving an enabled document schedules a re-add task and flags the cache."""
        document = _make_document(enabled=True, archived=True)
        result = DocumentService._prepare_unarchive_update(document, now="now")
        assert result is not None
        assert result["updates"]["archived"] is False
        assert result["set_cache"] is True
        assert result["async_task"]["args"] == [document.id]

    def test_batch_update_document_status_rejects_indexing_documents(self):
        """A document with an active indexing cache entry cannot be batch-updated."""
        dataset = _make_dataset()
        document = _make_document(name="Busy document")
        with (
            patch.object(DocumentService, "get_document", return_value=document),
            patch("services.dataset_service.redis_client") as mock_redis,
            patch("services.dataset_service.db") as mock_db,
        ):
            # A truthy redis value marks the document as currently indexing.
            mock_redis.get.return_value = "1"
            with pytest.raises(DocumentIndexingError, match="Busy document is being indexed"):
                DocumentService.batch_update_document_status(
                    dataset, [document.id], "archive", SimpleNamespace(id="user-1")
                )
            mock_db.session.commit.assert_not_called()

    def test_batch_update_document_status_rolls_back_when_commit_fails(self):
        """A failing commit propagates and triggers a session rollback."""
        dataset = _make_dataset()
        document = _make_document(enabled=False)
        with (
            patch.object(DocumentService, "get_document", return_value=document),
            patch("services.dataset_service.redis_client") as mock_redis,
            patch("services.dataset_service.db") as mock_db,
        ):
            mock_redis.get.return_value = None
            mock_db.session.commit.side_effect = RuntimeError("commit failed")
            with pytest.raises(RuntimeError, match="commit failed"):
                DocumentService.batch_update_document_status(
                    dataset, [document.id], "enable", SimpleNamespace(id="user-1")
                )
            mock_db.session.rollback.assert_called_once()

    def test_batch_update_document_status_raises_async_task_error_after_commit(self):
        """An async-task dispatch failure surfaces after the commit already succeeded."""
        dataset = _make_dataset()
        document = _make_document(enabled=False)
        with (
            patch.object(DocumentService, "get_document", return_value=document),
            patch("services.dataset_service.redis_client") as mock_redis,
            patch("services.dataset_service.db") as mock_db,
            patch("services.dataset_service.add_document_to_index_task") as add_task,
        ):
            mock_redis.get.return_value = None
            add_task.delay.side_effect = RuntimeError("task failed")
            with pytest.raises(RuntimeError, match="task failed"):
                DocumentService.batch_update_document_status(
                    dataset, [document.id], "enable", SimpleNamespace(id="user-1")
                )
            # The commit happened and the indexing cache was set before the task blew up.
            mock_db.session.commit.assert_called_once()
            mock_redis.setex.assert_called_once_with(f"document_{document.id}_indexing", 600, 1)
class TestDocumentServiceTenantAndUpdateEdges:
    """Unit tests for tenant-count and update edge cases."""

    @pytest.fixture
    def account_context(self):
        """Provide a mocked current user bound to tenant-1 for each test."""
        account = create_autospec(Account, instance=True)
        account.id = "user-1"
        account.current_tenant_id = "tenant-1"
        with patch("services.dataset_service.current_user", account):
            yield account

    def test_get_tenant_documents_count_returns_query_count(self, account_context):
        """The tenant document count is the count() of the filtered query."""
        with patch("services.dataset_service.db") as mock_db:
            mock_db.session.query.return_value.where.return_value.count.return_value = 12
            result = DocumentService.get_tenant_documents_count()
            assert result == 12
            mock_db.session.query.return_value.where.return_value.count.assert_called_once()

    def test_update_document_with_dataset_id_uses_automatic_process_rule_payload(self, account_context):
        """Automatic mode persists a DatasetProcessRule built from AUTOMATIC_RULES, not the caller's rules."""
        dataset = SimpleNamespace(id="dataset-1", tenant_id="tenant-1")
        document = _make_document()
        document_data = KnowledgeConfig(
            original_document_id="doc-1",
            indexing_technique="economy",
            data_source=DataSource(
                info_list=InfoList(
                    data_source_type="upload_file",
                    file_info_list=FileInfo(file_ids=["file-1"]),
                )
            ),
            process_rule=ProcessRule(
                mode="automatic",
                rules=Rule(
                    pre_processing_rules=[PreProcessingRule(id="remove_stopwords", enabled=True)],
                    segmentation=Segmentation(separator="\n", max_tokens=128),
                ),
            ),
            doc_form=IndexStructureType.PARAGRAPH_INDEX,
        )
        created_process_rule = SimpleNamespace(id="rule-2")
        with (
            patch.object(DocumentService, "get_document", return_value=document),
            patch("services.dataset_service.DatasetProcessRule") as process_rule_cls,
            patch.object(DatasetService, "check_dataset_model_setting"),
            patch("services.dataset_service.db") as mock_db,
            patch("services.dataset_service.naive_utc_now", return_value="now"),
            patch("services.dataset_service.document_indexing_update_task") as update_task,
        ):
            # The patched class still needs the real AUTOMATIC_RULES constant.
            process_rule_cls.AUTOMATIC_RULES = DatasetProcessRule.AUTOMATIC_RULES
            process_rule_cls.return_value = created_process_rule
            upload_query = MagicMock()
            upload_query.where.return_value.first.return_value = SimpleNamespace(id="file-1", name="upload.txt")
            segment_query = MagicMock()
            segment_query.filter_by.return_value.update.return_value = 1
            # Query order: upload-file lookup first, then the segment bulk update.
            mock_db.session.query.side_effect = [upload_query, segment_query]
            result = DocumentService.update_document_with_dataset_id(dataset, document_data, account_context)
            assert result is document
            assert document.dataset_process_rule_id == "rule-2"
            assert document.name == "upload.txt"
            assert process_rule_cls.call_args.kwargs == {
                "dataset_id": "dataset-1",
                "mode": "automatic",
                "rules": json.dumps(DatasetProcessRule.AUTOMATIC_RULES),
                "created_by": "user-1",
            }
            assert mock_db.session.commit.call_count == 3
            update_task.delay.assert_called_once_with("dataset-1", "doc-1")

    def test_update_document_with_dataset_id_requires_upload_file_info(self, account_context):
        """An upload_file update with no file info list is rejected."""
        dataset = SimpleNamespace(id="dataset-1", tenant_id="tenant-1")
        document_data = KnowledgeConfig(
            original_document_id="doc-1",
            indexing_technique="economy",
            data_source=DataSource(info_list=InfoList(data_source_type="upload_file")),
        )
        with (
            patch.object(DocumentService, "get_document", return_value=_make_document()),
            patch.object(DatasetService, "check_dataset_model_setting"),
        ):
            with pytest.raises(ValueError, match="No file info list found"):
                DocumentService.update_document_with_dataset_id(dataset, document_data, account_context)

    def test_update_document_with_dataset_id_raises_when_upload_file_is_missing(self, account_context):
        """Referencing a non-existent upload file raises FileNotExistsError."""
        dataset = SimpleNamespace(id="dataset-1", tenant_id="tenant-1")
        document_data = KnowledgeConfig(
            original_document_id="doc-1",
            indexing_technique="economy",
            data_source=DataSource(
                info_list=InfoList(
                    data_source_type="upload_file",
                    file_info_list=FileInfo(file_ids=["file-1"]),
                )
            ),
        )
        with (
            patch.object(DocumentService, "get_document", return_value=_make_document()),
            patch.object(DatasetService, "check_dataset_model_setting"),
            patch("services.dataset_service.db") as mock_db,
        ):
            # Upload-file lookup resolves to None -> missing file.
            mock_db.session.query.return_value.where.return_value.first.return_value = None
            with pytest.raises(FileNotExistsError):
                DocumentService.update_document_with_dataset_id(dataset, document_data, account_context)

    def test_update_document_with_dataset_id_requires_notion_info_list(self, account_context):
        """A notion_import update with no notion info list is rejected."""
        dataset = SimpleNamespace(id="dataset-1", tenant_id="tenant-1")
        document_data = KnowledgeConfig(
            original_document_id="doc-1",
            indexing_technique="economy",
            data_source=DataSource(info_list=InfoList(data_source_type="notion_import")),
        )
        with (
            patch.object(DocumentService, "get_document", return_value=_make_document()),
            patch.object(DatasetService, "check_dataset_model_setting"),
        ):
            with pytest.raises(ValueError, match="No notion info list found"):
                DocumentService.update_document_with_dataset_id(dataset, document_data, account_context)

    def test_update_document_with_dataset_id_notion_import_updates_page_info(self, account_context):
        """A notion_import update serializes page info into data_source_info and re-dispatches indexing.

        With multiple pages the stored payload reflects the last page processed
        (page-2, a database) per the observed assertion below.
        """
        dataset = SimpleNamespace(id="dataset-1", tenant_id="tenant-1")
        document = _make_document()
        document_data = KnowledgeConfig(
            original_document_id="doc-1",
            indexing_technique="economy",
            data_source=DataSource(
                info_list=InfoList(
                    data_source_type="notion_import",
                    notion_info_list=[
                        NotionInfo(
                            credential_id="credential-1",
                            workspace_id="workspace-1",
                            pages=[
                                NotionPage(page_id="page-1", page_name="Page 1", page_icon=None, type="page"),
                                NotionPage(page_id="page-2", page_name="Page 2", page_icon=None, type="database"),
                            ],
                        )
                    ],
                )
            ),
            doc_form=IndexStructureType.PARAGRAPH_INDEX,
        )
        with (
            patch.object(DocumentService, "get_document", return_value=document),
            patch.object(DatasetService, "check_dataset_model_setting"),
            patch("services.dataset_service.db") as mock_db,
            patch("services.dataset_service.naive_utc_now", return_value="now"),
            patch("services.dataset_service.document_indexing_update_task") as update_task,
        ):
            binding_query = MagicMock()
            binding_query.where.return_value.first.return_value = SimpleNamespace(id="binding-1")
            segment_query = MagicMock()
            segment_query.filter_by.return_value.update.return_value = 1
            # Query order: binding lookup first, then segment bulk update.
            mock_db.session.query.side_effect = [binding_query, segment_query]
            result = DocumentService.update_document_with_dataset_id(dataset, document_data, account_context)
            assert result is document
            assert document.data_source_type == "notion_import"
            assert document.name == ""
            assert document.data_source_info == json.dumps(
                {
                    "credential_id": "credential-1",
                    "notion_workspace_id": "workspace-1",
                    "notion_page_id": "page-2",
                    "notion_page_icon": None,
                    "type": "database",
                }
            )
            update_task.delay.assert_called_once_with("dataset-1", "doc-1")
class TestDocumentServiceSaveWithoutDatasetBilling:
    """Unit tests for batch-count and quota branches in save_document_without_dataset_id."""

    @pytest.fixture
    def account_context(self):
        """Provide a mocked current user bound to tenant-1 for each test."""
        account = create_autospec(Account, instance=True)
        account.id = "user-1"
        account.current_tenant_id = "tenant-1"
        with patch("services.dataset_service.current_user", account):
            yield account

    def test_save_document_without_dataset_id_counts_notion_pages_for_quota(self, account_context):
        """The upload quota is checked against the total page count across all notion infos (2 + 1 = 3)."""
        knowledge_config = KnowledgeConfig(
            indexing_technique="economy",
            data_source=DataSource(
                info_list=InfoList(
                    data_source_type="notion_import",
                    notion_info_list=[
                        NotionInfo(
                            credential_id="credential-1",
                            workspace_id="workspace-1",
                            pages=[
                                NotionPage(page_id="page-1", page_name="Page 1", page_icon=None, type="page"),
                                NotionPage(page_id="page-2", page_name="Page 2", page_icon=None, type="page"),
                            ],
                        ),
                        NotionInfo(
                            credential_id="credential-2",
                            workspace_id="workspace-2",
                            pages=[NotionPage(page_id="page-3", page_name="Page 3", page_icon=None, type="page")],
                        ),
                    ],
                )
            ),
        )
        created_dataset = SimpleNamespace(id="dataset-1", tenant_id="tenant-1", name="", description=None)
        features = _make_features(enabled=True)
        with (
            patch("services.dataset_service.FeatureService.get_features", return_value=features),
            # Limit patched as a string; the service presumably coerces it — note for review.
            patch("services.dataset_service.dify_config.BATCH_UPLOAD_LIMIT", "10"),
            patch.object(DocumentService, "check_documents_upload_quota") as check_quota,
            patch(
                "services.dataset_service.Dataset",
                # Fake constructor: apply kwargs onto the shared namespace and return it.
                side_effect=lambda **kwargs: created_dataset.__dict__.update(kwargs) or created_dataset,
            ),
            patch.object(
                DocumentService,
                "save_document_with_dataset_id",
                return_value=([SimpleNamespace(name="Doc")], "batch-1"),
            ),
            patch("services.dataset_service.db"),
        ):
            DocumentService.save_document_without_dataset_id("tenant-1", knowledge_config, account_context)
            check_quota.assert_called_once_with(3, features)

    def test_save_document_without_dataset_id_enforces_batch_limit_for_website_urls(self, account_context):
        """Two crawl URLs against a batch limit of 1 fail before the quota check runs."""
        knowledge_config = KnowledgeConfig(
            indexing_technique="economy",
            data_source=DataSource(
                info_list=InfoList(
                    data_source_type="website_crawl",
                    website_info_list=WebsiteInfo(
                        provider="firecrawl",
                        job_id="job-1",
                        urls=["https://example.com/a", "https://example.com/b"],
                        only_main_content=True,
                    ),
                )
            ),
        )
        with (
            patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=True)),
            patch("services.dataset_service.dify_config.BATCH_UPLOAD_LIMIT", "1"),
            patch.object(DocumentService, "check_documents_upload_quota") as check_quota,
        ):
            with pytest.raises(ValueError, match="batch upload limit of 1"):
                DocumentService.save_document_without_dataset_id("tenant-1", knowledge_config, account_context)
            check_quota.assert_not_called()
class TestDocumentServiceEstimateValidation:
    """Unit tests for estimate_args_validate branches."""

    @staticmethod
    def _custom_process_rule_args(pre_processing_rules, **extra_rule_keys):
        """Build estimate args with a custom-mode process rule around the given pre-processing rules."""
        process_rule = {
            "mode": "custom",
            "rules": {
                "pre_processing_rules": pre_processing_rules,
                "segmentation": {"separator": "\n", "max_tokens": 128},
            },
        }
        process_rule.update(extra_rule_keys)
        return {"info_list": {"data_source_type": "upload_file"}, "process_rule": process_rule}

    def test_estimate_args_validate_rejects_missing_info_list(self):
        """An empty args dict is rejected: data source info is mandatory."""
        with pytest.raises(ValueError, match="Data source info is required"):
            DocumentService.estimate_args_validate({})

    def test_estimate_args_validate_sets_empty_rules_for_automatic_mode(self):
        """Automatic mode replaces any caller-supplied rules with an empty dict."""
        args = {
            "info_list": {"data_source_type": "upload_file"},
            "process_rule": {"mode": "automatic", "rules": {"ignored": True}},
        }
        DocumentService.estimate_args_validate(args)
        assert args["process_rule"]["rules"] == {}

    def test_estimate_args_validate_rejects_unknown_pre_processing_rule_id(self):
        """An unrecognized pre-processing rule id raises a ValueError."""
        args = self._custom_process_rule_args([{"id": "unknown", "enabled": True}])
        with pytest.raises(ValueError, match="pre_processing_rules id is invalid"):
            DocumentService.estimate_args_validate(args)

    def test_estimate_args_validate_deduplicates_rules_for_custom_mode(self):
        """Duplicate rule ids collapse to a single entry; the later duplicate wins."""
        args = self._custom_process_rule_args(
            [
                {"id": "remove_stopwords", "enabled": True},
                {"id": "remove_stopwords", "enabled": False},
            ]
        )
        DocumentService.estimate_args_validate(args)
        assert args["process_rule"]["rules"]["pre_processing_rules"] == [{"id": "remove_stopwords", "enabled": False}]

    def test_estimate_args_validate_requires_summary_index_provider_name(self):
        """Enabling the summary index without a model provider name is rejected."""
        args = self._custom_process_rule_args(
            [{"id": "remove_stopwords", "enabled": True}],
            summary_index_setting={"enable": True, "model_name": "summary-model"},
        )
        with pytest.raises(ValueError, match="Summary index model provider name is required"):
            DocumentService.estimate_args_validate(args)
class TestDocumentServiceSaveDocumentAdditionalBranches:
"""Additional unit tests for dataset bootstrap and process-rule branches."""
@pytest.fixture
def account_context(self):
account = create_autospec(Account, instance=True)
account.id = "user-1"
account.current_tenant_id = "tenant-1"
with (
patch("services.dataset_service.current_user", account),
patch.object(DatasetService, "check_doc_form"),
):
yield account
def test_save_document_with_dataset_id_initializes_high_quality_dataset_from_default_embedding_model(
self, account_context
):
dataset = _make_dataset(data_source_type=None, indexing_technique=None)
knowledge_config = _make_upload_knowledge_config(original_document_id="doc-1", file_ids=["file-1"])
knowledge_config.indexing_technique = "high_quality"
knowledge_config.embedding_model = None
knowledge_config.embedding_model_provider = None
updated_document = _make_document(batch="batch-existing")
with (
patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=False)),
patch("services.dataset_service.ModelManager") as model_manager_cls,
patch(
"services.dataset_service.DatasetCollectionBindingService.get_dataset_collection_binding",
return_value=SimpleNamespace(id="binding-1"),
) as get_binding,
patch.object(DocumentService, "update_document_with_dataset_id", return_value=updated_document),
):
model_manager_cls.return_value.get_default_model_instance.return_value = SimpleNamespace(
model_name="default-embedding",
provider="default-provider",
)
documents, batch = DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account_context)
assert documents == [updated_document]
assert batch == "batch-existing"
assert dataset.data_source_type == "upload_file"
assert dataset.indexing_technique == "high_quality"
assert dataset.embedding_model == "default-embedding"
assert dataset.embedding_model_provider == "default-provider"
assert dataset.collection_binding_id == "binding-1"
assert dataset.retrieval_model == {
"search_method": "semantic_search",
"reranking_enable": False,
"reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""},
"top_k": 4,
"score_threshold_enabled": False,
}
get_binding.assert_called_once_with("default-provider", "default-embedding")
def test_save_document_with_dataset_id_uses_explicit_embedding_and_retrieval_model(self, account_context):
dataset = _make_dataset(indexing_technique=None)
knowledge_config = _make_upload_knowledge_config(original_document_id="doc-1", file_ids=["file-1"])
knowledge_config.indexing_technique = "high_quality"
knowledge_config.embedding_model = "explicit-model"
knowledge_config.embedding_model_provider = "explicit-provider"
knowledge_config.retrieval_model = RetrievalModel(
search_method="semantic_search",
reranking_enable=True,
reranking_model=RerankingModel(
reranking_provider_name="rerank-provider",
reranking_model_name="rerank-model",
),
top_k=7,
score_threshold_enabled=True,
score_threshold=0.3,
)
with (
patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=False)),
patch("services.dataset_service.ModelManager") as model_manager_cls,
patch(
"services.dataset_service.DatasetCollectionBindingService.get_dataset_collection_binding",
return_value=SimpleNamespace(id="binding-2"),
) as get_binding,
patch.object(DocumentService, "update_document_with_dataset_id", return_value=_make_document()),
):
DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account_context)
model_manager_cls.return_value.get_default_model_instance.assert_not_called()
get_binding.assert_called_once_with("explicit-provider", "explicit-model")
assert dataset.embedding_model == "explicit-model"
assert dataset.embedding_model_provider == "explicit-provider"
assert dataset.retrieval_model == knowledge_config.retrieval_model.model_dump()
    def test_save_document_with_dataset_id_creates_custom_process_rule_for_new_upload_document(self, account_context):
        """A new upload document persists a custom DatasetProcessRule and enqueues indexing.

        The service issues two queries (upload-file lookup, then existing documents);
        they are satisfied in call order via ``side_effect``. The batch id is made
        deterministic by freezing ``time.strftime`` and ``secrets.randbelow``.
        """
        dataset = _make_dataset()
        knowledge_config = _make_upload_knowledge_config(
            file_ids=["file-1"],
            process_rule=ProcessRule(
                mode="custom",
                rules=Rule(
                    pre_processing_rules=[PreProcessingRule(id="remove_stopwords", enabled=True)],
                    segmentation=Segmentation(separator="\n", max_tokens=128),
                ),
            ),
        )
        created_process_rule = SimpleNamespace(id="rule-custom")
        created_document = _make_document(document_id="doc-created", name="file.txt")
        with (
            patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=False)),
            patch("services.dataset_service.redis_client") as mock_redis,
            patch("services.dataset_service.db") as mock_db,
            patch("services.dataset_service.DatasetProcessRule") as process_rule_cls,
            patch.object(DocumentService, "get_documents_position", return_value=3),
            patch.object(DocumentService, "build_document", return_value=created_document),
            patch("services.dataset_service.DocumentIndexingTaskProxy") as document_proxy_cls,
            patch("services.dataset_service.time.strftime", return_value="20260101010101"),
            patch("services.dataset_service.secrets.randbelow", return_value=23),
        ):
            mock_redis.lock.return_value = _make_lock_context()
            process_rule_cls.return_value = created_process_rule
            # First query result: the requested upload file exists.
            upload_query = MagicMock()
            upload_query.where.return_value.all.return_value = [SimpleNamespace(id="file-1", name="file.txt")]
            # Second query result: the dataset holds no documents yet.
            existing_documents_query = MagicMock()
            existing_documents_query.where.return_value.all.return_value = []
            mock_db.session.query.side_effect = [upload_query, existing_documents_query]
            documents, batch = DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account_context)
            assert documents == [created_document]
            # Frozen strftime prefix; the "100023" suffix presumably derives from
            # randbelow() == 23 — confirm against the service's batch-id format.
            assert batch == "20260101010101100023"
            assert process_rule_cls.call_args.kwargs == {
                "dataset_id": "dataset-1",
                "mode": "custom",
                "rules": knowledge_config.process_rule.rules.model_dump_json(),
                "created_by": "user-1",
            }
            # Indexing is dispatched exactly once for the newly created document.
            document_proxy_cls.assert_called_once_with("tenant-1", "dataset-1", ["doc-created"])
            document_proxy_cls.return_value.delay.assert_called_once()
    def test_save_document_with_dataset_id_creates_automatic_process_rule_for_new_upload_document(
        self, account_context
    ):
        """Automatic mode persists a DatasetProcessRule carrying the canned AUTOMATIC_RULES JSON."""
        dataset = _make_dataset()
        knowledge_config = _make_upload_knowledge_config(
            file_ids=["file-1"],
            process_rule=ProcessRule(mode="automatic"),
        )
        created_process_rule = SimpleNamespace(id="rule-auto")
        created_document = _make_document(document_id="doc-created", name="file.txt")
        with (
            patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=False)),
            patch("services.dataset_service.redis_client") as mock_redis,
            patch("services.dataset_service.db") as mock_db,
            patch("services.dataset_service.DatasetProcessRule") as process_rule_cls,
            patch.object(DocumentService, "get_documents_position", return_value=1),
            patch.object(DocumentService, "build_document", return_value=created_document),
            patch("services.dataset_service.DocumentIndexingTaskProxy"),
            patch("services.dataset_service.time.strftime", return_value="20260101010101"),
            patch("services.dataset_service.secrets.randbelow", return_value=23),
        ):
            mock_redis.lock.return_value = _make_lock_context()
            # The class is mocked, so restore the real AUTOMATIC_RULES constant on it.
            process_rule_cls.AUTOMATIC_RULES = DatasetProcessRule.AUTOMATIC_RULES
            process_rule_cls.return_value = created_process_rule
            # First query result: the requested upload file exists.
            upload_query = MagicMock()
            upload_query.where.return_value.all.return_value = [SimpleNamespace(id="file-1", name="file.txt")]
            # Second query result: the dataset holds no documents yet.
            existing_documents_query = MagicMock()
            existing_documents_query.where.return_value.all.return_value = []
            mock_db.session.query.side_effect = [upload_query, existing_documents_query]
            DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account_context)
            assert process_rule_cls.call_args.kwargs == {
                "dataset_id": "dataset-1",
                "mode": "automatic",
                "rules": json.dumps(DatasetProcessRule.AUTOMATIC_RULES),
                "created_by": "user-1",
            }
            # At least two flushes: one for the process rule, one for the document(s).
            # NOTE(review): exact flush points live in the service — confirm if this tightens.
            assert mock_db.session.flush.call_count >= 2
    def test_save_document_with_dataset_id_creates_fallback_automatic_process_rule_when_latest_is_missing(
        self, account_context
    ):
        """With no process_rule in the config and no latest rule on the dataset, an automatic rule is created."""
        # latest_process_rule=None forces the fallback path in the service.
        dataset = _make_dataset(latest_process_rule=None)
        knowledge_config = _make_upload_knowledge_config(file_ids=["file-1"], process_rule=None)
        created_process_rule = SimpleNamespace(id="rule-fallback")
        created_document = _make_document(document_id="doc-created", name="file.txt")
        with (
            patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=False)),
            patch("services.dataset_service.redis_client") as mock_redis,
            patch("services.dataset_service.db") as mock_db,
            patch("services.dataset_service.DatasetProcessRule") as process_rule_cls,
            patch.object(DocumentService, "get_documents_position", return_value=1),
            patch.object(DocumentService, "build_document", return_value=created_document),
            patch("services.dataset_service.DocumentIndexingTaskProxy"),
            patch("services.dataset_service.time.strftime", return_value="20260101010101"),
            patch("services.dataset_service.secrets.randbelow", return_value=23),
        ):
            mock_redis.lock.return_value = _make_lock_context()
            # The class is mocked, so restore the real AUTOMATIC_RULES constant on it.
            process_rule_cls.AUTOMATIC_RULES = DatasetProcessRule.AUTOMATIC_RULES
            process_rule_cls.return_value = created_process_rule
            # First query result: the requested upload file exists.
            upload_query = MagicMock()
            upload_query.where.return_value.all.return_value = [SimpleNamespace(id="file-1", name="file.txt")]
            # Second query result: the dataset holds no documents yet.
            existing_documents_query = MagicMock()
            existing_documents_query.where.return_value.all.return_value = []
            mock_db.session.query.side_effect = [upload_query, existing_documents_query]
            DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account_context)
            assert process_rule_cls.call_args.kwargs == {
                "dataset_id": "dataset-1",
                "mode": "automatic",
                "rules": json.dumps(DatasetProcessRule.AUTOMATIC_RULES),
                "created_by": "user-1",
            }
def test_save_document_with_dataset_id_raises_when_upload_file_lookup_is_incomplete(self, account_context):
dataset = _make_dataset()
knowledge_config = _make_upload_knowledge_config(file_ids=["file-1", "file-2"])
with (
patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=False)),
patch("services.dataset_service.redis_client") as mock_redis,
patch("services.dataset_service.db") as mock_db,
patch.object(DocumentService, "get_documents_position", return_value=1),
patch("services.dataset_service.time.strftime", return_value="20260101010101"),
patch("services.dataset_service.secrets.randbelow", return_value=23),
):
mock_redis.lock.return_value = _make_lock_context()
upload_query = MagicMock()
upload_query.where.return_value.all.return_value = [SimpleNamespace(id="file-1", name="file.txt")]
mock_db.session.query.return_value = upload_query
with pytest.raises(FileNotExistsError, match="One or more files not found"):
DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account_context)
def test_save_document_with_dataset_id_requires_notion_info_list_for_notion_import(self, account_context):
dataset = _make_dataset()
knowledge_config = KnowledgeConfig(
indexing_technique="economy",
data_source=DataSource(info_list=InfoList(data_source_type="notion_import")),
doc_form=IndexStructureType.PARAGRAPH_INDEX,
doc_language="English",
)
with (
patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=False)),
patch("services.dataset_service.redis_client") as mock_redis,
patch.object(DocumentService, "get_documents_position", return_value=1),
):
mock_redis.lock.return_value = _make_lock_context()
with pytest.raises(ValueError, match="No notion info list found"):
DocumentService.save_document_with_dataset_id(
dataset,
knowledge_config,
account_context,
dataset_process_rule=SimpleNamespace(id="rule-1"),
)
def test_save_document_with_dataset_id_requires_website_info_list_for_website_crawl(self, account_context):
dataset = _make_dataset()
knowledge_config = KnowledgeConfig(
indexing_technique="economy",
data_source=DataSource(info_list=InfoList(data_source_type="website_crawl")),
doc_form=IndexStructureType.PARAGRAPH_INDEX,
doc_language="English",
)
with (
patch("services.dataset_service.FeatureService.get_features", return_value=_make_features(enabled=False)),
patch("services.dataset_service.redis_client") as mock_redis,
patch.object(DocumentService, "get_documents_position", return_value=1),
):
mock_redis.lock.return_value = _make_lock_context()
with pytest.raises(ValueError, match="No website info list found"):
DocumentService.save_document_with_dataset_id(
dataset,
knowledge_config,
account_context,
dataset_process_rule=SimpleNamespace(id="rule-1"),
)