Files
dify/api/tests/unit_tests/services/test_metadata_service.py

559 lines
20 KiB
Python

from __future__ import annotations
from dataclasses import dataclass
from datetime import UTC, datetime
from types import SimpleNamespace
from typing import Any, cast
from unittest.mock import MagicMock
import pytest
from pytest_mock import MockerFixture
from core.rag.index_processor.constant.built_in_field import BuiltInField, MetadataDataSource
from models.dataset import Dataset
from services.entities.knowledge_entities.knowledge_entities import (
DocumentMetadataOperation,
MetadataArgs,
MetadataDetail,
MetadataOperationData,
)
from services.metadata_service import MetadataService
@dataclass
class _DocumentStub:
id: str
name: str
uploader: str
upload_date: datetime
last_update_date: datetime
data_source_type: str
doc_metadata: dict[str, object] | None
@pytest.fixture
def mock_db(mocker: MockerFixture) -> MagicMock:
mocked_db = mocker.patch("services.metadata_service.db")
mocked_db.session = MagicMock()
return mocked_db
@pytest.fixture
def mock_redis_client(mocker: MockerFixture) -> MagicMock:
return mocker.patch("services.metadata_service.redis_client")
@pytest.fixture
def mock_current_account(mocker: MockerFixture) -> MagicMock:
mock_user = SimpleNamespace(id="user-1")
return mocker.patch("services.metadata_service.current_account_with_tenant", return_value=(mock_user, "tenant-1"))
def _build_document(document_id: str, doc_metadata: dict[str, object] | None = None) -> _DocumentStub:
now = datetime(2025, 1, 1, 10, 30, tzinfo=UTC)
return _DocumentStub(
id=document_id,
name=f"doc-{document_id}",
uploader="qa@example.com",
upload_date=now,
last_update_date=now,
data_source_type="upload_file",
doc_metadata=doc_metadata,
)
def _dataset(**kwargs: Any) -> Dataset:
return cast(Dataset, SimpleNamespace(**kwargs))
def test_create_metadata_should_raise_value_error_when_name_exceeds_limit() -> None:
# Arrange
metadata_args = MetadataArgs(type="string", name="x" * 256)
# Act + Assert
with pytest.raises(ValueError, match="cannot exceed 255"):
MetadataService.create_metadata("dataset-1", metadata_args)
def test_create_metadata_should_raise_value_error_when_metadata_name_already_exists(
mock_db: MagicMock,
mock_current_account: MagicMock,
) -> None:
# Arrange
metadata_args = MetadataArgs(type="string", name="priority")
mock_db.session.query.return_value.filter_by.return_value.first.return_value = object()
# Act + Assert
with pytest.raises(ValueError, match="already exists"):
MetadataService.create_metadata("dataset-1", metadata_args)
# Assert
mock_current_account.assert_called_once()
def test_create_metadata_should_raise_value_error_when_name_collides_with_builtin(
mock_db: MagicMock, mock_current_account: MagicMock
) -> None:
# Arrange
metadata_args = MetadataArgs(type="string", name=BuiltInField.document_name)
mock_db.session.query.return_value.filter_by.return_value.first.return_value = None
# Act + Assert
with pytest.raises(ValueError, match="Built-in fields"):
MetadataService.create_metadata("dataset-1", metadata_args)
def test_create_metadata_should_persist_metadata_when_input_is_valid(
mock_db: MagicMock, mock_current_account: MagicMock
) -> None:
# Arrange
metadata_args = MetadataArgs(type="number", name="score")
mock_db.session.query.return_value.filter_by.return_value.first.return_value = None
# Act
result = MetadataService.create_metadata("dataset-1", metadata_args)
# Assert
assert result.tenant_id == "tenant-1"
assert result.dataset_id == "dataset-1"
assert result.type == "number"
assert result.name == "score"
assert result.created_by == "user-1"
mock_db.session.add.assert_called_once_with(result)
mock_db.session.commit.assert_called_once()
mock_current_account.assert_called_once()
def test_update_metadata_name_should_raise_value_error_when_name_exceeds_limit() -> None:
# Arrange
too_long_name = "x" * 256
# Act + Assert
with pytest.raises(ValueError, match="cannot exceed 255"):
MetadataService.update_metadata_name("dataset-1", "metadata-1", too_long_name)
def test_update_metadata_name_should_raise_value_error_when_duplicate_name_exists(
mock_db: MagicMock, mock_current_account: MagicMock
) -> None:
# Arrange
mock_db.session.query.return_value.filter_by.return_value.first.return_value = object()
# Act + Assert
with pytest.raises(ValueError, match="already exists"):
MetadataService.update_metadata_name("dataset-1", "metadata-1", "duplicate")
# Assert
mock_current_account.assert_called_once()
def test_update_metadata_name_should_raise_value_error_when_name_collides_with_builtin(
mock_db: MagicMock,
mock_current_account: MagicMock,
) -> None:
# Arrange
mock_db.session.query.return_value.filter_by.return_value.first.return_value = None
# Act + Assert
with pytest.raises(ValueError, match="Built-in fields"):
MetadataService.update_metadata_name("dataset-1", "metadata-1", BuiltInField.source)
# Assert
mock_current_account.assert_called_once()
def test_update_metadata_name_should_update_bound_documents_and_return_metadata(
mock_db: MagicMock,
mock_redis_client: MagicMock,
mock_current_account: MagicMock,
mocker: MockerFixture,
) -> None:
# Arrange
mock_redis_client.get.return_value = None
fixed_now = datetime(2025, 2, 1, 0, 0, tzinfo=UTC)
mocker.patch("services.metadata_service.naive_utc_now", return_value=fixed_now)
metadata = SimpleNamespace(id="metadata-1", name="old_name", updated_by=None, updated_at=None)
bindings = [SimpleNamespace(document_id="doc-1"), SimpleNamespace(document_id="doc-2")]
query_duplicate = MagicMock()
query_duplicate.filter_by.return_value.first.return_value = None
query_metadata = MagicMock()
query_metadata.filter_by.return_value.first.return_value = metadata
query_bindings = MagicMock()
query_bindings.filter_by.return_value.all.return_value = bindings
mock_db.session.query.side_effect = [query_duplicate, query_metadata, query_bindings]
doc_1 = _build_document("1", {"old_name": "value", "other": "keep"})
doc_2 = _build_document("2", None)
mock_get_documents = mocker.patch("services.metadata_service.DocumentService.get_document_by_ids")
mock_get_documents.return_value = [doc_1, doc_2]
# Act
result = MetadataService.update_metadata_name("dataset-1", "metadata-1", "new_name")
# Assert
assert result is metadata
assert metadata.name == "new_name"
assert metadata.updated_by == "user-1"
assert metadata.updated_at == fixed_now
assert doc_1.doc_metadata == {"other": "keep", "new_name": "value"}
assert doc_2.doc_metadata == {"new_name": None}
mock_get_documents.assert_called_once_with(["doc-1", "doc-2"])
mock_db.session.commit.assert_called_once()
mock_redis_client.delete.assert_called_once_with("dataset_metadata_lock_dataset-1")
mock_current_account.assert_called_once()
def test_update_metadata_name_should_return_none_when_metadata_does_not_exist(
mock_db: MagicMock,
mock_redis_client: MagicMock,
mock_current_account: MagicMock,
mocker: MockerFixture,
) -> None:
# Arrange
mock_redis_client.get.return_value = None
mock_logger = mocker.patch("services.metadata_service.logger")
query_duplicate = MagicMock()
query_duplicate.filter_by.return_value.first.return_value = None
query_metadata = MagicMock()
query_metadata.filter_by.return_value.first.return_value = None
mock_db.session.query.side_effect = [query_duplicate, query_metadata]
# Act
result = MetadataService.update_metadata_name("dataset-1", "missing-id", "new_name")
# Assert
assert result is None
mock_logger.exception.assert_called_once()
mock_redis_client.delete.assert_called_once_with("dataset_metadata_lock_dataset-1")
mock_current_account.assert_called_once()
def test_delete_metadata_should_remove_metadata_and_related_document_fields(
mock_db: MagicMock,
mock_redis_client: MagicMock,
mocker: MockerFixture,
) -> None:
# Arrange
mock_redis_client.get.return_value = None
metadata = SimpleNamespace(id="metadata-1", name="obsolete")
bindings = [SimpleNamespace(document_id="doc-1")]
query_metadata = MagicMock()
query_metadata.filter_by.return_value.first.return_value = metadata
query_bindings = MagicMock()
query_bindings.filter_by.return_value.all.return_value = bindings
mock_db.session.query.side_effect = [query_metadata, query_bindings]
document = _build_document("1", {"obsolete": "legacy", "remaining": "value"})
mocker.patch("services.metadata_service.DocumentService.get_document_by_ids", return_value=[document])
# Act
result = MetadataService.delete_metadata("dataset-1", "metadata-1")
# Assert
assert result is metadata
assert document.doc_metadata == {"remaining": "value"}
mock_db.session.delete.assert_called_once_with(metadata)
mock_db.session.commit.assert_called_once()
mock_redis_client.delete.assert_called_once_with("dataset_metadata_lock_dataset-1")
def test_delete_metadata_should_return_none_when_metadata_is_missing(
mock_db: MagicMock,
mock_redis_client: MagicMock,
mocker: MockerFixture,
) -> None:
# Arrange
mock_redis_client.get.return_value = None
mock_db.session.query.return_value.filter_by.return_value.first.return_value = None
mock_logger = mocker.patch("services.metadata_service.logger")
# Act
result = MetadataService.delete_metadata("dataset-1", "missing-id")
# Assert
assert result is None
mock_logger.exception.assert_called_once()
mock_redis_client.delete.assert_called_once_with("dataset_metadata_lock_dataset-1")
def test_get_built_in_fields_should_return_all_expected_fields() -> None:
# Arrange
expected_names = {
BuiltInField.document_name,
BuiltInField.uploader,
BuiltInField.upload_date,
BuiltInField.last_update_date,
BuiltInField.source,
}
# Act
result = MetadataService.get_built_in_fields()
# Assert
assert {item["name"] for item in result} == expected_names
assert [item["type"] for item in result] == ["string", "string", "time", "time", "string"]
def test_enable_built_in_field_should_return_immediately_when_already_enabled(
mock_db: MagicMock,
mocker: MockerFixture,
) -> None:
# Arrange
dataset = _dataset(id="dataset-1", built_in_field_enabled=True)
get_docs = mocker.patch("services.metadata_service.DocumentService.get_working_documents_by_dataset_id")
# Act
MetadataService.enable_built_in_field(dataset)
# Assert
get_docs.assert_not_called()
mock_db.session.commit.assert_not_called()
def test_enable_built_in_field_should_populate_documents_and_enable_flag(
mock_db: MagicMock,
mock_redis_client: MagicMock,
mocker: MockerFixture,
) -> None:
# Arrange
mock_redis_client.get.return_value = None
dataset = _dataset(id="dataset-1", built_in_field_enabled=False)
doc_1 = _build_document("1", {"custom": "value"})
doc_2 = _build_document("2", None)
mocker.patch(
"services.metadata_service.DocumentService.get_working_documents_by_dataset_id",
return_value=[doc_1, doc_2],
)
# Act
MetadataService.enable_built_in_field(dataset)
# Assert
assert dataset.built_in_field_enabled is True
assert doc_1.doc_metadata is not None
assert doc_1.doc_metadata[BuiltInField.document_name] == "doc-1"
assert doc_1.doc_metadata[BuiltInField.source] == MetadataDataSource.upload_file
assert doc_2.doc_metadata is not None
assert doc_2.doc_metadata[BuiltInField.uploader] == "qa@example.com"
mock_db.session.commit.assert_called_once()
mock_redis_client.delete.assert_called_once_with("dataset_metadata_lock_dataset-1")
def test_disable_built_in_field_should_return_immediately_when_already_disabled(
mock_db: MagicMock,
mocker: MockerFixture,
) -> None:
# Arrange
dataset = _dataset(id="dataset-1", built_in_field_enabled=False)
get_docs = mocker.patch("services.metadata_service.DocumentService.get_working_documents_by_dataset_id")
# Act
MetadataService.disable_built_in_field(dataset)
# Assert
get_docs.assert_not_called()
mock_db.session.commit.assert_not_called()
def test_disable_built_in_field_should_remove_builtin_keys_and_disable_flag(
mock_db: MagicMock,
mock_redis_client: MagicMock,
mocker: MockerFixture,
) -> None:
# Arrange
mock_redis_client.get.return_value = None
dataset = _dataset(id="dataset-1", built_in_field_enabled=True)
document = _build_document(
"1",
{
BuiltInField.document_name: "doc",
BuiltInField.uploader: "user",
BuiltInField.upload_date: 1.0,
BuiltInField.last_update_date: 2.0,
BuiltInField.source: MetadataDataSource.upload_file,
"custom": "keep",
},
)
mocker.patch(
"services.metadata_service.DocumentService.get_working_documents_by_dataset_id",
return_value=[document],
)
# Act
MetadataService.disable_built_in_field(dataset)
# Assert
assert dataset.built_in_field_enabled is False
assert document.doc_metadata == {"custom": "keep"}
mock_db.session.commit.assert_called_once()
mock_redis_client.delete.assert_called_once_with("dataset_metadata_lock_dataset-1")
def test_update_documents_metadata_should_replace_metadata_and_create_bindings_on_full_update(
mock_db: MagicMock,
mock_redis_client: MagicMock,
mock_current_account: MagicMock,
mocker: MockerFixture,
) -> None:
# Arrange
mock_redis_client.get.return_value = None
dataset = _dataset(id="dataset-1", built_in_field_enabled=False)
document = _build_document("1", {"legacy": "value"})
mocker.patch("services.metadata_service.DocumentService.get_document", return_value=document)
delete_chain = mock_db.session.query.return_value.filter_by.return_value
delete_chain.delete.return_value = 1
operation = DocumentMetadataOperation(
document_id="1",
metadata_list=[MetadataDetail(id="meta-1", name="priority", value="high")],
partial_update=False,
)
metadata_args = MetadataOperationData(operation_data=[operation])
# Act
MetadataService.update_documents_metadata(dataset, metadata_args)
# Assert
assert document.doc_metadata == {"priority": "high"}
delete_chain.delete.assert_called_once()
assert mock_db.session.commit.call_count == 1
mock_redis_client.delete.assert_called_once_with("document_metadata_lock_1")
mock_current_account.assert_called_once()
def test_update_documents_metadata_should_skip_existing_binding_and_preserve_existing_fields_on_partial_update(
mock_db: MagicMock,
mock_redis_client: MagicMock,
mock_current_account: MagicMock,
mocker: MockerFixture,
) -> None:
# Arrange
mock_redis_client.get.return_value = None
dataset = _dataset(id="dataset-1", built_in_field_enabled=True)
document = _build_document("1", {"existing": "value"})
mocker.patch("services.metadata_service.DocumentService.get_document", return_value=document)
mock_db.session.query.return_value.filter_by.return_value.first.return_value = object()
operation = DocumentMetadataOperation(
document_id="1",
metadata_list=[MetadataDetail(id="meta-1", name="new_key", value="new_value")],
partial_update=True,
)
metadata_args = MetadataOperationData(operation_data=[operation])
# Act
MetadataService.update_documents_metadata(dataset, metadata_args)
# Assert
assert document.doc_metadata is not None
assert document.doc_metadata["existing"] == "value"
assert document.doc_metadata["new_key"] == "new_value"
assert document.doc_metadata[BuiltInField.source] == MetadataDataSource.upload_file
assert mock_db.session.commit.call_count == 1
assert mock_db.session.add.call_count == 1
mock_redis_client.delete.assert_called_once_with("document_metadata_lock_1")
mock_current_account.assert_called_once()
def test_update_documents_metadata_should_raise_and_rollback_when_document_not_found(
mock_db: MagicMock,
mock_redis_client: MagicMock,
mocker: MockerFixture,
) -> None:
# Arrange
mock_redis_client.get.return_value = None
dataset = _dataset(id="dataset-1", built_in_field_enabled=False)
mocker.patch("services.metadata_service.DocumentService.get_document", return_value=None)
operation = DocumentMetadataOperation(document_id="404", metadata_list=[], partial_update=True)
metadata_args = MetadataOperationData(operation_data=[operation])
# Act + Assert
with pytest.raises(ValueError, match="Document not found"):
MetadataService.update_documents_metadata(dataset, metadata_args)
# Assert
mock_db.session.rollback.assert_called_once()
mock_redis_client.delete.assert_called_once_with("document_metadata_lock_404")
@pytest.mark.parametrize(
("dataset_id", "document_id", "expected_key"),
[
("dataset-1", None, "dataset_metadata_lock_dataset-1"),
(None, "doc-1", "document_metadata_lock_doc-1"),
],
)
def test_knowledge_base_metadata_lock_check_should_set_lock_when_not_already_locked(
dataset_id: str | None,
document_id: str | None,
expected_key: str,
mock_redis_client: MagicMock,
) -> None:
# Arrange
mock_redis_client.get.return_value = None
# Act
MetadataService.knowledge_base_metadata_lock_check(dataset_id, document_id)
# Assert
mock_redis_client.set.assert_called_once_with(expected_key, 1, ex=3600)
def test_knowledge_base_metadata_lock_check_should_raise_when_dataset_lock_exists(
mock_redis_client: MagicMock,
) -> None:
# Arrange
mock_redis_client.get.return_value = 1
# Act + Assert
with pytest.raises(ValueError, match="knowledge base metadata operation is running"):
MetadataService.knowledge_base_metadata_lock_check("dataset-1", None)
def test_knowledge_base_metadata_lock_check_should_raise_when_document_lock_exists(
mock_redis_client: MagicMock,
) -> None:
# Arrange
mock_redis_client.get.return_value = 1
# Act + Assert
with pytest.raises(ValueError, match="document metadata operation is running"):
MetadataService.knowledge_base_metadata_lock_check(None, "doc-1")
def test_get_dataset_metadatas_should_exclude_builtin_and_include_binding_counts(mock_db: MagicMock) -> None:
# Arrange
dataset = _dataset(
id="dataset-1",
built_in_field_enabled=True,
doc_metadata=[
{"id": "meta-1", "name": "priority", "type": "string"},
{"id": "built-in", "name": "ignored", "type": "string"},
{"id": "meta-2", "name": "score", "type": "number"},
],
)
count_chain = mock_db.session.query.return_value.filter_by.return_value
count_chain.count.side_effect = [3, 1]
# Act
result = MetadataService.get_dataset_metadatas(dataset)
# Assert
assert result["built_in_field_enabled"] is True
assert result["doc_metadata"] == [
{"id": "meta-1", "name": "priority", "type": "string", "count": 3},
{"id": "meta-2", "name": "score", "type": "number", "count": 1},
]
def test_get_dataset_metadatas_should_return_empty_list_when_no_metadata(mock_db: MagicMock) -> None:
# Arrange
dataset = _dataset(id="dataset-1", built_in_field_enabled=False, doc_metadata=None)
# Act
result = MetadataService.get_dataset_metadatas(dataset)
# Assert
assert result == {"doc_metadata": [], "built_in_field_enabled": False}
mock_db.session.query.assert_not_called()