Merge branch 'main' into feat/mcp

Author: Novice
Date: 2025-06-23 15:01:18 +08:00
56 changed files with 2259 additions and 597 deletions

View File

@@ -56,8 +56,7 @@ class InsertExploreAppListApi(Resource):
         parser.add_argument("position", type=int, required=True, nullable=False, location="json")
         args = parser.parse_args()
-        with Session(db.engine) as session:
-            app = session.execute(select(App).filter(App.id == args["app_id"])).scalar_one_or_none()
+        app = db.session.execute(select(App).filter(App.id == args["app_id"])).scalar_one_or_none()
         if not app:
             raise NotFound(f"App '{args['app_id']}' is not found")
@@ -78,38 +77,38 @@ class InsertExploreAppListApi(Resource):
             select(RecommendedApp).filter(RecommendedApp.app_id == args["app_id"])
         ).scalar_one_or_none()
-            if not recommended_app:
-                recommended_app = RecommendedApp(
-                    app_id=app.id,
-                    description=desc,
-                    copyright=copy_right,
-                    privacy_policy=privacy_policy,
-                    custom_disclaimer=custom_disclaimer,
-                    language=args["language"],
-                    category=args["category"],
-                    position=args["position"],
-                )
-                db.session.add(recommended_app)
-                app.is_public = True
-                db.session.commit()
-                return {"result": "success"}, 201
-            else:
-                recommended_app.description = desc
-                recommended_app.copyright = copy_right
-                recommended_app.privacy_policy = privacy_policy
-                recommended_app.custom_disclaimer = custom_disclaimer
-                recommended_app.language = args["language"]
-                recommended_app.category = args["category"]
-                recommended_app.position = args["position"]
-                app.is_public = True
-                db.session.commit()
-                return {"result": "success"}, 200
+        if not recommended_app:
+            recommended_app = RecommendedApp(
+                app_id=app.id,
+                description=desc,
+                copyright=copy_right,
+                privacy_policy=privacy_policy,
+                custom_disclaimer=custom_disclaimer,
+                language=args["language"],
+                category=args["category"],
+                position=args["position"],
+            )
+            db.session.add(recommended_app)
+            app.is_public = True
+            db.session.commit()
+            return {"result": "success"}, 201
+        else:
+            recommended_app.description = desc
+            recommended_app.copyright = copy_right
+            recommended_app.privacy_policy = privacy_policy
+            recommended_app.custom_disclaimer = custom_disclaimer
+            recommended_app.language = args["language"]
+            recommended_app.category = args["category"]
+            recommended_app.position = args["position"]
+            app.is_public = True
+            db.session.commit()
+            return {"result": "success"}, 200


 class InsertExploreAppApi(Resource):

View File

@@ -43,7 +43,6 @@ from core.model_runtime.errors.invoke import InvokeAuthorizationError
 from core.plugin.impl.exc import PluginDaemonClientSideError
 from core.rag.extractor.entity.extract_setting import ExtractSetting
 from extensions.ext_database import db
-from extensions.ext_redis import redis_client
 from fields.document_fields import (
     dataset_and_document_fields,
     document_fields,
@@ -54,8 +53,6 @@ from libs.login import login_required
 from models import Dataset, DatasetProcessRule, Document, DocumentSegment, UploadFile
 from services.dataset_service import DatasetService, DocumentService
 from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig
-from tasks.add_document_to_index_task import add_document_to_index_task
-from tasks.remove_document_from_index_task import remove_document_from_index_task


 class DocumentResource(Resource):
@@ -862,77 +859,16 @@ class DocumentStatusApi(DocumentResource):
         DatasetService.check_dataset_permission(dataset, current_user)

         document_ids = request.args.getlist("document_id")
-        for document_id in document_ids:
-            document = self.get_document(dataset_id, document_id)
-
-            indexing_cache_key = "document_{}_indexing".format(document.id)
-            cache_result = redis_client.get(indexing_cache_key)
-            if cache_result is not None:
-                raise InvalidActionError(f"Document:{document.name} is being indexed, please try again later")
-
-            if action == "enable":
-                if document.enabled:
-                    continue
-                document.enabled = True
-                document.disabled_at = None
-                document.disabled_by = None
-                document.updated_at = datetime.now(UTC).replace(tzinfo=None)
-                db.session.commit()
-
-                # Set cache to prevent indexing the same document multiple times
-                redis_client.setex(indexing_cache_key, 600, 1)
-
-                add_document_to_index_task.delay(document_id)
-            elif action == "disable":
-                if not document.completed_at or document.indexing_status != "completed":
-                    raise InvalidActionError(f"Document: {document.name} is not completed.")
-                if not document.enabled:
-                    continue
-
-                document.enabled = False
-                document.disabled_at = datetime.now(UTC).replace(tzinfo=None)
-                document.disabled_by = current_user.id
-                document.updated_at = datetime.now(UTC).replace(tzinfo=None)
-                db.session.commit()
-
-                # Set cache to prevent indexing the same document multiple times
-                redis_client.setex(indexing_cache_key, 600, 1)
-
-                remove_document_from_index_task.delay(document_id)
-            elif action == "archive":
-                if document.archived:
-                    continue
-                document.archived = True
-                document.archived_at = datetime.now(UTC).replace(tzinfo=None)
-                document.archived_by = current_user.id
-                document.updated_at = datetime.now(UTC).replace(tzinfo=None)
-                db.session.commit()
-
-                if document.enabled:
-                    # Set cache to prevent indexing the same document multiple times
-                    redis_client.setex(indexing_cache_key, 600, 1)
-
-                    remove_document_from_index_task.delay(document_id)
-            elif action == "un_archive":
-                if not document.archived:
-                    continue
-                document.archived = False
-                document.archived_at = None
-                document.archived_by = None
-                document.updated_at = datetime.now(UTC).replace(tzinfo=None)
-                db.session.commit()
-
-                # Set cache to prevent indexing the same document multiple times
-                redis_client.setex(indexing_cache_key, 600, 1)
-
-                add_document_to_index_task.delay(document_id)
-            else:
-                raise InvalidActionError()
+        try:
+            DocumentService.batch_update_document_status(dataset, document_ids, action, current_user)
+        except services.errors.document.DocumentIndexingError as e:
+            raise InvalidActionError(str(e))
+        except ValueError as e:
+            raise InvalidActionError(str(e))
+        except NotFound as e:
+            raise NotFound(str(e))

         return {"result": "success"}, 200

View File

@@ -4,7 +4,7 @@ from werkzeug.exceptions import Forbidden, NotFound
 import services.dataset_service
 from controllers.service_api import api
-from controllers.service_api.dataset.error import DatasetInUseError, DatasetNameDuplicateError
+from controllers.service_api.dataset.error import DatasetInUseError, DatasetNameDuplicateError, InvalidActionError
 from controllers.service_api.wraps import (
     DatasetApiResource,
     cloud_edition_billing_rate_limit_check,
@@ -17,7 +17,7 @@ from fields.dataset_fields import dataset_detail_fields
 from fields.tag_fields import tag_fields
 from libs.login import current_user
 from models.dataset import Dataset, DatasetPermissionEnum
-from services.dataset_service import DatasetPermissionService, DatasetService
+from services.dataset_service import DatasetPermissionService, DatasetService, DocumentService
 from services.entities.knowledge_entities.knowledge_entities import RetrievalModel
 from services.tag_service import TagService
@@ -329,6 +329,56 @@ class DatasetApi(DatasetApiResource):
         raise DatasetInUseError()


+class DocumentStatusApi(DatasetApiResource):
+    """Resource for batch document status operations."""
+
+    def patch(self, tenant_id, dataset_id, action):
+        """
+        Batch update document status.
+
+        Args:
+            tenant_id: tenant id
+            dataset_id: dataset id
+            action: action to perform (enable, disable, archive, un_archive)
+
+        Returns:
+            dict: A dictionary with a key 'result' and a value 'success'
+            int: HTTP status code 200 indicating that the operation was successful.
+
+        Raises:
+            NotFound: If the dataset with the given ID does not exist.
+            Forbidden: If the user does not have permission.
+            InvalidActionError: If the action is invalid or cannot be performed.
+        """
+        dataset_id_str = str(dataset_id)
+        dataset = DatasetService.get_dataset(dataset_id_str)
+        if dataset is None:
+            raise NotFound("Dataset not found.")
+
+        # Check user's permission
+        try:
+            DatasetService.check_dataset_permission(dataset, current_user)
+        except services.errors.account.NoPermissionError as e:
+            raise Forbidden(str(e))
+
+        # Check dataset model setting
+        DatasetService.check_dataset_model_setting(dataset)
+
+        # Get document IDs from request body
+        data = request.get_json()
+        document_ids = data.get("document_ids", [])
+
+        try:
+            DocumentService.batch_update_document_status(dataset, document_ids, action, current_user)
+        except services.errors.document.DocumentIndexingError as e:
+            raise InvalidActionError(str(e))
+        except ValueError as e:
+            raise InvalidActionError(str(e))
+
+        return {"result": "success"}, 200
+
+
 class DatasetTagsApi(DatasetApiResource):
     @validate_dataset_token
     @marshal_with(tag_fields)
@@ -457,6 +507,7 @@ class DatasetTagsBindingStatusApi(DatasetApiResource):

 api.add_resource(DatasetListApi, "/datasets")
 api.add_resource(DatasetApi, "/datasets/<uuid:dataset_id>")
+api.add_resource(DocumentStatusApi, "/datasets/<uuid:dataset_id>/documents/status/<string:action>")
 api.add_resource(DatasetTagsApi, "/datasets/tags")
 api.add_resource(DatasetTagBindingApi, "/datasets/tags/binding")
 api.add_resource(DatasetTagUnbindingApi, "/datasets/tags/unbinding")
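
The hunk above registers the new batch-status route. A minimal sketch of calling it, assuming a running instance and a service-API key (the base URL, key, and IDs below are placeholders, not from this commit):

import requests

BASE_URL = "http://localhost:5001/v1"  # assumed local service-API address
API_KEY = "dataset-xxxxxx"             # hypothetical service-API key
DATASET_ID = "a1b2c3d4-0000-0000-0000-000000000000"  # hypothetical dataset ID

# PATCH /datasets/<uuid:dataset_id>/documents/status/<string:action>
# where action is one of: enable, disable, archive, un_archive
resp = requests.patch(
    f"{BASE_URL}/datasets/{DATASET_ID}/documents/status/disable",
    headers={"Authorization": f"Bearer {API_KEY}"},
    json={"document_ids": ["doc-id-1", "doc-id-2"]},  # read server-side via data.get("document_ids", [])
)
print(resp.status_code, resp.json())  # expect 200 and {"result": "success"}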

View File

@@ -68,22 +68,17 @@ class MarkdownExtractor(BaseExtractor):
                 continue
             header_match = re.match(r"^#+\s", line)
             if header_match:
-                if current_header is not None:
-                    markdown_tups.append((current_header, current_text))
+                markdown_tups.append((current_header, current_text))

                 current_header = line
                 current_text = ""
             else:
                 current_text += line + "\n"
         markdown_tups.append((current_header, current_text))

-        if current_header is not None:
-            # pass linting, assert keys are defined
-            markdown_tups = [
-                (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value)) for key, value in markdown_tups
-            ]
-        else:
-            markdown_tups = [(key, re.sub("\n", "", value)) for key, value in markdown_tups]
+        markdown_tups = [
+            (re.sub(r"#", "", cast(str, key)).strip() if key else None, re.sub(r"<.*?>", "", value))
+            for key, value in markdown_tups
+        ]

         return markdown_tups

View File

@@ -8,5 +8,4 @@ EMPTY_VALUE_MAPPING = {
     SegmentType.ARRAY_STRING: [],
     SegmentType.ARRAY_NUMBER: [],
     SegmentType.ARRAY_OBJECT: [],
-    SegmentType.ARRAY_FILE: [],
 }

View File

@@ -1,6 +1,5 @@
 from typing import Any

-from core.file import File
 from core.variables import SegmentType

 from .enums import Operation
@@ -86,8 +85,6 @@ def is_input_value_valid(*, variable_type: SegmentType, operation: Operation, va
             return isinstance(value, int | float)
         case SegmentType.ARRAY_OBJECT if operation == Operation.APPEND:
             return isinstance(value, dict)
-        case SegmentType.ARRAY_FILE if operation == Operation.APPEND:
-            return isinstance(value, File)

         # Array & Extend / Overwrite
         case SegmentType.ARRAY_ANY if operation in {Operation.EXTEND, Operation.OVER_WRITE}:
@@ -98,8 +95,6 @@ def is_input_value_valid(*, variable_type: SegmentType, operation: Operation, va
             return isinstance(value, list) and all(isinstance(item, int | float) for item in value)
         case SegmentType.ARRAY_OBJECT if operation in {Operation.EXTEND, Operation.OVER_WRITE}:
             return isinstance(value, list) and all(isinstance(item, dict) for item in value)
-        case SegmentType.ARRAY_FILE if operation in {Operation.EXTEND, Operation.OVER_WRITE}:
-            return isinstance(value, list) and all(isinstance(item, File) for item in value)

         case _:
             return False

View File

@@ -101,8 +101,6 @@ def _build_variable_from_mapping(*, mapping: Mapping[str, Any], selector: Sequen
             result = ArrayNumberVariable.model_validate(mapping)
         case SegmentType.ARRAY_OBJECT if isinstance(value, list):
             result = ArrayObjectVariable.model_validate(mapping)
-        case SegmentType.ARRAY_FILE if isinstance(value, list):
-            result = ArrayFileVariable.model_validate(mapping)
         case _:
             raise VariableError(f"not supported value type {value_type}")
     if result.size > dify_config.MAX_VARIABLE_SIZE:

View File

@@ -59,6 +59,7 @@ from services.external_knowledge_service import ExternalDatasetService
 from services.feature_service import FeatureModel, FeatureService
 from services.tag_service import TagService
 from services.vector_service import VectorService
+from tasks.add_document_to_index_task import add_document_to_index_task
 from tasks.batch_clean_document_task import batch_clean_document_task
 from tasks.clean_notion_document_task import clean_notion_document_task
 from tasks.deal_dataset_vector_index_task import deal_dataset_vector_index_task
@@ -70,6 +71,7 @@ from tasks.document_indexing_update_task import document_indexing_update_task
 from tasks.duplicate_document_indexing_task import duplicate_document_indexing_task
 from tasks.enable_segments_to_index_task import enable_segments_to_index_task
 from tasks.recover_document_indexing_task import recover_document_indexing_task
+from tasks.remove_document_from_index_task import remove_document_from_index_task
 from tasks.retry_document_indexing_task import retry_document_indexing_task
 from tasks.sync_website_document_indexing_task import sync_website_document_indexing_task
@@ -434,7 +436,7 @@ class DatasetService:
             raise ValueError(ex.description)

         filtered_data["updated_by"] = user.id
-        filtered_data["updated_at"] = datetime.datetime.now()
+        filtered_data["updated_at"] = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)

         # update Retrieval model
         filtered_data["retrieval_model"] = data["retrieval_model"]
@@ -976,12 +978,17 @@ class DocumentService:
         process_rule = knowledge_config.process_rule
         if process_rule:
             if process_rule.mode in ("custom", "hierarchical"):
-                dataset_process_rule = DatasetProcessRule(
-                    dataset_id=dataset.id,
-                    mode=process_rule.mode,
-                    rules=process_rule.rules.model_dump_json() if process_rule.rules else None,
-                    created_by=account.id,
-                )
+                if process_rule.rules:
+                    dataset_process_rule = DatasetProcessRule(
+                        dataset_id=dataset.id,
+                        mode=process_rule.mode,
+                        rules=process_rule.rules.model_dump_json() if process_rule.rules else None,
+                        created_by=account.id,
+                    )
+                else:
+                    dataset_process_rule = dataset.latest_process_rule
+                    if not dataset_process_rule:
+                        raise ValueError("No process rule found.")
             elif process_rule.mode == "automatic":
                 dataset_process_rule = DatasetProcessRule(
                     dataset_id=dataset.id,
@@ -1603,6 +1610,191 @@ class DocumentService:
         if not isinstance(args["process_rule"]["rules"]["segmentation"]["max_tokens"], int):
             raise ValueError("Process rule segmentation max_tokens is invalid")

+    @staticmethod
+    def batch_update_document_status(dataset: Dataset, document_ids: list[str], action: str, user):
+        """
+        Batch update document status.
+
+        Args:
+            dataset (Dataset): The dataset object
+            document_ids (list[str]): List of document IDs to update
+            action (str): Action to perform (enable, disable, archive, un_archive)
+            user: Current user performing the action
+
+        Raises:
+            DocumentIndexingError: If document is being indexed or not in correct state
+            ValueError: If action is invalid
+        """
+        if not document_ids:
+            return
+
+        # Early validation of action parameter
+        valid_actions = ["enable", "disable", "archive", "un_archive"]
+        if action not in valid_actions:
+            raise ValueError(f"Invalid action: {action}. Must be one of {valid_actions}")
+
+        documents_to_update = []
+
+        # First pass: validate all documents and prepare updates
+        for document_id in document_ids:
+            document = DocumentService.get_document(dataset.id, document_id)
+            if not document:
+                continue
+
+            # Check if document is being indexed
+            indexing_cache_key = f"document_{document.id}_indexing"
+            cache_result = redis_client.get(indexing_cache_key)
+            if cache_result is not None:
+                raise DocumentIndexingError(f"Document:{document.name} is being indexed, please try again later")
+
+            # Prepare update based on action
+            update_info = DocumentService._prepare_document_status_update(document, action, user)
+            if update_info:
+                documents_to_update.append(update_info)
+
+        # Second pass: apply all updates in a single transaction
+        if documents_to_update:
+            try:
+                for update_info in documents_to_update:
+                    document = update_info["document"]
+                    updates = update_info["updates"]
+
+                    # Apply updates to the document
+                    for field, value in updates.items():
+                        setattr(document, field, value)
+
+                    db.session.add(document)
+
+                # Batch commit all changes
+                db.session.commit()
+            except Exception as e:
+                # Rollback on any error
+                db.session.rollback()
+                raise e
+
+            # Execute async tasks and set Redis cache after successful commit
+            # propagation_error is used to capture any errors for submitting async task execution
+            propagation_error = None
+            for update_info in documents_to_update:
+                try:
+                    # Execute async tasks after successful commit
+                    if update_info["async_task"]:
+                        task_info = update_info["async_task"]
+                        task_func = task_info["function"]
+                        task_args = task_info["args"]
+                        task_func.delay(*task_args)
+                except Exception as e:
+                    # Log the error but do not rollback the transaction
+                    logging.exception(f"Error executing async task for document {update_info['document'].id}")
+                    # don't raise the error immediately, but capture it for later
+                    propagation_error = e
+
+                try:
+                    # Set Redis cache if needed after successful commit
+                    if update_info["set_cache"]:
+                        document = update_info["document"]
+                        indexing_cache_key = f"document_{document.id}_indexing"
+                        redis_client.setex(indexing_cache_key, 600, 1)
+                except Exception as e:
+                    # Log the error but do not rollback the transaction
+                    logging.exception(f"Error setting cache for document {update_info['document'].id}")
+
+            # Raise any propagation error after all updates
+            if propagation_error:
+                raise propagation_error
+
+    @staticmethod
+    def _prepare_document_status_update(document, action: str, user):
+        """
+        Prepare document status update information.
+
+        Args:
+            document: Document object to update
+            action: Action to perform
+            user: Current user
+
+        Returns:
+            dict: Update information or None if no update needed
+        """
+        now = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
+
+        if action == "enable":
+            return DocumentService._prepare_enable_update(document, now)
+        elif action == "disable":
+            return DocumentService._prepare_disable_update(document, user, now)
+        elif action == "archive":
+            return DocumentService._prepare_archive_update(document, user, now)
+        elif action == "un_archive":
+            return DocumentService._prepare_unarchive_update(document, now)
+
+        return None
+
+    @staticmethod
+    def _prepare_enable_update(document, now):
+        """Prepare updates for enabling a document."""
+        if document.enabled:
+            return None
+
+        return {
+            "document": document,
+            "updates": {"enabled": True, "disabled_at": None, "disabled_by": None, "updated_at": now},
+            "async_task": {"function": add_document_to_index_task, "args": [document.id]},
+            "set_cache": True,
+        }
+
+    @staticmethod
+    def _prepare_disable_update(document, user, now):
+        """Prepare updates for disabling a document."""
+        if not document.completed_at or document.indexing_status != "completed":
+            raise DocumentIndexingError(f"Document: {document.name} is not completed.")
+
+        if not document.enabled:
+            return None
+
+        return {
+            "document": document,
+            "updates": {"enabled": False, "disabled_at": now, "disabled_by": user.id, "updated_at": now},
+            "async_task": {"function": remove_document_from_index_task, "args": [document.id]},
+            "set_cache": True,
+        }
+
+    @staticmethod
+    def _prepare_archive_update(document, user, now):
+        """Prepare updates for archiving a document."""
+        if document.archived:
+            return None
+
+        update_info = {
+            "document": document,
+            "updates": {"archived": True, "archived_at": now, "archived_by": user.id, "updated_at": now},
+            "async_task": None,
+            "set_cache": False,
+        }
+
+        # Only set async task and cache if document is currently enabled
+        if document.enabled:
+            update_info["async_task"] = {"function": remove_document_from_index_task, "args": [document.id]}
+            update_info["set_cache"] = True
+
+        return update_info
+
+    @staticmethod
+    def _prepare_unarchive_update(document, now):
+        """Prepare updates for unarchiving a document."""
+        if not document.archived:
+            return None
+
+        update_info = {
+            "document": document,
+            "updates": {"archived": False, "archived_at": None, "archived_by": None, "updated_at": now},
+            "async_task": None,
+            "set_cache": False,
+        }
+
+        # Only re-index if the document is currently enabled
+        if document.enabled:
+            update_info["async_task"] = {"function": add_document_to_index_task, "args": [document.id]}
+            update_info["set_cache"] = True
+
+        return update_info
+

 class SegmentService:
     @classmethod
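
For context, a minimal sketch of driving the new service method directly (assumes a Flask app context; the dataset ID and user object below are placeholders, not from this commit):

from services.dataset_service import DatasetService, DocumentService

dataset = DatasetService.get_dataset("a1b2c3d4-0000-0000-0000-000000000000")  # hypothetical ID
user = ...  # an Account instance supplied by the caller's context

# First pass validates every document and raises DocumentIndexingError if any
# is mid-indexing; second pass commits all field updates in one transaction;
# only after the commit are index tasks dispatched and the 600s cache keys set.
DocumentService.batch_update_document_status(dataset, ["doc-1", "doc-2"], "archive", user)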

View File

@@ -22,7 +22,7 @@ class PluginDataMigration:
         cls.migrate_datasets()
         cls.migrate_db_records("embeddings", "provider_name", ModelProviderID)  # large table
         cls.migrate_db_records("dataset_collection_bindings", "provider_name", ModelProviderID)
-        cls.migrate_db_records("tool_builtin_providers", "provider_name", ToolProviderID)
+        cls.migrate_db_records("tool_builtin_providers", "provider", ToolProviderID)

     @classmethod
     def migrate_datasets(cls) -> None:

View File

@@ -1,4 +1,5 @@
 import os
+from unittest.mock import MagicMock, patch

 import pytest
 from flask import Flask
@@ -11,6 +12,24 @@ PROJECT_DIR = os.path.abspath(os.path.join(ABS_PATH, os.pardir, os.pardir))

 CACHED_APP = Flask(__name__)

+# set global mock for Redis client
+redis_mock = MagicMock()
+redis_mock.get = MagicMock(return_value=None)
+redis_mock.setex = MagicMock()
+redis_mock.setnx = MagicMock()
+redis_mock.delete = MagicMock()
+redis_mock.lock = MagicMock()
+redis_mock.exists = MagicMock(return_value=False)
+redis_mock.set = MagicMock()
+redis_mock.expire = MagicMock()
+redis_mock.hgetall = MagicMock(return_value={})
+redis_mock.hdel = MagicMock()
+redis_mock.incr = MagicMock(return_value=1)
+
+# apply the mock to the Redis client in the Flask app
+redis_patcher = patch("extensions.ext_redis.redis_client", redis_mock)
+redis_patcher.start()
+

 @pytest.fixture
 def app() -> Flask:
@@ -21,3 +40,19 @@ def app() -> Flask:
 def _provide_app_context(app: Flask):
     with app.app_context():
         yield
+
+
+@pytest.fixture(autouse=True)
+def reset_redis_mock():
+    """reset the Redis mock before each test"""
+    redis_mock.reset_mock()
+    redis_mock.get.return_value = None
+    redis_mock.setex.return_value = None
+    redis_mock.setnx.return_value = None
+    redis_mock.delete.return_value = None
+    redis_mock.exists.return_value = False
+    redis_mock.set.return_value = None
+    redis_mock.expire.return_value = None
+    redis_mock.hgetall.return_value = {}
+    redis_mock.hdel.return_value = None
+    redis_mock.incr.return_value = 1
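
Because the patcher starts at import time and the autouse fixture resets state between tests, any unit test sees the shared mock. An illustrative test (not part of the commit), assuming it runs under this conftest:

from extensions.ext_redis import redis_client  # resolves to redis_mock under the patcher

def test_sets_indexing_cache():
    # code under test would normally call setex; invoked directly here for illustration
    redis_client.setex("document_123_indexing", 600, 1)
    redis_client.setex.assert_called_once_with("document_123_indexing", 600, 1)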

View File

@@ -0,0 +1,22 @@
+from core.rag.extractor.markdown_extractor import MarkdownExtractor
+
+
+def test_markdown_to_tups():
+    markdown = """
+this is some text without header
+# title 1
+this is balabala text
+## title 2
+this is more specific text.
+    """
+    extractor = MarkdownExtractor(file_path="dummy_path")
+    updated_output = extractor.markdown_to_tups(markdown)
+    assert len(updated_output) == 3
+
+    key, header_value = updated_output[0]
+    assert key is None
+    assert header_value.strip() == "this is some text without header"
+
+    title_1, value = updated_output[1]
+    assert title_1.strip() == "title 1"
+    assert value.strip() == "this is balabala text"

File diff suppressed because it is too large.