Merge remote-tracking branch 'origin/main' into feat/support-agent-sandbox

2026-04-25 05:06:15 +08:00 · 2026-03-25 11:50:33 +08:00
parent 79e59d8957 d87263f7c3
commit 21bdb6da47
82 changed files with 1074 additions and 1135 deletions
--- a/api/controllers/console/datasets/datasets.py
+++ b/api/controllers/console/datasets/datasets.py
@ -3,7 +3,7 @@ from typing import Any, cast
 from flask import request
 from flask_restx import Resource, fields, marshal, marshal_with
 from pydantic import BaseModel, Field, field_validator
-from sqlalchemy import select
+from sqlalchemy import func, select
 from werkzeug.exceptions import Forbidden, NotFound

 import services
@ -29,6 +29,7 @@ from core.provider_manager import ProviderManager
 from core.rag.datasource.vdb.vector_type import VectorType
 from core.rag.extractor.entity.datasource_type import DatasourceType
 from core.rag.extractor.entity.extract_setting import ExtractSetting, NotionInfo, WebsiteInfo
+from core.rag.index_processor.constant.index_type import IndexTechniqueType
 from core.rag.retrieval.retrieval_methods import RetrievalMethod
 from dify_graph.model_runtime.entities.model_entities import ModelType
 from extensions.ext_database import db
@ -355,7 +356,7 @@ class DatasetListApi(Resource):

        for item in data:
            # convert embedding_model_provider to plugin standard format
-            if item["indexing_technique"] == "high_quality" and item["embedding_model_provider"]:
+            if item["indexing_technique"] == IndexTechniqueType.HIGH_QUALITY and item["embedding_model_provider"]:
                item["embedding_model_provider"] = str(ModelProviderID(item["embedding_model_provider"]))
                item_model = f"{item['embedding_model']}:{item['embedding_model_provider']}"
                if item_model in model_names:
@ -436,7 +437,7 @@ class DatasetApi(Resource):
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))
        data = cast(dict[str, Any], marshal(dataset, dataset_detail_fields))
-        if dataset.indexing_technique == "high_quality":
+        if dataset.indexing_technique == IndexTechniqueType.HIGH_QUALITY:
            if dataset.embedding_model_provider:
                provider_id = ModelProviderID(dataset.embedding_model_provider)
                data["embedding_model_provider"] = str(provider_id)
@ -454,7 +455,7 @@ class DatasetApi(Resource):
        for embedding_model in embedding_models:
            model_names.append(f"{embedding_model.model}:{embedding_model.provider.provider}")

-        if data["indexing_technique"] == "high_quality":
+        if data["indexing_technique"] == IndexTechniqueType.HIGH_QUALITY:
            item_model = f"{data['embedding_model']}:{data['embedding_model_provider']}"
            if item_model in model_names:
                data["embedding_available"] = True
@ -485,7 +486,7 @@ class DatasetApi(Resource):
        current_user, current_tenant_id = current_account_with_tenant()
        # check embedding model setting
        if (
-            payload.indexing_technique == "high_quality"
+            payload.indexing_technique == IndexTechniqueType.HIGH_QUALITY
            and payload.embedding_model_provider is not None
            and payload.embedding_model is not None
        ):
@ -738,20 +739,23 @@ class DatasetIndexingStatusApi(Resource):
        documents_status = []
        for document in documents:
            completed_segments = (
-                db.session.query(DocumentSegment)
-                .where(
-                    DocumentSegment.completed_at.isnot(None),
-                    DocumentSegment.document_id == str(document.id),
-                    DocumentSegment.status != SegmentStatus.RE_SEGMENT,
+                db.session.scalar(
+                    select(func.count(DocumentSegment.id)).where(
+                        DocumentSegment.completed_at.isnot(None),
+                        DocumentSegment.document_id == str(document.id),
+                        DocumentSegment.status != SegmentStatus.RE_SEGMENT,
+                    )
                )
-                .count()
+                or 0
            )
            total_segments = (
-                db.session.query(DocumentSegment)
-                .where(
-                    DocumentSegment.document_id == str(document.id), DocumentSegment.status != SegmentStatus.RE_SEGMENT
+                db.session.scalar(
+                    select(func.count(DocumentSegment.id)).where(
+                        DocumentSegment.document_id == str(document.id),
+                        DocumentSegment.status != SegmentStatus.RE_SEGMENT,
+                    )
                )
-                .count()
+                or 0
            )
            # Create a dictionary with document attributes and additional fields
            document_dict = {
@ -802,9 +806,12 @@ class DatasetApiKeyApi(Resource):
        _, current_tenant_id = current_account_with_tenant()

        current_key_count = (
-            db.session.query(ApiToken)
-            .where(ApiToken.type == self.resource_type, ApiToken.tenant_id == current_tenant_id)
-            .count()
+            db.session.scalar(
+                select(func.count(ApiToken.id)).where(
+                    ApiToken.type == self.resource_type, ApiToken.tenant_id == current_tenant_id
+                )
+            )
+            or 0
        )

        if current_key_count >= self.max_keys:
@ -839,14 +846,14 @@ class DatasetApiDeleteApi(Resource):
    def delete(self, api_key_id):
        _, current_tenant_id = current_account_with_tenant()
        api_key_id = str(api_key_id)
-        key = (
-            db.session.query(ApiToken)
+        key = db.session.scalar(
+            select(ApiToken)
            .where(
                ApiToken.tenant_id == current_tenant_id,
                ApiToken.type == self.resource_type,
                ApiToken.id == api_key_id,
            )
-            .first()
+            .limit(1)
        )

        if key is None:
@ -857,7 +864,7 @@ class DatasetApiDeleteApi(Resource):
        assert key is not None  # nosec - for type checker only
        ApiTokenCache.delete(key.token, key.type)

-        db.session.query(ApiToken).where(ApiToken.id == api_key_id).delete()
+        db.session.delete(key)
        db.session.commit()

        return {"result": "success"}, 204
--- a/api/controllers/console/datasets/datasets_document.py
+++ b/api/controllers/console/datasets/datasets_document.py
@ -27,6 +27,7 @@ from core.model_manager import ModelManager
 from core.plugin.impl.exc import PluginDaemonClientSideError
 from core.rag.extractor.entity.datasource_type import DatasourceType
 from core.rag.extractor.entity.extract_setting import ExtractSetting, NotionInfo, WebsiteInfo
+from core.rag.index_processor.constant.index_type import IndexTechniqueType
 from dify_graph.model_runtime.entities.model_entities import ModelType
 from dify_graph.model_runtime.errors.invoke import InvokeAuthorizationError
 from extensions.ext_database import db
@ -449,7 +450,7 @@ class DatasetInitApi(Resource):
            raise Forbidden()

        knowledge_config = KnowledgeConfig.model_validate(console_ns.payload or {})
-        if knowledge_config.indexing_technique == "high_quality":
+        if knowledge_config.indexing_technique == IndexTechniqueType.HIGH_QUALITY:
            if knowledge_config.embedding_model is None or knowledge_config.embedding_model_provider is None:
                raise ValueError("embedding model and embedding model provider are required for high quality indexing.")
            try:
@ -463,7 +464,7 @@ class DatasetInitApi(Resource):
                is_multimodal = DatasetService.check_is_multimodal_model(
                    current_tenant_id, knowledge_config.embedding_model_provider, knowledge_config.embedding_model
                )
-                knowledge_config.is_multimodal = is_multimodal
+                knowledge_config.is_multimodal = is_multimodal  # pyrefly: ignore[bad-assignment]
            except InvokeAuthorizationError:
                raise ProviderNotInitializeError(
                    "No Embedding Model available. Please configure a valid provider in the Settings -> Model Provider."
@ -1337,7 +1338,7 @@ class DocumentGenerateSummaryApi(Resource):
            raise BadRequest("document_list cannot be empty.")

        # Check if dataset configuration supports summary generation
-        if dataset.indexing_technique != "high_quality":
+        if dataset.indexing_technique != IndexTechniqueType.HIGH_QUALITY:
            raise ValueError(
                f"Summary generation is only available for 'high_quality' indexing technique. "
                f"Current indexing technique: {dataset.indexing_technique}"
--- a/api/controllers/console/datasets/datasets_segments.py
+++ b/api/controllers/console/datasets/datasets_segments.py
@ -26,6 +26,7 @@ from controllers.console.wraps import (
 )
 from core.errors.error import LLMBadRequestError, ProviderTokenNotInitError
 from core.model_manager import ModelManager
+from core.rag.index_processor.constant.index_type import IndexTechniqueType
 from dify_graph.model_runtime.entities.model_entities import ModelType
 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
@ -279,7 +280,7 @@ class DatasetDocumentSegmentApi(Resource):
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))
-        if dataset.indexing_technique == "high_quality":
+        if dataset.indexing_technique == IndexTechniqueType.HIGH_QUALITY:
            # check embedding model setting
            try:
                model_manager = ModelManager()
@ -333,7 +334,7 @@ class DatasetDocumentSegmentAddApi(Resource):
        if not current_user.is_dataset_editor:
            raise Forbidden()
        # check embedding model setting
-        if dataset.indexing_technique == "high_quality":
+        if dataset.indexing_technique == IndexTechniqueType.HIGH_QUALITY:
            try:
                model_manager = ModelManager()
                model_manager.get_model_instance(
@ -383,7 +384,7 @@ class DatasetDocumentSegmentUpdateApi(Resource):
        document = DocumentService.get_document(dataset_id, document_id)
        if not document:
            raise NotFound("Document not found.")
-        if dataset.indexing_technique == "high_quality":
+        if dataset.indexing_technique == IndexTechniqueType.HIGH_QUALITY:
            # check embedding model setting
            try:
                model_manager = ModelManager()
@ -401,10 +402,10 @@ class DatasetDocumentSegmentUpdateApi(Resource):
                raise ProviderNotInitializeError(ex.description)
            # check segment
        segment_id = str(segment_id)
-        segment = (
-            db.session.query(DocumentSegment)
+        segment = db.session.scalar(
+            select(DocumentSegment)
            .where(DocumentSegment.id == str(segment_id), DocumentSegment.tenant_id == current_tenant_id)
-            .first()
+            .limit(1)
        )
        if not segment:
            raise NotFound("Segment not found.")
@ -447,10 +448,10 @@ class DatasetDocumentSegmentUpdateApi(Resource):
            raise NotFound("Document not found.")
        # check segment
        segment_id = str(segment_id)
-        segment = (
-            db.session.query(DocumentSegment)
+        segment = db.session.scalar(
+            select(DocumentSegment)
            .where(DocumentSegment.id == str(segment_id), DocumentSegment.tenant_id == current_tenant_id)
-            .first()
+            .limit(1)
        )
        if not segment:
            raise NotFound("Segment not found.")
@ -494,7 +495,7 @@ class DatasetDocumentSegmentBatchImportApi(Resource):
        payload = BatchImportPayload.model_validate(console_ns.payload or {})
        upload_file_id = payload.upload_file_id

-        upload_file = db.session.query(UploadFile).where(UploadFile.id == upload_file_id).first()
+        upload_file = db.session.scalar(select(UploadFile).where(UploadFile.id == upload_file_id).limit(1))
        if not upload_file:
            raise NotFound("UploadFile not found.")

@ -559,17 +560,17 @@ class ChildChunkAddApi(Resource):
            raise NotFound("Document not found.")
        # check segment
        segment_id = str(segment_id)
-        segment = (
-            db.session.query(DocumentSegment)
+        segment = db.session.scalar(
+            select(DocumentSegment)
            .where(DocumentSegment.id == str(segment_id), DocumentSegment.tenant_id == current_tenant_id)
-            .first()
+            .limit(1)
        )
        if not segment:
            raise NotFound("Segment not found.")
        if not current_user.is_dataset_editor:
            raise Forbidden()
        # check embedding model setting
-        if dataset.indexing_technique == "high_quality":
+        if dataset.indexing_technique == IndexTechniqueType.HIGH_QUALITY:
            try:
                model_manager = ModelManager()
                model_manager.get_model_instance(
@ -616,10 +617,10 @@ class ChildChunkAddApi(Resource):
            raise NotFound("Document not found.")
        # check segment
        segment_id = str(segment_id)
-        segment = (
-            db.session.query(DocumentSegment)
+        segment = db.session.scalar(
+            select(DocumentSegment)
            .where(DocumentSegment.id == str(segment_id), DocumentSegment.tenant_id == current_tenant_id)
-            .first()
+            .limit(1)
        )
        if not segment:
            raise NotFound("Segment not found.")
@ -666,10 +667,10 @@ class ChildChunkAddApi(Resource):
            raise NotFound("Document not found.")
            # check segment
        segment_id = str(segment_id)
-        segment = (
-            db.session.query(DocumentSegment)
+        segment = db.session.scalar(
+            select(DocumentSegment)
            .where(DocumentSegment.id == str(segment_id), DocumentSegment.tenant_id == current_tenant_id)
-            .first()
+            .limit(1)
        )
        if not segment:
            raise NotFound("Segment not found.")
@ -714,24 +715,24 @@ class ChildChunkUpdateApi(Resource):
            raise NotFound("Document not found.")
        # check segment
        segment_id = str(segment_id)
-        segment = (
-            db.session.query(DocumentSegment)
+        segment = db.session.scalar(
+            select(DocumentSegment)
            .where(DocumentSegment.id == str(segment_id), DocumentSegment.tenant_id == current_tenant_id)
-            .first()
+            .limit(1)
        )
        if not segment:
            raise NotFound("Segment not found.")
        # check child chunk
        child_chunk_id = str(child_chunk_id)
-        child_chunk = (
-            db.session.query(ChildChunk)
+        child_chunk = db.session.scalar(
+            select(ChildChunk)
            .where(
                ChildChunk.id == str(child_chunk_id),
                ChildChunk.tenant_id == current_tenant_id,
                ChildChunk.segment_id == segment.id,
                ChildChunk.document_id == document_id,
            )
-            .first()
+            .limit(1)
        )
        if not child_chunk:
            raise NotFound("Child chunk not found.")
@ -771,24 +772,24 @@ class ChildChunkUpdateApi(Resource):
            raise NotFound("Document not found.")
            # check segment
        segment_id = str(segment_id)
-        segment = (
-            db.session.query(DocumentSegment)
+        segment = db.session.scalar(
+            select(DocumentSegment)
            .where(DocumentSegment.id == str(segment_id), DocumentSegment.tenant_id == current_tenant_id)
-            .first()
+            .limit(1)
        )
        if not segment:
            raise NotFound("Segment not found.")
        # check child chunk
        child_chunk_id = str(child_chunk_id)
-        child_chunk = (
-            db.session.query(ChildChunk)
+        child_chunk = db.session.scalar(
+            select(ChildChunk)
            .where(
                ChildChunk.id == str(child_chunk_id),
                ChildChunk.tenant_id == current_tenant_id,
                ChildChunk.segment_id == segment.id,
                ChildChunk.document_id == document_id,
            )
-            .first()
+            .limit(1)
        )
        if not child_chunk:
            raise NotFound("Child chunk not found.")