Merge tag '0.15.7' into e-260

0.15.7
Garfield Dai
2025-04-28 17:17:26 +08:00
26 changed files with 319 additions and 215 deletions

View File

@@ -104,7 +104,6 @@ class CotAgentRunner(BaseAgentRunner, ABC):
# recalc llm max tokens
prompt_messages = self._organize_prompt_messages()
self.recalc_llm_max_tokens(self.model_config, prompt_messages)
# invoke model
chunks = model_instance.invoke_llm(
prompt_messages=prompt_messages,

View File

@@ -84,7 +84,6 @@ class FunctionCallAgentRunner(BaseAgentRunner):
# recalc llm max tokens
prompt_messages = self._organize_prompt_messages()
self.recalc_llm_max_tokens(self.model_config, prompt_messages)
# invoke model
chunks: Union[Generator[LLMResultChunk, None, None], LLMResult] = model_instance.invoke_llm(
prompt_messages=prompt_messages,

View File

@@ -55,20 +55,6 @@ class AgentChatAppRunner(AppRunner):
query = application_generate_entity.query
files = application_generate_entity.files
# Pre-calculate the number of tokens of the prompt messages,
# and return the rest number of tokens by model context token size limit and max token size limit.
# If the rest number of tokens is not enough, raise exception.
# Include: prompt template, inputs, query(optional), files(optional)
# Not Include: memory, external data, dataset context
self.get_pre_calculate_rest_tokens(
app_record=app_record,
model_config=application_generate_entity.model_conf,
prompt_template_entity=app_config.prompt_template,
inputs=inputs,
files=files,
query=query,
)
memory = None
if application_generate_entity.conversation_id:
# get memory of conversation (read-only)
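The pre-calculation removed here (and from ChatAppRunner and CompletionAppRunner below) is the budget check defined in AppRunner.get_pre_calculate_rest_tokens later in this diff: subtract the reserved completion budget and the prompt's token count from the model's context window, and raise if nothing is left for memory or dataset context. A minimal sketch of that arithmetic, with illustrative names:

def remaining_context_tokens(context_size: int, max_tokens: int, prompt_tokens: int) -> int:
    # Tokens left for memory / external data / dataset context after reserving
    # the completion budget (max_tokens) and the already-built prompt.
    rest = context_size - max_tokens - prompt_tokens
    if rest < 0:
        raise ValueError(
            "Prompt is too long: shorten the prefix prompt, lower max_tokens, "
            "or switch to a model with a larger context window."
        )
    return rest

# e.g. a 200000-token context with max_tokens=8192 and a 1500-token prompt
# leaves 200000 - 8192 - 1500 = 190308 tokens for history and context.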

View File

@@ -15,10 +15,8 @@ from core.app.features.annotation_reply.annotation_reply import AnnotationReplyF
from core.app.features.hosting_moderation.hosting_moderation import HostingModerationFeature
from core.external_data_tool.external_data_fetch import ExternalDataFetch
from core.memory.token_buffer_memory import TokenBufferMemory
from core.model_manager import ModelInstance
from core.model_runtime.entities.llm_entities import LLMResult, LLMResultChunk, LLMResultChunkDelta, LLMUsage
from core.model_runtime.entities.message_entities import AssistantPromptMessage, PromptMessage
from core.model_runtime.entities.model_entities import ModelPropertyKey
from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.moderation.input_moderation import InputModeration
from core.prompt.advanced_prompt_transform import AdvancedPromptTransform
@@ -31,106 +29,6 @@ if TYPE_CHECKING:
class AppRunner:
def get_pre_calculate_rest_tokens(
self,
app_record: App,
model_config: ModelConfigWithCredentialsEntity,
prompt_template_entity: PromptTemplateEntity,
inputs: Mapping[str, str],
files: Sequence["File"],
query: Optional[str] = None,
) -> int:
"""
Get pre calculate rest tokens
:param app_record: app record
:param model_config: model config entity
:param prompt_template_entity: prompt template entity
:param inputs: inputs
:param files: files
:param query: query
:return:
"""
# Invoke model
model_instance = ModelInstance(
provider_model_bundle=model_config.provider_model_bundle, model=model_config.model
)
model_context_tokens = model_config.model_schema.model_properties.get(ModelPropertyKey.CONTEXT_SIZE)
max_tokens = 0
for parameter_rule in model_config.model_schema.parameter_rules:
if parameter_rule.name == "max_tokens" or (
parameter_rule.use_template and parameter_rule.use_template == "max_tokens"
):
max_tokens = (
model_config.parameters.get(parameter_rule.name)
or model_config.parameters.get(parameter_rule.use_template or "")
) or 0
if model_context_tokens is None:
return -1
if max_tokens is None:
max_tokens = 0
# get prompt messages without memory and context
prompt_messages, stop = self.organize_prompt_messages(
app_record=app_record,
model_config=model_config,
prompt_template_entity=prompt_template_entity,
inputs=inputs,
files=files,
query=query,
)
prompt_tokens = model_instance.get_llm_num_tokens(prompt_messages)
rest_tokens: int = model_context_tokens - max_tokens - prompt_tokens
if rest_tokens < 0:
raise InvokeBadRequestError(
"Query or prefix prompt is too long, you can reduce the prefix prompt, "
"or shrink the max token, or switch to a llm with a larger token limit size."
)
return rest_tokens
def recalc_llm_max_tokens(
self, model_config: ModelConfigWithCredentialsEntity, prompt_messages: list[PromptMessage]
):
# recalc max_tokens if sum(prompt_token + max_tokens) over model token limit
model_instance = ModelInstance(
provider_model_bundle=model_config.provider_model_bundle, model=model_config.model
)
model_context_tokens = model_config.model_schema.model_properties.get(ModelPropertyKey.CONTEXT_SIZE)
max_tokens = 0
for parameter_rule in model_config.model_schema.parameter_rules:
if parameter_rule.name == "max_tokens" or (
parameter_rule.use_template and parameter_rule.use_template == "max_tokens"
):
max_tokens = (
model_config.parameters.get(parameter_rule.name)
or model_config.parameters.get(parameter_rule.use_template or "")
) or 0
if model_context_tokens is None:
return -1
if max_tokens is None:
max_tokens = 0
prompt_tokens = model_instance.get_llm_num_tokens(prompt_messages)
if prompt_tokens + max_tokens > model_context_tokens:
max_tokens = max(model_context_tokens - prompt_tokens, 16)
for parameter_rule in model_config.model_schema.parameter_rules:
if parameter_rule.name == "max_tokens" or (
parameter_rule.use_template and parameter_rule.use_template == "max_tokens"
):
model_config.parameters[parameter_rule.name] = max_tokens
def organize_prompt_messages(
self,
app_record: App,

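recalc_llm_max_tokens, deleted just above along with get_pre_calculate_rest_tokens, clamps the completion budget instead of raising: if prompt_tokens + max_tokens exceeds the context window, max_tokens is reduced to whatever space remains, with a floor of 16. A rough sketch of that behaviour, with illustrative names:

def clamp_max_tokens(context_size: int, prompt_tokens: int, max_tokens: int) -> int:
    # Shrink the completion budget so prompt + completion still fits the
    # context window, but never below a 16-token floor.
    if prompt_tokens + max_tokens > context_size:
        return max(context_size - prompt_tokens, 16)
    return max_tokens

# clamp_max_tokens(context_size=8192, prompt_tokens=8000, max_tokens=4096) -> 192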
View File

@@ -50,20 +50,6 @@ class ChatAppRunner(AppRunner):
query = application_generate_entity.query
files = application_generate_entity.files
# Pre-calculate the number of tokens of the prompt messages,
# and return the rest number of tokens by model context token size limit and max token size limit.
# If the rest number of tokens is not enough, raise exception.
# Include: prompt template, inputs, query(optional), files(optional)
# Not Include: memory, external data, dataset context
self.get_pre_calculate_rest_tokens(
app_record=app_record,
model_config=application_generate_entity.model_conf,
prompt_template_entity=app_config.prompt_template,
inputs=inputs,
files=files,
query=query,
)
memory = None
if application_generate_entity.conversation_id:
# get memory of conversation (read-only)
@@ -194,9 +180,6 @@ class ChatAppRunner(AppRunner):
if hosting_moderation_result:
return
# Re-calculate the max tokens if sum(prompt_token + max_tokens) over model token limit
self.recalc_llm_max_tokens(model_config=application_generate_entity.model_conf, prompt_messages=prompt_messages)
# Invoke model
model_instance = ModelInstance(
provider_model_bundle=application_generate_entity.model_conf.provider_model_bundle,

View File

@@ -43,20 +43,6 @@ class CompletionAppRunner(AppRunner):
query = application_generate_entity.query
files = application_generate_entity.files
# Pre-calculate the number of tokens of the prompt messages,
# and return the rest number of tokens by model context token size limit and max token size limit.
# If the rest number of tokens is not enough, raise exception.
# Include: prompt template, inputs, query(optional), files(optional)
# Not Include: memory, external data, dataset context
self.get_pre_calculate_rest_tokens(
app_record=app_record,
model_config=application_generate_entity.model_conf,
prompt_template_entity=app_config.prompt_template,
inputs=inputs,
files=files,
query=query,
)
# organize all inputs and template to prompt messages
# Include: prompt template, inputs, query(optional), files(optional)
prompt_messages, stop = self.organize_prompt_messages(
@@ -152,9 +138,6 @@ class CompletionAppRunner(AppRunner):
if hosting_moderation_result:
return
# Re-calculate the max tokens if sum(prompt_token + max_tokens) over model token limit
self.recalc_llm_max_tokens(model_config=application_generate_entity.model_conf, prompt_messages=prompt_messages)
# Invoke model
model_instance = ModelInstance(
provider_model_bundle=application_generate_entity.model_conf.provider_model_bundle,

View File

@@ -26,7 +26,7 @@ class TokenBufferMemory:
self.model_instance = model_instance
def get_history_prompt_messages(
self, max_token_limit: int = 2000, message_limit: Optional[int] = None
self, max_token_limit: int = 100000, message_limit: Optional[int] = None
) -> Sequence[PromptMessage]:
"""
Get history prompt messages.

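Raising the default from 2000 to 100000 tokens means callers that omit max_token_limit (such as the prompt-building change at the end of this diff) now keep far more history. A trim of this kind typically walks the conversation from newest to oldest and stops once the budget is spent; an illustrative sketch under that assumption, not the actual implementation:

def trim_history(messages: list[str], count_tokens, max_token_limit: int = 100000) -> list[str]:
    # Keep the most recent messages whose combined token count fits the budget.
    kept: list[str] = []
    used = 0
    for message in reversed(messages):
        tokens = count_tokens(message)
        if used + tokens > max_token_limit:
            break
        kept.append(message)
        used += tokens
    return list(reversed(kept))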
View File

@@ -0,0 +1,115 @@
model: us.anthropic.claude-3-7-sonnet-20250219-v1:0
label:
en_US: Claude 3.7 Sonnet(US.Cross Region Inference)
icon: icon_s_en.svg
model_type: llm
features:
- agent-thought
- vision
- tool-call
- stream-tool-call
model_properties:
mode: chat
context_size: 200000
# docs: https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-anthropic-claude-messages.html
parameter_rules:
- name: enable_cache
label:
zh_Hans: 启用提示缓存
en_US: Enable Prompt Cache
type: boolean
required: false
default: true
help:
zh_Hans: 启用提示缓存可以提高性能并降低成本。Claude 3.7 Sonnet支持在system、messages和tools字段中使用缓存检查点。
en_US: Enable prompt caching to improve performance and reduce costs. Claude 3.7 Sonnet supports cache checkpoints in system, messages, and tools fields.
- name: reasoning_type
label:
zh_Hans: 推理配置
en_US: Reasoning Type
type: boolean
required: false
default: false
placeholder:
zh_Hans: 设置推理配置
en_US: Set reasoning configuration
help:
zh_Hans: 控制模型的推理能力。启用时temperature将固定为1且top_p将被禁用。
en_US: Controls the model's reasoning capability. When enabled, temperature will be fixed to 1 and top_p will be disabled.
- name: reasoning_budget
show_on:
- variable: reasoning_type
value: true
label:
zh_Hans: 推理预算
en_US: Reasoning Budget
type: int
default: 1024
min: 0
max: 128000
help:
zh_Hans: 推理的预算限制最小1024必须小于max_tokens。仅在推理类型为enabled时可用。
en_US: Budget limit for reasoning (minimum 1024), must be less than max_tokens. Only available when reasoning type is enabled.
- name: max_tokens
use_template: max_tokens
required: true
label:
zh_Hans: 最大token数
en_US: Max Tokens
type: int
default: 8192
min: 1
max: 128000
help:
zh_Hans: 停止前生成的最大令牌数。请注意Anthropic Claude 模型可能会在达到 max_tokens 的值之前停止生成令牌。不同的 Anthropic Claude 模型对此参数具有不同的最大值。
en_US: The maximum number of tokens to generate before stopping. Note that Anthropic Claude models might stop generating tokens before reaching the value of max_tokens. Different Anthropic Claude models have different maximum values for this parameter.
- name: temperature
use_template: temperature
required: false
label:
zh_Hans: 模型温度
en_US: Model Temperature
type: float
default: 1
min: 0.0
max: 1.0
help:
zh_Hans: 生成内容的随机性。当推理功能启用时该值将被固定为1。
en_US: The amount of randomness injected into the response. When reasoning is enabled, this value will be fixed to 1.
- name: top_p
show_on:
- variable: reasoning_type
value: disabled
use_template: top_p
label:
zh_Hans: Top P
en_US: Top P
required: false
type: float
default: 0.999
min: 0.000
max: 1.000
help:
zh_Hans: 在核采样中的概率阈值。当推理功能启用时,该参数将被禁用。
en_US: The probability threshold in nucleus sampling. When reasoning is enabled, this parameter will be disabled.
- name: top_k
label:
zh_Hans: 取样数量
en_US: Top k
required: false
type: int
default: 0
min: 0
# tip docs from aws has error, max value is 500
max: 500
help:
zh_Hans: 对于每个后续标记,仅从前 K 个选项中进行采样。使用 top_k 删除长尾低概率响应。
en_US: Only sample from the top K options for each subsequent token. Use top_k to remove long tail low probability responses.
- name: response_format
use_template: response_format
pricing:
input: '0.003'
output: '0.015'
unit: '0.001'
currency: USD
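reasoning_type and reasoning_budget correspond to Anthropic's extended-thinking feature, and the help text above states its constraints: temperature pinned to 1, top_p disabled, and the budget kept below max_tokens with a 1024 minimum. A hedged sketch of how these settings might be turned into a Bedrock Converse request field; the payload shape is assumed from the Anthropic/Bedrock extended-thinking docs, and the provider code may wire it differently:

def build_reasoning_fields(reasoning_type: bool, reasoning_budget: int, max_tokens: int) -> dict:
    # Assumed mapping of the YAML parameters onto the extended-thinking payload.
    if not reasoning_type:
        return {}
    if reasoning_budget >= max_tokens:
        raise ValueError("reasoning_budget must be less than max_tokens")
    return {"thinking": {"type": "enabled", "budget_tokens": max(reasoning_budget, 1024)}}

# With thinking enabled the request would also force temperature to 1 and omit
# top_p / top_k, matching the parameter help above.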

View File

@@ -58,6 +58,7 @@ class BedrockLargeLanguageModel(LargeLanguageModel):
# TODO There is invoke issue: context limit on Cohere Model, will add them after fixed.
CONVERSE_API_ENABLED_MODEL_INFO = [
{"prefix": "anthropic.claude-v2", "support_system_prompts": True, "support_tool_use": False},
{"prefix": "us.deepseek", "support_system_prompts": True, "support_tool_use": False},
{"prefix": "anthropic.claude-v1", "support_system_prompts": True, "support_tool_use": False},
{"prefix": "us.anthropic.claude-3", "support_system_prompts": True, "support_tool_use": True},
{"prefix": "eu.anthropic.claude-3", "support_system_prompts": True, "support_tool_use": True},

View File

@@ -0,0 +1,63 @@
model: us.deepseek.r1-v1:0
label:
en_US: DeepSeek-R1(US.Cross Region Inference)
icon: icon_s_en.svg
model_type: llm
features:
- agent-thought
- vision
- tool-call
- stream-tool-call
model_properties:
mode: chat
context_size: 32768
parameter_rules:
- name: max_tokens
use_template: max_tokens
required: true
label:
zh_Hans: 最大token数
en_US: Max Tokens
type: int
default: 8192
min: 1
max: 128000
help:
zh_Hans: 停止前生成的最大令牌数。
en_US: The maximum number of tokens to generate before stopping.
- name: temperature
use_template: temperature
required: false
label:
zh_Hans: 模型温度
en_US: Model Temperature
type: float
default: 1
min: 0.0
max: 1.0
help:
zh_Hans: 生成内容的随机性。当推理功能启用时该值将被固定为1。
en_US: The amount of randomness injected into the response. When reasoning is enabled, this value will be fixed to 1.
- name: top_p
show_on:
- variable: reasoning_type
value: disabled
use_template: top_p
label:
zh_Hans: Top P
en_US: Top P
required: false
type: float
default: 0.999
min: 0.000
max: 1.000
help:
zh_Hans: 在核采样中的概率阈值。当推理功能启用时,该参数将被禁用。
en_US: The probability threshold in nucleus sampling. When reasoning is enabled, this parameter will be disabled.
- name: response_format
use_template: response_format
pricing:
input: '0.001'
output: '0.005'
unit: '0.001'
currency: USD

View File

@@ -1,3 +1,4 @@
- gpt-4.1
- o1
- o1-2024-12-17
- o1-mini

View File

@@ -0,0 +1,60 @@
model: gpt-4.1
label:
zh_Hans: gpt-4.1
en_US: gpt-4.1
model_type: llm
features:
- multi-tool-call
- agent-thought
- stream-tool-call
- vision
model_properties:
mode: chat
context_size: 1047576
parameter_rules:
- name: temperature
use_template: temperature
- name: top_p
use_template: top_p
- name: presence_penalty
use_template: presence_penalty
- name: frequency_penalty
use_template: frequency_penalty
- name: max_tokens
use_template: max_tokens
default: 512
min: 1
max: 32768
- name: reasoning_effort
label:
zh_Hans: 推理工作
en_US: Reasoning Effort
type: string
help:
zh_Hans: 限制推理模型的推理工作
en_US: Constrains effort on reasoning for reasoning models
required: false
options:
- low
- medium
- high
- name: response_format
label:
zh_Hans: 回复格式
en_US: Response Format
type: string
help:
zh_Hans: 指定模型必须输出的格式
en_US: specifying the format that the model must output
required: false
options:
- text
- json_object
- json_schema
- name: json_schema
use_template: json_schema
pricing:
input: '2.00'
output: '8.00'
unit: '0.000001'
currency: USD
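In these pricing blocks, unit scales the quoted prices down to a per-token figure, so input '2.00' with unit '0.000001' reads as $2.00 per million input tokens (the same reading gives $3 / $15 per million for the Claude 3.7 Sonnet block earlier in this diff). A small worked example of the cost arithmetic, assuming that interpretation of the fields:

def estimate_cost(input_tokens: int, output_tokens: int,
                  input_price: float = 2.00, output_price: float = 8.00,
                  unit: float = 0.000001) -> float:
    # cost = tokens * price * unit; 10k input + 2k output on gpt-4.1 comes to
    # 10000 * 2.00 * 1e-6 + 2000 * 8.00 * 1e-6 = 0.036 USD.
    return input_tokens * input_price * unit + output_tokens * output_price * unit

print(round(estimate_cost(10_000, 2_000), 3))  # 0.036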

View File

@@ -1057,7 +1057,7 @@ class OpenAILargeLanguageModel(_CommonOpenAI, LargeLanguageModel):
model = "gpt-4o"
try:
encoding = tiktoken.encoding_for_model(model)
encoding = tiktoken.get_encoding(model)
except KeyError:
logger.warning("Warning: model not found. Using cl100k_base encoding.")
model = "cl100k_base"

View File

@@ -195,7 +195,7 @@ class CodeNode(BaseNode[CodeNodeData]):
if output_config.type == "object":
# check if output is object
if not isinstance(result.get(output_name), dict):
if isinstance(result.get(output_name), type(None)):
if result.get(output_name) is None:
transformed_result[output_name] = None
else:
raise OutputValidationError(
@@ -223,7 +223,7 @@ class CodeNode(BaseNode[CodeNodeData]):
elif output_config.type == "array[number]":
# check if array of number available
if not isinstance(result[output_name], list):
if isinstance(result[output_name], type(None)):
if result[output_name] is None:
transformed_result[output_name] = None
else:
raise OutputValidationError(
@@ -244,7 +244,7 @@ class CodeNode(BaseNode[CodeNodeData]):
elif output_config.type == "array[string]":
# check if array of string available
if not isinstance(result[output_name], list):
if isinstance(result[output_name], type(None)):
if result[output_name] is None:
transformed_result[output_name] = None
else:
raise OutputValidationError(
@@ -265,7 +265,7 @@ class CodeNode(BaseNode[CodeNodeData]):
elif output_config.type == "array[object]":
# check if array of object available
if not isinstance(result[output_name], list):
if isinstance(result[output_name], type(None)):
if result[output_name] is None:
transformed_result[output_name] = None
else:
raise OutputValidationError(

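Each of these hunks swaps isinstance(value, type(None)) for the idiomatic value is None; both are true only for None, but the identity check is clearer and is what PEP 8 recommends. Applied to this output-validation pattern (simplified, with illustrative names):

def coerce_optional_list(result: dict, output_name: str):
    # None passes through unchanged; anything else must already be a list.
    value = result.get(output_name)
    if value is None:
        return None
    if not isinstance(value, list):
        raise ValueError(f"Output {output_name} must be a list or None")
    return value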
View File

@@ -968,14 +968,12 @@ def _handle_memory_chat_mode(
*,
memory: TokenBufferMemory | None,
memory_config: MemoryConfig | None,
model_config: ModelConfigWithCredentialsEntity,
model_config: ModelConfigWithCredentialsEntity, # TODO(-LAN-): Needs to remove
) -> Sequence[PromptMessage]:
memory_messages: Sequence[PromptMessage] = []
# Get messages from memory for chat model
if memory and memory_config:
rest_tokens = _calculate_rest_token(prompt_messages=[], model_config=model_config)
memory_messages = memory.get_history_prompt_messages(
max_token_limit=rest_tokens,
message_limit=memory_config.window.size if memory_config.window.enabled else None,
)
return memory_messages
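With the rest-token calculation gone, the memory fetch no longer derives a per-request budget; it relies on TokenBufferMemory's new 100000-token default and only bounds the number of messages by the configured window. The call effectively reduces to the following (names as in the hunk above):

memory_messages = memory.get_history_prompt_messages(
    message_limit=memory_config.window.size if memory_config.window.enabled else None,
)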