Merge branch 'main' into feat/webapp-verified-sso-main

This commit is contained in:
GareArc
2025-06-09 16:55:31 +09:00
247 changed files with 4124 additions and 1995 deletions

View File

@ -491,3 +491,10 @@ OTEL_METRIC_EXPORT_TIMEOUT=30000
# Prevent Clickjacking
ALLOW_EMBED=false
# Dataset queue monitor configuration
QUEUE_MONITOR_THRESHOLD=200
# You can configure multiple ones, separated by commas. eg: test1@dify.ai,test2@dify.ai
QUEUE_MONITOR_ALERT_EMAILS=
# Monitor interval in minutes, default is 30 minutes
QUEUE_MONITOR_INTERVAL=30

View File

@ -43,6 +43,7 @@ select = [
"S307", # suspicious-eval-usage, disallow use of `eval` and `ast.literal_eval`
"S301", # suspicious-pickle-usage, disallow use of `pickle` and its wrappers.
"S302", # suspicious-marshal-usage, disallow use of `marshal` module
"S311", # suspicious-non-cryptographic-random-usage
]
ignore = [

View File

@ -4,7 +4,7 @@ FROM python:3.12-slim-bookworm AS base
WORKDIR /app/api
# Install uv
ENV UV_VERSION=0.6.14
ENV UV_VERSION=0.7.11
RUN pip install --no-cache-dir uv==${UV_VERSION}

View File

@ -2,7 +2,7 @@ import os
from typing import Any, Literal, Optional
from urllib.parse import parse_qsl, quote_plus
from pydantic import Field, NonNegativeInt, PositiveFloat, PositiveInt, computed_field
from pydantic import Field, NonNegativeFloat, NonNegativeInt, PositiveFloat, PositiveInt, computed_field
from pydantic_settings import BaseSettings
from .cache.redis_config import RedisConfig
@ -256,6 +256,25 @@ class InternalTestConfig(BaseSettings):
)
class DatasetQueueMonitorConfig(BaseSettings):
"""
Configuration settings for Dataset Queue Monitor
"""
QUEUE_MONITOR_THRESHOLD: Optional[NonNegativeInt] = Field(
description="Threshold for dataset queue monitor",
default=200,
)
QUEUE_MONITOR_ALERT_EMAILS: Optional[str] = Field(
description="Emails for dataset queue monitor alert, separated by commas",
default=None,
)
QUEUE_MONITOR_INTERVAL: Optional[NonNegativeFloat] = Field(
description="Interval for dataset queue monitor in minutes",
default=30,
)
class MiddlewareConfig(
# place the configs in alphabet order
CeleryConfig,
@ -303,5 +322,6 @@ class MiddlewareConfig(
BaiduVectorDBConfig,
OpenGaussConfig,
TableStoreConfig,
DatasetQueueMonitorConfig,
):
pass

View File

@ -32,6 +32,7 @@ def get_user(tenant_id: str, user_id: str | None) -> Account | EndUser:
)
session.add(user_model)
session.commit()
session.refresh(user_model)
else:
user_model = AccountService.load_user(user_id)
if not user_model:

View File

@ -369,6 +369,7 @@ class DatasetTagsApi(DatasetApiResource):
)
parser.add_argument("tag_id", nullable=False, required=True, help="Id of a tag.", type=str)
args = parser.parse_args()
args["type"] = "knowledge"
tag = TagService.update_tags(args, args.get("tag_id"))
binding_count = TagService.get_tag_binding_count(args.get("tag_id"))

View File

@ -175,8 +175,11 @@ class DocumentAddByFileApi(DatasetApiResource):
if not dataset:
raise ValueError("Dataset does not exist.")
if not dataset.indexing_technique and not args.get("indexing_technique"):
indexing_technique = args.get("indexing_technique") or dataset.indexing_technique
if not indexing_technique:
raise ValueError("indexing_technique is required.")
args["indexing_technique"] = indexing_technique
# save file info
file = request.files["file"]
@ -206,12 +209,16 @@ class DocumentAddByFileApi(DatasetApiResource):
knowledge_config = KnowledgeConfig(**args)
DocumentService.document_create_args_validate(knowledge_config)
dataset_process_rule = dataset.latest_process_rule if "process_rule" not in args else None
if not knowledge_config.original_document_id and not dataset_process_rule and not knowledge_config.process_rule:
raise ValueError("process_rule is required.")
try:
documents, batch = DocumentService.save_document_with_dataset_id(
dataset=dataset,
knowledge_config=knowledge_config,
account=dataset.created_by_account,
dataset_process_rule=dataset.latest_process_rule if "process_rule" not in args else None,
dataset_process_rule=dataset_process_rule,
created_from="api",
)
except ProviderTokenNotInitError as ex:

View File

@ -55,6 +55,25 @@ class ProviderModelWithStatusEntity(ProviderModel):
status: ModelStatus
load_balancing_enabled: bool = False
def raise_for_status(self) -> None:
"""
Check model status and raise ValueError if not active.
:raises ValueError: When model status is not active, with a descriptive message
"""
if self.status == ModelStatus.ACTIVE:
return
error_messages = {
ModelStatus.NO_CONFIGURE: "Model is not configured",
ModelStatus.QUOTA_EXCEEDED: "Model quota has been exceeded",
ModelStatus.NO_PERMISSION: "No permission to use this model",
ModelStatus.DISABLED: "Model is disabled",
}
if self.status in error_messages:
raise ValueError(error_messages[self.status])
class ModelWithProviderEntity(ProviderModelWithStatusEntity):
"""

View File

@ -41,45 +41,53 @@ class Extensible:
extensions = []
position_map: dict[str, int] = {}
# get the path of the current class
current_path = os.path.abspath(cls.__module__.replace(".", os.path.sep) + ".py")
current_dir_path = os.path.dirname(current_path)
# Get the package name from the module path
package_name = ".".join(cls.__module__.split(".")[:-1])
# traverse subdirectories
for subdir_name in os.listdir(current_dir_path):
if subdir_name.startswith("__"):
continue
try:
# Get package directory path
package_spec = importlib.util.find_spec(package_name)
if not package_spec or not package_spec.origin:
raise ImportError(f"Could not find package {package_name}")
subdir_path = os.path.join(current_dir_path, subdir_name)
extension_name = subdir_name
if os.path.isdir(subdir_path):
package_dir = os.path.dirname(package_spec.origin)
# Traverse subdirectories
for subdir_name in os.listdir(package_dir):
if subdir_name.startswith("__"):
continue
subdir_path = os.path.join(package_dir, subdir_name)
if not os.path.isdir(subdir_path):
continue
extension_name = subdir_name
file_names = os.listdir(subdir_path)
# is builtin extension, builtin extension
# in the front-end page and business logic, there are special treatments.
# Check for extension module file
if (extension_name + ".py") not in file_names:
logging.warning(f"Missing {extension_name}.py file in {subdir_path}, Skip.")
continue
# Check for builtin flag and position
builtin = False
# default position is 0 can not be None for sort_to_dict_by_position_map
position = 0
if "__builtin__" in file_names:
builtin = True
builtin_file_path = os.path.join(subdir_path, "__builtin__")
if os.path.exists(builtin_file_path):
position = int(Path(builtin_file_path).read_text(encoding="utf-8").strip())
position_map[extension_name] = position
if (extension_name + ".py") not in file_names:
logging.warning(f"Missing {extension_name}.py file in {subdir_path}, Skip.")
continue
# Dynamic loading {subdir_name}.py file and find the subclass of Extensible
py_path = os.path.join(subdir_path, extension_name + ".py")
spec = importlib.util.spec_from_file_location(extension_name, py_path)
# Import the extension module
module_name = f"{package_name}.{extension_name}.{extension_name}"
spec = importlib.util.find_spec(module_name)
if not spec or not spec.loader:
raise Exception(f"Failed to load module {extension_name} from {py_path}")
raise ImportError(f"Failed to load module {module_name}")
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
# Find extension class
extension_class = None
for name, obj in vars(mod).items():
if isinstance(obj, type) and issubclass(obj, cls) and obj != cls:
@ -87,21 +95,21 @@ class Extensible:
break
if not extension_class:
logging.warning(f"Missing subclass of {cls.__name__} in {py_path}, Skip.")
logging.warning(f"Missing subclass of {cls.__name__} in {module_name}, Skip.")
continue
# Load schema if not builtin
json_data: dict[str, Any] = {}
if not builtin:
if "schema.json" not in file_names:
json_path = os.path.join(subdir_path, "schema.json")
if not os.path.exists(json_path):
logging.warning(f"Missing schema.json file in {subdir_path}, Skip.")
continue
json_path = os.path.join(subdir_path, "schema.json")
json_data = {}
if os.path.exists(json_path):
with open(json_path, encoding="utf-8") as f:
json_data = json.load(f)
with open(json_path, encoding="utf-8") as f:
json_data = json.load(f)
# Create extension
extensions.append(
ModuleExtension(
extension_class=extension_class,
@ -113,6 +121,11 @@ class Extensible:
)
)
except Exception as e:
logging.exception("Error scanning extensions")
raise
# Sort extensions by position
sorted_extensions = sort_to_dict_by_position_map(
position_map=position_map, data=extensions, name_func=lambda x: x.name
)

View File

@ -1,5 +1,5 @@
import logging
import random
import secrets
from typing import cast
from core.app.entities.app_invoke_entities import ModelConfigWithCredentialsEntity
@ -38,7 +38,7 @@ def check_moderation(tenant_id: str, model_config: ModelConfigWithCredentialsEnt
if len(text_chunks) == 0:
return True
text_chunk = random.choice(text_chunks)
text_chunk = secrets.choice(text_chunks)
try:
model_provider_factory = ModelProviderFactory(tenant_id)

View File

@ -160,6 +160,10 @@ class ProviderModel(BaseModel):
deprecated: bool = False
model_config = ConfigDict(protected_namespaces=())
@property
def support_structure_output(self) -> bool:
return self.features is not None and ModelFeature.STRUCTURED_OUTPUT in self.features
class ParameterRule(BaseModel):
"""

View File

@ -98,6 +98,7 @@ class WeaveConfig(BaseTracingConfig):
entity: str | None = None
project: str
endpoint: str = "https://trace.wandb.ai"
host: str | None = None
@field_validator("endpoint")
@classmethod
@ -109,6 +110,14 @@ class WeaveConfig(BaseTracingConfig):
return v
@field_validator("host")
@classmethod
def validate_host(cls, v, info: ValidationInfo):
if v is not None and v != "":
if not v.startswith(("https://", "http://")):
raise ValueError("host must start with https:// or http://")
return v
OPS_FILE_PATH = "ops_trace/"
OPS_TRACE_FAILED_KEY = "FAILED_OPS_TRACE"

View File

@ -81,7 +81,7 @@ class OpsTraceProviderConfigMap(dict[str, dict[str, Any]]):
return {
"config_class": WeaveConfig,
"secret_keys": ["api_key"],
"other_keys": ["project", "entity", "endpoint"],
"other_keys": ["project", "entity", "endpoint", "host"],
"trace_instance": WeaveDataTrace,
}

View File

@ -40,9 +40,14 @@ class WeaveDataTrace(BaseTraceInstance):
self.weave_api_key = weave_config.api_key
self.project_name = weave_config.project
self.entity = weave_config.entity
self.host = weave_config.host
# Login with API key first, including host if provided
if self.host:
login_status = wandb.login(key=self.weave_api_key, verify=True, relogin=True, host=self.host)
else:
login_status = wandb.login(key=self.weave_api_key, verify=True, relogin=True)
# Login with API key first
login_status = wandb.login(key=self.weave_api_key, verify=True, relogin=True)
if not login_status:
logger.error("Failed to login to Weights & Biases with the provided API key")
raise ValueError("Weave login failed")
@ -386,7 +391,11 @@ class WeaveDataTrace(BaseTraceInstance):
def api_check(self):
try:
login_status = wandb.login(key=self.weave_api_key, verify=True, relogin=True)
if self.host:
login_status = wandb.login(key=self.weave_api_key, verify=True, relogin=True, host=self.host)
else:
login_status = wandb.login(key=self.weave_api_key, verify=True, relogin=True)
if not login_status:
raise ValueError("Weave login failed")
else:

View File

@ -3,7 +3,9 @@ from collections import defaultdict
from json import JSONDecodeError
from typing import Any, Optional, cast
from sqlalchemy import select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session
from configs import dify_config
from core.entities.model_entities import DefaultModelEntity, DefaultModelProviderEntity
@ -393,19 +395,13 @@ class ProviderManager:
@staticmethod
def _get_all_providers(tenant_id: str) -> dict[str, list[Provider]]:
"""
Get all provider records of the workspace.
:param tenant_id: workspace id
:return:
"""
providers = db.session.query(Provider).filter(Provider.tenant_id == tenant_id, Provider.is_valid == True).all()
provider_name_to_provider_records_dict = defaultdict(list)
for provider in providers:
# TODO: Use provider name with prefix after the data migration
provider_name_to_provider_records_dict[str(ModelProviderID(provider.provider_name))].append(provider)
with Session(db.engine, expire_on_commit=False) as session:
stmt = select(Provider).where(Provider.tenant_id == tenant_id, Provider.is_valid == True)
providers = session.scalars(stmt)
for provider in providers:
# Use provider name with prefix after the data migration
provider_name_to_provider_records_dict[str(ModelProviderID(provider.provider_name))].append(provider)
return provider_name_to_provider_records_dict
@staticmethod
@ -416,17 +412,12 @@ class ProviderManager:
:param tenant_id: workspace id
:return:
"""
# Get all provider model records of the workspace
provider_models = (
db.session.query(ProviderModel)
.filter(ProviderModel.tenant_id == tenant_id, ProviderModel.is_valid == True)
.all()
)
provider_name_to_provider_model_records_dict = defaultdict(list)
for provider_model in provider_models:
provider_name_to_provider_model_records_dict[provider_model.provider_name].append(provider_model)
with Session(db.engine, expire_on_commit=False) as session:
stmt = select(ProviderModel).where(ProviderModel.tenant_id == tenant_id, ProviderModel.is_valid == True)
provider_models = session.scalars(stmt)
for provider_model in provider_models:
provider_name_to_provider_model_records_dict[provider_model.provider_name].append(provider_model)
return provider_name_to_provider_model_records_dict
@staticmethod
@ -437,17 +428,14 @@ class ProviderManager:
:param tenant_id: workspace id
:return:
"""
preferred_provider_types = (
db.session.query(TenantPreferredModelProvider)
.filter(TenantPreferredModelProvider.tenant_id == tenant_id)
.all()
)
provider_name_to_preferred_provider_type_records_dict = {
preferred_provider_type.provider_name: preferred_provider_type
for preferred_provider_type in preferred_provider_types
}
provider_name_to_preferred_provider_type_records_dict = {}
with Session(db.engine, expire_on_commit=False) as session:
stmt = select(TenantPreferredModelProvider).where(TenantPreferredModelProvider.tenant_id == tenant_id)
preferred_provider_types = session.scalars(stmt)
provider_name_to_preferred_provider_type_records_dict = {
preferred_provider_type.provider_name: preferred_provider_type
for preferred_provider_type in preferred_provider_types
}
return provider_name_to_preferred_provider_type_records_dict
@staticmethod
@ -458,18 +446,14 @@ class ProviderManager:
:param tenant_id: workspace id
:return:
"""
provider_model_settings = (
db.session.query(ProviderModelSetting).filter(ProviderModelSetting.tenant_id == tenant_id).all()
)
provider_name_to_provider_model_settings_dict = defaultdict(list)
for provider_model_setting in provider_model_settings:
(
with Session(db.engine, expire_on_commit=False) as session:
stmt = select(ProviderModelSetting).where(ProviderModelSetting.tenant_id == tenant_id)
provider_model_settings = session.scalars(stmt)
for provider_model_setting in provider_model_settings:
provider_name_to_provider_model_settings_dict[provider_model_setting.provider_name].append(
provider_model_setting
)
)
return provider_name_to_provider_model_settings_dict
@staticmethod
@ -492,15 +476,14 @@ class ProviderManager:
if not model_load_balancing_enabled:
return {}
provider_load_balancing_configs = (
db.session.query(LoadBalancingModelConfig).filter(LoadBalancingModelConfig.tenant_id == tenant_id).all()
)
provider_name_to_provider_load_balancing_model_configs_dict = defaultdict(list)
for provider_load_balancing_config in provider_load_balancing_configs:
provider_name_to_provider_load_balancing_model_configs_dict[
provider_load_balancing_config.provider_name
].append(provider_load_balancing_config)
with Session(db.engine, expire_on_commit=False) as session:
stmt = select(LoadBalancingModelConfig).where(LoadBalancingModelConfig.tenant_id == tenant_id)
provider_load_balancing_configs = session.scalars(stmt)
for provider_load_balancing_config in provider_load_balancing_configs:
provider_name_to_provider_load_balancing_model_configs_dict[
provider_load_balancing_config.provider_name
].append(provider_load_balancing_config)
return provider_name_to_provider_load_balancing_model_configs_dict
@ -626,10 +609,9 @@ class ProviderManager:
if not cached_provider_credentials:
try:
# fix origin data
if (
custom_provider_record.encrypted_config
and not custom_provider_record.encrypted_config.startswith("{")
):
if custom_provider_record.encrypted_config is None:
raise ValueError("No credentials found")
if not custom_provider_record.encrypted_config.startswith("{"):
provider_credentials = {"openai_api_key": custom_provider_record.encrypted_config}
else:
provider_credentials = json.loads(custom_provider_record.encrypted_config)
@ -733,7 +715,7 @@ class ProviderManager:
return SystemConfiguration(enabled=False)
# Convert provider_records to dict
quota_type_to_provider_records_dict = {}
quota_type_to_provider_records_dict: dict[ProviderQuotaType, Provider] = {}
for provider_record in provider_records:
if provider_record.provider_type != ProviderType.SYSTEM.value:
continue
@ -758,6 +740,11 @@ class ProviderManager:
else:
provider_record = quota_type_to_provider_records_dict[provider_quota.quota_type]
if provider_record.quota_used is None:
raise ValueError("quota_used is None")
if provider_record.quota_limit is None:
raise ValueError("quota_limit is None")
quota_configuration = QuotaConfiguration(
quota_type=provider_quota.quota_type,
quota_unit=provider_hosting_configuration.quota_unit or QuotaUnit.TOKENS,
@ -791,10 +778,9 @@ class ProviderManager:
cached_provider_credentials = provider_credentials_cache.get()
if not cached_provider_credentials:
try:
provider_credentials: dict[str, Any] = json.loads(provider_record.encrypted_config)
except JSONDecodeError:
provider_credentials = {}
provider_credentials: dict[str, Any] = {}
if provider_records and provider_records[0].encrypted_config:
provider_credentials = json.loads(provider_records[0].encrypted_config)
# Get provider credential secret variables
provider_credential_secret_variables = self._extract_secret_variables(

View File

@ -720,7 +720,7 @@ STOPWORDS = {
"",
"",
"",
" ",
" ",
"0",
"1",
"2",
@ -731,16 +731,6 @@ STOPWORDS = {
"7",
"8",
"9",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",

View File

@ -184,7 +184,16 @@ class OpenSearchVector(BaseVector):
}
document_ids_filter = kwargs.get("document_ids_filter")
if document_ids_filter:
query["query"] = {"terms": {"metadata.document_id": document_ids_filter}}
query["query"] = {
"script_score": {
"query": {"bool": {"filter": [{"terms": {Field.DOCUMENT_ID.value: document_ids_filter}}]}},
"script": {
"source": "knn_score",
"lang": "knn",
"params": {"field": Field.VECTOR.value, "query_value": query_vector, "space_type": "l2"},
},
}
}
try:
response = self._client.search(index=self._collection_name.lower(), body=query)
@ -209,10 +218,10 @@ class OpenSearchVector(BaseVector):
return docs
def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]:
full_text_query = {"query": {"match": {Field.CONTENT_KEY.value: query}}}
full_text_query = {"query": {"bool": {"must": [{"match": {Field.CONTENT_KEY.value: query}}]}}}
document_ids_filter = kwargs.get("document_ids_filter")
if document_ids_filter:
full_text_query["query"]["terms"] = {"metadata.document_id": document_ids_filter}
full_text_query["query"]["bool"]["filter"] = [{"terms": {"metadata.document_id": document_ids_filter}}]
response = self._client.search(index=self._collection_name.lower(), body=full_text_query)
@ -255,7 +264,8 @@ class OpenSearchVector(BaseVector):
Field.METADATA_KEY.value: {
"type": "object",
"properties": {
"doc_id": {"type": "keyword"} # Map doc_id to keyword type
"doc_id": {"type": "keyword"}, # Map doc_id to keyword type
"document_id": {"type": "keyword"},
},
},
}

View File

@ -261,7 +261,7 @@ class OracleVector(BaseVector):
words = pseg.cut(query)
current_entity = ""
for word, pos in words:
if pos in {"nr", "Ng", "eng", "nz", "n", "ORG", "v"}: # nr: 人名, ns: 地名, nt: 机构名
if pos in {"nr", "Ng", "eng", "nz", "n", "ORG", "v"}: # nr: 人名ns: 地名nt: 机构名
current_entity += word
else:
if current_entity:
@ -303,7 +303,6 @@ class OracleVector(BaseVector):
return docs
else:
return [Document(page_content="", metadata={})]
return []
def delete(self) -> None:
with self._get_connection() as conn:

View File

@ -1,3 +1,4 @@
- audio
- code
- time
- qrcode
- webscraper

View File

@ -153,8 +153,6 @@ class DatasetMultiRetrieverTool(DatasetRetrieverBaseTool):
return str("\n".join(document_context_list))
return ""
raise RuntimeError("not segments found")
def _retriever(
self,
flask_app: Flask,

View File

@ -397,19 +397,44 @@ def _extract_text_from_csv(file_content: bytes) -> str:
if not rows:
return ""
# Create Markdown table
markdown_table = "| " + " | ".join(rows[0]) + " |\n"
markdown_table += "| " + " | ".join(["---"] * len(rows[0])) + " |\n"
for row in rows[1:]:
markdown_table += "| " + " | ".join(row) + " |\n"
# Combine multi-line text in the header row
header_row = [cell.replace("\n", " ").replace("\r", "") for cell in rows[0]]
return markdown_table.strip()
# Create Markdown table
markdown_table = "| " + " | ".join(header_row) + " |\n"
markdown_table += "| " + " | ".join(["-" * len(col) for col in rows[0]]) + " |\n"
# Process each data row and combine multi-line text in each cell
for row in rows[1:]:
processed_row = [cell.replace("\n", " ").replace("\r", "") for cell in row]
markdown_table += "| " + " | ".join(processed_row) + " |\n"
return markdown_table
except Exception as e:
raise TextExtractionError(f"Failed to extract text from CSV: {str(e)}") from e
def _extract_text_from_excel(file_content: bytes) -> str:
"""Extract text from an Excel file using pandas."""
def _construct_markdown_table(df: pd.DataFrame) -> str:
"""Manually construct a Markdown table from a DataFrame."""
# Construct the header row
header_row = "| " + " | ".join(df.columns) + " |"
# Construct the separator row
separator_row = "| " + " | ".join(["-" * len(col) for col in df.columns]) + " |"
# Construct the data rows
data_rows = []
for _, row in df.iterrows():
data_row = "| " + " | ".join(map(str, row)) + " |"
data_rows.append(data_row)
# Combine all rows into a single string
markdown_table = "\n".join([header_row, separator_row] + data_rows)
return markdown_table
try:
excel_file = pd.ExcelFile(io.BytesIO(file_content))
markdown_table = ""
@ -417,8 +442,15 @@ def _extract_text_from_excel(file_content: bytes) -> str:
try:
df = excel_file.parse(sheet_name=sheet_name)
df.dropna(how="all", inplace=True)
# Create Markdown table two times to separate tables with a newline
markdown_table += df.to_markdown(index=False, floatfmt="") + "\n\n"
# Combine multi-line text in each cell into a single line
df = df.applymap(lambda x: " ".join(str(x).splitlines()) if isinstance(x, str) else x) # type: ignore
# Combine multi-line text in column names into a single line
df.columns = pd.Index([" ".join(col.splitlines()) for col in df.columns])
# Manually construct the Markdown table
markdown_table += _construct_markdown_table(df) + "\n\n"
except Exception as e:
continue
return markdown_table

View File

@ -1,8 +1,9 @@
import base64
import json
import secrets
import string
from collections.abc import Mapping
from copy import deepcopy
from random import randint
from typing import Any, Literal
from urllib.parse import urlencode, urlparse
@ -434,4 +435,4 @@ def _generate_random_string(n: int) -> str:
>>> _generate_random_string(5)
'abcde'
"""
return "".join([chr(randint(97, 122)) for _ in range(n)])
return "".join(secrets.choice(string.ascii_lowercase) for _ in range(n))

View File

@ -66,7 +66,8 @@ class LLMNodeData(BaseNodeData):
context: ContextConfig
vision: VisionConfig = Field(default_factory=VisionConfig)
structured_output: dict | None = None
structured_output_enabled: bool = False
# We used 'structured_output_enabled' in the past, but it's not a good name.
structured_output_switch_on: bool = Field(False, alias="structured_output_enabled")
@field_validator("prompt_config", mode="before")
@classmethod
@ -74,3 +75,7 @@ class LLMNodeData(BaseNodeData):
if v is None:
return PromptConfig()
return v
@property
def structured_output_enabled(self) -> bool:
return self.structured_output_switch_on and self.structured_output is not None

View File

@ -12,9 +12,7 @@ from sqlalchemy.orm import Session
from configs import dify_config
from core.app.entities.app_invoke_entities import ModelConfigWithCredentialsEntity
from core.entities.model_entities import ModelStatus
from core.entities.provider_entities import QuotaUnit
from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
from core.file import FileType, file_manager
from core.helper.code_executor import CodeExecutor, CodeLanguage
from core.memory.token_buffer_memory import TokenBufferMemory
@ -74,7 +72,6 @@ from core.workflow.nodes.event import (
from core.workflow.utils.structured_output.entities import (
ResponseFormat,
SpecialModelType,
SupportStructuredOutputStatus,
)
from core.workflow.utils.structured_output.prompt import STRUCTURED_OUTPUT_PROMPT
from core.workflow.utils.variable_template_parser import VariableTemplateParser
@ -277,7 +274,7 @@ class LLMNode(BaseNode[LLMNodeData]):
llm_usage=usage,
)
)
except LLMNodeError as e:
except ValueError as e:
yield RunCompletedEvent(
run_result=NodeRunResult(
status=WorkflowNodeExecutionStatus.FAILED,
@ -527,65 +524,53 @@ class LLMNode(BaseNode[LLMNodeData]):
def _fetch_model_config(
self, node_data_model: ModelConfig
) -> tuple[ModelInstance, ModelConfigWithCredentialsEntity]:
model_name = node_data_model.name
provider_name = node_data_model.provider
if not node_data_model.mode:
raise LLMModeRequiredError("LLM mode is required.")
model_manager = ModelManager()
model_instance = model_manager.get_model_instance(
tenant_id=self.tenant_id, model_type=ModelType.LLM, provider=provider_name, model=model_name
model = ModelManager().get_model_instance(
tenant_id=self.tenant_id,
model_type=ModelType.LLM,
provider=node_data_model.provider,
model=node_data_model.name,
)
provider_model_bundle = model_instance.provider_model_bundle
model_type_instance = model_instance.model_type_instance
model_type_instance = cast(LargeLanguageModel, model_type_instance)
model_credentials = model_instance.credentials
model.model_type_instance = cast(LargeLanguageModel, model.model_type_instance)
# check model
provider_model = provider_model_bundle.configuration.get_provider_model(
model=model_name, model_type=ModelType.LLM
provider_model = model.provider_model_bundle.configuration.get_provider_model(
model=node_data_model.name, model_type=ModelType.LLM
)
if provider_model is None:
raise ModelNotExistError(f"Model {model_name} not exist.")
if provider_model.status == ModelStatus.NO_CONFIGURE:
raise ProviderTokenNotInitError(f"Model {model_name} credentials is not initialized.")
elif provider_model.status == ModelStatus.NO_PERMISSION:
raise ModelCurrentlyNotSupportError(f"Dify Hosted OpenAI {model_name} currently not support.")
elif provider_model.status == ModelStatus.QUOTA_EXCEEDED:
raise QuotaExceededError(f"Model provider {provider_name} quota exceeded.")
raise ModelNotExistError(f"Model {node_data_model.name} not exist.")
provider_model.raise_for_status()
# model config
completion_params = node_data_model.completion_params
stop = []
if "stop" in completion_params:
stop = completion_params["stop"]
del completion_params["stop"]
# get model mode
model_mode = node_data_model.mode
if not model_mode:
raise LLMModeRequiredError("LLM mode is required.")
model_schema = model_type_instance.get_model_schema(model_name, model_credentials)
stop: list[str] = []
if "stop" in node_data_model.completion_params:
stop = node_data_model.completion_params.pop("stop")
model_schema = model.model_type_instance.get_model_schema(node_data_model.name, model.credentials)
if not model_schema:
raise ModelNotExistError(f"Model {model_name} not exist.")
support_structured_output = self._check_model_structured_output_support()
if support_structured_output == SupportStructuredOutputStatus.SUPPORTED:
completion_params = self._handle_native_json_schema(completion_params, model_schema.parameter_rules)
elif support_structured_output == SupportStructuredOutputStatus.UNSUPPORTED:
# Set appropriate response format based on model capabilities
self._set_response_format(completion_params, model_schema.parameter_rules)
return model_instance, ModelConfigWithCredentialsEntity(
provider=provider_name,
model=model_name,
raise ModelNotExistError(f"Model {node_data_model.name} not exist.")
if self.node_data.structured_output_enabled:
if model_schema.support_structure_output:
node_data_model.completion_params = self._handle_native_json_schema(
node_data_model.completion_params, model_schema.parameter_rules
)
else:
# Set appropriate response format based on model capabilities
self._set_response_format(node_data_model.completion_params, model_schema.parameter_rules)
return model, ModelConfigWithCredentialsEntity(
provider=node_data_model.provider,
model=node_data_model.name,
model_schema=model_schema,
mode=model_mode,
provider_model_bundle=provider_model_bundle,
credentials=model_credentials,
parameters=completion_params,
mode=node_data_model.mode,
provider_model_bundle=model.provider_model_bundle,
credentials=model.credentials,
parameters=node_data_model.completion_params,
stop=stop,
)
@ -786,13 +771,25 @@ class LLMNode(BaseNode[LLMNodeData]):
"No prompt found in the LLM configuration. "
"Please ensure a prompt is properly configured before proceeding."
)
support_structured_output = self._check_model_structured_output_support()
if support_structured_output == SupportStructuredOutputStatus.UNSUPPORTED:
filtered_prompt_messages = self._handle_prompt_based_schema(
prompt_messages=filtered_prompt_messages,
)
stop = model_config.stop
return filtered_prompt_messages, stop
model = ModelManager().get_model_instance(
tenant_id=self.tenant_id,
model_type=ModelType.LLM,
provider=self.node_data.model.provider,
model=self.node_data.model.name,
)
model_schema = model.model_type_instance.get_model_schema(
model=self.node_data.model.name,
credentials=model.credentials,
)
if not model_schema:
raise ModelNotExistError(f"Model {self.node_data.model.name} not exist.")
if self.node_data.structured_output_enabled:
if not model_schema.support_structure_output:
filtered_prompt_messages = self._handle_prompt_based_schema(
prompt_messages=filtered_prompt_messages,
)
return filtered_prompt_messages, model_config.stop
def _parse_structured_output(self, result_text: str) -> dict[str, Any]:
structured_output: dict[str, Any] = {}
@ -903,7 +900,7 @@ class LLMNode(BaseNode[LLMNodeData]):
variable_mapping["#context#"] = node_data.context.variable_selector
if node_data.vision.enabled:
variable_mapping["#files#"] = ["sys", SystemVariableKey.FILES.value]
variable_mapping["#files#"] = node_data.vision.configs.variable_selector
if node_data.memory:
variable_mapping["#sys.query#"] = ["sys", SystemVariableKey.QUERY.value]
@ -1185,32 +1182,6 @@ class LLMNode(BaseNode[LLMNodeData]):
except json.JSONDecodeError:
raise LLMNodeError("structured_output_schema is not valid JSON format")
def _check_model_structured_output_support(self) -> SupportStructuredOutputStatus:
"""
Check if the current model supports structured output.
Returns:
SupportStructuredOutput: The support status of structured output
"""
# Early return if structured output is disabled
if (
not isinstance(self.node_data, LLMNodeData)
or not self.node_data.structured_output_enabled
or not self.node_data.structured_output
):
return SupportStructuredOutputStatus.DISABLED
# Get model schema and check if it exists
model_schema = self._fetch_model_schema(self.node_data.model.provider)
if not model_schema:
return SupportStructuredOutputStatus.DISABLED
# Check if model supports structured output feature
return (
SupportStructuredOutputStatus.SUPPORTED
if bool(model_schema.features and ModelFeature.STRUCTURED_OUTPUT in model_schema.features)
else SupportStructuredOutputStatus.UNSUPPORTED
)
def _save_multimodal_output_and_convert_result_to_markdown(
self,
contents: str | list[PromptMessageContentUnionTypes] | None,

View File

@ -14,11 +14,3 @@ class SpecialModelType(StrEnum):
GEMINI = "gemini"
OLLAMA = "ollama"
class SupportStructuredOutputStatus(StrEnum):
"""Constants for structured output support status"""
SUPPORTED = "supported"
UNSUPPORTED = "unsupported"
DISABLED = "disabled"

View File

@ -70,6 +70,7 @@ def init_app(app: DifyApp) -> Celery:
"schedule.update_tidb_serverless_status_task",
"schedule.clean_messages",
"schedule.mail_clean_document_notify_task",
"schedule.queue_monitor_task",
]
day = dify_config.CELERY_BEAT_SCHEDULER_TIME
beat_schedule = {
@ -98,6 +99,12 @@ def init_app(app: DifyApp) -> Celery:
"task": "schedule.mail_clean_document_notify_task.mail_clean_document_notify_task",
"schedule": crontab(minute="0", hour="10", day_of_week="1"),
},
"datasets-queue-monitor": {
"task": "schedule.queue_monitor_task.queue_monitor_task",
"schedule": timedelta(
minutes=dify_config.QUEUE_MONITOR_INTERVAL if dify_config.QUEUE_MONITOR_INTERVAL else 30
),
},
}
celery_app.conf.update(beat_schedule=beat_schedule, imports=imports)

View File

@ -1,7 +1,7 @@
import json
import logging
import random
import re
import secrets
import string
import subprocess
import time
@ -18,6 +18,7 @@ from flask_restful import fields
from configs import dify_config
from core.app.features.rate_limiting.rate_limit import RateLimitGenerator
from core.file import helpers as file_helpers
from core.model_runtime.utils.encoders import jsonable_encoder
from extensions.ext_redis import redis_client
if TYPE_CHECKING:
@ -175,7 +176,7 @@ def generate_string(n):
letters_digits = string.ascii_letters + string.digits
result = ""
for i in range(n):
result += random.choice(letters_digits)
result += secrets.choice(letters_digits)
return result
@ -196,7 +197,7 @@ def generate_text_hash(text: str) -> str:
def compact_generate_response(response: Union[Mapping, Generator, RateLimitGenerator]) -> Response:
if isinstance(response, dict):
return Response(response=json.dumps(response), status=200, mimetype="application/json")
return Response(response=json.dumps(jsonable_encoder(response)), status=200, mimetype="application/json")
else:
def generate() -> Generator:

View File

@ -0,0 +1,60 @@
"""`workflow_draft_varaibles` add `node_execution_id` column, add an index for `workflow_node_executions`.
Revision ID: 4474872b0ee6
Revises: 2adcbe1f5dfb
Create Date: 2025-06-06 14:24:44.213018
"""
from alembic import op
import models as models
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = '4474872b0ee6'
down_revision = '2adcbe1f5dfb'
branch_labels = None
depends_on = None
def upgrade():
# `CREATE INDEX CONCURRENTLY` cannot run within a transaction, so use the `autocommit_block`
# context manager to wrap the index creation statement.
# Reference:
#
# - https://www.postgresql.org/docs/current/sql-createindex.html#:~:text=Another%20difference%20is,CREATE%20INDEX%20CONCURRENTLY%20cannot.
# - https://alembic.sqlalchemy.org/en/latest/api/runtime.html#alembic.runtime.migration.MigrationContext.autocommit_block
with op.get_context().autocommit_block():
op.create_index(
op.f('workflow_node_executions_tenant_id_idx'),
"workflow_node_executions",
['tenant_id', 'workflow_id', 'node_id', sa.literal_column('created_at DESC')],
unique=False,
postgresql_concurrently=True,
)
with op.batch_alter_table('workflow_draft_variables', schema=None) as batch_op:
batch_op.add_column(sa.Column('node_execution_id', models.types.StringUUID(), nullable=True))
# ### end Alembic commands ###
def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
# `DROP INDEX CONCURRENTLY` cannot run within a transaction, so use the `autocommit_block`
# context manager to wrap the index creation statement.
# Reference:
#
# - https://www.postgresql.org/docs/current/sql-createindex.html#:~:text=Another%20difference%20is,CREATE%20INDEX%20CONCURRENTLY%20cannot.
# - https://alembic.sqlalchemy.org/en/latest/api/runtime.html#alembic.runtime.migration.MigrationContext.autocommit_block
# `DROP INDEX CONCURRENTLY` cannot run within a transaction, so commit existing transactions first.
# Reference:
#
# https://www.postgresql.org/docs/current/sql-createindex.html#:~:text=Another%20difference%20is,CREATE%20INDEX%20CONCURRENTLY%20cannot.
with op.get_context().autocommit_block():
op.drop_index(op.f('workflow_node_executions_tenant_id_idx'), postgresql_concurrently=True)
with op.batch_alter_table('workflow_draft_variables', schema=None) as batch_op:
batch_op.drop_column('node_execution_id')
# ### end Alembic commands ###

View File

@ -1,6 +1,9 @@
from datetime import datetime
from enum import Enum
from typing import Optional
from sqlalchemy import func
from sqlalchemy import func, text
from sqlalchemy.orm import Mapped, mapped_column
from .base import Base
from .engine import db
@ -51,20 +54,24 @@ class Provider(Base):
),
)
id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
tenant_id = db.Column(StringUUID, nullable=False)
provider_name = db.Column(db.String(255), nullable=False)
provider_type = db.Column(db.String(40), nullable=False, server_default=db.text("'custom'::character varying"))
encrypted_config = db.Column(db.Text, nullable=True)
is_valid = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
last_used = db.Column(db.DateTime, nullable=True)
id: Mapped[str] = mapped_column(StringUUID, server_default=text("uuid_generate_v4()"))
tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
provider_name: Mapped[str] = mapped_column(db.String(255), nullable=False)
provider_type: Mapped[str] = mapped_column(
db.String(40), nullable=False, server_default=text("'custom'::character varying")
)
encrypted_config: Mapped[Optional[str]] = mapped_column(db.Text, nullable=True)
is_valid: Mapped[bool] = mapped_column(db.Boolean, nullable=False, server_default=text("false"))
last_used: Mapped[Optional[datetime]] = mapped_column(db.DateTime, nullable=True)
quota_type = db.Column(db.String(40), nullable=True, server_default=db.text("''::character varying"))
quota_limit = db.Column(db.BigInteger, nullable=True)
quota_used = db.Column(db.BigInteger, default=0)
quota_type: Mapped[Optional[str]] = mapped_column(
db.String(40), nullable=True, server_default=text("''::character varying")
)
quota_limit: Mapped[Optional[int]] = mapped_column(db.BigInteger, nullable=True)
quota_used: Mapped[Optional[int]] = mapped_column(db.BigInteger, default=0)
created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
created_at: Mapped[datetime] = mapped_column(db.DateTime, nullable=False, server_default=func.current_timestamp())
updated_at: Mapped[datetime] = mapped_column(db.DateTime, nullable=False, server_default=func.current_timestamp())
def __repr__(self):
return (
@ -104,15 +111,15 @@ class ProviderModel(Base):
),
)
id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
tenant_id = db.Column(StringUUID, nullable=False)
provider_name = db.Column(db.String(255), nullable=False)
model_name = db.Column(db.String(255), nullable=False)
model_type = db.Column(db.String(40), nullable=False)
encrypted_config = db.Column(db.Text, nullable=True)
is_valid = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
id: Mapped[str] = mapped_column(StringUUID, server_default=text("uuid_generate_v4()"))
tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
provider_name: Mapped[str] = mapped_column(db.String(255), nullable=False)
model_name: Mapped[str] = mapped_column(db.String(255), nullable=False)
model_type: Mapped[str] = mapped_column(db.String(40), nullable=False)
encrypted_config: Mapped[Optional[str]] = mapped_column(db.Text, nullable=True)
is_valid: Mapped[bool] = mapped_column(db.Boolean, nullable=False, server_default=text("false"))
created_at: Mapped[datetime] = mapped_column(db.DateTime, nullable=False, server_default=func.current_timestamp())
updated_at: Mapped[datetime] = mapped_column(db.DateTime, nullable=False, server_default=func.current_timestamp())
class TenantDefaultModel(Base):
@ -122,13 +129,13 @@ class TenantDefaultModel(Base):
db.Index("tenant_default_model_tenant_id_provider_type_idx", "tenant_id", "provider_name", "model_type"),
)
id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
tenant_id = db.Column(StringUUID, nullable=False)
provider_name = db.Column(db.String(255), nullable=False)
model_name = db.Column(db.String(255), nullable=False)
model_type = db.Column(db.String(40), nullable=False)
created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
id: Mapped[str] = mapped_column(StringUUID, server_default=text("uuid_generate_v4()"))
tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
provider_name: Mapped[str] = mapped_column(db.String(255), nullable=False)
model_name: Mapped[str] = mapped_column(db.String(255), nullable=False)
model_type: Mapped[str] = mapped_column(db.String(40), nullable=False)
created_at: Mapped[datetime] = mapped_column(db.DateTime, nullable=False, server_default=func.current_timestamp())
updated_at: Mapped[datetime] = mapped_column(db.DateTime, nullable=False, server_default=func.current_timestamp())
class TenantPreferredModelProvider(Base):
@ -138,12 +145,12 @@ class TenantPreferredModelProvider(Base):
db.Index("tenant_preferred_model_provider_tenant_provider_idx", "tenant_id", "provider_name"),
)
id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
tenant_id = db.Column(StringUUID, nullable=False)
provider_name = db.Column(db.String(255), nullable=False)
preferred_provider_type = db.Column(db.String(40), nullable=False)
created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
id: Mapped[str] = mapped_column(StringUUID, server_default=text("uuid_generate_v4()"))
tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
provider_name: Mapped[str] = mapped_column(db.String(255), nullable=False)
preferred_provider_type: Mapped[str] = mapped_column(db.String(40), nullable=False)
created_at: Mapped[datetime] = mapped_column(db.DateTime, nullable=False, server_default=func.current_timestamp())
updated_at: Mapped[datetime] = mapped_column(db.DateTime, nullable=False, server_default=func.current_timestamp())
class ProviderOrder(Base):
@ -153,22 +160,24 @@ class ProviderOrder(Base):
db.Index("provider_order_tenant_provider_idx", "tenant_id", "provider_name"),
)
id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
tenant_id = db.Column(StringUUID, nullable=False)
provider_name = db.Column(db.String(255), nullable=False)
account_id = db.Column(StringUUID, nullable=False)
payment_product_id = db.Column(db.String(191), nullable=False)
payment_id = db.Column(db.String(191))
transaction_id = db.Column(db.String(191))
quantity = db.Column(db.Integer, nullable=False, server_default=db.text("1"))
currency = db.Column(db.String(40))
total_amount = db.Column(db.Integer)
payment_status = db.Column(db.String(40), nullable=False, server_default=db.text("'wait_pay'::character varying"))
paid_at = db.Column(db.DateTime)
pay_failed_at = db.Column(db.DateTime)
refunded_at = db.Column(db.DateTime)
created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
id: Mapped[str] = mapped_column(StringUUID, server_default=text("uuid_generate_v4()"))
tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
provider_name: Mapped[str] = mapped_column(db.String(255), nullable=False)
account_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
payment_product_id: Mapped[str] = mapped_column(db.String(191), nullable=False)
payment_id: Mapped[Optional[str]] = mapped_column(db.String(191))
transaction_id: Mapped[Optional[str]] = mapped_column(db.String(191))
quantity: Mapped[int] = mapped_column(db.Integer, nullable=False, server_default=text("1"))
currency: Mapped[Optional[str]] = mapped_column(db.String(40))
total_amount: Mapped[Optional[int]] = mapped_column(db.Integer)
payment_status: Mapped[str] = mapped_column(
db.String(40), nullable=False, server_default=text("'wait_pay'::character varying")
)
paid_at: Mapped[Optional[datetime]] = mapped_column(db.DateTime)
pay_failed_at: Mapped[Optional[datetime]] = mapped_column(db.DateTime)
refunded_at: Mapped[Optional[datetime]] = mapped_column(db.DateTime)
created_at: Mapped[datetime] = mapped_column(db.DateTime, nullable=False, server_default=func.current_timestamp())
updated_at: Mapped[datetime] = mapped_column(db.DateTime, nullable=False, server_default=func.current_timestamp())
class ProviderModelSetting(Base):
@ -182,15 +191,15 @@ class ProviderModelSetting(Base):
db.Index("provider_model_setting_tenant_provider_model_idx", "tenant_id", "provider_name", "model_type"),
)
id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
tenant_id = db.Column(StringUUID, nullable=False)
provider_name = db.Column(db.String(255), nullable=False)
model_name = db.Column(db.String(255), nullable=False)
model_type = db.Column(db.String(40), nullable=False)
enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
load_balancing_enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
id: Mapped[str] = mapped_column(StringUUID, server_default=text("uuid_generate_v4()"))
tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
provider_name: Mapped[str] = mapped_column(db.String(255), nullable=False)
model_name: Mapped[str] = mapped_column(db.String(255), nullable=False)
model_type: Mapped[str] = mapped_column(db.String(40), nullable=False)
enabled: Mapped[bool] = mapped_column(db.Boolean, nullable=False, server_default=text("true"))
load_balancing_enabled: Mapped[bool] = mapped_column(db.Boolean, nullable=False, server_default=text("false"))
created_at: Mapped[datetime] = mapped_column(db.DateTime, nullable=False, server_default=func.current_timestamp())
updated_at: Mapped[datetime] = mapped_column(db.DateTime, nullable=False, server_default=func.current_timestamp())
class LoadBalancingModelConfig(Base):
@ -204,13 +213,13 @@ class LoadBalancingModelConfig(Base):
db.Index("load_balancing_model_config_tenant_provider_model_idx", "tenant_id", "provider_name", "model_type"),
)
id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
tenant_id = db.Column(StringUUID, nullable=False)
provider_name = db.Column(db.String(255), nullable=False)
model_name = db.Column(db.String(255), nullable=False)
model_type = db.Column(db.String(40), nullable=False)
name = db.Column(db.String(255), nullable=False)
encrypted_config = db.Column(db.Text, nullable=True)
enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
id: Mapped[str] = mapped_column(StringUUID, server_default=text("uuid_generate_v4()"))
tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
provider_name: Mapped[str] = mapped_column(db.String(255), nullable=False)
model_name: Mapped[str] = mapped_column(db.String(255), nullable=False)
model_type: Mapped[str] = mapped_column(db.String(40), nullable=False)
name: Mapped[str] = mapped_column(db.String(255), nullable=False)
encrypted_config: Mapped[Optional[str]] = mapped_column(db.Text, nullable=True)
enabled: Mapped[bool] = mapped_column(db.Boolean, nullable=False, server_default=text("true"))
created_at: Mapped[datetime] = mapped_column(db.DateTime, nullable=False, server_default=func.current_timestamp())
updated_at: Mapped[datetime] = mapped_column(db.DateTime, nullable=False, server_default=func.current_timestamp())

View File

@ -16,8 +16,8 @@ if TYPE_CHECKING:
from models.model import AppMode
import sqlalchemy as sa
from sqlalchemy import UniqueConstraint, func
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy import Index, PrimaryKeyConstraint, UniqueConstraint, func
from sqlalchemy.orm import Mapped, declared_attr, mapped_column
from constants import DEFAULT_FILE_NUMBER_LIMITS, HIDDEN_VALUE
from core.helper import encrypter
@ -590,28 +590,48 @@ class WorkflowNodeExecutionModel(Base):
"""
__tablename__ = "workflow_node_executions"
__table_args__ = (
db.PrimaryKeyConstraint("id", name="workflow_node_execution_pkey"),
db.Index(
"workflow_node_execution_workflow_run_idx",
"tenant_id",
"app_id",
"workflow_id",
"triggered_from",
"workflow_run_id",
),
db.Index(
"workflow_node_execution_node_run_idx", "tenant_id", "app_id", "workflow_id", "triggered_from", "node_id"
),
db.Index(
"workflow_node_execution_id_idx",
"tenant_id",
"app_id",
"workflow_id",
"triggered_from",
"node_execution_id",
),
)
@declared_attr
def __table_args__(cls): # noqa
return (
PrimaryKeyConstraint("id", name="workflow_node_execution_pkey"),
Index(
"workflow_node_execution_workflow_run_idx",
"tenant_id",
"app_id",
"workflow_id",
"triggered_from",
"workflow_run_id",
),
Index(
"workflow_node_execution_node_run_idx",
"tenant_id",
"app_id",
"workflow_id",
"triggered_from",
"node_id",
),
Index(
"workflow_node_execution_id_idx",
"tenant_id",
"app_id",
"workflow_id",
"triggered_from",
"node_execution_id",
),
Index(
# The first argument is the index name,
# which we leave as `None`` to allow auto-generation by the ORM.
None,
cls.tenant_id,
cls.workflow_id,
cls.node_id,
# MyPy may flag the following line because it doesn't recognize that
# the `declared_attr` decorator passes the receiving class as the first
# argument to this method, allowing us to reference class attributes.
cls.created_at.desc(), # type: ignore
),
)
id: Mapped[str] = mapped_column(StringUUID, server_default=db.text("uuid_generate_v4()"))
tenant_id: Mapped[str] = mapped_column(StringUUID)
@ -885,14 +905,29 @@ class WorkflowDraftVariable(Base):
selector: Mapped[str] = mapped_column(sa.String(255), nullable=False, name="selector")
# The data type of this variable's value
value_type: Mapped[SegmentType] = mapped_column(EnumText(SegmentType, length=20))
# JSON string
# The variable's value serialized as a JSON string
value: Mapped[str] = mapped_column(sa.Text, nullable=False, name="value")
# visible
# Controls whether the variable should be displayed in the variable inspection panel
visible: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, default=True)
# Determines whether this variable can be modified by users
editable: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, default=False)
# The `node_execution_id` field identifies the workflow node execution that created this variable.
# It corresponds to the `id` field in the `WorkflowNodeExecutionModel` model.
#
# This field is not `None` for system variables and node variables, and is `None`
# for conversation variables.
node_execution_id: Mapped[str | None] = mapped_column(
StringUUID,
nullable=True,
default=None,
)
def get_selector(self) -> list[str]:
selector = json.loads(self.selector)
if not isinstance(selector, list):

View File

@ -0,0 +1,62 @@
import logging
from datetime import datetime
from urllib.parse import urlparse
import click
from flask import render_template
from redis import Redis
import app
from configs import dify_config
from extensions.ext_database import db
from extensions.ext_mail import mail
# Create a dedicated Redis connection (using the same configuration as Celery)
celery_broker_url = dify_config.CELERY_BROKER_URL
parsed = urlparse(celery_broker_url)
host = parsed.hostname or "localhost"
port = parsed.port or 6379
password = parsed.password or None
redis_db = parsed.path.strip("/") or "1" # type: ignore
celery_redis = Redis(host=host, port=port, password=password, db=redis_db)
@app.celery.task(queue="monitor")
def queue_monitor_task():
queue_name = "dataset"
threshold = dify_config.QUEUE_MONITOR_THRESHOLD
try:
queue_length = celery_redis.llen(f"{queue_name}")
logging.info(click.style(f"Start monitor {queue_name}", fg="green"))
logging.info(click.style(f"Queue length: {queue_length}", fg="green"))
if queue_length >= threshold:
warning_msg = f"Queue {queue_name} task count exceeded the limit.: {queue_length}/{threshold}"
logging.warning(click.style(warning_msg, fg="red"))
alter_emails = dify_config.QUEUE_MONITOR_ALERT_EMAILS
if alter_emails:
to_list = alter_emails.split(",")
for to in to_list:
try:
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
html_content = render_template(
"queue_monitor_alert_email_template_en-US.html",
queue_name=queue_name,
queue_length=queue_length,
threshold=threshold,
alert_time=current_time,
)
mail.send(
to=to, subject="Alert: Dataset Queue pending tasks exceeded the limit", html=html_content
)
except Exception as e:
logging.exception(click.style("Exception occurred during sending email", fg="red"))
except Exception as e:
logging.exception(click.style("Exception occurred during queue monitoring", fg="red"))
finally:
if db.session.is_active:
db.session.close()

View File

@ -1,7 +1,6 @@
import base64
import json
import logging
import random
import secrets
import uuid
from datetime import UTC, datetime, timedelta
@ -261,7 +260,7 @@ class AccountService:
@staticmethod
def generate_account_deletion_verification_code(account: Account) -> tuple[str, str]:
code = "".join([str(random.randint(0, 9)) for _ in range(6)])
code = "".join([str(secrets.randbelow(exclusive_upper_bound=10)) for _ in range(6)])
token = TokenManager.generate_token(
account=account, token_type="account_deletion", additional_data={"code": code}
)
@ -429,7 +428,7 @@ class AccountService:
additional_data: dict[str, Any] = {},
):
if not code:
code = "".join([str(random.randint(0, 9)) for _ in range(6)])
code = "".join([str(secrets.randbelow(exclusive_upper_bound=10)) for _ in range(6)])
additional_data["code"] = code
token = TokenManager.generate_token(
account=account, email=email, token_type="reset_password", additional_data=additional_data
@ -456,7 +455,7 @@ class AccountService:
raise EmailCodeLoginRateLimitExceededError()
code = "".join([str(random.randint(0, 9)) for _ in range(6)])
code = "".join([str(secrets.randbelow(exclusive_upper_bound=10)) for _ in range(6)])
token = TokenManager.generate_token(
account=account, email=email, token_type="email_code_login", additional_data={"code": code}
)

View File

@ -2,7 +2,7 @@ import copy
import datetime
import json
import logging
import random
import secrets
import time
import uuid
from collections import Counter
@ -970,7 +970,7 @@ class DocumentService:
documents.append(document)
batch = document.batch
else:
batch = time.strftime("%Y%m%d%H%M%S") + str(random.randint(100000, 999999))
batch = time.strftime("%Y%m%d%H%M%S") + str(100000 + secrets.randbelow(exclusive_upper_bound=900000))
# save process rule
if not dataset_process_rule:
process_rule = knowledge_config.process_rule

View File

@ -46,6 +46,8 @@ class TagService:
@staticmethod
def get_tag_by_tag_name(tag_type: str, current_tenant_id: str, tag_name: str) -> list:
if not tag_type or not tag_name:
return []
tags = (
db.session.query(Tag)
.filter(Tag.name == tag_name, Tag.tenant_id == current_tenant_id, Tag.type == tag_type)
@ -88,7 +90,7 @@ class TagService:
@staticmethod
def update_tags(args: dict, tag_id: str) -> Tag:
if TagService.get_tag_by_tag_name(args["type"], current_user.current_tenant_id, args["name"]):
if TagService.get_tag_by_tag_name(args.get("type", ""), current_user.current_tenant_id, args.get("name", "")):
raise ValueError("Tag name already exists")
tag = db.session.query(Tag).filter(Tag.id == tag_id).first()
if not tag:

View File

@ -1,5 +1,5 @@
import enum
import random
import secrets
from datetime import UTC, datetime, timedelta
from typing import Any, Optional, cast
@ -69,7 +69,7 @@ class WebAppAuthService:
if email is None:
raise ValueError("Email must be provided.")
code = "".join([str(random.randint(0, 9)) for _ in range(6)])
code = "".join([str(secrets.randbelow(exclusive_upper_bound=10)) for _ in range(6)])
token = TokenManager.generate_token(
account=account, email=email, token_type="email_code_login", additional_data={"code": code}
)

View File

@ -5,7 +5,7 @@ import uuid
import click
from celery import shared_task # type: ignore
from sqlalchemy import func, select
from sqlalchemy import func
from sqlalchemy.orm import Session
from core.model_manager import ModelManager
@ -68,11 +68,6 @@ def batch_create_segment_to_index_task(
model_type=ModelType.TEXT_EMBEDDING,
model=dataset.embedding_model,
)
word_count_change = 0
segments_to_insert: list[str] = []
max_position_stmt = select(func.max(DocumentSegment.position)).where(
DocumentSegment.document_id == dataset_document.id
)
word_count_change = 0
if embedding_model:
tokens_list = embedding_model.get_text_embedding_num_tokens(

View File

@ -0,0 +1,129 @@
<!DOCTYPE html>
<html>
<head>
<style>
body {
font-family: 'Arial', sans-serif;
line-height: 16pt;
color: #101828;
background-color: #e9ebf0;
margin: 0;
padding: 0;
}
.container {
width: 600px;
min-height: 605px;
margin: 40px auto;
padding: 36px 48px;
background-color: #fcfcfd;
border-radius: 16px;
border: 1px solid #ffffff;
box-shadow: 0 2px 4px -2px rgba(9, 9, 11, 0.08);
}
.header {
margin-bottom: 24px;
}
.header img {
max-width: 100px;
height: auto;
}
.title {
font-weight: 600;
font-size: 24px;
line-height: 28.8px;
}
.description {
font-size: 13px;
line-height: 16px;
color: #676f83;
margin-top: 12px;
}
.alert-content {
padding: 16px 32px;
text-align: center;
border-radius: 16px;
background-color: #fef0f0;
margin: 16px auto;
border: 1px solid #fda29b;
}
.alert-title {
line-height: 24px;
font-weight: 700;
font-size: 18px;
color: #d92d20;
}
.alert-detail {
line-height: 20px;
font-size: 14px;
margin-top: 8px;
}
.typography {
letter-spacing: -0.07px;
font-weight: 400;
font-style: normal;
font-size: 14px;
line-height: 20px;
color: #354052;
margin-top: 12px;
margin-bottom: 12px;
}
.typography p{
margin: 0 auto;
}
.typography-title {
color: #101828;
font-size: 14px;
font-style: normal;
font-weight: 600;
line-height: 20px;
margin-top: 12px;
margin-bottom: 4px;
}
.tip-list{
margin: 0;
padding-left: 10px;
}
</style>
</head>
<body>
<div class="container">
<div class="header">
<img src="https://assets.dify.ai/images/logo.png" alt="Dify Logo" />
</div>
<p class="title">Queue Monitoring Alert</p>
<p class="typography">Our system has detected an abnormal queue status that requires your attention:</p>
<div class="alert-content">
<div class="alert-title">Queue Task Alert</div>
<div class="alert-detail">
Queue "{{queue_name}}" has {{queue_length}} pending tasks (Threshold: {{threshold}})
</div>
</div>
<div class="typography">
<p style="margin-bottom:4px">Recommended actions:</p>
<p>1. Check the queue processing status in the system dashboard</p>
<p>2. Verify if there are any processing bottlenecks</p>
<p>3. Consider scaling up workers if needed</p>
</div>
<p class="typography-title">Additional Information:</p>
<ul class="typography tip-list">
<li>Alert triggered at: {{alert_time}}</li>
</ul>
</div>
</body>
</html>

View File

@ -3,11 +3,16 @@ import os
import time
import uuid
from collections.abc import Generator
from unittest.mock import MagicMock
from decimal import Decimal
from unittest.mock import MagicMock, patch
import pytest
from app_factory import create_app
from configs import dify_config
from core.app.entities.app_invoke_entities import InvokeFrom
from core.model_runtime.entities.llm_entities import LLMResult, LLMUsage
from core.model_runtime.entities.message_entities import AssistantPromptMessage
from core.workflow.entities.variable_pool import VariablePool
from core.workflow.entities.workflow_node_execution import WorkflowNodeExecutionStatus
from core.workflow.enums import SystemVariableKey
@ -19,13 +24,27 @@ from core.workflow.nodes.llm.node import LLMNode
from extensions.ext_database import db
from models.enums import UserFrom
from models.workflow import WorkflowType
from tests.integration_tests.workflow.nodes.__mock.model import get_mocked_fetch_model_config
"""FOR MOCK FIXTURES, DO NOT REMOVE"""
from tests.integration_tests.model_runtime.__mock.plugin_daemon import setup_model_mock
from tests.integration_tests.workflow.nodes.__mock.code_executor import setup_code_executor_mock
@pytest.fixture(scope="session")
def app():
# Set up storage configuration
os.environ["STORAGE_TYPE"] = "opendal"
os.environ["OPENDAL_SCHEME"] = "fs"
os.environ["OPENDAL_FS_ROOT"] = "storage"
# Ensure storage directory exists
os.makedirs("storage", exist_ok=True)
app = create_app()
dify_config.LOGIN_DISABLED = True
return app
def init_llm_node(config: dict) -> LLMNode:
graph_config = {
"edges": [
@ -40,13 +59,19 @@ def init_llm_node(config: dict) -> LLMNode:
graph = Graph.init(graph_config=graph_config)
# Use proper UUIDs for database compatibility
tenant_id = "9d2074fc-6f86-45a9-b09d-6ecc63b9056b"
app_id = "9d2074fc-6f86-45a9-b09d-6ecc63b9056c"
workflow_id = "9d2074fc-6f86-45a9-b09d-6ecc63b9056d"
user_id = "9d2074fc-6f86-45a9-b09d-6ecc63b9056e"
init_params = GraphInitParams(
tenant_id="1",
app_id="1",
tenant_id=tenant_id,
app_id=app_id,
workflow_type=WorkflowType.WORKFLOW,
workflow_id="1",
workflow_id=workflow_id,
graph_config=graph_config,
user_id="1",
user_id=user_id,
user_from=UserFrom.ACCOUNT,
invoke_from=InvokeFrom.DEBUGGER,
call_depth=0,
@ -77,115 +102,197 @@ def init_llm_node(config: dict) -> LLMNode:
return node
def test_execute_llm(setup_model_mock):
node = init_llm_node(
config={
"id": "llm",
"data": {
"title": "123",
"type": "llm",
"model": {
"provider": "langgenius/openai/openai",
"name": "gpt-3.5-turbo",
"mode": "chat",
"completion_params": {},
def test_execute_llm(app):
with app.app_context():
node = init_llm_node(
config={
"id": "llm",
"data": {
"title": "123",
"type": "llm",
"model": {
"provider": "langgenius/openai/openai",
"name": "gpt-3.5-turbo",
"mode": "chat",
"completion_params": {},
},
"prompt_template": [
{
"role": "system",
"text": "you are a helpful assistant.\ntoday's weather is {{#abc.output#}}.",
},
{"role": "user", "text": "{{#sys.query#}}"},
],
"memory": None,
"context": {"enabled": False},
"vision": {"enabled": False},
},
"prompt_template": [
{"role": "system", "text": "you are a helpful assistant.\ntoday's weather is {{#abc.output#}}."},
{"role": "user", "text": "{{#sys.query#}}"},
],
"memory": None,
"context": {"enabled": False},
"vision": {"enabled": False},
},
},
)
)
credentials = {"openai_api_key": os.environ.get("OPENAI_API_KEY")}
credentials = {"openai_api_key": os.environ.get("OPENAI_API_KEY")}
# Mock db.session.close()
db.session.close = MagicMock()
# Create a proper LLM result with real entities
mock_usage = LLMUsage(
prompt_tokens=30,
prompt_unit_price=Decimal("0.001"),
prompt_price_unit=Decimal("1000"),
prompt_price=Decimal("0.00003"),
completion_tokens=20,
completion_unit_price=Decimal("0.002"),
completion_price_unit=Decimal("1000"),
completion_price=Decimal("0.00004"),
total_tokens=50,
total_price=Decimal("0.00007"),
currency="USD",
latency=0.5,
)
node._fetch_model_config = get_mocked_fetch_model_config(
provider="langgenius/openai/openai",
model="gpt-3.5-turbo",
mode="chat",
credentials=credentials,
)
mock_message = AssistantPromptMessage(content="This is a test response from the mocked LLM.")
# execute node
result = node._run()
assert isinstance(result, Generator)
mock_llm_result = LLMResult(
model="gpt-3.5-turbo",
prompt_messages=[],
message=mock_message,
usage=mock_usage,
)
for item in result:
if isinstance(item, RunCompletedEvent):
assert item.run_result.status == WorkflowNodeExecutionStatus.SUCCEEDED
assert item.run_result.process_data is not None
assert item.run_result.outputs is not None
assert item.run_result.outputs.get("text") is not None
assert item.run_result.outputs.get("usage", {})["total_tokens"] > 0
# Create a simple mock model instance that doesn't call real providers
mock_model_instance = MagicMock()
mock_model_instance.invoke_llm.return_value = mock_llm_result
# Create a simple mock model config with required attributes
mock_model_config = MagicMock()
mock_model_config.mode = "chat"
mock_model_config.provider = "langgenius/openai/openai"
mock_model_config.model = "gpt-3.5-turbo"
mock_model_config.provider_model_bundle.configuration.tenant_id = "9d2074fc-6f86-45a9-b09d-6ecc63b9056b"
# Mock the _fetch_model_config method
def mock_fetch_model_config_func(_node_data_model):
return mock_model_instance, mock_model_config
# Also mock ModelManager.get_model_instance to avoid database calls
def mock_get_model_instance(_self, **kwargs):
return mock_model_instance
with (
patch.object(node, "_fetch_model_config", mock_fetch_model_config_func),
patch("core.model_manager.ModelManager.get_model_instance", mock_get_model_instance),
):
# execute node
result = node._run()
assert isinstance(result, Generator)
for item in result:
if isinstance(item, RunCompletedEvent):
assert item.run_result.status == WorkflowNodeExecutionStatus.SUCCEEDED
assert item.run_result.process_data is not None
assert item.run_result.outputs is not None
assert item.run_result.outputs.get("text") is not None
assert item.run_result.outputs.get("usage", {})["total_tokens"] > 0
@pytest.mark.parametrize("setup_code_executor_mock", [["none"]], indirect=True)
def test_execute_llm_with_jinja2(setup_code_executor_mock, setup_model_mock):
def test_execute_llm_with_jinja2(app, setup_code_executor_mock):
"""
Test execute LLM node with jinja2
"""
node = init_llm_node(
config={
"id": "llm",
"data": {
"title": "123",
"type": "llm",
"model": {"provider": "openai", "name": "gpt-3.5-turbo", "mode": "chat", "completion_params": {}},
"prompt_config": {
"jinja2_variables": [
{"variable": "sys_query", "value_selector": ["sys", "query"]},
{"variable": "output", "value_selector": ["abc", "output"]},
]
with app.app_context():
node = init_llm_node(
config={
"id": "llm",
"data": {
"title": "123",
"type": "llm",
"model": {"provider": "openai", "name": "gpt-3.5-turbo", "mode": "chat", "completion_params": {}},
"prompt_config": {
"jinja2_variables": [
{"variable": "sys_query", "value_selector": ["sys", "query"]},
{"variable": "output", "value_selector": ["abc", "output"]},
]
},
"prompt_template": [
{
"role": "system",
"text": "you are a helpful assistant.\ntoday's weather is {{#abc.output#}}",
"jinja2_text": "you are a helpful assistant.\ntoday's weather is {{output}}.",
"edition_type": "jinja2",
},
{
"role": "user",
"text": "{{#sys.query#}}",
"jinja2_text": "{{sys_query}}",
"edition_type": "basic",
},
],
"memory": None,
"context": {"enabled": False},
"vision": {"enabled": False},
},
"prompt_template": [
{
"role": "system",
"text": "you are a helpful assistant.\ntoday's weather is {{#abc.output#}}",
"jinja2_text": "you are a helpful assistant.\ntoday's weather is {{output}}.",
"edition_type": "jinja2",
},
{
"role": "user",
"text": "{{#sys.query#}}",
"jinja2_text": "{{sys_query}}",
"edition_type": "basic",
},
],
"memory": None,
"context": {"enabled": False},
"vision": {"enabled": False},
},
},
)
)
credentials = {"openai_api_key": os.environ.get("OPENAI_API_KEY")}
# Mock db.session.close()
db.session.close = MagicMock()
# Mock db.session.close()
db.session.close = MagicMock()
# Create a proper LLM result with real entities
mock_usage = LLMUsage(
prompt_tokens=30,
prompt_unit_price=Decimal("0.001"),
prompt_price_unit=Decimal("1000"),
prompt_price=Decimal("0.00003"),
completion_tokens=20,
completion_unit_price=Decimal("0.002"),
completion_price_unit=Decimal("1000"),
completion_price=Decimal("0.00004"),
total_tokens=50,
total_price=Decimal("0.00007"),
currency="USD",
latency=0.5,
)
node._fetch_model_config = get_mocked_fetch_model_config(
provider="langgenius/openai/openai",
model="gpt-3.5-turbo",
mode="chat",
credentials=credentials,
)
mock_message = AssistantPromptMessage(content="Test response: sunny weather and what's the weather today?")
# execute node
result = node._run()
mock_llm_result = LLMResult(
model="gpt-3.5-turbo",
prompt_messages=[],
message=mock_message,
usage=mock_usage,
)
for item in result:
if isinstance(item, RunCompletedEvent):
assert item.run_result.status == WorkflowNodeExecutionStatus.SUCCEEDED
assert item.run_result.process_data is not None
assert "sunny" in json.dumps(item.run_result.process_data)
assert "what's the weather today?" in json.dumps(item.run_result.process_data)
# Create a simple mock model instance that doesn't call real providers
mock_model_instance = MagicMock()
mock_model_instance.invoke_llm.return_value = mock_llm_result
# Create a simple mock model config with required attributes
mock_model_config = MagicMock()
mock_model_config.mode = "chat"
mock_model_config.provider = "openai"
mock_model_config.model = "gpt-3.5-turbo"
mock_model_config.provider_model_bundle.configuration.tenant_id = "9d2074fc-6f86-45a9-b09d-6ecc63b9056b"
# Mock the _fetch_model_config method
def mock_fetch_model_config_func(_node_data_model):
return mock_model_instance, mock_model_config
# Also mock ModelManager.get_model_instance to avoid database calls
def mock_get_model_instance(_self, **kwargs):
return mock_model_instance
with (
patch.object(node, "_fetch_model_config", mock_fetch_model_config_func),
patch("core.model_manager.ModelManager.get_model_instance", mock_get_model_instance),
):
# execute node
result = node._run()
for item in result:
if isinstance(item, RunCompletedEvent):
assert item.run_result.status == WorkflowNodeExecutionStatus.SUCCEEDED
assert item.run_result.process_data is not None
assert "sunny" in json.dumps(item.run_result.process_data)
assert "what's the weather today?" in json.dumps(item.run_result.process_data)
def test_extract_json():

View File

@ -1,4 +1,4 @@
import random
import secrets
from unittest.mock import MagicMock, patch
import pytest
@ -34,7 +34,7 @@ def test_retry_logic_success(mock_request):
side_effects = []
for _ in range(SSRF_DEFAULT_MAX_RETRIES):
status_code = random.choice(STATUS_FORCELIST)
status_code = secrets.choice(STATUS_FORCELIST)
mock_response = MagicMock()
mock_response.status_code = status_code
side_effects.append(mock_response)

View File

@ -1,5 +1,7 @@
import io
from unittest.mock import Mock, patch
import pandas as pd
import pytest
from docx.oxml.text.paragraph import CT_P
@ -187,145 +189,134 @@ def test_node_type(document_extractor_node):
@patch("pandas.ExcelFile")
def test_extract_text_from_excel_single_sheet(mock_excel_file):
"""Test extracting text from Excel file with single sheet."""
# Mock DataFrame
mock_df = Mock()
mock_df.dropna = Mock()
mock_df.to_markdown.return_value = "| Name | Age |\n|------|-----|\n| John | 25 |"
"""Test extracting text from Excel file with single sheet and multiline content."""
# Test multi-line cell
data = {"Name\nwith\nnewline": ["John\nDoe", "Jane\nSmith"], "Age": [25, 30]}
df = pd.DataFrame(data)
# Mock ExcelFile
mock_excel_instance = Mock()
mock_excel_instance.sheet_names = ["Sheet1"]
mock_excel_instance.parse.return_value = mock_df
mock_excel_instance.parse.return_value = df
mock_excel_file.return_value = mock_excel_instance
file_content = b"fake_excel_content"
result = _extract_text_from_excel(file_content)
expected_manual = "| Name with newline | Age |\n| ----------------- | --- |\n\
| John Doe | 25 |\n| Jane Smith | 30 |\n\n"
expected = "| Name | Age |\n|------|-----|\n| John | 25 |\n\n"
assert result == expected
mock_excel_file.assert_called_once()
mock_df.dropna.assert_called_once_with(how="all", inplace=True)
mock_df.to_markdown.assert_called_once_with(index=False, floatfmt="")
assert expected_manual == result
mock_excel_instance.parse.assert_called_once_with(sheet_name="Sheet1")
@patch("pandas.ExcelFile")
def test_extract_text_from_excel_multiple_sheets(mock_excel_file):
"""Test extracting text from Excel file with multiple sheets."""
# Mock DataFrames for different sheets
mock_df1 = Mock()
mock_df1.dropna = Mock()
mock_df1.to_markdown.return_value = "| Product | Price |\n|---------|-------|\n| Apple | 1.50 |"
"""Test extracting text from Excel file with multiple sheets and multiline content."""
mock_df2 = Mock()
mock_df2.dropna = Mock()
mock_df2.to_markdown.return_value = "| City | Population |\n|------|------------|\n| NYC | 8000000 |"
# Test multi-line cell
data1 = {"Product\nName": ["Apple\nRed", "Banana\nYellow"], "Price": [1.50, 0.99]}
df1 = pd.DataFrame(data1)
data2 = {"City\nName": ["New\nYork", "Los\nAngeles"], "Population": [8000000, 3900000]}
df2 = pd.DataFrame(data2)
# Mock ExcelFile
mock_excel_instance = Mock()
mock_excel_instance.sheet_names = ["Products", "Cities"]
mock_excel_instance.parse.side_effect = [mock_df1, mock_df2]
mock_excel_instance.parse.side_effect = [df1, df2]
mock_excel_file.return_value = mock_excel_instance
file_content = b"fake_excel_content_multiple_sheets"
result = _extract_text_from_excel(file_content)
expected = (
"| Product | Price |\n|---------|-------|\n| Apple | 1.50 |\n\n"
"| City | Population |\n|------|------------|\n| NYC | 8000000 |\n\n"
)
assert result == expected
expected_manual1 = "| Product Name | Price |\n| ------------ | ----- |\n\
| Apple Red | 1.5 |\n| Banana Yellow | 0.99 |\n\n"
expected_manual2 = "| City Name | Population |\n| --------- | ---------- |\n\
| New York | 8000000 |\n| Los Angeles | 3900000 |\n\n"
assert expected_manual1 in result
assert expected_manual2 in result
assert mock_excel_instance.parse.call_count == 2
@patch("pandas.ExcelFile")
def test_extract_text_from_excel_empty_sheets(mock_excel_file):
"""Test extracting text from Excel file with empty sheets."""
# Mock empty DataFrame
mock_df = Mock()
mock_df.dropna = Mock()
mock_df.to_markdown.return_value = ""
# Empty excel
df = pd.DataFrame()
# Mock ExcelFile
mock_excel_instance = Mock()
mock_excel_instance.sheet_names = ["EmptySheet"]
mock_excel_instance.parse.return_value = mock_df
mock_excel_instance.parse.return_value = df
mock_excel_file.return_value = mock_excel_instance
file_content = b"fake_excel_empty_content"
result = _extract_text_from_excel(file_content)
expected = "\n\n"
expected = "| |\n| |\n\n"
assert result == expected
mock_excel_instance.parse.assert_called_once_with(sheet_name="EmptySheet")
@patch("pandas.ExcelFile")
def test_extract_text_from_excel_sheet_parse_error(mock_excel_file):
"""Test handling of sheet parsing errors - should continue with other sheets."""
# Mock DataFrames - one successful, one that raises exception
mock_df_success = Mock()
mock_df_success.dropna = Mock()
mock_df_success.to_markdown.return_value = "| Data | Value |\n|------|-------|\n| Test | 123 |"
# Test error
data = {"Data": ["Test"], "Value": [123]}
df = pd.DataFrame(data)
# Mock ExcelFile
mock_excel_instance = Mock()
mock_excel_instance.sheet_names = ["GoodSheet", "BadSheet"]
mock_excel_instance.parse.side_effect = [mock_df_success, Exception("Parse error")]
mock_excel_instance.parse.side_effect = [df, Exception("Parse error")]
mock_excel_file.return_value = mock_excel_instance
file_content = b"fake_excel_mixed_content"
result = _extract_text_from_excel(file_content)
expected = "| Data | Value |\n|------|-------|\n| Test | 123 |\n\n"
assert result == expected
expected_manual = "| Data | Value |\n| ---- | ----- |\n| Test | 123 |\n\n"
assert expected_manual == result
@patch("pandas.ExcelFile")
def test_extract_text_from_excel_file_error(mock_excel_file):
"""Test handling of Excel file reading errors."""
mock_excel_file.side_effect = Exception("Invalid Excel file")
file_content = b"invalid_excel_content"
with pytest.raises(Exception) as exc_info:
_extract_text_from_excel(file_content)
# Note: The function should raise TextExtractionError, but since it's not imported in the test,
# we check for the general Exception pattern
assert "Failed to extract text from Excel file" in str(exc_info.value)
assert mock_excel_instance.parse.call_count == 2
@patch("pandas.ExcelFile")
def test_extract_text_from_excel_io_bytesio_usage(mock_excel_file):
"""Test that BytesIO is properly used with the file content."""
import io
# Mock DataFrame
mock_df = Mock()
mock_df.dropna = Mock()
mock_df.to_markdown.return_value = "| Test | Data |\n|------|------|\n| 1 | A |"
# Test bytesio
data = {"Test": [1], "Data": ["A"]}
df = pd.DataFrame(data)
# Mock ExcelFile
mock_excel_instance = Mock()
mock_excel_instance.sheet_names = ["TestSheet"]
mock_excel_instance.parse.return_value = mock_df
mock_excel_instance.parse.return_value = df
mock_excel_file.return_value = mock_excel_instance
file_content = b"test_excel_bytes"
result = _extract_text_from_excel(file_content)
# Verify that ExcelFile was called with a BytesIO object
mock_excel_file.assert_called_once()
call_args = mock_excel_file.call_args[0][0]
assert isinstance(call_args, io.BytesIO)
call_arg = mock_excel_file.call_args[0][0]
assert isinstance(call_arg, io.BytesIO)
expected = "| Test | Data |\n|------|------|\n| 1 | A |\n\n"
assert result == expected
expected_manual = "| Test | Data |\n| ---- | ---- |\n| 1 | A |\n\n"
assert expected_manual == result
@patch("pandas.ExcelFile")
def test_extract_text_from_excel_all_sheets_fail(mock_excel_file):
"""Test when all sheets fail to parse - should return empty string."""
# Mock ExcelFile
mock_excel_instance = Mock()
mock_excel_instance.sheet_names = ["BadSheet1", "BadSheet2"]
@ -335,29 +326,6 @@ def test_extract_text_from_excel_all_sheets_fail(mock_excel_file):
file_content = b"fake_excel_all_bad_sheets"
result = _extract_text_from_excel(file_content)
# Should return empty string when all sheets fail
assert result == ""
@patch("pandas.ExcelFile")
def test_extract_text_from_excel_markdown_formatting(mock_excel_file):
"""Test that markdown formatting parameters are correctly applied."""
# Mock DataFrame
mock_df = Mock()
mock_df.dropna = Mock()
mock_df.to_markdown.return_value = "| Float | Int |\n|-------|-----|\n| 123456.78 | 42 |"
# Mock ExcelFile
mock_excel_instance = Mock()
mock_excel_instance.sheet_names = ["NumberSheet"]
mock_excel_instance.parse.return_value = mock_df
mock_excel_file.return_value = mock_excel_instance
file_content = b"fake_excel_numbers"
result = _extract_text_from_excel(file_content)
# Verify to_markdown was called with correct parameters
mock_df.to_markdown.assert_called_once_with(index=False, floatfmt="")
expected = "| Float | Int |\n|-------|-----|\n| 123456.78 | 42 |\n\n"
assert result == expected
assert mock_excel_instance.parse.call_count == 2