Merge remote-tracking branch 'origin/main' into feat/trigger

yessenia
2025-09-25 17:14:24 +08:00
3013 changed files with 148826 additions and 44294 deletions


@ -2,26 +2,28 @@ import json
import logging
from collections.abc import Generator, Mapping, Sequence
from datetime import datetime
from enum import Enum, StrEnum
from typing import TYPE_CHECKING, Any, Optional, Union
from enum import StrEnum
from typing import TYPE_CHECKING, Any, Optional, Union, cast
from uuid import uuid4
import sqlalchemy as sa
from sqlalchemy import DateTime, exists, orm, select
from sqlalchemy import DateTime, Select, exists, orm, select
from core.file.constants import maybe_file_object
from core.file.models import File
from core.variables import utils as variable_utils
from core.variables.variables import FloatVariable, IntegerVariable, StringVariable
from core.workflow.constants import CONVERSATION_VARIABLE_NODE_ID, SYSTEM_VARIABLE_NODE_ID
from core.workflow.nodes.enums import NodeType
from core.workflow.enums import NodeType
from extensions.ext_storage import Storage
from factories.variable_factory import TypeMismatchError, build_segment_with_type
from libs.datetime_utils import naive_utc_now
from libs.uuid_utils import uuidv7
from ._workflow_exc import NodeNotFoundError, WorkflowDataError
if TYPE_CHECKING:
from models.model import AppMode
from models.model import AppMode, UploadFile
from sqlalchemy import Index, PrimaryKeyConstraint, String, UniqueConstraint, func
from sqlalchemy.orm import Mapped, declared_attr, mapped_column
@ -35,19 +37,20 @@ from libs import helper
from .account import Account
from .base import Base
from .engine import db
from .enums import CreatorUserRole, DraftVariableType
from .enums import CreatorUserRole, DraftVariableType, ExecutionOffLoadType
from .types import EnumText, StringUUID
logger = logging.getLogger(__name__)
class WorkflowType(Enum):
class WorkflowType(StrEnum):
"""
Workflow Type Enum
"""
WORKFLOW = "workflow"
CHAT = "chat"
RAG_PIPELINE = "rag-pipeline"
@classmethod
def value_of(cls, value: str) -> "WorkflowType":
@ -130,7 +133,7 @@ class Workflow(Base):
_features: Mapped[str] = mapped_column("features", sa.TEXT)
created_by: Mapped[str] = mapped_column(StringUUID, nullable=False)
created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
updated_by: Mapped[Optional[str]] = mapped_column(StringUUID)
updated_by: Mapped[str | None] = mapped_column(StringUUID)
updated_at: Mapped[datetime] = mapped_column(
DateTime,
nullable=False,
@ -143,6 +146,9 @@ class Workflow(Base):
_conversation_variables: Mapped[str] = mapped_column(
"conversation_variables", sa.Text, nullable=False, server_default="{}"
)
_rag_pipeline_variables: Mapped[str] = mapped_column(
"rag_pipeline_variables", db.Text, nullable=False, server_default="{}"
)
VERSION_DRAFT = "draft"
@ -159,6 +165,7 @@ class Workflow(Base):
created_by: str,
environment_variables: Sequence[Variable],
conversation_variables: Sequence[Variable],
rag_pipeline_variables: list[dict],
marked_name: str = "",
marked_comment: str = "",
) -> "Workflow":
@ -173,6 +180,7 @@ class Workflow(Base):
workflow.created_by = created_by
workflow.environment_variables = environment_variables or []
workflow.conversation_variables = conversation_variables or []
workflow.rag_pipeline_variables = rag_pipeline_variables or []
workflow.marked_name = marked_name
workflow.marked_comment = marked_comment
workflow.created_at = naive_utc_now()
@ -224,7 +232,7 @@ class Workflow(Base):
raise WorkflowDataError("nodes not found in workflow graph")
try:
node_config = next(filter(lambda node: node["id"] == node_id, nodes))
node_config: dict[str, Any] = next(filter(lambda node: node["id"] == node_id, nodes))
except StopIteration:
raise NodeNotFoundError(node_id)
assert isinstance(node_config, dict)
@ -282,7 +290,7 @@ class Workflow(Base):
return self._features
@features.setter
def features(self, value: str) -> None:
def features(self, value: str):
self._features = value
@property
@ -337,7 +345,7 @@ class Workflow(Base):
else:
yield from ((node["id"], node["data"]) for node in graph_dict["nodes"])
def user_input_form(self, to_old_structure: bool = False) -> list:
def user_input_form(self, to_old_structure: bool = False) -> list[Any]:
# get start node from graph
if not self.graph:
return []
@ -354,7 +362,7 @@ class Workflow(Base):
variables: list[Any] = start_node.get("data", {}).get("variables", [])
if to_old_structure:
old_structure_variables = []
old_structure_variables: list[dict[str, Any]] = []
for variable in variables:
old_structure_variables.append({variable["type"]: variable})
@ -362,6 +370,12 @@ class Workflow(Base):
return variables
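# A minimal sketch of the old-structure conversion in `user_input_form` above
# (hypothetical variable definition; field values are illustrative, not from this diff):
#
#   variables = [{"type": "text-input", "variable": "query", "label": "Query"}]
#   # with to_old_structure=True, each entry is wrapped in a dict keyed by its type:
#   # [{"text-input": {"type": "text-input", "variable": "query", "label": "Query"}}]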
def rag_pipeline_user_input_form(self) -> list:
# rag pipeline variables are stored on the workflow itself, not in a start node
variables: list[Any] = self.rag_pipeline_variables
return variables
@property
def unique_hash(self) -> str:
"""
@ -394,9 +408,7 @@ class Workflow(Base):
@property
def environment_variables(self) -> Sequence[StringVariable | IntegerVariable | FloatVariable | SecretVariable]:
# TODO: find some way to init `self._environment_variables` when instance created.
if self._environment_variables is None:
self._environment_variables = "{}"
# _environment_variables is guaranteed to be non-None due to server_default="{}"
# Use workflow.tenant_id to avoid relying on request user in background threads
tenant_id = self.tenant_id
@ -404,23 +416,24 @@ class Workflow(Base):
if not tenant_id:
return []
environment_variables_dict: dict[str, Any] = json.loads(self._environment_variables)
environment_variables_dict: dict[str, Any] = json.loads(self._environment_variables or "{}")
results = [
variable_factory.build_environment_variable_from_mapping(v) for v in environment_variables_dict.values()
]
# decrypt secret variables value
def decrypt_func(var):
def decrypt_func(var: Variable) -> StringVariable | IntegerVariable | FloatVariable | SecretVariable:
if isinstance(var, SecretVariable):
return var.model_copy(update={"value": encrypter.decrypt_token(tenant_id=tenant_id, token=var.value)})
elif isinstance(var, (StringVariable, IntegerVariable, FloatVariable)):
return var
else:
raise AssertionError("this statement should be unreachable.")
# Other variable types are not supported for environment variables
raise AssertionError(f"Unexpected variable type for environment variable: {type(var)}")
decrypted_results: list[SecretVariable | StringVariable | IntegerVariable | FloatVariable] = list(
map(decrypt_func, results)
)
decrypted_results: list[SecretVariable | StringVariable | IntegerVariable | FloatVariable] = [
decrypt_func(var) for var in results
]
return decrypted_results
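# A round-trip sketch for the property above (assumes an existing `workflow` bound to
# a tenant; `secret_var` is an illustrative SecretVariable, not from this diff): the
# setter encrypts SecretVariable values via `encrypter.encrypt_token`, and this getter
# transparently decrypts them on read.
#
#   workflow.environment_variables = [secret_var]    # stored encrypted in `_environment_variables`
#   plain = workflow.environment_variables[0].value  # read back decrypted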
@environment_variables.setter
@ -448,7 +461,7 @@ class Workflow(Base):
value[i] = origin_variables_dictionary[variable.id].model_copy(update={"name": variable.name})
# encrypt secret variables value
def encrypt_func(var):
def encrypt_func(var: Variable) -> Variable:
if isinstance(var, SecretVariable):
return var.model_copy(update={"value": encrypter.encrypt_token(tenant_id=tenant_id, token=var.value)})
else:
@ -473,26 +486,42 @@ class Workflow(Base):
"features": self.features_dict,
"environment_variables": [var.model_dump(mode="json") for var in environment_variables],
"conversation_variables": [var.model_dump(mode="json") for var in self.conversation_variables],
"rag_pipeline_variables": self.rag_pipeline_variables,
}
return result
@property
def conversation_variables(self) -> Sequence[Variable]:
# TODO: find some way to init `self._conversation_variables` when instance created.
if self._conversation_variables is None:
self._conversation_variables = "{}"
# _conversation_variables is guaranteed to be non-None due to server_default="{}"
variables_dict: dict[str, Any] = json.loads(self._conversation_variables)
results = [variable_factory.build_conversation_variable_from_mapping(v) for v in variables_dict.values()]
return results
@conversation_variables.setter
def conversation_variables(self, value: Sequence[Variable]) -> None:
def conversation_variables(self, value: Sequence[Variable]):
self._conversation_variables = json.dumps(
{var.name: var.model_dump() for var in value},
ensure_ascii=False,
)
@property
def rag_pipeline_variables(self) -> list[dict]:
# TODO: find some way to init `self._rag_pipeline_variables` when instance created.
if self._rag_pipeline_variables is None:
self._rag_pipeline_variables = "{}"
variables_dict: dict[str, Any] = json.loads(self._rag_pipeline_variables)
results = list(variables_dict.values())
return results
@rag_pipeline_variables.setter
def rag_pipeline_variables(self, values: list[dict]) -> None:
self._rag_pipeline_variables = json.dumps(
{item["variable"]: item for item in values},
ensure_ascii=False,
)
@staticmethod
def version_from_datetime(d: datetime) -> str:
return str(d)
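# A round-trip sketch for `rag_pipeline_variables` (hypothetical values): the setter
# serializes the list as a JSON object keyed by each item's "variable" name, and the
# getter returns the object's values as a list, so entries need unique variable names.
#
#   workflow.rag_pipeline_variables = [{"variable": "chunk_size", "type": "number"}]
#   assert workflow.rag_pipeline_variables == [{"variable": "chunk_size", "type": "number"}]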
@ -550,18 +579,18 @@ class WorkflowRun(Base):
type: Mapped[str] = mapped_column(String(255))
triggered_from: Mapped[str] = mapped_column(String(255))
version: Mapped[str] = mapped_column(String(255))
graph: Mapped[Optional[str]] = mapped_column(sa.Text)
inputs: Mapped[Optional[str]] = mapped_column(sa.Text)
graph: Mapped[str | None] = mapped_column(sa.Text)
inputs: Mapped[str | None] = mapped_column(sa.Text)
status: Mapped[str] = mapped_column(String(255)) # running, succeeded, failed, stopped, partial-succeeded
outputs: Mapped[Optional[str]] = mapped_column(sa.Text, default="{}")
error: Mapped[Optional[str]] = mapped_column(sa.Text)
outputs: Mapped[str | None] = mapped_column(sa.Text, default="{}")
error: Mapped[str | None] = mapped_column(sa.Text)
elapsed_time: Mapped[float] = mapped_column(sa.Float, nullable=False, server_default=sa.text("0"))
total_tokens: Mapped[int] = mapped_column(sa.BigInteger, server_default=sa.text("0"))
total_steps: Mapped[int] = mapped_column(sa.Integer, server_default=sa.text("0"), nullable=True)
created_by_role: Mapped[str] = mapped_column(String(255)) # account, end_user
created_by: Mapped[str] = mapped_column(StringUUID, nullable=False)
created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
finished_at: Mapped[Optional[datetime]] = mapped_column(DateTime)
finished_at: Mapped[datetime | None] = mapped_column(DateTime)
exceptions_count: Mapped[int] = mapped_column(sa.Integer, server_default=sa.text("0"), nullable=True)
@property
@ -625,7 +654,7 @@ class WorkflowRun(Base):
}
@classmethod
def from_dict(cls, data: dict) -> "WorkflowRun":
def from_dict(cls, data: dict[str, Any]) -> "WorkflowRun":
return cls(
id=data.get("id"),
tenant_id=data.get("tenant_id"),
@ -657,9 +686,10 @@ class WorkflowNodeExecutionTriggeredFrom(StrEnum):
SINGLE_STEP = "single-step"
WORKFLOW_RUN = "workflow-run"
RAG_PIPELINE_RUN = "rag-pipeline-run"
class WorkflowNodeExecutionModel(Base):
class WorkflowNodeExecutionModel(Base): # This model is expected to have `offload_data` preloaded in most cases.
"""
Workflow Node Execution
@ -710,7 +740,8 @@ class WorkflowNodeExecutionModel(Base):
__tablename__ = "workflow_node_executions"
@declared_attr
def __table_args__(cls): # noqa
@classmethod
def __table_args__(cls) -> Any:
return (
PrimaryKeyConstraint("id", name="workflow_node_execution_pkey"),
Index(
@ -747,7 +778,7 @@ class WorkflowNodeExecutionModel(Base):
# `__table_args__` is now a plain classmethod, so `cls` is bound normally and
# class attributes such as `created_at` can be referenced here without a
# `type: ignore` comment.
cls.created_at.desc(), # type: ignore
cls.created_at.desc(),
),
)
@ -756,24 +787,50 @@ class WorkflowNodeExecutionModel(Base):
app_id: Mapped[str] = mapped_column(StringUUID)
workflow_id: Mapped[str] = mapped_column(StringUUID)
triggered_from: Mapped[str] = mapped_column(String(255))
workflow_run_id: Mapped[Optional[str]] = mapped_column(StringUUID)
workflow_run_id: Mapped[str | None] = mapped_column(StringUUID)
index: Mapped[int] = mapped_column(sa.Integer)
predecessor_node_id: Mapped[Optional[str]] = mapped_column(String(255))
node_execution_id: Mapped[Optional[str]] = mapped_column(String(255))
predecessor_node_id: Mapped[str | None] = mapped_column(String(255))
node_execution_id: Mapped[str | None] = mapped_column(String(255))
node_id: Mapped[str] = mapped_column(String(255))
node_type: Mapped[str] = mapped_column(String(255))
title: Mapped[str] = mapped_column(String(255))
inputs: Mapped[Optional[str]] = mapped_column(sa.Text)
process_data: Mapped[Optional[str]] = mapped_column(sa.Text)
outputs: Mapped[Optional[str]] = mapped_column(sa.Text)
inputs: Mapped[str | None] = mapped_column(sa.Text)
process_data: Mapped[str | None] = mapped_column(sa.Text)
outputs: Mapped[str | None] = mapped_column(sa.Text)
status: Mapped[str] = mapped_column(String(255))
error: Mapped[Optional[str]] = mapped_column(sa.Text)
error: Mapped[str | None] = mapped_column(sa.Text)
elapsed_time: Mapped[float] = mapped_column(sa.Float, server_default=sa.text("0"))
execution_metadata: Mapped[Optional[str]] = mapped_column(sa.Text)
execution_metadata: Mapped[str | None] = mapped_column(sa.Text)
created_at: Mapped[datetime] = mapped_column(DateTime, server_default=func.current_timestamp())
created_by_role: Mapped[str] = mapped_column(String(255))
created_by: Mapped[str] = mapped_column(StringUUID)
finished_at: Mapped[Optional[datetime]] = mapped_column(DateTime)
finished_at: Mapped[datetime | None] = mapped_column(DateTime)
offload_data: Mapped[list["WorkflowNodeExecutionOffload"]] = orm.relationship(
"WorkflowNodeExecutionOffload",
primaryjoin="WorkflowNodeExecutionModel.id == foreign(WorkflowNodeExecutionOffload.node_execution_id)",
uselist=True,
lazy="raise",
back_populates="execution",
)
@staticmethod
def preload_offload_data(
query: Select[tuple["WorkflowNodeExecutionModel"]] | orm.Query["WorkflowNodeExecutionModel"],
):
return query.options(orm.selectinload(WorkflowNodeExecutionModel.offload_data))
@staticmethod
def preload_offload_data_and_files(
query: Select[tuple["WorkflowNodeExecutionModel"]] | orm.Query["WorkflowNodeExecutionModel"],
):
return query.options(
orm.selectinload(WorkflowNodeExecutionModel.offload_data).options(
# Using `joinedload` instead of a nested `selectinload` to minimize database
# roundtrips: the related `UploadFile` rows are fetched together with the
# offload records rather than through an extra query per relationship.
orm.joinedload(WorkflowNodeExecutionOffload.file),
)
)
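# A usage sketch for the preload helpers above (assumes a SQLAlchemy `session`;
# `run_id` is hypothetical): `offload_data` is declared with lazy="raise", so it
# must be preloaded on the statement before instances are accessed.
#
#   stmt = select(WorkflowNodeExecutionModel).where(
#       WorkflowNodeExecutionModel.workflow_run_id == run_id
#   )
#   stmt = WorkflowNodeExecutionModel.preload_offload_data_and_files(stmt)
#   for execution in session.scalars(stmt):
#       print(execution.inputs_truncated, execution.outputs_truncated)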
@property
def created_by_account(self):
@ -809,25 +866,148 @@ class WorkflowNodeExecutionModel(Base):
return json.loads(self.execution_metadata) if self.execution_metadata else {}
@property
def extras(self):
def extras(self) -> dict[str, Any]:
from core.tools.tool_manager import ToolManager
extras = {}
extras: dict[str, Any] = {}
if self.execution_metadata_dict:
from core.workflow.nodes import NodeType
if self.node_type == NodeType.TOOL.value and "tool_info" in self.execution_metadata_dict:
tool_info = self.execution_metadata_dict["tool_info"]
tool_info: dict[str, Any] = self.execution_metadata_dict["tool_info"]
extras["icon"] = ToolManager.get_tool_icon(
tenant_id=self.tenant_id,
provider_type=tool_info["provider_type"],
provider_id=tool_info["provider_id"],
)
elif self.node_type == NodeType.DATASOURCE.value and "datasource_info" in self.execution_metadata_dict:
datasource_info = self.execution_metadata_dict["datasource_info"]
extras["icon"] = datasource_info.get("icon")
return extras
def _get_offload_by_type(self, type_: ExecutionOffLoadType) -> Optional["WorkflowNodeExecutionOffload"]:
return next((i for i in self.offload_data if i.type_ == type_), None)
class WorkflowAppLogCreatedFrom(Enum):
@property
def inputs_truncated(self) -> bool:
"""Check if inputs were truncated (offloaded to external storage)."""
return self._get_offload_by_type(ExecutionOffLoadType.INPUTS) is not None
@property
def outputs_truncated(self) -> bool:
"""Check if outputs were truncated (offloaded to external storage)."""
return self._get_offload_by_type(ExecutionOffLoadType.OUTPUTS) is not None
@property
def process_data_truncated(self) -> bool:
"""Check if process_data were truncated (offloaded to external storage)."""
return self._get_offload_by_type(ExecutionOffLoadType.PROCESS_DATA) is not None
@staticmethod
def _load_full_content(session: orm.Session, file_id: str, storage: Storage):
from .model import UploadFile
stmt = sa.select(UploadFile).where(UploadFile.id == file_id)
file = session.scalars(stmt).first()
assert file is not None, f"UploadFile with id {file_id} should exist but was not found"
content = storage.load(file.key)
return json.loads(content)
def load_full_inputs(self, session: orm.Session, storage: Storage) -> Mapping[str, Any] | None:
offload = self._get_offload_by_type(ExecutionOffLoadType.INPUTS)
if offload is None:
return self.inputs_dict
return self._load_full_content(session, offload.file_id, storage)
def load_full_outputs(self, session: orm.Session, storage: Storage) -> Mapping[str, Any] | None:
offload: WorkflowNodeExecutionOffload | None = self._get_offload_by_type(ExecutionOffLoadType.OUTPUTS)
if offload is None:
return self.outputs_dict
return self._load_full_content(session, offload.file_id, storage)
def load_full_process_data(self, session: orm.Session, storage: Storage) -> Mapping[str, Any] | None:
offload: WorkflowNodeExecutionOffload | None = self._get_offload_by_type(ExecutionOffLoadType.PROCESS_DATA)
if offload is None:
return self.process_data_dict
return self._load_full_content(session, offload.file_id, storage)
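# A usage sketch for the loaders above (assumes `session`, a configured `storage`,
# and a hypothetical `execution_id`): each `load_full_*` method falls back to the
# inline `*_dict` value when nothing was offloaded, so it is safe to call
# unconditionally, provided `offload_data` was preloaded.
#
#   stmt = WorkflowNodeExecutionModel.preload_offload_data(
#       select(WorkflowNodeExecutionModel).where(WorkflowNodeExecutionModel.id == execution_id)
#   )
#   execution = session.scalars(stmt).one()
#   full_inputs = execution.load_full_inputs(session, storage)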
class WorkflowNodeExecutionOffload(Base):
__tablename__ = "workflow_node_execution_offload"
__table_args__ = (
# PostgreSQL 14 treats NULL values as distinct in unique constraints by default,
# allowing multiple records with NULL values for the same column combination.
#
# This behavior allows us to have multiple records with NULL node_execution_id,
# simplifying the garbage collection process.
UniqueConstraint(
"node_execution_id",
"type",
# Note: PostgreSQL 15+ supports explicit `nulls distinct` behavior through
# `postgresql_nulls_not_distinct=False`, which would make our intention clearer.
# We rely on PostgreSQL's default behavior of treating NULLs as distinct values.
# postgresql_nulls_not_distinct=False,
),
)
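# An illustrative consequence of the NULLs-distinct semantics described above
# (sketch only): detaching a record never violates the ("node_execution_id", "type")
# constraint, because any number of rows may share (NULL, <type>).
#
#   offload.node_execution_id = None   # mark for garbage collection
#   session.flush()                    # OK even if other detached rows of the same type exist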
_HASH_COL_SIZE = 64
id: Mapped[str] = mapped_column(
StringUUID,
primary_key=True,
server_default=sa.text("uuidv7()"),
)
created_at: Mapped[datetime] = mapped_column(
DateTime, default=naive_utc_now, server_default=func.current_timestamp()
)
tenant_id: Mapped[str] = mapped_column(StringUUID)
app_id: Mapped[str] = mapped_column(StringUUID)
# `node_execution_id` indicates the `WorkflowNodeExecutionModel` associated with this offload record.
# A value of `None` signifies that this offload record is not linked to any execution record
# and should be considered for garbage collection.
node_execution_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
type_: Mapped[ExecutionOffLoadType] = mapped_column(EnumText(ExecutionOffLoadType), name="type", nullable=False)
# Design Decision: Combining inputs and outputs into a single object was considered to reduce I/O
# operations. However, due to the current design of `WorkflowNodeExecutionRepository`,
# the `save` method is called at two distinct times:
#
# - When the node starts execution: the `inputs` field exists, but the `outputs` field is absent
# - When the node completes execution (either succeeded or failed): the `outputs` field becomes available
#
# It's difficult to correlate these two successive calls to `save` for combined storage.
# Converting the `WorkflowNodeExecutionRepository` to buffer the first `save` call and flush
# when execution completes was also considered, but this would make the execution state unobservable
# until completion, significantly damaging the observability of workflow execution.
#
# Given these constraints, `inputs` and `outputs` are stored separately to maintain real-time
# observability and system reliability.
# `file_id` references the storage object containing the offloaded data.
file_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
execution: Mapped[WorkflowNodeExecutionModel] = orm.relationship(
foreign_keys=[node_execution_id],
lazy="raise",
uselist=False,
primaryjoin="WorkflowNodeExecutionOffload.node_execution_id == WorkflowNodeExecutionModel.id",
back_populates="offload_data",
)
file: Mapped[Optional["UploadFile"]] = orm.relationship(
foreign_keys=[file_id],
lazy="raise",
uselist=False,
primaryjoin="WorkflowNodeExecutionOffload.file_id == UploadFile.id",
)
class WorkflowAppLogCreatedFrom(StrEnum):
"""
Workflow App Log Created From Enum
"""
@ -883,6 +1063,7 @@ class WorkflowAppLog(Base):
__table_args__ = (
sa.PrimaryKeyConstraint("id", name="workflow_app_log_pkey"),
sa.Index("workflow_app_log_app_idx", "tenant_id", "app_id"),
sa.Index("workflow_app_log_workflow_run_id_idx", "workflow_run_id"),
)
id: Mapped[str] = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
@ -939,7 +1120,7 @@ class ConversationVariable(Base):
DateTime, nullable=False, server_default=func.current_timestamp(), onupdate=func.current_timestamp()
)
def __init__(self, *, id: str, app_id: str, conversation_id: str, data: str) -> None:
def __init__(self, *, id: str, app_id: str, conversation_id: str, data: str):
self.id = id
self.app_id = app_id
self.conversation_id = conversation_id
@ -988,7 +1169,10 @@ class WorkflowDraftVariable(Base):
]
__tablename__ = "workflow_draft_variables"
__table_args__ = (UniqueConstraint(*unique_app_id_node_id_name()),)
__table_args__ = (
UniqueConstraint(*unique_app_id_node_id_name()),
Index("workflow_draft_variable_file_id_idx", "file_id"),
)
# Required for instance variable annotation.
__allow_unmapped__ = True
@ -1049,9 +1233,16 @@ class WorkflowDraftVariable(Base):
selector: Mapped[str] = mapped_column(sa.String(255), nullable=False, name="selector")
# The data type of this variable's value
#
# If the variable is offloaded, `value_type` represents the type of the truncated value,
# which may differ from the original value's type. Typically, they are the same,
# but in cases where the structurally truncated value still exceeds the size limit,
# text slicing is applied, and the `value_type` is converted to `STRING`.
value_type: Mapped[SegmentType] = mapped_column(EnumText(SegmentType, length=20))
# The variable's value serialized as a JSON string
#
# If the variable is offloaded, `value` contains a truncated version, not the full original value.
value: Mapped[str] = mapped_column(sa.Text, nullable=False, name="value")
# Controls whether the variable should be displayed in the variable inspection panel
@ -1071,6 +1262,35 @@ class WorkflowDraftVariable(Base):
default=None,
)
# Reference to WorkflowDraftVariableFile for offloaded large variables
#
# Indicates whether the current draft variable is offloaded.
# If not offloaded, this field will be None.
file_id: Mapped[str | None] = mapped_column(
StringUUID,
nullable=True,
default=None,
comment="Reference to WorkflowDraftVariableFile if variable is offloaded to external storage",
)
is_default_value: Mapped[bool] = mapped_column(
sa.Boolean,
nullable=False,
default=False,
comment=(
"Indicates whether the current value is the default for a conversation variable. "
"Always `FALSE` for other types of variables."
),
)
# Relationship to WorkflowDraftVariableFile
variable_file: Mapped[Optional["WorkflowDraftVariableFile"]] = orm.relationship(
foreign_keys=[file_id],
lazy="raise",
uselist=False,
primaryjoin="WorkflowDraftVariableFile.id == WorkflowDraftVariable.file_id",
)
# Cache for deserialized value
#
# NOTE(QuantumGhost): This field serves two purposes:
@ -1084,7 +1304,7 @@ class WorkflowDraftVariable(Base):
# making this attribute harder to access from outside the class.
__value: Segment | None
def __init__(self, *args, **kwargs):
def __init__(self, *args: Any, **kwargs: Any) -> None:
"""
The constructor of `WorkflowDraftVariable` is not intended for
direct use outside this file. Its sole purpose is to set up private state
@ -1102,15 +1322,15 @@ class WorkflowDraftVariable(Base):
self.__value = None
def get_selector(self) -> list[str]:
selector = json.loads(self.selector)
selector: Any = json.loads(self.selector)
if not isinstance(selector, list):
logger.error(
"invalid selector loaded from database, type=%s, value=%s",
type(selector),
type(selector).__name__,
self.selector,
)
raise ValueError("invalid selector.")
return selector
return cast(list[str], selector)
def _set_selector(self, value: list[str]):
self.selector = json.dumps(value)
@ -1120,7 +1340,7 @@ class WorkflowDraftVariable(Base):
return self.build_segment_with_type(self.value_type, value)
@staticmethod
def rebuild_file_types(value: Any) -> Any:
def rebuild_file_types(value: Any):
# NOTE(QuantumGhost): Temporary workaround for structured data handling.
# By this point, `output` has been converted to dict by
# `WorkflowEntry.handle_special_values`, so we need to
@ -1133,15 +1353,17 @@ class WorkflowDraftVariable(Base):
# `WorkflowEntry.handle_special_values`, making a comprehensive migration challenging.
if isinstance(value, dict):
if not maybe_file_object(value):
return value
return cast(Any, value)
return File.model_validate(value)
elif isinstance(value, list) and value:
first = value[0]
value_list = cast(list[Any], value)
first: Any = value_list[0]
if not maybe_file_object(first):
return value
return [File.model_validate(i) for i in value]
return cast(Any, value)
file_list: list[File] = [File.model_validate(cast(dict[str, Any], i)) for i in value_list]
return cast(Any, file_list)
else:
return value
return cast(Any, value)
@classmethod
def build_segment_with_type(cls, segment_type: SegmentType, value: Any) -> Segment:
@ -1218,6 +1440,9 @@ class WorkflowDraftVariable(Base):
case _:
return DraftVariableType.NODE
def is_truncated(self) -> bool:
return self.file_id is not None
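# A defensive-access sketch (illustrative only): for offloaded variables, `value`
# holds only the truncated preview, and `variable_file` uses lazy="raise", so the
# relationship must be eagerly loaded before the full content can be resolved.
#
#   if draft_variable.is_truncated():
#       # resolve the full payload via variable_file -> upload_file -> storage
#       ...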
@classmethod
def _new(
cls,
@ -1228,6 +1453,7 @@ class WorkflowDraftVariable(Base):
value: Segment,
node_execution_id: str | None,
description: str = "",
file_id: str | None = None,
) -> "WorkflowDraftVariable":
variable = WorkflowDraftVariable()
variable.created_at = _naive_utc_datetime()
@ -1237,6 +1463,7 @@ class WorkflowDraftVariable(Base):
variable.node_id = node_id
variable.name = name
variable.set_value(value)
variable.file_id = file_id
variable._set_selector(list(variable_utils.to_selector(node_id, name)))
variable.node_execution_id = node_execution_id
return variable
@ -1292,6 +1519,7 @@ class WorkflowDraftVariable(Base):
node_execution_id: str,
visible: bool = True,
editable: bool = True,
file_id: str | None = None,
) -> "WorkflowDraftVariable":
variable = cls._new(
app_id=app_id,
@ -1299,6 +1527,7 @@ class WorkflowDraftVariable(Base):
name=name,
node_execution_id=node_execution_id,
value=value,
file_id=file_id,
)
variable.visible = visible
variable.editable = editable
@ -1309,6 +1538,93 @@ class WorkflowDraftVariable(Base):
return self.last_edited_at is not None
class WorkflowDraftVariableFile(Base):
"""Stores metadata about files associated with large workflow draft variables.
This model acts as an intermediary between WorkflowDraftVariable and UploadFile,
allowing for proper cleanup of orphaned files when variables are updated or deleted.
The MIME type of the stored content is recorded in `UploadFile.mime_type`.
Possible values are 'application/json' for non-string values serialized as JSON,
and 'text/plain' for string values stored as raw text.
"""
__tablename__ = "workflow_draft_variable_files"
# Primary key
id: Mapped[str] = mapped_column(
StringUUID,
primary_key=True,
default=uuidv7,
server_default=sa.text("uuidv7()"),
)
created_at: Mapped[datetime] = mapped_column(
DateTime,
nullable=False,
default=_naive_utc_datetime,
server_default=func.current_timestamp(),
)
tenant_id: Mapped[str] = mapped_column(
StringUUID,
nullable=False,
comment="The tenant to which the WorkflowDraftVariableFile belongs, referencing Tenant.id",
)
app_id: Mapped[str] = mapped_column(
StringUUID,
nullable=False,
comment="The application to which the WorkflowDraftVariableFile belongs, referencing App.id",
)
user_id: Mapped[str] = mapped_column(
StringUUID,
nullable=False,
comment="The owner to of the WorkflowDraftVariableFile, referencing Account.id",
)
# Reference to the `UploadFile.id` field
upload_file_id: Mapped[str] = mapped_column(
StringUUID,
nullable=False,
comment="Reference to UploadFile containing the large variable data",
)
# -------------- metadata about the variable content --------------
# The `size` is already recorded on `UploadFile`; it is duplicated here to avoid an additional database lookup.
size: Mapped[int] = mapped_column(
sa.BigInteger,
nullable=False,
comment="Size of the original variable content in bytes",
)
length: Mapped[int | None] = mapped_column(
sa.Integer,
nullable=True,
comment=(
"Length of the original variable content. For array and array-like types, "
"this represents the number of elements. For object types, it indicates the number of keys. "
"For other types, the value is NULL."
),
)
# The `value_type` field records the type of the original value.
value_type: Mapped[SegmentType] = mapped_column(
EnumText(SegmentType, length=20),
nullable=False,
)
# Relationship to UploadFile
upload_file: Mapped["UploadFile"] = orm.relationship(
foreign_keys=[upload_file_id],
lazy="raise",
uselist=False,
primaryjoin="WorkflowDraftVariableFile.upload_file_id == UploadFile.id",
)
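# A restore sketch for offloaded draft-variable content (assumes a configured
# `storage` instance; not part of this diff): the recorded MIME type distinguishes
# string values stored as raw text from everything else, which is JSON-encoded.
#
#   raw = storage.load(variable_file.upload_file.key)
#   if variable_file.upload_file.mime_type == "text/plain":
#       value = raw.decode("utf-8")
#   else:
#       value = json.loads(raw)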
def is_system_variable_editable(name: str) -> bool:
return name in _EDITABLE_SYSTEM_VARIABLE