refactor: move workflow package to dify_graph (#32844)

2026-04-22 03:37:44 +08:00 · 2026-03-02 18:42:30 +08:00
parent 9c33923985
commit c917838f9c
613 changed files with 2008 additions and 2012 deletions
--- a/api/dify_graph/graph_engine/layers/README.md
+++ b/api/dify_graph/graph_engine/layers/README.md
@ -0,0 +1,55 @@
+# Layers
+
+Pluggable middleware for engine extensions.
+
+## Components
+
+### GraphEngineLayer (base)
+
+Abstract base class for layers.
+
+- `initialize()` - Receive runtime context (runtime state is bound here and always available to hooks)
+- `on_graph_start()` - Execution start hook
+- `on_event()` - Process all events
+- `on_graph_end()` - Execution end hook
+
+### DebugLoggingLayer
+
+Comprehensive execution logging.
+
+- Configurable detail levels
+- Tracks execution statistics
+- Truncates long values
+
+## Usage
+
+```python
+debug_layer = DebugLoggingLayer(
+    level="INFO",
+    include_outputs=True
+)
+
+engine = GraphEngine(graph)
+engine.layer(debug_layer)
+engine.run()
+```
+
+`engine.layer()` binds the read-only runtime state before execution, so
+`graph_runtime_state` is always available inside layer hooks.
+
+## Custom Layers
+
+```python
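+class MetricsLayer(GraphEngineLayer):
+    def __init__(self):
+        super().__init__()
+        self.metrics = {}
+
+    def on_graph_start(self):
+        self.metrics.clear()
+
+    def on_event(self, event):
+        if isinstance(event, NodeRunSucceededEvent):
+            self.metrics[event.node_id] = event.elapsed_time
+
+    def on_graph_end(self, error):
+        pass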
+```
+
+## Configuration
+
+**DebugLoggingLayer Options:**
+
+- `level` - Log level (DEBUG, INFO, WARNING, ERROR)
+- `include_inputs` / `include_outputs` / `include_process_data` - Log node inputs, outputs, and process data
+- `max_value_length` - Truncate long values
--- a/api/dify_graph/graph_engine/layers/init.py
+++ b/api/dify_graph/graph_engine/layers/init.py
@ -0,0 +1,16 @@
+"""
+Layer system for GraphEngine extensibility.
+
+This module provides the layer infrastructure for extending GraphEngine functionality
+with middleware-like components that can observe events and interact with execution.
+"""
+
+from .base import GraphEngineLayer
+from .debug_logging import DebugLoggingLayer
+from .execution_limits import ExecutionLimitsLayer
+
+__all__ = [
+    "DebugLoggingLayer",
+    "ExecutionLimitsLayer",
+    "GraphEngineLayer",
+]
--- a/api/dify_graph/graph_engine/layers/base.py
+++ b/api/dify_graph/graph_engine/layers/base.py
@ -0,0 +1,128 @@
+"""
+Base layer class for GraphEngine extensions.
+
+This module provides the abstract base class for implementing layers that can
+intercept and respond to GraphEngine events.
+"""
+
+from abc import ABC, abstractmethod
+
+from dify_graph.graph_engine.protocols.command_channel import CommandChannel
+from dify_graph.graph_events import GraphEngineEvent, GraphNodeEventBase
+from dify_graph.nodes.base.node import Node
+from dify_graph.runtime import ReadOnlyGraphRuntimeState
+
+
+class GraphEngineLayerNotInitializedError(Exception):
+    """Raised when a layer's runtime state is accessed before initialization."""
+
+    def __init__(self, layer_name: str | None = None) -> None:
+        name = layer_name or "GraphEngineLayer"
+        super().__init__(f"{name} runtime state is not initialized. Bind the layer to a GraphEngine before access.")
+
+
+class GraphEngineLayer(ABC):
+    """
+    Abstract base class for GraphEngine layers.
+
+    Layers are middleware-like components that can:
+    - Observe all events emitted by the GraphEngine
+    - Access the graph runtime state
+    - Send commands to control execution
+
+    Subclasses should override the constructor to accept configuration parameters,
+    then implement the three lifecycle methods.
+    """
+
+    def __init__(self) -> None:
+        """Initialize the layer. Subclasses can override with custom parameters."""
+        self._graph_runtime_state: ReadOnlyGraphRuntimeState | None = None
+        self.command_channel: CommandChannel | None = None
+
+    @property
+    def graph_runtime_state(self) -> ReadOnlyGraphRuntimeState:
+        if self._graph_runtime_state is None:
+            raise GraphEngineLayerNotInitializedError(type(self).__name__)
+        return self._graph_runtime_state
+
+    def initialize(self, graph_runtime_state: ReadOnlyGraphRuntimeState, command_channel: CommandChannel) -> None:
+        """
+        Initialize the layer with engine dependencies.
+
+        Called by GraphEngine to inject the read-only runtime state and command channel.
+        This is invoked when the layer is registered with a `GraphEngine` instance.
+        Implementations should be idempotent.
+        Args:
+            graph_runtime_state: Read-only view of the runtime state
+            command_channel: Channel for sending commands to the engine
+        """
+        self._graph_runtime_state = graph_runtime_state
+        self.command_channel = command_channel
+
+    @abstractmethod
+    def on_graph_start(self) -> None:
+        """
+        Called when graph execution starts.
+
+        This is called after the engine has been initialized but before any nodes
+        are executed. Layers can use this to set up resources or log start information.
+        """
+        pass
+
+    @abstractmethod
+    def on_event(self, event: GraphEngineEvent) -> None:
+        """
+        Called for every event emitted by the engine.
+
+        This method receives all events generated during graph execution, including:
+        - Graph lifecycle events (start, success, failure)
+        - Node execution events (start, success, failure, retry)
+        - Stream events for response nodes
+        - Container events (iteration, loop)
+
+        Args:
+            event: The event emitted by the engine
+        """
+        pass
+
+    @abstractmethod
+    def on_graph_end(self, error: Exception | None) -> None:
+        """
+        Called when graph execution ends.
+
+        This is called after all nodes have been executed or when execution is
+        aborted. Layers can use this to clean up resources or log final state.
+
+        Args:
+            error: The exception that caused execution to fail, or None if successful
+        """
+        pass
+
+    def on_node_run_start(self, node: Node) -> None:
+        """
+        Called immediately before a node begins execution.
+
+        Layers can override to inject behavior (e.g., start spans) prior to node execution.
+        The node's execution ID is available via `node._node_execution_id` and will be
+        consistent with all events emitted by this node execution.
+
+        Args:
+            node: The node instance about to be executed
+        """
+        return
+
+    def on_node_run_end(
+        self, node: Node, error: Exception | None, result_event: GraphNodeEventBase | None = None
+    ) -> None:
+        """
+        Called after a node finishes execution.
+
+        The node's execution ID is available via `node._node_execution_id` and matches
+        the `id` field in all events emitted by this node execution.
+
+        Args:
+            node: The node instance that just finished execution
+            error: Exception instance if the node failed, otherwise None
+            result_event: The final result event from node execution (succeeded/failed/paused), if any
+        """
+        return
--- a/api/dify_graph/graph_engine/layers/debug_logging.py
+++ b/api/dify_graph/graph_engine/layers/debug_logging.py
@ -0,0 +1,247 @@
+"""
+Debug logging layer for GraphEngine.
+
+This module provides a layer that logs all events and state changes during
+graph execution for debugging purposes.
+"""
+
+import logging
+from collections.abc import Mapping
+from typing import Any, final
+
+from typing_extensions import override
+
+from dify_graph.graph_events import (
+    GraphEngineEvent,
+    GraphRunAbortedEvent,
+    GraphRunFailedEvent,
+    GraphRunPartialSucceededEvent,
+    GraphRunStartedEvent,
+    GraphRunSucceededEvent,
+    NodeRunExceptionEvent,
+    NodeRunFailedEvent,
+    NodeRunIterationFailedEvent,
+    NodeRunIterationNextEvent,
+    NodeRunIterationStartedEvent,
+    NodeRunIterationSucceededEvent,
+    NodeRunLoopFailedEvent,
+    NodeRunLoopNextEvent,
+    NodeRunLoopStartedEvent,
+    NodeRunLoopSucceededEvent,
+    NodeRunRetryEvent,
+    NodeRunStartedEvent,
+    NodeRunStreamChunkEvent,
+    NodeRunSucceededEvent,
+)
+
+from .base import GraphEngineLayer
+
+
+@final
+class DebugLoggingLayer(GraphEngineLayer):
+    """
+    A layer that provides comprehensive logging of GraphEngine execution.
+
+    This layer logs all events with configurable detail levels, helping developers
+    debug workflow execution and understand the flow of events.
+    """
+
+    def __init__(
+        self,
+        level: str = "INFO",
+        include_inputs: bool = False,
+        include_outputs: bool = True,
+        include_process_data: bool = False,
+        logger_name: str = "GraphEngine.Debug",
+        max_value_length: int = 500,
+    ) -> None:
+        """
+        Initialize the debug logging layer.
+
+        Args:
+            level: Logging level (DEBUG, INFO, WARNING, ERROR)
+            include_inputs: Whether to log node input values
+            include_outputs: Whether to log node output values
+            include_process_data: Whether to log node process data
+            logger_name: Name of the logger to use
+            max_value_length: Maximum length of logged values (truncated if longer)
+        """
+        super().__init__()
+        self.level = level
+        self.include_inputs = include_inputs
+        self.include_outputs = include_outputs
+        self.include_process_data = include_process_data
+        self.max_value_length = max_value_length
+
+        # Set up logger
+        self.logger = logging.getLogger(logger_name)
+        log_level = getattr(logging, level.upper(), logging.INFO)
+        self.logger.setLevel(log_level)
+
+        # Track execution stats
+        self.node_count = 0
+        self.success_count = 0
+        self.failure_count = 0
+        self.retry_count = 0
+
+    def _truncate_value(self, value: Any) -> str:
+        """Truncate long values for logging."""
+        str_value = str(value)
+        if len(str_value) > self.max_value_length:
+            return str_value[: self.max_value_length] + "... (truncated)"
+        return str_value
+
+    def _format_dict(self, data: dict[str, Any] | Mapping[str, Any]) -> str:
+        """Format a dictionary or mapping for logging with truncation."""
+        if not data:
+            return "{}"
+
+        formatted_items: list[str] = []
+        for key, value in data.items():
+            formatted_value = self._truncate_value(value)
+            formatted_items.append(f"  {key}: {formatted_value}")
+
+        return "{\n" + ",\n".join(formatted_items) + "\n}"
+
+    @override
+    def on_graph_start(self) -> None:
+        """Log graph execution start."""
+        self.logger.info("=" * 80)
+        self.logger.info("🚀 GRAPH EXECUTION STARTED")
+        self.logger.info("=" * 80)
+        # Log initial state
+        self.logger.info("Initial State:")
+
+    @override
+    def on_event(self, event: GraphEngineEvent) -> None:
+        """Log individual events based on their type."""
+        event_class = event.__class__.__name__
+
+        # Graph-level events
+        if isinstance(event, GraphRunStartedEvent):
+            self.logger.debug("Graph run started event")
+
+        elif isinstance(event, GraphRunSucceededEvent):
+            self.logger.info("✅ Graph run succeeded")
+            if self.include_outputs and event.outputs:
+                self.logger.info("  Final outputs: %s", self._format_dict(event.outputs))
+
+        elif isinstance(event, GraphRunPartialSucceededEvent):
+            self.logger.warning("⚠️ Graph run partially succeeded")
+            if event.exceptions_count > 0:
+                self.logger.warning("  Total exceptions: %s", event.exceptions_count)
+            if self.include_outputs and event.outputs:
+                self.logger.info("  Final outputs: %s", self._format_dict(event.outputs))
+
+        elif isinstance(event, GraphRunFailedEvent):
+            self.logger.error("❌ Graph run failed: %s", event.error)
+            if event.exceptions_count > 0:
+                self.logger.error("  Total exceptions: %s", event.exceptions_count)
+
+        elif isinstance(event, GraphRunAbortedEvent):
+            self.logger.warning("⚠️ Graph run aborted: %s", event.reason)
+            if event.outputs:
+                self.logger.info("  Partial outputs: %s", self._format_dict(event.outputs))
+
+        # Node-level events
+        # Retry before Started because Retry subclasses Started;
+        elif isinstance(event, NodeRunRetryEvent):
+            self.retry_count += 1
+            self.logger.warning("🔄 Node retry: %s (attempt %s)", event.node_id, event.retry_index)
+            self.logger.warning("  Previous error: %s", event.error)
+
+        elif isinstance(event, NodeRunStartedEvent):
+            self.node_count += 1
+            self.logger.info('▶️ Node started: %s - "%s" (type: %s)', event.node_id, event.node_title, event.node_type)
+
+            if self.include_inputs and event.node_run_result.inputs:
+                self.logger.debug("  Inputs: %s", self._format_dict(event.node_run_result.inputs))
+
+        elif isinstance(event, NodeRunSucceededEvent):
+            self.success_count += 1
+            self.logger.info("✅ Node succeeded: %s", event.node_id)
+
+            if self.include_outputs and event.node_run_result.outputs:
+                self.logger.debug("  Outputs: %s", self._format_dict(event.node_run_result.outputs))
+
+            if self.include_process_data and event.node_run_result.process_data:
+                self.logger.debug("  Process data: %s", self._format_dict(event.node_run_result.process_data))
+
+        elif isinstance(event, NodeRunFailedEvent):
+            self.failure_count += 1
+            self.logger.error("❌ Node failed: %s", event.node_id)
+            self.logger.error("  Error: %s", event.error)
+
+            if event.node_run_result.error:
+                self.logger.error("  Details: %s", event.node_run_result.error)
+
+        elif isinstance(event, NodeRunExceptionEvent):
+            self.logger.warning("⚠️ Node exception handled: %s", event.node_id)
+            self.logger.warning("  Error: %s", event.error)
+
+        elif isinstance(event, NodeRunStreamChunkEvent):
+            # Log stream chunks at debug level to avoid spam
+            final_indicator = " (FINAL)" if event.is_final else ""
+            self.logger.debug(
+                "📝 Stream chunk from %s%s: %s", event.node_id, final_indicator, self._truncate_value(event.chunk)
+            )
+
+        # Iteration events
+        elif isinstance(event, NodeRunIterationStartedEvent):
+            self.logger.info("🔁 Iteration started: %s", event.node_id)
+
+        elif isinstance(event, NodeRunIterationNextEvent):
+            self.logger.debug("  Iteration next: %s (index: %s)", event.node_id, event.index)
+
+        elif isinstance(event, NodeRunIterationSucceededEvent):
+            self.logger.info("✅ Iteration succeeded: %s", event.node_id)
+            if self.include_outputs and event.outputs:
+                self.logger.debug("  Outputs: %s", self._format_dict(event.outputs))
+
+        elif isinstance(event, NodeRunIterationFailedEvent):
+            self.logger.error("❌ Iteration failed: %s", event.node_id)
+            self.logger.error("  Error: %s", event.error)
+
+        # Loop events
+        elif isinstance(event, NodeRunLoopStartedEvent):
+            self.logger.info("🔄 Loop started: %s", event.node_id)
+
+        elif isinstance(event, NodeRunLoopNextEvent):
+            self.logger.debug("  Loop iteration: %s (index: %s)", event.node_id, event.index)
+
+        elif isinstance(event, NodeRunLoopSucceededEvent):
+            self.logger.info("✅ Loop succeeded: %s", event.node_id)
+            if self.include_outputs and event.outputs:
+                self.logger.debug("  Outputs: %s", self._format_dict(event.outputs))
+
+        elif isinstance(event, NodeRunLoopFailedEvent):
+            self.logger.error("❌ Loop failed: %s", event.node_id)
+            self.logger.error("  Error: %s", event.error)
+
+        else:
+            # Log unknown events at debug level
+            self.logger.debug("Event: %s", event_class)
+
+    @override
+    def on_graph_end(self, error: Exception | None) -> None:
+        """Log graph execution end with summary statistics."""
+        self.logger.info("=" * 80)
+
+        if error:
+            self.logger.error("🔴 GRAPH EXECUTION FAILED")
+            self.logger.error("  Error: %s", error)
+        else:
+            self.logger.info("🎉 GRAPH EXECUTION COMPLETED SUCCESSFULLY")
+
+        # Log execution statistics
+        self.logger.info("Execution Statistics:")
+        self.logger.info("  Total nodes executed: %s", self.node_count)
+        self.logger.info("  Successful nodes: %s", self.success_count)
+        self.logger.info("  Failed nodes: %s", self.failure_count)
+        self.logger.info("  Node retries: %s", self.retry_count)
+
+        # Log final state if available
+        if self.include_outputs and self.graph_runtime_state.outputs:
+            self.logger.info("Final outputs: %s", self._format_dict(self.graph_runtime_state.outputs))
+
+        self.logger.info("=" * 80)
--- a/api/dify_graph/graph_engine/layers/execution_limits.py
+++ b/api/dify_graph/graph_engine/layers/execution_limits.py
@ -0,0 +1,150 @@
+"""
+Execution limits layer for GraphEngine.
+
+This layer monitors workflow execution to enforce limits on:
+- Maximum execution steps
+- Maximum execution time
+
+When limits are exceeded, the layer automatically aborts execution.
+"""
+
+import logging
+import time
+from enum import StrEnum
+from typing import final
+
+from typing_extensions import override
+
+from dify_graph.graph_engine.entities.commands import AbortCommand, CommandType
+from dify_graph.graph_engine.layers import GraphEngineLayer
+from dify_graph.graph_events import (
+    GraphEngineEvent,
+    NodeRunStartedEvent,
+)
+from dify_graph.graph_events.node import NodeRunFailedEvent, NodeRunSucceededEvent
+
+
+class LimitType(StrEnum):
+    """Types of execution limits that can be exceeded."""
+
+    # Passed to the abort path when the counted node executions exceed `max_steps`.
+    STEP_LIMIT = "step_limit"
+    # Passed to the abort path when the elapsed run time exceeds `max_time`.
+    TIME_LIMIT = "time_limit"
+
+
+@final
+class ExecutionLimitsLayer(GraphEngineLayer):
+    """
+    Layer that enforces execution limits for workflows.
+
+    Monitors:
+    - Step count: Tracks number of node executions
+    - Time limit: Monitors total execution time
+
+    Automatically aborts execution when limits are exceeded.
+    """
+
+    def __init__(self, max_steps: int, max_time: int) -> None:
+        """
+        Initialize the execution limits layer.
+
+        Args:
+            max_steps: Maximum number of execution steps allowed
+            max_time: Maximum execution time in seconds allowed
+        """
+        super().__init__()
+        self.max_steps = max_steps
+        self.max_time = max_time
+
+        # Runtime tracking
+        self.start_time: float | None = None
+        self.step_count = 0
+        self.logger = logging.getLogger(__name__)
+
+        # State tracking
+        self._execution_started = False
+        self._execution_ended = False
+        self._abort_sent = False  # Track if abort command has been sent
+
+    @override
+    def on_graph_start(self) -> None:
+        """Called when graph execution starts."""
+        self.start_time = time.time()
+        self.step_count = 0
+        self._execution_started = True
+        self._execution_ended = False
+        self._abort_sent = False
+
+        self.logger.debug("Execution limits monitoring started")
+
+    @override
+    def on_event(self, event: GraphEngineEvent) -> None:
+        """
+        Called for every event emitted by the engine.
+
+        Monitors execution progress and enforces limits.
+        """
+        if not self._execution_started or self._execution_ended or self._abort_sent:
+            return
+
+        # Track step count for node execution events
+        if isinstance(event, NodeRunStartedEvent):
+            self.step_count += 1
+            self.logger.debug("Step %d started: %s", self.step_count, event.node_id)
+
+        # Check step limit when node execution completes
+        if isinstance(event, NodeRunSucceededEvent | NodeRunFailedEvent):
+            if self._reached_step_limitation():
+                self._send_abort_command(LimitType.STEP_LIMIT)
+
+            if self._reached_time_limitation():
+                self._send_abort_command(LimitType.TIME_LIMIT)
+
+    @override
+    def on_graph_end(self, error: Exception | None) -> None:
+        """Called when graph execution ends."""
+        if self._execution_started and not self._execution_ended:
+            self._execution_ended = True
+
+            if self.start_time:
+                total_time = time.time() - self.start_time
+                self.logger.debug("Execution completed: %d steps in %.2f seconds", self.step_count, total_time)
+
+    def _reached_step_limitation(self) -> bool:
+        """Check if step count limit has been exceeded."""
+        return self.step_count > self.max_steps
+
+    def _reached_time_limitation(self) -> bool:
+        """Check if time limit has been exceeded."""
+        return self.start_time is not None and (time.time() - self.start_time) > self.max_time
+
+    def _send_abort_command(self, limit_type: LimitType) -> None:
+        """
+        Send abort command due to limit violation.
+
+        Args:
+            limit_type: Type of limit exceeded
+        """
+        if not self.command_channel or not self._execution_started or self._execution_ended or self._abort_sent:
+            return
+
+        # Format detailed reason message
+        if limit_type == LimitType.STEP_LIMIT:
+            reason = f"Maximum execution steps exceeded: {self.step_count} > {self.max_steps}"
+        elif limit_type == LimitType.TIME_LIMIT:
+            elapsed_time = time.time() - self.start_time if self.start_time else 0
+            reason = f"Maximum execution time exceeded: {elapsed_time:.2f}s > {self.max_time}s"
+
+        self.logger.warning("Execution limit exceeded: %s", reason)
+
+        try:
+            # Send abort command to the engine
+            abort_command = AbortCommand(command_type=CommandType.ABORT, reason=reason)
+            self.command_channel.send_command(abort_command)
+
+            # Mark that abort has been sent to prevent duplicate commands
+            self._abort_sent = True
+
+            self.logger.debug("Abort command sent to engine")
+
+        except Exception:
+            self.logger.exception("Failed to send abort command")