fix(api): prevent node from running after pausing

2026-05-05 18:08:07 +08:00 · 2026-01-08 10:03:22 +08:00
parent 3c79bea28f
commit 2a6b6a873e
10 changed files with 787 additions and 12 deletions
--- a/api/core/workflow/graph_engine/event_management/event_handlers.py
+++ b/api/core/workflow/graph_engine/event_management/event_handlers.py
@ -195,9 +195,13 @@ class EventHandler:
            self._event_collector.collect(edge_event)

        # Enqueue ready nodes
-        for node_id in ready_nodes:
-            self._state_manager.enqueue_node(node_id)
-            self._state_manager.start_execution(node_id)
+        if self._graph_execution.is_paused:
+            for node_id in ready_nodes:
+                self._graph_runtime_state.register_deferred_node(node_id)
+        else:
+            for node_id in ready_nodes:
+                self._state_manager.enqueue_node(node_id)
+                self._state_manager.start_execution(node_id)

        # Update execution tracking
        self._state_manager.finish_execution(event.node_id)
--- a/api/core/workflow/graph_engine/graph_engine.py
+++ b/api/core/workflow/graph_engine/graph_engine.py
@ -317,8 +317,10 @@ class GraphEngine:
    def _start_execution(self, *, resume: bool = False) -> None:
        """Start execution subsystems."""
        paused_nodes: list[str] = []
+        deferred_nodes: list[str] = []
        if resume:
            paused_nodes = self._graph_runtime_state.consume_paused_nodes()
+            deferred_nodes = self._graph_runtime_state.consume_deferred_nodes()

        # Start worker pool (it calculates initial workers internally)
        self._worker_pool.start()
@ -334,7 +336,11 @@ class GraphEngine:
            self._state_manager.enqueue_node(root_node.id)
            self._state_manager.start_execution(root_node.id)
        else:
-            for node_id in paused_nodes:
+            seen_nodes: set[str] = set()
+            for node_id in paused_nodes + deferred_nodes:
+                if node_id in seen_nodes:
+                    continue
+                seen_nodes.add(node_id)
                self._state_manager.enqueue_node(node_id)
                self._state_manager.start_execution(node_id)

--- a/api/core/workflow/graph_engine/graph_state_manager.py
+++ b/api/core/workflow/graph_engine/graph_state_manager.py
@ -224,6 +224,8 @@ class GraphStateManager:
        Returns:
            Number of executing nodes
        """
+        # This count is a best-effort snapshot and can change concurrently.
+        # Only use it for pause-drain checks where scheduling is already frozen.
        with self._lock:
            return len(self._executing_nodes)

--- a/api/core/workflow/graph_engine/orchestration/dispatcher.py
+++ b/api/core/workflow/graph_engine/orchestration/dispatcher.py
@ -84,13 +84,16 @@ class Dispatcher:
        """Main dispatcher loop."""
        try:
            self._process_commands()
+            paused = False
            while not self._stop_event.is_set():
                if (
                    self._execution_coordinator.aborted
-                    or self._execution_coordinator.paused
                    or self._execution_coordinator.execution_complete
                ):
                    break
+                if self._execution_coordinator.paused:
+                    paused = True
+                    break

                self._execution_coordinator.check_scaling()
                try:
@ -102,13 +105,10 @@ class Dispatcher:
                    time.sleep(0.1)

            self._process_commands()
-            while True:
-                try:
-                    event = self._event_queue.get(block=False)
-                    self._event_handler.dispatch(event)
-                    self._event_queue.task_done()
-                except queue.Empty:
-                    break
+            if paused:
+                self._drain_events_until_idle()
+            else:
+                self._drain_event_queue()

        except Exception as e:
            logger.exception("Dispatcher error")
@ -123,3 +123,24 @@ class Dispatcher:
    def _process_commands(self, event: GraphNodeEventBase | None = None):
        if event is None or isinstance(event, self._COMMAND_TRIGGER_EVENTS):
            self._execution_coordinator.process_commands()
+
+    def _drain_event_queue(self) -> None:
+        """Dispatch every event already in the queue without blocking.
+
+        Returns as soon as a non-blocking ``get`` finds the queue empty.
+        """
+        while True:
+            try:
+                event = self._event_queue.get(block=False)
+                self._event_handler.dispatch(event)
+                # Mark the item done only after the handler has returned.
+                self._event_queue.task_done()
+            except queue.Empty:
+                # Nothing left to dispatch.
+                break
+
+    def _drain_events_until_idle(self) -> None:
+        """Keep dispatching events until no node is reported as executing.
+
+        Used by the dispatcher loop when it exits because of a pause. Events are
+        pulled with a short timeout; each time the queue stays empty for that long,
+        the coordinator is asked whether any node is still executing, and the loop
+        ends when none is (or when the stop event is set). A final non-blocking
+        drain then picks up anything still queued.
+        """
+        while not self._stop_event.is_set():
+            try:
+                # Bounded wait so the stop event and the idle check are re-evaluated regularly.
+                event = self._event_queue.get(timeout=0.1)
+                self._event_handler.dispatch(event)
+                self._event_queue.task_done()
+                # Commands are only processed for command-trigger events (see _process_commands).
+                self._process_commands(event)
+            except queue.Empty:
+                # NOTE(review): has_executing_nodes() raises AssertionError when execution is
+                # not paused; confirm that no command processed above can clear the paused state.
+                if not self._execution_coordinator.has_executing_nodes():
+                    break
+        # Final sweep for events that are still queued when the loop exits.
+        self._drain_event_queue()
--- a/api/core/workflow/graph_engine/orchestration/execution_coordinator.py
+++ b/api/core/workflow/graph_engine/orchestration/execution_coordinator.py
@ -94,3 +94,11 @@ class ExecutionCoordinator:

        self._worker_pool.stop()
        self._state_manager.clear_executing()
+
+    def has_executing_nodes(self) -> bool:
+        """Return True if any nodes are currently marked as executing.
+
+        Returns:
+            True when the state manager's executing count is greater than zero.
+
+        Raises:
+            AssertionError: If called while the graph execution is not paused.
+        """
+        # This check is only safe once execution has already paused.
+        # Before pause, executing state can change concurrently, which makes the result unreliable.
+        if not self._graph_execution.is_paused:
+            # Raised explicitly rather than via `assert`, so the guard is not removed under `python -O`.
+            raise AssertionError("has_executing_nodes should only be called after execution is paused")
+        return self._state_manager.get_executing_count() > 0
--- a/api/core/workflow/runtime/graph_runtime_state.py
+++ b/api/core/workflow/runtime/graph_runtime_state.py
@ -129,6 +129,7 @@ class _GraphRuntimeStateSnapshot:
    graph_execution_dump: str | None
    response_coordinator_dump: str | None
    paused_nodes: tuple[str, ...]
+    deferred_nodes: tuple[str, ...]


 class GraphRuntimeState:
@ -177,6 +178,7 @@ class GraphRuntimeState:
        self._pending_response_coordinator_dump: str | None = None
        self._pending_graph_execution_workflow_id: str | None = None
        self._paused_nodes: set[str] = set()
+        self._deferred_nodes: set[str] = set()
        # Tracks nodes that are being resumed in the current execution cycle.
        # Populated when paused nodes are consumed during resume.
        self._resuming_nodes: set[str] = set()
@ -321,6 +323,7 @@ class GraphRuntimeState:
            "ready_queue": self.ready_queue.dumps(),
            "graph_execution": self.graph_execution.dumps(),
            "paused_nodes": list(self._paused_nodes),
+            "deferred_nodes": list(self._deferred_nodes),
        }

        if self._response_coordinator is not None and self._graph is not None:
@ -370,6 +373,23 @@ class GraphRuntimeState:
        self._resuming_nodes.update(nodes)
        return nodes

+    def register_deferred_node(self, node_id: str) -> None:
+        """Record a node that became ready during pause and should resume later.
+
+        Args:
+            node_id: ID of the node to defer. The IDs are kept in a set, so
+                registering the same ID more than once has no further effect.
+        """
+
+        self._deferred_nodes.add(node_id)
+
+    def get_deferred_nodes(self) -> list[str]:
+        """Retrieve deferred nodes without mutating internal state.
+
+        Returns:
+            A new list holding the deferred node IDs. The list is built from a set,
+            so its order is not guaranteed.
+        """
+
+        return list(self._deferred_nodes)
+
+    def consume_deferred_nodes(self) -> list[str]:
+        """Retrieve and clear deferred nodes awaiting resume.
+
+        Returns:
+            A list of the node IDs that were deferred; the internal set is empty
+            afterwards. The list is built from a set, so its order is not guaranteed.
+        """
+
+        # Copy before clearing so the caller keeps the IDs.
+        nodes = list(self._deferred_nodes)
+        self._deferred_nodes.clear()
+        return nodes
+
    def consume_resuming_node(self, node_id: str) -> bool:
        """
        Return True iff `node_id` is in the resuming set and remove it.
@ -440,6 +460,7 @@ class GraphRuntimeState:
        graph_execution_payload = payload.get("graph_execution")
        response_payload = payload.get("response_coordinator")
        paused_nodes_payload = payload.get("paused_nodes", [])
+        deferred_nodes_payload = payload.get("deferred_nodes", [])

        return _GraphRuntimeStateSnapshot(
            start_at=start_at,
@ -453,6 +474,7 @@ class GraphRuntimeState:
            graph_execution_dump=graph_execution_payload,
            response_coordinator_dump=response_payload,
            paused_nodes=tuple(map(str, paused_nodes_payload)),
+            deferred_nodes=tuple(map(str, deferred_nodes_payload)),
        )

    def _apply_snapshot(self, snapshot: _GraphRuntimeStateSnapshot) -> None:
@ -468,6 +490,7 @@ class GraphRuntimeState:
        self._restore_graph_execution(snapshot.graph_execution_dump)
        self._restore_response_coordinator(snapshot.response_coordinator_dump)
        self._paused_nodes = set(snapshot.paused_nodes)
+        self._deferred_nodes = set(snapshot.deferred_nodes)

    def _restore_ready_queue(self, payload: str | None) -> None:
        if payload is not None: