Mirror of https://github.com/langgenius/dify.git (synced 2026-05-05 18:08:07 +08:00)
fix(api): prevent node from running after pausing
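When a pause command arrived while the graph was running, nodes that became ready could still be enqueued and started, so execution continued past the pause. This change (1) defers nodes that become ready while the graph is paused, (2) has the dispatcher drain in-flight events until no node is executing before the pause completes, and (3) re-enqueues paused and deferred nodes, de-duplicated, on resume. The deferred set is serialized with the runtime-state snapshot so paused runs survive a restart.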
@@ -195,9 +195,13 @@ class EventHandler:
         self._event_collector.collect(edge_event)

         # Enqueue ready nodes
-        for node_id in ready_nodes:
-            self._state_manager.enqueue_node(node_id)
-            self._state_manager.start_execution(node_id)
+        if self._graph_execution.is_paused:
+            for node_id in ready_nodes:
+                self._graph_runtime_state.register_deferred_node(node_id)
+        else:
+            for node_id in ready_nodes:
+                self._state_manager.enqueue_node(node_id)
+                self._state_manager.start_execution(node_id)

         # Update execution tracking
         self._state_manager.finish_execution(event.node_id)
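The EventHandler change above is the heart of the fix: once `is_paused` is set, nodes whose upstream edges complete are recorded via `register_deferred_node` instead of being scheduled. A minimal self-contained sketch of that defer-on-pause pattern (the `ToyScheduler` name and shape are invented for illustration; this is not dify's API):

from dataclasses import dataclass, field


@dataclass
class ToyScheduler:
    paused: bool = False
    ready_queue: list[str] = field(default_factory=list)
    deferred: set[str] = field(default_factory=set)

    def on_nodes_ready(self, node_ids: list[str]) -> None:
        if self.paused:
            self.deferred.update(node_ids)   # remember, don't run
        else:
            self.ready_queue.extend(node_ids)  # normal scheduling

    def resume(self) -> None:
        self.paused = False
        self.on_nodes_ready(sorted(self.deferred))
        self.deferred.clear()


scheduler = ToyScheduler(paused=True)
scheduler.on_nodes_ready(["llm", "code"])
assert scheduler.ready_queue == []               # nothing starts while paused
scheduler.resume()
assert scheduler.ready_queue == ["code", "llm"]  # deferred nodes run on resume

On resume the deferred set is replayed through the normal scheduling path, which is exactly what `_start_execution(resume=True)` does in the next hunk.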
@@ -317,8 +317,10 @@ class GraphEngine:
     def _start_execution(self, *, resume: bool = False) -> None:
         """Start execution subsystems."""
         paused_nodes: list[str] = []
+        deferred_nodes: list[str] = []
         if resume:
             paused_nodes = self._graph_runtime_state.consume_paused_nodes()
+            deferred_nodes = self._graph_runtime_state.consume_deferred_nodes()

         # Start worker pool (it calculates initial workers internally)
         self._worker_pool.start()
@@ -334,7 +336,11 @@ class GraphEngine:
             self._state_manager.enqueue_node(root_node.id)
             self._state_manager.start_execution(root_node.id)
         else:
-            for node_id in paused_nodes:
+            seen_nodes: set[str] = set()
+            for node_id in paused_nodes + deferred_nodes:
+                if node_id in seen_nodes:
+                    continue
+                seen_nodes.add(node_id)
                 self._state_manager.enqueue_node(node_id)
                 self._state_manager.start_execution(node_id)
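On resume, the engine now replays both `paused_nodes` and `deferred_nodes`, and the `seen_nodes` guard ensures a node appearing in both lists is enqueued only once, preserving order. The same order-preserving de-duplication can be written more compactly (illustrative values):

# Order-preserving de-duplication, equivalent to the seen_nodes loop above.
paused_nodes = ["a", "b", "c"]
deferred_nodes = ["b", "d"]
unique = list(dict.fromkeys(paused_nodes + deferred_nodes))
assert unique == ["a", "b", "c", "d"]  # "b" is enqueued only once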
@@ -224,6 +224,8 @@ class GraphStateManager:
         Returns:
             Number of executing nodes
         """
+        # This count is a best-effort snapshot and can change concurrently.
+        # Only use it for pause-drain checks where scheduling is already frozen.
         with self._lock:
             return len(self._executing_nodes)
@@ -84,13 +84,16 @@ class Dispatcher:
         """Main dispatcher loop."""
         try:
             self._process_commands()
+            paused = False
             while not self._stop_event.is_set():
                 if (
                     self._execution_coordinator.aborted
-                    or self._execution_coordinator.paused
                     or self._execution_coordinator.execution_complete
                 ):
                     break
+                if self._execution_coordinator.paused:
+                    paused = True
+                    break

                 self._execution_coordinator.check_scaling()
                 try:
@@ -102,13 +105,10 @@ class Dispatcher:
                     time.sleep(0.1)

             self._process_commands()
-            while True:
-                try:
-                    event = self._event_queue.get(block=False)
-                    self._event_handler.dispatch(event)
-                    self._event_queue.task_done()
-                except queue.Empty:
-                    break
+            if paused:
+                self._drain_events_until_idle()
+            else:
+                self._drain_event_queue()

         except Exception as e:
             logger.exception("Dispatcher error")
@@ -123,3 +123,24 @@ class Dispatcher:
     def _process_commands(self, event: GraphNodeEventBase | None = None):
         if event is None or isinstance(event, self._COMMAND_TRIGGER_EVENTS):
             self._execution_coordinator.process_commands()
+
+    def _drain_event_queue(self) -> None:
+        while True:
+            try:
+                event = self._event_queue.get(block=False)
+                self._event_handler.dispatch(event)
+                self._event_queue.task_done()
+            except queue.Empty:
+                break
+
+    def _drain_events_until_idle(self) -> None:
+        while not self._stop_event.is_set():
+            try:
+                event = self._event_queue.get(timeout=0.1)
+                self._event_handler.dispatch(event)
+                self._event_queue.task_done()
+                self._process_commands(event)
+            except queue.Empty:
+                if not self._execution_coordinator.has_executing_nodes():
+                    break
+        self._drain_event_queue()
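When the loop exits because of a pause, the dispatcher cannot just drain whatever already sits in the queue: workers may still be finishing nodes that were in flight when the pause arrived, and their completion events must be applied before any snapshot is taken. `_drain_events_until_idle` therefore keeps polling until the queue is empty and no node is still executing; that is also why `has_executing_nodes` (next hunk) asserts the graph is already paused, since only then is the count stable. A standalone sketch of the loop shape, with invented event strings and state:

import queue

events: queue.Queue[str] = queue.Queue()
executing = {"node-1"}  # node-1 was mid-flight when the pause arrived
events.put("node-1:succeeded")

while True:
    try:
        event = events.get(timeout=0.1)
        # Fold the completion into state so the snapshot will see it.
        node_id, _, status = event.partition(":")
        if status == "succeeded":
            executing.discard(node_id)
    except queue.Empty:
        if not executing:  # queue drained and nothing running: idle
            break

assert not executing

Unlike this toy, the real loop is also bounded by `_stop_event`, so shutdown can interrupt the wait.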
@@ -94,3 +94,11 @@ class ExecutionCoordinator:

         self._worker_pool.stop()
         self._state_manager.clear_executing()
+
+    def has_executing_nodes(self) -> bool:
+        """Return True if any nodes are currently marked as executing."""
+        # This check is only safe once execution has already paused.
+        # Before pause, executing state can change concurrently, which makes the result unreliable.
+        if not self._graph_execution.is_paused:
+            raise AssertionError("has_executing_nodes should only be called after execution is paused")
+        return self._state_manager.get_executing_count() > 0
@@ -129,6 +129,7 @@ class _GraphRuntimeStateSnapshot:
     graph_execution_dump: str | None
     response_coordinator_dump: str | None
     paused_nodes: tuple[str, ...]
+    deferred_nodes: tuple[str, ...]


 class GraphRuntimeState:
@@ -177,6 +178,7 @@ class GraphRuntimeState:
         self._pending_response_coordinator_dump: str | None = None
         self._pending_graph_execution_workflow_id: str | None = None
         self._paused_nodes: set[str] = set()
+        self._deferred_nodes: set[str] = set()
         # Tracks nodes that are being resumed in the current execution cycle.
         # Populated when paused nodes are consumed during resume.
         self._resuming_nodes: set[str] = set()
@@ -321,6 +323,7 @@ class GraphRuntimeState:
             "ready_queue": self.ready_queue.dumps(),
             "graph_execution": self.graph_execution.dumps(),
             "paused_nodes": list(self._paused_nodes),
+            "deferred_nodes": list(self._deferred_nodes),
         }

         if self._response_coordinator is not None and self._graph is not None:
@@ -370,6 +373,23 @@ class GraphRuntimeState:
         self._resuming_nodes.update(nodes)
         return nodes

+    def register_deferred_node(self, node_id: str) -> None:
+        """Record a node that became ready during pause and should resume later."""
+
+        self._deferred_nodes.add(node_id)
+
+    def get_deferred_nodes(self) -> list[str]:
+        """Retrieve deferred nodes without mutating internal state."""
+
+        return list(self._deferred_nodes)
+
+    def consume_deferred_nodes(self) -> list[str]:
+        """Retrieve and clear deferred nodes awaiting resume."""
+
+        nodes = list(self._deferred_nodes)
+        self._deferred_nodes.clear()
+        return nodes
+
     def consume_resuming_node(self, node_id: str) -> bool:
         """
         Return True iff `node_id` is in the resuming set and remove it.
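The three accessors form a register / peek / consume contract: `get_deferred_nodes` is a side-effect-free read (suitable for serialization), while `consume_deferred_nodes` hands the nodes to the caller and clears the set so a deferred node is resumed at most once. A toy equivalent of that contract (standalone, not the GraphRuntimeState implementation):

class DeferredSet:
    def __init__(self) -> None:
        self._nodes: set[str] = set()

    def register(self, node_id: str) -> None:
        self._nodes.add(node_id)

    def peek(self) -> list[str]:
        return list(self._nodes)   # non-destructive read, used for dumps

    def consume(self) -> list[str]:
        nodes = list(self._nodes)
        self._nodes.clear()        # caller now owns the nodes
        return nodes


d = DeferredSet()
d.register("node-a")
assert d.peek() == ["node-a"]     # still held
assert d.consume() == ["node-a"]
assert d.peek() == []             # cleared: a node resumes at most once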
@@ -440,6 +460,7 @@ class GraphRuntimeState:
         graph_execution_payload = payload.get("graph_execution")
         response_payload = payload.get("response_coordinator")
         paused_nodes_payload = payload.get("paused_nodes", [])
+        deferred_nodes_payload = payload.get("deferred_nodes", [])

         return _GraphRuntimeStateSnapshot(
             start_at=start_at,
@@ -453,6 +474,7 @@ class GraphRuntimeState:
             graph_execution_dump=graph_execution_payload,
             response_coordinator_dump=response_payload,
             paused_nodes=tuple(map(str, paused_nodes_payload)),
+            deferred_nodes=tuple(map(str, deferred_nodes_payload)),
         )

     def _apply_snapshot(self, snapshot: _GraphRuntimeStateSnapshot) -> None:
@@ -468,6 +490,7 @@ class GraphRuntimeState:
         self._restore_graph_execution(snapshot.graph_execution_dump)
         self._restore_response_coordinator(snapshot.response_coordinator_dump)
         self._paused_nodes = set(snapshot.paused_nodes)
+        self._deferred_nodes = set(snapshot.deferred_nodes)

     def _restore_ready_queue(self, payload: str | None) -> None:
         if payload is not None:
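With `deferred_nodes` written by `dumps` and restored in `_apply_snapshot`, a pause taken mid-graph survives serialization: a process that resumes from the snapshot sees the same deferred set the paused process held in memory. A minimal round-trip sketch of that idea using plain `json` (field names match the payload keys above; everything else is invented):

import json

# Serialize the pause-relevant fields, as dumps() does above.
state = {"paused_nodes": ["node-a"], "deferred_nodes": ["node-b"]}
payload = json.dumps(state)

# ...process restarts and loads the snapshot...
restored = json.loads(payload)
paused = set(map(str, restored.get("paused_nodes", [])))
# The [] default mirrors the loads path: snapshots written before this
# change simply restore an empty deferred set.
deferred = set(map(str, restored.get("deferred_nodes", [])))

assert paused == {"node-a"} and deferred == {"node-b"}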