Files
dify/api/services/workflow/node_output_inspector_service.py
zyssyz123 fb07b43107 feat(api): Node Output Inspector service + 3 REST endpoints (Stage 4 §8) (#36644)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
2026-05-26 07:34:33 +00:00

713 lines
28 KiB
Python

"""Node Output Inspector service (Stage 4 §8).
PRD §Node Output Inspector renames every workflow "Variable" to a "Node Output"
and re-organizes the panel by **producer node** rather than consumer node. This
service backs the new REST surface
``/apps/{app_id}/workflows/draft/runs/{run_id}/node-outputs[/...]`` with three
read-only views:
* :meth:`snapshot_workflow_run` — every node + its declared outputs + per-output
status, for one debug workflow run.
* :meth:`node_detail` — the same shape filtered down to one node.
* :meth:`output_preview` — full payload for one output, with signed download
URL when the output references an upload file.
Design constraints baked into this version:
1. **No new tables** (§8.1). Topology comes from ``WorkflowRun.graph`` (the
graph snapshot taken at execution time so the view stays consistent even
if the draft was edited mid-run). Execution facts come from
``WorkflowNodeExecutionModel`` rows already produced by the workflow
runtime.
2. **Draft + published runs** (decision D-1 lifted 2026-05-26). The Inspector
accepts ``WorkflowRunTriggeredFrom.DEBUGGING`` (draft test runs) as well as
``APP_RUN`` / ``WEBHOOK`` / ``SCHEDULE`` / ``PLUGIN`` / ``RAG_PIPELINE_*``
triggers; the underlying graph + executions are uniform across all of them.
Cross-tenant / cross-app rows still 404 via the standard tenant/app scope.
3. **Declared outputs by node kind**:
* Agent v2 nodes resolve their declared list via
:class:`WorkflowAgentBindingResolver` (the binding owns the canonical
``DeclaredOutputConfig`` list and falls back to PRD defaults when empty).
* Other node kinds don't have a declared-output schema yet; we surface the
keys present in the execution payload as a best-effort list typed
``unknown`` so the panel can still render them.
4. **Per-output status** is derived from the metadata the agent_v2 stack
already records (``output_type_check`` and ``output_check`` blobs); falling
back to the node's overall status when those signals aren't present.
5. **SSE stream** (design §8.5) lives in
:mod:`controllers.console.app.workflow_node_output_inspector` alongside the
REST endpoints. The Inspector and the babysit chat SSE share the
``{event, data, id}`` envelope per decision D-5.
"""
from __future__ import annotations
import json
import logging
from collections.abc import Mapping, Sequence
from dataclasses import dataclass
from datetime import datetime
from enum import StrEnum
from typing import Any
from pydantic import BaseModel, ConfigDict, Field
from sqlalchemy import select
from core.db.session_factory import session_factory
from core.workflow.nodes.agent_v2.binding_resolver import (
WorkflowAgentBindingError,
WorkflowAgentBindingResolver,
)
from core.workflow.nodes.agent_v2.runtime_request_builder import (
WorkflowAgentRuntimeRequestBuilder,
)
from graphon.enums import (
BuiltinNodeTypes,
WorkflowExecutionStatus,
WorkflowNodeExecutionStatus,
)
from graphon.file import helpers as file_helpers
from models import App
from models.agent_config_entities import DeclaredOutputConfig, DeclaredOutputType
from models.workflow import WorkflowNodeExecutionModel, WorkflowRun
logger = logging.getLogger(__name__)
# ──────────────────────────────────────────────────────────────────────────────
# Public dataclasses / enums (Pydantic — these go straight on the wire)
# ──────────────────────────────────────────────────────────────────────────────
class NodeOutputStatus(StrEnum):
"""Lifecycle status of a single declared output within a run."""
PENDING = "pending" # node not started
RUNNING = "running" # node running, output not ready yet
READY = "ready"
TYPE_CHECK_FAILED = "type_check_failed"
OUTPUT_CHECK_FAILED = "output_check_failed"
NOT_PRODUCED = "not_produced" # node succeeded but did not produce this declared output
FAILED = "failed" # node itself failed/exception/stopped
class NodeStatus(StrEnum):
"""Coarse node-level status used by Inspector to pick a banner."""
IDLE = "idle"
RUNNING = "running"
READY = "ready"
FAILED = "failed"
class CheckResultView(BaseModel):
"""``type_check`` / ``output_check`` per-output summary block."""
model_config = ConfigDict(extra="forbid")
passed: bool
reason: str | None = None
class NodeOutputView(BaseModel):
model_config = ConfigDict(extra="forbid")
name: str
type: DeclaredOutputType | None = None
status: NodeOutputStatus
value_preview: Any = None
type_check: CheckResultView | None = None
output_check: CheckResultView | None = None
retried: int = 0
class NodeOutputsView(BaseModel):
model_config = ConfigDict(extra="forbid")
node_id: str
node_kind: str
node_display_name: str
node_status: NodeStatus
node_started_at: datetime | None = None
node_completed_at: datetime | None = None
outputs: list[NodeOutputView] = Field(default_factory=list)
class WorkflowRunSnapshotView(BaseModel):
model_config = ConfigDict(extra="forbid")
workflow_run_id: str
workflow_run_status: WorkflowExecutionStatus
node_outputs: list[NodeOutputsView] = Field(default_factory=list)
class OutputPreviewView(BaseModel):
model_config = ConfigDict(extra="forbid")
node_id: str
output_name: str
type: DeclaredOutputType | None = None
status: NodeOutputStatus
value: Any = None # full value (with signed URL for file refs)
class NodeOutputInspectorError(Exception):
"""Raised when a request cannot be served (404-level conditions)."""
def __init__(self, code: str, message: str) -> None:
super().__init__(message)
self.code = code
# ──────────────────────────────────────────────────────────────────────────────
# Internal helpers — declared outputs per node
# ──────────────────────────────────────────────────────────────────────────────
@dataclass(frozen=True, slots=True)
class _ResolvedDeclaration:
"""Declared output the Inspector should expose, with a normalized type.
``inferred`` is ``True`` when the node kind has no declared-output schema
(we derived the list from the execution payload). The frontend can use
this to dim the type column.
"""
name: str
declared_type: DeclaredOutputType | None
inferred: bool
def _is_agent_v2_node(node: Mapping[str, Any]) -> bool:
"""A graph node is Agent v2 iff its ``data.type`` is the AGENT builtin
AND its ``data.version`` is ``"2"``.
``BuiltinNodeTypes.AGENT`` is a ``ClassVar[NodeType]`` (plain string), not
a StrEnum, so we compare against it directly without ``.value``.
"""
data = node.get("data") or {}
if not isinstance(data, Mapping):
return False
if data.get("type") != BuiltinNodeTypes.AGENT:
return False
return str(data.get("version", "")) == "2"
def _graph_nodes(workflow_run: WorkflowRun) -> list[Mapping[str, Any]]:
"""Parse ``WorkflowRun.graph`` (LongText JSON) into a node list.
The graph blob may be missing / unparseable for very old runs; we treat
that as "no nodes" rather than failing the Inspector, so the panel still
renders an empty state.
"""
if not workflow_run.graph:
return []
try:
parsed = json.loads(workflow_run.graph)
except (json.JSONDecodeError, TypeError):
logger.warning("NodeOutputInspector: workflow_run %s has unparseable graph blob", workflow_run.id)
return []
if not isinstance(parsed, Mapping):
return []
nodes = parsed.get("nodes")
if not isinstance(nodes, list):
return []
return [n for n in nodes if isinstance(n, Mapping) and "id" in n]
# ──────────────────────────────────────────────────────────────────────────────
# Internal helpers — per-output status derivation
# ──────────────────────────────────────────────────────────────────────────────
def _decode_json_blob(blob: str | None) -> Mapping[str, Any] | None:
if not blob:
return None
try:
decoded = json.loads(blob)
except (json.JSONDecodeError, TypeError):
return None
if not isinstance(decoded, Mapping):
return None
return decoded
def _node_status_for(execution: WorkflowNodeExecutionModel | None) -> NodeStatus:
if execution is None:
return NodeStatus.IDLE
if execution.status == WorkflowNodeExecutionStatus.RUNNING:
return NodeStatus.RUNNING
if execution.status == WorkflowNodeExecutionStatus.SUCCEEDED:
return NodeStatus.READY
return NodeStatus.FAILED
def _type_check_by_name(metadata: Mapping[str, Any] | None) -> dict[str, Mapping[str, Any]]:
if not metadata:
return {}
block = metadata.get("output_type_check")
if not isinstance(block, Mapping):
return {}
results = block.get("results") or []
if not isinstance(results, list):
return {}
indexed: dict[str, Mapping[str, Any]] = {}
for r in results:
if isinstance(r, Mapping) and isinstance(r.get("name"), str):
indexed[r["name"]] = r
return indexed
def _output_check_by_name(metadata: Mapping[str, Any] | None) -> dict[str, Mapping[str, Any]]:
if not metadata:
return {}
block = metadata.get("output_check")
if not isinstance(block, Mapping):
return {}
results = block.get("results") or []
if not isinstance(results, list):
return {}
indexed: dict[str, Mapping[str, Any]] = {}
for r in results:
if isinstance(r, Mapping) and isinstance(r.get("name"), str):
indexed[r["name"]] = r
return indexed
def _retried_attempt_count(metadata: Mapping[str, Any] | None) -> int:
"""The agent_v2 runtime records the final attempt index in metadata.
``attempt`` is 0-indexed so a single execution with no retry has
``attempt == 0`` and a Inspector-friendly ``retried == 0``.
"""
if not metadata:
return 0
attempt = metadata.get("attempt")
if isinstance(attempt, int) and attempt > 0:
return attempt
return 0
# ──────────────────────────────────────────────────────────────────────────────
# Value preview (file refs get signed URLs)
# ──────────────────────────────────────────────────────────────────────────────
_PREVIEW_TEXT_LIMIT = 500
_FILE_ID_KEYS: tuple[str, ...] = ("file_id", "upload_file_id", "tool_file_id")
def _looks_like_file_ref(value: Any) -> str | None:
"""Return the resolved ``file_id`` when ``value`` is a file-shaped dict."""
if not isinstance(value, Mapping):
return None
for key in _FILE_ID_KEYS:
candidate = value.get(key)
if isinstance(candidate, str) and candidate:
return candidate
return None
def _value_preview(value: Any) -> Any:
"""Compact preview suitable for the snapshot endpoint.
File refs are augmented with a signed download URL so the panel can render
a thumbnail / link without a second round-trip; long strings are truncated;
other scalar / dict / list shapes are returned as-is (the Pydantic layer
enforces JSON-safety on serialization).
"""
file_id = _looks_like_file_ref(value)
if file_id:
assert isinstance(value, Mapping)
try:
preview_url = file_helpers.get_signed_file_url(upload_file_id=file_id)
except Exception:
logger.warning("NodeOutputInspector: signed URL failed for file_id=%s", file_id, exc_info=True)
preview_url = None
return {**dict(value), "preview_url": preview_url}
if isinstance(value, str) and len(value) > _PREVIEW_TEXT_LIMIT:
return value[:_PREVIEW_TEXT_LIMIT] + ""
return value
def _full_value(value: Any) -> Any:
"""Same shape as :func:`_value_preview` minus the truncation."""
file_id = _looks_like_file_ref(value)
if file_id:
assert isinstance(value, Mapping)
try:
preview_url = file_helpers.get_signed_file_url(upload_file_id=file_id)
except Exception:
logger.warning("NodeOutputInspector: signed URL failed for file_id=%s", file_id, exc_info=True)
preview_url = None
return {**dict(value), "preview_url": preview_url}
return value
# ──────────────────────────────────────────────────────────────────────────────
# Service
# ──────────────────────────────────────────────────────────────────────────────
class NodeOutputInspectorService:
"""Read-only Inspector for draft + published workflow runs.
The service is dependency-light: it holds a single
:class:`WorkflowAgentBindingResolver` so agent v2 nodes can map to their
declared outputs without re-implementing binding lookup. All other I/O
uses the global session factory so workflow runs / executions stay on the
repo-default code path.
Tenancy is enforced via ``app_model.tenant_id`` + ``app_model.id`` on
every load — the same scope guard regardless of trigger source.
"""
def __init__(self, binding_resolver: WorkflowAgentBindingResolver | None = None) -> None:
self._binding_resolver = binding_resolver or WorkflowAgentBindingResolver()
# ── public API ────────────────────────────────────────────────────────
def snapshot_workflow_run(self, *, app_model: App, workflow_run_id: str) -> WorkflowRunSnapshotView:
"""Build the per-node snapshot for one debug workflow run."""
workflow_run, executions = self._load_run_and_executions(app_model=app_model, workflow_run_id=workflow_run_id)
executions_by_node = self._index_executions_by_node(executions)
graph_nodes = _graph_nodes(workflow_run)
node_views: list[NodeOutputsView] = []
for raw_node in graph_nodes:
node_id = str(raw_node["id"])
execution = executions_by_node.get(node_id)
view = self._build_node_view(
tenant_id=app_model.tenant_id,
app_id=app_model.id,
workflow_id=workflow_run.workflow_id,
raw_node=raw_node,
execution=execution,
)
node_views.append(view)
return WorkflowRunSnapshotView(
workflow_run_id=workflow_run.id,
workflow_run_status=workflow_run.status,
node_outputs=node_views,
)
def node_detail(self, *, app_model: App, workflow_run_id: str, node_id: str) -> NodeOutputsView:
"""Per-node Inspector entry — returns one ``NodeOutputsView``."""
workflow_run, executions = self._load_run_and_executions(app_model=app_model, workflow_run_id=workflow_run_id)
graph_nodes = _graph_nodes(workflow_run)
raw_node = next((n for n in graph_nodes if str(n.get("id")) == node_id), None)
if raw_node is None:
raise NodeOutputInspectorError(
"node_not_in_workflow_run",
f"Node {node_id!r} does not appear in workflow run {workflow_run_id!r}.",
)
execution = self._index_executions_by_node(executions).get(node_id)
return self._build_node_view(
tenant_id=app_model.tenant_id,
app_id=app_model.id,
workflow_id=workflow_run.workflow_id,
raw_node=raw_node,
execution=execution,
)
def output_preview(
self,
*,
app_model: App,
workflow_run_id: str,
node_id: str,
output_name: str,
) -> OutputPreviewView:
"""Full payload for one declared output (with signed file URL)."""
detail = self.node_detail(
app_model=app_model,
workflow_run_id=workflow_run_id,
node_id=node_id,
)
view = next((o for o in detail.outputs if o.name == output_name), None)
if view is None:
raise NodeOutputInspectorError(
"node_output_not_declared",
f"Output {output_name!r} is not declared on node {node_id!r}.",
)
# ``node_detail`` already produced a truncated value_preview; reload
# the raw value from the execution payload so the preview endpoint can
# return the full thing (still wrapped through ``_full_value`` for
# signed file URLs).
execution = self._index_executions_by_node(
self._load_run_and_executions(app_model=app_model, workflow_run_id=workflow_run_id)[1]
).get(node_id)
full_value: Any = None
if execution is not None:
outputs = _decode_json_blob(execution.outputs) or {}
if output_name in outputs:
full_value = _full_value(outputs[output_name])
return OutputPreviewView(
node_id=node_id,
output_name=output_name,
type=view.type,
status=view.status,
value=full_value,
)
# ── DB loading ────────────────────────────────────────────────────────
def _load_run_and_executions(
self, *, app_model: App, workflow_run_id: str
) -> tuple[WorkflowRun, Sequence[WorkflowNodeExecutionModel]]:
"""Fetch the ``WorkflowRun`` row + every execution that belongs to it.
Enforces:
* row exists,
* row belongs to the app's tenant + app.
The trigger source (DEBUGGING vs. APP_RUN / WEBHOOK / SCHEDULE / ...) is
deliberately not checked here — D-1 was lifted 2026-05-26 and the
Inspector now serves both draft and published runs.
"""
with session_factory.create_session() as session:
workflow_run = session.scalar(
select(WorkflowRun).where(
WorkflowRun.id == workflow_run_id,
WorkflowRun.app_id == app_model.id,
WorkflowRun.tenant_id == app_model.tenant_id,
)
)
if workflow_run is None:
raise NodeOutputInspectorError("workflow_run_not_found", "Workflow run not found.")
executions = session.scalars(
select(WorkflowNodeExecutionModel).where(
WorkflowNodeExecutionModel.workflow_run_id == workflow_run_id,
WorkflowNodeExecutionModel.tenant_id == app_model.tenant_id,
WorkflowNodeExecutionModel.app_id == app_model.id,
)
).all()
return workflow_run, executions
@staticmethod
def _index_executions_by_node(
executions: Sequence[WorkflowNodeExecutionModel],
) -> dict[str, WorkflowNodeExecutionModel]:
"""Keep the latest execution per ``node_id``.
A given node may have multiple rows when retries or iterations occur;
``index`` is the per-run sequence counter, so we keep the one with
the highest index as the canonical "current" view.
"""
latest: dict[str, WorkflowNodeExecutionModel] = {}
for execution in executions:
existing = latest.get(execution.node_id)
if existing is None or execution.index > existing.index:
latest[execution.node_id] = execution
return latest
# ── Per-node view construction ────────────────────────────────────────
def _build_node_view(
self,
*,
tenant_id: str,
app_id: str,
workflow_id: str,
raw_node: Mapping[str, Any],
execution: WorkflowNodeExecutionModel | None,
) -> NodeOutputsView:
node_id = str(raw_node["id"])
data = raw_node.get("data") or {}
if not isinstance(data, Mapping):
data = {}
node_kind = str(data.get("type") or (execution.node_type if execution else "") or "unknown")
display_name = str(data.get("title") or (execution.title if execution else node_id))
node_status = _node_status_for(execution)
declarations = self._resolve_declared_outputs(
tenant_id=tenant_id,
app_id=app_id,
workflow_id=workflow_id,
node_id=node_id,
raw_node=raw_node,
execution=execution,
)
outputs_dict = _decode_json_blob(execution.outputs) if execution else None
metadata_dict = _decode_json_blob(execution.execution_metadata) if execution else None
type_check_by_name = _type_check_by_name(metadata_dict)
output_check_by_name = _output_check_by_name(metadata_dict)
retried = _retried_attempt_count(metadata_dict)
output_views: list[NodeOutputView] = []
for declaration in declarations:
output_views.append(
self._build_output_view(
declaration=declaration,
node_status=node_status,
outputs_dict=outputs_dict,
type_check_by_name=type_check_by_name,
output_check_by_name=output_check_by_name,
retried=retried,
)
)
return NodeOutputsView(
node_id=node_id,
node_kind=node_kind,
node_display_name=display_name,
node_status=node_status,
node_started_at=execution.created_at if execution else None,
node_completed_at=execution.finished_at if execution else None,
outputs=output_views,
)
def _build_output_view(
self,
*,
declaration: _ResolvedDeclaration,
node_status: NodeStatus,
outputs_dict: Mapping[str, Any] | None,
type_check_by_name: Mapping[str, Mapping[str, Any]],
output_check_by_name: Mapping[str, Mapping[str, Any]],
retried: int,
) -> NodeOutputView:
name = declaration.name
declared_type = declaration.declared_type
if node_status == NodeStatus.IDLE:
return NodeOutputView(
name=name,
type=declared_type,
status=NodeOutputStatus.PENDING,
retried=retried,
)
if node_status == NodeStatus.RUNNING:
return NodeOutputView(
name=name,
type=declared_type,
status=NodeOutputStatus.RUNNING,
retried=retried,
)
if node_status == NodeStatus.FAILED:
return NodeOutputView(
name=name,
type=declared_type,
status=NodeOutputStatus.FAILED,
retried=retried,
)
# ── node succeeded ────────────────────────────────────────────
type_check_result = type_check_by_name.get(name)
output_check_result = output_check_by_name.get(name)
type_check_view = self._coerce_check_view(type_check_result)
output_check_view = self._coerce_check_view(output_check_result)
# type check loses first; output check next; otherwise ready.
status: NodeOutputStatus
if type_check_result and not _is_passing(type_check_result):
status = NodeOutputStatus.TYPE_CHECK_FAILED
elif output_check_result and not _is_passing(output_check_result):
status = NodeOutputStatus.OUTPUT_CHECK_FAILED
elif outputs_dict is not None and name not in outputs_dict:
status = NodeOutputStatus.NOT_PRODUCED
else:
status = NodeOutputStatus.READY
value_preview = _value_preview(outputs_dict.get(name)) if outputs_dict and name in outputs_dict else None
return NodeOutputView(
name=name,
type=declared_type,
status=status,
value_preview=value_preview,
type_check=type_check_view,
output_check=output_check_view,
retried=retried,
)
@staticmethod
def _coerce_check_view(result: Mapping[str, Any] | None) -> CheckResultView | None:
if not result:
return None
# type_check rows use ``status``; output_check rows use ``status`` too —
# both record per-output state. We treat ``status == "ready"``/"passed"
# as passing and everything else as failing, so the view stays
# stable regardless of which producer wrote the metadata.
return CheckResultView(passed=_is_passing(result), reason=result.get("reason"))
# ── Declared-output resolution ────────────────────────────────────────
def _resolve_declared_outputs(
self,
*,
tenant_id: str,
app_id: str,
workflow_id: str,
node_id: str,
raw_node: Mapping[str, Any],
execution: WorkflowNodeExecutionModel | None,
) -> list[_ResolvedDeclaration]:
if _is_agent_v2_node(raw_node):
agent_decl = self._declared_outputs_for_agent_v2(
tenant_id=tenant_id,
app_id=app_id,
workflow_id=workflow_id,
node_id=node_id,
)
if agent_decl is not None:
return [_ResolvedDeclaration(name=o.name, declared_type=o.type, inferred=False) for o in agent_decl]
# Non-agent (or agent-binding-missing) fall back to inferring from the
# produced payload so the Inspector still has something to show.
return self._infer_outputs_from_payload(execution=execution)
def _declared_outputs_for_agent_v2(
self,
*,
tenant_id: str,
app_id: str,
workflow_id: str,
node_id: str,
) -> list[DeclaredOutputConfig] | None:
try:
bundle = self._binding_resolver.resolve(
tenant_id=tenant_id,
app_id=app_id,
workflow_id=workflow_id,
node_id=node_id,
)
except WorkflowAgentBindingError:
return None
try:
from models.agent_config_entities import WorkflowNodeJobConfig
node_job = WorkflowNodeJobConfig.model_validate(bundle.binding.node_job_config_dict)
except Exception:
logger.warning(
"NodeOutputInspector: malformed node_job_config for binding %s", bundle.binding.id, exc_info=True
)
return None
return list(WorkflowAgentRuntimeRequestBuilder.effective_declared_outputs(list(node_job.declared_outputs)))
@staticmethod
def _infer_outputs_from_payload(*, execution: WorkflowNodeExecutionModel | None) -> list[_ResolvedDeclaration]:
if execution is None:
return []
outputs = _decode_json_blob(execution.outputs)
if not outputs:
return []
return [_ResolvedDeclaration(name=name, declared_type=None, inferred=True) for name in outputs]
def _is_passing(result: Mapping[str, Any]) -> bool:
"""A check-result row is "passing" when its ``status`` is the ready/passed
sentinel emitted by the type-checker / output-check executor."""
status = result.get("status")
if status in {"ready", "passed", "not_produced"}:
return True
return False