evaluation runtime
@@ -15,6 +15,56 @@ class EvaluationCategory(StrEnum):
     RETRIEVAL_TEST = "retrieval_test"
 
 
+class EvaluationMetricName(StrEnum):
+    """Canonical metric names shared across all evaluation frameworks.
+
+    Each framework maps these names to its own internal implementation.
+    A framework that does not support a given metric should log a warning
+    and skip it rather than raising an error.
+    """
+
+    # LLM / general text-quality metrics
+    FAITHFULNESS = "faithfulness"
+    ANSWER_RELEVANCY = "answer_relevancy"
+    ANSWER_CORRECTNESS = "answer_correctness"
+    SEMANTIC_SIMILARITY = "semantic_similarity"
+
+    # Retrieval-quality metrics
+    CONTEXT_PRECISION = "context_precision"
+    CONTEXT_RECALL = "context_recall"
+    CONTEXT_RELEVANCE = "context_relevance"
+
+    # Agent-quality metrics
+    TOOL_CORRECTNESS = "tool_correctness"
+    TASK_COMPLETION = "task_completion"
+
+
+# Per-category canonical metric lists used by get_supported_metrics().
+LLM_METRIC_NAMES: list[EvaluationMetricName] = [
+    EvaluationMetricName.FAITHFULNESS,
+    EvaluationMetricName.ANSWER_RELEVANCY,
+    EvaluationMetricName.ANSWER_CORRECTNESS,
+    EvaluationMetricName.SEMANTIC_SIMILARITY,
+]
+
+RETRIEVAL_METRIC_NAMES: list[EvaluationMetricName] = [
+    EvaluationMetricName.CONTEXT_PRECISION,
+    EvaluationMetricName.CONTEXT_RECALL,
+    EvaluationMetricName.CONTEXT_RELEVANCE,
+]
+
+AGENT_METRIC_NAMES: list[EvaluationMetricName] = [
+    EvaluationMetricName.TOOL_CORRECTNESS,
+    EvaluationMetricName.TASK_COMPLETION,
+]
+
+WORKFLOW_METRIC_NAMES: list[EvaluationMetricName] = [
+    EvaluationMetricName.FAITHFULNESS,
+    EvaluationMetricName.ANSWER_RELEVANCY,
+    EvaluationMetricName.ANSWER_CORRECTNESS,
+]
+
+
 class EvaluationMetric(BaseModel):
     name: str
     value: Any
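The enum is the cross-framework contract: each adapter keeps its own map from these canonical names to framework-internal classes and filters its category's candidate list through that map, warning and skipping anything it cannot compute. A minimal runnable sketch of that pattern follows; the `_MY_FRAMEWORK_MAP` adapter map and `get_supported_metrics` helper here are hypothetical stand-ins for the real `_DEEPEVAL_METRIC_MAP` / `_RAGAS_METRIC_MAP` adapters shown in the hunks below.

    import logging
    from enum import StrEnum

    logger = logging.getLogger(__name__)

    class EvaluationMetricName(StrEnum):  # trimmed copy of the enum above
        FAITHFULNESS = "faithfulness"
        ANSWER_RELEVANCY = "answer_relevancy"
        SEMANTIC_SIMILARITY = "semantic_similarity"

    # Hypothetical adapter map: only faithfulness is implemented here.
    _MY_FRAMEWORK_MAP = {EvaluationMetricName.FAITHFULNESS: "SomeFaithfulnessImpl"}

    def get_supported_metrics(candidates: list[EvaluationMetricName]) -> list[EvaluationMetricName]:
        supported = []
        for name in candidates:
            if name in _MY_FRAMEWORK_MAP:
                supported.append(name)
            else:
                # Contract: log and skip, never raise.
                logger.warning("Metric '%s' is not supported, skipping", name)
        return supported

    print(get_supported_metrics(list(EvaluationMetricName)))
    # [<EvaluationMetricName.FAITHFULNESS: 'faithfulness'>]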
@@ -4,30 +4,39 @@ from typing import Any
 from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
 from core.evaluation.entities.config_entity import DeepEvalConfig
 from core.evaluation.entities.evaluation_entity import (
+    AGENT_METRIC_NAMES,
+    LLM_METRIC_NAMES,
+    RETRIEVAL_METRIC_NAMES,
+    WORKFLOW_METRIC_NAMES,
     EvaluationCategory,
     EvaluationItemInput,
     EvaluationItemResult,
     EvaluationMetric,
+    EvaluationMetricName,
 )
 from core.evaluation.frameworks.ragas.ragas_model_wrapper import DifyModelWrapper
 
 logger = logging.getLogger(__name__)
 
-# Metric name mappings per category
-#
+# Maps canonical EvaluationMetricName to the corresponding deepeval metric class name.
 # deepeval metric field requirements (LLMTestCase fields):
-# - faithfulness: input, actual_output, retrieval_context
-# - answer_relevancy: input, actual_output
-# - contextual_precision: input, actual_output, expected_output, retrieval_context
-# - contextual_recall: input, actual_output, expected_output, retrieval_context
-# - contextual_relevancy: input, actual_output, retrieval_context
-# - hallucination: input, actual_output, context
-# - tool_correctness: input, actual_output, expected_tools
-# - task_completion: input, actual_output
-LLM_METRICS = ["faithfulness", "answer_relevancy"]
-RETRIEVAL_METRICS = ["contextual_precision", "contextual_recall", "contextual_relevancy"]
-AGENT_METRICS = ["tool_correctness", "task_completion"]
-WORKFLOW_METRICS = ["faithfulness", "answer_relevancy"]
+# - faithfulness: input, actual_output, retrieval_context
+# - answer_relevancy: input, actual_output
+# - context_precision: input, actual_output, expected_output, retrieval_context
+# - context_recall: input, actual_output, expected_output, retrieval_context
+# - context_relevance: input, actual_output, retrieval_context
+# - tool_correctness: input, actual_output, expected_tools
+# - task_completion: input, actual_output
+# Metrics not listed here are unsupported by deepeval and will be skipped.
+_DEEPEVAL_METRIC_MAP: dict[EvaluationMetricName, str] = {
+    EvaluationMetricName.FAITHFULNESS: "FaithfulnessMetric",
+    EvaluationMetricName.ANSWER_RELEVANCY: "AnswerRelevancyMetric",
+    EvaluationMetricName.CONTEXT_PRECISION: "ContextualPrecisionMetric",
+    EvaluationMetricName.CONTEXT_RECALL: "ContextualRecallMetric",
+    EvaluationMetricName.CONTEXT_RELEVANCE: "ContextualRelevancyMetric",
+    EvaluationMetricName.TOOL_CORRECTNESS: "ToolCorrectnessMetric",
+    EvaluationMetricName.TASK_COMPLETION: "TaskCompletionMetric",
+}
 
 
 class DeepEvalEvaluator(BaseEvaluationInstance):
@@ -39,15 +48,16 @@ class DeepEvalEvaluator(BaseEvaluationInstance):
     def get_supported_metrics(self, category: EvaluationCategory) -> list[str]:
         match category:
             case EvaluationCategory.LLM:
-                return LLM_METRICS
+                candidates = LLM_METRIC_NAMES
             case EvaluationCategory.RETRIEVAL:
-                return RETRIEVAL_METRICS
+                candidates = RETRIEVAL_METRIC_NAMES
             case EvaluationCategory.AGENT:
-                return AGENT_METRICS
-            case EvaluationCategory.WORKFLOW:
-                return WORKFLOW_METRICS
+                candidates = AGENT_METRIC_NAMES
+            case EvaluationCategory.WORKFLOW | EvaluationCategory.SNIPPET:
+                candidates = WORKFLOW_METRIC_NAMES
             case _:
                 return []
+        return [m for m in candidates if m in _DEEPEVAL_METRIC_MAP]
 
     def evaluate_llm(
         self,
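Because the candidate lists are shared while the maps differ, the same category yields a different supported set per framework. A plain-Python sketch of the LLM-category filter as deepeval would apply it, using literal strings in place of the enum members (which compare equal to them under StrEnum):

    llm_candidates = ["faithfulness", "answer_relevancy", "answer_correctness", "semantic_similarity"]
    deepeval_map_keys = {"faithfulness", "answer_relevancy", "context_precision", "context_recall",
                         "context_relevance", "tool_correctness", "task_completion"}
    print([m for m in llm_candidates if m in deepeval_map_keys])
    # ['faithfulness', 'answer_relevancy']: answer_correctness and semantic_similarity
    # have no deepeval mapping, so they are filtered out rather than raising.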
@@ -121,8 +131,8 @@ class DeepEvalEvaluator(BaseEvaluationInstance):
         - Retrieval: input=query, actual_output=output, expected_output, retrieval_context=context
         - Agent: input=query, actual_output=output
         """
-        deepeval_metrics = _build_deepeval_metrics(requested_metrics)
-        if not deepeval_metrics:
+        metric_pairs = _build_deepeval_metrics(requested_metrics)
+        if not metric_pairs:
             logger.warning("No valid DeepEval metrics found for: %s", requested_metrics)
             return [EvaluationItemResult(index=item.index) for item in items]
 
@@ -130,15 +140,15 @@ class DeepEvalEvaluator(BaseEvaluationInstance):
         for item in items:
             test_case = self._build_test_case(item, category)
             metrics: list[EvaluationMetric] = []
-            for metric in deepeval_metrics:
+            for canonical_name, metric in metric_pairs:
                 try:
                     metric.measure(test_case)
                     if metric.score is not None:
-                        metrics.append(EvaluationMetric(name=metric.__class__.__name__, value=float(metric.score)))
+                        metrics.append(EvaluationMetric(name=canonical_name, value=float(metric.score)))
                 except Exception:
                     logger.exception(
                         "Failed to compute metric %s for item %d",
-                        metric.__class__.__name__,
+                        canonical_name,
                         item.index,
                     )
             results.append(EvaluationItemResult(index=item.index, metrics=metrics))
@@ -248,8 +258,12 @@ def _format_input(inputs: dict[str, Any], category: EvaluationCategory) -> str:
     return str(next(iter(inputs.values()), "")) if inputs else ""
 
 
-def _build_deepeval_metrics(requested_metrics: list[str]) -> list[Any]:
-    """Build DeepEval metric instances from metric names."""
+def _build_deepeval_metrics(requested_metrics: list[str]) -> list[tuple[str, Any]]:
+    """Build DeepEval metric instances from canonical metric names.
+
+    Returns a list of (canonical_name, metric_instance) pairs so that callers
+    can record the canonical name rather than the framework-internal class name.
+    """
     try:
         from deepeval.metrics import (
             AnswerRelevancyMetric,
@@ -261,24 +275,25 @@ def _build_deepeval_metrics(requested_metrics: list[str]) -> list[Any]:
             ToolCorrectnessMetric,
         )
 
-        metric_map: dict[str, Any] = {
-            "faithfulness": FaithfulnessMetric,
-            "answer_relevancy": AnswerRelevancyMetric,
-            "contextual_precision": ContextualPrecisionMetric,
-            "contextual_recall": ContextualRecallMetric,
-            "contextual_relevancy": ContextualRelevancyMetric,
-            "tool_correctness": ToolCorrectnessMetric,
-            "task_completion": TaskCompletionMetric,
+        # Maps canonical name → deepeval metric class
+        deepeval_class_map: dict[str, Any] = {
+            EvaluationMetricName.FAITHFULNESS: FaithfulnessMetric,
+            EvaluationMetricName.ANSWER_RELEVANCY: AnswerRelevancyMetric,
+            EvaluationMetricName.CONTEXT_PRECISION: ContextualPrecisionMetric,
+            EvaluationMetricName.CONTEXT_RECALL: ContextualRecallMetric,
+            EvaluationMetricName.CONTEXT_RELEVANCE: ContextualRelevancyMetric,
+            EvaluationMetricName.TOOL_CORRECTNESS: ToolCorrectnessMetric,
+            EvaluationMetricName.TASK_COMPLETION: TaskCompletionMetric,
         }
 
-        metrics = []
+        pairs: list[tuple[str, Any]] = []
         for name in requested_metrics:
-            metric_class = metric_map.get(name)
+            metric_class = deepeval_class_map.get(name)
             if metric_class:
-                metrics.append(metric_class(threshold=0.5))
+                pairs.append((name, metric_class(threshold=0.5)))
             else:
-                logger.warning("Unknown DeepEval metric: %s", name)
-        return metrics
+                logger.warning("Metric '%s' is not supported by DeepEval, skipping", name)
+        return pairs
     except ImportError:
         logger.warning("DeepEval metrics not available")
         return []
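The pairing change is what lets the measure loop above report `faithfulness` instead of `FaithfulnessMetric`. A self-contained sketch of the idea, with a stub class standing in for the deepeval metric classes:

    import logging

    logger = logging.getLogger(__name__)

    class StubMetric:  # stands in for deepeval classes such as FaithfulnessMetric
        def __init__(self, threshold: float) -> None:
            self.threshold = threshold

    class_map = {"faithfulness": StubMetric}

    def build_pairs(requested: list[str]) -> list[tuple[str, StubMetric]]:
        pairs: list[tuple[str, StubMetric]] = []
        for name in requested:
            cls = class_map.get(name)
            if cls:
                # Keep the canonical name next to the instance it built.
                pairs.append((name, cls(threshold=0.5)))
            else:
                logger.warning("Metric '%s' is not supported, skipping", name)
        return pairs

    print([name for name, _ in build_pairs(["faithfulness", "semantic_similarity"])])
    # ['faithfulness']: the canonical name travels with each instance.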
@@ -4,20 +4,32 @@ from typing import Any
 from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
 from core.evaluation.entities.config_entity import RagasConfig
 from core.evaluation.entities.evaluation_entity import (
+    AGENT_METRIC_NAMES,
+    LLM_METRIC_NAMES,
+    RETRIEVAL_METRIC_NAMES,
+    WORKFLOW_METRIC_NAMES,
     EvaluationCategory,
     EvaluationItemInput,
     EvaluationItemResult,
     EvaluationMetric,
+    EvaluationMetricName,
 )
 from core.evaluation.frameworks.ragas.ragas_model_wrapper import DifyModelWrapper
 
 logger = logging.getLogger(__name__)
 
-# Metric name mappings per category
-LLM_METRICS = ["faithfulness", "answer_relevancy", "answer_correctness", "semantic_similarity"]
-RETRIEVAL_METRICS = ["context_precision", "context_recall", "context_relevance"]
-AGENT_METRICS = ["tool_call_accuracy", "answer_correctness"]
-WORKFLOW_METRICS = ["faithfulness", "answer_correctness"]
+# Maps canonical EvaluationMetricName to the corresponding ragas metric class.
+# Metrics not listed here are unsupported by ragas and will be skipped.
+_RAGAS_METRIC_MAP: dict[EvaluationMetricName, str] = {
+    EvaluationMetricName.FAITHFULNESS: "Faithfulness",
+    EvaluationMetricName.ANSWER_RELEVANCY: "AnswerRelevancy",
+    EvaluationMetricName.ANSWER_CORRECTNESS: "AnswerCorrectness",
+    EvaluationMetricName.SEMANTIC_SIMILARITY: "SemanticSimilarity",
+    EvaluationMetricName.CONTEXT_PRECISION: "ContextPrecision",
+    EvaluationMetricName.CONTEXT_RECALL: "ContextRecall",
+    EvaluationMetricName.CONTEXT_RELEVANCE: "ContextRelevance",
+    EvaluationMetricName.TOOL_CORRECTNESS: "ToolCallAccuracy",
+}
 
 
 class RagasEvaluator(BaseEvaluationInstance):
@@ -29,15 +41,16 @@ class RagasEvaluator(BaseEvaluationInstance):
     def get_supported_metrics(self, category: EvaluationCategory) -> list[str]:
         match category:
             case EvaluationCategory.LLM:
-                return LLM_METRICS
+                candidates = LLM_METRIC_NAMES
             case EvaluationCategory.RETRIEVAL:
-                return RETRIEVAL_METRICS
+                candidates = RETRIEVAL_METRIC_NAMES
             case EvaluationCategory.AGENT:
-                return AGENT_METRICS
-            case EvaluationCategory.WORKFLOW:
-                return WORKFLOW_METRICS
+                candidates = AGENT_METRIC_NAMES
+            case EvaluationCategory.WORKFLOW | EvaluationCategory.SNIPPET:
+                candidates = WORKFLOW_METRIC_NAMES
             case _:
                 return []
+        return [m for m in candidates if m in _RAGAS_METRIC_MAP]
 
     def evaluate_llm(
         self,
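Comparing the two maps shows what the canonical layer buys: ragas covers answer_correctness and semantic_similarity but has no task_completion, while deepeval is the reverse. A quick set comparison over the map keys as defined in this commit:

    ragas_keys = {"faithfulness", "answer_relevancy", "answer_correctness", "semantic_similarity",
                  "context_precision", "context_recall", "context_relevance", "tool_correctness"}
    deepeval_keys = {"faithfulness", "answer_relevancy", "context_precision", "context_recall",
                     "context_relevance", "tool_correctness", "task_completion"}
    print(sorted(ragas_keys - deepeval_keys))  # ['answer_correctness', 'semantic_similarity']
    print(sorted(deepeval_keys - ragas_keys))  # ['task_completion']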
@@ -250,7 +263,7 @@ class RagasEvaluator(BaseEvaluationInstance):
 
     @staticmethod
     def _build_ragas_metrics(requested_metrics: list[str]) -> list[Any]:
-        """Build RAGAS metric instances from metric names."""
+        """Build RAGAS metric instances from canonical metric names."""
         try:
             from ragas.metrics.collections import (
                 AnswerCorrectness,
@@ -263,24 +276,25 @@ class RagasEvaluator(BaseEvaluationInstance):
                 ToolCallAccuracy,
             )
 
-            metric_map: dict[str, Any] = {
-                "faithfulness": Faithfulness,
-                "answer_relevancy": AnswerRelevancy,
-                "answer_correctness": AnswerCorrectness,
-                "semantic_similarity": SemanticSimilarity,
-                "context_precision": ContextPrecision,
-                "context_recall": ContextRecall,
-                "context_relevance": ContextRelevance,
-                "tool_call_accuracy": ToolCallAccuracy,
+            # Maps canonical name → ragas metric class
+            ragas_class_map: dict[str, Any] = {
+                EvaluationMetricName.FAITHFULNESS: Faithfulness,
+                EvaluationMetricName.ANSWER_RELEVANCY: AnswerRelevancy,
+                EvaluationMetricName.ANSWER_CORRECTNESS: AnswerCorrectness,
+                EvaluationMetricName.SEMANTIC_SIMILARITY: SemanticSimilarity,
+                EvaluationMetricName.CONTEXT_PRECISION: ContextPrecision,
+                EvaluationMetricName.CONTEXT_RECALL: ContextRecall,
+                EvaluationMetricName.CONTEXT_RELEVANCE: ContextRelevance,
+                EvaluationMetricName.TOOL_CORRECTNESS: ToolCallAccuracy,
             }
 
             metrics = []
             for name in requested_metrics:
-                metric_class = metric_map.get(name)
+                metric_class = ragas_class_map.get(name)
                 if metric_class:
                     metrics.append(metric_class())
                 else:
-                    logger.warning("Unknown RAGAS metric: %s", name)
+                    logger.warning("Metric '%s' is not supported by RAGAS, skipping", name)
             return metrics
         except ImportError:
             logger.warning("RAGAS metrics not available")
@@ -78,44 +78,29 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
         """Compute agent evaluation metrics."""
-        result_by_index = {r.index: r for r in results}
-        merged_items = []
-        for item in items:
-            result = result_by_index.get(item.index)
-            context = []
-            if result and result.actual_output:
-                context.append(result.actual_output)
-            merged_items.append(
-                EvaluationItemInput(
-                    index=item.index,
-                    inputs=item.inputs,
-                    expected_output=item.expected_output,
-                    context=context + (item.context or []),
-                )
-            )
-
-        evaluated = self.evaluation_instance.evaluate_agent(
-            merged_items, default_metrics, model_provider, model_name, tenant_id
+        if not node_run_result_list:
+            return []
+        if not default_metric:
+            raise ValueError("Default metric is required for agent evaluation")
+        merged_items = self._merge_results_into_items(node_run_result_list)
+        return self.evaluation_instance.evaluate_agent(
+            merged_items, default_metric.metric, model_provider, model_name, tenant_id
         )
 
-        # Merge metrics back preserving metadata
-        eval_by_index = {r.index: r for r in evaluated}
-        final_results = []
-        for result in results:
-            if result.index in eval_by_index:
-                eval_result = eval_by_index[result.index]
-                final_results.append(
-                    EvaluationItemResult(
-                        index=result.index,
-                        actual_output=result.actual_output,
-                        metrics=eval_result.metrics,
-                        metadata=result.metadata,
-                        error=result.error,
-                    )
-                )
-            else:
-                final_results.append(result)
-        return final_results
+    @staticmethod
+    def _merge_results_into_items(items: list[NodeRunResult]) -> list[EvaluationItemInput]:
+        """Create EvaluationItemInput list from NodeRunResult for agent evaluation."""
+        merged = []
+        for i, item in enumerate(items):
+            output = _extract_agent_output(item.outputs)
+            merged.append(
+                EvaluationItemInput(
+                    index=i,
+                    inputs=dict(item.inputs),
+                    output=output,
+                )
+            )
+        return merged
 
     @staticmethod
     def _extract_query(inputs: dict[str, Any]) -> str:
@@ -157,3 +142,13 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
             logger.exception("Error consuming agent stream")
 
         return "".join(answer_parts), tool_calls
+
+
+def _extract_agent_output(outputs: Mapping[str, Any]) -> str:
+    """Extract the primary output text from agent NodeRunResult.outputs."""
+    if "answer" in outputs:
+        return str(outputs["answer"])
+    if "text" in outputs:
+        return str(outputs["text"])
+    values = list(outputs.values())
+    return str(values[0]) if values else ""
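The helper prefers an "answer" key, then "text", then falls back to the first value; an empty mapping yields an empty string. A runnable check of that precedence (the body is copied from the hunk above so the snippet stands alone):

    from collections.abc import Mapping
    from typing import Any

    def _extract_agent_output(outputs: Mapping[str, Any]) -> str:
        if "answer" in outputs:
            return str(outputs["answer"])
        if "text" in outputs:
            return str(outputs["text"])
        values = list(outputs.values())
        return str(values[0]) if values else ""

    assert _extract_agent_output({"answer": "42", "text": "ignored"}) == "42"
    assert _extract_agent_output({"text": "hello"}) == "hello"
    assert _extract_agent_output({"misc": 7}) == "7"
    assert _extract_agent_output({}) == ""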
@@ -63,7 +63,8 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner):
 
     @staticmethod
     def _extract_query(inputs: dict[str, Any]) -> str:
-        for key in "query":
+        for key in ("query", "question", "input", "text"):
             if key in inputs:
                 return str(inputs[key])
-        return ""
+        values = list(inputs.values())
+        return str(values[0]) if values else ""
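The removed line was a real bug, not a style issue: iterating a string yields its characters, so `for key in "query"` looked up the keys "q", "u", "e", "r", "y" and never "query" itself. A runnable demonstration of the old versus new lookup:

    inputs = {"query": "what is dify?"}

    # Old behavior: iterating the string "query" visits single characters,
    # so the literal key "query" is never tested.
    print([key for key in "query" if key in inputs])  # []

    # New behavior: iterate a tuple of candidate key names.
    print([key for key in ("query", "question", "input", "text") if key in inputs])  # ['query']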
@@ -109,44 +109,29 @@ class SnippetEvaluationRunner(BaseEvaluationRunner):
         Snippets are essentially workflows, so we reuse evaluate_workflow from
         the evaluation instance.
         """
-        result_by_index = {r.index: r for r in results}
-        merged_items = []
-        for item in items:
-            result = result_by_index.get(item.index)
-            context = []
-            if result and result.actual_output:
-                context.append(result.actual_output)
-            merged_items.append(
-                EvaluationItemInput(
-                    index=item.index,
-                    inputs=item.inputs,
-                    expected_output=item.expected_output,
-                    context=context + (item.context or []),
-                )
-            )
-
-        evaluated = self.evaluation_instance.evaluate_workflow(
-            merged_items, default_metrics, model_provider, model_name, tenant_id
+        if not node_run_result_list:
+            return []
+        if not default_metric:
+            raise ValueError("Default metric is required for snippet evaluation")
+        merged_items = self._merge_results_into_items(node_run_result_list)
+        return self.evaluation_instance.evaluate_workflow(
+            merged_items, default_metric.metric, model_provider, model_name, tenant_id
         )
 
-        # Merge metrics back preserving metadata from Phase 1
-        eval_by_index = {r.index: r for r in evaluated}
-        final_results = []
-        for result in results:
-            if result.index in eval_by_index:
-                eval_result = eval_by_index[result.index]
-                final_results.append(
-                    EvaluationItemResult(
-                        index=result.index,
-                        actual_output=result.actual_output,
-                        metrics=eval_result.metrics,
-                        metadata=result.metadata,
-                        error=result.error,
-                    )
-                )
-            else:
-                final_results.append(result)
-        return final_results
+    @staticmethod
+    def _merge_results_into_items(items: list[NodeRunResult]) -> list[EvaluationItemInput]:
+        """Create EvaluationItemInput list from NodeRunResult for snippet evaluation."""
+        merged = []
+        for i, item in enumerate(items):
+            output = _extract_snippet_output(item.outputs)
+            merged.append(
+                EvaluationItemInput(
+                    index=i,
+                    inputs=dict(item.inputs),
+                    output=output,
+                )
+            )
+        return merged
 
     @staticmethod
     def _extract_output(response: Mapping[str, Any]) -> str:
@@ -235,3 +220,13 @@ class SnippetEvaluationRunner(BaseEvaluationRunner):
             "error": node.error,
             "elapsed_time": node.elapsed_time,
         }
+
+
+def _extract_snippet_output(outputs: Mapping[str, Any]) -> str:
+    """Extract the primary output text from snippet NodeRunResult.outputs."""
+    if "answer" in outputs:
+        return str(outputs["answer"])
+    if "text" in outputs:
+        return str(outputs["text"])
+    values = list(outputs.values())
+    return str(values[0]) if values else ""
@@ -34,44 +34,29 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
         """Compute workflow evaluation metrics (end-to-end)."""
-        result_by_index = {r.index: r for r in results}
-        merged_items = []
-        for item in items:
-            result = result_by_index.get(item.index)
-            context = []
-            if result and result.actual_output:
-                context.append(result.actual_output)
-            merged_items.append(
-                EvaluationItemInput(
-                    index=item.index,
-                    inputs=item.inputs,
-                    expected_output=item.expected_output,
-                    context=context + (item.context or []),
-                )
-            )
-
-        evaluated = self.evaluation_instance.evaluate_workflow(
-            merged_items, default_metrics, model_provider, model_name, tenant_id
+        if not node_run_result_list:
+            return []
+        if not default_metric:
+            raise ValueError("Default metric is required for workflow evaluation")
+        merged_items = self._merge_results_into_items(node_run_result_list)
+        return self.evaluation_instance.evaluate_workflow(
+            merged_items, default_metric.metric, model_provider, model_name, tenant_id
         )
 
-        # Merge metrics back preserving metadata
-        eval_by_index = {r.index: r for r in evaluated}
-        final_results = []
-        for result in results:
-            if result.index in eval_by_index:
-                eval_result = eval_by_index[result.index]
-                final_results.append(
-                    EvaluationItemResult(
-                        index=result.index,
-                        actual_output=result.actual_output,
-                        metrics=eval_result.metrics,
-                        metadata=result.metadata,
-                        error=result.error,
-                    )
-                )
-            else:
-                final_results.append(result)
-        return final_results
+    @staticmethod
+    def _merge_results_into_items(items: list[NodeRunResult]) -> list[EvaluationItemInput]:
+        """Create EvaluationItemInput list from NodeRunResult for workflow evaluation."""
+        merged = []
+        for i, item in enumerate(items):
+            output = _extract_workflow_output(item.outputs)
+            merged.append(
+                EvaluationItemInput(
+                    index=i,
+                    inputs=dict(item.inputs),
+                    output=output,
+                )
+            )
+        return merged
 
     @staticmethod
     def _extract_output(response: Mapping[str, Any]) -> str:
@@ -91,3 +76,13 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
         if isinstance(data, Mapping):
             return data.get("node_executions", [])
         return []
+
+
+def _extract_workflow_output(outputs: Mapping[str, Any]) -> str:
+    """Extract the primary output text from workflow NodeRunResult.outputs."""
+    if "answer" in outputs:
+        return str(outputs["answer"])
+    if "text" in outputs:
+        return str(outputs["text"])
+    values = list(outputs.values())
+    return str(values[0]) if values else ""
@@ -105,6 +105,7 @@ class EvaluationRun(Base):
     error: Mapped[str | None] = mapped_column(Text, nullable=True)
 
     celery_task_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
+    metrics_summary: Mapped[str | None] = mapped_column(LongText, nullable=True)
 
     created_by: Mapped[str] = mapped_column(StringUUID, nullable=False)
     started_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
@@ -12,6 +12,7 @@ from configs import dify_config
 from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
 from core.evaluation.entities.evaluation_entity import (
     EvaluationCategory,
+    EvaluationDatasetInput,
     EvaluationItemResult,
     EvaluationRunData,
 )
@@ -88,23 +89,23 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None:
     )
 
     results: list[EvaluationItemResult] = _execute_evaluation_runner(
-        session,
-        run_data,
-        evaluation_instance,
-        node_run_result_mapping_list,
+        session=session,
+        run_data=run_data,
+        evaluation_instance=evaluation_instance,
+        node_run_result_mapping_list=node_run_result_mapping_list,
     )
 
     # Compute summary metrics
     metrics_summary = _compute_metrics_summary(results, run_data.judgment_config)
 
     # Generate result XLSX
-    result_xlsx = _generate_result_xlsx(run_data.items, results)
+    result_xlsx = _generate_result_xlsx(run_data.input_list, results)
 
     # Store result file
     result_file_id = _store_result_file(run_data.tenant_id, run_data.evaluation_run_id, result_xlsx, session)
 
     # Update run to completed
-    evaluation_run = session.query(EvaluationRun).filter_by(id=run_data.evaluation_run_id).first()
+    evaluation_run: EvaluationRun = session.query(EvaluationRun).filter_by(id=run_data.evaluation_run_id).first()
     if evaluation_run:
         evaluation_run.status = EvaluationRunStatus.COMPLETED
         evaluation_run.completed_at = naive_utc_now()
@@ -232,10 +233,10 @@ def _compute_metrics_summary(
 
 
 def _generate_result_xlsx(
-    items: list[Any],
+    input_list: list[EvaluationDatasetInput],
     results: list[EvaluationItemResult],
 ) -> bytes:
-    """Generate result XLSX with input data, actual output, and metric scores."""
+    """Generate result XLSX with input data, actual output, metric scores, and judgment."""
     wb = Workbook()
     ws = wb.active
     if ws is None:
@@ -261,14 +262,18 @@
 
     # Collect all input keys
    input_keys: list[str] = []
-    for item in items:
+    for item in input_list:
         for key in item.inputs:
             if key not in input_keys:
                 input_keys.append(key)
 
+    # Include judgment column only when at least one result has judgment conditions evaluated
+    has_judgment = any(bool(r.judgment.condition_results) for r in results)
+
     # Build headers
+    judgment_headers = ["judgment"] if has_judgment else []
     headers = (
-        ["index"] + input_keys + ["expected_output", "actual_output"] + all_metric_names + ["overall_score", "error"]
+        ["index"] + input_keys + ["expected_output", "actual_output"] + all_metric_names + judgment_headers + ["error"]
     )
 
     # Write header row
@@ -288,7 +293,7 @@
     result_by_index = {r.index: r for r in results}
 
     # Write data rows
-    for row_idx, item in enumerate(items, start=2):
+    for row_idx, item in enumerate(input_list, start=2):
         result = result_by_index.get(item.index)
 
         col = 1
@@ -317,9 +322,14 @@
             ws.cell(row=row_idx, column=col, value=score if score is not None else "").border = thin_border
             col += 1
 
-        # Overall score
-        ws.cell(row=row_idx, column=col, value=result.overall_score if result else "").border = thin_border
-        col += 1
+        # Judgment result
+        if has_judgment:
+            if result and result.judgment.condition_results:
+                judgment_value = "Pass" if result.judgment.passed else "Fail"
+            else:
+                judgment_value = ""
+            ws.cell(row=row_idx, column=col, value=judgment_value).border = thin_border
+            col += 1
 
         # Error
         ws.cell(row=row_idx, column=col, value=result.error if result else "").border = thin_border
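With the overall_score column replaced by a conditional judgment column, the header row is now data-dependent. A small sketch of the header construction under this commit's logic, with illustrative sample values for the collected input keys and metric names:

    input_keys = ["query"]               # illustrative: collected from item.inputs
    all_metric_names = ["faithfulness"]  # illustrative: collected from results
    has_judgment = True                  # True when any result has condition_results

    judgment_headers = ["judgment"] if has_judgment else []
    headers = (
        ["index"] + input_keys + ["expected_output", "actual_output"]
        + all_metric_names + judgment_headers + ["error"]
    )
    print(headers)
    # ['index', 'query', 'expected_output', 'actual_output', 'faithfulness', 'judgment', 'error']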