evaluation runtime
@@ -15,6 +15,56 @@ class EvaluationCategory(StrEnum):
     RETRIEVAL_TEST = "retrieval_test"
 
 
+class EvaluationMetricName(StrEnum):
+    """Canonical metric names shared across all evaluation frameworks.
+
+    Each framework maps these names to its own internal implementation.
+    A framework that does not support a given metric should log a warning
+    and skip it rather than raising an error.
+    """
+
+    # LLM / general text-quality metrics
+    FAITHFULNESS = "faithfulness"
+    ANSWER_RELEVANCY = "answer_relevancy"
+    ANSWER_CORRECTNESS = "answer_correctness"
+    SEMANTIC_SIMILARITY = "semantic_similarity"
+
+    # Retrieval-quality metrics
+    CONTEXT_PRECISION = "context_precision"
+    CONTEXT_RECALL = "context_recall"
+    CONTEXT_RELEVANCE = "context_relevance"
+
+    # Agent-quality metrics
+    TOOL_CORRECTNESS = "tool_correctness"
+    TASK_COMPLETION = "task_completion"
+
+
+# Per-category canonical metric lists used by get_supported_metrics().
+LLM_METRIC_NAMES: list[EvaluationMetricName] = [
+    EvaluationMetricName.FAITHFULNESS,
+    EvaluationMetricName.ANSWER_RELEVANCY,
+    EvaluationMetricName.ANSWER_CORRECTNESS,
+    EvaluationMetricName.SEMANTIC_SIMILARITY,
+]
+
+RETRIEVAL_METRIC_NAMES: list[EvaluationMetricName] = [
+    EvaluationMetricName.CONTEXT_PRECISION,
+    EvaluationMetricName.CONTEXT_RECALL,
+    EvaluationMetricName.CONTEXT_RELEVANCE,
+]
+
+AGENT_METRIC_NAMES: list[EvaluationMetricName] = [
+    EvaluationMetricName.TOOL_CORRECTNESS,
+    EvaluationMetricName.TASK_COMPLETION,
+]
+
+WORKFLOW_METRIC_NAMES: list[EvaluationMetricName] = [
+    EvaluationMetricName.FAITHFULNESS,
+    EvaluationMetricName.ANSWER_RELEVANCY,
+    EvaluationMetricName.ANSWER_CORRECTNESS,
+]
+
+
 class EvaluationMetric(BaseModel):
     name: str
     value: Any
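The enum is the cross-framework contract: each adapter keeps its own map from these canonical names to framework-internal classes and filters its category's candidate list through that map, warning and skipping anything it cannot compute. A minimal runnable sketch of that pattern follows; the `_MY_FRAMEWORK_MAP` adapter map and `get_supported_metrics` helper here are hypothetical stand-ins for the real `_DEEPEVAL_METRIC_MAP` / `_RAGAS_METRIC_MAP` adapters shown in the hunks below.

    import logging
    from enum import StrEnum

    logger = logging.getLogger(__name__)

    class EvaluationMetricName(StrEnum):  # trimmed copy of the enum above
        FAITHFULNESS = "faithfulness"
        ANSWER_RELEVANCY = "answer_relevancy"
        SEMANTIC_SIMILARITY = "semantic_similarity"

    # Hypothetical adapter map: only faithfulness is implemented here.
    _MY_FRAMEWORK_MAP = {EvaluationMetricName.FAITHFULNESS: "SomeFaithfulnessImpl"}

    def get_supported_metrics(candidates: list[EvaluationMetricName]) -> list[EvaluationMetricName]:
        supported = []
        for name in candidates:
            if name in _MY_FRAMEWORK_MAP:
                supported.append(name)
            else:
                # Contract: log and skip, never raise.
                logger.warning("Metric '%s' is not supported, skipping", name)
        return supported

    print(get_supported_metrics(list(EvaluationMetricName)))
    # [<EvaluationMetricName.FAITHFULNESS: 'faithfulness'>]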
@@ -4,30 +4,39 @@ from typing import Any
 from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
 from core.evaluation.entities.config_entity import DeepEvalConfig
 from core.evaluation.entities.evaluation_entity import (
+    AGENT_METRIC_NAMES,
+    LLM_METRIC_NAMES,
+    RETRIEVAL_METRIC_NAMES,
+    WORKFLOW_METRIC_NAMES,
     EvaluationCategory,
     EvaluationItemInput,
     EvaluationItemResult,
     EvaluationMetric,
+    EvaluationMetricName,
 )
 from core.evaluation.frameworks.ragas.ragas_model_wrapper import DifyModelWrapper
 
 logger = logging.getLogger(__name__)
 
-# Metric name mappings per category
-#
+# Maps canonical EvaluationMetricName to the corresponding deepeval metric class name.
 # deepeval metric field requirements (LLMTestCase fields):
-# - faithfulness: input, actual_output, retrieval_context
-# - answer_relevancy: input, actual_output
-# - contextual_precision: input, actual_output, expected_output, retrieval_context
-# - contextual_recall: input, actual_output, expected_output, retrieval_context
-# - contextual_relevancy: input, actual_output, retrieval_context
-# - hallucination: input, actual_output, context
-# - tool_correctness: input, actual_output, expected_tools
-# - task_completion: input, actual_output
-LLM_METRICS = ["faithfulness", "answer_relevancy"]
-RETRIEVAL_METRICS = ["contextual_precision", "contextual_recall", "contextual_relevancy"]
-AGENT_METRICS = ["tool_correctness", "task_completion"]
-WORKFLOW_METRICS = ["faithfulness", "answer_relevancy"]
+# - faithfulness: input, actual_output, retrieval_context
+# - answer_relevancy: input, actual_output
+# - context_precision: input, actual_output, expected_output, retrieval_context
+# - context_recall: input, actual_output, expected_output, retrieval_context
+# - context_relevance: input, actual_output, retrieval_context
+# - tool_correctness: input, actual_output, expected_tools
+# - task_completion: input, actual_output
+# Metrics not listed here are unsupported by deepeval and will be skipped.
+_DEEPEVAL_METRIC_MAP: dict[EvaluationMetricName, str] = {
+    EvaluationMetricName.FAITHFULNESS: "FaithfulnessMetric",
+    EvaluationMetricName.ANSWER_RELEVANCY: "AnswerRelevancyMetric",
+    EvaluationMetricName.CONTEXT_PRECISION: "ContextualPrecisionMetric",
+    EvaluationMetricName.CONTEXT_RECALL: "ContextualRecallMetric",
+    EvaluationMetricName.CONTEXT_RELEVANCE: "ContextualRelevancyMetric",
+    EvaluationMetricName.TOOL_CORRECTNESS: "ToolCorrectnessMetric",
+    EvaluationMetricName.TASK_COMPLETION: "TaskCompletionMetric",
+}
 
 
 class DeepEvalEvaluator(BaseEvaluationInstance):
@@ -39,15 +48,16 @@ class DeepEvalEvaluator(BaseEvaluationInstance):
     def get_supported_metrics(self, category: EvaluationCategory) -> list[str]:
         match category:
             case EvaluationCategory.LLM:
-                return LLM_METRICS
+                candidates = LLM_METRIC_NAMES
             case EvaluationCategory.RETRIEVAL:
-                return RETRIEVAL_METRICS
+                candidates = RETRIEVAL_METRIC_NAMES
             case EvaluationCategory.AGENT:
-                return AGENT_METRICS
-            case EvaluationCategory.WORKFLOW:
-                return WORKFLOW_METRICS
+                candidates = AGENT_METRIC_NAMES
+            case EvaluationCategory.WORKFLOW | EvaluationCategory.SNIPPET:
+                candidates = WORKFLOW_METRIC_NAMES
             case _:
                 return []
+        return [m for m in candidates if m in _DEEPEVAL_METRIC_MAP]
 
     def evaluate_llm(
         self,
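Because the candidate lists are shared while the maps differ, the same category yields a different supported set per framework. A plain-Python sketch of the LLM-category filter as deepeval would apply it, using literal strings in place of the enum members (which compare equal to them under StrEnum):

    llm_candidates = ["faithfulness", "answer_relevancy", "answer_correctness", "semantic_similarity"]
    deepeval_map_keys = {"faithfulness", "answer_relevancy", "context_precision", "context_recall",
                         "context_relevance", "tool_correctness", "task_completion"}
    print([m for m in llm_candidates if m in deepeval_map_keys])
    # ['faithfulness', 'answer_relevancy']: answer_correctness and semantic_similarity
    # have no deepeval mapping, so they are filtered out rather than raising.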
@@ -121,8 +131,8 @@ class DeepEvalEvaluator(BaseEvaluationInstance):
         - Retrieval: input=query, actual_output=output, expected_output, retrieval_context=context
         - Agent: input=query, actual_output=output
         """
-        deepeval_metrics = _build_deepeval_metrics(requested_metrics)
-        if not deepeval_metrics:
+        metric_pairs = _build_deepeval_metrics(requested_metrics)
+        if not metric_pairs:
             logger.warning("No valid DeepEval metrics found for: %s", requested_metrics)
             return [EvaluationItemResult(index=item.index) for item in items]
 
@@ -130,15 +140,15 @@ class DeepEvalEvaluator(BaseEvaluationInstance):
         for item in items:
             test_case = self._build_test_case(item, category)
             metrics: list[EvaluationMetric] = []
-            for metric in deepeval_metrics:
+            for canonical_name, metric in metric_pairs:
                 try:
                     metric.measure(test_case)
                     if metric.score is not None:
-                        metrics.append(EvaluationMetric(name=metric.__class__.__name__, value=float(metric.score)))
+                        metrics.append(EvaluationMetric(name=canonical_name, value=float(metric.score)))
                 except Exception:
                     logger.exception(
                         "Failed to compute metric %s for item %d",
-                        metric.__class__.__name__,
+                        canonical_name,
                         item.index,
                     )
             results.append(EvaluationItemResult(index=item.index, metrics=metrics))
@@ -248,8 +258,12 @@ def _format_input(inputs: dict[str, Any], category: EvaluationCategory) -> str:
     return str(next(iter(inputs.values()), "")) if inputs else ""
 
 
-def _build_deepeval_metrics(requested_metrics: list[str]) -> list[Any]:
-    """Build DeepEval metric instances from metric names."""
+def _build_deepeval_metrics(requested_metrics: list[str]) -> list[tuple[str, Any]]:
+    """Build DeepEval metric instances from canonical metric names.
+
+    Returns a list of (canonical_name, metric_instance) pairs so that callers
+    can record the canonical name rather than the framework-internal class name.
+    """
     try:
         from deepeval.metrics import (
             AnswerRelevancyMetric,
@@ -261,24 +275,25 @@ def _build_deepeval_metrics(requested_metrics: list[str]) -> list[Any]:
             ToolCorrectnessMetric,
         )
 
-        metric_map: dict[str, Any] = {
-            "faithfulness": FaithfulnessMetric,
-            "answer_relevancy": AnswerRelevancyMetric,
-            "contextual_precision": ContextualPrecisionMetric,
-            "contextual_recall": ContextualRecallMetric,
-            "contextual_relevancy": ContextualRelevancyMetric,
-            "tool_correctness": ToolCorrectnessMetric,
-            "task_completion": TaskCompletionMetric,
+        # Maps canonical name → deepeval metric class
+        deepeval_class_map: dict[str, Any] = {
+            EvaluationMetricName.FAITHFULNESS: FaithfulnessMetric,
+            EvaluationMetricName.ANSWER_RELEVANCY: AnswerRelevancyMetric,
+            EvaluationMetricName.CONTEXT_PRECISION: ContextualPrecisionMetric,
+            EvaluationMetricName.CONTEXT_RECALL: ContextualRecallMetric,
+            EvaluationMetricName.CONTEXT_RELEVANCE: ContextualRelevancyMetric,
+            EvaluationMetricName.TOOL_CORRECTNESS: ToolCorrectnessMetric,
+            EvaluationMetricName.TASK_COMPLETION: TaskCompletionMetric,
         }
 
-        metrics = []
+        pairs: list[tuple[str, Any]] = []
         for name in requested_metrics:
-            metric_class = metric_map.get(name)
+            metric_class = deepeval_class_map.get(name)
             if metric_class:
-                metrics.append(metric_class(threshold=0.5))
+                pairs.append((name, metric_class(threshold=0.5)))
             else:
-                logger.warning("Unknown DeepEval metric: %s", name)
-        return metrics
+                logger.warning("Metric '%s' is not supported by DeepEval, skipping", name)
+        return pairs
     except ImportError:
         logger.warning("DeepEval metrics not available")
         return []
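The pairing change is what lets the measure loop above report `faithfulness` instead of `FaithfulnessMetric`. A self-contained sketch of the idea, with a stub class standing in for the deepeval metric classes:

    import logging

    logger = logging.getLogger(__name__)

    class StubMetric:  # stands in for deepeval classes such as FaithfulnessMetric
        def __init__(self, threshold: float) -> None:
            self.threshold = threshold

    class_map = {"faithfulness": StubMetric}

    def build_pairs(requested: list[str]) -> list[tuple[str, StubMetric]]:
        pairs: list[tuple[str, StubMetric]] = []
        for name in requested:
            cls = class_map.get(name)
            if cls:
                # Keep the canonical name next to the instance it built.
                pairs.append((name, cls(threshold=0.5)))
            else:
                logger.warning("Metric '%s' is not supported, skipping", name)
        return pairs

    print([name for name, _ in build_pairs(["faithfulness", "semantic_similarity"])])
    # ['faithfulness']: the canonical name travels with each instance.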
@@ -4,20 +4,32 @@ from typing import Any
 from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
 from core.evaluation.entities.config_entity import RagasConfig
 from core.evaluation.entities.evaluation_entity import (
+    AGENT_METRIC_NAMES,
+    LLM_METRIC_NAMES,
+    RETRIEVAL_METRIC_NAMES,
+    WORKFLOW_METRIC_NAMES,
     EvaluationCategory,
     EvaluationItemInput,
     EvaluationItemResult,
     EvaluationMetric,
+    EvaluationMetricName,
 )
 from core.evaluation.frameworks.ragas.ragas_model_wrapper import DifyModelWrapper
 
 logger = logging.getLogger(__name__)
 
-# Metric name mappings per category
-LLM_METRICS = ["faithfulness", "answer_relevancy", "answer_correctness", "semantic_similarity"]
-RETRIEVAL_METRICS = ["context_precision", "context_recall", "context_relevance"]
-AGENT_METRICS = ["tool_call_accuracy", "answer_correctness"]
-WORKFLOW_METRICS = ["faithfulness", "answer_correctness"]
+# Maps canonical EvaluationMetricName to the corresponding ragas metric class.
+# Metrics not listed here are unsupported by ragas and will be skipped.
+_RAGAS_METRIC_MAP: dict[EvaluationMetricName, str] = {
+    EvaluationMetricName.FAITHFULNESS: "Faithfulness",
+    EvaluationMetricName.ANSWER_RELEVANCY: "AnswerRelevancy",
+    EvaluationMetricName.ANSWER_CORRECTNESS: "AnswerCorrectness",
+    EvaluationMetricName.SEMANTIC_SIMILARITY: "SemanticSimilarity",
+    EvaluationMetricName.CONTEXT_PRECISION: "ContextPrecision",
+    EvaluationMetricName.CONTEXT_RECALL: "ContextRecall",
+    EvaluationMetricName.CONTEXT_RELEVANCE: "ContextRelevance",
+    EvaluationMetricName.TOOL_CORRECTNESS: "ToolCallAccuracy",
+}
 
 
 class RagasEvaluator(BaseEvaluationInstance):
@@ -29,15 +41,16 @@ class RagasEvaluator(BaseEvaluationInstance):
     def get_supported_metrics(self, category: EvaluationCategory) -> list[str]:
         match category:
             case EvaluationCategory.LLM:
-                return LLM_METRICS
+                candidates = LLM_METRIC_NAMES
             case EvaluationCategory.RETRIEVAL:
-                return RETRIEVAL_METRICS
+                candidates = RETRIEVAL_METRIC_NAMES
             case EvaluationCategory.AGENT:
-                return AGENT_METRICS
-            case EvaluationCategory.WORKFLOW:
-                return WORKFLOW_METRICS
+                candidates = AGENT_METRIC_NAMES
+            case EvaluationCategory.WORKFLOW | EvaluationCategory.SNIPPET:
+                candidates = WORKFLOW_METRIC_NAMES
             case _:
                 return []
+        return [m for m in candidates if m in _RAGAS_METRIC_MAP]
 
     def evaluate_llm(
         self,
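Comparing the two maps shows what the canonical layer buys: ragas covers answer_correctness and semantic_similarity but has no task_completion, while deepeval is the reverse. A quick set comparison over the map keys as defined in this commit:

    ragas_keys = {"faithfulness", "answer_relevancy", "answer_correctness", "semantic_similarity",
                  "context_precision", "context_recall", "context_relevance", "tool_correctness"}
    deepeval_keys = {"faithfulness", "answer_relevancy", "context_precision", "context_recall",
                     "context_relevance", "tool_correctness", "task_completion"}
    print(sorted(ragas_keys - deepeval_keys))  # ['answer_correctness', 'semantic_similarity']
    print(sorted(deepeval_keys - ragas_keys))  # ['task_completion']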
@@ -250,7 +263,7 @@ class RagasEvaluator(BaseEvaluationInstance):
 
     @staticmethod
     def _build_ragas_metrics(requested_metrics: list[str]) -> list[Any]:
-        """Build RAGAS metric instances from metric names."""
+        """Build RAGAS metric instances from canonical metric names."""
         try:
             from ragas.metrics.collections import (
                 AnswerCorrectness,
@@ -263,24 +276,25 @@ class RagasEvaluator(BaseEvaluationInstance):
                 ToolCallAccuracy,
             )
 
-            metric_map: dict[str, Any] = {
-                "faithfulness": Faithfulness,
-                "answer_relevancy": AnswerRelevancy,
-                "answer_correctness": AnswerCorrectness,
-                "semantic_similarity": SemanticSimilarity,
-                "context_precision": ContextPrecision,
-                "context_recall": ContextRecall,
-                "context_relevance": ContextRelevance,
-                "tool_call_accuracy": ToolCallAccuracy,
+            # Maps canonical name → ragas metric class
+            ragas_class_map: dict[str, Any] = {
+                EvaluationMetricName.FAITHFULNESS: Faithfulness,
+                EvaluationMetricName.ANSWER_RELEVANCY: AnswerRelevancy,
+                EvaluationMetricName.ANSWER_CORRECTNESS: AnswerCorrectness,
+                EvaluationMetricName.SEMANTIC_SIMILARITY: SemanticSimilarity,
+                EvaluationMetricName.CONTEXT_PRECISION: ContextPrecision,
+                EvaluationMetricName.CONTEXT_RECALL: ContextRecall,
+                EvaluationMetricName.CONTEXT_RELEVANCE: ContextRelevance,
+                EvaluationMetricName.TOOL_CORRECTNESS: ToolCallAccuracy,
             }
 
             metrics = []
             for name in requested_metrics:
-                metric_class = metric_map.get(name)
+                metric_class = ragas_class_map.get(name)
                 if metric_class:
                     metrics.append(metric_class())
                 else:
-                    logger.warning("Unknown RAGAS metric: %s", name)
+                    logger.warning("Metric '%s' is not supported by RAGAS, skipping", name)
             return metrics
         except ImportError:
             logger.warning("RAGAS metrics not available")
@@ -78,44 +78,29 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
         """Compute agent evaluation metrics."""
-        result_by_index = {r.index: r for r in results}
-        merged_items = []
-        for item in items:
-            result = result_by_index.get(item.index)
-            context = []
-            if result and result.actual_output:
-                context.append(result.actual_output)
-            merged_items.append(
-                EvaluationItemInput(
-                    index=item.index,
-                    inputs=item.inputs,
-                    expected_output=item.expected_output,
-                    context=context + (item.context or []),
-                )
-            )
-
-        evaluated = self.evaluation_instance.evaluate_agent(
-            merged_items, default_metrics, model_provider, model_name, tenant_id
+        if not node_run_result_list:
+            return []
+        if not default_metric:
+            raise ValueError("Default metric is required for agent evaluation")
+        merged_items = self._merge_results_into_items(node_run_result_list)
+        return self.evaluation_instance.evaluate_agent(
+            merged_items, default_metric.metric, model_provider, model_name, tenant_id
         )
 
-        # Merge metrics back preserving metadata
-        eval_by_index = {r.index: r for r in evaluated}
-        final_results = []
-        for result in results:
-            if result.index in eval_by_index:
-                eval_result = eval_by_index[result.index]
-                final_results.append(
-                    EvaluationItemResult(
-                        index=result.index,
-                        actual_output=result.actual_output,
-                        metrics=eval_result.metrics,
-                        metadata=result.metadata,
-                        error=result.error,
-                    )
-                )
-            else:
-                final_results.append(result)
-        return final_results
+    @staticmethod
+    def _merge_results_into_items(items: list[NodeRunResult]) -> list[EvaluationItemInput]:
+        """Create EvaluationItemInput list from NodeRunResult for agent evaluation."""
+        merged = []
+        for i, item in enumerate(items):
+            output = _extract_agent_output(item.outputs)
+            merged.append(
+                EvaluationItemInput(
+                    index=i,
+                    inputs=dict(item.inputs),
+                    output=output,
+                )
+            )
+        return merged
 
     @staticmethod
     def _extract_query(inputs: dict[str, Any]) -> str:
@@ -157,3 +142,13 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
             logger.exception("Error consuming agent stream")
 
         return "".join(answer_parts), tool_calls
+
+
+def _extract_agent_output(outputs: Mapping[str, Any]) -> str:
+    """Extract the primary output text from agent NodeRunResult.outputs."""
+    if "answer" in outputs:
+        return str(outputs["answer"])
+    if "text" in outputs:
+        return str(outputs["text"])
+    values = list(outputs.values())
+    return str(values[0]) if values else ""
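The helper prefers an "answer" key, then "text", then falls back to the first value; an empty mapping yields an empty string. A runnable check of that precedence (the body is copied from the hunk above so the snippet stands alone):

    from collections.abc import Mapping
    from typing import Any

    def _extract_agent_output(outputs: Mapping[str, Any]) -> str:
        if "answer" in outputs:
            return str(outputs["answer"])
        if "text" in outputs:
            return str(outputs["text"])
        values = list(outputs.values())
        return str(values[0]) if values else ""

    assert _extract_agent_output({"answer": "42", "text": "ignored"}) == "42"
    assert _extract_agent_output({"text": "hello"}) == "hello"
    assert _extract_agent_output({"misc": 7}) == "7"
    assert _extract_agent_output({}) == ""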
@@ -63,7 +63,8 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner):
 
     @staticmethod
     def _extract_query(inputs: dict[str, Any]) -> str:
-        for key in "query":
+        for key in ("query", "question", "input", "text"):
             if key in inputs:
                 return str(inputs[key])
-        return ""
+        values = list(inputs.values())
+        return str(values[0]) if values else ""
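The removed line was a real bug, not a style issue: iterating a string yields its characters, so `for key in "query"` looked up the keys "q", "u", "e", "r", "y" and never "query" itself. A runnable demonstration of the old versus new lookup:

    inputs = {"query": "what is dify?"}

    # Old behavior: iterating the string "query" visits single characters,
    # so the literal key "query" is never tested.
    print([key for key in "query" if key in inputs])  # []

    # New behavior: iterate a tuple of candidate key names.
    print([key for key in ("query", "question", "input", "text") if key in inputs])  # ['query']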
@@ -109,44 +109,29 @@ class SnippetEvaluationRunner(BaseEvaluationRunner):
         Snippets are essentially workflows, so we reuse evaluate_workflow from
         the evaluation instance.
         """
-        result_by_index = {r.index: r for r in results}
-        merged_items = []
-        for item in items:
-            result = result_by_index.get(item.index)
-            context = []
-            if result and result.actual_output:
-                context.append(result.actual_output)
-            merged_items.append(
-                EvaluationItemInput(
-                    index=item.index,
-                    inputs=item.inputs,
-                    expected_output=item.expected_output,
-                    context=context + (item.context or []),
-                )
-            )
-
-        evaluated = self.evaluation_instance.evaluate_workflow(
-            merged_items, default_metrics, model_provider, model_name, tenant_id
+        if not node_run_result_list:
+            return []
+        if not default_metric:
+            raise ValueError("Default metric is required for snippet evaluation")
+        merged_items = self._merge_results_into_items(node_run_result_list)
+        return self.evaluation_instance.evaluate_workflow(
+            merged_items, default_metric.metric, model_provider, model_name, tenant_id
         )
 
-        # Merge metrics back preserving metadata from Phase 1
-        eval_by_index = {r.index: r for r in evaluated}
-        final_results = []
-        for result in results:
-            if result.index in eval_by_index:
-                eval_result = eval_by_index[result.index]
-                final_results.append(
-                    EvaluationItemResult(
-                        index=result.index,
-                        actual_output=result.actual_output,
-                        metrics=eval_result.metrics,
-                        metadata=result.metadata,
-                        error=result.error,
-                    )
-                )
-            else:
-                final_results.append(result)
-        return final_results
+    @staticmethod
+    def _merge_results_into_items(items: list[NodeRunResult]) -> list[EvaluationItemInput]:
+        """Create EvaluationItemInput list from NodeRunResult for snippet evaluation."""
+        merged = []
+        for i, item in enumerate(items):
+            output = _extract_snippet_output(item.outputs)
+            merged.append(
+                EvaluationItemInput(
+                    index=i,
+                    inputs=dict(item.inputs),
+                    output=output,
+                )
+            )
+        return merged
 
     @staticmethod
     def _extract_output(response: Mapping[str, Any]) -> str:
@@ -235,3 +220,13 @@ class SnippetEvaluationRunner(BaseEvaluationRunner):
             "error": node.error,
             "elapsed_time": node.elapsed_time,
         }
+
+
+def _extract_snippet_output(outputs: Mapping[str, Any]) -> str:
+    """Extract the primary output text from snippet NodeRunResult.outputs."""
+    if "answer" in outputs:
+        return str(outputs["answer"])
+    if "text" in outputs:
+        return str(outputs["text"])
+    values = list(outputs.values())
+    return str(values[0]) if values else ""
@@ -34,44 +34,29 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
         """Compute workflow evaluation metrics (end-to-end)."""
-        result_by_index = {r.index: r for r in results}
-        merged_items = []
-        for item in items:
-            result = result_by_index.get(item.index)
-            context = []
-            if result and result.actual_output:
-                context.append(result.actual_output)
-            merged_items.append(
-                EvaluationItemInput(
-                    index=item.index,
-                    inputs=item.inputs,
-                    expected_output=item.expected_output,
-                    context=context + (item.context or []),
-                )
-            )
-
-        evaluated = self.evaluation_instance.evaluate_workflow(
-            merged_items, default_metrics, model_provider, model_name, tenant_id
+        if not node_run_result_list:
+            return []
+        if not default_metric:
+            raise ValueError("Default metric is required for workflow evaluation")
+        merged_items = self._merge_results_into_items(node_run_result_list)
+        return self.evaluation_instance.evaluate_workflow(
+            merged_items, default_metric.metric, model_provider, model_name, tenant_id
         )
 
-        # Merge metrics back preserving metadata
-        eval_by_index = {r.index: r for r in evaluated}
-        final_results = []
-        for result in results:
-            if result.index in eval_by_index:
-                eval_result = eval_by_index[result.index]
-                final_results.append(
-                    EvaluationItemResult(
-                        index=result.index,
-                        actual_output=result.actual_output,
-                        metrics=eval_result.metrics,
-                        metadata=result.metadata,
-                        error=result.error,
-                    )
-                )
-            else:
-                final_results.append(result)
-        return final_results
+    @staticmethod
+    def _merge_results_into_items(items: list[NodeRunResult]) -> list[EvaluationItemInput]:
+        """Create EvaluationItemInput list from NodeRunResult for workflow evaluation."""
+        merged = []
+        for i, item in enumerate(items):
+            output = _extract_workflow_output(item.outputs)
+            merged.append(
+                EvaluationItemInput(
+                    index=i,
+                    inputs=dict(item.inputs),
+                    output=output,
+                )
+            )
+        return merged
 
     @staticmethod
     def _extract_output(response: Mapping[str, Any]) -> str:
@@ -91,3 +76,13 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
         if isinstance(data, Mapping):
             return data.get("node_executions", [])
         return []
+
+
+def _extract_workflow_output(outputs: Mapping[str, Any]) -> str:
+    """Extract the primary output text from workflow NodeRunResult.outputs."""
+    if "answer" in outputs:
+        return str(outputs["answer"])
+    if "text" in outputs:
+        return str(outputs["text"])
+    values = list(outputs.values())
+    return str(values[0]) if values else ""
@@ -105,6 +105,7 @@ class EvaluationRun(Base):
     error: Mapped[str | None] = mapped_column(Text, nullable=True)
 
     celery_task_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
+    metrics_summary: Mapped[str | None] = mapped_column(LongText, nullable=True)
 
     created_by: Mapped[str] = mapped_column(StringUUID, nullable=False)
     started_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
@@ -12,6 +12,7 @@ from configs import dify_config
 from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
 from core.evaluation.entities.evaluation_entity import (
     EvaluationCategory,
+    EvaluationDatasetInput,
     EvaluationItemResult,
     EvaluationRunData,
 )
@@ -88,23 +89,23 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None:
     )
 
     results: list[EvaluationItemResult] = _execute_evaluation_runner(
-        session,
-        run_data,
-        evaluation_instance,
-        node_run_result_mapping_list,
+        session=session,
+        run_data=run_data,
+        evaluation_instance=evaluation_instance,
+        node_run_result_mapping_list=node_run_result_mapping_list,
     )
 
     # Compute summary metrics
     metrics_summary = _compute_metrics_summary(results, run_data.judgment_config)
 
     # Generate result XLSX
-    result_xlsx = _generate_result_xlsx(run_data.items, results)
+    result_xlsx = _generate_result_xlsx(run_data.input_list, results)
 
     # Store result file
     result_file_id = _store_result_file(run_data.tenant_id, run_data.evaluation_run_id, result_xlsx, session)
 
     # Update run to completed
-    evaluation_run = session.query(EvaluationRun).filter_by(id=run_data.evaluation_run_id).first()
+    evaluation_run: EvaluationRun = session.query(EvaluationRun).filter_by(id=run_data.evaluation_run_id).first()
     if evaluation_run:
         evaluation_run.status = EvaluationRunStatus.COMPLETED
         evaluation_run.completed_at = naive_utc_now()
@@ -232,10 +233,10 @@ def _compute_metrics_summary(
 
 
 def _generate_result_xlsx(
-    items: list[Any],
+    input_list: list[EvaluationDatasetInput],
     results: list[EvaluationItemResult],
 ) -> bytes:
-    """Generate result XLSX with input data, actual output, and metric scores."""
+    """Generate result XLSX with input data, actual output, metric scores, and judgment."""
     wb = Workbook()
     ws = wb.active
     if ws is None:
@@ -261,14 +262,18 @@
 
     # Collect all input keys
    input_keys: list[str] = []
-    for item in items:
+    for item in input_list:
         for key in item.inputs:
             if key not in input_keys:
                 input_keys.append(key)
 
+    # Include judgment column only when at least one result has judgment conditions evaluated
+    has_judgment = any(bool(r.judgment.condition_results) for r in results)
+
     # Build headers
+    judgment_headers = ["judgment"] if has_judgment else []
     headers = (
-        ["index"] + input_keys + ["expected_output", "actual_output"] + all_metric_names + ["overall_score", "error"]
+        ["index"] + input_keys + ["expected_output", "actual_output"] + all_metric_names + judgment_headers + ["error"]
     )
 
     # Write header row
@@ -288,7 +293,7 @@
     result_by_index = {r.index: r for r in results}
 
     # Write data rows
-    for row_idx, item in enumerate(items, start=2):
+    for row_idx, item in enumerate(input_list, start=2):
         result = result_by_index.get(item.index)
 
         col = 1
@@ -317,9 +322,14 @@
             ws.cell(row=row_idx, column=col, value=score if score is not None else "").border = thin_border
             col += 1
 
-        # Overall score
-        ws.cell(row=row_idx, column=col, value=result.overall_score if result else "").border = thin_border
-        col += 1
+        # Judgment result
+        if has_judgment:
+            if result and result.judgment.condition_results:
+                judgment_value = "Pass" if result.judgment.passed else "Fail"
+            else:
+                judgment_value = ""
+            ws.cell(row=row_idx, column=col, value=judgment_value).border = thin_border
+            col += 1
 
         # Error
         ws.cell(row=row_idx, column=col, value=result.error if result else "").border = thin_border
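With the overall_score column replaced by a conditional judgment column, the header row is now data-dependent. A small sketch of the header construction under this commit's logic, with illustrative sample values for the collected input keys and metric names:

    input_keys = ["query"]               # illustrative: collected from item.inputs
    all_metric_names = ["faithfulness"]  # illustrative: collected from results
    has_judgment = True                  # True when any result has condition_results

    judgment_headers = ["judgment"] if has_judgment else []
    headers = (
        ["index"] + input_keys + ["expected_output", "actual_output"]
        + all_metric_names + judgment_headers + ["error"]
    )
    print(headers)
    # ['index', 'query', 'expected_output', 'actual_output', 'faithfulness', 'judgment', 'error']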