evaluation runtime

jyong
2026-03-17 15:26:39 +08:00
parent 751c938d8a
commit f692def738
9 changed files with 258 additions and 182 deletions

View File

@ -15,6 +15,56 @@ class EvaluationCategory(StrEnum):
RETRIEVAL_TEST = "retrieval_test"
class EvaluationMetricName(StrEnum):
"""Canonical metric names shared across all evaluation frameworks.
Each framework maps these names to its own internal implementation.
A framework that does not support a given metric should log a warning
and skip it rather than raising an error.
"""
# LLM / general text-quality metrics
FAITHFULNESS = "faithfulness"
ANSWER_RELEVANCY = "answer_relevancy"
ANSWER_CORRECTNESS = "answer_correctness"
SEMANTIC_SIMILARITY = "semantic_similarity"
# Retrieval-quality metrics
CONTEXT_PRECISION = "context_precision"
CONTEXT_RECALL = "context_recall"
CONTEXT_RELEVANCE = "context_relevance"
# Agent-quality metrics
TOOL_CORRECTNESS = "tool_correctness"
TASK_COMPLETION = "task_completion"
# Per-category canonical metric lists used by get_supported_metrics().
LLM_METRIC_NAMES: list[EvaluationMetricName] = [
EvaluationMetricName.FAITHFULNESS,
EvaluationMetricName.ANSWER_RELEVANCY,
EvaluationMetricName.ANSWER_CORRECTNESS,
EvaluationMetricName.SEMANTIC_SIMILARITY,
]
RETRIEVAL_METRIC_NAMES: list[EvaluationMetricName] = [
EvaluationMetricName.CONTEXT_PRECISION,
EvaluationMetricName.CONTEXT_RECALL,
EvaluationMetricName.CONTEXT_RELEVANCE,
]
AGENT_METRIC_NAMES: list[EvaluationMetricName] = [
EvaluationMetricName.TOOL_CORRECTNESS,
EvaluationMetricName.TASK_COMPLETION,
]
WORKFLOW_METRIC_NAMES: list[EvaluationMetricName] = [
EvaluationMetricName.FAITHFULNESS,
EvaluationMetricName.ANSWER_RELEVANCY,
EvaluationMetricName.ANSWER_CORRECTNESS,
]
class EvaluationMetric(BaseModel):
name: str
value: Any
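The docstring above defines a contract rather than an implementation: each framework adapter filters the canonical per-category lists against its own name map and warns on anything it cannot serve. A minimal sketch of that pattern (the map name and its contents here are hypothetical; the real maps appear in the DeepEval and RAGAS evaluators below):

# Hypothetical framework-side map: canonical metric name -> framework class name.
_EXAMPLE_FRAMEWORK_MAP: dict[EvaluationMetricName, str] = {
    EvaluationMetricName.FAITHFULNESS: "FaithfulnessMetric",
    EvaluationMetricName.ANSWER_RELEVANCY: "AnswerRelevancyMetric",
}

def example_supported_llm_metrics() -> list[EvaluationMetricName]:
    # Keep only the canonical LLM metrics this framework can actually compute;
    # unsupported names are skipped (and, in the real evaluators, logged) rather than raised.
    return [m for m in LLM_METRIC_NAMES if m in _EXAMPLE_FRAMEWORK_MAP]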

View File

@ -4,30 +4,39 @@ from typing import Any
from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
from core.evaluation.entities.config_entity import DeepEvalConfig
from core.evaluation.entities.evaluation_entity import (
AGENT_METRIC_NAMES,
LLM_METRIC_NAMES,
RETRIEVAL_METRIC_NAMES,
WORKFLOW_METRIC_NAMES,
EvaluationCategory,
EvaluationItemInput,
EvaluationItemResult,
EvaluationMetric,
EvaluationMetricName,
)
from core.evaluation.frameworks.ragas.ragas_model_wrapper import DifyModelWrapper
logger = logging.getLogger(__name__)
# Metric name mappings per category
#
# Maps canonical EvaluationMetricName to the corresponding deepeval metric class name.
# deepeval metric field requirements (LLMTestCase fields):
# - faithfulness: input, actual_output, retrieval_context
# - answer_relevancy: input, actual_output
# - contextual_precision: input, actual_output, expected_output, retrieval_context
# - contextual_recall: input, actual_output, expected_output, retrieval_context
# - contextual_relevancy: input, actual_output, retrieval_context
# - hallucination: input, actual_output, context
# - tool_correctness: input, actual_output, expected_tools
# - task_completion: input, actual_output
LLM_METRICS = ["faithfulness", "answer_relevancy"]
RETRIEVAL_METRICS = ["contextual_precision", "contextual_recall", "contextual_relevancy"]
AGENT_METRICS = ["tool_correctness", "task_completion"]
WORKFLOW_METRICS = ["faithfulness", "answer_relevancy"]
# - faithfulness: input, actual_output, retrieval_context
# - answer_relevancy: input, actual_output
# - context_precision: input, actual_output, expected_output, retrieval_context
# - context_recall: input, actual_output, expected_output, retrieval_context
# - context_relevance: input, actual_output, retrieval_context
# - tool_correctness: input, actual_output, expected_tools
# - task_completion: input, actual_output
# Metrics not listed here are unsupported by deepeval and will be skipped.
_DEEPEVAL_METRIC_MAP: dict[EvaluationMetricName, str] = {
EvaluationMetricName.FAITHFULNESS: "FaithfulnessMetric",
EvaluationMetricName.ANSWER_RELEVANCY: "AnswerRelevancyMetric",
EvaluationMetricName.CONTEXT_PRECISION: "ContextualPrecisionMetric",
EvaluationMetricName.CONTEXT_RECALL: "ContextualRecallMetric",
EvaluationMetricName.CONTEXT_RELEVANCE: "ContextualRelevancyMetric",
EvaluationMetricName.TOOL_CORRECTNESS: "ToolCorrectnessMetric",
EvaluationMetricName.TASK_COMPLETION: "TaskCompletionMetric",
}
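As a rough illustration of the field requirements listed in the comments above, a retrieval-style test case could be assembled like this (a hedged sketch; the query, answer, and document strings are hypothetical, and the real evaluator builds its cases in _build_test_case):

from deepeval.test_case import LLMTestCase

# Satisfies the context_precision / context_recall requirements above:
# input, actual_output, expected_output, retrieval_context.
example_case = LLMTestCase(
    input="What is the capital of France?",                 # hypothetical query
    actual_output="Paris is the capital of France.",        # hypothetical model answer
    expected_output="Paris",                                 # hypothetical ground truth
    retrieval_context=["Paris is the capital and most populous city of France."],
)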
class DeepEvalEvaluator(BaseEvaluationInstance):
@ -39,15 +48,16 @@ class DeepEvalEvaluator(BaseEvaluationInstance):
def get_supported_metrics(self, category: EvaluationCategory) -> list[str]:
match category:
case EvaluationCategory.LLM:
return LLM_METRICS
candidates = LLM_METRIC_NAMES
case EvaluationCategory.RETRIEVAL:
return RETRIEVAL_METRICS
candidates = RETRIEVAL_METRIC_NAMES
case EvaluationCategory.AGENT:
return AGENT_METRICS
case EvaluationCategory.WORKFLOW:
return WORKFLOW_METRICS
candidates = AGENT_METRIC_NAMES
case EvaluationCategory.WORKFLOW | EvaluationCategory.SNIPPET:
candidates = WORKFLOW_METRIC_NAMES
case _:
return []
return [m for m in candidates if m in _DEEPEVAL_METRIC_MAP]
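A concrete consequence of this filter, assuming the lists and map shown above: for the LLM category, DeepEval advertises only the two metrics it can actually map.

# Illustrative only (evaluator is a hypothetical DeepEvalEvaluator instance):
evaluator.get_supported_metrics(EvaluationCategory.LLM)
# -> [EvaluationMetricName.FAITHFULNESS, EvaluationMetricName.ANSWER_RELEVANCY]
# answer_correctness and semantic_similarity are canonical LLM metrics but have no entry
# in _DEEPEVAL_METRIC_MAP, so they are filtered out here (RAGAS does map them, per its map below).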
def evaluate_llm(
self,
@ -121,8 +131,8 @@ class DeepEvalEvaluator(BaseEvaluationInstance):
- Retrieval: input=query, actual_output=output, expected_output, retrieval_context=context
- Agent: input=query, actual_output=output
"""
deepeval_metrics = _build_deepeval_metrics(requested_metrics)
if not deepeval_metrics:
metric_pairs = _build_deepeval_metrics(requested_metrics)
if not metric_pairs:
logger.warning("No valid DeepEval metrics found for: %s", requested_metrics)
return [EvaluationItemResult(index=item.index) for item in items]
@ -130,15 +140,15 @@ class DeepEvalEvaluator(BaseEvaluationInstance):
for item in items:
test_case = self._build_test_case(item, category)
metrics: list[EvaluationMetric] = []
for metric in deepeval_metrics:
for canonical_name, metric in metric_pairs:
try:
metric.measure(test_case)
if metric.score is not None:
metrics.append(EvaluationMetric(name=metric.__class__.__name__, value=float(metric.score)))
metrics.append(EvaluationMetric(name=canonical_name, value=float(metric.score)))
except Exception:
logger.exception(
"Failed to compute metric %s for item %d",
metric.__class__.__name__,
canonical_name,
item.index,
)
results.append(EvaluationItemResult(index=item.index, metrics=metrics))
@ -248,8 +258,12 @@ def _format_input(inputs: dict[str, Any], category: EvaluationCategory) -> str:
return str(next(iter(inputs.values()), "")) if inputs else ""
def _build_deepeval_metrics(requested_metrics: list[str]) -> list[Any]:
"""Build DeepEval metric instances from metric names."""
def _build_deepeval_metrics(requested_metrics: list[str]) -> list[tuple[str, Any]]:
"""Build DeepEval metric instances from canonical metric names.
Returns a list of (canonical_name, metric_instance) pairs so that callers
can record the canonical name rather than the framework-internal class name.
"""
try:
from deepeval.metrics import (
AnswerRelevancyMetric,
@ -261,24 +275,25 @@ def _build_deepeval_metrics(requested_metrics: list[str]) -> list[Any]:
ToolCorrectnessMetric,
)
metric_map: dict[str, Any] = {
"faithfulness": FaithfulnessMetric,
"answer_relevancy": AnswerRelevancyMetric,
"contextual_precision": ContextualPrecisionMetric,
"contextual_recall": ContextualRecallMetric,
"contextual_relevancy": ContextualRelevancyMetric,
"tool_correctness": ToolCorrectnessMetric,
"task_completion": TaskCompletionMetric,
# Maps canonical name → deepeval metric class
deepeval_class_map: dict[str, Any] = {
EvaluationMetricName.FAITHFULNESS: FaithfulnessMetric,
EvaluationMetricName.ANSWER_RELEVANCY: AnswerRelevancyMetric,
EvaluationMetricName.CONTEXT_PRECISION: ContextualPrecisionMetric,
EvaluationMetricName.CONTEXT_RECALL: ContextualRecallMetric,
EvaluationMetricName.CONTEXT_RELEVANCE: ContextualRelevancyMetric,
EvaluationMetricName.TOOL_CORRECTNESS: ToolCorrectnessMetric,
EvaluationMetricName.TASK_COMPLETION: TaskCompletionMetric,
}
metrics = []
pairs: list[tuple[str, Any]] = []
for name in requested_metrics:
metric_class = metric_map.get(name)
metric_class = deepeval_class_map.get(name)
if metric_class:
metrics.append(metric_class(threshold=0.5))
pairs.append((name, metric_class(threshold=0.5)))
else:
logger.warning("Unknown DeepEval metric: %s", name)
return metrics
logger.warning("Metric '%s' is not supported by DeepEval, skipping", name)
return pairs
except ImportError:
logger.warning("DeepEval metrics not available")
return []
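A hedged usage sketch of the pair-returning helper (the requested list is hypothetical; names and threshold follow the code above):

pairs = _build_deepeval_metrics(["faithfulness", "semantic_similarity"])
# -> [("faithfulness", FaithfulnessMetric(threshold=0.5))]
# "semantic_similarity" has no DeepEval mapping, so a warning is logged and it is skipped;
# the caller then records EvaluationMetric(name="faithfulness", ...) using the canonical
# name rather than the FaithfulnessMetric class name.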

View File

@ -4,20 +4,32 @@ from typing import Any
from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
from core.evaluation.entities.config_entity import RagasConfig
from core.evaluation.entities.evaluation_entity import (
AGENT_METRIC_NAMES,
LLM_METRIC_NAMES,
RETRIEVAL_METRIC_NAMES,
WORKFLOW_METRIC_NAMES,
EvaluationCategory,
EvaluationItemInput,
EvaluationItemResult,
EvaluationMetric,
EvaluationMetricName,
)
from core.evaluation.frameworks.ragas.ragas_model_wrapper import DifyModelWrapper
logger = logging.getLogger(__name__)
# Metric name mappings per category
LLM_METRICS = ["faithfulness", "answer_relevancy", "answer_correctness", "semantic_similarity"]
RETRIEVAL_METRICS = ["context_precision", "context_recall", "context_relevance"]
AGENT_METRICS = ["tool_call_accuracy", "answer_correctness"]
WORKFLOW_METRICS = ["faithfulness", "answer_correctness"]
# Maps canonical EvaluationMetricName to the corresponding ragas metric class.
# Metrics not listed here are unsupported by ragas and will be skipped.
_RAGAS_METRIC_MAP: dict[EvaluationMetricName, str] = {
EvaluationMetricName.FAITHFULNESS: "Faithfulness",
EvaluationMetricName.ANSWER_RELEVANCY: "AnswerRelevancy",
EvaluationMetricName.ANSWER_CORRECTNESS: "AnswerCorrectness",
EvaluationMetricName.SEMANTIC_SIMILARITY: "SemanticSimilarity",
EvaluationMetricName.CONTEXT_PRECISION: "ContextPrecision",
EvaluationMetricName.CONTEXT_RECALL: "ContextRecall",
EvaluationMetricName.CONTEXT_RELEVANCE: "ContextRelevance",
EvaluationMetricName.TOOL_CORRECTNESS: "ToolCallAccuracy",
}
class RagasEvaluator(BaseEvaluationInstance):
@ -29,15 +41,16 @@ class RagasEvaluator(BaseEvaluationInstance):
def get_supported_metrics(self, category: EvaluationCategory) -> list[str]:
match category:
case EvaluationCategory.LLM:
return LLM_METRICS
candidates = LLM_METRIC_NAMES
case EvaluationCategory.RETRIEVAL:
return RETRIEVAL_METRICS
candidates = RETRIEVAL_METRIC_NAMES
case EvaluationCategory.AGENT:
return AGENT_METRICS
case EvaluationCategory.WORKFLOW:
return WORKFLOW_METRICS
candidates = AGENT_METRIC_NAMES
case EvaluationCategory.WORKFLOW | EvaluationCategory.SNIPPET:
candidates = WORKFLOW_METRIC_NAMES
case _:
return []
return [m for m in candidates if m in _RAGAS_METRIC_MAP]
def evaluate_llm(
self,
@ -250,7 +263,7 @@ class RagasEvaluator(BaseEvaluationInstance):
@staticmethod
def _build_ragas_metrics(requested_metrics: list[str]) -> list[Any]:
"""Build RAGAS metric instances from metric names."""
"""Build RAGAS metric instances from canonical metric names."""
try:
from ragas.metrics.collections import (
AnswerCorrectness,
@ -263,24 +276,25 @@ class RagasEvaluator(BaseEvaluationInstance):
ToolCallAccuracy,
)
metric_map: dict[str, Any] = {
"faithfulness": Faithfulness,
"answer_relevancy": AnswerRelevancy,
"answer_correctness": AnswerCorrectness,
"semantic_similarity": SemanticSimilarity,
"context_precision": ContextPrecision,
"context_recall": ContextRecall,
"context_relevance": ContextRelevance,
"tool_call_accuracy": ToolCallAccuracy,
# Maps canonical name → ragas metric class
ragas_class_map: dict[str, Any] = {
EvaluationMetricName.FAITHFULNESS: Faithfulness,
EvaluationMetricName.ANSWER_RELEVANCY: AnswerRelevancy,
EvaluationMetricName.ANSWER_CORRECTNESS: AnswerCorrectness,
EvaluationMetricName.SEMANTIC_SIMILARITY: SemanticSimilarity,
EvaluationMetricName.CONTEXT_PRECISION: ContextPrecision,
EvaluationMetricName.CONTEXT_RECALL: ContextRecall,
EvaluationMetricName.CONTEXT_RELEVANCE: ContextRelevance,
EvaluationMetricName.TOOL_CORRECTNESS: ToolCallAccuracy,
}
metrics = []
for name in requested_metrics:
metric_class = metric_map.get(name)
metric_class = ragas_class_map.get(name)
if metric_class:
metrics.append(metric_class())
else:
logger.warning("Unknown RAGAS metric: %s", name)
logger.warning("Metric '%s' is not supported by RAGAS, skipping", name)
return metrics
except ImportError:
logger.warning("RAGAS metrics not available")

View File

@ -78,44 +78,29 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
tenant_id: str,
) -> list[EvaluationItemResult]:
"""Compute agent evaluation metrics."""
result_by_index = {r.index: r for r in results}
merged_items = []
for item in items:
result = result_by_index.get(item.index)
context = []
if result and result.actual_output:
context.append(result.actual_output)
merged_items.append(
EvaluationItemInput(
index=item.index,
inputs=item.inputs,
expected_output=item.expected_output,
context=context + (item.context or []),
)
)
evaluated = self.evaluation_instance.evaluate_agent(
merged_items, default_metrics, model_provider, model_name, tenant_id
if not node_run_result_list:
return []
if not default_metric:
raise ValueError("Default metric is required for agent evaluation")
merged_items = self._merge_results_into_items(node_run_result_list)
return self.evaluation_instance.evaluate_agent(
merged_items, default_metric.metric, model_provider, model_name, tenant_id
)
# Merge metrics back preserving metadata
eval_by_index = {r.index: r for r in evaluated}
final_results = []
for result in results:
if result.index in eval_by_index:
eval_result = eval_by_index[result.index]
final_results.append(
EvaluationItemResult(
index=result.index,
actual_output=result.actual_output,
metrics=eval_result.metrics,
metadata=result.metadata,
error=result.error,
)
@staticmethod
def _merge_results_into_items(items: list[NodeRunResult]) -> list[EvaluationItemInput]:
"""Create EvaluationItemInput list from NodeRunResult for agent evaluation."""
merged = []
for i, item in enumerate(items):
output = _extract_agent_output(item.outputs)
merged.append(
EvaluationItemInput(
index=i,
inputs=dict(item.inputs),
output=output,
)
else:
final_results.append(result)
return final_results
)
return merged
@staticmethod
def _extract_query(inputs: dict[str, Any]) -> str:
@ -157,3 +142,13 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
logger.exception("Error consuming agent stream")
return "".join(answer_parts), tool_calls
def _extract_agent_output(outputs: Mapping[str, Any]) -> str:
"""Extract the primary output text from agent NodeRunResult.outputs."""
if "answer" in outputs:
return str(outputs["answer"])
if "text" in outputs:
return str(outputs["text"])
values = list(outputs.values())
return str(values[0]) if values else ""
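The precedence of this helper, shown with hypothetical outputs dictionaries:

_extract_agent_output({"answer": "Paris", "text": "ignored"})  # -> "Paris"  ("answer" wins)
_extract_agent_output({"text": "hello"})                       # -> "hello"
_extract_agent_output({"score": 0.9})                          # -> "0.9"    (first value, stringified)
_extract_agent_output({})                                      # -> ""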

View File

@ -63,7 +63,8 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner):
@staticmethod
def _extract_query(inputs: dict[str, Any]) -> str:
for key in "query":
for key in ("query", "question", "input", "text"):
if key in inputs:
return str(inputs[key])
return ""
values = list(inputs.values())
return str(values[0]) if values else ""
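The removed line iterated over the string "query" itself, so the loop checked for the single-character keys 'q', 'u', 'e', 'r', 'y' and effectively never matched a real input key. The replacement iterates a tuple of candidate key names and falls back to the first value. A quick illustration (the input dictionaries are hypothetical):

list("query")  # -> ['q', 'u', 'e', 'r', 'y']  (the old bug: characters, not key names)
# With the fixed tuple lookup:
# {"question": "What is RAG?"} -> "What is RAG?"
# {"prompt": "hi"}             -> "hi"  (no candidate key matches, falls back to the first value)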

View File

@ -109,44 +109,29 @@ class SnippetEvaluationRunner(BaseEvaluationRunner):
Snippets are essentially workflows, so we reuse evaluate_workflow from
the evaluation instance.
"""
result_by_index = {r.index: r for r in results}
merged_items = []
for item in items:
result = result_by_index.get(item.index)
context = []
if result and result.actual_output:
context.append(result.actual_output)
merged_items.append(
EvaluationItemInput(
index=item.index,
inputs=item.inputs,
expected_output=item.expected_output,
context=context + (item.context or []),
)
)
evaluated = self.evaluation_instance.evaluate_workflow(
merged_items, default_metrics, model_provider, model_name, tenant_id
if not node_run_result_list:
return []
if not default_metric:
raise ValueError("Default metric is required for snippet evaluation")
merged_items = self._merge_results_into_items(node_run_result_list)
return self.evaluation_instance.evaluate_workflow(
merged_items, default_metric.metric, model_provider, model_name, tenant_id
)
# Merge metrics back preserving metadata from Phase 1
eval_by_index = {r.index: r for r in evaluated}
final_results = []
for result in results:
if result.index in eval_by_index:
eval_result = eval_by_index[result.index]
final_results.append(
EvaluationItemResult(
index=result.index,
actual_output=result.actual_output,
metrics=eval_result.metrics,
metadata=result.metadata,
error=result.error,
)
@staticmethod
def _merge_results_into_items(items: list[NodeRunResult]) -> list[EvaluationItemInput]:
"""Create EvaluationItemInput list from NodeRunResult for snippet evaluation."""
merged = []
for i, item in enumerate(items):
output = _extract_snippet_output(item.outputs)
merged.append(
EvaluationItemInput(
index=i,
inputs=dict(item.inputs),
output=output,
)
else:
final_results.append(result)
return final_results
)
return merged
@staticmethod
def _extract_output(response: Mapping[str, Any]) -> str:
@ -235,3 +220,13 @@ class SnippetEvaluationRunner(BaseEvaluationRunner):
"error": node.error,
"elapsed_time": node.elapsed_time,
}
def _extract_snippet_output(outputs: Mapping[str, Any]) -> str:
"""Extract the primary output text from snippet NodeRunResult.outputs."""
if "answer" in outputs:
return str(outputs["answer"])
if "text" in outputs:
return str(outputs["text"])
values = list(outputs.values())
return str(values[0]) if values else ""

View File

@ -34,44 +34,29 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
tenant_id: str,
) -> list[EvaluationItemResult]:
"""Compute workflow evaluation metrics (end-to-end)."""
result_by_index = {r.index: r for r in results}
merged_items = []
for item in items:
result = result_by_index.get(item.index)
context = []
if result and result.actual_output:
context.append(result.actual_output)
merged_items.append(
EvaluationItemInput(
index=item.index,
inputs=item.inputs,
expected_output=item.expected_output,
context=context + (item.context or []),
)
)
evaluated = self.evaluation_instance.evaluate_workflow(
merged_items, default_metrics, model_provider, model_name, tenant_id
if not node_run_result_list:
return []
if not default_metric:
raise ValueError("Default metric is required for workflow evaluation")
merged_items = self._merge_results_into_items(node_run_result_list)
return self.evaluation_instance.evaluate_workflow(
merged_items, default_metric.metric, model_provider, model_name, tenant_id
)
# Merge metrics back preserving metadata
eval_by_index = {r.index: r for r in evaluated}
final_results = []
for result in results:
if result.index in eval_by_index:
eval_result = eval_by_index[result.index]
final_results.append(
EvaluationItemResult(
index=result.index,
actual_output=result.actual_output,
metrics=eval_result.metrics,
metadata=result.metadata,
error=result.error,
)
@staticmethod
def _merge_results_into_items(items: list[NodeRunResult]) -> list[EvaluationItemInput]:
"""Create EvaluationItemInput list from NodeRunResult for workflow evaluation."""
merged = []
for i, item in enumerate(items):
output = _extract_workflow_output(item.outputs)
merged.append(
EvaluationItemInput(
index=i,
inputs=dict(item.inputs),
output=output,
)
else:
final_results.append(result)
return final_results
)
return merged
@staticmethod
def _extract_output(response: Mapping[str, Any]) -> str:
@ -91,3 +76,13 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
if isinstance(data, Mapping):
return data.get("node_executions", [])
return []
def _extract_workflow_output(outputs: Mapping[str, Any]) -> str:
"""Extract the primary output text from workflow NodeRunResult.outputs."""
if "answer" in outputs:
return str(outputs["answer"])
if "text" in outputs:
return str(outputs["text"])
values = list(outputs.values())
return str(values[0]) if values else ""

View File

@ -105,6 +105,7 @@ class EvaluationRun(Base):
error: Mapped[str | None] = mapped_column(Text, nullable=True)
celery_task_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
metrics_summary: Mapped[str | None] = mapped_column(LongText, nullable=True)
created_by: Mapped[str] = mapped_column(StringUUID, nullable=False)
started_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)

View File

@ -12,6 +12,7 @@ from configs import dify_config
from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
from core.evaluation.entities.evaluation_entity import (
EvaluationCategory,
EvaluationDatasetInput,
EvaluationItemResult,
EvaluationRunData,
)
@ -88,23 +89,23 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None:
)
results: list[EvaluationItemResult] = _execute_evaluation_runner(
session,
run_data,
evaluation_instance,
node_run_result_mapping_list,
session=session,
run_data=run_data,
evaluation_instance=evaluation_instance,
node_run_result_mapping_list=node_run_result_mapping_list,
)
# Compute summary metrics
metrics_summary = _compute_metrics_summary(results, run_data.judgment_config)
# Generate result XLSX
result_xlsx = _generate_result_xlsx(run_data.items, results)
result_xlsx = _generate_result_xlsx(run_data.input_list, results)
# Store result file
result_file_id = _store_result_file(run_data.tenant_id, run_data.evaluation_run_id, result_xlsx, session)
# Update run to completed
evaluation_run = session.query(EvaluationRun).filter_by(id=run_data.evaluation_run_id).first()
evaluation_run: EvaluationRun = session.query(EvaluationRun).filter_by(id=run_data.evaluation_run_id).first()
if evaluation_run:
evaluation_run.status = EvaluationRunStatus.COMPLETED
evaluation_run.completed_at = naive_utc_now()
@ -232,10 +233,10 @@ def _compute_metrics_summary(
def _generate_result_xlsx(
items: list[Any],
input_list: list[EvaluationDatasetInput],
results: list[EvaluationItemResult],
) -> bytes:
"""Generate result XLSX with input data, actual output, and metric scores."""
"""Generate result XLSX with input data, actual output, metric scores, and judgment."""
wb = Workbook()
ws = wb.active
if ws is None:
@ -261,14 +262,18 @@ def _generate_result_xlsx(
# Collect all input keys
input_keys: list[str] = []
for item in items:
for item in input_list:
for key in item.inputs:
if key not in input_keys:
input_keys.append(key)
# Include judgment column only when at least one result has judgment conditions evaluated
has_judgment = any(bool(r.judgment.condition_results) for r in results)
# Build headers
judgment_headers = ["judgment"] if has_judgment else []
headers = (
["index"] + input_keys + ["expected_output", "actual_output"] + all_metric_names + ["overall_score", "error"]
["index"] + input_keys + ["expected_output", "actual_output"] + all_metric_names + judgment_headers + ["error"]
)
# Write header row
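For a hypothetical run with one input key, one metric, and judgment conditions evaluated, the header row built above would come out as:

# input_keys = ["query"], all_metric_names = ["faithfulness"], has_judgment = True
# headers -> ["index", "query", "expected_output", "actual_output", "faithfulness", "judgment", "error"]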
@ -288,7 +293,7 @@ def _generate_result_xlsx(
result_by_index = {r.index: r for r in results}
# Write data rows
for row_idx, item in enumerate(items, start=2):
for row_idx, item in enumerate(input_list, start=2):
result = result_by_index.get(item.index)
col = 1
@ -317,9 +322,14 @@ def _generate_result_xlsx(
ws.cell(row=row_idx, column=col, value=score if score is not None else "").border = thin_border
col += 1
# Overall score
ws.cell(row=row_idx, column=col, value=result.overall_score if result else "").border = thin_border
col += 1
# Judgment result
if has_judgment:
if result and result.judgment.condition_results:
judgment_value = "Pass" if result.judgment.passed else "Fail"
else:
judgment_value = ""
ws.cell(row=row_idx, column=col, value=judgment_value).border = thin_border
col += 1
# Error
ws.cell(row=row_idx, column=col, value=result.error if result else "").border = thin_border