evaluation runtime

jyong
2026-03-13 16:54:23 +08:00
parent c0fac68f2d
commit 2ed0805c13
3 changed files with 24 additions and 32 deletions

View File

@ -41,14 +41,6 @@ class EvaluationItemResult(BaseModel):
metrics: list[EvaluationMetric] = Field(default_factory=list)
error: str | None = None
@property
def overall_score(self) -> float | None:
if not self.metrics:
return None
scores = [m.score for m in self.metrics]
return sum(scores) / len(scores)
class NodeInfo(BaseModel):
node_id: str
type: str
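With the overall_score property removed from EvaluationItemResult, per-item averaging no longer lives on the result model; the task code later in this commit aggregates metric values when it builds the summary. A minimal sketch of that kind of averaging, assuming only that each metric exposes a numeric value attribute (the helper name is illustrative):

from types import SimpleNamespace

def average_metric_value(metrics: list) -> float | None:
    # Sketch only: mirrors the removed property, but over metric.value.
    if not metrics:
        return None
    values = [float(m.value) for m in metrics]
    return sum(values) / len(values)

assert average_metric_value([]) is None
assert average_metric_value([SimpleNamespace(value=1.0), SimpleNamespace(value=0.5)]) == 0.75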

View File

@ -15,7 +15,6 @@ from core.evaluation.entities.evaluation_entity import (
EvaluationCategory,
EvaluationConfigData,
EvaluationDatasetInput,
EvaluationItemInput,
EvaluationRunData,
EvaluationRunRequest,
)
@ -156,6 +155,8 @@ class EvaluationService:
"""
wb = Workbook()
ws = wb.active
if ws is None:
ws = wb.create_sheet("Evaluation Dataset")
sheet_name = "Evaluation Dataset"
ws.title = sheet_name
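The new guard is there because openpyxl's Workbook.active is Optional as far as type checkers are concerned, so the code falls back to creating a sheet before touching it. A standalone sketch of the same pattern (sheet title and header mirror the surrounding export code):

from openpyxl import Workbook

wb = Workbook()
ws = wb.active
if ws is None:
    # A fresh Workbook normally has an active sheet; this branch satisfies the Optional type.
    ws = wb.create_sheet("Evaluation Dataset")
ws.title = "Evaluation Dataset"
ws.append(["index"])  # header row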
@ -174,7 +175,7 @@ class EvaluationService:
headers = ["index"]
for field in input_fields:
field_label = field.get("label") or field.get("variable")
field_label = str(field.get("label") or field.get("variable") or "")
headers.append(field_label)
# Write header row
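Wrapping the label in str(... or "") keeps the header a plain string even when both label and variable are missing or not strings, which is what the worksheet cell expects. A minimal sketch of the fallback chain with an illustrative field dict:

field = {"variable": "query", "label": None}  # illustrative input field definition
field_label = str(field.get("label") or field.get("variable") or "")
assert field_label == "query"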
@ -279,9 +280,6 @@ class EvaluationService:
if evaluation_instance is None:
raise EvaluationFrameworkNotConfiguredError()
# Derive evaluation_category from default_metrics node types
evaluation_category = cls._resolve_evaluation_category(run_request.default_metrics)
# Save as latest EvaluationConfiguration
config = cls.save_evaluation_config(
session=session,
@ -333,12 +331,10 @@ class EvaluationService:
target_id=target_id,
evaluation_model_provider=run_request.evaluation_model_provider,
evaluation_model=run_request.evaluation_model,
default_metrics=[m.model_dump() for m in run_request.default_metrics],
customized_metrics=(
run_request.customized_metrics.model_dump() if run_request.customized_metrics else None
),
default_metrics=run_request.default_metrics,
customized_metrics=run_request.customized_metrics,
judgment_config=run_request.judgment_config,
items=items,
input_list=items,
)
# Dispatch Celery task
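Passing default_metrics and customized_metrics through without model_dump() relies on EvaluationRunData declaring those fields with the same model types, so pydantic keeps them as typed objects instead of raw dicts. A rough sketch of the idea with illustrative stand-in models (only the field names mirror the call above; everything else is assumed):

from pydantic import BaseModel

class Metric(BaseModel):  # stand-in for the real metric entity
    name: str

class RunData(BaseModel):  # stand-in for EvaluationRunData
    default_metrics: list[Metric]

run_data = RunData(default_metrics=[Metric(name="accuracy")])
assert isinstance(run_data.default_metrics[0], Metric)  # no dump/re-parse round trip needed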
@ -648,7 +644,7 @@ class EvaluationService:
# ---- Dataset Parsing ----
@classmethod
def _parse_dataset(cls, xlsx_content: bytes) -> list[EvaluationItemInput]:
def _parse_dataset(cls, xlsx_content: bytes) -> list[EvaluationDatasetInput]:
"""Parse evaluation dataset from XLSX bytes."""
wb = load_workbook(io.BytesIO(xlsx_content), read_only=True)
ws = wb.active
@ -672,7 +668,7 @@ class EvaluationService:
index_val = values[0] if values else row_idx
try:
index = int(index_val)
index = int(str(index_val))
except (TypeError, ValueError):
index = row_idx
@ -681,17 +677,14 @@ class EvaluationService:
val = values[col_idx + 1] if col_idx + 1 < len(values) else None
inputs[header] = str(val) if val is not None else ""
# Check for expected_output column
# Extract expected_output column into dedicated field
expected_output = inputs.pop("expected_output", None)
context_str = inputs.pop("context", None)
context = context_str.split(";") if context_str else None
items.append(
EvaluationItemInput(
EvaluationDatasetInput(
index=index,
inputs=inputs,
expected_output=expected_output,
context=context,
)
)
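Each row keeps its free-form cells in inputs, while expected_output and a semicolon-separated context column are popped into dedicated fields of EvaluationDatasetInput. A minimal sketch of that row handling with illustrative cell values:

inputs = {"query": "What is the capital of France?", "expected_output": "Paris", "context": "doc_a;doc_b"}
expected_output = inputs.pop("expected_output", None)
context_str = inputs.pop("context", None)
context = context_str.split(";") if context_str else None
assert expected_output == "Paris"
assert context == ["doc_a", "doc_b"]
assert inputs == {"query": "What is the capital of France?"}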

View File

@ -25,6 +25,7 @@ from core.evaluation.runners.workflow_evaluation_runner import WorkflowEvaluatio
from core.workflow.node_events.base import NodeRunResult
from extensions.ext_database import db
from libs.datetime_utils import naive_utc_now
from models.enums import CreatorUserRole
from models.evaluation import EvaluationRun, EvaluationRunStatus
from models.model import UploadFile
from services.evaluation_service import EvaluationService
@ -116,6 +117,7 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None:
logger.info("Evaluation run %s completed successfully", run_data.evaluation_run_id)
def _execute_evaluation_runner(
session: Any,
run_data: EvaluationRunData,
@ -125,6 +127,7 @@ def _execute_evaluation_runner(
"""Execute the evaluation runner."""
default_metrics = run_data.default_metrics
customized_metrics = run_data.customized_metrics
results: list[EvaluationItemResult] = []
for default_metric in default_metrics:
for node_info in default_metric.node_info_list:
node_run_result_list: list[NodeRunResult] = []
@ -134,7 +137,7 @@ def _execute_evaluation_runner(
node_run_result_list.append(node_run_result)
if node_run_result_list:
runner = _create_runner(EvaluationCategory(node_info.type), evaluation_instance, session)
runner.run(
results.extend(runner.run(
evaluation_run_id=run_data.evaluation_run_id,
tenant_id=run_data.tenant_id,
target_id=run_data.target_id,
@ -144,10 +147,10 @@ def _execute_evaluation_runner(
model_provider=run_data.evaluation_model_provider,
model_name=run_data.evaluation_model,
node_run_result_list=node_run_result_list,
)
))
if customized_metrics:
runner = _create_runner(EvaluationCategory.WORKFLOW, evaluation_instance, session)
runner.run(
results.extend(runner.run(
evaluation_run_id=run_data.evaluation_run_id,
tenant_id=run_data.tenant_id,
target_id=run_data.target_id,
@ -156,7 +159,9 @@ def _execute_evaluation_runner(
customized_metrics=customized_metrics,
node_run_result_list=None,
node_run_result_mapping_list=node_run_result_mapping_list,
)
))
return results
def _create_runner(
category: EvaluationCategory,
@ -201,7 +206,7 @@ def _compute_metrics_summary(results: list[EvaluationItemResult]) -> dict[str, A
for metric in result.metrics:
if metric.name not in metric_scores:
metric_scores[metric.name] = []
metric_scores[metric.name].append(metric.score)
metric_scores[metric.name].append(float(metric.value))
summary: dict[str, Any] = {}
for name, scores in metric_scores.items():
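_compute_metrics_summary groups float(metric.value) scores by metric name; the loop that follows presumably reduces each list to a per-metric figure. A minimal sketch of that grouping plus a simple mean, assuming float-convertible values:

from collections import defaultdict

metric_scores: dict[str, list[float]] = defaultdict(list)
for name, value in [("accuracy", 1.0), ("accuracy", 0.5), ("faithfulness", 1.0)]:
    metric_scores[name].append(float(value))

summary = {name: sum(scores) / len(scores) for name, scores in metric_scores.items()}
assert summary == {"accuracy": 0.75, "faithfulness": 1.0}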
@ -231,6 +236,8 @@ def _generate_result_xlsx(
"""Generate result XLSX with input data, actual output, and metric scores."""
wb = Workbook()
ws = wb.active
if ws is None:
ws = wb.create_sheet("Evaluation Results")
ws.title = "Evaluation Results"
header_font = Font(bold=True, color="FFFFFF")
@ -306,7 +313,7 @@ def _generate_result_xlsx(
col += 1
# Metric scores
metric_scores = {m.name: m.score for m in result.metrics} if result else {}
metric_scores = {m.name: m.value for m in result.metrics} if result else {}
for metric_name in all_metric_names:
score = metric_scores.get(metric_name)
ws.cell(row=row_idx, column=col, value=score if score is not None else "").border = thin_border
@ -351,7 +358,7 @@ def _store_result_file(
size=len(xlsx_content),
extension="xlsx",
mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
created_by_role="account",
created_by_role=CreatorUserRole.ACCOUNT,
created_by="system",
created_at=naive_utc_now(),
used=False,
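Switching created_by_role to CreatorUserRole.ACCOUNT keeps the stored value tied to the enum imported from models.enums instead of a bare string literal. A simplified stand-in (not the actual definition), assuming a str-backed enum whose ACCOUNT member serializes to "account":

from enum import StrEnum

class CreatorUserRole(StrEnum):  # simplified stand-in for models.enums.CreatorUserRole
    ACCOUNT = "account"

assert CreatorUserRole.ACCOUNT == "account"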