evaluation runtime
@@ -41,14 +41,6 @@ class EvaluationItemResult(BaseModel):
     metrics: list[EvaluationMetric] = Field(default_factory=list)
     error: str | None = None
-
-    @property
-    def overall_score(self) -> float | None:
-        if not self.metrics:
-            return None
-        scores = [m.score for m in self.metrics]
-        return sum(scores) / len(scores)
-
 
 class NodeInfo(BaseModel):
     node_id: str
     type: str
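With overall_score removed from EvaluationItemResult, an average can be computed at the point of use instead. A minimal sketch, assuming the metric objects expose the name/value fields used elsewhere in this diff:

    # Sketch only: mirrors the removed property, but uses the value field that
    # the rest of this commit switches to (float(metric.value)).
    def overall_score(result) -> float | None:
        if not result.metrics:
            return None
        values = [float(m.value) for m in result.metrics]
        return sum(values) / len(values)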
@@ -15,7 +15,6 @@ from core.evaluation.entities.evaluation_entity import (
     EvaluationCategory,
     EvaluationConfigData,
     EvaluationDatasetInput,
-    EvaluationItemInput,
     EvaluationRunData,
     EvaluationRunRequest,
 )
@@ -156,6 +155,8 @@ class EvaluationService:
         """
         wb = Workbook()
         ws = wb.active
+        if ws is None:
+            ws = wb.create_sheet("Evaluation Dataset")
 
         sheet_name = "Evaluation Dataset"
         ws.title = sheet_name
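The new ws is None guard matches openpyxl's typing, where Workbook().active is Optional. A standalone sketch of the pattern:

    from openpyxl import Workbook

    # Workbook().active is typed as Worksheet | None, so narrow it before use.
    wb = Workbook()
    ws = wb.active
    if ws is None:
        ws = wb.create_sheet("Evaluation Dataset")
    ws.title = "Evaluation Dataset"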
@@ -174,7 +175,7 @@ class EvaluationService:
         headers = ["index"]
 
         for field in input_fields:
-            field_label = field.get("label") or field.get("variable")
+            field_label = str(field.get("label") or field.get("variable") or "")
             headers.append(field_label)
 
         # Write header row
@@ -279,9 +280,6 @@ class EvaluationService:
         if evaluation_instance is None:
             raise EvaluationFrameworkNotConfiguredError()
 
-        # Derive evaluation_category from default_metrics node types
-        evaluation_category = cls._resolve_evaluation_category(run_request.default_metrics)
-
         # Save as latest EvaluationConfiguration
         config = cls.save_evaluation_config(
             session=session,
@@ -333,12 +331,10 @@ class EvaluationService:
             target_id=target_id,
             evaluation_model_provider=run_request.evaluation_model_provider,
             evaluation_model=run_request.evaluation_model,
-            default_metrics=[m.model_dump() for m in run_request.default_metrics],
-            customized_metrics=(
-                run_request.customized_metrics.model_dump() if run_request.customized_metrics else None
-            ),
+            default_metrics=run_request.default_metrics,
+            customized_metrics=run_request.customized_metrics,
             judgment_config=run_request.judgment_config,
-            items=items,
+            input_list=items,
         )
 
         # Dispatch Celery task
@@ -648,7 +644,7 @@ class EvaluationService:
     # ---- Dataset Parsing ----
 
     @classmethod
-    def _parse_dataset(cls, xlsx_content: bytes) -> list[EvaluationItemInput]:
+    def _parse_dataset(cls, xlsx_content: bytes) -> list[EvaluationDatasetInput]:
         """Parse evaluation dataset from XLSX bytes."""
         wb = load_workbook(io.BytesIO(xlsx_content), read_only=True)
         ws = wb.active
@@ -672,7 +668,7 @@ class EvaluationService:
 
             index_val = values[0] if values else row_idx
             try:
-                index = int(index_val)
+                index = int(str(index_val))
             except (TypeError, ValueError):
                 index = row_idx
 
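openpyxl can hand back int, float, str, or None for a cell, so routing the value through int(str(...)) with the except fallback covers them uniformly. A small sketch of the same behavior:

    # Sketch of the coercion above: non-numeric or empty cells fall back to the row index.
    def to_index(index_val: object, row_idx: int) -> int:
        try:
            return int(str(index_val))
        except (TypeError, ValueError):
            return row_idx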
@@ -681,17 +677,14 @@ class EvaluationService:
                 val = values[col_idx + 1] if col_idx + 1 < len(values) else None
                 inputs[header] = str(val) if val is not None else ""
 
-            # Check for expected_output column
+            # Extract expected_output column into dedicated field
             expected_output = inputs.pop("expected_output", None)
-            context_str = inputs.pop("context", None)
-            context = context_str.split(";") if context_str else None
 
             items.append(
-                EvaluationItemInput(
+                EvaluationDatasetInput(
                     index=index,
                     inputs=inputs,
                     expected_output=expected_output,
-                    context=context,
                 )
             )
 
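For reference, a hypothetical sketch of the row shape _parse_dataset now produces; the real EvaluationDatasetInput lives in evaluation_entity and is not shown in this diff:

    from pydantic import BaseModel

    # Hypothetical stand-in for EvaluationDatasetInput, matching the constructor call above.
    class DatasetRowSketch(BaseModel):
        index: int
        inputs: dict[str, str]
        expected_output: str | None = None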
@@ -25,6 +25,7 @@ from core.evaluation.runners.workflow_evaluation_runner import WorkflowEvaluatio
 from core.workflow.node_events.base import NodeRunResult
 from extensions.ext_database import db
 from libs.datetime_utils import naive_utc_now
+from models.enums import CreatorUserRole
 from models.evaluation import EvaluationRun, EvaluationRunStatus
 from models.model import UploadFile
 from services.evaluation_service import EvaluationService
@@ -116,6 +117,7 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None:
 
     logger.info("Evaluation run %s completed successfully", run_data.evaluation_run_id)
 
 
 def _execute_evaluation_runner(
     session: Any,
     run_data: EvaluationRunData,
@@ -125,6 +127,7 @@ def _execute_evaluation_runner(
     """Execute the evaluation runner."""
     default_metrics = run_data.default_metrics
     customized_metrics = run_data.customized_metrics
+    results: list[EvaluationItemResult] = []
     for default_metric in default_metrics:
         for node_info in default_metric.node_info_list:
             node_run_result_list: list[NodeRunResult] = []
@@ -134,7 +137,7 @@ def _execute_evaluation_runner(
                 node_run_result_list.append(node_run_result)
             if node_run_result_list:
                 runner = _create_runner(EvaluationCategory(node_info.type), evaluation_instance, session)
-                runner.run(
+                results.extend(runner.run(
                     evaluation_run_id=run_data.evaluation_run_id,
                     tenant_id=run_data.tenant_id,
                     target_id=run_data.target_id,
@@ -144,10 +147,10 @@ def _execute_evaluation_runner(
                     model_provider=run_data.evaluation_model_provider,
                     model_name=run_data.evaluation_model,
                     node_run_result_list=node_run_result_list,
-                )
+                ))
     if customized_metrics:
         runner = _create_runner(EvaluationCategory.WORKFLOW, evaluation_instance, session)
-        runner.run(
+        results.extend(runner.run(
             evaluation_run_id=run_data.evaluation_run_id,
             tenant_id=run_data.tenant_id,
             target_id=run_data.target_id,
@@ -156,7 +159,9 @@ def _execute_evaluation_runner(
             customized_metrics=customized_metrics,
             node_run_result_list=None,
+            node_run_result_mapping_list=node_run_result_mapping_list,
-        )
+        ))
+    return results
 
 
 def _create_runner(
     category: EvaluationCategory,
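The runner calls now feed a shared results list that the function returns. A condensed sketch of that accumulation pattern, with planned_runs as a hypothetical (runner, kwargs) pairing rather than anything from this diff:

    # Each runner.run(...) call is assumed to return a list of EvaluationItemResult.
    def collect_results(planned_runs) -> list:
        results = []
        for runner, kwargs in planned_runs:
            results.extend(runner.run(**kwargs))
        return results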
@@ -201,7 +206,7 @@ def _compute_metrics_summary(results: list[EvaluationItemResult]) -> dict[str, A
         for metric in result.metrics:
             if metric.name not in metric_scores:
                 metric_scores[metric.name] = []
-            metric_scores[metric.name].append(metric.score)
+            metric_scores[metric.name].append(float(metric.value))
 
     summary: dict[str, Any] = {}
     for name, scores in metric_scores.items():
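The summary step now collects float(metric.value) per metric name across all item results; the exact statistics computed for each metric are not shown in this hunk. A hedged sketch that simply averages the collected values:

    from collections import defaultdict
    from statistics import mean

    # Group metric values by name across all item results, then average them.
    def summarize(results) -> dict[str, float]:
        metric_scores: dict[str, list[float]] = defaultdict(list)
        for result in results:
            for metric in result.metrics:
                metric_scores[metric.name].append(float(metric.value))
        return {name: mean(scores) for name, scores in metric_scores.items()}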
@@ -231,6 +236,8 @@ def _generate_result_xlsx(
     """Generate result XLSX with input data, actual output, and metric scores."""
     wb = Workbook()
     ws = wb.active
+    if ws is None:
+        ws = wb.create_sheet("Evaluation Results")
     ws.title = "Evaluation Results"
 
     header_font = Font(bold=True, color="FFFFFF")
@@ -306,7 +313,7 @@ def _generate_result_xlsx(
         col += 1
 
         # Metric scores
-        metric_scores = {m.name: m.score for m in result.metrics} if result else {}
+        metric_scores = {m.name: m.value for m in result.metrics} if result else {}
         for metric_name in all_metric_names:
             score = metric_scores.get(metric_name)
             ws.cell(row=row_idx, column=col, value=score if score is not None else "").border = thin_border
@@ -351,7 +358,7 @@ def _store_result_file(
         size=len(xlsx_content),
         extension="xlsx",
         mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-        created_by_role="account",
+        created_by_role=CreatorUserRole.ACCOUNT,
         created_by="system",
         created_at=naive_utc_now(),
         used=False,