Merge remote-tracking branch 'origin/feat/evaluation' into feat/evaluation

FFXN committed 2026-03-05 13:38:35 +08:00
4 changed files with 145 additions and 49 deletions

View File

@@ -20,9 +20,10 @@ from controllers.console.wraps import (
edit_permission_required,
setup_required,
)
from core.evaluation.entities.evaluation_entity import EvaluationCategory
from core.evaluation.entities.evaluation_entity import EvaluationCategory, EvaluationConfigData, EvaluationRunRequest
from core.workflow.file import helpers as file_helpers
from extensions.ext_database import db
from extensions.ext_storage import storage
from libs.helper import TimestampField
from libs.login import current_account_with_tenant, login_required
from models import App
@@ -260,7 +261,12 @@ class EvaluationDetailApi(Resource):
Save evaluation configuration for the target.
"""
current_account, current_tenant_id = current_account_with_tenant()
data = request.get_json(force=True)
body = request.get_json(force=True)
try:
config_data = EvaluationConfigData.model_validate(body)
except Exception as e:
raise BadRequest(f"Invalid request body: {e}")
with Session(db.engine, expire_on_commit=False) as session:
config = EvaluationService.save_evaluation_config(
@@ -269,7 +275,7 @@ class EvaluationDetailApi(Resource):
target_type=target_type,
target_id=str(target.id),
account_id=str(current_account.id),
data=data,
data=config_data,
)
return {
@@ -332,33 +338,43 @@ class EvaluationRunApi(Resource):
"""
Start an evaluation run.
Expects multipart form data with:
- file: XLSX dataset file
- evaluation_category: one of llm, retrieval, agent, workflow
Expects JSON body with:
- file_id: uploaded dataset file ID
- evaluation_model: evaluation model name
- evaluation_model_provider: evaluation model provider
- default_metrics: list of default metric objects
- customized_metrics: customized metrics object (optional)
- judgment_config: judgment conditions config (optional)
"""
current_account, current_tenant_id = current_account_with_tenant()
# Validate file upload
if "file" not in request.files:
raise BadRequest("Dataset file is required.")
file = request.files["file"]
if not file.filename or not file.filename.endswith(".xlsx"):
raise BadRequest("Dataset file must be an XLSX file.")
body = request.get_json(force=True)
if not body:
raise BadRequest("Request body is required.")
# Validate and parse request body
try:
run_request = EvaluationRunRequest.model_validate(body)
except Exception as e:
raise BadRequest(f"Invalid request body: {e}")
# Load dataset file
upload_file = (
db.session.query(UploadFile)
.filter_by(id=run_request.file_id, tenant_id=current_tenant_id)
.first()
)
if not upload_file:
raise NotFound("Dataset file not found.")
try:
dataset_content = storage.load_once(upload_file.key)
except Exception:
raise BadRequest("Failed to read dataset file.")
dataset_content = file.read()
if not dataset_content:
raise BadRequest("Dataset file is empty.")
# Validate evaluation category
category_str = request.form.get("evaluation_category", "llm")
try:
evaluation_category = EvaluationCategory(category_str)
except ValueError:
raise BadRequest(
f"Invalid evaluation_category: {category_str}. "
f"Must be one of: {', '.join(e.value for e in EvaluationCategory)}"
)
try:
with Session(db.engine, expire_on_commit=False) as session:
evaluation_run = EvaluationService.start_evaluation_run(
@@ -368,7 +384,7 @@ class EvaluationRunApi(Resource):
target_id=str(target.id),
account_id=str(current_account.id),
dataset_file_content=dataset_content,
evaluation_category=evaluation_category,
run_request=run_request,
)
return _serialize_evaluation_run(evaluation_run), 200
except EvaluationFrameworkNotConfiguredError as e:
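
Note: the run endpoint no longer takes a multipart upload; it validates a JSON body against EvaluationRunRequest and resolves the dataset from a previously uploaded file. A minimal sketch of that contract, with invented IDs, model names, and metric names (none of these values come from this diff):

from core.evaluation.entities.evaluation_entity import EvaluationRunRequest

# Placeholder values; only the field names and nesting follow the new schema.
body = {
    "file_id": "uploaded-xlsx-file-id",          # hypothetical UploadFile ID
    "evaluation_model": "gpt-4o",                 # hypothetical model name
    "evaluation_model_provider": "openai",        # hypothetical provider
    "default_metrics": [
        {
            "metric": "accuracy",                 # hypothetical metric name
            "node_info_list": [
                {"node_id": "node-1", "type": "llm", "title": "LLM Node"},
            ],
        },
    ],
    "customized_metrics": None,                   # optional
    "judgment_config": None,                      # optional
}
run_request = EvaluationRunRequest.model_validate(body)  # raises on schema violations

Validation failures surface as BadRequest through the try/except shown in the hunk above.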

View File

@@ -8,9 +8,10 @@ from core.evaluation.entities.judgment_entity import JudgmentConfig, JudgmentRes
class EvaluationCategory(StrEnum):
LLM = "llm"
RETRIEVAL = "retrieval"
RETRIEVAL = "knowledge_retrieval"
AGENT = "agent"
WORKFLOW = "workflow"
RETRIEVAL_TEST = "retrieval_test"
class EvaluationMetric(BaseModel):
@@ -42,6 +43,42 @@ class EvaluationItemResult(BaseModel):
return sum(scores) / len(scores)
class NodeInfo(BaseModel):
node_id: str
type: str
title: str
class DefaultMetric(BaseModel):
metric: str
node_info_list: list[NodeInfo]
class CustomizedMetricOutputField(BaseModel):
variable: str
value_type: str
class CustomizedMetrics(BaseModel):
evaluation_workflow_id: str
input_fields: dict[str, str]
output_fields: list[CustomizedMetricOutputField]
class EvaluationConfigData(BaseModel):
"""Structured data for saving evaluation configuration."""
evaluation_model: str = ""
evaluation_model_provider: str = ""
default_metrics: list[DefaultMetric] = Field(default_factory=list)
customized_metrics: CustomizedMetrics | None = None
judgment_config: JudgmentConfig | None = None
class EvaluationRunRequest(EvaluationConfigData):
"""Request body for starting an evaluation run."""
file_id: str
class EvaluationRunData(BaseModel):
"""Serializable data for Celery task."""
evaluation_run_id: str
@@ -51,6 +88,7 @@ class EvaluationRunData(BaseModel):
evaluation_category: EvaluationCategory
evaluation_model_provider: str
evaluation_model: str
metrics_config: dict[str, Any] = Field(default_factory=dict)
default_metrics: list[dict[str, Any]] = Field(default_factory=list)
customized_metrics: dict[str, Any] | None = None
judgment_config: JudgmentConfig | None = None
items: list[EvaluationItemInput]
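
Note: because EvaluationRunRequest subclasses EvaluationConfigData, the same payload shape serves both the config-save and run endpoints; only file_id is extra and required. A small sketch with placeholder values (not taken from this diff):

from core.evaluation.entities.evaluation_entity import EvaluationConfigData, EvaluationRunRequest

payload = {
    "evaluation_model": "gpt-4o",            # hypothetical model name
    "evaluation_model_provider": "openai",   # hypothetical provider
}
EvaluationConfigData.model_validate(payload)   # valid: every other field has a default
EvaluationRunRequest.model_validate({**payload, "file_id": "uploaded-xlsx-file-id"})  # file_id required here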

View File

@@ -10,9 +10,12 @@ from sqlalchemy.orm import Session
from configs import dify_config
from core.evaluation.entities.evaluation_entity import (
DefaultMetric,
EvaluationCategory,
EvaluationConfigData,
EvaluationItemInput,
EvaluationRunData,
EvaluationRunRequest,
)
from core.evaluation.evaluation_manager import EvaluationManager
from models.evaluation import (
@@ -222,7 +225,7 @@ class EvaluationService:
target_type: str,
target_id: str,
account_id: str,
data: dict[str, Any],
data: EvaluationConfigData,
) -> EvaluationConfiguration:
config = cls.get_evaluation_config(session, tenant_id, target_type, target_id)
if config is None:
@@ -235,10 +238,15 @@
)
session.add(config)
config.evaluation_model_provider = data.get("evaluation_model_provider")
config.evaluation_model = data.get("evaluation_model")
config.metrics_config = json.dumps(data.get("metrics_config", {}))
config.judgement_conditions = json.dumps(data.get("judgement_conditions", {}))
config.evaluation_model_provider = data.evaluation_model_provider
config.evaluation_model = data.evaluation_model
config.metrics_config = json.dumps({
"default_metrics": [m.model_dump() for m in data.default_metrics],
"customized_metrics": data.customized_metrics.model_dump() if data.customized_metrics else None,
})
config.judgement_conditions = json.dumps(
data.judgment_config.model_dump() if data.judgment_config else {}
)
config.updated_by = account_id
session.commit()
session.refresh(config)
@@ -255,18 +263,30 @@
target_id: str,
account_id: str,
dataset_file_content: bytes,
evaluation_category: EvaluationCategory,
run_request: EvaluationRunRequest,
) -> EvaluationRun:
"""Validate dataset, create run record, dispatch Celery task."""
"""Validate dataset, create run record, dispatch Celery task.
Saves the provided parameters as the latest EvaluationConfiguration
before creating the run.
"""
# Check framework is configured
evaluation_instance = EvaluationManager.get_evaluation_instance()
if evaluation_instance is None:
raise EvaluationFrameworkNotConfiguredError()
# Check evaluation config exists
config = cls.get_evaluation_config(session, tenant_id, target_type, target_id)
if config is None:
raise EvaluationNotFoundError("Evaluation configuration not found. Please configure evaluation first.")
# Derive evaluation_category from default_metrics node types
evaluation_category = cls._resolve_evaluation_category(run_request.default_metrics)
# Save as latest EvaluationConfiguration
config = cls.save_evaluation_config(
session=session,
tenant_id=tenant_id,
target_type=target_type,
target_id=target_id,
account_id=account_id,
data=run_request,
)
# Check concurrent run limit
active_runs = (
@@ -308,9 +328,13 @@
target_type=target_type,
target_id=target_id,
evaluation_category=evaluation_category,
evaluation_model_provider=config.evaluation_model_provider or "",
evaluation_model=config.evaluation_model or "",
metrics_config=config.metrics_config_dict,
evaluation_model_provider=run_request.evaluation_model_provider,
evaluation_model=run_request.evaluation_model,
default_metrics=[m.model_dump() for m in run_request.default_metrics],
customized_metrics=(
run_request.customized_metrics.model_dump() if run_request.customized_metrics else None
),
judgment_config=run_request.judgment_config,
items=items,
)
@@ -406,6 +430,23 @@
def get_supported_metrics(cls, category: EvaluationCategory) -> list[str]:
return EvaluationManager.get_supported_metrics(category)
# ---- Category Resolution ----
@classmethod
def _resolve_evaluation_category(cls, default_metrics: list[DefaultMetric]) -> EvaluationCategory:
"""Derive evaluation category from default_metrics node_info types.
Uses the type of the first node_info found in default_metrics.
Falls back to LLM if no metrics are provided.
"""
for metric in default_metrics:
for node_info in metric.node_info_list:
try:
return EvaluationCategory(node_info.type)
except ValueError:
continue
return EvaluationCategory.LLM
# ---- Dataset Parsing ----
@classmethod
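
Note: the category-resolution rule can be summarised with a standalone sketch. It mirrors _resolve_evaluation_category rather than importing EvaluationService (the service module path is not shown in this diff); the metric and node values are placeholders:

from core.evaluation.entities.evaluation_entity import DefaultMetric, EvaluationCategory, NodeInfo

def resolve(default_metrics: list[DefaultMetric]) -> EvaluationCategory:
    # First node type that maps onto EvaluationCategory wins; unknown types are skipped.
    for metric in default_metrics:
        for node_info in metric.node_info_list:
            try:
                return EvaluationCategory(node_info.type)
            except ValueError:
                continue
    return EvaluationCategory.LLM  # fallback when nothing resolves

metrics = [
    DefaultMetric(
        metric="accuracy",  # hypothetical metric name
        node_info_list=[NodeInfo(node_id="n1", type="knowledge_retrieval", title="Retrieve")],
    ),
]
assert resolve(metrics) is EvaluationCategory.RETRIEVAL  # "knowledge_retrieval" after this commit
assert resolve([]) is EvaluationCategory.LLM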

View File

@@ -1,6 +1,8 @@
import io
import json
import logging
from configs import dify_config
from models.model import UploadFile
from typing import Any
from celery import shared_task
@@ -80,7 +82,8 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None:
target_id=run_data.target_id,
target_type=run_data.target_type,
items=run_data.items,
metrics_config=run_data.metrics_config,
default_metrics=run_data.default_metrics,
customized_metrics=run_data.customized_metrics,
model_provider=run_data.evaluation_model_provider,
model_name=run_data.evaluation_model,
)
@@ -279,20 +282,16 @@
"""Store result XLSX file and return the UploadFile ID."""
try:
from extensions.ext_storage import storage
from models.model import UploadFile
from libs.uuid_utils import uuidv7
file_id = str(uuidv7())
filename = f"evaluation-result-{run_id[:8]}.xlsx"
storage_key = f"evaluation_results/{tenant_id}/{file_id}.xlsx"
storage_key = f"evaluation_results/{tenant_id}/{str(uuidv7())}.xlsx"
storage.save(storage_key, xlsx_content)
upload_file = UploadFile(
id=file_id,
upload_file: UploadFile = UploadFile(
tenant_id=tenant_id,
storage_type="evaluation_result",
storage_type=dify_config.STORAGE_TYPE,
key=storage_key,
name=filename,
size=len(xlsx_content),
@@ -300,10 +299,12 @@
mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
created_by_role="account",
created_by="system",
created_at=naive_utc_now(),
used=False,
)
session.add(upload_file)
session.commit()
return file_id
return upload_file.id
except Exception:
logger.exception("Failed to store result file for run %s", run_id)
return None
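
Note: _store_result_file now returns the UploadFile row's id, so a later reader can resolve the record and load the bytes the same way the controller loads the dataset file. A minimal sketch under that assumption (the helper name and session handling are illustrative, not part of this commit):

from extensions.ext_storage import storage
from models.model import UploadFile

def load_result_xlsx(session, tenant_id: str, result_file_id: str) -> bytes | None:
    # Resolve the UploadFile row created by _store_result_file, then read its payload.
    upload_file = (
        session.query(UploadFile)
        .filter_by(id=result_file_id, tenant_id=tenant_id)
        .first()
    )
    if upload_file is None:
        return None
    return storage.load_once(upload_file.key)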