// Mirror of https://github.com/langgenius/dify.git (synced 2026-05-04 09:28:04 +08:00).
// 185 lines, 6.3 KiB, TypeScript.
import type {
|
|
ComparisonOperator,
|
|
EvaluationFieldOption,
|
|
EvaluationMockConfig,
|
|
EvaluationResourceType,
|
|
MetricOption,
|
|
} from './types'
|
|
|
|
/**
 * Judge model choices exposed through the evaluation mock config.
 * Each entry carries a model id, a display label and the provider name.
 */
const judgeModels = [
  {
    id: 'gpt-4.1-mini',
    label: 'GPT-4.1 mini',
    provider: 'OpenAI',
  },
  {
    id: 'claude-3-7-sonnet',
    label: 'Claude 3.7 Sonnet',
    provider: 'Anthropic',
  },
  {
    id: 'gemini-2.0-flash',
    label: 'Gemini 2.0 Flash',
    provider: 'Google',
  },
]
|
|
|
|
/**
 * Built-in metric options exposed through the evaluation mock config.
 * Entries are tagged with one of two groups: 'quality' or 'operations'.
 */
const builtinMetrics: MetricOption[] = [
  {
    id: 'answer-correctness',
    label: 'Answer Correctness',
    description: 'Compares the response with the expected answer and scores factual alignment.',
    group: 'quality',
    badges: ['LLM', 'Built-in'],
  },
  {
    id: 'faithfulness',
    label: 'Faithfulness',
    description: 'Checks whether the answer stays grounded in the retrieved evidence.',
    group: 'quality',
    badges: ['LLM', 'Retrieval'],
  },
  {
    id: 'relevance',
    label: 'Relevance',
    description: 'Evaluates how directly the answer addresses the original request.',
    group: 'quality',
    badges: ['LLM'],
  },
  {
    id: 'latency',
    label: 'Latency',
    description: 'Captures runtime responsiveness for the full execution path.',
    group: 'operations',
    badges: ['System'],
  },
  {
    id: 'token-usage',
    label: 'Token Usage',
    description: 'Tracks prompt and completion token consumption for the run.',
    group: 'operations',
    badges: ['System'],
  },
  {
    id: 'tool-success-rate',
    label: 'Tool Success Rate',
    description: 'Measures whether each required tool invocation finishes without failure.',
    group: 'operations',
    badges: ['Workflow'],
  },
]
|
|
|
|
/**
 * Custom evaluator workflow options exposed through the evaluation mock config.
 * Each option lists the target variables (id + label) it declares.
 */
const workflowOptions = [
  {
    id: 'workflow-precision-review',
    label: 'Precision Review Workflow',
    description: 'Custom evaluator for nuanced quality review.',
    targetVariables: [
      { id: 'query', label: 'query' },
      { id: 'answer', label: 'answer' },
      { id: 'reference', label: 'reference' },
    ],
  },
  {
    id: 'workflow-risk-review',
    label: 'Risk Review Workflow',
    description: 'Custom evaluator for policy and escalation checks.',
    targetVariables: [
      { id: 'input', label: 'input' },
      { id: 'output', label: 'output' },
    ],
  },
]
|
|
|
|
/**
 * Field options returned as `fieldOptions` for the default (workflow) mock config.
 * Covers string, enum, number, time and boolean field types.
 */
const workflowFields: EvaluationFieldOption[] = [
  { id: 'app.input.query', label: 'Query', group: 'App Input', type: 'string' },
  { id: 'app.input.locale', label: 'Locale', group: 'App Input', type: 'enum', options: [{ value: 'en-US', label: 'en-US' }, { value: 'zh-Hans', label: 'zh-Hans' }] },
  { id: 'app.output.answer', label: 'Answer', group: 'App Output', type: 'string' },
  { id: 'app.output.score', label: 'Score', group: 'App Output', type: 'number' },
  { id: 'app.output.published_at', label: 'Publication Date', group: 'App Output', type: 'time' },
  { id: 'system.has_context', label: 'Has Context', group: 'System', type: 'boolean' },
]
|
|
|
|
/**
 * Field options returned as `fieldOptions` for the 'pipeline' mock config.
 * Grouped under Dataset, Retrieval and Output.
 */
const pipelineFields: EvaluationFieldOption[] = [
  { id: 'dataset.input.document_id', label: 'Document ID', group: 'Dataset', type: 'string' },
  { id: 'dataset.input.chunk_count', label: 'Chunk Count', group: 'Dataset', type: 'number' },
  { id: 'dataset.input.updated_at', label: 'Updated At', group: 'Dataset', type: 'time' },
  { id: 'retrieval.output.hit_rate', label: 'Hit Rate', group: 'Retrieval', type: 'number' },
  { id: 'retrieval.output.source', label: 'Source', group: 'Retrieval', type: 'enum', options: [{ value: 'bm25', label: 'BM25' }, { value: 'hybrid', label: 'Hybrid' }] },
  { id: 'pipeline.output.published', label: 'Published', group: 'Output', type: 'boolean' },
]
|
|
|
|
/**
 * Field options returned as `fieldOptions` for the 'snippet' mock config.
 * Grouped under Snippet Input, Snippet Output and System.
 */
const snippetFields: EvaluationFieldOption[] = [
  { id: 'snippet.input.blog_url', label: 'Blog URL', group: 'Snippet Input', type: 'string' },
  { id: 'snippet.input.platforms', label: 'Platforms', group: 'Snippet Input', type: 'string' },
  { id: 'snippet.output.content', label: 'Generated Content', group: 'Snippet Output', type: 'string' },
  { id: 'snippet.output.length', label: 'Output Length', group: 'Snippet Output', type: 'number' },
  { id: 'snippet.output.scheduled_at', label: 'Scheduled At', group: 'Snippet Output', type: 'time' },
  { id: 'system.requires_review', label: 'Requires Review', group: 'System', type: 'boolean' },
]
|
|
|
|
/**
 * Lists the comparison operators available for a field of the given type.
 *
 * - `number`: equality, ordering and emptiness operators.
 * - `time`: `is`, `before`, `after` and emptiness operators.
 * - `boolean` / `enum`: only `is` and `is_not`.
 * - any other type (e.g. `string`): containment, equality and emptiness operators.
 *
 * @param fieldType - The `type` of an evaluation field option.
 * @returns A newly created array of operators; the first entry is what
 *   `getDefaultOperator` picks as the default.
 */
export const getComparisonOperators = (fieldType: EvaluationFieldOption['type']): ComparisonOperator[] => {
  switch (fieldType) {
    case 'number':
      return ['is', 'is_not', 'greater_than', 'less_than', 'greater_or_equal', 'less_or_equal', 'is_empty', 'is_not_empty']
    case 'time':
      return ['is', 'before', 'after', 'is_empty', 'is_not_empty']
    case 'boolean':
    case 'enum':
      return ['is', 'is_not']
    default:
      // Every remaining field type gets the text-style operator set.
      return ['contains', 'not_contains', 'is', 'is_not', 'is_empty', 'is_not_empty']
  }
}
|
|
|
|
/**
 * Picks the default comparison operator for a field type.
 *
 * @param fieldType - The `type` of an evaluation field option.
 * @returns The first operator that `getComparisonOperators` lists for that type.
 */
export const getDefaultOperator = (fieldType: EvaluationFieldOption['type']): ComparisonOperator => {
  return getComparisonOperators(fieldType)[0]
}
|
|
|
|
/**
 * Builds the mock evaluation configuration for a resource type.
 *
 * Every configuration shares the same judge models, built-in metrics and
 * workflow options; only the field options, template file name, batch
 * requirements and history summary label differ per resource type.
 *
 * @param resourceType - 'pipeline' and 'snippet' get dedicated settings;
 *   any other value falls back to the workflow settings.
 * @returns A new config object. The `judgeModels`, `builtinMetrics`,
 *   `workflowOptions` and `fieldOptions` arrays are the module-level
 *   instances (not copies); `batchRequirements` is created per call.
 */
export const getEvaluationMockConfig = (resourceType: EvaluationResourceType): EvaluationMockConfig => {
  switch (resourceType) {
    case 'pipeline':
      return {
        judgeModels,
        builtinMetrics,
        workflowOptions,
        fieldOptions: pipelineFields,
        templateFileName: 'pipeline-evaluation-template.csv',
        batchRequirements: [
          'Include one row per retrieval scenario.',
          'Provide the expected source or target chunk for each case.',
          'Keep numeric metrics in plain number format.',
        ],
        historySummaryLabel: 'Pipeline evaluation batch',
      }
    case 'snippet':
      return {
        judgeModels,
        builtinMetrics,
        workflowOptions,
        fieldOptions: snippetFields,
        templateFileName: 'snippet-evaluation-template.csv',
        batchRequirements: [
          'Include one row per snippet execution case.',
          'Provide the expected final content or acceptance rule.',
          'Keep optional fields empty when not used.',
        ],
        historySummaryLabel: 'Snippet evaluation batch',
      }
    default:
      // Workflow settings double as the fallback for every other resource type.
      return {
        judgeModels,
        builtinMetrics,
        workflowOptions,
        fieldOptions: workflowFields,
        templateFileName: 'workflow-evaluation-template.csv',
        batchRequirements: [
          'Include one row per workflow test case.',
          'Provide both user input and expected answer when available.',
          'Keep boolean columns as true or false.',
        ],
        historySummaryLabel: 'Workflow evaluation batch',
      }
  }
}
|