Files
dify/web/app/components/evaluation/mock.ts
2026-03-16 10:46:33 +08:00

185 lines
6.3 KiB
TypeScript

import type {
ComparisonOperator,
EvaluationFieldOption,
EvaluationMockConfig,
EvaluationResourceType,
MetricOption,
} from './types'
const judgeModels = [
{
id: 'gpt-4.1-mini',
label: 'GPT-4.1 mini',
provider: 'OpenAI',
},
{
id: 'claude-3-7-sonnet',
label: 'Claude 3.7 Sonnet',
provider: 'Anthropic',
},
{
id: 'gemini-2.0-flash',
label: 'Gemini 2.0 Flash',
provider: 'Google',
},
]
const builtinMetrics: MetricOption[] = [
{
id: 'answer-correctness',
label: 'Answer Correctness',
description: 'Compares the response with the expected answer and scores factual alignment.',
group: 'quality',
badges: ['LLM', 'Built-in'],
},
{
id: 'faithfulness',
label: 'Faithfulness',
description: 'Checks whether the answer stays grounded in the retrieved evidence.',
group: 'quality',
badges: ['LLM', 'Retrieval'],
},
{
id: 'relevance',
label: 'Relevance',
description: 'Evaluates how directly the answer addresses the original request.',
group: 'quality',
badges: ['LLM'],
},
{
id: 'latency',
label: 'Latency',
description: 'Captures runtime responsiveness for the full execution path.',
group: 'operations',
badges: ['System'],
},
{
id: 'token-usage',
label: 'Token Usage',
description: 'Tracks prompt and completion token consumption for the run.',
group: 'operations',
badges: ['System'],
},
{
id: 'tool-success-rate',
label: 'Tool Success Rate',
description: 'Measures whether each required tool invocation finishes without failure.',
group: 'operations',
badges: ['Workflow'],
},
]
const workflowOptions = [
{
id: 'workflow-precision-review',
label: 'Precision Review Workflow',
description: 'Custom evaluator for nuanced quality review.',
targetVariables: [
{ id: 'query', label: 'query' },
{ id: 'answer', label: 'answer' },
{ id: 'reference', label: 'reference' },
],
},
{
id: 'workflow-risk-review',
label: 'Risk Review Workflow',
description: 'Custom evaluator for policy and escalation checks.',
targetVariables: [
{ id: 'input', label: 'input' },
{ id: 'output', label: 'output' },
],
},
]
const workflowFields: EvaluationFieldOption[] = [
{ id: 'app.input.query', label: 'Query', group: 'App Input', type: 'string' },
{ id: 'app.input.locale', label: 'Locale', group: 'App Input', type: 'enum', options: [{ value: 'en-US', label: 'en-US' }, { value: 'zh-Hans', label: 'zh-Hans' }] },
{ id: 'app.output.answer', label: 'Answer', group: 'App Output', type: 'string' },
{ id: 'app.output.score', label: 'Score', group: 'App Output', type: 'number' },
{ id: 'app.output.published_at', label: 'Publication Date', group: 'App Output', type: 'time' },
{ id: 'system.has_context', label: 'Has Context', group: 'System', type: 'boolean' },
]
const pipelineFields: EvaluationFieldOption[] = [
{ id: 'dataset.input.document_id', label: 'Document ID', group: 'Dataset', type: 'string' },
{ id: 'dataset.input.chunk_count', label: 'Chunk Count', group: 'Dataset', type: 'number' },
{ id: 'dataset.input.updated_at', label: 'Updated At', group: 'Dataset', type: 'time' },
{ id: 'retrieval.output.hit_rate', label: 'Hit Rate', group: 'Retrieval', type: 'number' },
{ id: 'retrieval.output.source', label: 'Source', group: 'Retrieval', type: 'enum', options: [{ value: 'bm25', label: 'BM25' }, { value: 'hybrid', label: 'Hybrid' }] },
{ id: 'pipeline.output.published', label: 'Published', group: 'Output', type: 'boolean' },
]
const snippetFields: EvaluationFieldOption[] = [
{ id: 'snippet.input.blog_url', label: 'Blog URL', group: 'Snippet Input', type: 'string' },
{ id: 'snippet.input.platforms', label: 'Platforms', group: 'Snippet Input', type: 'string' },
{ id: 'snippet.output.content', label: 'Generated Content', group: 'Snippet Output', type: 'string' },
{ id: 'snippet.output.length', label: 'Output Length', group: 'Snippet Output', type: 'number' },
{ id: 'snippet.output.scheduled_at', label: 'Scheduled At', group: 'Snippet Output', type: 'time' },
{ id: 'system.requires_review', label: 'Requires Review', group: 'System', type: 'boolean' },
]
export const getComparisonOperators = (fieldType: EvaluationFieldOption['type']): ComparisonOperator[] => {
if (fieldType === 'number')
return ['is', 'is_not', 'greater_than', 'less_than', 'greater_or_equal', 'less_or_equal', 'is_empty', 'is_not_empty']
if (fieldType === 'time')
return ['is', 'before', 'after', 'is_empty', 'is_not_empty']
if (fieldType === 'boolean' || fieldType === 'enum')
return ['is', 'is_not']
return ['contains', 'not_contains', 'is', 'is_not', 'is_empty', 'is_not_empty']
}
export const getDefaultOperator = (fieldType: EvaluationFieldOption['type']): ComparisonOperator => {
return getComparisonOperators(fieldType)[0]
}
export const getEvaluationMockConfig = (resourceType: EvaluationResourceType): EvaluationMockConfig => {
if (resourceType === 'pipeline') {
return {
judgeModels,
builtinMetrics,
workflowOptions,
fieldOptions: pipelineFields,
templateFileName: 'pipeline-evaluation-template.csv',
batchRequirements: [
'Include one row per retrieval scenario.',
'Provide the expected source or target chunk for each case.',
'Keep numeric metrics in plain number format.',
],
historySummaryLabel: 'Pipeline evaluation batch',
}
}
if (resourceType === 'snippet') {
return {
judgeModels,
builtinMetrics,
workflowOptions,
fieldOptions: snippetFields,
templateFileName: 'snippet-evaluation-template.csv',
batchRequirements: [
'Include one row per snippet execution case.',
'Provide the expected final content or acceptance rule.',
'Keep optional fields empty when not used.',
],
historySummaryLabel: 'Snippet evaluation batch',
}
}
return {
judgeModels,
builtinMetrics,
workflowOptions,
fieldOptions: workflowFields,
templateFileName: 'workflow-evaluation-template.csv',
batchRequirements: [
'Include one row per workflow test case.',
'Provide both user input and expected answer when available.',
'Keep boolean columns as true or false.',
],
historySummaryLabel: 'Workflow evaluation batch',
}
}