Files
dify/web/app/components/evaluation/__tests__/index.spec.tsx

487 lines
17 KiB
TypeScript

import type { ReactNode } from 'react'
import { QueryClient, QueryClientProvider } from '@tanstack/react-query'
import { act, fireEvent, render, screen, waitFor } from '@testing-library/react'
import Evaluation from '..'
import ConditionsSection from '../components/conditions-section'
import { useEvaluationStore } from '../store'
const mockUpload = vi.hoisted(() => vi.fn())
const mockUseAvailableEvaluationMetrics = vi.hoisted(() => vi.fn())
const mockUseEvaluationConfig = vi.hoisted(() => vi.fn())
const mockUseEvaluationNodeInfoMutation = vi.hoisted(() => vi.fn())
const mockUseSaveEvaluationConfigMutation = vi.hoisted(() => vi.fn())
const mockUseStartEvaluationRunMutation = vi.hoisted(() => vi.fn())
vi.mock('@/app/components/header/account-setting/model-provider-page/hooks', () => ({
useModelList: () => ({
data: [{
provider: 'openai',
models: [{ model: 'gpt-4o-mini' }],
}],
}),
}))
vi.mock('@/app/components/header/account-setting/model-provider-page/model-selector', () => ({
default: ({
defaultModel,
onSelect,
}: {
defaultModel?: { provider: string, model: string }
onSelect: (model: { provider: string, model: string }) => void
}) => (
<div>
<div data-testid="evaluation-model-selector">
{defaultModel ? `${defaultModel.provider}:${defaultModel.model}` : 'empty'}
</div>
<button
type="button"
onClick={() => onSelect({ provider: 'openai', model: 'gpt-4o-mini' })}
>
select-model
</button>
</div>
),
}))
vi.mock('@/service/base', () => ({
upload: (...args: unknown[]) => mockUpload(...args),
}))
vi.mock('@/service/use-evaluation', () => ({
useEvaluationConfig: (...args: unknown[]) => mockUseEvaluationConfig(...args),
useAvailableEvaluationMetrics: (...args: unknown[]) => mockUseAvailableEvaluationMetrics(...args),
useEvaluationNodeInfoMutation: (...args: unknown[]) => mockUseEvaluationNodeInfoMutation(...args),
useSaveEvaluationConfigMutation: (...args: unknown[]) => mockUseSaveEvaluationConfigMutation(...args),
useStartEvaluationRunMutation: (...args: unknown[]) => mockUseStartEvaluationRunMutation(...args),
}))
vi.mock('@/service/use-workflow', () => ({
useAppWorkflow: () => ({
data: {
graph: {
nodes: [{
id: 'start',
data: {
type: 'start',
variables: [{
variable: 'query',
type: 'text-input',
}],
},
}],
},
},
isLoading: false,
}),
}))
vi.mock('@/service/use-snippet-workflows', () => ({
useSnippetPublishedWorkflow: () => ({
data: {
graph: {
nodes: [{
id: 'start',
data: {
type: 'start',
variables: [{
variable: 'query',
type: 'text-input',
}],
},
}],
},
},
isLoading: false,
}),
}))
const renderWithQueryClient = (ui: ReactNode) => {
const queryClient = new QueryClient({
defaultOptions: {
queries: {
retry: false,
},
mutations: {
retry: false,
},
},
})
return render(ui, {
wrapper: ({ children }: { children: ReactNode }) => (
<QueryClientProvider client={queryClient}>
{children}
</QueryClientProvider>
),
})
}
describe('Evaluation', () => {
beforeEach(() => {
useEvaluationStore.setState({ resources: {} })
vi.clearAllMocks()
mockUseEvaluationConfig.mockReturnValue({
data: null,
})
mockUseAvailableEvaluationMetrics.mockReturnValue({
data: {
metrics: ['answer-correctness', 'faithfulness', 'context-precision', 'context-recall', 'context-relevance'],
},
isLoading: false,
})
mockUseEvaluationNodeInfoMutation.mockReturnValue({
isPending: false,
mutate: (_input: unknown, options?: { onSuccess?: (data: Record<string, Array<{ node_id: string, title: string, type: string }>>) => void }) => {
options?.onSuccess?.({
'answer-correctness': [
{ node_id: 'node-answer', title: 'Answer Node', type: 'llm' },
],
'faithfulness': [
{ node_id: 'node-faithfulness', title: 'Retriever Node', type: 'retriever' },
],
})
},
})
mockUseSaveEvaluationConfigMutation.mockReturnValue({
isPending: false,
mutate: vi.fn(),
})
mockUseStartEvaluationRunMutation.mockReturnValue({
isPending: false,
mutate: vi.fn(),
})
mockUpload.mockResolvedValue({
id: 'uploaded-file-id',
name: 'evaluation.csv',
})
})
it('should search, select metric nodes, and save evaluation config', () => {
const saveConfig = vi.fn()
mockUseSaveEvaluationConfigMutation.mockReturnValue({
isPending: false,
mutate: saveConfig,
})
renderWithQueryClient(<Evaluation resourceType="apps" resourceId="app-1" />)
expect(screen.getByTestId('evaluation-model-selector')).toHaveTextContent('openai:gpt-4o-mini')
fireEvent.click(screen.getByRole('button', { name: 'evaluation.metrics.add' }))
fireEvent.change(screen.getByPlaceholderText('evaluation.metrics.searchNodeOrMetrics'), {
target: { value: 'does-not-exist' },
})
expect(screen.getByText('evaluation.metrics.noResults')).toBeInTheDocument()
fireEvent.change(screen.getByPlaceholderText('evaluation.metrics.searchNodeOrMetrics'), {
target: { value: 'faith' },
})
fireEvent.click(screen.getByTestId('evaluation-metric-node-faithfulness-node-faithfulness'))
expect(screen.getAllByText('Faithfulness').length).toBeGreaterThan(0)
expect(screen.getAllByText('Retriever Node').length).toBeGreaterThan(0)
fireEvent.change(screen.getByPlaceholderText('evaluation.metrics.searchNodeOrMetrics'), {
target: { value: '' },
})
fireEvent.click(screen.getByTestId('evaluation-metric-node-answer-correctness-node-answer'))
expect(screen.getAllByText('Answer Correctness').length).toBeGreaterThan(0)
fireEvent.click(screen.getByRole('button', { name: 'common.operation.save' }))
expect(saveConfig).toHaveBeenCalledWith({
params: {
targetType: 'apps',
targetId: 'app-1',
},
body: {
evaluation_model: 'gpt-4o-mini',
evaluation_model_provider: 'openai',
default_metrics: [
{
metric: 'faithfulness',
value_type: 'number',
node_info_list: [
{ node_id: 'node-faithfulness', title: 'Retriever Node', type: 'retriever' },
],
},
{
metric: 'answer-correctness',
value_type: 'number',
node_info_list: [
{ node_id: 'node-answer', title: 'Answer Node', type: 'llm' },
],
},
],
customized_metrics: null,
judgment_config: null,
},
}, {
onSuccess: expect.any(Function),
onError: expect.any(Function),
})
})
it('should hide the value row for empty operators', () => {
const resourceType = 'apps'
const resourceId = 'app-2'
const store = useEvaluationStore.getState()
let conditionId = ''
act(() => {
store.ensureResource(resourceType, resourceId)
store.setJudgeModel(resourceType, resourceId, 'openai::gpt-4o-mini')
store.addBuiltinMetric(resourceType, resourceId, 'faithfulness', [
{ node_id: 'node-faithfulness', title: 'Retriever Node', type: 'retriever' },
])
store.addCondition(resourceType, resourceId)
const condition = useEvaluationStore.getState().resources['apps:app-2'].judgmentConfig.conditions[0]
conditionId = condition.id
store.updateConditionOperator(resourceType, resourceId, conditionId, '=')
})
let rerender: ReturnType<typeof render>['rerender']
act(() => {
({ rerender } = renderWithQueryClient(<Evaluation resourceType={resourceType} resourceId={resourceId} />))
})
expect(screen.getByPlaceholderText('evaluation.conditions.valuePlaceholder')).toBeInTheDocument()
act(() => {
store.updateConditionOperator(resourceType, resourceId, conditionId, 'is null')
rerender(<Evaluation resourceType={resourceType} resourceId={resourceId} />)
})
expect(screen.queryByPlaceholderText('evaluation.conditions.valuePlaceholder')).not.toBeInTheDocument()
})
it('should add a condition from grouped metric dropdown items', () => {
const resourceType = 'apps'
const resourceId = 'app-conditions-dropdown'
const store = useEvaluationStore.getState()
act(() => {
store.ensureResource(resourceType, resourceId)
store.setJudgeModel(resourceType, resourceId, 'openai::gpt-4o-mini')
store.addBuiltinMetric(resourceType, resourceId, 'faithfulness', [
{ node_id: 'node-faithfulness', title: 'Retriever Node', type: 'retriever' },
])
store.addCustomMetric(resourceType, resourceId)
const customMetric = useEvaluationStore.getState().resources['apps:app-conditions-dropdown'].metrics.find(metric => metric.kind === 'custom-workflow')!
store.setCustomMetricWorkflow(resourceType, resourceId, customMetric.id, {
workflowId: 'workflow-1',
workflowAppId: 'workflow-app-1',
workflowName: 'Review Workflow',
})
store.syncCustomMetricOutputs(resourceType, resourceId, customMetric.id, [{
id: 'reason',
valueType: 'string',
}])
})
render(<ConditionsSection resourceType={resourceType} resourceId={resourceId} />)
fireEvent.click(screen.getByRole('combobox', { name: 'evaluation.conditions.addCondition' }))
expect(screen.getByText('Faithfulness')).toBeInTheDocument()
expect(screen.getByText('Review Workflow')).toBeInTheDocument()
expect(screen.getByText('Retriever Node')).toBeInTheDocument()
expect(screen.getByText('reason')).toBeInTheDocument()
expect(screen.getByText('evaluation.conditions.valueTypes.number')).toBeInTheDocument()
expect(screen.getByText('evaluation.conditions.valueTypes.string')).toBeInTheDocument()
fireEvent.click(screen.getByRole('option', { name: /reason/i }))
const condition = useEvaluationStore.getState().resources['apps:app-conditions-dropdown'].judgmentConfig.conditions[0]
expect(condition.variableSelector).toEqual(['workflow-1', 'reason'])
expect(screen.getAllByText('Review Workflow').length).toBeGreaterThan(0)
})
it('should render the metric no-node empty state', () => {
mockUseAvailableEvaluationMetrics.mockReturnValue({
data: {
metrics: ['context-precision'],
},
isLoading: false,
})
mockUseEvaluationNodeInfoMutation.mockReturnValue({
isPending: false,
mutate: (_input: unknown, options?: { onSuccess?: (data: Record<string, Array<{ node_id: string, title: string, type: string }>>) => void }) => {
options?.onSuccess?.({
'context-precision': [],
})
},
})
renderWithQueryClient(<Evaluation resourceType="apps" resourceId="app-3" />)
fireEvent.click(screen.getByRole('button', { name: 'evaluation.metrics.add' }))
expect(screen.getByText('evaluation.metrics.noNodesInWorkflow')).toBeInTheDocument()
})
it('should render the global empty state when no metrics are available', () => {
mockUseAvailableEvaluationMetrics.mockReturnValue({
data: {
metrics: [],
},
isLoading: false,
})
renderWithQueryClient(<Evaluation resourceType="apps" resourceId="app-4" />)
fireEvent.click(screen.getByRole('button', { name: 'evaluation.metrics.add' }))
expect(screen.getByText('evaluation.metrics.noResults')).toBeInTheDocument()
})
it('should show more nodes when a metric has more than three nodes', () => {
mockUseAvailableEvaluationMetrics.mockReturnValue({
data: {
metrics: ['answer-correctness'],
},
isLoading: false,
})
mockUseEvaluationNodeInfoMutation.mockReturnValue({
isPending: false,
mutate: (_input: unknown, options?: { onSuccess?: (data: Record<string, Array<{ node_id: string, title: string, type: string }>>) => void }) => {
options?.onSuccess?.({
'answer-correctness': [
{ node_id: 'node-1', title: 'LLM 1', type: 'llm' },
{ node_id: 'node-2', title: 'LLM 2', type: 'llm' },
{ node_id: 'node-3', title: 'LLM 3', type: 'llm' },
{ node_id: 'node-4', title: 'LLM 4', type: 'llm' },
],
})
},
})
renderWithQueryClient(<Evaluation resourceType="apps" resourceId="app-5" />)
fireEvent.click(screen.getByRole('button', { name: 'evaluation.metrics.add' }))
expect(screen.getByText('LLM 3')).toBeInTheDocument()
expect(screen.queryByText('LLM 4')).not.toBeInTheDocument()
fireEvent.click(screen.getByRole('button', { name: 'evaluation.metrics.showMore' }))
expect(screen.getByText('LLM 4')).toBeInTheDocument()
expect(screen.getByRole('button', { name: 'evaluation.metrics.showLess' })).toBeInTheDocument()
})
it('should render the pipeline-specific layout without auto-selecting a judge model', () => {
renderWithQueryClient(<Evaluation resourceType="datasets" resourceId="dataset-1" />)
expect(screen.getByTestId('evaluation-model-selector')).toHaveTextContent('empty')
expect(screen.getByText('evaluation.history.columns.time')).toBeInTheDocument()
expect(screen.getByText('Context Precision')).toBeInTheDocument()
expect(screen.getByText('Context Recall')).toBeInTheDocument()
expect(screen.getByText('Context Relevance')).toBeInTheDocument()
expect(screen.getByText('evaluation.results.empty')).toBeInTheDocument()
expect(screen.getByRole('button', { name: 'evaluation.pipeline.uploadAndRun' })).toBeDisabled()
})
it('should render selected pipeline metrics from config with the default threshold input', () => {
mockUseEvaluationConfig.mockReturnValue({
data: {
evaluation_model: null,
evaluation_model_provider: null,
default_metrics: [{
metric: 'context-precision',
}],
customized_metrics: null,
judgment_config: null,
},
})
renderWithQueryClient(<Evaluation resourceType="datasets" resourceId="dataset-2" />)
expect(screen.getByText('Context Precision')).toBeInTheDocument()
expect(screen.getByDisplayValue('0.85')).toBeInTheDocument()
})
it('should enable pipeline batch actions after selecting a judge model and metric', () => {
renderWithQueryClient(<Evaluation resourceType="datasets" resourceId="dataset-2" />)
fireEvent.click(screen.getByRole('button', { name: 'select-model' }))
fireEvent.click(screen.getByRole('button', { name: /Context Precision/i }))
expect(screen.getByDisplayValue('0.85')).toBeInTheDocument()
expect(screen.getByRole('button', { name: 'evaluation.batch.downloadTemplate' })).toBeEnabled()
expect(screen.getByRole('button', { name: 'evaluation.pipeline.uploadAndRun' })).toBeEnabled()
})
it('should upload and start a pipeline evaluation run', async () => {
const startRun = vi.fn()
mockUseStartEvaluationRunMutation.mockReturnValue({
isPending: false,
mutate: startRun,
})
mockUpload.mockResolvedValue({
id: 'file-1',
name: 'pipeline-evaluation.csv',
})
renderWithQueryClient(<Evaluation resourceType="datasets" resourceId="dataset-run" />)
fireEvent.click(screen.getByRole('button', { name: 'select-model' }))
fireEvent.click(screen.getByRole('button', { name: /Context Precision/i }))
fireEvent.click(screen.getByRole('button', { name: 'evaluation.pipeline.uploadAndRun' }))
expect(screen.getAllByText('query').length).toBeGreaterThan(0)
expect(screen.getAllByText('Expect Results').length).toBeGreaterThan(0)
const fileInput = document.querySelector<HTMLInputElement>('input[type="file"][accept=".csv,.xlsx"]')
expect(fileInput).toBeInTheDocument()
fireEvent.change(fileInput!, {
target: {
files: [new File(['query,Expect Results'], 'pipeline-evaluation.csv', { type: 'text/csv' })],
},
})
await waitFor(() => {
expect(mockUpload).toHaveBeenCalledWith({
xhr: expect.any(XMLHttpRequest),
data: expect.any(FormData),
})
})
fireEvent.click(screen.getByRole('button', { name: 'evaluation.batch.run' }))
await waitFor(() => {
expect(startRun).toHaveBeenCalledWith({
params: {
targetType: 'datasets',
targetId: 'dataset-run',
},
body: {
evaluation_model: 'gpt-4o-mini',
evaluation_model_provider: 'openai',
default_metrics: [{
metric: 'context-precision',
value_type: 'number',
node_info_list: [],
}],
customized_metrics: null,
judgment_config: null,
file_id: 'file-1',
},
}, {
onSuccess: expect.any(Function),
onError: expect.any(Function),
})
})
})
})