feat: implement document creation pipeline with multi-step wizard and datasource management (#30843)

Co-authored-by: CodingOnStar <hanxujiang@dify.ai>
This commit is contained in:
Coding On Star
2026-01-15 10:33:48 +08:00
committed by GitHub
parent d3923e7b56
commit a33ac77a22
15 changed files with 5783 additions and 577 deletions

View File

@ -0,0 +1,5 @@
export { useAddDocumentsSteps } from './use-add-documents-steps'
export { useDatasourceActions } from './use-datasource-actions'
export { useDatasourceOptions } from './use-datasource-options'
export { useLocalFile, useOnlineDocument, useOnlineDrive, useWebsiteCrawl } from './use-datasource-store'
export { useDatasourceUIState } from './use-datasource-ui-state'

View File

@ -0,0 +1,41 @@
import { useCallback, useState } from 'react'
import { useTranslation } from 'react-i18next'
import { AddDocumentsStep } from '../types'
/**
 * Wizard-step state for the add-documents flow.
 *
 * Exposes the translated step descriptors, the current step counter
 * (starting at 1) and callbacks that move the counter forward or backward
 * by one. The counter is not clamped here.
 */
export const useAddDocumentsSteps = () => {
  const { t } = useTranslation()
  const [currentStep, setCurrentStep] = useState(1)

  // Functional updates keep both callbacks stable across renders.
  const handleNextStep = useCallback(() => {
    setCurrentStep((step) => {
      return step + 1
    })
  }, [])

  const handleBackStep = useCallback(() => {
    setCurrentStep((step) => {
      return step - 1
    })
  }, [])

  // Labels are resolved on every render so they follow the active language.
  const chooseDatasourceLabel = t('addDocuments.steps.chooseDatasource', { ns: 'datasetPipeline' })
  const processDocumentsLabel = t('addDocuments.steps.processDocuments', { ns: 'datasetPipeline' })
  const processingDocumentsLabel = t('addDocuments.steps.processingDocuments', { ns: 'datasetPipeline' })

  const steps = [
    { label: chooseDatasourceLabel, value: AddDocumentsStep.dataSource },
    { label: processDocumentsLabel, value: AddDocumentsStep.processDocuments },
    { label: processingDocumentsLabel, value: AddDocumentsStep.processingDocuments },
  ]

  return {
    steps,
    currentStep,
    handleNextStep,
    handleBackStep,
  }
}

View File

@ -0,0 +1,321 @@
import type { StoreApi } from 'zustand'
import type { DataSourceShape } from '@/app/components/datasets/documents/create-from-pipeline/data-source/store'
import type { Datasource } from '@/app/components/rag-pipeline/components/panel/test-run/types'
import type { DataSourceNotionPageMap, NotionPage } from '@/models/common'
import type { CrawlResultItem, DocumentItem, CustomFile as File, FileIndexingEstimateResponse } from '@/models/datasets'
import type {
OnlineDriveFile,
PublishedPipelineRunPreviewResponse,
PublishedPipelineRunResponse,
} from '@/models/pipeline'
import { useCallback, useRef } from 'react'
import { trackEvent } from '@/app/components/base/amplitude'
import { DatasourceType } from '@/models/pipeline'
import { useRunPublishedPipeline } from '@/service/use-pipeline'
import {
buildLocalFileDatasourceInfo,
buildOnlineDocumentDatasourceInfo,
buildOnlineDriveDatasourceInfo,
buildWebsiteCrawlDatasourceInfo,
} from '../utils/datasource-info-builder'
/** Inputs consumed by `useDatasourceActions`. */
type DatasourceActionsParams = {
/** Currently selected datasource; preview and process runs return early while it is undefined. */
datasource: Datasource | undefined
/** Provider type of the datasource; compared against `DatasourceType` members to pick the matching branch. */
datasourceType: string | undefined
/** Sent as `pipeline_id` when running the published pipeline; runs return early while it is undefined. */
pipelineId: string | undefined
/** Zustand store read imperatively through `getState()` inside the handlers. */
dataSourceStore: StoreApi<DataSourceShape>
/** Receives the `data.outputs` of a successful preview run. */
setEstimateData: (data: FileIndexingEstimateResponse | undefined) => void
/** Receives the `batch` of a successful processing run (empty string when absent). */
setBatchId: (id: string) => void
/** Receives the `documents` of a successful processing run (empty array when absent). */
setDocuments: (docs: PublishedPipelineRunResponse['documents']) => void
/** Invoked after a successful processing run. */
handleNextStep: () => void
/** Lookup from page id to page, used by select-all to resolve the pages to select. */
PagesMapAndSelectedPagesId: DataSourceNotionPageMap
/** Pages whose ids are selected by select-all for online documents. */
currentWorkspacePages: { page_id: string }[] | undefined
/** Called when clearing data for an online-document datasource. */
clearOnlineDocumentData: () => void
/** Called when clearing data for a website-crawl datasource. */
clearWebsiteCrawlData: () => void
/** Called when clearing data for an online-drive datasource. */
clearOnlineDriveData: () => void
/** Called with the newly chosen datasource when switching datasources. */
setDatasource: (ds: Datasource) => void
}
/**
 * Hook for datasource-related actions: chunk preview, document processing,
 * preview-target changes, select-all, datasource switching and credential
 * changes.
 *
 * Preview and processing are both triggered by submitting the form held in
 * `formRef`; the `isPreview` ref records which of the two the submission
 * belongs to, and `handleSubmit` dispatches accordingly.
 *
 * @param params - Current datasource selection, the datasource store and the
 *   setters/callbacks that receive the results of a pipeline run.
 * @returns The `isPreview` and `formRef` refs, the mutation's `isIdle` /
 *   `isPending` flags and the action handlers.
 */
export const useDatasourceActions = ({
  datasource,
  datasourceType,
  pipelineId,
  dataSourceStore,
  setEstimateData,
  setBatchId,
  setDocuments,
  handleNextStep,
  PagesMapAndSelectedPagesId,
  currentWorkspacePages,
  clearOnlineDocumentData,
  clearWebsiteCrawlData,
  clearOnlineDriveData,
  setDatasource,
}: DatasourceActionsParams) => {
  // Set by onClickPreview / onClickProcess right before the form is submitted.
  const isPreview = useRef(false)
  const formRef = useRef<{ submit: () => void } | null>(null)
  const { mutateAsync: runPublishedPipeline, isIdle, isPending } = useRunPublishedPipeline()

  // Build datasource info for preview (at most one item: the current preview target)
  const buildPreviewDatasourceInfo = useCallback(() => {
    const {
      previewLocalFileRef,
      previewOnlineDocumentRef,
      previewWebsitePageRef,
      previewOnlineDriveFileRef,
      currentCredentialId,
      bucket,
    } = dataSourceStore.getState()

    const datasourceInfoList: Record<string, unknown>[] = []

    if (datasourceType === DatasourceType.localFile && previewLocalFileRef.current) {
      datasourceInfoList.push(buildLocalFileDatasourceInfo(
        previewLocalFileRef.current as File,
        currentCredentialId,
      ))
    }

    if (datasourceType === DatasourceType.onlineDocument && previewOnlineDocumentRef.current) {
      datasourceInfoList.push(buildOnlineDocumentDatasourceInfo(
        previewOnlineDocumentRef.current,
        currentCredentialId,
      ))
    }

    if (datasourceType === DatasourceType.websiteCrawl && previewWebsitePageRef.current) {
      datasourceInfoList.push(buildWebsiteCrawlDatasourceInfo(
        previewWebsitePageRef.current,
        currentCredentialId,
      ))
    }

    if (datasourceType === DatasourceType.onlineDrive && previewOnlineDriveFileRef.current) {
      datasourceInfoList.push(buildOnlineDriveDatasourceInfo(
        previewOnlineDriveFileRef.current,
        bucket,
        currentCredentialId,
      ))
    }

    return datasourceInfoList
  }, [dataSourceStore, datasourceType])

  // Build datasource info for processing (every selected item of the active type)
  const buildProcessDatasourceInfo = useCallback(() => {
    const {
      currentCredentialId,
      localFileList,
      onlineDocuments,
      websitePages,
      bucket,
      selectedFileIds,
      onlineDriveFileList,
    } = dataSourceStore.getState()

    const datasourceInfoList: Record<string, unknown>[] = []

    if (datasourceType === DatasourceType.localFile) {
      localFileList.forEach((file) => {
        datasourceInfoList.push(buildLocalFileDatasourceInfo(file.file, currentCredentialId))
      })
    }

    if (datasourceType === DatasourceType.onlineDocument) {
      onlineDocuments.forEach((page) => {
        datasourceInfoList.push(buildOnlineDocumentDatasourceInfo(page, currentCredentialId))
      })
    }

    if (datasourceType === DatasourceType.websiteCrawl) {
      websitePages.forEach((page) => {
        datasourceInfoList.push(buildWebsiteCrawlDatasourceInfo(page, currentCredentialId))
      })
    }

    if (datasourceType === DatasourceType.onlineDrive) {
      selectedFileIds.forEach((id) => {
        // Selected ids that are no longer in the file list are skipped.
        const file = onlineDriveFileList.find(f => f.id === id)
        if (file)
          datasourceInfoList.push(buildOnlineDriveDatasourceInfo(file, bucket, currentCredentialId))
      })
    }

    return datasourceInfoList
  }, [dataSourceStore, datasourceType])

  // Handle chunk preview: run the pipeline in preview mode and store the estimate
  const handlePreviewChunks = useCallback(async (data: Record<string, unknown>) => {
    if (!datasource || !pipelineId)
      return
    const datasourceInfoList = buildPreviewDatasourceInfo()
    await runPublishedPipeline({
      pipeline_id: pipelineId,
      inputs: data,
      start_node_id: datasource.nodeId,
      datasource_type: datasourceType as DatasourceType,
      datasource_info_list: datasourceInfoList,
      is_preview: true,
    }, {
      onSuccess: (res) => {
        setEstimateData((res as PublishedPipelineRunPreviewResponse).data.outputs)
      },
    })
  }, [datasource, pipelineId, datasourceType, buildPreviewDatasourceInfo, runPublishedPipeline, setEstimateData])

  // Handle document processing: run the pipeline for real and advance the wizard
  const handleProcess = useCallback(async (data: Record<string, unknown>) => {
    if (!datasource || !pipelineId)
      return
    const datasourceInfoList = buildProcessDatasourceInfo()
    await runPublishedPipeline({
      pipeline_id: pipelineId,
      inputs: data,
      start_node_id: datasource.nodeId,
      datasource_type: datasourceType as DatasourceType,
      datasource_info_list: datasourceInfoList,
      is_preview: false,
    }, {
      onSuccess: (res) => {
        setBatchId((res as PublishedPipelineRunResponse).batch || '')
        setDocuments((res as PublishedPipelineRunResponse).documents || [])
        handleNextStep()
        trackEvent('dataset_document_added', {
          data_source_type: datasourceType,
          indexing_technique: 'pipeline',
        })
      },
    })
  }, [datasource, pipelineId, datasourceType, buildProcessDatasourceInfo, runPublishedPipeline, setBatchId, setDocuments, handleNextStep])

  // Form submission handlers
  const onClickProcess = useCallback(() => {
    isPreview.current = false
    formRef.current?.submit()
  }, [])

  const onClickPreview = useCallback(() => {
    isPreview.current = true
    formRef.current?.submit()
  }, [])

  const handleSubmit = useCallback((data: Record<string, unknown>) => {
    const run = isPreview.current ? handlePreviewChunks : handleProcess
    // Both runners await `runPublishedPipeline` (the mutation's `mutateAsync`),
    // whose promise presumably rejects when the run fails — assuming
    // `useRunPublishedPipeline` wraps a TanStack Query mutation. Nothing
    // awaits this call, so the rejection is handled here instead of
    // surfacing as an unhandled promise rejection.
    void run(data).catch((error: unknown) => {
      console.error('Failed to run published pipeline', error)
    })
  }, [handlePreviewChunks, handleProcess])

  // Preview change handlers: remember the new preview target, then re-run the preview
  const handlePreviewFileChange = useCallback((file: DocumentItem) => {
    const { previewLocalFileRef } = dataSourceStore.getState()
    previewLocalFileRef.current = file
    onClickPreview()
  }, [dataSourceStore, onClickPreview])

  const handlePreviewOnlineDocumentChange = useCallback((page: NotionPage) => {
    const { previewOnlineDocumentRef } = dataSourceStore.getState()
    previewOnlineDocumentRef.current = page
    onClickPreview()
  }, [dataSourceStore, onClickPreview])

  const handlePreviewWebsiteChange = useCallback((website: CrawlResultItem) => {
    const { previewWebsitePageRef } = dataSourceStore.getState()
    previewWebsitePageRef.current = website
    onClickPreview()
  }, [dataSourceStore, onClickPreview])

  const handlePreviewOnlineDriveFileChange = useCallback((file: OnlineDriveFile) => {
    const { previewOnlineDriveFileRef } = dataSourceStore.getState()
    previewOnlineDriveFileRef.current = file
    onClickPreview()
  }, [dataSourceStore, onClickPreview])

  // Select all handler: selects everything unless everything is already selected, in which case it clears
  const handleSelectAll = useCallback(() => {
    const {
      onlineDocuments,
      onlineDriveFileList,
      selectedFileIds,
      setOnlineDocuments,
      setSelectedFileIds,
      setSelectedPagesId,
    } = dataSourceStore.getState()

    if (datasourceType === DatasourceType.onlineDocument) {
      const allIds = currentWorkspacePages?.map(page => page.page_id) || []
      if (onlineDocuments.length < allIds.length) {
        const selectedPages = allIds.map(pageId => PagesMapAndSelectedPagesId[pageId])
        setOnlineDocuments(selectedPages)
        setSelectedPagesId(new Set(allIds))
      }
      else {
        setOnlineDocuments([])
        setSelectedPagesId(new Set())
      }
    }

    if (datasourceType === DatasourceType.onlineDrive) {
      // Buckets are not selectable, so they are excluded from "all".
      const allKeys = onlineDriveFileList.filter(item => item.type !== 'bucket').map(file => file.id)
      if (selectedFileIds.length < allKeys.length)
        setSelectedFileIds(allKeys)
      else
        setSelectedFileIds([])
    }
  }, [PagesMapAndSelectedPagesId, currentWorkspacePages, dataSourceStore, datasourceType])

  // Clear datasource data based on the provider type of the given datasource
  const clearDataSourceData = useCallback((dataSource: Datasource) => {
    const providerType = dataSource.nodeData.provider_type
    const clearFunctions: Record<string, () => void> = {
      [DatasourceType.onlineDocument]: clearOnlineDocumentData,
      [DatasourceType.websiteCrawl]: clearWebsiteCrawlData,
      [DatasourceType.onlineDrive]: clearOnlineDriveData,
      [DatasourceType.localFile]: () => {},
    }
    // Unknown provider types have no entry and are ignored.
    clearFunctions[providerType]?.()
  }, [clearOnlineDocumentData, clearOnlineDriveData, clearWebsiteCrawlData])

  // Switch datasource handler: clear data, reset the credential, then publish the new datasource
  const handleSwitchDataSource = useCallback((dataSource: Datasource) => {
    const {
      setCurrentCredentialId,
      currentNodeIdRef,
    } = dataSourceStore.getState()
    clearDataSourceData(dataSource)
    setCurrentCredentialId('')
    currentNodeIdRef.current = dataSource.nodeId
    setDatasource(dataSource)
  }, [clearDataSourceData, dataSourceStore, setDatasource])

  // Credential change handler: data for the current datasource is cleared before the credential is stored
  const handleCredentialChange = useCallback((credentialId: string) => {
    const { setCurrentCredentialId } = dataSourceStore.getState()
    if (datasource)
      clearDataSourceData(datasource)
    setCurrentCredentialId(credentialId)
  }, [clearDataSourceData, dataSourceStore, datasource])

  return {
    isPreview,
    formRef,
    isIdle,
    isPending,
    onClickProcess,
    onClickPreview,
    handleSubmit,
    handlePreviewFileChange,
    handlePreviewOnlineDocumentChange,
    handlePreviewWebsiteChange,
    handlePreviewOnlineDriveFileChange,
    handleSelectAll,
    handleSwitchDataSource,
    handleCredentialChange,
  }
}

View File

@ -0,0 +1,27 @@
import type { DataSourceOption } from '@/app/components/rag-pipeline/components/panel/test-run/types'
import type { DataSourceNodeType } from '@/app/components/workflow/nodes/data-source/types'
import type { Node } from '@/app/components/workflow/types'
import { useMemo } from 'react'
import { BlockEnum } from '@/app/components/workflow/types'
/**
 * Hook for getting datasource options from pipeline nodes.
 *
 * Keeps only nodes whose `data.type` is `BlockEnum.DataSource` and maps each
 * to an option carrying the node title as label, the node id as value and
 * the node data as payload.
 *
 * @param pipelineNodes - Nodes of the pipeline graph.
 * @returns The option list, recomputed only when `pipelineNodes` changes
 *   identity.
 */
export const useDatasourceOptions = (pipelineNodes: Node<DataSourceNodeType>[]) => {
  // The filter runs inside the memo: filtering outside produces a fresh array
  // on every render, which would invalidate the memo each time.
  const options = useMemo<DataSourceOption[]>(() => {
    return pipelineNodes
      .filter(node => node.data.type === BlockEnum.DataSource)
      .map(node => ({
        label: node.data.title,
        value: node.id,
        data: node.data,
      }))
  }, [pipelineNodes])

  return options
}

View File

@ -0,0 +1,176 @@
import type { DataSourceNotionPageMap, DataSourceNotionWorkspace } from '@/models/common'
import { useCallback, useMemo } from 'react'
import { useShallow } from 'zustand/react/shallow'
import { CrawlStep } from '@/models/datasets'
import { useDataSourceStore, useDataSourceStoreWithSelector } from '../data-source/store'
/**
 * Local-file slice of the datasource store.
 *
 * Returns the uploaded file list, the file currently being previewed, a flag
 * telling whether the list is non-empty and every entry has a file id, and a
 * callback that closes the preview.
 */
export const useLocalFile = () => {
  const localFileState = useDataSourceStoreWithSelector(useShallow(state => ({
    localFileList: state.localFileList,
    currentLocalFile: state.currentLocalFile,
  })))
  const { localFileList, currentLocalFile } = localFileState
  const dataSourceStore = useDataSourceStore()

  const allFileLoaded = useMemo(() => {
    // An empty list never counts as "loaded".
    if (!(localFileList.length > 0))
      return false
    return localFileList.every(item => item.file.id)
  }, [localFileList])

  const hidePreviewLocalFile = useCallback(() => {
    const { setCurrentLocalFile: updateCurrentLocalFile } = dataSourceStore.getState()
    updateCurrentLocalFile(undefined)
  }, [dataSourceStore])

  return {
    localFileList,
    allFileLoaded,
    currentLocalFile,
    hidePreviewLocalFile,
  }
}
/**
 * Online-document slice of the datasource store.
 *
 * Returns the first workspace, the selected documents, the document being
 * previewed, a page-id lookup spanning every workspace, and callbacks to
 * close the preview or reset all online-document state.
 */
export const useOnlineDocument = () => {
  const documentState = useDataSourceStoreWithSelector(useShallow(state => ({
    documentsData: state.documentsData,
    onlineDocuments: state.onlineDocuments,
    currentDocument: state.currentDocument,
  })))
  const { documentsData, onlineDocuments, currentDocument } = documentState
  const dataSourceStore = useDataSourceStore()

  const currentWorkspace = documentsData[0]

  const PagesMapAndSelectedPagesId: DataSourceNotionPageMap = useMemo(() => {
    const pageLookup: DataSourceNotionPageMap = {}
    const workspaces: DataSourceNotionWorkspace[] = documentsData || []
    workspaces.forEach((workspace) => {
      workspace.pages.forEach((page) => {
        // Each page is stored together with the id of its owning workspace.
        pageLookup[page.page_id] = {
          ...page,
          workspace_id: workspace.workspace_id,
        }
      })
    })
    return pageLookup
  }, [documentsData])

  const hidePreviewOnlineDocument = useCallback(() => {
    const { setCurrentDocument: updateCurrentDocument } = dataSourceStore.getState()
    updateCurrentDocument(undefined)
  }, [dataSourceStore])

  const clearOnlineDocumentData = useCallback(() => {
    const {
      setDocumentsData: updateDocumentsData,
      setSearchValue: updateSearchValue,
      setSelectedPagesId: updateSelectedPagesId,
      setOnlineDocuments: updateOnlineDocuments,
      setCurrentDocument: updateCurrentDocument,
    } = dataSourceStore.getState()
    updateDocumentsData([])
    updateSearchValue('')
    updateSelectedPagesId(new Set())
    updateOnlineDocuments([])
    updateCurrentDocument(undefined)
  }, [dataSourceStore])

  return {
    currentWorkspace,
    onlineDocuments,
    currentDocument,
    PagesMapAndSelectedPagesId,
    hidePreviewOnlineDocument,
    clearOnlineDocumentData,
  }
}
/**
 * Website-crawl slice of the datasource store.
 *
 * Returns the selected pages, the page being previewed, and callbacks to
 * close the preview or reset all website-crawl state.
 */
export const useWebsiteCrawl = () => {
  const crawlState = useDataSourceStoreWithSelector(useShallow(state => ({
    websitePages: state.websitePages,
    currentWebsite: state.currentWebsite,
  })))
  const dataSourceStore = useDataSourceStore()

  const hideWebsitePreview = useCallback(() => {
    const {
      setCurrentWebsite: updateCurrentWebsite,
      setPreviewIndex: updatePreviewIndex,
    } = dataSourceStore.getState()
    updateCurrentWebsite(undefined)
    updatePreviewIndex(-1)
  }, [dataSourceStore])

  const clearWebsiteCrawlData = useCallback(() => {
    const {
      setStep: updateStep,
      setCrawlResult: updateCrawlResult,
      setWebsitePages: updateWebsitePages,
      setPreviewIndex: updatePreviewIndex,
      setCurrentWebsite: updateCurrentWebsite,
    } = dataSourceStore.getState()
    // Back to the initial crawl step with no result, preview or selection.
    updateStep(CrawlStep.init)
    updateCrawlResult(undefined)
    updateCurrentWebsite(undefined)
    updateWebsitePages([])
    updatePreviewIndex(-1)
  }, [dataSourceStore])

  return {
    websitePages: crawlState.websitePages,
    currentWebsite: crawlState.currentWebsite,
    hideWebsitePreview,
    clearWebsiteCrawlData,
  }
}
/**
 * Hook for online drive datasource store operations.
 *
 * @returns The current drive file list, the selected file ids, the selected
 *   files resolved against the current list (in selection order), and a
 *   callback that resets all online-drive state.
 */
export const useOnlineDrive = () => {
  const {
    onlineDriveFileList,
    selectedFileIds,
  } = useDataSourceStoreWithSelector(useShallow(state => ({
    onlineDriveFileList: state.onlineDriveFileList,
    selectedFileIds: state.selectedFileIds,
  })))
  const dataSourceStore = useDataSourceStore()

  const selectedOnlineDriveFileList = useMemo(() => {
    // A selected id may have no match in the current file list; such ids are
    // dropped rather than asserted non-null, so the result never contains
    // `undefined` entries.
    return selectedFileIds.flatMap((id) => {
      const file = onlineDriveFileList.find(item => item.id === id)
      return file ? [file] : []
    })
  }, [onlineDriveFileList, selectedFileIds])

  const clearOnlineDriveData = useCallback(() => {
    const {
      setOnlineDriveFileList,
      setBucket,
      setPrefix,
      setKeywords,
      setSelectedFileIds,
    } = dataSourceStore.getState()
    setOnlineDriveFileList([])
    setBucket('')
    setPrefix([])
    setKeywords('')
    setSelectedFileIds([])
  }, [dataSourceStore])

  return {
    onlineDriveFileList,
    selectedFileIds,
    selectedOnlineDriveFileList,
    clearOnlineDriveData,
  }
}

View File

@ -0,0 +1,132 @@
import type { Datasource } from '@/app/components/rag-pipeline/components/panel/test-run/types'
import type { OnlineDriveFile } from '@/models/pipeline'
import { useMemo } from 'react'
import { useTranslation } from 'react-i18next'
import { DatasourceType } from '@/models/pipeline'
/** Inputs consumed by `useDatasourceUIState`. */
type DatasourceUIStateParams = {
/** Selected datasource; its `nodeData.provider_type` determines which branch of each computation applies. */
datasource: Datasource | undefined
/** Local-file readiness flag; the next button stays disabled for local files while it is false. */
allFileLoaded: boolean
/** Number of local files; zero disables the next button for local files. */
localFileListLength: number
/** Number of selected online documents; also reported as `selectedOptions` for online documents. */
onlineDocumentsLength: number
/** Number of selected website pages; zero disables the next button for website crawl. */
websitePagesLength: number
/** Number of selected online-drive files; also reported as `selectedOptions` for online drive. */
selectedFileIdsLength: number
/** Current online-drive listing; items with `type === 'bucket'` are excluded from selectable counts. */
onlineDriveFileList: OnlineDriveFile[]
/** Must be true (together with `enableBilling`) for `isShowVectorSpaceFull` to be truthy. */
isVectorSpaceFull: boolean
/** Must be true (together with `isVectorSpaceFull`) for `isShowVectorSpaceFull` to be truthy. */
enableBilling: boolean
/** Reported as `totalOptions` for online documents; select-all is shown when it is positive. */
currentWorkspacePagesLength: number
/** Values interpolated into the online-drive selection tip. */
fileUploadConfig: { file_size_limit: number, batch_count_limit: number }
}
/**
 * Hook for computing datasource UI state based on datasource type.
 *
 * @param params - Selection counts, billing flags and upload limits; see
 *   `DatasourceUIStateParams`.
 * @returns The provider type of the datasource plus derived UI state:
 *   whether the vector-space-full notice applies, whether the next button is
 *   disabled, whether select-all is shown, total/selected option counts
 *   (`undefined` for types without select-all) and the selection tip text.
 */
export const useDatasourceUIState = ({
  datasource,
  allFileLoaded,
  localFileListLength,
  onlineDocumentsLength,
  websitePagesLength,
  selectedFileIdsLength,
  onlineDriveFileList,
  isVectorSpaceFull,
  enableBilling,
  currentWorkspacePagesLength,
  fileUploadConfig,
}: DatasourceUIStateParams) => {
  const { t } = useTranslation()
  const datasourceType = datasource?.nodeData.provider_type

  const isShowVectorSpaceFull = useMemo(() => {
    if (!datasource || !datasourceType)
      return false

    // Lookup table for vector space full condition check
    const vectorSpaceFullConditions: Record<string, boolean> = {
      [DatasourceType.localFile]: allFileLoaded,
      [DatasourceType.onlineDocument]: onlineDocumentsLength > 0,
      [DatasourceType.websiteCrawl]: websitePagesLength > 0,
      [DatasourceType.onlineDrive]: onlineDriveFileList.length > 0,
    }

    // Provider types missing from the table have no entry; fall back to
    // `false` so the result is always a boolean rather than `undefined`.
    const condition = vectorSpaceFullConditions[datasourceType] ?? false
    return condition && isVectorSpaceFull && enableBilling
  }, [datasource, datasourceType, allFileLoaded, onlineDocumentsLength, websitePagesLength, onlineDriveFileList.length, isVectorSpaceFull, enableBilling])

  // Lookup table for next button disabled conditions
  const nextBtnDisabled = useMemo(() => {
    if (!datasource || !datasourceType)
      return true

    const disabledConditions: Record<string, boolean> = {
      [DatasourceType.localFile]: isShowVectorSpaceFull || localFileListLength === 0 || !allFileLoaded,
      [DatasourceType.onlineDocument]: isShowVectorSpaceFull || onlineDocumentsLength === 0,
      [DatasourceType.websiteCrawl]: isShowVectorSpaceFull || websitePagesLength === 0,
      [DatasourceType.onlineDrive]: isShowVectorSpaceFull || selectedFileIdsLength === 0,
    }

    // Provider types missing from the table keep the button disabled.
    return disabledConditions[datasourceType] ?? true
  }, [datasource, datasourceType, isShowVectorSpaceFull, localFileListLength, allFileLoaded, onlineDocumentsLength, websitePagesLength, selectedFileIdsLength])

  // Check if select all should be shown
  const showSelect = useMemo(() => {
    if (datasourceType === DatasourceType.onlineDocument)
      return currentWorkspacePagesLength > 0

    if (datasourceType === DatasourceType.onlineDrive) {
      // Shown only for a non-empty listing that contains no bucket entries.
      return onlineDriveFileList.length > 0
        && onlineDriveFileList.every(item => item.type !== 'bucket')
    }

    return false
  }, [currentWorkspacePagesLength, datasourceType, onlineDriveFileList])

  // Total selectable options count
  const totalOptions = useMemo(() => {
    if (datasourceType === DatasourceType.onlineDocument)
      return currentWorkspacePagesLength

    if (datasourceType === DatasourceType.onlineDrive)
      return onlineDriveFileList.filter(item => item.type !== 'bucket').length

    return undefined
  }, [currentWorkspacePagesLength, datasourceType, onlineDriveFileList])

  // Selected options count
  const selectedOptions = useMemo(() => {
    if (datasourceType === DatasourceType.onlineDocument)
      return onlineDocumentsLength

    if (datasourceType === DatasourceType.onlineDrive)
      return selectedFileIdsLength

    return undefined
  }, [datasourceType, onlineDocumentsLength, selectedFileIdsLength])

  // Tip message for selection
  const tip = useMemo(() => {
    if (datasourceType === DatasourceType.onlineDocument) {
      // NOTE(review): 50 is hard-coded here; presumably the online-document
      // selection limit — confirm against the backend.
      return t('addDocuments.selectOnlineDocumentTip', { ns: 'datasetPipeline', count: 50 })
    }

    if (datasourceType === DatasourceType.onlineDrive) {
      return t('addDocuments.selectOnlineDriveTip', {
        ns: 'datasetPipeline',
        count: fileUploadConfig.batch_count_limit,
        fileSize: fileUploadConfig.file_size_limit,
      })
    }

    return ''
  }, [datasourceType, fileUploadConfig.batch_count_limit, fileUploadConfig.file_size_limit, t])

  return {
    datasourceType,
    isShowVectorSpaceFull,
    nextBtnDisabled,
    showSelect,
    totalOptions,
    selectedOptions,
    tip,
  }
}