Merge branch 'fix/chore-fix' into dev/plugin-deploy

This commit is contained in:
Novice Lee
2025-01-09 08:48:00 +08:00
231 changed files with 3564 additions and 2382 deletions

View File

@ -31,17 +31,17 @@ import LanguageSelect from './language-select'
import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs'
import cn from '@/utils/classnames'
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, DocumentItem, FullDocumentDetail, ParentMode, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
import { ChunkingMode, DataSourceType, ProcessMode } from '@/models/datasets'
import Button from '@/app/components/base/button'
import FloatRightContainer from '@/app/components/base/float-right-container'
import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
import type { RetrievalConfig } from '@/types/app'
import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
import { isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
import Toast from '@/app/components/base/toast'
import type { NotionPage } from '@/models/common'
import { DataSourceProvider } from '@/models/common'
import { ChunkingMode, DataSourceType, RerankingModeEnum } from '@/models/datasets'
import { useDatasetDetailContext } from '@/context/dataset-detail'
import I18n from '@/context/i18n'
import { RETRIEVE_METHOD } from '@/types/app'
@ -53,7 +53,7 @@ import type { DefaultModel } from '@/app/components/header/account-setting/model
import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
import Checkbox from '@/app/components/base/checkbox'
import RadioCard from '@/app/components/base/radio-card'
import { IS_CE_EDITION } from '@/config'
import { FULL_DOC_PREVIEW_LENGTH, IS_CE_EDITION } from '@/config'
import Divider from '@/app/components/base/divider'
import { getNotionInfo, getWebsiteInfo, useCreateDocument, useCreateFirstDocument, useFetchDefaultProcessRule, useFetchFileIndexingEstimateForFile, useFetchFileIndexingEstimateForNotion, useFetchFileIndexingEstimateForWeb } from '@/service/knowledge/use-create-dataset'
import Badge from '@/app/components/base/badge'
@ -90,17 +90,13 @@ type StepTwoProps = {
onCancel?: () => void
}
export enum SegmentType {
AUTO = 'automatic',
CUSTOM = 'custom',
}
export enum IndexingType {
QUALIFIED = 'high_quality',
ECONOMICAL = 'economy',
}
const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
const DEFAULT_MAXMIMUM_CHUNK_LENGTH = 500
const DEFAULT_MAXIMUM_CHUNK_LENGTH = 500
const DEFAULT_OVERLAP = 50
type ParentChildConfig = {
@ -131,7 +127,6 @@ const StepTwo = ({
isSetting,
documentDetail,
isAPIKeySet,
onSetting,
datasetId,
indexingType,
dataSourceType: inCreatePageDataSourceType,
@ -162,12 +157,12 @@ const StepTwo = ({
const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type)
const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type
const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.CUSTOM)
const [segmentationType, setSegmentationType] = useState<ProcessMode>(ProcessMode.general)
const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
const setSegmentIdentifier = useCallback((value: string, canEmpty?: boolean) => {
doSetSegmentIdentifier(value ? escape(value) : (canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER))
}, [])
const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXMIMUM_CHUNK_LENGTH) // default chunk length
const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH) // default chunk length
const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(4000)
const [overlap, setOverlap] = useState(DEFAULT_OVERLAP)
const [rules, setRules] = useState<PreProcessingRule[]>([])
@ -198,7 +193,6 @@ const StepTwo = ({
)
// QA Related
const [isLanguageSelectDisabled, _setIsLanguageSelectDisabled] = useState(false)
const [isQAConfirmDialogOpen, setIsQAConfirmDialogOpen] = useState(false)
const [docForm, setDocForm] = useState<ChunkingMode>(
(datasetId && documentDetail) ? documentDetail.doc_form as ChunkingMode : ChunkingMode.text,
@ -211,7 +205,7 @@ const StepTwo = ({
if (value === ChunkingMode.parentChild && indexType === IndexingType.ECONOMICAL)
setIndexType(IndexingType.QUALIFIED)
setDocForm(value)
// eslint-disable-next-line @typescript-eslint/no-use-before-define
// eslint-disable-next-line
currentEstimateMutation.reset()
}
@ -348,7 +342,7 @@ const StepTwo = ({
}
const updatePreview = () => {
if (segmentationType === SegmentType.CUSTOM && maxChunkLength > 4000) {
if (segmentationType === ProcessMode.general && maxChunkLength > 4000) {
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') })
return
}
@ -373,13 +367,42 @@ const StepTwo = ({
model: defaultEmbeddingModel?.model || '',
},
)
const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || {
search_method: RETRIEVE_METHOD.semantic,
reranking_enable: false,
reranking_model: {
reranking_provider_name: '',
reranking_model_name: '',
},
top_k: 3,
score_threshold_enabled: false,
score_threshold: 0.5,
} as RetrievalConfig)
useEffect(() => {
if (currentDataset?.retrieval_model_dict)
return
setRetrievalConfig({
search_method: RETRIEVE_METHOD.semantic,
reranking_enable: !!isRerankDefaultModelValid,
reranking_model: {
reranking_provider_name: isRerankDefaultModelValid ? rerankDefaultModel?.provider.provider ?? '' : '',
reranking_model_name: isRerankDefaultModelValid ? rerankDefaultModel?.model ?? '' : '',
},
top_k: 3,
score_threshold_enabled: false,
score_threshold: 0.5,
})
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [rerankDefaultModel, isRerankDefaultModelValid])
const getCreationParams = () => {
let params
if (segmentationType === SegmentType.CUSTOM && overlap > maxChunkLength) {
if (segmentationType === ProcessMode.general && overlap > maxChunkLength) {
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
return
}
if (segmentationType === SegmentType.CUSTOM && maxChunkLength > limitMaxChunkLength) {
if (segmentationType === ProcessMode.general && maxChunkLength > limitMaxChunkLength) {
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: limitMaxChunkLength }) })
return
}
@ -389,7 +412,7 @@ const StepTwo = ({
doc_form: currentDocForm,
doc_language: docLanguage,
process_rule: getProcessRule(),
// eslint-disable-next-line ts/no-use-before-define
retrieval_model: retrievalConfig, // Readonly. If want to changed, just go to settings page.
embedding_model: embeddingModel.model, // Readonly
embedding_model_provider: embeddingModel.provider, // Readonly
@ -400,10 +423,8 @@ const StepTwo = ({
const indexMethod = getIndexing_technique()
if (
!isReRankModelSelected({
rerankDefaultModel,
isRerankDefaultModelValid: !!isRerankDefaultModelValid,
rerankModelList,
// eslint-disable-next-line ts/no-use-before-define
retrievalConfig,
indexMethod: indexMethod as string,
})
@ -411,16 +432,6 @@ const StepTwo = ({
Toast.notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') })
return
}
const postRetrievalConfig = ensureRerankModelSelected({
rerankDefaultModel: rerankDefaultModel!,
retrievalConfig: {
// eslint-disable-next-line @typescript-eslint/no-use-before-define
...retrievalConfig,
// eslint-disable-next-line @typescript-eslint/no-use-before-define
reranking_enable: retrievalConfig.reranking_mode === RerankingModeEnum.RerankingModel,
},
indexMethod: indexMethod as string,
})
params = {
data_source: {
type: dataSourceType,
@ -432,8 +443,7 @@ const StepTwo = ({
process_rule: getProcessRule(),
doc_form: currentDocForm,
doc_language: docLanguage,
retrieval_model: postRetrievalConfig,
retrieval_model: retrievalConfig,
embedding_model: embeddingModel.model,
embedding_model_provider: embeddingModel.provider,
} as CreateDocumentReq
@ -490,7 +500,6 @@ const StepTwo = ({
const getDefaultMode = () => {
if (documentDetail)
// @ts-expect-error fix after api refactored
setSegmentationType(documentDetail.dataset_process_rule.mode)
}
@ -525,7 +534,6 @@ const StepTwo = ({
onSuccess(data) {
updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)
updateResultCache && updateResultCache(data)
// eslint-disable-next-line @typescript-eslint/no-use-before-define
updateRetrievalMethodCache && updateRetrievalMethodCache(retrievalConfig.search_method as string)
},
},
@ -545,14 +553,6 @@ const StepTwo = ({
isSetting && onSave && onSave()
}
const changeToEconomicalType = () => {
if (docForm !== ChunkingMode.text)
return
if (!hasSetIndexType)
setIndexType(IndexingType.ECONOMICAL)
}
useEffect(() => {
// fetch rules
if (!isSetting) {
@ -574,18 +574,6 @@ const StepTwo = ({
setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL)
}, [isAPIKeySet, indexingType, datasetId])
const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || {
search_method: RETRIEVE_METHOD.semantic,
reranking_enable: false,
reranking_model: {
reranking_provider_name: rerankDefaultModel?.provider.provider,
reranking_model_name: rerankDefaultModel?.model,
},
top_k: 3,
score_threshold_enabled: false,
score_threshold: 0.5,
} as RetrievalConfig)
const economyDomRef = useRef<HTMLDivElement>(null)
const isHoveringEconomy = useHover(economyDomRef)
@ -946,6 +934,7 @@ const StepTwo = ({
<div className={cn('system-md-semibold mb-1', datasetId && 'flex justify-between items-center')}>{t('datasetSettings.form.embeddingModel')}</div>
<ModelSelector
readonly={!!datasetId}
triggerClassName={datasetId ? 'opacity-50' : ''}
defaultModel={embeddingModel}
modelList={embeddingModelList}
onSelect={(model: DefaultModel) => {
@ -984,12 +973,14 @@ const StepTwo = ({
getIndexing_technique() === IndexingType.QUALIFIED
? (
<RetrievalMethodConfig
disabled={!!datasetId}
value={retrievalConfig}
onChange={setRetrievalConfig}
/>
)
: (
<EconomicalRetrievalMethodConfig
disabled={!!datasetId}
value={retrievalConfig}
onChange={setRetrievalConfig}
/>
@ -1010,7 +1001,7 @@ const StepTwo = ({
)
: (
<div className='flex items-center mt-8 py-2'>
<Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.save')}</Button>
{!datasetId && <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.save')}</Button>}
<Button className='ml-2' onClick={onCancel}>{t('datasetCreation.stepTwo.cancel')}</Button>
</div>
)}
@ -1081,11 +1072,11 @@ const StepTwo = ({
}
{
currentDocForm !== ChunkingMode.qa
&& <Badge text={t(
'datasetCreation.stepTwo.previewChunkCount', {
count: estimate?.total_segments || 0,
}) as string}
/>
&& <Badge text={t(
'datasetCreation.stepTwo.previewChunkCount', {
count: estimate?.total_segments || 0,
}) as string}
/>
}
</div>
</PreviewHeader>}
@ -1117,6 +1108,9 @@ const StepTwo = ({
{currentDocForm === ChunkingMode.parentChild && currentEstimateMutation.data?.preview && (
estimate?.preview?.map((item, index) => {
const indexForLabel = index + 1
const childChunks = parentChildConfig.chunkForContext === 'full-doc'
? item.child_chunks.slice(0, FULL_DOC_PREVIEW_LENGTH)
: item.child_chunks
return (
<ChunkContainer
key={item.content}
@ -1124,7 +1118,7 @@ const StepTwo = ({
characterCount={item.content.length}
>
<FormattedText>
{item.child_chunks.map((child, index) => {
{childChunks.map((child, index) => {
const indexForLabel = index + 1
return (
<PreviewSlice

View File

@ -4,7 +4,7 @@ import classNames from '@/utils/classnames'
const TriangleArrow: FC<ComponentProps<'svg'>> = props => (
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="11" viewBox="0 0 24 11" fill="none" {...props}>
<path d="M9.87868 1.12132C11.0503 -0.0502525 12.9497 -0.0502525 14.1213 1.12132L23.3137 10.3137H0.686292L9.87868 1.12132Z" fill="currentColor"/>
<path d="M9.87868 1.12132C11.0503 -0.0502525 12.9497 -0.0502525 14.1213 1.12132L23.3137 10.3137H0.686292L9.87868 1.12132Z" fill="currentColor" />
</svg>
)
@ -65,7 +65,7 @@ export const OptionCard: FC<OptionCardProps> = forwardRef((props, ref) => {
(isActive && !noHighlight)
? 'border-[1.5px] border-components-option-card-option-selected-border'
: 'border border-components-option-card-option-border',
disabled && 'opacity-50 cursor-not-allowed',
disabled && 'opacity-50 pointer-events-none',
className,
)}
style={{