mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-04-26 13:35:43 +08:00
feat: pipeline add preprocess (#13302)
### What problem does this PR solve?

feat: pipeline add preprocess

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
This commit is contained in:
@ -239,6 +239,31 @@ export const MultiSelect = React.forwardRef<
|
||||
'options' in option ? option.options : [option],
|
||||
);
|
||||
}, [options]);
|
||||
|
||||
// Values of all options flagged `disabled`, held in a Set so later lookups
// are simple membership checks. Recomputed only when `flatOptions` changes.
const disabledValueSet = React.useMemo(() => {
  const disabledOptions = flatOptions.filter(({ disabled }) => disabled);

  return new Set(disabledOptions.map(({ value }) => value));
}, [flatOptions]);
|
||||
|
||||
// Merges `values` with the currently selected values whose option is
// disabled, so bulk operations cannot drop them. Disabled selections come
// first (in their current order), followed by `values`; duplicates are
// removed, keeping the first occurrence.
const preserveDisabledValues = React.useCallback(
  (values: string[]) => {
    const merged = new Set<string>();

    selectedValues.forEach((selected) => {
      if (disabledValueSet.has(selected)) {
        merged.add(selected);
      }
    });
    values.forEach((candidate) => merged.add(candidate));

    return Array.from(merged);
  },
  [disabledValueSet, selectedValues],
);
|
||||
|
||||
// A value may be removed from the selection unless its option is disabled.
const canRemoveValue = React.useCallback(
  (value: string) => {
    const isLocked = disabledValueSet.has(value);

    return !isLocked;
  },
  [disabledValueSet],
);
|
||||
|
||||
const handleInputKeyDown = (
|
||||
event: React.KeyboardEvent<HTMLInputElement>,
|
||||
) => {
|
||||
@ -246,13 +271,26 @@ export const MultiSelect = React.forwardRef<
|
||||
setIsPopoverOpen(true);
|
||||
} else if (event.key === 'Backspace' && !event.currentTarget.value) {
|
||||
const newSelectedValues = [...selectedValues];
|
||||
newSelectedValues.pop();
|
||||
const removableIndex = [...newSelectedValues]
|
||||
.reverse()
|
||||
.findIndex((value) => canRemoveValue(value));
|
||||
if (removableIndex < 0) {
|
||||
return;
|
||||
}
|
||||
newSelectedValues.splice(
|
||||
newSelectedValues.length - 1 - removableIndex,
|
||||
1,
|
||||
);
|
||||
setSelectedValues(newSelectedValues);
|
||||
onValueChange(newSelectedValues);
|
||||
}
|
||||
};
|
||||
|
||||
const toggleOption = (option: string) => {
|
||||
if (disabledValueSet.has(option)) {
|
||||
return;
|
||||
}
|
||||
|
||||
const newSelectedValues = selectedValues.includes(option)
|
||||
? selectedValues.filter((value) => value !== option)
|
||||
: [...selectedValues, option];
|
||||
@ -261,8 +299,9 @@ export const MultiSelect = React.forwardRef<
|
||||
};
|
||||
|
||||
// Clears the selection, except for selected values whose option is disabled
// (those are re-added by `preserveDisabledValues`). State and the change
// callback are updated exactly once, with the same final list, so listeners
// never observe an intermediate empty selection.
const handleClear = () => {
  const nextValues = preserveDisabledValues([]);
  setSelectedValues(nextValues);
  onValueChange(nextValues);
};
|
||||
|
||||
const handleTogglePopover = () => {
|
||||
@ -270,7 +309,9 @@ export const MultiSelect = React.forwardRef<
|
||||
};
|
||||
|
||||
// Trims the selection to its first `maxCount` entries, then re-adds any
// selected value whose option is disabled so trimming cannot remove it.
// `newSelectedValues` is declared once; the same list is written to state
// and reported through `onValueChange`.
const clearExtraOptions = () => {
  const newSelectedValues = preserveDisabledValues(
    selectedValues.slice(0, maxCount),
  );
  setSelectedValues(newSelectedValues);
  onValueChange(newSelectedValues);
};
|
||||
@ -279,7 +320,9 @@ export const MultiSelect = React.forwardRef<
|
||||
if (selectedValues.length === flatOptions.length) {
|
||||
handleClear();
|
||||
} else {
|
||||
const allValues = flatOptions.map((option) => option.value);
|
||||
const allValues = preserveDisabledValues(
|
||||
flatOptions.map((option) => option.value),
|
||||
);
|
||||
setSelectedValues(allValues);
|
||||
onValueChange(allValues);
|
||||
}
|
||||
@ -325,13 +368,15 @@ export const MultiSelect = React.forwardRef<
|
||||
<div className="max-w-28 text-ellipsis overflow-hidden">
|
||||
{option?.label}
|
||||
</div>
|
||||
<XCircle
|
||||
className="h-4 w-4 cursor-pointer"
|
||||
onClick={(event) => {
|
||||
event.stopPropagation();
|
||||
toggleOption(value);
|
||||
}}
|
||||
/>
|
||||
{canRemoveValue(value) && (
|
||||
<XCircle
|
||||
className="h-4 w-4 cursor-pointer"
|
||||
onClick={(event) => {
|
||||
event.stopPropagation();
|
||||
toggleOption(value);
|
||||
}}
|
||||
/>
|
||||
)}
|
||||
</div>
|
||||
</Badge>
|
||||
);
|
||||
|
||||
@ -1429,6 +1429,13 @@ Example: Virtual Hosted Style`,
|
||||
pleaseUploadAtLeastOneFile: 'Please upload at least one file',
|
||||
},
|
||||
flow: {
|
||||
preprocess: {
|
||||
preprocess: 'Preprocess',
|
||||
mainContent: 'Main content',
|
||||
abstract: 'Abstract',
|
||||
author: 'Author',
|
||||
sectionTitle: 'Section title',
|
||||
},
|
||||
autoPlay: 'Auto play audio',
|
||||
downloadFileTypeTip: 'The file type to download',
|
||||
downloadFileType: 'Download file type',
|
||||
|
||||
@ -1209,6 +1209,13 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于
|
||||
pleaseUploadAtLeastOneFile: '请上传至少一个文件',
|
||||
},
|
||||
flow: {
|
||||
preprocess: {
|
||||
preprocess: '预处理',
|
||||
mainContent: '主内容',
|
||||
abstract: '摘要',
|
||||
author: '作者',
|
||||
sectionTitle: '章节标题',
|
||||
},
|
||||
autoPlay: '自动播放',
|
||||
downloadFileTypeTip: '文件下载的类型',
|
||||
downloadFileType: '文件类型',
|
||||
|
||||
@ -154,6 +154,22 @@ export enum StringTransformDelimiter {
|
||||
Space = ' ',
|
||||
}
|
||||
|
||||
/**
 * Values stored in a parser setup's `preprocess` list.
 *
 * Note that `section_title` is stored as the string `'title'`; the member
 * name and its value intentionally differ.
 */
export enum PreprocessValue {
  main_content = 'main_content',
  section_title = 'title',
  abstract = 'abstract',
  author = 'author',
}

/** Alias for the main-content preprocess value. */
export const MAIN_CONTENT_PREPROCESS_VALUE: PreprocessValue =
  PreprocessValue.main_content;

/**
 * Maps each preprocess value to a label key. The keys are written as
 * computed enum members so they cannot drift from the enum's string values.
 * The mapped names match the entries under `flow.preprocess` in the locale
 * files.
 */
export const PreprocessLabelKeyMap: Record<PreprocessValue, string> = {
  [PreprocessValue.main_content]: 'mainContent',
  [PreprocessValue.section_title]: 'sectionTitle',
  [PreprocessValue.abstract]: 'abstract',
  [PreprocessValue.author]: 'author',
};
|
||||
export const initialParserValues = {
|
||||
outputs: {
|
||||
markdown: { type: 'string', value: '' },
|
||||
@ -166,35 +182,42 @@ export const initialParserValues = {
|
||||
fileFormat: FileType.PDF,
|
||||
output_format: PdfOutputFormat.Json,
|
||||
parse_method: ParseDocumentType.DeepDOC,
|
||||
preprocess: PreprocessValue.main_content,
|
||||
},
|
||||
{
|
||||
fileFormat: FileType.Spreadsheet,
|
||||
output_format: SpreadsheetOutputFormat.Html,
|
||||
parse_method: ParseDocumentType.DeepDOC,
|
||||
preprocess: PreprocessValue.main_content,
|
||||
},
|
||||
{
|
||||
fileFormat: FileType.Image,
|
||||
output_format: ImageOutputFormat.Text,
|
||||
parse_method: ImageParseMethod.OCR,
|
||||
preprocess: PreprocessValue.main_content,
|
||||
system_prompt: '',
|
||||
},
|
||||
{
|
||||
fileFormat: FileType.Email,
|
||||
fields: Object.values(ParserFields),
|
||||
output_format: EmailOutputFormat.Text,
|
||||
preprocess: PreprocessValue.main_content,
|
||||
},
|
||||
{
|
||||
fileFormat: FileType.TextMarkdown,
|
||||
output_format: TextMarkdownOutputFormat.Text,
|
||||
preprocess: PreprocessValue.main_content,
|
||||
},
|
||||
{
|
||||
fileFormat: FileType.Docx,
|
||||
output_format: DocxOutputFormat.Json,
|
||||
preprocess: PreprocessValue.main_content,
|
||||
},
|
||||
{
|
||||
fileFormat: FileType.PowerPoint,
|
||||
output_format: PptOutputFormat.Json,
|
||||
parse_method: ParseDocumentType.DeepDOC,
|
||||
preprocess: PreprocessValue.main_content,
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
@ -5,16 +5,17 @@ import {
|
||||
import { RAGFlowFormItem } from '@/components/ragflow-form';
|
||||
import { BlockButton, Button } from '@/components/ui/button';
|
||||
import { Form } from '@/components/ui/form';
|
||||
import { MultiSelect } from '@/components/ui/multi-select';
|
||||
import { Separator } from '@/components/ui/separator';
|
||||
import { cn } from '@/lib/utils';
|
||||
import { buildOptions } from '@/utils/form';
|
||||
import { zodResolver } from '@hookform/resolvers/zod';
|
||||
import { useHover } from 'ahooks';
|
||||
import { Trash2 } from 'lucide-react';
|
||||
import { memo, useCallback, useMemo, useRef } from 'react';
|
||||
import { memo, useCallback, useEffect, useMemo, useRef } from 'react';
|
||||
import {
|
||||
UseFieldArrayRemove,
|
||||
useFieldArray,
|
||||
UseFieldArrayRemove,
|
||||
useForm,
|
||||
useFormContext,
|
||||
} from 'react-hook-form';
|
||||
@ -24,6 +25,8 @@ import {
|
||||
FileType,
|
||||
InitialOutputFormatMap,
|
||||
initialParserValues,
|
||||
MAIN_CONTENT_PREPROCESS_VALUE,
|
||||
PreprocessValue,
|
||||
} from '../../constant/pipeline';
|
||||
import { useFormValues } from '../../hooks/use-form-values';
|
||||
import { useWatchFormChange } from '../../hooks/use-watch-form-change';
|
||||
@ -41,6 +44,74 @@ import { AudioFormFields, VideoFormFields } from './video-form-fields';
|
||||
|
||||
const outputList = buildOutputList(initialParserValues.outputs);
|
||||
|
||||
/**
 * One preprocess option offered for a file type. `required` marks an option
 * that is always part of the normalized selection.
 */
type PreprocessOptionConfig = {
  value: PreprocessValue;
  required?: boolean;
};

/** Builds a fresh "main content" entry, which is required for every type. */
const buildRequiredMainContentConfig = (): PreprocessOptionConfig => ({
  value: MAIN_CONTENT_PREPROCESS_VALUE,
  required: true,
});

/** Builds an entry the user is free to select or deselect. */
const buildOptionalPreprocessConfig = (
  value: PreprocessValue,
): PreprocessOptionConfig => ({ value });

/** Options used when no file type is given or the type has no own entry. */
const DefaultPreprocessOptionConfigs: PreprocessOptionConfig[] = [
  buildRequiredMainContentConfig(),
];

/** Per-file-type option lists; types not listed here use the default list. */
const PreprocessOptionConfigsMap: Partial<
  Record<FileType, PreprocessOptionConfig[]>
> = {
  [FileType.PDF]: [
    buildRequiredMainContentConfig(),
    buildOptionalPreprocessConfig(PreprocessValue.abstract),
    buildOptionalPreprocessConfig(PreprocessValue.author),
    buildOptionalPreprocessConfig(PreprocessValue.section_title),
  ],
  [FileType.PowerPoint]: [buildRequiredMainContentConfig()],
  [FileType.Spreadsheet]: [buildRequiredMainContentConfig()],
  [FileType.TextMarkdown]: [
    buildRequiredMainContentConfig(),
    buildOptionalPreprocessConfig(PreprocessValue.section_title),
  ],
  [FileType.Docx]: [
    buildRequiredMainContentConfig(),
    buildOptionalPreprocessConfig(PreprocessValue.section_title),
  ],
};
|
||||
|
||||
/**
 * Returns the preprocess options available for `fileType`.
 *
 * Falls back to `DefaultPreprocessOptionConfigs` when `fileType` is falsy or
 * has no entry in `PreprocessOptionConfigsMap`.
 */
function getPreprocessOptionConfigs(fileType?: FileType) {
  const configured = fileType ? PreprocessOptionConfigsMap[fileType] : undefined;

  return configured ?? DefaultPreprocessOptionConfigs;
}
|
||||
|
||||
/**
 * Normalizes a preprocess selection against the options of `fileType`.
 *
 * The result starts with every required value (in configuration order),
 * followed by the entries of `values` that the file type allows (in input
 * order). Duplicates are removed, keeping the first occurrence. A `values`
 * argument that is not an array is treated as an empty selection.
 */
function normalizePreprocessValuesByFileType(
  fileType: FileType | undefined,
  values: string[] | undefined,
) {
  const allowed = new Set<PreprocessValue>();
  const normalized = new Set<PreprocessValue>();

  // One pass over the configs: record what is allowed and seed the result
  // with the required values.
  for (const { value, required } of getPreprocessOptionConfigs(fileType)) {
    allowed.add(value);
    if (required) {
      normalized.add(value);
    }
  }

  if (Array.isArray(values)) {
    for (const candidate of values) {
      const typed = candidate as PreprocessValue;
      if (allowed.has(typed)) {
        normalized.add(typed);
      }
    }
  }

  return Array.from(normalized);
}
|
||||
|
||||
/**
 * Reports whether `a` and `b` hold the same strings in the same order.
 *
 * `a` is read from form state through a type cast, so at runtime it may be
 * something other than an array (for example a scalar string taken from
 * initial values). Any non-array `a`, including `undefined`, is treated as
 * "not the same" rather than being passed to `every`, which would throw for
 * a string whose length happens to equal `b.length`.
 *
 * @param a - Current value; may be `undefined` or, at runtime, a non-array.
 * @param b - Array to compare against.
 * @returns `true` only when `a` is an array equal to `b` element by element.
 */
function isSameStringArray(a: string[] | undefined, b: string[]) {
  if (!Array.isArray(a) || a.length !== b.length) {
    return false;
  }

  return a.every((item, idx) => item === b[idx]);
}
|
||||
|
||||
const FileFormatWidgetMap = {
|
||||
[FileType.PDF]: PdfFormFields,
|
||||
[FileType.Spreadsheet]: SpreadsheetFormFields,
|
||||
@ -63,6 +134,7 @@ export const FormSchema = z.object({
|
||||
setups: z.array(
|
||||
z.object({
|
||||
fileFormat: z.string().nullish(),
|
||||
preprocess: z.array(z.string()).optional(),
|
||||
output_format: z.string().optional(),
|
||||
parse_method: z.string().optional(),
|
||||
lang: z.string().optional(),
|
||||
@ -121,6 +193,57 @@ function ParserItem({
|
||||
[form, index],
|
||||
);
|
||||
|
||||
// Writes a new preprocess selection into this setup's form field, flagging
// the field as dirty and touched and re-running validation.
const handlePreprocessChange = useCallback(
  (value: PreprocessValue[]) => {
    const userEditFlags = {
      shouldDirty: true,
      shouldValidate: true,
      shouldTouch: true,
    };

    form.setValue(`setups.${index}.preprocess`, value, userEditFlags);
  },
  [form, index],
);
|
||||
|
||||
// Options shown in the preprocess multi-select for the current file format.
// Required options are passed as `disabled`. The translated label table is
// built once per recomputation instead of once per option, since it does not
// depend on the option being mapped.
const preprocessOptions = useMemo(() => {
  const labelMap: Record<string, string> = {
    [MAIN_CONTENT_PREPROCESS_VALUE]: t('flow.preprocess.mainContent'),
    [PreprocessValue.section_title]: t('flow.preprocess.sectionTitle'),
    [PreprocessValue.abstract]: t('flow.preprocess.abstract'),
    [PreprocessValue.author]: t('flow.preprocess.author'),
  };

  return getPreprocessOptionConfigs(fileFormat as FileType).map(
    ({ value, required }) => ({
      value,
      disabled: required,
      // `||` on purpose: an empty translation also falls back to the raw value.
      label: labelMap[value] || value,
    }),
  );
}, [fileFormat, t]);
|
||||
|
||||
// Keeps the stored preprocess selection valid for the current file format:
// when the stored value differs from its normalized form, the normalized
// list is written back without marking the field dirty.
useEffect(() => {
  const current = form.getValues(`setups.${index}.preprocess`) as
    | string[]
    | undefined;
  const normalized = normalizePreprocessValuesByFileType(
    fileFormat as FileType,
    current,
  );

  if (isSameStringArray(current, normalized)) {
    return;
  }

  form.setValue(`setups.${index}.preprocess`, normalized, {
    shouldDirty: false,
    shouldValidate: true,
  });
}, [fileFormat, form, index]);
|
||||
|
||||
return (
|
||||
<section
|
||||
className={cn('space-y-5 py-2.5 rounded-md', {
|
||||
@ -159,6 +282,26 @@ function ParserItem({
|
||||
fileType={fileFormat as FileType}
|
||||
/>
|
||||
</div>
|
||||
<RAGFlowFormItem
|
||||
name={buildFieldNameWithPrefix(`preprocess`, prefix)}
|
||||
label={t('flow.preprocess.preprocess')}
|
||||
>
|
||||
{(field) => (
|
||||
<MultiSelect
|
||||
value={field.value || []}
|
||||
onValueChange={(val) => {
|
||||
const nextValues = normalizePreprocessValuesByFileType(
|
||||
fileFormat as FileType,
|
||||
val,
|
||||
);
|
||||
field.onChange(nextValues);
|
||||
handlePreprocessChange(nextValues);
|
||||
}}
|
||||
showSelectAll={false}
|
||||
options={preprocessOptions}
|
||||
></MultiSelect>
|
||||
)}
|
||||
</RAGFlowFormItem>
|
||||
{index < fieldLength - 1 && <Separator />}
|
||||
</section>
|
||||
);
|
||||
@ -192,6 +335,7 @@ const ParserForm = ({ node }: INextOperatorForm) => {
|
||||
llm_id: '',
|
||||
table_result_type: '',
|
||||
markdown_image_response_type: '',
|
||||
preprocess: [],
|
||||
});
|
||||
}, [append]);
|
||||
|
||||
|
||||
@ -214,6 +214,7 @@ function transformParserParams(params: ParserFormSchemaType) {
|
||||
ParserFormSchemaType['setups'][0] & { suffix: string[] }
|
||||
> = {
|
||||
output_format: cur.output_format,
|
||||
preprocess: cur.preprocess,
|
||||
suffix: FileTypeSuffixMap[cur.fileFormat as FileType],
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user