feat: pipeline add preprocess (#13302)

### What problem does this PR solve?

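feat: pipeline add preprocess. Adds a "Preprocess" multi-select to each parser setup in the pipeline form: "Main content" is always selected and cannot be removed, and further options (abstract, author, section title) are offered depending on the file type.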

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
This commit is contained in:
chanx
2026-03-02 11:50:48 +08:00
committed by GitHub
parent cf3d3c7c89
commit 0cdddea59a
6 changed files with 241 additions and 14 deletions

View File

@ -239,6 +239,31 @@ export const MultiSelect = React.forwardRef<
'options' in option ? option.options : [option],
);
}, [options]);
// Set of the values belonging to options that are marked `disabled`.
const disabledValueSet = React.useMemo(
  () =>
    new Set(
      flatOptions.flatMap((option) => (option.disabled ? [option.value] : [])),
    ),
  [flatOptions],
);
/**
 * Returns `values` with every currently selected disabled value merged in.
 * Disabled selections come first (in selection order), followed by `values`;
 * duplicates are dropped, keeping the first occurrence.
 */
const preserveDisabledValues = React.useCallback(
  (values: string[]) => {
    const merged = new Set<string>();
    for (const selected of selectedValues) {
      if (disabledValueSet.has(selected)) {
        merged.add(selected);
      }
    }
    for (const value of values) {
      merged.add(value);
    }
    return Array.from(merged);
  },
  [disabledValueSet, selectedValues],
);
// A value may be removed only when it does not belong to a disabled option.
const canRemoveValue = React.useCallback(
  (candidate: string) => disabledValueSet.has(candidate) === false,
  [disabledValueSet],
);
const handleInputKeyDown = (
event: React.KeyboardEvent<HTMLInputElement>,
) => {
@ -246,13 +271,26 @@ export const MultiSelect = React.forwardRef<
setIsPopoverOpen(true);
} else if (event.key === 'Backspace' && !event.currentTarget.value) {
const newSelectedValues = [...selectedValues];
newSelectedValues.pop();
const removableIndex = [...newSelectedValues]
.reverse()
.findIndex((value) => canRemoveValue(value));
if (removableIndex < 0) {
return;
}
newSelectedValues.splice(
newSelectedValues.length - 1 - removableIndex,
1,
);
setSelectedValues(newSelectedValues);
onValueChange(newSelectedValues);
}
};
const toggleOption = (option: string) => {
if (disabledValueSet.has(option)) {
return;
}
const newSelectedValues = selectedValues.includes(option)
? selectedValues.filter((value) => value !== option)
: [...selectedValues, option];
@ -261,8 +299,9 @@ export const MultiSelect = React.forwardRef<
};
/**
 * Clears the selection, except for selected values whose option is disabled.
 * State and the parent are updated exactly once, with the same array, so the
 * parent never observes an intermediate empty selection.
 */
const handleClear = () => {
  const nextValues = preserveDisabledValues([]);
  setSelectedValues(nextValues);
  onValueChange(nextValues);
};
const handleTogglePopover = () => {
@ -270,7 +309,9 @@ export const MultiSelect = React.forwardRef<
};
/**
 * Trims the selection to its first `maxCount` values, then merges back any
 * selected disabled value, so a trim can never drop a locked selection.
 */
const clearExtraOptions = () => {
  const newSelectedValues = preserveDisabledValues(
    selectedValues.slice(0, maxCount),
  );
  setSelectedValues(newSelectedValues);
  onValueChange(newSelectedValues);
};
@ -279,7 +320,9 @@ export const MultiSelect = React.forwardRef<
if (selectedValues.length === flatOptions.length) {
handleClear();
} else {
const allValues = flatOptions.map((option) => option.value);
const allValues = preserveDisabledValues(
flatOptions.map((option) => option.value),
);
setSelectedValues(allValues);
onValueChange(allValues);
}
@ -325,13 +368,15 @@ export const MultiSelect = React.forwardRef<
<div className="max-w-28 text-ellipsis overflow-hidden">
{option?.label}
</div>
<XCircle
className="h-4 w-4 cursor-pointer"
onClick={(event) => {
event.stopPropagation();
toggleOption(value);
}}
/>
{canRemoveValue(value) && (
<XCircle
className="h-4 w-4 cursor-pointer"
onClick={(event) => {
event.stopPropagation();
toggleOption(value);
}}
/>
)}
</div>
</Badge>
);

View File

@ -1429,6 +1429,13 @@ Example: Virtual Hosted Style`,
pleaseUploadAtLeastOneFile: 'Please upload at least one file',
},
flow: {
preprocess: {
preprocess: 'Preprocess',
mainContent: 'Main content',
abstract: 'Abstract',
author: 'Author',
sectionTitle: 'Section title',
},
autoPlay: 'Auto play audio',
downloadFileTypeTip: 'The file type to download',
downloadFileType: 'Download file type',

View File

@ -1209,6 +1209,13 @@ General实体和关系提取提示来自 GitHub - microsoft/graphrag基于
pleaseUploadAtLeastOneFile: '请上传至少一个文件',
},
flow: {
preprocess: {
preprocess: '预处理',
mainContent: '主内容',
abstract: '摘要',
author: '作者',
sectionTitle: '章节标题',
},
autoPlay: '自动播放',
downloadFileTypeTip: '文件下载的类型',
downloadFileType: '文件类型',

View File

@ -154,6 +154,22 @@ export enum StringTransformDelimiter {
Space = ' ',
}
/**
 * String values used for the `preprocess` setting of a parser setup.
 */
export enum PreprocessValue {
main_content = 'main_content',
// NOTE: member name and string differ here — the emitted value is 'title'.
section_title = 'title',
abstract = 'abstract',
author = 'author',
}
/** Alias for `PreprocessValue.main_content`, the default `preprocess` value of every initial parser setup. */
export const MAIN_CONTENT_PREPROCESS_VALUE: PreprocessValue =
PreprocessValue.main_content;
/**
 * Label key for each preprocess value. The keys are written as enum members
 * rather than raw strings so the map follows the enum if a value is renamed.
 * NOTE(review): the label keys appear to match the entries under
 * `flow.preprocess` in the locale files — confirm against the consumers.
 */
export const PreprocessLabelKeyMap: Record<PreprocessValue, string> = {
  [PreprocessValue.main_content]: 'mainContent',
  [PreprocessValue.section_title]: 'sectionTitle',
  [PreprocessValue.abstract]: 'abstract',
  [PreprocessValue.author]: 'author',
};
export const initialParserValues = {
outputs: {
markdown: { type: 'string', value: '' },
@ -166,35 +182,42 @@ export const initialParserValues = {
fileFormat: FileType.PDF,
output_format: PdfOutputFormat.Json,
parse_method: ParseDocumentType.DeepDOC,
preprocess: PreprocessValue.main_content,
},
{
fileFormat: FileType.Spreadsheet,
output_format: SpreadsheetOutputFormat.Html,
parse_method: ParseDocumentType.DeepDOC,
preprocess: PreprocessValue.main_content,
},
{
fileFormat: FileType.Image,
output_format: ImageOutputFormat.Text,
parse_method: ImageParseMethod.OCR,
preprocess: PreprocessValue.main_content,
system_prompt: '',
},
{
fileFormat: FileType.Email,
fields: Object.values(ParserFields),
output_format: EmailOutputFormat.Text,
preprocess: PreprocessValue.main_content,
},
{
fileFormat: FileType.TextMarkdown,
output_format: TextMarkdownOutputFormat.Text,
preprocess: PreprocessValue.main_content,
},
{
fileFormat: FileType.Docx,
output_format: DocxOutputFormat.Json,
preprocess: PreprocessValue.main_content,
},
{
fileFormat: FileType.PowerPoint,
output_format: PptOutputFormat.Json,
parse_method: ParseDocumentType.DeepDOC,
preprocess: PreprocessValue.main_content,
},
],
};

View File

@ -5,16 +5,17 @@ import {
import { RAGFlowFormItem } from '@/components/ragflow-form';
import { BlockButton, Button } from '@/components/ui/button';
import { Form } from '@/components/ui/form';
import { MultiSelect } from '@/components/ui/multi-select';
import { Separator } from '@/components/ui/separator';
import { cn } from '@/lib/utils';
import { buildOptions } from '@/utils/form';
import { zodResolver } from '@hookform/resolvers/zod';
import { useHover } from 'ahooks';
import { Trash2 } from 'lucide-react';
import { memo, useCallback, useMemo, useRef } from 'react';
import { memo, useCallback, useEffect, useMemo, useRef } from 'react';
import {
UseFieldArrayRemove,
useFieldArray,
UseFieldArrayRemove,
useForm,
useFormContext,
} from 'react-hook-form';
@ -24,6 +25,8 @@ import {
FileType,
InitialOutputFormatMap,
initialParserValues,
MAIN_CONTENT_PREPROCESS_VALUE,
PreprocessValue,
} from '../../constant/pipeline';
import { useFormValues } from '../../hooks/use-form-values';
import { useWatchFormChange } from '../../hooks/use-watch-form-change';
@ -41,6 +44,74 @@ import { AudioFormFields, VideoFormFields } from './video-form-fields';
// Output list built once at module load from the parser's default `outputs`.
const outputList = buildOutputList(initialParserValues.outputs);
/**
 * One selectable preprocess option for a file type.
 * A `required` option is always part of the normalized value and is passed
 * to the select as a disabled option.
 */
type PreprocessOptionConfig = {
value: PreprocessValue;
required?: boolean;
};
// Fallback option list: used when no file type is given or the file type has
// no entry of its own. Only the required main-content option is offered.
const DefaultPreprocessOptionConfigs: PreprocessOptionConfig[] = [
{ value: MAIN_CONTENT_PREPROCESS_VALUE, required: true },
];
/**
 * Builds an option list that starts with the required main-content option,
 * followed by the given optional values in the order supplied.
 */
const buildPreprocessOptionConfigs = (
  ...optionalValues: PreprocessValue[]
): PreprocessOptionConfig[] => [
  { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true },
  ...optionalValues.map((value) => ({ value })),
];

/**
 * Preprocess options offered per file type. File types without an entry fall
 * back to the default list (main content only).
 */
const PreprocessOptionConfigsMap: Partial<
  Record<FileType, PreprocessOptionConfig[]>
> = {
  [FileType.PDF]: buildPreprocessOptionConfigs(
    PreprocessValue.abstract,
    PreprocessValue.author,
    PreprocessValue.section_title,
  ),
  [FileType.PowerPoint]: buildPreprocessOptionConfigs(),
  [FileType.Spreadsheet]: buildPreprocessOptionConfigs(),
  [FileType.TextMarkdown]: buildPreprocessOptionConfigs(
    PreprocessValue.section_title,
  ),
  [FileType.Docx]: buildPreprocessOptionConfigs(PreprocessValue.section_title),
};
/**
 * Returns the preprocess options for `fileType`, or the default list when the
 * file type is missing or has no dedicated entry.
 */
function getPreprocessOptionConfigs(fileType?: FileType) {
  return fileType
    ? (PreprocessOptionConfigsMap[fileType] ?? DefaultPreprocessOptionConfigs)
    : DefaultPreprocessOptionConfigs;
}
/**
 * Normalizes a preprocess selection for the given file type.
 *
 * The result starts with every required value of that file type, followed by
 * the entries of `values` that the file type allows, in their original order
 * and without duplicates. A non-array `values` is treated as empty.
 */
function normalizePreprocessValuesByFileType(
  fileType: FileType | undefined,
  values: string[] | undefined,
) {
  const configs = getPreprocessOptionConfigs(fileType);
  const allowed = new Set<string>(configs.map((config) => config.value));
  const normalized = new Set<PreprocessValue>();

  // Required values always lead the result.
  for (const config of configs) {
    if (config.required) {
      normalized.add(config.value);
    }
  }

  if (Array.isArray(values)) {
    for (const candidate of values) {
      if (allowed.has(candidate)) {
        normalized.add(candidate as PreprocessValue);
      }
    }
  }

  return Array.from(normalized);
}
/**
 * Order-sensitive equality of two string arrays.
 * A missing `a` is never equal to `b`, not even to an empty array.
 */
function isSameStringArray(a: string[] | undefined, b: string[]) {
  if (!a) {
    return false;
  }
  if (a.length !== b.length) {
    return false;
  }
  // Equal exactly when no position holds differing items.
  return !a.some((item, position) => item !== b[position]);
}
const FileFormatWidgetMap = {
[FileType.PDF]: PdfFormFields,
[FileType.Spreadsheet]: SpreadsheetFormFields,
@ -63,6 +134,7 @@ export const FormSchema = z.object({
setups: z.array(
z.object({
fileFormat: z.string().nullish(),
preprocess: z.array(z.string()).optional(),
output_format: z.string().optional(),
parse_method: z.string().optional(),
lang: z.string().optional(),
@ -121,6 +193,57 @@ function ParserItem({
[form, index],
);
// Writes a new preprocess selection into this setup's form field, marking the
// field dirty and touched and re-running validation.
const handlePreprocessChange = useCallback(
  (nextPreprocess: PreprocessValue[]) => {
    const updateOptions = {
      shouldDirty: true,
      shouldValidate: true,
      shouldTouch: true,
    };
    form.setValue(`setups.${index}.preprocess`, nextPreprocess, updateOptions);
  },
  [form, index],
);
/**
 * Select options for the current file format: one entry per configured
 * preprocess value, with required values passed as disabled options.
 * Recomputed when the file format or the translation function changes.
 */
const preprocessOptions = useMemo(() => {
  // Built once per recomputation rather than once per option: the translated
  // labels do not depend on the option being mapped.
  const labelMap: Record<string, string> = {
    [MAIN_CONTENT_PREPROCESS_VALUE]: t('flow.preprocess.mainContent'),
    [PreprocessValue.section_title]: t('flow.preprocess.sectionTitle'),
    [PreprocessValue.abstract]: t('flow.preprocess.abstract'),
    [PreprocessValue.author]: t('flow.preprocess.author'),
  };

  return getPreprocessOptionConfigs(fileFormat as FileType).map(
    ({ value, required }) => ({
      value,
      disabled: required,
      // Fall back to the raw value when no (non-empty) label is available.
      label: labelMap[value] || value,
    }),
  );
}, [fileFormat, t]);
// Whenever the file format changes, make the stored preprocess selection
// valid for that format. The field is written only when the normalized value
// differs from the stored one, and the write does not mark the field dirty.
useEffect(() => {
  const stored = form.getValues(`setups.${index}.preprocess`) as
    | string[]
    | undefined;
  const normalized = normalizePreprocessValuesByFileType(
    fileFormat as FileType,
    stored,
  );

  if (isSameStringArray(stored, normalized)) {
    return;
  }

  form.setValue(`setups.${index}.preprocess`, normalized, {
    shouldDirty: false,
    shouldValidate: true,
  });
}, [fileFormat, form, index]);
return (
<section
className={cn('space-y-5 py-2.5 rounded-md', {
@ -159,6 +282,26 @@ function ParserItem({
fileType={fileFormat as FileType}
/>
</div>
<RAGFlowFormItem
name={buildFieldNameWithPrefix(`preprocess`, prefix)}
label={t('flow.preprocess.preprocess')}
>
{(field) => (
<MultiSelect
value={field.value || []}
onValueChange={(val) => {
const nextValues = normalizePreprocessValuesByFileType(
fileFormat as FileType,
val,
);
field.onChange(nextValues);
handlePreprocessChange(nextValues);
}}
showSelectAll={false}
options={preprocessOptions}
></MultiSelect>
)}
</RAGFlowFormItem>
{index < fieldLength - 1 && <Separator />}
</section>
);
@ -192,6 +335,7 @@ const ParserForm = ({ node }: INextOperatorForm) => {
llm_id: '',
table_result_type: '',
markdown_image_response_type: '',
preprocess: [],
});
}, [append]);

View File

@ -214,6 +214,7 @@ function transformParserParams(params: ParserFormSchemaType) {
ParserFormSchemaType['setups'][0] & { suffix: string[] }
> = {
output_format: cur.output_format,
preprocess: cur.preprocess,
suffix: FileTypeSuffixMap[cur.fileFormat as FileType],
};