mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-04-28 14:27:49 +08:00
… ### What problem does this PR solve? Closes #13857 Parent-child chunking was introduced in v0.23.0 but is only configurable through the web UI. Users managing datasets programmatically cannot enable it via the HTTP API or Python SDK because `ParserConfig` uses `extra="forbid"`, rejecting the `children_delimiter` field at validation. ### What does this PR change? Adds a `parent_child` nested config to `ParserConfig`, following the same pattern as `raptor` and `graphrag`: ```json "parser_config": { "parent_child": { "use_parent_child": true, "children_delimiter": "\n" } } ``` - api/utils/validation_utils.py — new ParentChildConfig model, added to ParserConfig - api/utils/api_utils.py — naive defaults + flatten to children_delimiter for the execution layer - api/apps/services/dataset_api_service.py — flatten on the update path - test/testcases/configs.py — updated DEFAULT_PARSER_CONFIG - test/testcases/test_http_api/test_dataset_management/test_create_dataset.py — 4 valid + 2 invalid test cases No changes to the execution layer (rag/app/naive.py, rag/nlp/search.py). Existing UI flow via ext is unaffected. ### Type of change - [ ] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature (non-breaking change which adds functionality) - [ ] Documentation Update - [ ] Refactoring - [ ] Performance Improvement - [ ] Other (please describe): <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Added parent-child chunking configuration for dataset creation and updates with new `use_parent_child` toggle and customizable `children_delimiter` setting to specify how parent chunks are split into child chunks. * **Documentation** * Updated HTTP and Python API references with parent-child chunking configuration details and examples. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
75 lines
2.5 KiB
Python
75 lines
2.5 KiB
Python
#
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
import os
|
|
|
|
import pytest
|
|
|
|
# Base URL of the server under test; the HOST_ADDRESS environment variable
# overrides the local default.
HOST_ADDRESS = os.environ.get("HOST_ADDRESS", "http://127.0.0.1:9380")
# API version string (presumably used as a URL path segment; verify against callers).
VERSION = "v1"
|
|
# API key read from the environment. There is no fallback: when the variable
# is not defined at all, the test run is stopped as soon as this module is
# imported. An empty-but-defined value is not rejected here.
ZHIPU_AI_API_KEY = os.environ.get("ZHIPU_AI_API_KEY")
if ZHIPU_AI_API_KEY is None:
    pytest.exit("Error: Environment variable ZHIPU_AI_API_KEY must be set")
|
|
|
|
# Credentials of the account used by the tests.
EMAIL = "qa@infiniflow.org"
# password is "123"
# The value below is three lines of text separated by newline characters;
# the line breaks are part of the value and are spelled out explicitly.
# NOTE(review): presumably the encrypted/encoded form of the plain password
# expected by the login endpoint - confirm against the login helper.
PASSWORD = (
    "ctAseGvejiaSWWZ88T/m4FQVOpQyUvP+x7sXtdv3feqZACiQleuewkUi35E16wSd5C5QcnkkcV9cYc8TKPTRZlxappDuirxghxoOvFcJxFU4ixLsD\n"
    "fN33jCHRoDUW81IH9zjij/vaw8IbVyb6vuwg6MX6inOEBRRzVbRYxXOu1wkWY6SsI8X70oF9aeLFp/PzQpjoe/YbSqpTq8qqrmHzn9vO+yvyYyvmDsphXe\n"
    "X8f7fp9c7vUsfOCkM+gHY3PadG+QHa7KI7mzTKgUTZImK6BZtfRBATDTthEUbbaTewY4H0MnWiCeeDhcbeQao6cFy1To8pE3RpmxnGnS8BsBn8w=="
)
|
|
|
|
# Deliberately bogus values for negative test cases.
INVALID_API_TOKEN = "invalid_key_123"
INVALID_ID_32 = 32 * "0"  # a string of 32 zeros

# Maximum name lengths per resource type.
# NOTE(review): presumably mirrors the limits enforced by the server; the unit
# (characters vs. bytes) is not visible here - confirm before relying on it.
DATASET_NAME_LIMIT = 128
DOCUMENT_NAME_LIMIT = 255
CHAT_ASSISTANT_NAME_LIMIT = 255
SESSION_WITH_CHAT_NAME_LIMIT = 255
|
|
|
|
# Parser configuration the tests treat as the default for a dataset.
# NOTE(review): presumably has to stay in sync with the server-side defaults
# for the "naive" chunking method - confirm against api/utils/api_utils.py.
DEFAULT_PARSER_CONFIG = {
    # Flat, top-level chunking options.
    "layout_recognize": "DeepDOC",
    "chunk_token_num": 512,
    "delimiter": "\n",
    "auto_keywords": 0,
    "auto_questions": 0,
    "html4excel": False,
    "image_context_size": 0,
    "table_context_size": 0,
    "topn_tags": 3,
    "llm_id": "glm-4-flash@ZHIPU-AI",
    # Nested feature sections, each carrying its own "use_*" toggle.
    "raptor": {
        "use_raptor": True,
        "prompt": "Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n {cluster_content}\nThe above is the content you need to summarize.",
        "max_token": 256,
        "threshold": 0.1,
        "max_cluster": 64,
        "random_seed": 0,
    },
    "graphrag": {
        "use_graphrag": True,
        "entity_types": ["organization", "person", "geo", "event", "category"],
        "method": "light",
    },
    "parent_child": {
        "use_parent_child": False,
        "children_delimiter": "\n",
    },
    # Flat counterpart of the nested "parent_child" section.
    # NOTE(review): presumably empty because parent-child chunking is
    # disabled above - confirm against the server's flattening logic.
    "children_delimiter": "",
}
|