Fix: create dataset with chunk_method or pipeline (#13814)

### What problem does this PR solve?

Allow creating datasets with parse_type == 1/None and chunk_method, or
with parse_type == 2 and pipeline_id.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Lynn
2026-03-26 20:43:53 +08:00
committed by GitHub
parent 6a4a9debd2
commit 8d4a3d0dfe
2 changed files with 19 additions and 21 deletions

View File

@@ -217,6 +217,10 @@ async def update_dataset(tenant_id: str, dataset_id: str, req: dict):
elif "parser_config" in req and not req["parser_config"]:
del req["parser_config"]
if kb.pipeline_id and req.get("parser_id") and not req.get("pipeline_id"):
# shift to use parser_id, delete old pipeline_id
req["pipeline_id"] = ""
if "name" in req and req["name"].lower() != kb.name.lower():
exists = KnowledgebaseService.get_or_none(name=req["name"], tenant_id=tenant_id,
status=StatusEnum.VALID.value)
@@ -245,6 +249,8 @@ async def update_dataset(tenant_id: str, dataset_id: str, req: dict):
from rag.nlp import search
settings.docStoreConn.update({"exists": PAGERANK_FLD}, {"remove": PAGERANK_FLD},
search.index_name(kb.tenant_id), kb.id)
if "parse_type" in req:
del req["parse_type"]
if not KnowledgebaseService.update_by_id(kb.id, req):
return False, "Update dataset error.(Database error)"

View File

@@ -397,9 +397,9 @@ class CreateDatasetReq(Base):
description: Annotated[str | None, Field(default=None, max_length=65535)]
embedding_model: Annotated[str | None, Field(default=None, max_length=255, serialization_alias="embd_id")]
permission: Annotated[Literal["me", "team"], Field(default="me", min_length=1, max_length=16)]
chunk_method: Annotated[str | None, Field(default=None, serialization_alias="parser_id")]
parse_type: Annotated[int | None, Field(default=None, ge=0, le=64)]
pipeline_id: Annotated[str | None, Field(default=None, min_length=32, max_length=32, serialization_alias="pipeline_id")]
chunk_method: Annotated[str | None, Field(default=None, serialization_alias="parser_id")]
parser_config: Annotated[ParserConfig | None, Field(default=None)]
auto_metadata_config: Annotated[AutoMetadataConfig | None, Field(default=None)]
ext: Annotated[dict, Field(default={})]
@@ -409,16 +409,7 @@ class CreateDatasetReq(Base):
def handle_pipeline_id(cls, v: str | None, info: ValidationInfo):
if v is None:
return v
if info.data.get("chunk_method") is not None and isinstance(v, str):
v = None
return v
@field_validator("parse_type", mode="before")
@classmethod
def handle_parse_type(cls, v: int | None, info: ValidationInfo):
if v is None:
return v
if info.data.get("chunk_method") is not None and isinstance(v, int):
if info.data.get("parse_type", 0) == 1:
v = None
return v
@@ -633,11 +624,11 @@ class CreateDatasetReq(Base):
# Both provided → allow pipeline mode
return self
# parser_id provided (valid): MUST NOT have parse_type or pipeline_id
# parser_id provided (valid): parse_type MUST be one of [None, 1], and MUST NOT have pipeline_id
if isinstance(self.chunk_method, str):
if self.parse_type is not None or self.pipeline_id is not None:
invalid = []
if self.parse_type is not None:
invalid = []
if self.parse_type not in [None, 1] or self.pipeline_id is not None:
if self.parse_type not in [None, 1]:
invalid.append("parse_type")
if self.pipeline_id is not None:
invalid.append("pipeline_id")
@@ -650,20 +641,21 @@ class CreateDatasetReq(Base):
@field_validator("chunk_method", mode="wrap")
@classmethod
def validate_chunk_method(cls, v: Any, handler) -> Any:
def validate_chunk_method(cls, v: Any, handler, info: ValidationInfo) -> Any:
"""Wrap validation to unify error messages, including type errors (e.g. list)."""
allowed = {"naive", "book", "email", "laws", "manual", "one", "paper", "picture", "presentation", "qa", "table", "tag", "resume"}
allowed = {"naive", "book", "email", "laws", "manual", "one", "paper", "picture", "presentation", "qa", "table",
"tag", "resume"}
error_msg = "Input should be 'naive', 'book', 'email', 'laws', 'manual', 'one', 'paper', 'picture', 'presentation', 'qa', 'table', 'tag' or 'resume'"
# Omitted field: handler won't be invoked (wrap still gets value); None treated as explicit invalid
if v is None:
raise PydanticCustomError("literal_error", error_msg)
try:
# Run inner validation (type checking)
result = handler(v)
except Exception:
raise PydanticCustomError("literal_error", error_msg)
# Omitted field: handler won't be invoked (wrap still gets value); None treated as explicit invalid
if not result and not info.data.get("pipeline_id", None):
raise PydanticCustomError("literal_error", error_msg)
# After handler, enforce enumeration
if not isinstance(result, str) or result == "" or result not in allowed:
if result and result not in allowed:
raise PydanticCustomError("literal_error", error_msg)
return result