diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py index e6ceb66e6..c7dc45b00 100644 --- a/api/apps/chunk_app.py +++ b/api/apps/chunk_app.py @@ -13,401 +13,35 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import base64 -import datetime import json -import logging -import re -import xxhash + from quart import request -from api.db.services.document_service import DocumentService +from api.apps import current_user, login_required +from api.db.joint_services.tenant_model_service import ( + get_model_config_by_id, + get_model_config_by_type_and_name, + get_tenant_default_model_by_type, +) from api.db.services.doc_metadata_service import DocMetadataService -from api.utils.image_utils import store_chunk_image +from api.db.services.document_service import DocumentService from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.llm_service import LLMBundle -from common.metadata_utils import apply_meta_data_filter from api.db.services.search_service import SearchService from api.db.services.user_service import UserTenantService -from api.db.joint_services.tenant_model_service import get_model_config_by_id, get_tenant_default_model_by_type, get_model_config_by_type_and_name from api.utils.api_utils import ( get_data_error_result, get_json_result, + get_request_json, server_error_response, validate_request, - get_request_json, ) -from common.misc_utils import thread_pool_exec -from common.tag_feature_utils import validate_tag_features -from rag.app.qa import beAdoc, rmPrefix -from rag.app.tag import label_question -from rag.nlp import rag_tokenizer, search -from rag.prompts.generator import cross_languages, keyword_extraction -from common.string_utils import is_content_empty, remove_redundant_spaces -from common.constants import RetCode, LLMType, ParserType, PAGERANK_FLD from common import settings -from api.apps import login_required, current_user - 
-@manager.route('/list', methods=['POST']) # noqa: F821 -@login_required -@validate_request("doc_id") -async def list_chunk(): - req = await get_request_json() - doc_id = req["doc_id"] - page = int(req.get("page", 1)) - size = int(req.get("size", 30)) - question = req.get("keywords", "") - try: - tenant_id = DocumentService.get_tenant_id(req["doc_id"]) - if not tenant_id: - return get_data_error_result(message="Tenant not found!") - e, doc = DocumentService.get_by_id(doc_id) - if not e: - return get_data_error_result(message="Document not found!") - kb_ids = KnowledgebaseService.get_kb_ids(tenant_id) - query = { - "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True - } - if "available_int" in req: - query["available_int"] = int(req["available_int"]) - sres = await settings.retriever.search(query, search.index_name(tenant_id), kb_ids, highlight=["content_ltks"]) - res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()} - for id in sres.ids: - d = { - "chunk_id": id, - "content_with_weight": remove_redundant_spaces(sres.highlight[id]) if question and id in sres.highlight else sres.field[ - id].get( - "content_with_weight", ""), - "doc_id": sres.field[id]["doc_id"], - "docnm_kwd": sres.field[id]["docnm_kwd"], - "important_kwd": sres.field[id].get("important_kwd", []), - "question_kwd": sres.field[id].get("question_kwd", []), - "image_id": sres.field[id].get("img_id", ""), - "available_int": int(sres.field[id].get("available_int", 1)), - "positions": sres.field[id].get("position_int", []), - "doc_type_kwd": sres.field[id].get("doc_type_kwd") - } - assert isinstance(d["positions"], list) - assert len(d["positions"]) == 0 or (isinstance(d["positions"][0], list) and len(d["positions"][0]) == 5) - res["chunks"].append(d) - return get_json_result(data=res) - except Exception as e: - if str(e).find("not_found") > 0: - return get_json_result(data=False, message='No chunk found!', - code=RetCode.DATA_ERROR) - return 
server_error_response(e) - - -@manager.route('/get', methods=['GET']) # noqa: F821 -@login_required -def get(): - chunk_id = request.args["chunk_id"] - try: - chunk = None - tenants = UserTenantService.query(user_id=current_user.id) - if not tenants: - return get_data_error_result(message="Tenant not found!") - for tenant in tenants: - kb_ids = KnowledgebaseService.get_kb_ids(tenant.tenant_id) - chunk = settings.docStoreConn.get(chunk_id, search.index_name(tenant.tenant_id), kb_ids) - if chunk: - break - if chunk is None: - return server_error_response(Exception("Chunk not found")) - - k = [] - for n in chunk.keys(): - if re.search(r"(_vec$|_sm_|_tks|_ltks)", n): - k.append(n) - for n in k: - del chunk[n] - - return get_json_result(data=chunk) - except Exception as e: - if str(e).find("NotFoundError") >= 0: - return get_json_result(data=False, message='Chunk not found!', - code=RetCode.DATA_ERROR) - return server_error_response(e) - - -@manager.route('/set', methods=['POST']) # noqa: F821 -@login_required -@validate_request("doc_id", "chunk_id", "content_with_weight") -async def set(): - req = await get_request_json() - content_with_weight = req["content_with_weight"] - if not isinstance(content_with_weight, (str, bytes)): - raise TypeError("expected string or bytes-like object") - if isinstance(content_with_weight, bytes): - content_with_weight = content_with_weight.decode("utf-8", errors="ignore") - if is_content_empty(content_with_weight): - return get_data_error_result(message="`content_with_weight` is required") - d = { - "id": req["chunk_id"], - "content_with_weight": content_with_weight} - d["content_ltks"] = rag_tokenizer.tokenize(content_with_weight) - d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) - if "important_kwd" in req: - if not isinstance(req["important_kwd"], list): - return get_data_error_result(message="`important_kwd` should be a list") - d["important_kwd"] = req["important_kwd"] - d["important_tks"] = 
rag_tokenizer.tokenize(" ".join(req["important_kwd"])) - if "question_kwd" in req: - if not isinstance(req["question_kwd"], list): - return get_data_error_result(message="`question_kwd` should be a list") - d["question_kwd"] = req["question_kwd"] - d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["question_kwd"])) - if "tag_kwd" in req: - if not isinstance(req["tag_kwd"], list): - return get_data_error_result(message="`tag_kwd` should be a list") - if not all(isinstance(t, str) for t in req["tag_kwd"]): - return get_data_error_result(message="`tag_kwd` must be a list of strings") - d["tag_kwd"] = req["tag_kwd"] - if "tag_feas" in req: - try: - d["tag_feas"] = validate_tag_features(req["tag_feas"]) - except ValueError as exc: - return get_data_error_result(message=f"`tag_feas` {exc}") - if "available_int" in req: - d["available_int"] = req["available_int"] - - try: - def _set_sync(): - tenant_id = DocumentService.get_tenant_id(req["doc_id"]) - if not tenant_id: - return get_data_error_result(message="Tenant not found!") - - e, doc = DocumentService.get_by_id(req["doc_id"]) - if not e: - return get_data_error_result(message="Document not found!") - - tenant_embd_id = DocumentService.get_tenant_embd_id(req["doc_id"]) - if tenant_embd_id: - embd_model_config = get_model_config_by_id(tenant_embd_id) - else: - embd_id = DocumentService.get_embd_id(req["doc_id"]) - if embd_id: - embd_model_config = get_model_config_by_type_and_name(tenant_id, LLMType.EMBEDDING, embd_id) - else: - embd_model_config = get_tenant_default_model_by_type(tenant_id, LLMType.EMBEDDING) - embd_mdl = LLMBundle(tenant_id, embd_model_config) - - _d = d - if doc.parser_id == ParserType.QA: - arr = [ - t for t in re.split( - r"[\n\t]", - req["content_with_weight"]) if len(t) > 1] - q, a = rmPrefix(arr[0]), rmPrefix("\n".join(arr[1:])) - _d = beAdoc(d, q, a, not any( - [rag_tokenizer.is_chinese(t) for t in q + a])) - - v, c = embd_mdl.encode([doc.name, content_with_weight if not 
_d.get("question_kwd") else "\n".join(_d["question_kwd"])]) - v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1] - _d["q_%d_vec" % len(v)] = v.tolist() - settings.docStoreConn.update({"id": req["chunk_id"]}, _d, search.index_name(tenant_id), doc.kb_id) - - # update image - image_base64 = req.get("image_base64", None) - img_id = req.get("img_id", "") - if image_base64 and img_id and "-" in img_id: - bkt, name = img_id.split("-", 1) - image_binary = base64.b64decode(image_base64) - settings.STORAGE_IMPL.put(bkt, name, image_binary) - return get_json_result(data=True) - - return await thread_pool_exec(_set_sync) - except Exception as e: - return server_error_response(e) - - -@manager.route('/switch', methods=['POST']) # noqa: F821 -@login_required -@validate_request("chunk_ids", "available_int", "doc_id") -async def switch(): - req = await get_request_json() - try: - def _switch_sync(): - e, doc = DocumentService.get_by_id(req["doc_id"]) - if not e: - return get_data_error_result(message="Document not found!") - for cid in req["chunk_ids"]: - if not settings.docStoreConn.update({"id": cid}, - {"available_int": int(req["available_int"])}, - search.index_name(DocumentService.get_tenant_id(req["doc_id"])), - doc.kb_id): - return get_data_error_result(message="Index updating failure") - return get_json_result(data=True) - - return await thread_pool_exec(_switch_sync) - except Exception as e: - return server_error_response(e) - - -@manager.route('/rm', methods=['POST']) # noqa: F821 -@login_required -@validate_request("doc_id") -async def rm(): - req = await get_request_json() - try: - def _rm_sync(): - deleted_chunk_ids = req.get("chunk_ids") - if isinstance(deleted_chunk_ids, list): - unique_chunk_ids = list(dict.fromkeys(deleted_chunk_ids)) - has_ids = len(unique_chunk_ids) > 0 - elif deleted_chunk_ids is not None: - unique_chunk_ids = [deleted_chunk_ids] - has_ids = deleted_chunk_ids not in (None, "") - else: - unique_chunk_ids = [] - has_ids = 
False - if not has_ids: - if req.get("delete_all") is True: - e, doc = DocumentService.get_by_id(req["doc_id"]) - if not e: - return get_data_error_result(message="Document not found!") - tenant_id = DocumentService.get_tenant_id(req["doc_id"]) - # Clean up storage assets while index rows still exist for discovery - DocumentService.delete_chunk_images(doc, tenant_id) - condition = {"doc_id": req["doc_id"]} - try: - deleted_count = settings.docStoreConn.delete(condition, search.index_name(tenant_id), doc.kb_id) - except Exception: - return get_data_error_result(message="Chunk deleting failure") - if deleted_count > 0: - DocumentService.decrement_chunk_num(doc.id, doc.kb_id, 1, deleted_count, 0) - return get_json_result(data=True) - return get_json_result(data=True) - - e, doc = DocumentService.get_by_id(req["doc_id"]) - if not e: - return get_data_error_result(message="Document not found!") - condition = {"id": req["chunk_ids"], "doc_id": req["doc_id"]} - try: - deleted_count = settings.docStoreConn.delete(condition, - search.index_name(DocumentService.get_tenant_id(req["doc_id"])), - doc.kb_id) - except Exception: - return get_data_error_result(message="Chunk deleting failure") - if has_ids and deleted_count == 0: - return get_data_error_result(message="Index updating failure") - if deleted_count > 0 and deleted_count < len(unique_chunk_ids): - deleted_count += settings.docStoreConn.delete({"doc_id": req["doc_id"]}, - search.index_name(DocumentService.get_tenant_id(req["doc_id"])), - doc.kb_id) - chunk_number = deleted_count - DocumentService.decrement_chunk_num(doc.id, doc.kb_id, 1, chunk_number, 0) - for cid in deleted_chunk_ids: - if settings.STORAGE_IMPL.obj_exist(doc.kb_id, cid): - settings.STORAGE_IMPL.rm(doc.kb_id, cid) - return get_json_result(data=True) - - return await thread_pool_exec(_rm_sync) - except Exception as e: - return server_error_response(e) - - -@manager.route('/create', methods=['POST']) # noqa: F821 -@login_required 
-@validate_request("doc_id", "content_with_weight") -async def create(): - req = await get_request_json() - req_id = request.headers.get("X-Request-ID") - chunck_id = xxhash.xxh64((req["content_with_weight"] + req["doc_id"]).encode("utf-8")).hexdigest() - d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]), - "content_with_weight": req["content_with_weight"]} - d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) - d["important_kwd"] = req.get("important_kwd", []) - if not isinstance(d["important_kwd"], list): - return get_data_error_result(message="`important_kwd` is required to be a list") - d["important_tks"] = rag_tokenizer.tokenize(" ".join(d["important_kwd"])) - d["question_kwd"] = req.get("question_kwd", []) - if not isinstance(d["question_kwd"], list): - return get_data_error_result(message="`question_kwd` is required to be a list") - d["question_tks"] = rag_tokenizer.tokenize("\n".join(d["question_kwd"])) - d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19] - d["create_timestamp_flt"] = datetime.datetime.now().timestamp() - if "tag_kwd" in req: - if not isinstance(req["tag_kwd"], list): - return get_data_error_result(message="`tag_kwd` is required to be a list") - if not all(isinstance(t, str) for t in req["tag_kwd"]): - return get_data_error_result(message="`tag_kwd` must be a list of strings") - d["tag_kwd"] = req["tag_kwd"] - if "tag_feas" in req: - try: - d["tag_feas"] = validate_tag_features(req["tag_feas"]) - except ValueError as exc: - return get_data_error_result(message=f"`tag_feas` {exc}") - image_base64 = req.get("image_base64", None) - - try: - def _log_response(resp, code, message): - logging.info( - "chunk_create response req_id=%s status=%s code=%s message=%s", - req_id, - getattr(resp, "status_code", None), - code, - message, - ) - - def _create_sync(): - e, doc = DocumentService.get_by_id(req["doc_id"]) - if not e: - resp = 
get_data_error_result(message="Document not found!") - _log_response(resp, RetCode.DATA_ERROR, "Document not found!") - return resp - d["kb_id"] = [doc.kb_id] - d["docnm_kwd"] = doc.name - d["title_tks"] = rag_tokenizer.tokenize(doc.name) - d["doc_id"] = doc.id - - tenant_id = DocumentService.get_tenant_id(req["doc_id"]) - if not tenant_id: - resp = get_data_error_result(message="Tenant not found!") - _log_response(resp, RetCode.DATA_ERROR, "Tenant not found!") - return resp - - e, kb = KnowledgebaseService.get_by_id(doc.kb_id) - if not e: - resp = get_data_error_result(message="Knowledgebase not found!") - _log_response(resp, RetCode.DATA_ERROR, "Knowledgebase not found!") - return resp - if kb.pagerank: - d[PAGERANK_FLD] = kb.pagerank - - tenant_embd_id = DocumentService.get_tenant_embd_id(req["doc_id"]) - if tenant_embd_id: - embd_model_config = get_model_config_by_id(tenant_embd_id) - else: - embd_id = DocumentService.get_embd_id(req["doc_id"]) - if embd_id: - embd_model_config = get_model_config_by_type_and_name(tenant_id, LLMType.EMBEDDING, embd_id) - else: - embd_model_config = get_tenant_default_model_by_type(tenant_id, LLMType.EMBEDDING) - embd_mdl = LLMBundle(tenant_id, embd_model_config) - - if image_base64: - d["img_id"] = "{}-{}".format(doc.kb_id, chunck_id) - d["doc_type_kwd"] = "image" - - v, c = embd_mdl.encode([doc.name, req["content_with_weight"] if not d["question_kwd"] else "\n".join(d["question_kwd"])]) - v = 0.1 * v[0] + 0.9 * v[1] - d["q_%d_vec" % len(v)] = v.tolist() - settings.docStoreConn.insert([d], search.index_name(tenant_id), doc.kb_id) - - if image_base64: - store_chunk_image(doc.kb_id, chunck_id, base64.b64decode(image_base64)) - - DocumentService.increment_chunk_num( - doc.id, doc.kb_id, c, 1, 0) - resp = get_json_result(data={"chunk_id": chunck_id, "image_id": d.get("img_id", "")}) - _log_response(resp, RetCode.SUCCESS, "success") - return resp - - return await thread_pool_exec(_create_sync) - except Exception as e: - 
logging.info("chunk_create exception req_id=%s error=%r", req_id, e) - return server_error_response(e) +from common.constants import LLMType, RetCode +from common.metadata_utils import apply_meta_data_filter +from rag.app.tag import label_question +from rag.nlp import search +from rag.prompts.generator import cross_languages, keyword_extraction @manager.route('/retrieval_test', methods=['POST']) # noqa: F821 diff --git a/api/apps/restful_apis/chunk_api.py b/api/apps/restful_apis/chunk_api.py new file mode 100644 index 000000000..13b5cb580 --- /dev/null +++ b/api/apps/restful_apis/chunk_api.py @@ -0,0 +1,445 @@ +# +# Copyright 2026 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#
import base64
import datetime
import re

import xxhash
from pydantic import BaseModel, Field, validator
from quart import request

from api.apps import login_required
from api.db.joint_services.tenant_model_service import (
    get_model_config_by_id,
    get_model_config_by_type_and_name,
)
from api.db.services.document_service import DocumentService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.tenant_llm_service import TenantLLMService
from api.utils.api_utils import (
    add_tenant_id_to_kwargs,
    check_duplicate_ids,
    get_error_data_result,
    get_request_json,
    get_result,
    server_error_response,
)
from api.utils.image_utils import store_chunk_image
from common import settings
from common.constants import LLMType, ParserType, RetCode
from common.misc_utils import thread_pool_exec
from common.string_utils import is_content_empty, remove_redundant_spaces
from common.tag_feature_utils import validate_tag_features
from rag.app.qa import beAdoc, rmPrefix
from rag.nlp import rag_tokenizer, search

# Stored fields that are stripped before a raw chunk is returned to a client
# (e.g. the `q_<dim>_vec` vector and the `*_tks` / `*_ltks` token fields that
# the handlers below write into the doc store).
_RUNTIME_FIELD_RE = re.compile(r"(_vec$|_sm_|_tks|_ltks)")


class Chunk(BaseModel):
    """Shape check for a chunk as exposed by this API.

    Handlers instantiate it only for validation (``_ = Chunk(**payload)``);
    the instance itself is discarded.
    """

    id: str = ""
    content: str = ""
    document_id: str = ""
    docnm_kwd: str = ""
    important_keywords: list = Field(default_factory=list)
    tag_kwd: list = Field(default_factory=list)
    questions: list = Field(default_factory=list)
    question_tks: str = ""
    image_id: str = ""
    available: bool = True
    positions: list[list[int]] = Field(default_factory=list)

    @validator("positions")
    def validate_positions(cls, value):
        """Reject any position entry that does not have exactly five items."""
        for sublist in value:
            if len(sublist) != 5:
                raise ValueError("Each sublist in positions must have a length of 5")
        return value


def _map_doc(doc):
    """Return ``doc.to_dict()`` with storage field names renamed to API names.

    The ``run`` value is additionally translated from its stored code to a
    status label; an unknown code maps to ``None``.
    """
    key_mapping = {
        "chunk_num": "chunk_count",
        "kb_id": "dataset_id",
        "token_num": "token_count",
        "parser_id": "chunk_method",
    }
    run_mapping = {
        "0": "UNSTART",
        "1": "RUNNING",
        "2": "CANCEL",
        "3": "DONE",
        "4": "FAIL",
    }
    renamed_doc = {}
    for key, value in doc.to_dict().items():
        renamed_doc[key_mapping.get(key, key)] = value
        if key == "run":
            renamed_doc["run"] = run_mapping.get(str(value))
    return renamed_doc


def _strip_chunk_runtime_fields(chunk):
    """Delete fields matching ``_RUNTIME_FIELD_RE`` from ``chunk`` in place and return it."""
    # Collect the names first: deleting while iterating the dict would raise.
    for name in [name for name in chunk.keys() if _RUNTIME_FIELD_RE.search(name)]:
        del chunk[name]
    return chunk


def _embedding_model_for(tenant_id, document_id):
    """Build the embedding model instance used for chunks of ``document_id``.

    Uses the document's tenant-level embedding id when one is set, otherwise
    looks the model up by type and the document's embedding name.
    """
    tenant_embd_id = DocumentService.get_tenant_embd_id(document_id)
    if tenant_embd_id:
        model_config = get_model_config_by_id(tenant_embd_id)
    else:
        embd_id = DocumentService.get_embd_id(document_id)
        # NOTE(review): the handlers removed from chunk_app.py fell back to the
        # tenant default embedding model when `embd_id` was empty - confirm
        # whether that fallback is still needed here.
        model_config = get_model_config_by_type_and_name(tenant_id, LLMType.EMBEDDING.value, embd_id)
    return TenantLLMService.model_instance(model_config)


@manager.route("/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["GET"])  # noqa: F821
@login_required
@add_tenant_id_to_kwargs
async def list_chunks(tenant_id, dataset_id, document_id):
    """List the chunks of a document, or fetch a single one via ``?id=``.

    Query arguments read: ``page`` (default 1), ``page_size`` (default 30),
    ``keywords``, ``available`` (only the literal ``"true"`` means available)
    and ``id``.
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    doc = DocumentService.query(id=document_id, kb_id=dataset_id)
    if not doc:
        return get_error_data_result(message=f"You don't own the document {document_id}.")
    doc = doc[0]
    req = request.args
    page = int(req.get("page", 1))
    size = int(req.get("page_size", 30))
    question = req.get("keywords", "")
    query = {
        "doc_ids": [document_id],
        "page": page,
        "size": size,
        "question": question,
        "sort": True,
    }
    if "available" in req:
        query["available_int"] = 1 if req["available"] == "true" else 0

    res = {"total": 0, "chunks": [], "doc": _map_doc(doc)}
    if req.get("id"):
        # Single-chunk lookup: the chunk must exist and belong to this document.
        chunk = settings.docStoreConn.get(req.get("id"), search.index_name(tenant_id), [dataset_id])
        if not chunk:
            return get_result(message=f"Chunk not found: {dataset_id}/{req.get('id')}", code=RetCode.DATA_ERROR)
        if str(chunk.get("doc_id", chunk.get("document_id"))) != str(document_id):
            return get_result(message=f"Chunk not found: {dataset_id}/{req.get('id')}", code=RetCode.DATA_ERROR)
        _strip_chunk_runtime_fields(chunk)
        res["total"] = 1
        final_chunk = {
            "id": chunk.get("id", chunk.get("chunk_id")),
            "content": chunk["content_with_weight"],
            "document_id": chunk.get("doc_id", chunk.get("document_id")),
            "docnm_kwd": chunk["docnm_kwd"],
            "important_keywords": chunk.get("important_kwd", []),
            "questions": chunk.get("question_kwd", []),
            "dataset_id": chunk.get("kb_id", chunk.get("dataset_id")),
            "image_id": chunk.get("img_id", ""),
            "available": bool(chunk.get("available_int", 1)),
            "positions": chunk.get("position_int", []),
            "tag_kwd": chunk.get("tag_kwd", []),
            "tag_feas": chunk.get("tag_feas", {}),
        }
        res["chunks"].append(final_chunk)
        _ = Chunk(**final_chunk)
    elif settings.docStoreConn.index_exist(search.index_name(tenant_id), dataset_id):
        sres = await settings.retriever.search(
            query,
            search.index_name(tenant_id),
            [dataset_id],
            emb_mdl=None,
            highlight=True,
        )
        res["total"] = sres.total
        for chunk_id in sres.ids:
            fields = sres.field[chunk_id]
            # Prefer the highlighted text only when the caller searched by keywords.
            if question and chunk_id in sres.highlight:
                content = remove_redundant_spaces(sres.highlight[chunk_id])
            else:
                content = fields.get("content_with_weight", "")
            d = {
                "id": chunk_id,
                "content": content,
                "document_id": fields["doc_id"],
                "docnm_kwd": fields["docnm_kwd"],
                "important_keywords": fields.get("important_kwd", []),
                "tag_kwd": fields.get("tag_kwd", []),
                "questions": fields.get("question_kwd", []),
                "dataset_id": fields.get("kb_id", fields.get("dataset_id")),
                "image_id": fields.get("img_id", ""),
                "available": bool(int(fields.get("available_int", "1"))),
                "positions": fields.get("position_int", []),
            }
            res["chunks"].append(d)
            _ = Chunk(**d)
    return get_result(data=res)


@manager.route("/datasets/<dataset_id>/documents/<document_id>/chunks/<chunk_id>", methods=["GET"])  # noqa: F821
@login_required
@add_tenant_id_to_kwargs
async def get_chunk(tenant_id, dataset_id, document_id, chunk_id):
    """Return one stored chunk of the document with runtime fields stripped."""
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    doc = DocumentService.query(id=document_id, kb_id=dataset_id)
    if not doc:
        return get_error_data_result(message=f"You don't own the document {document_id}.")
    try:
        chunk = settings.docStoreConn.get(chunk_id, search.index_name(tenant_id), [dataset_id])
        # A chunk that exists but belongs to another document is reported as missing.
        if chunk is None or str(chunk.get("doc_id", chunk.get("document_id"))) != str(document_id):
            return get_result(data=False, message="Chunk not found!", code=RetCode.DATA_ERROR)
        return get_result(data=_strip_chunk_runtime_fields(chunk))
    except Exception as e:
        if str(e).find("NotFoundError") >= 0:
            return get_result(data=False, message="Chunk not found!", code=RetCode.DATA_ERROR)
        return server_error_response(e)


@manager.route("/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["POST"])  # noqa: F821
@login_required
@add_tenant_id_to_kwargs
async def add_chunk(tenant_id, dataset_id, document_id):
    """Create a chunk in the document from the JSON request body.

    Body fields read: ``content`` (required), ``important_keywords``,
    ``questions``, ``tag_kwd``, ``tag_feas`` and ``image_base64``.
    The chunk id is a hash of the content plus the document id.
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    doc = DocumentService.query(id=document_id, kb_id=dataset_id)
    if not doc:
        return get_error_data_result(message=f"You don't own the document {document_id}.")
    doc = doc[0]
    req = await get_request_json()
    if is_content_empty(req.get("content")):
        return get_error_data_result(message="`content` is required")
    if "important_keywords" in req and not isinstance(req["important_keywords"], list):
        return get_error_data_result("`important_keywords` is required to be a list")
    if "questions" in req and not isinstance(req["questions"], list):
        return get_error_data_result("`questions` is required to be a list")

    important_keywords = req.get("important_keywords", [])
    # Normalise once so the stored questions and their tokens always agree,
    # and so non-string items cannot break str.join below.
    questions = [text for text in (str(q).strip() for q in req.get("questions", [])) if text]
    # One clock read keeps `create_time` and `create_timestamp_flt` consistent.
    now = datetime.datetime.now()

    chunk_id = xxhash.xxh64((req["content"] + document_id).encode("utf-8")).hexdigest()
    d = {
        "id": chunk_id,
        "content_ltks": rag_tokenizer.tokenize(req["content"]),
        "content_with_weight": req["content"],
    }
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
    d["important_kwd"] = important_keywords
    d["important_tks"] = rag_tokenizer.tokenize(" ".join(str(k) for k in important_keywords))
    d["question_kwd"] = questions
    d["question_tks"] = rag_tokenizer.tokenize("\n".join(questions))
    d["create_time"] = str(now).replace("T", " ")[:19]
    d["create_timestamp_flt"] = now.timestamp()
    d["kb_id"] = dataset_id
    d["docnm_kwd"] = doc.name
    d["doc_id"] = document_id

    if "tag_kwd" in req:
        if not isinstance(req["tag_kwd"], list):
            return get_error_data_result("`tag_kwd` is required to be a list")
        if not all(isinstance(t, str) for t in req["tag_kwd"]):
            return get_error_data_result("`tag_kwd` must be a list of strings")
        d["tag_kwd"] = req["tag_kwd"]
    if "tag_feas" in req:
        try:
            d["tag_feas"] = validate_tag_features(req["tag_feas"])
        except ValueError as exc:
            return get_error_data_result(f"`tag_feas` {exc}")

    image_base64 = req.get("image_base64")
    if image_base64:
        d["img_id"] = f"{dataset_id}-{chunk_id}"
        d["doc_type_kwd"] = "image"

    embd_mdl = _embedding_model_for(tenant_id, document_id)
    # Embed the questions when present, otherwise the content; the document
    # name contributes 10% of the final vector.
    v, c = embd_mdl.encode([doc.name, req["content"] if not d["question_kwd"] else "\n".join(d["question_kwd"])])
    v = 0.1 * v[0] + 0.9 * v[1]
    d[f"q_{len(v)}_vec"] = v.tolist()
    settings.docStoreConn.insert([d], search.index_name(tenant_id), dataset_id)

    if image_base64:
        store_chunk_image(dataset_id, chunk_id, base64.b64decode(image_base64))

    DocumentService.increment_chunk_num(doc.id, doc.kb_id, c, 1, 0)
    key_mapping = {
        "id": "id",
        "content_with_weight": "content",
        "doc_id": "document_id",
        "important_kwd": "important_keywords",
        "tag_kwd": "tag_kwd",
        "question_kwd": "questions",
        "kb_id": "dataset_id",
        "create_timestamp_flt": "create_timestamp",
        "create_time": "create_time",
        "document_keyword": "document",
        "img_id": "image_id",
    }
    renamed_chunk = {new_key: d[key] for key, new_key in key_mapping.items() if key in d}
    _ = Chunk(**renamed_chunk)
    return get_result(data={"chunk": renamed_chunk})


@manager.route("/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["DELETE"])  # noqa: F821
@login_required
@add_tenant_id_to_kwargs
async def rm_chunk(tenant_id, dataset_id, document_id):
    """Delete chunks of the document.

    With ``chunk_ids`` in the body, those chunks are deleted; without it,
    every chunk is deleted only when ``delete_all`` is exactly ``True``.
    An empty body is a no-op.
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    docs = DocumentService.query(id=document_id, kb_id=dataset_id)
    if not docs:
        return get_error_data_result(message=f"You don't own the document {document_id}.")
    req = await get_request_json()
    if not req:
        return get_result()

    chunk_ids = req.get("chunk_ids")
    if not chunk_ids:
        if req.get("delete_all") is True:
            doc = docs[0]
            # Images are removed before the index rows are deleted.
            DocumentService.delete_chunk_images(doc, tenant_id)
            chunk_number = settings.docStoreConn.delete({"doc_id": document_id}, search.index_name(tenant_id), dataset_id)
            if chunk_number != 0:
                DocumentService.decrement_chunk_num(document_id, dataset_id, 1, chunk_number, 0)
            return get_result(message=f"deleted {chunk_number} chunks")
        return get_result()

    unique_chunk_ids, duplicate_messages = check_duplicate_ids(chunk_ids, "chunk")
    chunk_number = settings.docStoreConn.delete(
        {"doc_id": document_id, "id": unique_chunk_ids},
        search.index_name(tenant_id),
        dataset_id,
    )
    if chunk_number != 0:
        DocumentService.decrement_chunk_num(document_id, dataset_id, 1, chunk_number, 0)
    if chunk_number != len(unique_chunk_ids):
        if len(unique_chunk_ids) == 0:
            return get_result(message=f"deleted {chunk_number} chunks")
        return get_error_data_result(message=f"rm_chunk deleted chunks {chunk_number}, expect {len(unique_chunk_ids)}")
    if duplicate_messages:
        return get_result(
            message=f"Partially deleted {chunk_number} chunks with {len(duplicate_messages)} errors",
            data={"success_count": chunk_number, "errors": duplicate_messages},
        )
    return get_result(message=f"deleted {chunk_number} chunks")


@manager.route("/datasets/<dataset_id>/documents/<document_id>/chunks/<chunk_id>", methods=["PATCH"])  # noqa: F821
@login_required
@add_tenant_id_to_kwargs
async def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
    """Update one chunk of the document and re-embed it.

    Body fields read: ``content`` (falls back to the stored content when
    absent), ``important_keywords``, ``questions``, ``available``,
    ``positions``, ``tag_kwd``, ``tag_feas`` and ``image_base64``.
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    doc = DocumentService.query(id=document_id, kb_id=dataset_id)
    if not doc:
        return get_error_data_result(message=f"You don't own the document {document_id}.")
    doc = doc[0]
    chunk = settings.docStoreConn.get(chunk_id, search.index_name(tenant_id), [dataset_id])
    if chunk is None or str(chunk.get("doc_id", chunk.get("document_id"))) != str(document_id):
        return get_error_data_result(f"Can't find this chunk {chunk_id}")
    req = await get_request_json()
    content = req.get("content")
    if content is not None:
        if is_content_empty(content):
            return get_error_data_result(message="`content` is required")
    else:
        content = chunk.get("content_with_weight", "")
    d = {"id": chunk_id, "content_with_weight": content}
    d["content_ltks"] = rag_tokenizer.tokenize(d["content_with_weight"])
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
    if "important_keywords" in req:
        if not isinstance(req["important_keywords"], list):
            return get_error_data_result("`important_keywords` should be a list")
        d["important_kwd"] = req.get("important_keywords", [])
        # str() keeps a non-string keyword from raising inside str.join.
        d["important_tks"] = rag_tokenizer.tokenize(" ".join(str(k) for k in req["important_keywords"]))
    if "questions" in req:
        if not isinstance(req["questions"], list):
            return get_error_data_result("`questions` should be a list")
        d["question_kwd"] = [str(q).strip() for q in req.get("questions", []) if str(q).strip()]
        # Tokenise the normalised questions so tokens match what is stored.
        d["question_tks"] = rag_tokenizer.tokenize("\n".join(d["question_kwd"]))
    if "available" in req:
        d["available_int"] = int(req["available"])
    if "positions" in req:
        if not isinstance(req["positions"], list):
            return get_error_data_result("`positions` should be a list")
        d["position_int"] = req["positions"]
    if "tag_kwd" in req:
        if not isinstance(req["tag_kwd"], list):
            return get_error_data_result("`tag_kwd` should be a list")
        if not all(isinstance(t, str) for t in req["tag_kwd"]):
            return get_error_data_result("`tag_kwd` must be a list of strings")
        d["tag_kwd"] = req["tag_kwd"]
    if "tag_feas" in req:
        try:
            d["tag_feas"] = validate_tag_features(req["tag_feas"])
        except ValueError as exc:
            return get_error_data_result(f"`tag_feas` {exc}")
    image_base64 = req.get("image_base64")
    if image_base64:
        d["img_id"] = f"{dataset_id}-{chunk_id}"
        d["doc_type_kwd"] = "image"

    embd_mdl = _embedding_model_for(tenant_id, document_id)
    if doc.parser_id == ParserType.QA:
        # Q&A documents require exactly one question part and one answer part.
        arr = [t for t in re.split(r"[\n\t]", d["content_with_weight"]) if len(t) > 1]
        if len(arr) != 2:
            return get_error_data_result(message="Q&A must be separated by TAB/ENTER key.")
        q, a = rmPrefix(arr[0]), rmPrefix(arr[1])
        d = beAdoc(d, arr[0], arr[1], not any([rag_tokenizer.is_chinese(t) for t in q + a]))

    v, _ = embd_mdl.encode(
        [
            doc.name,
            d["content_with_weight"] if not d.get("question_kwd") else "\n".join(d["question_kwd"]),
        ]
    )
    # Q&A chunks use the text vector alone; others blend in the document name.
    v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
    d[f"q_{len(v)}_vec"] = v.tolist()
    settings.docStoreConn.update({"id": chunk_id}, d, search.index_name(tenant_id), dataset_id)
    if image_base64:
        store_chunk_image(dataset_id, chunk_id, base64.b64decode(image_base64))
    return get_result()


@manager.route("/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["PATCH"])  # noqa: F821
@login_required
@add_tenant_id_to_kwargs
async def switch_chunks(tenant_id, dataset_id, document_id):
    """Set the availability flag on the chunks listed in ``chunk_ids``.

    The new value comes from ``available_int`` when present, otherwise from
    the truthiness of ``available``. Stops at the first failed update.
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    req = await get_request_json()
    if not req.get("chunk_ids"):
        return get_error_data_result(message="`chunk_ids` is required.")
    if "available_int" not in req and "available" not in req:
        return get_error_data_result(message="`available_int` or `available` is required.")
    available_int = int(req["available_int"]) if "available_int" in req else (1 if req.get("available") else 0)

    try:
        def _switch_sync():
            e, doc = DocumentService.get_by_id(document_id)
            if not e:
                return get_error_data_result(message="Document not found!")
            if not doc or str(doc.kb_id) != str(dataset_id):
                return get_error_data_result(message="Document not found!")
            # NOTE(review): chunks are matched by id only; unlike update_chunk,
            # nothing here checks that each chunk belongs to `document_id` -
            # confirm whether that is intended.
            for cid in req["chunk_ids"]:
                if not settings.docStoreConn.update(
                    {"id": cid},
                    {"available_int": available_int},
                    search.index_name(tenant_id),
                    doc.kb_id,
                ):
                    return get_error_data_result(message="Index updating failure")
            return get_result(data=True)

        return await thread_pool_exec(_switch_sync)
    except Exception as e:
        return server_error_response(e)
# -import datetime -import re from io import BytesIO -import xxhash -from pydantic import BaseModel, Field, validator from quart import request, send_file from api.db.db_models import APIToken, Document, Task @@ -31,42 +27,16 @@ from api.db.services.llm_service import LLMBundle from api.db.services.task_service import TaskService, cancel_all_task_of, queue_tasks from api.db.services.tenant_llm_service import TenantLLMService from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_error_data_result, get_request_json, get_result, server_error_response, token_required -from api.utils.image_utils import store_chunk_image from common import settings -from common.constants import LLMType, ParserType, RetCode, TaskStatus +from common.constants import LLMType, RetCode, TaskStatus from common.metadata_utils import convert_conditions, meta_filter -from common.misc_utils import thread_pool_exec -from common.string_utils import is_content_empty, remove_redundant_spaces -from common.tag_feature_utils import validate_tag_features -from rag.app.qa import beAdoc, rmPrefix from rag.app.tag import label_question -from rag.nlp import rag_tokenizer, search +from rag.nlp import search from rag.prompts.generator import cross_languages, keyword_extraction MAXIMUM_OF_UPLOADING_FILES = 256 -class Chunk(BaseModel): - id: str = "" - content: str = "" - document_id: str = "" - docnm_kwd: str = "" - important_keywords: list = Field(default_factory=list) - tag_kwd: list = Field(default_factory=list) - questions: list = Field(default_factory=list) - question_tks: str = "" - image_id: str = "" - available: bool = True - positions: list[list[int]] = Field(default_factory=list) - - @validator("positions") - def validate_positions(cls, value): - for sublist in value: - if len(sublist) != 5: - raise ValueError("Each sublist in positions must have a length of 5") - return value - - @manager.route("/datasets//documents/", methods=["GET"]) # noqa: F821 @token_required async def 
download(tenant_id, dataset_id, document_id): @@ -329,642 +299,6 @@ async def stop_parsing(tenant_id, dataset_id): return get_result() -@manager.route("/datasets//documents//chunks", methods=["GET"]) # noqa: F821 -@token_required -async def list_chunks(tenant_id, dataset_id, document_id): - """ - List chunks of a document. - --- - tags: - - Chunks - security: - - ApiKeyAuth: [] - parameters: - - in: path - name: dataset_id - type: string - required: true - description: ID of the dataset. - - in: path - name: document_id - type: string - required: true - description: ID of the document. - - in: query - name: page - type: integer - required: false - default: 1 - description: Page number. - - in: query - name: page_size - type: integer - required: false - default: 30 - description: Number of items per page. - - in: query - name: id - type: string - required: false - default: "" - description: Chunk id. - - in: header - name: Authorization - type: string - required: true - description: Bearer token for authentication. - responses: - 200: - description: List of chunks. - schema: - type: object - properties: - total: - type: integer - description: Total number of chunks. - chunks: - type: array - items: - type: object - properties: - id: - type: string - description: Chunk ID. - content: - type: string - description: Chunk content. - document_id: - type: string - description: ID of the document. - important_keywords: - type: array - items: - type: string - description: Important keywords. - tag_kwd: - type: array - items: - type: string - description: Tag keywords. - image_id: - type: string - description: Image ID associated with the chunk. - doc: - type: object - description: Document details. 
- """ - if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): - return get_error_data_result(message=f"You don't own the dataset {dataset_id}.") - doc = DocumentService.query(id=document_id, kb_id=dataset_id) - if not doc: - return get_error_data_result(message=f"You don't own the document {document_id}.") - doc = doc[0] - req = request.args - doc_id = document_id - page = int(req.get("page", 1)) - size = int(req.get("page_size", 30)) - question = req.get("keywords", "") - query = { - "doc_ids": [doc_id], - "page": page, - "size": size, - "question": question, - "sort": True, - } - if "available" in req: - query["available_int"] = 1 if req["available"] == "true" else 0 - key_mapping = { - "chunk_num": "chunk_count", - "kb_id": "dataset_id", - "token_num": "token_count", - "parser_id": "chunk_method", - } - run_mapping = { - "0": "UNSTART", - "1": "RUNNING", - "2": "CANCEL", - "3": "DONE", - "4": "FAIL", - } - doc = doc.to_dict() - renamed_doc = {} - for key, value in doc.items(): - new_key = key_mapping.get(key, key) - renamed_doc[new_key] = value - if key == "run": - renamed_doc["run"] = run_mapping.get(str(value)) - - res = {"total": 0, "chunks": [], "doc": renamed_doc} - if req.get("id"): - chunk = settings.docStoreConn.get(req.get("id"), search.index_name(tenant_id), [dataset_id]) - if not chunk: - return get_result(message=f"Chunk not found: {dataset_id}/{req.get('id')}", code=RetCode.NOT_FOUND) - k = [] - for n in chunk.keys(): - if re.search(r"(_vec$|_sm_|_tks|_ltks)", n): - k.append(n) - for n in k: - del chunk[n] - if not chunk: - return get_error_data_result(f"Chunk `{req.get('id')}` not found.") - res["total"] = 1 - final_chunk = { - "id": chunk.get("id", chunk.get("chunk_id")), - "content": chunk["content_with_weight"], - "document_id": chunk.get("doc_id", chunk.get("document_id")), - "docnm_kwd": chunk["docnm_kwd"], - "important_keywords": chunk.get("important_kwd", []), - "questions": chunk.get("question_kwd", []), - 
"dataset_id": chunk.get("kb_id", chunk.get("dataset_id")), - "image_id": chunk.get("img_id", ""), - "available": bool(chunk.get("available_int", 1)), - "positions": chunk.get("position_int", []), - "tag_kwd": chunk.get("tag_kwd", []), - "tag_feas": chunk.get("tag_feas", {}), - } - res["chunks"].append(final_chunk) - _ = Chunk(**final_chunk) - - elif settings.docStoreConn.index_exist(search.index_name(tenant_id), dataset_id): - sres = await settings.retriever.search(query, search.index_name(tenant_id), [dataset_id], emb_mdl=None, highlight=True) - res["total"] = sres.total - for id in sres.ids: - d = { - "id": id, - "content": (remove_redundant_spaces(sres.highlight[id]) if question and id in sres.highlight else sres.field[id].get("content_with_weight", "")), - "document_id": sres.field[id]["doc_id"], - "docnm_kwd": sres.field[id]["docnm_kwd"], - "important_keywords": sres.field[id].get("important_kwd", []), - "tag_kwd": sres.field[id].get("tag_kwd", []), - "questions": sres.field[id].get("question_kwd", []), - "dataset_id": sres.field[id].get("kb_id", sres.field[id].get("dataset_id")), - "image_id": sres.field[id].get("img_id", ""), - "available": bool(int(sres.field[id].get("available_int", "1"))), - "positions": sres.field[id].get("position_int", []), - } - res["chunks"].append(d) - _ = Chunk(**d) # validate the chunk - return get_result(data=res) - - -@manager.route( # noqa: F821 - "/datasets//documents//chunks", methods=["POST"] -) -@token_required -async def add_chunk(tenant_id, dataset_id, document_id): - """ - Add a chunk to a document. - --- - tags: - - Chunks - security: - - ApiKeyAuth: [] - parameters: - - in: path - name: dataset_id - type: string - required: true - description: ID of the dataset. - - in: path - name: document_id - type: string - required: true - description: ID of the document. - - in: body - name: body - description: Chunk data. 
- required: true - schema: - type: object - properties: - content: - type: string - required: true - description: Content of the chunk. - important_keywords: - type: array - items: - type: string - description: Important keywords. - image_base64: - type: string - description: Base64-encoded image to associate with the chunk. - - in: header - name: Authorization - type: string - required: true - description: Bearer token for authentication. - responses: - 200: - description: Chunk added successfully. - schema: - type: object - properties: - chunk: - type: object - properties: - id: - type: string - description: Chunk ID. - content: - type: string - description: Chunk content. - document_id: - type: string - description: ID of the document. - important_keywords: - type: array - items: - type: string - description: Important keywords. - """ - if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): - return get_error_data_result(message=f"You don't own the dataset {dataset_id}.") - doc = DocumentService.query(id=document_id, kb_id=dataset_id) - if not doc: - return get_error_data_result(message=f"You don't own the document {document_id}.") - doc = doc[0] - req = await get_request_json() - if is_content_empty(req.get("content")): - return get_error_data_result(message="`content` is required") - if "important_keywords" in req: - if not isinstance(req["important_keywords"], list): - return get_error_data_result("`important_keywords` is required to be a list") - if "questions" in req: - if not isinstance(req["questions"], list): - return get_error_data_result("`questions` is required to be a list") - chunk_id = xxhash.xxh64((req["content"] + document_id).encode("utf-8")).hexdigest() - d = { - "id": chunk_id, - "content_ltks": rag_tokenizer.tokenize(req["content"]), - "content_with_weight": req["content"], - } - d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) - d["important_kwd"] = req.get("important_keywords", []) - 
d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_keywords", []))) - d["question_kwd"] = [str(q).strip() for q in req.get("questions", []) if str(q).strip()] - d["question_tks"] = rag_tokenizer.tokenize("\n".join(req.get("questions", []))) - d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19] - d["create_timestamp_flt"] = datetime.datetime.now().timestamp() - d["kb_id"] = dataset_id - d["docnm_kwd"] = doc.name - d["doc_id"] = document_id - if "tag_kwd" in req: - if not isinstance(req["tag_kwd"], list): - return get_error_data_result("`tag_kwd` is required to be a list") - if not all(isinstance(t, str) for t in req["tag_kwd"]): - return get_error_data_result("`tag_kwd` must be a list of strings") - d["tag_kwd"] = req["tag_kwd"] - if "tag_feas" in req: - try: - d["tag_feas"] = validate_tag_features(req["tag_feas"]) - except ValueError as exc: - return get_error_data_result(f"`tag_feas` {exc}") - import base64 - - image_base64 = req.get("image_base64", None) - if image_base64: - d["img_id"] = "{}-{}".format(dataset_id, chunk_id) - d["doc_type_kwd"] = "image" - - tenant_embd_id = DocumentService.get_tenant_embd_id(document_id) - if tenant_embd_id: - model_config = get_model_config_by_id(tenant_embd_id) - else: - embd_id = DocumentService.get_embd_id(document_id) - model_config = get_model_config_by_type_and_name(tenant_id, LLMType.EMBEDDING.value, embd_id) - embd_mdl = TenantLLMService.model_instance(model_config) - v, c = embd_mdl.encode([doc.name, req["content"] if not d["question_kwd"] else "\n".join(d["question_kwd"])]) - v = 0.1 * v[0] + 0.9 * v[1] - d["q_%d_vec" % len(v)] = v.tolist() - settings.docStoreConn.insert([d], search.index_name(tenant_id), dataset_id) - - if image_base64: - store_chunk_image(dataset_id, chunk_id, base64.b64decode(image_base64)) - - DocumentService.increment_chunk_num(doc.id, doc.kb_id, c, 1, 0) - # rename keys - key_mapping = { - "id": "id", - "content_with_weight": "content", - "doc_id": 
"document_id", - "important_kwd": "important_keywords", - "tag_kwd": "tag_kwd", - "question_kwd": "questions", - "kb_id": "dataset_id", - "create_timestamp_flt": "create_timestamp", - "create_time": "create_time", - "document_keyword": "document", - "img_id": "image_id", - } - renamed_chunk = {} - for key, value in d.items(): - if key in key_mapping: - new_key = key_mapping.get(key, key) - renamed_chunk[new_key] = value - _ = Chunk(**renamed_chunk) # validate the chunk - return get_result(data={"chunk": renamed_chunk}) - # return get_result(data={"chunk_id": chunk_id}) - - -@manager.route( # noqa: F821 - "datasets//documents//chunks", methods=["DELETE"] -) -@token_required -async def rm_chunk(tenant_id, dataset_id, document_id): - """ - Remove chunks from a document. - --- - tags: - - Chunks - security: - - ApiKeyAuth: [] - parameters: - - in: path - name: dataset_id - type: string - required: true - description: ID of the dataset. - - in: path - name: document_id - type: string - required: true - description: ID of the document. - - in: body - name: body - description: Chunk removal parameters. - required: true - schema: - type: object - properties: - chunk_ids: - type: array - items: - type: string - description: | - List of chunk IDs to remove. - If omitted, `null`, or an empty array is provided, no chunks will be deleted. - - in: header - name: Authorization - type: string - required: true - description: Bearer token for authentication. - responses: - 200: - description: Chunks removed successfully. 
- schema: - type: object - """ - if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): - return get_error_data_result(message=f"You don't own the dataset {dataset_id}.") - docs = DocumentService.get_by_ids([document_id]) - if not docs: - raise LookupError(f"Can't find the document with ID {document_id}!") - req = await get_request_json() - if not req: - return get_result() - - chunk_ids = req.get("chunk_ids") - if not chunk_ids: - if req.get("delete_all") is True: - doc = docs[0] - # Clean up storage assets while index rows still exist for discovery - DocumentService.delete_chunk_images(doc, tenant_id) - condition = {"doc_id": document_id} - chunk_number = settings.docStoreConn.delete(condition, search.index_name(tenant_id), dataset_id) - if chunk_number != 0: - DocumentService.decrement_chunk_num(document_id, dataset_id, 1, chunk_number, 0) - return get_result(message=f"deleted {chunk_number} chunks") - else: - return get_result() - - condition = {"doc_id": document_id} - unique_chunk_ids, duplicate_messages = check_duplicate_ids(chunk_ids, "chunk") - condition["id"] = unique_chunk_ids - chunk_number = settings.docStoreConn.delete(condition, search.index_name(tenant_id), dataset_id) - if chunk_number != 0: - DocumentService.decrement_chunk_num(document_id, dataset_id, 1, chunk_number, 0) - if chunk_number != len(unique_chunk_ids): - if len(unique_chunk_ids) == 0: - return get_result(message=f"deleted {chunk_number} chunks") - return get_error_data_result(message=f"rm_chunk deleted chunks {chunk_number}, expect {len(unique_chunk_ids)}") - if duplicate_messages: - return get_result( - message=f"Partially deleted {chunk_number} chunks with {len(duplicate_messages)} errors", - data={"success_count": chunk_number, "errors": duplicate_messages}, - ) - return get_result(message=f"deleted {chunk_number} chunks") - - -@manager.route( # noqa: F821 - "/datasets//documents//chunks/", methods=["PUT"] -) -@token_required -async def update_chunk(tenant_id, 
dataset_id, document_id, chunk_id): - """ - Update a chunk within a document. - --- - tags: - - Chunks - security: - - ApiKeyAuth: [] - parameters: - - in: path - name: dataset_id - type: string - required: true - description: ID of the dataset. - - in: path - name: document_id - type: string - required: true - description: ID of the document. - - in: path - name: chunk_id - type: string - required: true - description: ID of the chunk to update. - - in: body - name: body - description: Chunk update parameters. - required: true - schema: - type: object - properties: - content: - type: string - description: Updated content of the chunk. - important_keywords: - type: array - items: - type: string - description: Updated important keywords. - tag_kwd: - type: array - items: - type: string - description: Updated tag keywords. - available: - type: boolean - description: Availability status of the chunk. - - in: header - name: Authorization - type: string - required: true - description: Bearer token for authentication. - responses: - 200: - description: Chunk updated successfully. 
- schema: - type: object - """ - chunk = settings.docStoreConn.get(chunk_id, search.index_name(tenant_id), [dataset_id]) - if chunk is None: - return get_error_data_result(f"Can't find this chunk {chunk_id}") - if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): - return get_error_data_result(message=f"You don't own the dataset {dataset_id}.") - doc = DocumentService.query(id=document_id, kb_id=dataset_id) - if not doc: - return get_error_data_result(message=f"You don't own the document {document_id}.") - doc = doc[0] - req = await get_request_json() - content = req.get("content") - if content is not None: - if is_content_empty(content): - return get_error_data_result(message="`content` is required") - else: - content = chunk.get("content_with_weight", "") - d = {"id": chunk_id, "content_with_weight": content} - d["content_ltks"] = rag_tokenizer.tokenize(d["content_with_weight"]) - d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) - if "important_keywords" in req: - if not isinstance(req["important_keywords"], list): - return get_error_data_result("`important_keywords` should be a list") - d["important_kwd"] = req.get("important_keywords", []) - d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"])) - if "questions" in req: - if not isinstance(req["questions"], list): - return get_error_data_result("`questions` should be a list") - d["question_kwd"] = [str(q).strip() for q in req.get("questions", []) if str(q).strip()] - d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["questions"])) - if "available" in req: - d["available_int"] = int(req["available"]) - if "positions" in req: - if not isinstance(req["positions"], list): - return get_error_data_result("`positions` should be a list") - d["position_int"] = req["positions"] - if "tag_kwd" in req: - if not isinstance(req["tag_kwd"], list): - return get_error_data_result("`tag_kwd` should be a list") - if not all(isinstance(t, str) 
for t in req["tag_kwd"]): - return get_error_data_result("`tag_kwd` must be a list of strings") - d["tag_kwd"] = req["tag_kwd"] - if "tag_feas" in req: - try: - d["tag_feas"] = validate_tag_features(req["tag_feas"]) - except ValueError as exc: - return get_error_data_result(f"`tag_feas` {exc}") - tenant_embd_id = DocumentService.get_tenant_embd_id(document_id) - if tenant_embd_id: - model_config = get_model_config_by_id(tenant_embd_id) - else: - embd_id = DocumentService.get_embd_id(document_id) - model_config = get_model_config_by_type_and_name(tenant_id, LLMType.EMBEDDING.value, embd_id) - embd_mdl = TenantLLMService.model_instance(model_config) - if doc.parser_id == ParserType.QA: - arr = [t for t in re.split(r"[\n\t]", d["content_with_weight"]) if len(t) > 1] - if len(arr) != 2: - return get_error_data_result(message="Q&A must be separated by TAB/ENTER key.") - q, a = rmPrefix(arr[0]), rmPrefix(arr[1]) - d = beAdoc(d, arr[0], arr[1], not any([rag_tokenizer.is_chinese(t) for t in q + a])) - - v, c = embd_mdl.encode([doc.name, d["content_with_weight"] if not d.get("question_kwd") else "\n".join(d["question_kwd"])]) - v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1] - d["q_%d_vec" % len(v)] = v.tolist() - settings.docStoreConn.update({"id": chunk_id}, d, search.index_name(tenant_id), dataset_id) - return get_result() - - -@manager.route( # noqa: F821 - "/datasets//documents//chunks/switch", methods=["POST"] -) -@token_required -async def switch_chunks(tenant_id, dataset_id, document_id): - """ - Switch availability of specified chunks (same as chunk_app switch). - --- - tags: - - Chunks - security: - - ApiKeyAuth: [] - parameters: - - in: path - name: dataset_id - type: string - required: true - description: ID of the dataset. - - in: path - name: document_id - type: string - required: true - description: ID of the document. 
- - in: body - name: body - required: true - schema: - type: object - properties: - chunk_ids: - type: array - items: - type: string - description: List of chunk IDs to switch. - available_int: - type: integer - description: 1 for available, 0 for unavailable. - available: - type: boolean - description: Availability status (alternative to available_int). - - in: header - name: Authorization - type: string - required: true - description: Bearer token for authentication. - responses: - 200: - description: Chunks availability switched successfully. - """ - if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): - return get_error_data_result(message=f"You don't own the dataset {dataset_id}.") - req = await get_request_json() - if not req.get("chunk_ids"): - return get_error_data_result(message="`chunk_ids` is required.") - if "available_int" not in req and "available" not in req: - return get_error_data_result(message="`available_int` or `available` is required.") - available_int = int(req["available_int"]) if "available_int" in req else (1 if req.get("available") else 0) - try: - - def _switch_sync(): - e, doc = DocumentService.get_by_id(document_id) - if not e: - return get_error_data_result(message="Document not found!") - if not doc or str(doc.kb_id) != str(dataset_id): - return get_error_data_result(message="Document not found!") - for cid in req["chunk_ids"]: - if not settings.docStoreConn.update( - {"id": cid}, - {"available_int": available_int}, - search.index_name(tenant_id), - doc.kb_id, - ): - return get_error_data_result(message="Index updating failure") - return get_result(data=True) - - return await thread_pool_exec(_switch_sync) - except Exception as e: - return server_error_response(e) - - @manager.route("/retrieval", methods=["POST"]) # noqa: F821 @token_required async def retrieval_test(tenant_id): diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md index 7326f997a..7c9fe84ef 100644 --- 
a/docs/references/http_api_reference.md +++ b/docs/references/http_api_reference.md @@ -85,17 +85,17 @@ curl --request POST \ ##### Request Parameters -- `model` (*Body parameter*) `string`, *Required* +- `model` (*Body parameter*) `string`, *Required* The model used to generate the response. The server will parse this automatically, so you can set it to any value for now. -- `messages` (*Body parameter*) `list[object]`, *Required* +- `messages` (*Body parameter*) `list[object]`, *Required* A list of historical chat messages used to generate the response. This must contain at least one message with the `user` role. -- `stream` (*Body parameter*) `boolean` +- `stream` (*Body parameter*) `boolean` Whether to receive the response as a stream. Set this to `false` explicitly if you prefer to receive the entire response in one go instead of as a stream. -- `extra_body` (*Body parameter*) `object` - Extra request parameters: +- `extra_body` (*Body parameter*) `object` + Extra request parameters: - `reference`: `boolean` - include reference in the final chunk (stream) or in the final message (non-stream). - `reference_metadata`: `object` - include document metadata in each reference chunk. - `include`: `boolean` - enable document metadata in reference chunks. @@ -218,16 +218,16 @@ curl --request POST \ ##### Request Parameters -- `model` (*Body parameter*) `string`, *Required* +- `model` (*Body parameter*) `string`, *Required* The model used to generate the response. The server will parse this automatically, so you can set it to any value for now. -- `messages` (*Body parameter*) `list[object]`, *Required* +- `messages` (*Body parameter*) `list[object]`, *Required* A list of historical chat messages used to generate the response. This must contain at least one message with the `user` role. -- `stream` (*Body parameter*) `boolean` +- `stream` (*Body parameter*) `boolean` Whether to receive the response as a stream. 
Set this to `false` explicitly if you prefer to receive the entire response in one go instead of as a stream. -- `session_id` (*Body parameter*) `string` +- `session_id` (*Body parameter*) `string` Agent session id. #### Response @@ -493,33 +493,33 @@ curl --request POST \ ##### Request parameters -- `"name"`: (*Body parameter*), `string`, *Required* - The unique name of the dataset to create. It must adhere to the following requirements: +- `"name"`: (*Body parameter*), `string`, *Required* + The unique name of the dataset to create. It must adhere to the following requirements: - Basic Multilingual Plane (BMP) only - Maximum 128 characters - Case-insensitive -- `"avatar"`: (*Body parameter*), `string` +- `"avatar"`: (*Body parameter*), `string` Base64 encoding of the avatar. - Maximum 65535 characters -- `"description"`: (*Body parameter*), `string` +- `"description"`: (*Body parameter*), `string` A brief description of the dataset to create. - Maximum 65535 characters -- `"embedding_model"`: (*Body parameter*), `string` +- `"embedding_model"`: (*Body parameter*), `string` The name of the embedding model to use. For example: `"BAAI/bge-large-zh-v1.5@BAAI"` - Maximum 255 characters - Must follow `model_name@model_factory` format -- `"permission"`: (*Body parameter*), `string` - Specifies who can access the dataset to create. Available options: +- `"permission"`: (*Body parameter*), `string` + Specifies who can access the dataset to create. Available options: - `"me"`: (Default) Only you can manage the dataset. - `"team"`: All team members can manage the dataset. -- `"chunk_method"`: (*Body parameter*), `enum` - The default chunk method of the dataset to create. Mutually exclusive with `"parse_type"` and `"pipeline_id"`. If you set `"chunk_method"`, do not include `"parse_type"` or `"pipeline_id"`. - Available options: +- `"chunk_method"`: (*Body parameter*), `enum` + The default chunk method of the dataset to create. 
Mutually exclusive with `"parse_type"` and `"pipeline_id"`. If you set `"chunk_method"`, do not include `"parse_type"` or `"pipeline_id"`. + Available options: - `"naive"`: General (default) - `"book"`: Book - `"email"`: Email @@ -533,8 +533,8 @@ curl --request POST \ - `"table"`: Table - `"tag"`: Tag -- `"parser_config"`: (*Body parameter*), `object` - The configuration settings for the dataset parser. The attributes in this JSON object vary with the selected `"chunk_method"`: +- `"parser_config"`: (*Body parameter*), `object` + The configuration settings for the dataset parser. The attributes in this JSON object vary with the selected `"chunk_method"`: - If `"chunk_method"` is `"naive"`, the `"parser_config"` object contains the following attributes: - `"auto_keywords"`: `int` - Defaults to `0` @@ -569,17 +569,17 @@ curl --request POST \ - `"parent_child"`: `object` Parent-child chunking settings. When enabled, each chunk is further split into smaller child chunks using `children_delimiter`. At retrieval time, matched child chunks are replaced by their parent's full text before being passed to the LLM, giving precise vector matching with broader context. - `"use_parent_child"`: `bool` Whether to enable parent-child chunking. Defaults to `false`. - `"children_delimiter"`: `string` The delimiter used to split a parent chunk into child chunks. Only takes effect when `"use_parent_child"` is `true`. Defaults to `"\n"`. - - If `"chunk_method"` is `"qa"`, `"manuel"`, `"paper"`, `"book"`, `"laws"`, or `"presentation"`, the `"parser_config"` object contains the following attribute: + - If `"chunk_method"` is `"qa"`, `"manual"`, `"paper"`, `"book"`, `"laws"`, or `"presentation"`, the `"parser_config"` object contains the following attribute: - `"raptor"`: `object` RAPTOR-specific settings. - Defaults to: `{"use_raptor": false}`. - If `"chunk_method"` is `"table"`, `"picture"`, `"one"`, or `"email"`, `"parser_config"` is an empty JSON object. 
-- `"parse_type"`: (*Body parameter*), `int` - The ingestion pipeline parse type identifier, i.e., the number of parsers in your **Parser** component. +- `"parse_type"`: (*Body parameter*), `int` + The ingestion pipeline parse type identifier, i.e., the number of parsers in your **Parser** component. - Required (along with `"pipeline_id"`) if specifying an ingestion pipeline. - Must not be included when `"chunk_method"` is specified. -- `"pipeline_id"`: (*Body parameter*), `string` +- `"pipeline_id"`: (*Body parameter*), `string` The ingestion pipeline ID. Can be found in the corresponding URL in the RAGFlow UI. - Required (along with `"parse_type"`) if specifying an ingestion pipeline. - Must be a 32-character lowercase hexadecimal string, e.g., `"d0bebe30ae2211f0970942010a8e0005"`. @@ -616,10 +616,10 @@ Success: "name": "RAGFlow example", "pagerank": 0, "parser_config": { - "chunk_token_num": 128, - "delimiter": "\\n!?;。;!?", - "html4excel": false, - "layout_recognize": "DeepDOC", + "chunk_token_num": 128, + "delimiter": "\\n!?;。;!?", + "html4excel": false, + "layout_recognize": "DeepDOC", "raptor": { "use_raptor": false } @@ -692,7 +692,7 @@ curl --request DELETE \ Specifies the datasets to delete: - If omitted, or set to `null` or an empty array, no datasets are deleted. - If an array of IDs is provided, only the datasets matching those IDs are deleted. -- `"delete_all"`: (*Body parameter*), `boolean` +- `"delete_all"`: (*Body parameter*), `boolean` Whether to delete all datasets owned by the current user when`"ids"` is omitted, or set to `null` or an empty array. Defaults to `false`. #### Response @@ -701,7 +701,7 @@ Success: ```json { - "code": 0 + "code": 0 } ``` @@ -755,32 +755,32 @@ curl --request PUT \ ##### Request parameters -- `dataset_id`: (*Path parameter*) +- `dataset_id`: (*Path parameter*) The ID of the dataset to update. -- `"name"`: (*Body parameter*), `string` +- `"name"`: (*Body parameter*), `string` The revised name of the dataset. 
- Basic Multilingual Plane (BMP) only - Maximum 128 characters - Case-insensitive -- `"avatar"`: (*Body parameter*), `string` +- `"avatar"`: (*Body parameter*), `string` The updated base64 encoding of the avatar. - Maximum 65535 characters -- `"embedding_model"`: (*Body parameter*), `string` - The updated embedding model name. +- `"embedding_model"`: (*Body parameter*), `string` + The updated embedding model name. - Ensure that `"chunk_count"` is `0` before updating `"embedding_model"`. - Maximum 255 characters - Must follow `model_name@model_factory` format -- `"permission"`: (*Body parameter*), `string` - The updated dataset permission. Available options: +- `"permission"`: (*Body parameter*), `string` + The updated dataset permission. Available options: - `"me"`: (Default) Only you can manage the dataset. - `"team"`: All team members can manage the dataset. -- `"pagerank"`: (*Body parameter*), `int` +- `"pagerank"`: (*Body parameter*), `int` refer to [Set page rank](https://ragflow.io/docs/dev/set_page_rank) - Default: `0` - Minimum: `0` - Maximum: `100` -- `"chunk_method"`: (*Body parameter*), `enum` - The chunking method for the dataset. Available options: +- `"chunk_method"`: (*Body parameter*), `enum` + The chunking method for the dataset. Available options: - `"naive"`: General (default) - `"book"`: Book - `"email"`: Email @@ -793,8 +793,8 @@ curl --request PUT \ - `"qa"`: Q&A - `"table"`: Table - `"tag"`: Tag -- `"parser_config"`: (*Body parameter*), `object` - The configuration settings for the dataset parser. The attributes in this JSON object vary with the selected `"chunk_method"`: +- `"parser_config"`: (*Body parameter*), `object` + The configuration settings for the dataset parser. 
The attributes in this JSON object vary with the selected `"chunk_method"`: - If `"chunk_method"` is `"naive"`, the `"parser_config"` object contains the following attributes: - `"auto_keywords"`: `int` - Defaults to `0` @@ -826,7 +826,7 @@ curl --request PUT \ - `"parent_child"`: `object` Parent-child chunking settings. When enabled, each chunk is further split into smaller child chunks using `children_delimiter`. At retrieval time, matched child chunks are replaced by their parent's full text before being passed to the LLM, giving precise vector matching with broader context. - `"use_parent_child"`: `bool` Whether to enable parent-child chunking. Defaults to `false`. - `"children_delimiter"`: `string` The delimiter used to split a parent chunk into child chunks. Only takes effect when `"use_parent_child"` is `true`. Defaults to `"\n"`. - - If `"chunk_method"` is `"qa"`, `"manuel"`, `"paper"`, `"book"`, `"laws"`, or `"presentation"`, the `"parser_config"` object contains the following attribute: + - If `"chunk_method"` is `"qa"`, `"manual"`, `"paper"`, `"book"`, `"laws"`, or `"presentation"`, the `"parser_config"` object contains the following attribute: - `"raptor"`: `object` RAPTOR-specific settings. - Defaults to: `{"use_raptor": false}`. - If `"chunk_method"` is `"table"`, `"picture"`, `"one"`, or `"email"`, `"parser_config"` is an empty JSON object. @@ -837,7 +837,7 @@ Success: ```json { - "code": 0 + "code": 0 } ``` @@ -882,21 +882,21 @@ curl --request GET \ ##### Request parameters -- `page`: (*Filter parameter*) +- `page`: (*Filter parameter*) Specifies the page on which the datasets will be displayed. Defaults to `1`. -- `page_size`: (*Filter parameter*) +- `page_size`: (*Filter parameter*) The number of datasets on each page. Defaults to `30`. -- `orderby`: (*Filter parameter*) +- `orderby`: (*Filter parameter*) The field by which datasets should be sorted. 
Available options: - `create_time` (default) - `update_time` -- `desc`: (*Filter parameter*) +- `desc`: (*Filter parameter*) Indicates whether the retrieved datasets should be sorted in descending order. Defaults to `true`. -- `name`: (*Filter parameter*) +- `name`: (*Filter parameter*) The name of the dataset to retrieve. -- `id`: (*Filter parameter*) +- `id`: (*Filter parameter*) The ID of the dataset to retrieve. -- `include_parsing_status`: (*Filter parameter*) +- `include_parsing_status`: (*Filter parameter*) Whether to include document parsing status counts in the response. Defaults to `false`. When set to `true`, each dataset object in the response will include the following additional fields: - `unstart_count`: Number of documents not yet started parsing. - `running_count`: Number of documents currently being parsed. @@ -1027,7 +1027,7 @@ curl --request GET \ ##### Request parameters -- `dataset_id`: (*Path parameter*) +- `dataset_id`: (*Path parameter*) The ID of the target dataset. #### Response @@ -1107,7 +1107,7 @@ curl --request DELETE \ ##### Request parameters -- `dataset_id`: (*Path parameter*) +- `dataset_id`: (*Path parameter*) The ID of the target dataset. #### Response @@ -1155,7 +1155,7 @@ curl --request POST \ ##### Request parameters -- `dataset_id`: (*Path parameter*) +- `dataset_id`: (*Path parameter*) The ID of the target dataset. #### Response @@ -1205,7 +1205,7 @@ curl --request GET \ ##### Request parameters -- `dataset_id`: (*Path parameter*) +- `dataset_id`: (*Path parameter*) The ID of the target dataset. #### Response @@ -1270,7 +1270,7 @@ curl --request POST \ ##### Request parameters -- `dataset_id`: (*Path parameter*) +- `dataset_id`: (*Path parameter*) The ID of the target dataset. #### Response @@ -1320,7 +1320,7 @@ curl --request GET \ ##### Request parameters -- `dataset_id`: (*Path parameter*) +- `dataset_id`: (*Path parameter*) The ID of the target dataset. 
#### Response @@ -1396,9 +1396,9 @@ curl --request POST \ ##### Request parameters -- `dataset_id`: (*Path parameter*) +- `dataset_id`: (*Path parameter*) The ID of the dataset to which the documents will be uploaded. -- `'file'`: (*Body parameter*) +- `'file'`: (*Body parameter*) A document to upload. #### Response @@ -1473,8 +1473,8 @@ curl --request PUT \ --header 'Content-Type: application/json' \ --data ' { - "name": "manual.txt", - "chunk_method": "manual", + "name": "manual.txt", + "chunk_method": "manual", "parser_config": {"chunk_token_num": 128} }' @@ -1482,14 +1482,14 @@ curl --request PUT \ ##### Request parameters -- `dataset_id`: (*Path parameter*) +- `dataset_id`: (*Path parameter*) The ID of the associated dataset. -- `document_id`: (*Path parameter*) +- `document_id`: (*Path parameter*) The ID of the document to update. - `"name"`: (*Body parameter*), `string` - `"meta_fields"`: (*Body parameter*), `dict[str, Any]` The meta fields of the document. -- `"chunk_method"`: (*Body parameter*), `string` - The parsing method to apply to the document: +- `"chunk_method"`: (*Body parameter*), `string` + The parsing method to apply to the document: - `"naive"`: General - `"manual`: Manual - `"qa"`: Q&A @@ -1501,8 +1501,8 @@ curl --request PUT \ - `"picture"`: Picture - `"one"`: One - `"email"`: Email -- `"parser_config"`: (*Body parameter*), `object` - The configuration settings for the dataset parser. The attributes in this JSON object vary with the selected `"chunk_method"`: +- `"parser_config"`: (*Body parameter*), `object` + The configuration settings for the dataset parser. The attributes in this JSON object vary with the selected `"chunk_method"`: - If `"chunk_method"` is `"naive"`, the `"parser_config"` object contains the following attributes: - `"chunk_token_num"`: Defaults to `256`. - `"layout_recognize"`: Defaults to `true`. @@ -1510,13 +1510,13 @@ curl --request PUT \ - `"delimiter"`: Defaults to `"\n"`. - `"task_page_size"`: Defaults to `12`. 
For PDF only. - `"raptor"`: RAPTOR-specific settings. Defaults to: `{"use_raptor": false}`. - - If `"chunk_method"` is `"qa"`, `"manuel"`, `"paper"`, `"book"`, `"laws"`, or `"presentation"`, the `"parser_config"` object contains the following attribute: + - If `"chunk_method"` is `"qa"`, `"manual"`, `"paper"`, `"book"`, `"laws"`, or `"presentation"`, the `"parser_config"` object contains the following attribute: - `"raptor"`: RAPTOR-specific settings. Defaults to: `{"use_raptor": false}`. - If `"chunk_method"` is `"table"`, `"picture"`, `"one"`, or `"email"`, `"parser_config"` is an empty JSON object. -- `"enabled"`: (*Body parameter*), `integer` - Whether the document should be **available** in the knowledge base. - - `1` → (available) - - `0` → (unavailable) +- `"enabled"`: (*Body parameter*), `integer` + Whether the document should be **available** in the knowledge base. + - `1` → (available) + - `0` → (unavailable) #### Response @@ -1640,9 +1640,9 @@ curl --request GET \ ##### Request parameters -- `dataset_id`: (*Path parameter*) +- `dataset_id`: (*Path parameter*) The associated dataset ID. -- `documents_id`: (*Path parameter*) +- `documents_id`: (*Path parameter*) The ID of the document to download. #### Response @@ -1690,30 +1690,30 @@ curl --request GET \ ##### Request parameters -- `dataset_id`: (*Path parameter*) +- `dataset_id`: (*Path parameter*) The associated dataset ID. -- `keywords`: (*Filter parameter*), `string` +- `keywords`: (*Filter parameter*), `string` The keywords used to match document titles. - `page`: (*Filter parameter*), `integer` Specifies the page on which the documents will be displayed. Defaults to `1`. -- `page_size`: (*Filter parameter*), `integer` +- `page_size`: (*Filter parameter*), `integer` The maximum number of documents on each page. Defaults to `30`. -- `orderby`: (*Filter parameter*), `string` +- `orderby`: (*Filter parameter*), `string` The field by which documents should be sorted. 
Available options: - `create_time` (default) - `update_time` -- `desc`: (*Filter parameter*), `boolean` +- `desc`: (*Filter parameter*), `boolean` Indicates whether the retrieved documents should be sorted in descending order. Defaults to `true`. -- `id`: (*Filter parameter*), `string` +- `id`: (*Filter parameter*), `string` The ID of the document to retrieve. -- `create_time_from`: (*Filter parameter*), `integer` +- `create_time_from`: (*Filter parameter*), `integer` Unix timestamp for filtering documents created after this time. 0 means no filter. Defaults to `0`. -- `create_time_to`: (*Filter parameter*), `integer` +- `create_time_to`: (*Filter parameter*), `integer` Unix timestamp for filtering documents created before this time. 0 means no filter. Defaults to `0`. -- `suffix`: (*Filter parameter*), `array[string]` +- `suffix`: (*Filter parameter*), `array[string]` Filter by file suffix. Supports multiple values, e.g., `pdf`, `txt`, and `docx`. Defaults to all suffixes. -- `run`: (*Filter parameter*), `array[string]` - Filter by document processing status. Supports numeric, text, and mixed formats: +- `run`: (*Filter parameter*), `array[string]` + Filter by document processing status. Supports numeric, text, and mixed formats: - Numeric format: `["0", "1", "2", "3", "4"]` - Text format: `[UNSTART, RUNNING, CANCEL, DONE, FAIL]` - Mixed format: `[UNSTART, 1, DONE]` (mixing numeric and text formats) @@ -1722,7 +1722,7 @@ curl --request GET \ - `1` / `RUNNING`: Document is currently being processed - `2` / `CANCEL`: Document processing was cancelled - `3` / `DONE`: Document processing completed successfully - - `4` / `FAIL`: Document processing failed + - `4` / `FAIL`: Document processing failed Defaults to all statuses. - `metadata_condition`: (*Filter parameter*), `object` (JSON in query) Optional metadata filter applied to documents when `document_ids` is not provided. 
Uses the same structure as retrieval: @@ -1847,13 +1847,13 @@ curl --request DELETE \ ##### Request parameters -- `dataset_id`: (*Path parameter*) +- `dataset_id`: (*Path parameter*) The associated dataset ID. -- `"ids"`: (*Body parameter*), `list[string]` +- `"ids"`: (*Body parameter*), `list[string]` The IDs of the documents to delete. - If omitted, or set to `null` or an empty array, no documents are deleted. - If an array of IDs is provided, only the documents matching those IDs are deleted. -- `"delete_all"`: (*Body parameter*), `boolean` +- `"delete_all"`: (*Body parameter*), `boolean` Whether to delete all documents in the specified dataset when `"ids"` is omitted, or set to `null` or an empty array. Defaults to `false`. #### Response @@ -1908,9 +1908,9 @@ curl --request POST \ ##### Request parameters -- `dataset_id`: (*Path parameter*) +- `dataset_id`: (*Path parameter*) The dataset ID. -- `"document_ids"`: (*Body parameter*), `list[string]`, *Required* +- `"document_ids"`: (*Body parameter*), `list[string]`, *Required* The IDs of the documents to parse. #### Response @@ -1965,9 +1965,9 @@ curl --request DELETE \ ##### Request parameters -- `dataset_id`: (*Path parameter*) +- `dataset_id`: (*Path parameter*) The associated dataset ID. -- `"document_ids"`: (*Body parameter*), `list[string]`, *Required* +- `"document_ids"`: (*Body parameter*), `list[string]`, *Required* The IDs of the documents for which the parsing should be stopped. #### Response @@ -2006,12 +2006,13 @@ Adds a chunk to a specified document in a specified dataset. 
- Method: POST - URL: `/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks` - Headers: - - `'content-Type: application/json'` + - `'Content-Type: application/json'` - `'Authorization: Bearer '` - Body: - `"content"`: `string` - `"important_keywords"`: `list[string]` - `"tag_kwd"`: `list[string]` + - `"questions"`: `list[string]` - `"image_base64"`: `string` ##### Request example @@ -2032,18 +2033,18 @@ curl --request POST \ - `dataset_id`: (*Path parameter*) The associated dataset ID. -- `document_ids`: (*Path parameter*) +- `document_id`: (*Path parameter*) The associated document ID. - `"content"`: (*Body parameter*), `string`, *Required* The text content of the chunk. -- `"important_keywords`(*Body parameter*), `list[string]` +- `"important_keywords"`: (*Body parameter*), `list[string]` The key terms or phrases to tag with the chunk. - `"tag_kwd"`: (*Body parameter*), `list[string]` Tag keywords to associate with the chunk. -- `"questions"`(*Body parameter*), `list[string]` - If there is a given question, the embedded chunks will be based on them +- `"questions"`: (*Body parameter*), `list[string]` + Optional questions to use when embedding the chunk. - `"image_base64"`: (*Body parameter*), `string` - A base64-encoded image to associate with the chunk. If the chunk already has an image, the new image will be vertically concatenated below the existing one. + A base64-encoded image to associate with the chunk. #### Response @@ -2098,23 +2099,23 @@ Lists chunks in a specified document. ```bash curl --request GET \ --url http://{address}/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks?keywords={keywords}&page={page}&page_size={page_size}&id={chunk_id} \ - --header 'Authorization: Bearer ' + --header 'Authorization: Bearer ' ``` ##### Request parameters -- `dataset_id`: (*Path parameter*) +- `dataset_id`: (*Path parameter*) The associated dataset ID. 
-- `document_id`: (*Path parameter*) +- `document_id`: (*Path parameter*) The associated document ID. -- `keywords`(*Filter parameter*), `string` +- `keywords`: (*Filter parameter*), `string` The keywords used to match chunk content. -- `page`(*Filter parameter*), `integer` +- `page`: (*Filter parameter*), `integer` Specifies the page on which the chunks will be displayed. Defaults to `1`. -- `page_size`(*Filter parameter*), `integer` - The maximum number of chunks on each page. Defaults to `1024`. -- `id`(*Filter parameter*), `string` - The ID of the chunk to retrieve. +- `page_size`: (*Filter parameter*), `integer` + The maximum number of chunks on each page. Defaults to `30`. +- `id`: (*Filter parameter*), `string` + The ID of the chunk to retrieve. You can also use `GET /api/v1/datasets/{dataset_id}/documents/{document_id}/chunks/{chunk_id}` to retrieve one chunk. #### Response @@ -2132,11 +2133,9 @@ Success: "document_id": "b330ec2e91ec11efbc510242ac120004", "id": "b48c170e90f70af998485c1065490726", "image_id": "", - "important_keywords": "", + "important_keywords": [], "tag_kwd": [], - "positions": [ - "" - ] + "positions": [] } ], "doc": { @@ -2188,6 +2187,68 @@ Failure: --- +### Get chunk + +**GET** `/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks/{chunk_id}` + +Retrieves a specified chunk in a specified document. Runtime fields such as vector and token fields are not returned. + +#### Request + +- Method: GET +- URL: `/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks/{chunk_id}` +- Headers: + - `'Authorization: Bearer '` + +##### Request example + +```bash +curl --request GET \ + --url http://{address}/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks/{chunk_id} \ + --header 'Authorization: Bearer ' +``` + +##### Request parameters + +- `dataset_id`: (*Path parameter*) + The associated dataset ID. +- `document_id`: (*Path parameter*) + The associated document ID. +- `chunk_id`: (*Path parameter*) + The ID of the chunk to retrieve.
+ +#### Response + +Success: + +```json +{ + "code": 0, + "data": { + "available_int": 1, + "content_with_weight": "This is a test content.", + "doc_id": "b330ec2e91ec11efbc510242ac120004", + "docnm_kwd": "1.txt", + "id": "b48c170e90f70af998485c1065490726", + "img_id": "", + "important_kwd": [], + "question_kwd": [], + "tag_kwd": [] + } +} +``` + +Failure: + +```json +{ + "code": 100, + "message": "Chunk not found" +} +``` + +--- + ### Delete chunks **DELETE** `/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks` @@ -2199,7 +2260,7 @@ Deletes chunks by ID. - Method: DELETE - URL: `/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks` - Headers: - - `'content-Type: application/json'` + - `'Content-Type: application/json'` - `'Authorization: Bearer '` - Body: - `"chunk_ids"`: `list[string]` @@ -2230,16 +2291,16 @@ curl --request DELETE \ ##### Request parameters -- `dataset_id`: (*Path parameter*) +- `dataset_id`: (*Path parameter*) The associated dataset ID. -- `document_ids`: (*Path parameter*) +- `document_id`: (*Path parameter*) The associated document ID. -- `"chunk_ids"`: (*Body parameter*), `list[string]` +- `"chunk_ids"`: (*Body parameter*), `list[string]` The IDs of the chunks to delete. - If omitted, or set to `null` or an empty array, no chunks are deleted. - If an array of IDs is provided, only the chunks matching those IDs are deleted. -- `"delete_all"`: (*Body parameter*), `boolean` - Whether to delete all chunks of the specified documen when `"chunk_ids"` is omitted, or set to`null` or an empty array. Defaults to `false`. +- `"delete_all"`: (*Body parameter*), `boolean` + Whether to delete all chunks of the specified document when `"chunk_ids"` is omitted, or set to `null` or an empty array. Defaults to `false`. 
#### Response @@ -2256,7 +2317,7 @@ Failure: ```json { "code": 102, - "message": "`chunk_ids` is required" + "message": "rm_chunk deleted chunks 0, expect 1" } ``` @@ -2264,55 +2325,64 @@ Failure: ### Update chunk -**PUT** `/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks/{chunk_id}` +**PATCH** `/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks/{chunk_id}` Updates content or configurations for a specified chunk. #### Request -- Method: PUT +- Method: PATCH - URL: `/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks/{chunk_id}` - Headers: - - `'content-Type: application/json'` + - `'Content-Type: application/json'` - `'Authorization: Bearer '` - Body: - `"content"`: `string` - `"important_keywords"`: `list[string]` + - `"questions"`: `list[string]` + - `"positions"`: `list` - `"tag_kwd"`: `list[string]` - `"available"`: `boolean` + - `"image_base64"`: `string` ##### Request example ```bash -curl --request PUT \ +curl --request PATCH \ --url http://{address}/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks/{chunk_id} \ --header 'Content-Type: application/json' \ --header 'Authorization: Bearer ' \ --data ' - { - "content": "ragflow123", - "important_keywords": [] + { + "content": "ragflow123", + "important_keywords": [] }' ``` ##### Request parameters -- `dataset_id`: (*Path parameter*) +- `dataset_id`: (*Path parameter*) The associated dataset ID. -- `document_ids`: (*Path parameter*) +- `document_id`: (*Path parameter*) The associated document ID. -- `chunk_id`: (*Path parameter*) +- `chunk_id`: (*Path parameter*) The ID of the chunk to update. -- `"content"`: (*Body parameter*), `string` +- `"content"`: (*Body parameter*), `string` The text content of the chunk. -- `"important_keywords"`: (*Body parameter*), `list[string]` +- `"important_keywords"`: (*Body parameter*), `list[string]` A list of key terms or phrases to tag with the chunk. 
-- `"tag_kwd"`: (*Body parameter*), `list[string]` +- `"questions"`: (*Body parameter*), `list[string]` + Optional questions to use when embedding the chunk. +- `"positions"`: (*Body parameter*), `list` + Updated source positions for the chunk. +- `"tag_kwd"`: (*Body parameter*), `list[string]` Updated tag keywords. -- `"available"`: (*Body parameter*) `boolean` - The chunk's availability status in the dataset. Value options: +- `"available"`: (*Body parameter*), `boolean` + The chunk's availability status in the dataset. Value options: - `true`: Available (default) - `false`: Unavailable +- `"image_base64"`: (*Body parameter*), `string` + Base64-encoded image content to associate with the chunk. #### Response @@ -2337,14 +2407,14 @@ Failure: ### Update chunk availability -**POST** `/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks/switch` +**PATCH** `/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks` Updates or switches the availability status of specified chunks, controlling whether they are available for retrieval. #### Request -- Method: POST -- URL: `/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks/switch` +- Method: PATCH +- URL: `/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks` - Headers: - `'Content-Type: application/json'` - `'Authorization: Bearer '` @@ -2356,8 +2426,8 @@ Updates or switches the availability status of specified chunks, controlling whe ##### Request example ```bash -curl --request POST \ - --url http://{address}/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks/switch \ +curl --request PATCH \ + --url http://{address}/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks \ --header 'Content-Type: application/json' \ --header 'Authorization: Bearer ' \ --data ' @@ -2369,18 +2439,18 @@ curl --request POST \ ##### Request parameters -- `dataset_id`: (*Path parameter*) +- `dataset_id`: (*Path parameter*) The ID of the dataset.
-- `document_id`: (*Path parameter*) +- `document_id`: (*Path parameter*) The ID of the document. -- `"chunk_ids"`: (*Body parameter*), `list[string]` (*Required*) +- `"chunk_ids"`: (*Body parameter*), `list[string]` (*Required*) IDs of the chunks whose availability status is to be updated. -- `"available_int"`: (*Body parameter*), `integer` (*Optional*) - Availability status for the specified chunks. Mutually exclusive with `"available"`. You must provide either `available_int` or `available`, *not* both. +- `"available_int"`: (*Body parameter*), `integer` (*Optional*) + Availability status for the specified chunks. You must provide either `"available_int"` or `"available"`. If both are provided, `"available_int"` is used. - `1`: Available, - `0`: Unavailable. -- `"available"`: (*Body parameter*), `boolean` (*Optional*) - Availability status of the specified chunks. Mutually exclusive with `"available_int"`. You must provide either `available` or `available_int`, *not* both. +- `"available"`: (*Body parameter*), `boolean` (*Optional*) + Availability status of the specified chunks. Used when `"available_int"` is not provided. - `true`: Available, - `false`: Unavailable. @@ -2399,35 +2469,35 @@ Failure: ```json { - "code": 101, + "code": 102, "message": "You don't own the dataset {dataset_id}." } ``` ```json { - "code": 101, + "code": 102, "message": "`chunk_ids` is required." } ``` ```json { - "code": 101, + "code": 102, "message": "`available_int` or `available` is required." } ``` ```json { - "code": 101, + "code": 102, "message": "Document not found!" } ``` ```json { - "code": 101, + "code": 102, "message": "Index updating failure" } ``` @@ -2491,18 +2561,18 @@ Batch update or delete document-level metadata within a specified dataset. If bo #### Request parameters -- `dataset_id`: (*Path parameter*) +- `dataset_id`: (*Path parameter*) The associated dataset ID. 
-- `"selector"`: (*Body parameter*), `object`, *optional* - A document selector: - - `"document_ids"`: `list[string]` *optional* - The associated document ID. - - `"metadata_condition"`: `object`, *optional* +- `"selector"`: (*Body parameter*), `object`, *optional* + A document selector: + - `"document_ids"`: `list[string]` *optional* + The associated document ID. + - `"metadata_condition"`: `object`, *optional* - `"logic"`: Defines the logic relation between conditions if multiple conditions are provided. Options: - `"and"` (default) - `"or"` - - `"conditions"`: `list[object]` *optional* - Each object: `{ "name": string, "comparison_operator": string, "value": string }` + - `"conditions"`: `list[object]` *optional* + Each object: `{ "name": string, "comparison_operator": string, "value": string }` - `"name"`: `string` The key name to search by. - `"comparison_operator"`: `string` Available options: - `"is"` @@ -2519,14 +2589,14 @@ Batch update or delete document-level metadata within a specified dataset. If bo - `"≤"` - `"empty"` - `"not empty"` - - `"value"`: `string` The key value to search by. -- `"updates"`: (*Body parameter*), `list[object]`, *optional* - Replaces metadata of the retrieved documents. Each object: `{ "key": string, "match": string, "value": string }`. + - `"value"`: `string` The key value to search by. +- `"updates"`: (*Body parameter*), `list[object]`, *optional* + Replaces metadata of the retrieved documents. Each object: `{ "key": string, "match": string, "value": string }`. - `"key"`: `string` The name of the key to update. - `"match"`: `string` *optional* The current value of the key to update. When omitted, the corresponding keys are updated to `"value"` regardless of their current values. - `"value"`: `string` The new value to set for the specified keys. -- `"deletes`: (*Body parameter*), `list[ojbect]`, *optional* - Deletes metadata of the retrieved documents. Each object: `{ "key": string, "value": string }`. 
+- `"deletes"`: (*Body parameter*), `list[object]`, *optional* + Deletes metadata of the retrieved documents. Each object: `{ "key": string, "value": string }`. - `"key"`: `string` The name of the key to delete. - `"value"`: `string` *Optional* The value of the key to delete. - When provided, only keys with a matching value are deleted. @@ -2588,16 +2658,16 @@ Retrieves chunks from specified datasets. - `'content-Type: application/json'` - `'Authorization: Bearer '` - Body: - - `"question"`: `string` - - `"dataset_ids"`: `list[string]` + - `"question"`: `string` + - `"dataset_ids"`: `list[string]` - `"document_ids"`: `list[string]` - - `"page"`: `integer` - - `"page_size"`: `integer` - - `"similarity_threshold"`: `float` - - `"vector_similarity_weight"`: `float` - - `"top_k"`: `integer` - - `"rerank_id"`: `string` - - `"keyword"`: `boolean` + - `"page"`: `integer` + - `"page_size"`: `integer` + - `"similarity_threshold"`: `float` + - `"vector_similarity_weight"`: `float` + - `"top_k"`: `integer` + - `"rerank_id"`: `string` + - `"keyword"`: `boolean` - `"highlight"`: `boolean` - `"cross_languages"`: `list[string]` - `"metadata_condition"`: `object` @@ -2636,45 +2706,45 @@ curl --request POST \ ##### Request parameter -- `"question"`: (*Body parameter*), `string`, *Required* +- `"question"`: (*Body parameter*), `string`, *Required* The user query or query keywords. -- `"dataset_ids"`: (*Body parameter*) `list[string]` +- `"dataset_ids"`: (*Body parameter*) `list[string]` The IDs of the datasets to search. If you do not set this argument, ensure that you set `"document_ids"`. -- `"document_ids"`: (*Body parameter*), `list[string]` +- `"document_ids"`: (*Body parameter*), `list[string]` The IDs of the documents to search. Ensure that all selected documents use the same embedding model. Otherwise, an error will occur. If you do not set this argument, ensure that you set `"dataset_ids"`. 
-- `"page"`: (*Body parameter*), `integer` +- `"page"`: (*Body parameter*), `integer` Specifies the page on which the chunks will be displayed. Defaults to `1`. -- `"page_size"`: (*Body parameter*) +- `"page_size"`: (*Body parameter*), `integer` The maximum number of chunks on each page. Defaults to `30`. -- `"similarity_threshold"`: (*Body parameter*) +- `"similarity_threshold"`: (*Body parameter*), `float` The minimum similarity score. Defaults to `0.2`. -- `"vector_similarity_weight"`: (*Body parameter*), `float` +- `"vector_similarity_weight"`: (*Body parameter*), `float` The weight of vector cosine similarity. Defaults to `0.3`. If x represents the weight of vector cosine similarity, then (1 - x) is the term similarity weight. -- `"top_k"`: (*Body parameter*), `integer` +- `"top_k"`: (*Body parameter*), `integer` The number of chunks engaged in vector cosine computation. Defaults to `1024`. -- `"use_kg"`: (*Body parameter*), `boolean` +- `"use_kg"`: (*Body parameter*), `boolean` Whether to search chunks related to the generated knowledge graph for multi-hop queries. Defaults to `False`. Before enabling this, ensure you have successfully constructed a knowledge graph for the specified datasets. See [here](../guides/dataset/advanced/construct_knowledge_graph.md) for details. -- `"toc_enhance"`: (*Body parameter*), `boolean` +- `"toc_enhance"`: (*Body parameter*), `boolean` Whether to search chunks with extracted table of content. Defaults to `False`. Before enabling this, ensure you have enabled `TOC_Enhance` and successfully extracted table of contents for the specified datasets. See [here](https://ragflow.io/docs/dev/enable_table_of_contents) for details. -- `"rerank_id"`: (*Body parameter*), `integer` +- `"rerank_id"`: (*Body parameter*), `string` The ID of the rerank model.
-- `"keyword"`: (*Body parameter*), `boolean` - Indicates whether to enable keyword-based matching: +- `"keyword"`: (*Body parameter*), `boolean` + Indicates whether to enable keyword-based matching: - `true`: Enable keyword-based matching. - `false`: Disable keyword-based matching (default). -- `"highlight"`: (*Body parameter*), `boolean` - Specifies whether to enable highlighting of matched terms in the results: +- `"highlight"`: (*Body parameter*), `boolean` + Specifies whether to enable highlighting of matched terms in the results: - `true`: Enable highlighting of matched terms. - `false`: Disable highlighting of matched terms (default). -- `"cross_languages"`: (*Body parameter*) `list[string]` +- `"cross_languages"`: (*Body parameter*) `list[string]` The languages that should be translated into, in order to achieve keywords retrievals in different languages. -- `"metadata_condition"`: (*Body parameter*), `object` - The metadata condition used for filtering chunks: +- `"metadata_condition"`: (*Body parameter*), `object` + The metadata condition used for filtering chunks: - `"logic"`: (*Body parameter*), `string` - `"and"`: Return only results that satisfy *every* condition (default). - `"or"`: Return results that satisfy *any* condition. - - `"conditions"`: (*Body parameter*), `array` - A list of metadata filter conditions. + - `"conditions"`: (*Body parameter*), `array` + A list of metadata filter conditions. - `"name"`: `string` - The metadata field name to filter by, e.g., `"author"`, `"company"`, `"url"`. Ensure this parameter before use. See [Set metadata](../guides/dataset/set_metadata.md) for details. - `comparison_operator`: `string` - The comparison operator. Can be one of: - `"contains"` @@ -2783,9 +2853,9 @@ curl --request POST \ ##### Request parameters -- `"name"`: (*Body parameter*), `string`, *Required* +- `"name"`: (*Body parameter*), `string`, *Required* The name of the chat assistant. 
-- `"icon"`: (*Body parameter*), `string` +- `"icon"`: (*Body parameter*), `string` Base64 encoding of the avatar. - `"dataset_ids"`: (*Body parameter*), `list[string]` The unique identifiers for the associated datasets. If omitted or set to `[]`, an empty chat assistant is created; datasets can be attached at a later time. @@ -2793,21 +2863,21 @@ curl --request POST \ The identifier of the chat model. If not specified, the system defaults to the user's pre-configured chat model. - `"llm_setting"`: (*Body parameter*), `object` A configuration object defining the LLM parameters for the assistant. The `llm_setting` object may contain the following attributes: - - `"model_type"`: `string` + - `"model_type"`: `string` A model type specifier. Only `"chat"` and `"image2text"` are recognized; any other inputs, or when omitted, are treated as `"chat"`. - - `"temperature"`: `float` - Controls the randomness of the model's predictions. A lower temperature results in more conservative responses, while a higher temperature yields more creative and diverse responses. Defaults to `0.1`. - - `"top_p"`: `float` - Also known as “nucleus sampling”, this parameter sets a threshold to select a smaller set of words to sample from. It focuses on the most likely words, cutting off the less probable ones. Defaults to `0.3` - - `"presence_penalty"`: `float` + - `"temperature"`: `float` + Controls the randomness of the model's predictions. A lower temperature results in more conservative responses, while a higher temperature yields more creative and diverse responses. Defaults to `0.1`. + - `"top_p"`: `float` + Also known as “nucleus sampling”, this parameter sets a threshold to select a smaller set of words to sample from. It focuses on the most likely words, cutting off the less probable ones. Defaults to `0.3` + - `"presence_penalty"`: `float` This discourages the model from repeating the same information by penalizing words that have already appeared in the conversation. 
Defaults to `0.4`. - - `"frequency penalty"`: `float` + - `"frequency penalty"`: `float` Similar to the presence penalty, this reduces the model’s tendency to repeat the same words frequently. Defaults to `0.7`. -- `"prompt_config"`: (*Body parameter*), `object` - Instructions for the LLM to follow. A `prompt_config` object may contain the following attributes: +- `"prompt_config"`: (*Body parameter*), `object` + Instructions for the LLM to follow. A `prompt_config` object may contain the following attributes: - `"system"`: `string` The prompt content. - `"prologue"`: `string` The opening greeting for the user. - - `"parameters"`: `object[]` This argument lists the variables to use in the system prompt. Note that: + - `"parameters"`: `object[]` This argument lists the variables to use in the system prompt. Note that: - `"knowledge"` is a reserved variable, which represents the retrieved chunks. - All the variables in `"system"` should be curly bracketed. - `"empty_response"`: `string` If nothing is retrieved in the dataset for the user's question, this will be used as the response. To allow the LLM to improvise when nothing is found, leave this blank. @@ -2944,27 +3014,27 @@ curl --request PUT \ #### Parameters -- `chat_id`: (*Path parameter*) +- `chat_id`: (*Path parameter*) The ID of the chat assistant to update. -- `"name"`: (*Body parameter*), `string`, *Required* +- `"name"`: (*Body parameter*), `string`, *Required* The revised name of the chat assistant. -- `"icon"`: (*Body parameter*), `string` +- `"icon"`: (*Body parameter*), `string` Base64 encoding of the avatar. - `"dataset_ids"`: (*Body parameter*), `list[string]` The IDs of the associated datasets. -- `"llm_id"`: (*Body parameter*), `string` - The chat model name. If not set, the user's default chat model is used. -- `"llm_setting"`: (*Body parameter*), `object` - The LLM settings for the chat assistant. 
An `llm_setting` object contains the following attributes: +- `"llm_id"`: (*Body parameter*), `string` + The chat model name. If not set, the user's default chat model is used. +- `"llm_setting"`: (*Body parameter*), `object` + The LLM settings for the chat assistant. An `llm_setting` object contains the following attributes: - `"model_type"`: `string` A model type specifier. Supported values are `"chat"` and `"image2text"`. If the field is omitted or an unrecognized value is provided, it defaults to `"chat"`. - - `"temperature"`: `float` - Controls the randomness of the model's predictions. A lower temperature results in more conservative responses, while a higher temperature yields more creative and diverse responses. Defaults to `0.1`. - - `"top_p"`: `float` - Also known as “nucleus sampling”, this parameter sets a threshold to select a smaller set of words to sample from. It focuses on the most likely words, cutting off the less probable ones. Defaults to `0.3` - - `"presence_penalty"`: `float` + - `"temperature"`: `float` + Controls the randomness of the model's predictions. A lower temperature results in more conservative responses, while a higher temperature yields more creative and diverse responses. Defaults to `0.1`. + - `"top_p"`: `float` + Also known as “nucleus sampling”, this parameter sets a threshold to select a smaller set of words to sample from. It focuses on the most likely words, cutting off the less probable ones. Defaults to `0.3` + - `"presence_penalty"`: `float` This discourages the model from repeating the same information by penalizing words that have already appeared in the conversation. Defaults to `0.4`. - - `"frequency penalty"`: `float` + - `"frequency penalty"`: `float` Similar to the presence penalty, this reduces the model’s tendency to repeat the same words frequently. Defaults to `0.7`. 
- `"prompt_config"`: (*Body parameter*), `object` - `"similarity_threshold"`: (*Body parameter*), `float` @@ -3252,11 +3322,11 @@ curl --request DELETE \ ##### Request parameters -- `"ids"`: (*Body parameter*), `list[string]` +- `"ids"`: (*Body parameter*), `list[string]` The IDs of the chat assistants to delete. - If omitted, or set to `null` or an empty array, no chat assistants are deleted. - If an array of IDs is provided, only the chat assistants matching those IDs are deleted. -- `"delete_all"`: (*Body parameter*), `boolean` +- `"delete_all"`: (*Body parameter*), `boolean` Whether to delete all chat assistants owned by the current user when `"ids"` is omitted, or set to`null` or an empty array. Defaults to `false`. #### Response @@ -3425,11 +3495,11 @@ curl --request POST \ ##### Request parameters -- `chat_id`: (*Path parameter*) +- `chat_id`: (*Path parameter*) The ID of the associated chat assistant. -- `"name"`: (*Body parameter*), `string` +- `"name"`: (*Body parameter*), `string` The name of the chat session to create. -- `"user_id"`: (*Body parameter*), `string` +- `"user_id"`: (*Body parameter*), `string` Optional user-defined ID. #### Response @@ -3566,23 +3636,23 @@ curl --request GET \ ##### Request Parameters -- `chat_id`: (*Path parameter*) +- `chat_id`: (*Path parameter*) The ID of the associated chat assistant. -- `page`: (*Filter parameter*), `integer` +- `page`: (*Filter parameter*), `integer` Specifies the page on which the sessions will be displayed. Defaults to `1`. -- `page_size`: (*Filter parameter*), `integer` +- `page_size`: (*Filter parameter*), `integer` The number of sessions on each page. Defaults to `30`. If set to `0`, an empty list is returned. -- `orderby`: (*Filter parameter*), `string` - The field by which sessions should be sorted. Available options: +- `orderby`: (*Filter parameter*), `string` + The field by which sessions should be sorted. 
Available options: - `create_time` (default) - `update_time` -- `desc`: (*Filter parameter*), `boolean` +- `desc`: (*Filter parameter*), `boolean` Indicates whether the retrieved sessions should be sorted in descending order. Defaults to `true`. -- `name`: (*Filter parameter*) `string` +- `name`: (*Filter parameter*) `string` The name of the chat session to retrieve. -- `id`: (*Filter parameter*), `string` +- `id`: (*Filter parameter*), `string` The ID of the chat session to retrieve. -- `user_id`: (*Filter parameter*), `string` +- `user_id`: (*Filter parameter*), `string` The optional user-defined ID passed in when creating session. #### Response @@ -3648,9 +3718,9 @@ curl --request GET \ ##### Request Parameters -- `chat_id`: (*Path parameter*) +- `chat_id`: (*Path parameter*) The ID of the associated chat assistant. -- `session_id`: (*Path parameter*) +- `session_id`: (*Path parameter*) The ID of the session to retrieve. #### Response @@ -3710,11 +3780,11 @@ curl --request DELETE \ ##### Request Parameters -- `chat_id`: (*Path parameter*) +- `chat_id`: (*Path parameter*) The ID of the associated chat assistant. -- `session_id`: (*Path parameter*) +- `session_id`: (*Path parameter*) The ID of the session that owns the message. -- `msg_id`: (*Path parameter*) +- `msg_id`: (*Path parameter*) The ID of the message to delete. #### Response @@ -3776,15 +3846,15 @@ curl --request PUT \ ##### Request Parameters -- `chat_id`: (*Path parameter*) +- `chat_id`: (*Path parameter*) The ID of the associated chat assistant. -- `session_id`: (*Path parameter*) +- `session_id`: (*Path parameter*) The ID of the session that owns the message. -- `msg_id`: (*Path parameter*) +- `msg_id`: (*Path parameter*) The ID of the assistant message to update. -- `"thumbup"`: (*Body parameter*), `boolean` +- `"thumbup"`: (*Body parameter*), `boolean` Whether the assistant message is marked as positive feedback. 
-- `"feedback"`: (*Body parameter*), `string` +- `"feedback"`: (*Body parameter*), `string` Optional feedback text, typically used when `"thumbup"` is `false`. #### Response @@ -3863,13 +3933,13 @@ curl --request DELETE \ ##### Request Parameters -- `chat_id`: (*Path parameter*) +- `chat_id`: (*Path parameter*) The ID of the associated chat assistant. -- `"ids"`: (*Body Parameter*), `list[string]` +- `"ids"`: (*Body Parameter*), `list[string]` The IDs of the sessions to delete. - If omitted, or set to `null` or an empty array, no sessions are deleted. - If an array of IDs is provided, only the sessions matching those IDs are deleted. -- `"delete_all"`: (*Body Parameter*), `boolean` +- `"delete_all"`: (*Body Parameter*), `boolean` Whether to delete all sessions of the specified chat assistant when `"ids"` is omitted, or set to `null` or an empty array. Defaults to `false`. #### Response @@ -3975,17 +4045,17 @@ curl --request POST \ ##### Request Parameters -- `"messages"`: (*Body Parameter*), `list[object]`, *Required* +- `"messages"`: (*Body Parameter*), `list[object]`, *Required* The conversation messages sent to the model. -- `"stream"`: (*Body Parameter*), `boolean` +- `"stream"`: (*Body Parameter*), `boolean` Indicates whether to output responses in a streaming way: - `true`: Enable streaming (default). - `false`: Disable streaming. -- `"chat_id"`: (*Body Parameter*) +- `"chat_id"`: (*Body Parameter*) Optional chat assistant ID. If omitted, the tenant's default chat model is used directly. -- `"session_id"`: (*Body Parameter*) +- `"session_id"`: (*Body Parameter*) Optional session ID. If `chat_id` is provided but `session_id` is omitted, a new session will be generated automatically. -- `"llm_id"`: (*Body Parameter*), `string` +- `"llm_id"`: (*Body Parameter*), `string` Optional model override when a specific chat model should be used for this request. 
#### Response @@ -4136,9 +4206,9 @@ curl --request POST \ ##### Request parameters -- `agent_id`: (*Path parameter*) +- `agent_id`: (*Path parameter*) The ID of the associated agent. -- `user_id`: (*Filter parameter*) +- `user_id`: (*Filter parameter*) The optional user-defined ID for parsing docs (especially images) when creating a session while uploading files. #### Response @@ -4350,7 +4420,7 @@ Failure: ### Converse with agent -**POST** `/api/v1/agents/{agent_id}/completions` +**POST** `/api/v1/agents/{agent_id}/completions` Asks a specified agent a question to start an AI-powered conversation. @@ -4413,7 +4483,7 @@ curl --request POST \ }' ``` -- If the **Begin** component takes parameters, include their values in the body of `"inputs"` as follows: +- If the **Begin** component takes parameters, include their values in the body of `"inputs"` as follows: ```bash curl --request POST \ @@ -4466,24 +4536,24 @@ curl --request POST \ ##### Request Parameters -- `agent_id`: (*Path parameter*), `string` +- `agent_id`: (*Path parameter*), `string` The ID of the associated agent. -- `"question"`: (*Body Parameter*), `string`, *Required* +- `"question"`: (*Body Parameter*), `string`, *Required* The question to start an AI-powered conversation. -- `"stream"`: (*Body Parameter*), `boolean` - Indicates whether to output responses in a streaming way: +- `"stream"`: (*Body Parameter*), `boolean` + Indicates whether to output responses in a streaming way: - `true`: Enable streaming (default). - `false`: Disable streaming. -- `"session_id"`: (*Body Parameter*) +- `"session_id"`: (*Body Parameter*) The ID of the session. If it is not provided, a new session will be generated. -- `"inputs"`: (*Body Parameter*) - Variables specified in the **Begin** component. -- `"user_id"`: (*Body parameter*), `string` +- `"inputs"`: (*Body Parameter*) + Variables specified in the **Begin** component. +- `"user_id"`: (*Body parameter*), `string` The optional user-defined ID. 
Valid *only* when no `session_id` is provided. :::tip NOTE -For now, this method does *not* support a file type input/variable. As a workaround, use the following to upload a file to an agent: -`http://{address}/v1/canvas/upload/{agent_id}` +For now, this method does *not* support a file type input/variable. As a workaround, use the following to upload a file to an agent: +`http://{address}/v1/canvas/upload/{agent_id}` *You will get a corresponding file ID from its response body.* ::: @@ -5034,23 +5104,23 @@ curl --request GET \ ##### Request Parameters -- `agent_id`: (*Path parameter*) +- `agent_id`: (*Path parameter*) The ID of the associated agent. -- `page`: (*Filter parameter*), `integer` +- `page`: (*Filter parameter*), `integer` Specifies the page on which the sessions will be displayed. Defaults to `1`. -- `page_size`: (*Filter parameter*), `integer` +- `page_size`: (*Filter parameter*), `integer` The number of sessions on each page. Defaults to `30`. -- `orderby`: (*Filter parameter*), `string` - The field by which sessions should be sorted. Available options: +- `orderby`: (*Filter parameter*), `string` + The field by which sessions should be sorted. Available options: - `create_time` (default) - `update_time` -- `desc`: (*Filter parameter*), `boolean` +- `desc`: (*Filter parameter*), `boolean` Indicates whether the retrieved sessions should be sorted in descending order. Defaults to `true`. -- `id`: (*Filter parameter*), `string` +- `id`: (*Filter parameter*), `string` The ID of the agent session to retrieve. -- `user_id`: (*Filter parameter*), `string` +- `user_id`: (*Filter parameter*), `string` The optional user-defined ID passed in when creating session. -- `dsl`: (*Filter parameter*), `boolean` +- `dsl`: (*Filter parameter*), `boolean` Indicates whether to include the dsl field of the sessions in the response. Defaults to `true`. 
#### Response @@ -5247,13 +5317,13 @@ curl --request DELETE \ ##### Request Parameters -- `agent_id`: (*Path parameter*) +- `agent_id`: (*Path parameter*) The ID of the associated agent. -- `"ids"`: (*Body Parameter*), `list[string]` +- `"ids"`: (*Body Parameter*), `list[string]` The IDs of the sessions to delete. - If omitted, or set to `null` or an empty array, no sessions are deleted. - If an array of IDs is provided, only the sessions matching those IDs are deleted. -- `"delete_all"`: (*Body Parameter*), `boolean` +- `"delete_all"`: (*Body Parameter*), `boolean` Whether to delete all sessions of the specified agent when `"ids"` is omitted, or set to `null` or an empty array. Defaults to `false`. #### Response @@ -5532,19 +5602,19 @@ curl --request GET \ ##### Request parameters -- `page`: (*Filter parameter*), `integer` +- `page`: (*Filter parameter*), `integer` Specifies the page on which the agents will be displayed. Defaults to `1`. -- `page_size`: (*Filter parameter*), `integer` +- `page_size`: (*Filter parameter*), `integer` The number of agents on each page. Defaults to `30`. -- `orderby`: (*Filter parameter*), `string` +- `orderby`: (*Filter parameter*), `string` The attribute by which the results are sorted. Available options: - `create_time` (default) - `update_time` -- `desc`: (*Filter parameter*), `boolean` +- `desc`: (*Filter parameter*), `boolean` Indicates whether the retrieved agents should be sorted in descending order. Defaults to `true`. -- `id`: (*Filter parameter*), `string` +- `id`: (*Filter parameter*), `string` The ID of the agent to retrieve. -- `title`: (*Filter parameter*), `string` +- `title`: (*Filter parameter*), `string` The name of the agent to retrieve. #### Response @@ -5656,11 +5726,11 @@ curl --request POST \ ##### Request parameters -- `title`: (*Body parameter*), `string`, *Required* +- `title`: (*Body parameter*), `string`, *Required* The title of the agent. 
-- `description`: (*Body parameter*), `string` +- `description`: (*Body parameter*), `string` The description of the agent. Defaults to `None`. -- `dsl`: (*Body parameter*), `object`, *Required* +- `dsl`: (*Body parameter*), `object`, *Required* The canvas DSL object of the agent. #### Response @@ -5722,13 +5792,13 @@ curl --request PUT \ ##### Request parameters -- `agent_id`: (*Path parameter*), `string` +- `agent_id`: (*Path parameter*), `string` The id of the agent to be updated. -- `title`: (*Body parameter*), `string` +- `title`: (*Body parameter*), `string` The title of the agent. -- `description`: (*Body parameter*), `string` +- `description`: (*Body parameter*), `string` The description of the agent. -- `dsl`: (*Body parameter*), `object` +- `dsl`: (*Body parameter*), `object` The canvas DSL object of the agent. Only specify the parameter you want to change in the request body. If a parameter does not exist or is `None`, it won't be updated. @@ -5782,7 +5852,7 @@ curl --request DELETE \ ##### Request parameters -- `agent_id`: (*Path parameter*), `string` +- `agent_id`: (*Path parameter*), `string` The id of the agent to be deleted. #### Response @@ -5828,7 +5898,7 @@ Create a new memory. - Body: - `"name"`: `string` - `"memory_type"`: `list[string]` - - `"embd_id"`: `string`. + - `"embd_id"`: `string`. - `"llm_id"`: `string` ##### Request example @@ -6130,13 +6200,13 @@ Failure: **GET** `/api/v1/memories/{memory_id}/config` -Get the configuration of a specified memory. +Get the configuration of a specified memory. #### Request - Method: GET - URL: `/api/v1/memories/{memory_id}/config` -- Headers: +- Headers: - `'Content-Type: application/json'` - `'Authorization: Bearer '` @@ -6214,7 +6284,7 @@ Delete a specified memory. - Method: DELETE - URL: `/api/v1/memories/{memory_id}` - Headers: -- Headers: +- Headers: - `'Content-Type: application/json'` - `'Authorization: Bearer '` @@ -6533,7 +6603,7 @@ Failure Update message status, enable or disable a message. 
Once a message is disabled, it will not be retrieved by agents. -#### Request +#### Request - Method: PUT - URL: `/api/v1/messages/{memory_id}:{message_id}` @@ -6613,11 +6683,11 @@ curl --location 'http://{address}/api/v1/messages/search?query=%22who%20are%20yo ##### Request parameters -- `question`: (*Filter parameter*), `string`, *Required* +- `question`: (*Filter parameter*), `string`, *Required* The search term or natural language question used to find relevant messages. -- `memory_id`: (*Filter parameter*), `string` or `list[string]`, *Required* +- `memory_id`: (*Filter parameter*), `string` or `list[string]`, *Required* The IDs of the memories to search. Supports multiple values. @@ -6711,7 +6781,7 @@ curl --location 'http://{address}/api/v1/messages?memory_id=6c8983badede11f083f1 ##### Request parameters -- `memory_id`: (*Filter parameter*), `string` or `list[string]`, *Required* +- `memory_id`: (*Filter parameter*), `string` or `list[string]`, *Required* The IDs of the memories to search. Supports multiple values. @@ -6870,7 +6940,7 @@ curl --request GET ##### Request parameters -- `address`: (*Path parameter*), string +- `address`: (*Path parameter*), string The host and port of the backend service (e.g., `localhost:7897`). --- @@ -6913,11 +6983,11 @@ Content-Type: application/json } ``` -Explanation: +Explanation: -- Each service is reported as "ok" or "nok". -- The top-level `status` reflects overall health. -- If any service is "nok", detailed error info appears in `_meta`. +- Each service is reported as "ok" or "nok". +- The top-level `status` reflects overall health. +- If any service is "nok", detailed error info appears in `_meta`. --- @@ -6956,9 +7026,9 @@ curl --request POST \ ##### Request parameters -- `'file'`: (*Form parameter*), `file`, *Required* +- `'file'`: (*Form parameter*), `file`, *Required* The file(s) to upload. Multiple files can be uploaded in a single request. 
-- `'parent_id'`: (*Form parameter*), `string` +- `'parent_id'`: (*Form parameter*), `string` The parent folder ID where the file will be uploaded. If not specified, files will be uploaded to the root folder. #### Response @@ -7033,9 +7103,9 @@ curl --request POST \ ##### Request parameters -- `'file'`: (*Form parameter*), `file`, *Optional* +- `'file'`: (*Form parameter*), `file`, *Optional* The file to upload. Mutually exclusive with `url`; either `file` or `url` must be provided. -- `url`: (*Query parameter*), `string`, *Optional* +- `url`: (*Query parameter*), `string`, *Optional* A URL to crawl and store as an attachment. Mutually exclusive with `file`; either `url` or `file` must be provided. #### Response @@ -7096,10 +7166,10 @@ curl --request GET \ ##### Request parameters -- `attachment_id`: (*Path parameter*), `string`, *Required* +- `attachment_id`: (*Path parameter*), `string`, *Required* The `id` value returned by the [Upload document](#upload-document) method. -- `ext`: (*Query parameter*), `string`, *Optional* - A file extension hint specifying the response's Content-Type. Defaults to `"markdown"`. Available values: +- `ext`: (*Query parameter*), `string`, *Optional* + A file extension hint specifying the response's Content-Type. Defaults to `"markdown"`. Available values: - `"markdown"` - `"html"` - `"pdf"` @@ -7158,11 +7228,11 @@ curl --request POST \ ##### Request parameters -- `"name"`: (*Body parameter*), `string`, *Required* +- `"name"`: (*Body parameter*), `string`, *Required* The name of the file or folder to create. -- `"parent_id"`: (*Body parameter*), `string` +- `"parent_id"`: (*Body parameter*), `string` The parent folder ID. If not specified, the file/folder will be created in the root folder. -- `"type"`: (*Body parameter*), `string` +- `"type"`: (*Body parameter*), `string` The type of the file to create. 
Available options: - `"folder"`: Create a folder - `"virtual"`: Create a virtual file @@ -7219,18 +7289,18 @@ curl --request GET \ ##### Request parameters -- `parent_id`: (*Filter parameter*), `string` +- `parent_id`: (*Filter parameter*), `string` The folder ID to list files from. If not specified, the root folder is used by default. -- `keywords`: (*Filter parameter*), `string` +- `keywords`: (*Filter parameter*), `string` Search keyword to filter files by name. -- `page`: (*Filter parameter*), `integer` +- `page`: (*Filter parameter*), `integer` Specifies the page on which the files will be displayed. Defaults to `1`. -- `page_size`: (*Filter parameter*), `integer` +- `page_size`: (*Filter parameter*), `integer` The number of files on each page. Defaults to `15`. -- `orderby`: (*Filter parameter*), `string` +- `orderby`: (*Filter parameter*), `string` The field by which files should be sorted. Available options: - `create_time` (default) -- `desc`: (*Filter parameter*), `boolean` +- `desc`: (*Filter parameter*), `boolean` Indicates whether the retrieved files should be sorted in descending order. Defaults to `true`. #### Response @@ -7294,7 +7364,7 @@ curl --request GET \ ##### Request parameters -- `file_id`: (*Path parameter*), `string`, *Required* +- `file_id`: (*Path parameter*), `string`, *Required* The ID of the file whose immediate parent folder to retrieve. #### Response @@ -7347,7 +7417,7 @@ curl --request GET \ ##### Request parameters -- `file_id`: (*Path parameter*), `string`, *Required* +- `file_id`: (*Path parameter*), `string`, *Required* The ID of the file whose parent folders to retrieve. #### Response @@ -7413,7 +7483,7 @@ curl --request DELETE \ ##### Request parameters -- `"ids"`: (*Body parameter*), `list[string]`, *Required* +- `"ids"`: (*Body parameter*), `list[string]`, *Required* The IDs of the files or folders to delete. 
#### Response @@ -7462,7 +7532,7 @@ curl --request GET \ ##### Request parameters -- `file_id`: (*Path parameter*), `string`, *Required* +- `file_id`: (*Path parameter*), `string`, *Required* The ID of the file to download. #### Response @@ -7613,9 +7683,9 @@ curl --request POST \ ##### Request parameters -- `"file_ids"`: (*Body parameter*), `list[string]`, *Required* +- `"file_ids"`: (*Body parameter*), `list[string]`, *Required* The IDs of the files to convert. If a folder ID is provided, all files within that folder will be converted. -- `"kb_ids"`: (*Body parameter*), `list[string]`, *Required* +- `"kb_ids"`: (*Body parameter*), `list[string]`, *Required* The IDs of the target datasets. #### Response @@ -7988,11 +8058,11 @@ curl --request POST \ ##### Request parameters -- `search_id`: (*Path parameter*), `string`, *Required* +- `search_id`: (*Path parameter*), `string`, *Required* The ID of the search app. -- `"question"`: (*Body parameter*), `string`, *Required* +- `"question"`: (*Body parameter*), `string`, *Required* The user question. -- `"kb_ids"`: (*Body parameter*), `list[string]` +- `"kb_ids"`: (*Body parameter*), `list[string]` Optional fallback dataset IDs when the search app config does not define them. #### Response diff --git a/docs/references/python_api_reference.md b/docs/references/python_api_reference.md index 41336ba17..0604c2c96 100644 --- a/docs/references/python_api_reference.md +++ b/docs/references/python_api_reference.md @@ -855,7 +855,7 @@ print("Async bulk parsing cancelled.") ### Add chunk ```python -Document.add_chunk(content:str, important_keywords:list[str] = [], image_base64:str = None, *, tag_kwd:list[str] = []) -> Chunk +Document.add_chunk(content:str, important_keywords:list[str] = [], questions:list[str] = [], image_base64:str = None, *, tag_kwd:list[str] = []) -> Chunk ``` Adds a chunk to the current document. @@ -870,6 +870,10 @@ The text content of the chunk. The key terms or phrases to tag with the chunk. 
+##### questions: `list[str]` + +Optional questions to use when embedding the chunk. + ##### image_base64: `string` A base64-encoded image to associate with the chunk. If the chunk already has an image, the new image will be vertically concatenated below the existing one. @@ -889,6 +893,7 @@ A `Chunk` object contains the following attributes: - `content`: `string` The text content of the chunk. - `important_keywords`: `list[str]` A list of key terms or phrases tagged with the chunk. - `tag_kwd`: `list[str]` A list of tag keywords associated with the chunk. +- `questions`: `list[str]` A list of questions associated with the chunk. - `image_id`: `string` The image ID associated with the chunk (empty string if no image). - `create_time`: `string` The time when the chunk was created (added to the document). - `create_timestamp`: `float` The timestamp representing the creation time of the chunk, expressed in seconds since January 1, 1970. @@ -1023,16 +1028,19 @@ Updates content or configurations for the current chunk. #### Parameters -##### update_message: `dict[str, str|list[str]|int]` *Required* +##### update_message: `dict[str, str|list[str]|bool]` *Required* A dictionary representing the attributes to update, with the following keys: - `"content"`: `string` The text content of the chunk. - `"important_keywords"`: `list[str]` A list of key terms or phrases to tag with the chunk. +- `"questions"`: `list[str]` A list of questions associated with the chunk. - `"tag_kwd"`: `list[str]` A list of tag keywords to associate with the chunk. +- `"positions"`: `list` Updated source positions for the chunk. - `"available"`: `bool` The chunk's availability status in the dataset. Value options: - `False`: Unavailable - `True`: Available (default) +- `"image_base64"`: `string` Base64-encoded image content to associate with the chunk. 
#### Returns diff --git a/sdk/python/ragflow_sdk/modules/chunk.py b/sdk/python/ragflow_sdk/modules/chunk.py index 6ea9c1a8e..f6d1da09a 100644 --- a/sdk/python/ragflow_sdk/modules/chunk.py +++ b/sdk/python/ragflow_sdk/modules/chunk.py @@ -54,11 +54,11 @@ class Chunk(Base): def update(self, update_message: dict): - res = self.put(f"/datasets/{self.dataset_id}/documents/{self.document_id}/chunks/{self.id}", update_message) + res = self.patch(f"/datasets/{self.dataset_id}/documents/{self.document_id}/chunks/{self.id}", update_message) res = res.json() if res.get("code") != 0: raise ChunkUpdateError( code=res.get("code"), message=res.get("message"), details=res.get("details") - ) \ No newline at end of file + ) diff --git a/test/testcases/test_http_api/common.py b/test/testcases/test_http_api/common.py index 4f96843f7..9a84e9527 100644 --- a/test/testcases/test_http_api/common.py +++ b/test/testcases/test_http_api/common.py @@ -173,9 +173,15 @@ def list_chunks(auth, dataset_id, document_id, params=None): return res.json() +def get_chunk(auth, dataset_id, document_id, chunk_id): + url = f"{HOST_ADDRESS}{CHUNK_API_URL}/{chunk_id}".format(dataset_id=dataset_id, document_id=document_id) + res = requests.get(url=url, headers=HEADERS, auth=auth) + return res.json() + + def update_chunk(auth, dataset_id, document_id, chunk_id, payload=None): url = f"{HOST_ADDRESS}{CHUNK_API_URL}/{chunk_id}".format(dataset_id=dataset_id, document_id=document_id) - res = requests.put(url=url, headers=HEADERS, auth=auth, json=payload) + res = requests.patch(url=url, headers=HEADERS, auth=auth, json=payload) return res.json() diff --git a/test/testcases/test_http_api/test_chunk_management_within_dataset/conftest.py b/test/testcases/test_http_api/test_chunk_management_within_dataset/conftest.py index 48487ee9e..0a7990b3a 100644 --- a/test/testcases/test_http_api/test_chunk_management_within_dataset/conftest.py +++ b/test/testcases/test_http_api/test_chunk_management_within_dataset/conftest.py @@ 
-18,17 +18,20 @@ from time import sleep import pytest -from common import batch_add_chunks, delete_all_chunks, list_documents, parse_documents -from utils import wait_for +from common import add_chunk, batch_add_chunks, delete_all_chunks -@wait_for(30, 1, "Document parsing timeout") -def condition(_auth, _dataset_id): - res = list_documents(_auth, _dataset_id) - for doc in res["data"]["docs"]: - if doc["run"] != "DONE": - return False - return True +def _add_baseline_chunk(auth, dataset_id, document_id): + add_chunk(auth, dataset_id, document_id, {"content": "ragflow test upload"}) + + +@pytest.fixture(scope="class") +def add_chunks(HttpApiAuth, add_document): + dataset_id, document_id = add_document + _add_baseline_chunk(HttpApiAuth, dataset_id, document_id) + chunk_ids = batch_add_chunks(HttpApiAuth, dataset_id, document_id, 4) + sleep(1) # issues/6487 + return dataset_id, document_id, chunk_ids @pytest.fixture(scope="function") @@ -39,8 +42,7 @@ def add_chunks_func(request, HttpApiAuth, add_document): request.addfinalizer(cleanup) dataset_id, document_id = add_document - parse_documents(HttpApiAuth, dataset_id, {"document_ids": [document_id]}) - condition(HttpApiAuth, dataset_id) + _add_baseline_chunk(HttpApiAuth, dataset_id, document_id) chunk_ids = batch_add_chunks(HttpApiAuth, dataset_id, document_id, 4) # issues/6487 sleep(1) diff --git a/test/testcases/test_http_api/test_chunk_management_within_dataset/test_add_chunk.py b/test/testcases/test_http_api/test_chunk_management_within_dataset/test_add_chunk.py index d17540907..74e86f196 100644 --- a/test/testcases/test_http_api/test_chunk_management_within_dataset/test_add_chunk.py +++ b/test/testcases/test_http_api/test_chunk_management_within_dataset/test_add_chunk.py @@ -39,12 +39,8 @@ class TestAuthorization: @pytest.mark.parametrize( "invalid_auth, expected_code, expected_message", [ - (None, 0, "`Authorization` can't be empty"), - ( - RAGFlowHttpApiAuth(INVALID_API_TOKEN), - 109, - "Authentication error: 
API key is invalid!", - ), + (None, 401, ""), + (RAGFlowHttpApiAuth(INVALID_API_TOKEN), 401, ""), ], ) def test_invalid_auth(self, invalid_auth, expected_code, expected_message): diff --git a/test/testcases/test_http_api/test_chunk_management_within_dataset/test_delete_chunks.py b/test/testcases/test_http_api/test_chunk_management_within_dataset/test_delete_chunks.py index 119974365..a64549338 100644 --- a/test/testcases/test_http_api/test_chunk_management_within_dataset/test_delete_chunks.py +++ b/test/testcases/test_http_api/test_chunk_management_within_dataset/test_delete_chunks.py @@ -26,12 +26,8 @@ class TestAuthorization: @pytest.mark.parametrize( "invalid_auth, expected_code, expected_message", [ - (None, 0, "`Authorization` can't be empty"), - ( - RAGFlowHttpApiAuth(INVALID_API_TOKEN), - 109, - "Authentication error: API key is invalid!", - ), + (None, 401, ""), + (RAGFlowHttpApiAuth(INVALID_API_TOKEN), 401, ""), ], ) def test_invalid_auth(self, invalid_auth, expected_code, expected_message): @@ -58,7 +54,7 @@ class TestChunksDeletion: @pytest.mark.parametrize( "document_id, expected_code, expected_message", [ - (INVALID_ID_32, 100, f"""LookupError("Can't find the document with ID {INVALID_ID_32}!")"""), + (INVALID_ID_32, 102, f"You don't own the document {INVALID_ID_32}."), ], ) def test_invalid_document_id(self, HttpApiAuth, add_chunks_func, document_id, expected_code, expected_message): diff --git a/test/testcases/test_http_api/test_chunk_management_within_dataset/test_list_chunks.py b/test/testcases/test_http_api/test_chunk_management_within_dataset/test_list_chunks.py index 4605f1221..198d83666 100644 --- a/test/testcases/test_http_api/test_chunk_management_within_dataset/test_list_chunks.py +++ b/test/testcases/test_http_api/test_chunk_management_within_dataset/test_list_chunks.py @@ -17,7 +17,7 @@ import os from concurrent.futures import ThreadPoolExecutor, as_completed import pytest -from common import batch_add_chunks, list_chunks +from common 
import batch_add_chunks, get_chunk, list_chunks from configs import INVALID_API_TOKEN, INVALID_ID_32 from libs.auth import RAGFlowHttpApiAuth @@ -27,12 +27,8 @@ class TestAuthorization: @pytest.mark.parametrize( "invalid_auth, expected_code, expected_message", [ - (None, 0, "`Authorization` can't be empty"), - ( - RAGFlowHttpApiAuth(INVALID_API_TOKEN), - 109, - "Authentication error: API key is invalid!", - ), + (None, 401, ""), + (RAGFlowHttpApiAuth(INVALID_API_TOKEN), 401, ""), ], ) def test_invalid_auth(self, invalid_auth, expected_code, expected_message): @@ -139,6 +135,15 @@ class TestChunksList: else: assert res["message"] == expected_message + @pytest.mark.p1 + @pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="issues/6499") + def test_get_chunk(self, HttpApiAuth, add_chunks): + dataset_id, document_id, chunk_ids = add_chunks + res = get_chunk(HttpApiAuth, dataset_id, document_id, chunk_ids[0]) + assert res["code"] == 0 + assert res["data"]["id"] == chunk_ids[0] + assert res["data"]["doc_id"] == document_id + @pytest.mark.p3 def test_invalid_params(self, HttpApiAuth, add_chunks): dataset_id, document_id, _ = add_chunks diff --git a/test/testcases/test_http_api/test_chunk_management_within_dataset/test_update_chunk.py b/test/testcases/test_http_api/test_chunk_management_within_dataset/test_update_chunk.py index cb5420f30..ff862b205 100644 --- a/test/testcases/test_http_api/test_chunk_management_within_dataset/test_update_chunk.py +++ b/test/testcases/test_http_api/test_chunk_management_within_dataset/test_update_chunk.py @@ -28,12 +28,8 @@ class TestAuthorization: @pytest.mark.parametrize( "invalid_auth, expected_code, expected_message", [ - (None, 0, "`Authorization` can't be empty"), - ( - RAGFlowHttpApiAuth(INVALID_API_TOKEN), - 109, - "Authentication error: API key is invalid!", - ), + (None, 401, ""), + (RAGFlowHttpApiAuth(INVALID_API_TOKEN), 401, ""), ], ) def test_invalid_auth(self, invalid_auth, expected_code, expected_message): diff 
--git a/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py b/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py index 510e2c391..0d3ee68d1 100644 --- a/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py +++ b/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py @@ -14,6 +14,7 @@ # limitations under the License. # import asyncio +import inspect import importlib.util import sys from pathlib import Path @@ -309,6 +310,19 @@ def _load_doc_module(monkeypatch): return module +def _load_restful_chunk_module(monkeypatch): + repo_root = Path(__file__).resolve().parents[4] + helper_path = repo_root / "test" / "testcases" / "test_web_api" / "test_chunk_app" / "test_chunk_routes_unit.py" + spec = importlib.util.spec_from_file_location("test_restful_chunk_route_helpers", helper_path) + helper = importlib.util.module_from_spec(spec) + spec.loader.exec_module(helper) + return helper._load_chunk_api_module(monkeypatch) + + +def _route_core(func): + return inspect.unwrap(func) + + def _patch_send_file(monkeypatch, module): async def _fake_send_file(file_obj, **kwargs): return {"file": file_obj, "filename": kwargs.get("attachment_filename")} @@ -336,7 +350,7 @@ def _patch_docstore(monkeypatch, module, **kwargs): @pytest.mark.p2 class TestDocRoutesUnit: def test_chunk_positions_validation_error(self, monkeypatch): - module = _load_doc_module(monkeypatch) + module = _load_restful_chunk_module(monkeypatch) with pytest.raises(ValueError) as exc_info: module.Chunk(positions=[[1, 2, 3, 4]]) assert "length of 5" in str(exc_info.value) @@ -484,25 +498,44 @@ class TestDocRoutesUnit: assert res["code"] == 0 def test_list_chunks_branches(self, monkeypatch): - module = _load_doc_module(monkeypatch) + module = _load_restful_chunk_module(monkeypatch) monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: 
False) - res = _run(module.list_chunks.__wrapped__("tenant-1", "ds-1", "doc-1")) + res = _run(_route_core(module.list_chunks)("tenant-1", "ds-1", "doc-1")) assert "don't own the dataset" in res["message"] monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: True) monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: []) - res = _run(module.list_chunks.__wrapped__("tenant-1", "ds-1", "doc-1")) + res = _run(_route_core(module.list_chunks)("tenant-1", "ds-1", "doc-1")) assert "don't own the document" in res["message"] monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [_DummyDoc()]) + monkeypatch.setattr(module, "request", SimpleNamespace(args=_DummyArgs({}))) + _patch_docstore(monkeypatch, module, index_exist=lambda *_args, **_kwargs: False) + res = _run(_route_core(module.list_chunks)("tenant-1", "ds-1", "doc-1")) + assert res["code"] == 0 + assert res["data"]["total"] == 0 + assert res["data"]["chunks"] == [] + monkeypatch.setattr(module, "request", SimpleNamespace(args=_DummyArgs({"id": "chunk-1"}))) _patch_docstore(monkeypatch, module, get=lambda *_args, **_kwargs: None) - res = _run(module.list_chunks.__wrapped__("tenant-1", "ds-1", "doc-1")) + res = _run(_route_core(module.list_chunks)("tenant-1", "ds-1", "doc-1")) + assert res["code"] == module.RetCode.DATA_ERROR assert "Chunk not found" in res["message"] - _patch_docstore(monkeypatch, module, get=lambda *_args, **_kwargs: {"id_vec": [1], "content_with_weight_vec": [2]}) - res = _run(module.list_chunks.__wrapped__("tenant-1", "ds-1", "doc-1")) - assert "Chunk `chunk-1` not found." 
in res["message"] + _patch_docstore( + monkeypatch, + module, + get=lambda *_args, **_kwargs: { + "chunk_id": "chunk-1", + "content_with_weight": "x", + "doc_id": "other-doc", + "docnm_kwd": "doc", + "position_int": [[1, 2, 3, 4, 5]], + }, + ) + res = _run(_route_core(module.list_chunks)("tenant-1", "ds-1", "doc-1")) + assert res["code"] == module.RetCode.DATA_ERROR + assert "Chunk not found" in res["message"] _patch_docstore( monkeypatch, @@ -515,29 +548,29 @@ class TestDocRoutesUnit: "position_int": [[1, 2, 3, 4, 5]], }, ) - res = _run(module.list_chunks.__wrapped__("tenant-1", "ds-1", "doc-1")) + res = _run(_route_core(module.list_chunks)("tenant-1", "ds-1", "doc-1")) assert res["code"] == 0 assert res["data"]["total"] == 1 assert res["data"]["chunks"][0]["id"] == "chunk-1" def test_add_chunk_access_guard(self, monkeypatch): - module = _load_doc_module(monkeypatch) + module = _load_restful_chunk_module(monkeypatch) monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: False) - res = _run(module.add_chunk.__wrapped__("tenant-1", "ds-1", "doc-1")) + res = _run(_route_core(module.add_chunk)("tenant-1", "ds-1", "doc-1")) assert "don't own the dataset" in res["message"] def test_rm_chunk_branches(self, monkeypatch): - module = _load_doc_module(monkeypatch) + module = _load_restful_chunk_module(monkeypatch) monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: False) - res = _run(module.rm_chunk.__wrapped__("tenant-1", "ds-1", "doc-1")) + res = _run(_route_core(module.rm_chunk)("tenant-1", "ds-1", "doc-1")) assert "don't own the dataset" in res["message"] monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: True) - monkeypatch.setattr(module.DocumentService, "get_by_ids", lambda _ids: []) - with pytest.raises(LookupError): - _run(module.rm_chunk.__wrapped__("tenant-1", "ds-1", "doc-1")) + monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: []) + res = 
_run(_route_core(module.rm_chunk)("tenant-1", "ds-1", "doc-1")) + assert "don't own the document" in res["message"] - monkeypatch.setattr(module.DocumentService, "get_by_ids", lambda _ids: [_DummyDoc()]) + monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [_DummyDoc()]) monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({})) _patch_docstore( monkeypatch, @@ -545,32 +578,37 @@ class TestDocRoutesUnit: delete=lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("delete must not run for empty chunk ids")), ) monkeypatch.setattr(module.DocumentService, "decrement_chunk_num", lambda *_args, **_kwargs: None) - res = _run(module.rm_chunk.__wrapped__("tenant-1", "ds-1", "doc-1")) + res = _run(_route_core(module.rm_chunk)("tenant-1", "ds-1", "doc-1")) assert res["code"] == 0 monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"chunk_ids": ["c1", "c1"]})) monkeypatch.setattr(module, "check_duplicate_ids", lambda _ids, _kind: (["c1"], ["Duplicate chunk ids: c1"])) _patch_docstore(monkeypatch, module, delete=lambda *_args, **_kwargs: 1) - res = _run(module.rm_chunk.__wrapped__("tenant-1", "ds-1", "doc-1")) + res = _run(_route_core(module.rm_chunk)("tenant-1", "ds-1", "doc-1")) assert res["code"] == 0 assert res["data"]["errors"] == ["Duplicate chunk ids: c1"] def test_update_chunk_branches(self, monkeypatch): - module = _load_doc_module(monkeypatch) - _patch_docstore(monkeypatch, module, get=lambda *_args, **_kwargs: None) - res = _run(module.update_chunk.__wrapped__("tenant-1", "ds-1", "doc-1", "chunk-1")) - assert "Can't find this chunk" in res["message"] - - _patch_docstore(monkeypatch, module, get=lambda *_args, **_kwargs: {"content_with_weight": "q\na"}) + module = _load_restful_chunk_module(monkeypatch) monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: False) - res = _run(module.update_chunk.__wrapped__("tenant-1", "ds-1", "doc-1", "chunk-1")) + 
_patch_docstore(monkeypatch, module, get=lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("chunk lookup must not run before access check"))) + res = _run(_route_core(module.update_chunk)("tenant-1", "ds-1", "doc-1", "chunk-1")) assert "don't own the dataset" in res["message"] monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: True) monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: []) - res = _run(module.update_chunk.__wrapped__("tenant-1", "ds-1", "doc-1", "chunk-1")) + res = _run(_route_core(module.update_chunk)("tenant-1", "ds-1", "doc-1", "chunk-1")) assert "don't own the document" in res["message"] + monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [_DummyDoc()]) + _patch_docstore(monkeypatch, module, get=lambda *_args, **_kwargs: None) + res = _run(_route_core(module.update_chunk)("tenant-1", "ds-1", "doc-1", "chunk-1")) + assert "Can't find this chunk" in res["message"] + + _patch_docstore(monkeypatch, module, get=lambda *_args, **_kwargs: {"doc_id": "other-doc", "content_with_weight": "q\na"}) + res = _run(_route_core(module.update_chunk)("tenant-1", "ds-1", "doc-1", "chunk-1")) + assert "Can't find this chunk" in res["message"] + doc = _DummyDoc(parser_id="naive") monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [doc]) monkeypatch.setattr(module.rag_tokenizer, "tokenize", lambda text: text or "") @@ -584,25 +622,25 @@ class TestDocRoutesUnit: return [np.array([0.2, 0.8]), np.array([0.3, 0.7])], 1 monkeypatch.setattr(module.TenantLLMService, "model_instance", lambda *_args, **_kwargs: _EmbedModel()) + _patch_docstore(monkeypatch, module, get=lambda *_args, **_kwargs: {"doc_id": "doc-1", "content_with_weight": "x"}, update=lambda *_args, **_kwargs: None) monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"positions": "bad"})) - res = _run(module.update_chunk.__wrapped__("tenant-1", "ds-1", "doc-1", "chunk-1")) + res = 
_run(_route_core(module.update_chunk)("tenant-1", "ds-1", "doc-1", "chunk-1")) assert "`positions` should be a list" in res["message"] - _patch_docstore(monkeypatch, module, get=lambda *_args, **_kwargs: {"content_with_weight": "x"}, update=lambda *_args, **_kwargs: None) monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"positions": [[1, 2, 3, 4, 5]]})) - res = _run(module.update_chunk.__wrapped__("tenant-1", "ds-1", "doc-1", "chunk-1")) + res = _run(_route_core(module.update_chunk)("tenant-1", "ds-1", "doc-1", "chunk-1")) assert res["code"] == 0 qa_doc = _DummyDoc(parser_id=module.ParserType.QA) monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [qa_doc]) monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"content": "no-separator"})) - res = _run(module.update_chunk.__wrapped__("tenant-1", "ds-1", "doc-1", "chunk-1")) + res = _run(_route_core(module.update_chunk)("tenant-1", "ds-1", "doc-1", "chunk-1")) assert "Q&A must be separated" in res["message"] monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"content": "Q?\nA!"})) - _patch_docstore(monkeypatch, module, get=lambda *_args, **_kwargs: {"content_with_weight": "Q?\nA!"}, update=lambda *_args, **_kwargs: None) + _patch_docstore(monkeypatch, module, get=lambda *_args, **_kwargs: {"doc_id": "doc-1", "content_with_weight": "Q?\nA!"}, update=lambda *_args, **_kwargs: None) monkeypatch.setattr(module, "beAdoc", lambda d, *_args, **_kwargs: d) - res = _run(module.update_chunk.__wrapped__("tenant-1", "ds-1", "doc-1", "chunk-1")) + res = _run(_route_core(module.update_chunk)("tenant-1", "ds-1", "doc-1", "chunk-1")) assert res["code"] == 0 def test_retrieval_validation_matrix(self, monkeypatch): diff --git a/test/testcases/test_web_api/conftest.py b/test/testcases/test_web_api/conftest.py index df57be3aa..1854103e3 100644 --- a/test/testcases/test_web_api/conftest.py +++ b/test/testcases/test_web_api/conftest.py @@ -157,17 +157,17 
@@ def add_document(request, WebApiAuth, add_dataset, ragflow_tmp_dir): @pytest.fixture(scope="class") def add_chunks(request, WebApiAuth, add_document): def cleanup(): - res = list_chunks(WebApiAuth, {"doc_id": document_id}) + res = list_chunks(WebApiAuth, dataset_id, document_id) if res["code"] == 0: - chunk_ids = [chunk["chunk_id"] for chunk in res["data"]["chunks"]] - delete_chunks(WebApiAuth, {"doc_id": document_id, "chunk_ids": chunk_ids}) + chunk_ids = [chunk["id"] for chunk in res["data"]["chunks"]] + delete_chunks(WebApiAuth, dataset_id, document_id, {"chunk_ids": chunk_ids}) request.addfinalizer(cleanup) - kb_id, document_id = add_document + dataset_id, document_id = add_document parse_documents(WebApiAuth, {"doc_ids": [document_id], "run": "1"}) - condition(WebApiAuth, kb_id) - chunk_ids = batch_add_chunks(WebApiAuth, document_id, 4) + condition(WebApiAuth, dataset_id) + chunk_ids = batch_add_chunks(WebApiAuth, dataset_id, document_id, 4) # issues/6487 sleep(1) - return kb_id, document_id, chunk_ids + return dataset_id, document_id, chunk_ids diff --git a/test/testcases/test_web_api/test_chunk_app/conftest.py b/test/testcases/test_web_api/test_chunk_app/conftest.py index 0b413c75f..ebbe74f02 100644 --- a/test/testcases/test_web_api/test_chunk_app/conftest.py +++ b/test/testcases/test_web_api/test_chunk_app/conftest.py @@ -34,16 +34,16 @@ def condition(_auth, _kb_id): @pytest.fixture(scope="function") def add_chunks_func(request, WebApiAuth, add_document): def cleanup(): - res = list_chunks(WebApiAuth, {"doc_id": document_id}) - chunk_ids = [chunk["chunk_id"] for chunk in res["data"]["chunks"]] - delete_chunks(WebApiAuth, {"doc_id": document_id, "chunk_ids": chunk_ids}) + res = list_chunks(WebApiAuth, dataset_id, document_id) + chunk_ids = [chunk["id"] for chunk in res["data"]["chunks"]] + delete_chunks(WebApiAuth, dataset_id, document_id, {"chunk_ids": chunk_ids}) request.addfinalizer(cleanup) - kb_id, document_id = add_document + dataset_id, document_id 
= add_document parse_documents(WebApiAuth, {"doc_ids": [document_id], "run": "1"}) - condition(WebApiAuth, kb_id) - chunk_ids = batch_add_chunks(WebApiAuth, document_id, 4) + condition(WebApiAuth, dataset_id) + chunk_ids = batch_add_chunks(WebApiAuth, dataset_id, document_id, 4) # issues/6487 sleep(1) - return kb_id, document_id, chunk_ids + return dataset_id, document_id, chunk_ids diff --git a/test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py b/test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py index 3f5ab6b11..3a88b7c40 100644 --- a/test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py +++ b/test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py @@ -15,7 +15,7 @@ # import asyncio -import base64 +import inspect import importlib.util import json import sys @@ -73,6 +73,7 @@ class _DummyRetCode: DATA_ERROR = 102 EXCEPTION_ERROR = 100 OPERATING_ERROR = 103 + NOT_FOUND = 404 class _DummyParserType: @@ -81,7 +82,7 @@ class _DummyParserType: class _DummyRetriever: - async def search(self, query, _index_name, _kb_ids, highlight=None): + async def search(self, query, _index_name, _kb_ids, *args, highlight=None, **kwargs): class _SRes: total = 1 ids = ["chunk-1"] @@ -138,6 +139,9 @@ class _DummyDocStore: def insert(self, docs, *_args, **_kwargs): self.inserted.extend(docs) + def index_exist(self, *_args, **_kwargs): + return True + class _DummyStorage: def __init__(self): @@ -179,6 +183,10 @@ def _run(coro): return asyncio.run(coro) +def _route_core(func): + return inspect.unwrap(func) + + def _load_chunk_module(monkeypatch): repo_root = Path(__file__).resolve().parents[4] @@ -279,15 +287,33 @@ def _load_chunk_module(monkeypatch): api_utils_mod = ModuleType("api.utils.api_utils") api_utils_mod.get_json_result = lambda data=None, message="", code=0: {"code": code, "message": message, "data": data} api_utils_mod.get_data_error_result = lambda message="": {"code": _DummyRetCode.DATA_ERROR, "message": message, 
"data": False} + api_utils_mod.get_result = lambda data=None, message="", code=0: {"code": code, "message": message, "data": data} + api_utils_mod.get_error_data_result = lambda message="": {"code": _DummyRetCode.DATA_ERROR, "message": message, "data": False} api_utils_mod.server_error_response = lambda exc: {"code": _DummyRetCode.EXCEPTION_ERROR, "message": repr(exc), "data": False} api_utils_mod.validate_request = lambda *_args, **_kwargs: (lambda fn: fn) + api_utils_mod.add_tenant_id_to_kwargs = lambda func: func + api_utils_mod.check_duplicate_ids = lambda ids, _kind: (list(dict.fromkeys(ids)), [] if len(ids) == len(set(ids)) else [f"Duplicate {_kind} ids"]) api_utils_mod.get_request_json = lambda: _AwaitableValue({}) monkeypatch.setitem(sys.modules, "api.utils.api_utils", api_utils_mod) + image_utils_mod = ModuleType("api.utils.image_utils") + image_utils_mod.store_chunk_image = lambda *_args, **_kwargs: None + monkeypatch.setitem(sys.modules, "api.utils.image_utils", image_utils_mod) + services_pkg = ModuleType("api.db.services") services_pkg.__path__ = [] monkeypatch.setitem(sys.modules, "api.db.services", services_pkg) + joint_services_pkg = ModuleType("api.db.joint_services") + joint_services_pkg.__path__ = [] + monkeypatch.setitem(sys.modules, "api.db.joint_services", joint_services_pkg) + + tenant_model_service_mod = ModuleType("api.db.joint_services.tenant_model_service") + tenant_model_service_mod.get_model_config_by_id = lambda *_args, **_kwargs: {"llm_name": "embed", "model_type": "embedding"} + tenant_model_service_mod.get_model_config_by_type_and_name = lambda *_args, **_kwargs: {"llm_name": "embed", "model_type": "embedding"} + tenant_model_service_mod.get_tenant_default_model_by_type = lambda *_args, **_kwargs: {"llm_name": "chat", "model_type": "chat"} + monkeypatch.setitem(sys.modules, "api.db.joint_services.tenant_model_service", tenant_model_service_mod) + document_service_mod = ModuleType("api.db.services.document_service") class 
_DocumentService: @@ -302,6 +328,18 @@ def _load_chunk_module(monkeypatch): def get_by_id(doc_id): return True, _DummyDoc(doc_id=doc_id, parser_id=_DummyParserType.NAIVE) + @staticmethod + def query(**kwargs): + return [_DummyDoc(doc_id=kwargs.get("id", "doc-1"), kb_id=kwargs.get("kb_id", "kb-1"))] + + @staticmethod + def get_by_ids(ids): + return [_DummyDoc(doc_id=ids[0] if ids else "doc-1")] + + @staticmethod + def delete_chunk_images(*_args, **_kwargs): + return None + @staticmethod def get_embd_id(_doc_id): return "embed-1" @@ -334,6 +372,10 @@ def _load_chunk_module(monkeypatch): def get_kb_ids(_tenant_id): return ["kb-1"] + @staticmethod + def accessible(**_kwargs): + return True + @staticmethod def get_by_id(_kb_id): return True, SimpleNamespace(pagerank=0.6, tenant_embd_id=2, tenant_llm_id=1) @@ -415,6 +457,10 @@ def _load_chunk_module(monkeypatch): def increase_usage_by_id(model_id, used_tokens): return True + @staticmethod + def model_instance(_model_config): + return _DummyLLMBundle() + class _TenantService: @staticmethod def get_by_id(tenant_id): @@ -455,6 +501,19 @@ def _load_chunk_module(monkeypatch): return module +def _load_chunk_api_module(monkeypatch): + _load_chunk_module(monkeypatch) + repo_root = Path(__file__).resolve().parents[4] + module_name = "test_chunk_api_routes_unit_module" + module_path = repo_root / "api" / "apps" / "restful_apis" / "chunk_api.py" + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + module.manager = _DummyManager() + monkeypatch.setitem(sys.modules, module_name, module) + spec.loader.exec_module(module) + return module + + def _set_request_json(monkeypatch, module, payload): monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue(payload)) @@ -465,347 +524,133 @@ def set_tenant_info(): @pytest.mark.p2 -def test_list_chunk_exception_branches_unit(monkeypatch): - module = _load_chunk_module(monkeypatch) +def 
test_restful_chunk_list_get_and_delete_unit(monkeypatch): + module = _load_chunk_api_module(monkeypatch) + module.request = SimpleNamespace(args={"keywords": "chunk", "available": "true"}, headers={}) - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "keywords": "chunk", "available_int": 0}) - res = _run(module.list_chunk()) + res = _run(_route_core(module.list_chunks)("tenant-1", "kb-1", "doc-1")) assert res["code"] == 0, res assert res["data"]["total"] == 1, res - assert res["data"]["chunks"][0]["available_int"] == 1, res + assert res["data"]["chunks"][0]["id"] == "chunk-1", res + assert res["data"]["chunks"][0]["available"] is True, res - monkeypatch.setattr(module.DocumentService, "get_tenant_id", lambda _doc_id: "") - _set_request_json(monkeypatch, module, {"doc_id": "doc-1"}) - res = _run(module.list_chunk()) - assert res["code"] == module.RetCode.DATA_ERROR, res - assert res["message"] == "Tenant not found!", res - - monkeypatch.setattr(module.DocumentService, "get_tenant_id", lambda _doc_id: "tenant-1") - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (False, None)) - _set_request_json(monkeypatch, module, {"doc_id": "doc-1"}) - res = _run(module.list_chunk()) - assert res["message"] == "Document not found!", res - - async def _raise_not_found(*_args, **_kwargs): - raise Exception("x not_found y") - - monkeypatch.setattr(module.settings.retriever, "search", _raise_not_found) - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (True, _DummyDoc())) - _set_request_json(monkeypatch, module, {"doc_id": "doc-1"}) - res = _run(module.list_chunk()) - assert res["code"] == module.RetCode.DATA_ERROR, res - assert res["message"] == "No chunk found!", res - - async def _raise_generic(*_args, **_kwargs): - raise RuntimeError("boom") - - monkeypatch.setattr(module.settings.retriever, "search", _raise_generic) - _set_request_json(monkeypatch, module, {"doc_id": "doc-1"}) - res = _run(module.list_chunk()) - assert 
res["code"] == module.RetCode.EXCEPTION_ERROR, res - assert "boom" in res["message"], res - - -@pytest.mark.p2 -def test_get_chunk_sanitize_and_exception_matrix_unit(monkeypatch): - module = _load_chunk_module(monkeypatch) - module.request = SimpleNamespace(args={"chunk_id": "chunk-1"}, headers={}) - - res = module.get() + res = _run(_route_core(module.get_chunk)("tenant-1", "kb-1", "doc-1", "chunk-1")) assert res["code"] == 0, res assert "q_2_vec" not in res["data"], res assert "content_tks" not in res["data"], res assert "content_ltks" not in res["data"], res assert "content_sm_ltks" not in res["data"], res - monkeypatch.setattr(module.UserTenantService, "query", lambda **_kwargs: []) - res = module.get() - assert res["message"] == "Tenant not found!", res - - monkeypatch.setattr(module.UserTenantService, "query", lambda **_kwargs: [_DummyTenant("tenant-1")]) - module.settings.docStoreConn.chunk = None - res = module.get() - assert res["code"] == module.RetCode.EXCEPTION_ERROR, res - assert "Chunk not found" in res["message"], res - - def _raise_not_found(*_args, **_kwargs): - raise Exception("NotFoundError: chunk-1") - - monkeypatch.setattr(module.settings.docStoreConn, "get", _raise_not_found) - res = module.get() - assert res["code"] == module.RetCode.DATA_ERROR, res - assert res["message"] == "Chunk not found!", res - - def _raise_generic(*_args, **_kwargs): - raise RuntimeError("get boom") - - monkeypatch.setattr(module.settings.docStoreConn, "get", _raise_generic) - res = module.get() - assert res["code"] == module.RetCode.EXCEPTION_ERROR, res - assert "get boom" in res["message"], res + monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"chunk_ids": ["chunk-1"]})) + res = _run(_route_core(module.rm_chunk)("tenant-1", "kb-1", "doc-1")) + assert res["code"] == 0, res + assert module.settings.docStoreConn.deleted_inputs[-1]["doc_id"] == "doc-1" @pytest.mark.p2 -def test_set_chunk_bytes_qa_image_and_guard_matrix_unit(monkeypatch): - 
module = _load_chunk_module(monkeypatch) - - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "chunk_id": "chunk-1", "content_with_weight": 1}) - with pytest.raises(TypeError, match="expected string or bytes-like object"): - _run(module.set()) - - _set_request_json( - monkeypatch, - module, - {"doc_id": "doc-1", "chunk_id": "chunk-1", "content_with_weight": "abc", "important_kwd": "bad"}, - ) - res = _run(module.set()) - assert res["message"] == "`important_kwd` should be a list", res - - _set_request_json( - monkeypatch, - module, - {"doc_id": "doc-1", "chunk_id": "chunk-1", "content_with_weight": "abc", "question_kwd": "bad"}, - ) - res = _run(module.set()) - assert res["message"] == "`question_kwd` should be a list", res - - monkeypatch.setattr(module.DocumentService, "get_tenant_id", lambda _doc_id: "") - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "chunk_id": "chunk-1", "content_with_weight": "abc"}) - res = _run(module.set()) - assert res["message"] == "Tenant not found!", res - - monkeypatch.setattr(module.DocumentService, "get_tenant_id", lambda _doc_id: "tenant-1") - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (False, None)) - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "chunk_id": "chunk-1", "content_with_weight": "abc"}) - res = _run(module.set()) - assert res["message"] == "Document not found!", res +def test_restful_chunk_add_update_and_switch_unit(monkeypatch): + module = _load_chunk_api_module(monkeypatch) + module.request = SimpleNamespace(args={}, headers={}) monkeypatch.setattr( - module.DocumentService, - "get_by_id", - lambda _doc_id: (True, _DummyDoc(doc_id="doc-1", parser_id=module.ParserType.NAIVE)), - ) - _set_request_json( - monkeypatch, module, - {"doc_id": "doc-1", "chunk_id": "chunk-1", "content_with_weight": "abc", "tag_feas": [0.1]}, + "get_request_json", + lambda: _AwaitableValue( + { + "content": "chunk", + "important_keywords": ["i1"], + "questions": ["q1"], + 
"tag_kwd": ["tag"], + "tag_feas": {"tag": 0.2}, + } + ), ) - res = _run(module.set()) - assert "`tag_feas` must be an object mapping string tags to finite numeric scores" in res["message"], res - - _set_request_json( - monkeypatch, - module, - { - "doc_id": "doc-1", - "chunk_id": "chunk-1", - "content_with_weight": b"bytes-content", - "important_kwd": ["important"], - "question_kwd": ["question"], - "tag_kwd": ["tag"], - "tag_feas": {"tag": 0.1}, - "available_int": 0, - }, - ) - res = _run(module.set()) + res = _run(_route_core(module.add_chunk)("tenant-1", "kb-1", "doc-1")) assert res["code"] == 0, res - assert module.settings.docStoreConn.updated[-1][1]["content_with_weight"] == "bytes-content" + assert res["data"]["chunk"]["content"] == "chunk", res + assert module.settings.docStoreConn.inserted, "insert should be called" + assert module.DocumentService.increment_calls, "increment_chunk_num should be called" monkeypatch.setattr( - module.DocumentService, - "get_by_id", - lambda _doc_id: (True, _DummyDoc(doc_id="doc-1", parser_id=module.ParserType.QA)), - ) - _set_request_json( - monkeypatch, module, - { - "doc_id": "doc-1", - "chunk_id": "chunk-2", - "content_with_weight": "Q:Question\nA:Answer", - "image_base64": base64.b64encode(b"image").decode("utf-8"), - "img_id": "bucket-name", - }, + "get_request_json", + lambda: _AwaitableValue( + { + "content": "updated chunk", + "important_keywords": ["i2"], + "questions": ["q2"], + "tag_kwd": ["tag2"], + "positions": [[1, 2, 3, 4, 5]], + "available": False, + } + ), ) - res = _run(module.set()) + res = _run(_route_core(module.update_chunk)("tenant-1", "kb-1", "doc-1", "chunk-1")) assert res["code"] == 0, res - assert module.settings.STORAGE_IMPL.put_calls, "image storage branch should be called" + updated = module.settings.docStoreConn.updated[-1][1] + assert updated["content_with_weight"] == "updated chunk" + assert updated["available_int"] == 0 + assert updated["position_int"] == [[1, 2, 3, 4, 5]] - async def 
_raise_thread_pool(_func): - raise RuntimeError("set tp boom") - - monkeypatch.setattr(module, "thread_pool_exec", _raise_thread_pool) - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "chunk_id": "chunk-1", "content_with_weight": "abc"}) - res = _run(module.set()) - assert res["code"] == module.RetCode.EXCEPTION_ERROR, res - assert "set tp boom" in res["message"], res - - -@pytest.mark.p2 -def test_switch_chunk_success_failure_and_exception_unit(monkeypatch): - module = _load_chunk_module(monkeypatch) - - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (False, None)) - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "chunk_ids": ["c1"], "available_int": 1}) - res = _run(module.switch()) - assert res["message"] == "Document not found!", res - - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (True, _DummyDoc())) - monkeypatch.setattr(module.DocumentService, "get_tenant_id", lambda _doc_id: "tenant-1") - monkeypatch.setattr(module.settings.docStoreConn, "update", lambda *_args, **_kwargs: False) - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "chunk_ids": ["c1", "c2"], "available_int": 0}) - res = _run(module.switch()) - assert res["message"] == "Index updating failure", res - - monkeypatch.setattr(module.settings.docStoreConn, "update", lambda *_args, **_kwargs: True) - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "chunk_ids": ["c1", "c2"], "available_int": 1}) - res = _run(module.switch()) + monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"chunk_ids": ["chunk-1"], "available": True})) + res = _run(_route_core(module.switch_chunks)("tenant-1", "kb-1", "doc-1")) assert res["code"] == 0, res assert res["data"] is True, res - async def _raise_thread_pool(_func): - raise RuntimeError("switch tp boom") - - monkeypatch.setattr(module, "thread_pool_exec", _raise_thread_pool) - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "chunk_ids": 
["c1"], "available_int": 1}) - res = _run(module.switch()) - assert res["code"] == module.RetCode.EXCEPTION_ERROR, res - assert "switch tp boom" in res["message"], res - @pytest.mark.p2 -def test_rm_chunk_delete_exception_partial_compensation_and_cleanup_unit(monkeypatch): - module = _load_chunk_module(monkeypatch) +def test_restful_chunk_guard_branches_unit(monkeypatch): + module = _load_chunk_api_module(monkeypatch) + module.request = SimpleNamespace(args={}, headers={}) - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (False, None)) - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "chunk_ids": ["c1"]}) - res = _run(module.rm()) - assert res["message"] == "Document not found!", res + monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: False) + res = _run(_route_core(module.list_chunks)("tenant-1", "kb-1", "doc-1")) + assert res["message"] == "You don't own the dataset kb-1.", res - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "chunk_ids": []}) - monkeypatch.setattr( - module.DocumentService, - "get_by_id", - lambda _doc_id: (_ for _ in ()).throw(AssertionError("get_by_id must not run for empty delete payload")), - ) - monkeypatch.setattr( - module.settings.docStoreConn, - "delete", - lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("delete must not run for empty delete payload")), - ) - res = _run(module.rm()) - assert res["code"] == 0, res + monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: True) + monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: []) + res = _run(_route_core(module.list_chunks)("tenant-1", "kb-1", "doc-1")) + assert res["message"] == "You don't own the document doc-1.", res - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (True, _DummyDoc())) + monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [_DummyDoc()]) + module.request = SimpleNamespace(args={"id": 
"chunk-1"}, headers={}) + module.settings.docStoreConn.chunk = None + res = _run(_route_core(module.list_chunks)("tenant-1", "kb-1", "doc-1")) + assert res["code"] == module.RetCode.DATA_ERROR, res + assert "Chunk not found" in res["message"], res - def _raise_delete(*_args, **_kwargs): - raise RuntimeError("delete boom") + module.settings.docStoreConn.chunk = { + "id": "chunk-1", + "doc_id": "other-doc", + "content_with_weight": "chunk", + "docnm_kwd": "Doc", + } + res = _run(_route_core(module.list_chunks)("tenant-1", "kb-1", "doc-1")) + assert res["code"] == module.RetCode.DATA_ERROR, res + assert "Chunk not found" in res["message"], res - monkeypatch.setattr(module.settings.docStoreConn, "delete", _raise_delete) - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "chunk_ids": ["c1"]}) - res = _run(module.rm()) - assert res["message"] == "Chunk deleting failure", res + module.settings.docStoreConn.chunk = None + module.request = SimpleNamespace(args={}, headers={}) + res = _run(_route_core(module.get_chunk)("tenant-1", "kb-1", "doc-1", "chunk-1")) + assert res["code"] == module.RetCode.DATA_ERROR, res + assert "Chunk not found" in res["message"], res - def _delete(condition, *_args, **_kwargs): - module.settings.docStoreConn.deleted_inputs.append(condition) - if not module.settings.docStoreConn.to_delete: - return 0 - return module.settings.docStoreConn.to_delete.pop(0) + monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"content": ""})) + res = _run(_route_core(module.add_chunk)("tenant-1", "kb-1", "doc-1")) + assert res["message"] == "`content` is required", res - module.settings.docStoreConn.to_delete = [0] - monkeypatch.setattr(module.settings.docStoreConn, "delete", _delete) - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "chunk_ids": ["c1"]}) - res = _run(module.rm()) - assert res["message"] == "Index updating failure", res + module.settings.docStoreConn.chunk = {"id": "chunk-1", "doc_id": "doc-1", 
"content_with_weight": "chunk"} + monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"important_keywords": "bad"})) + res = _run(_route_core(module.update_chunk)("tenant-1", "kb-1", "doc-1", "chunk-1")) + assert res["message"] == "`important_keywords` should be a list", res - module.settings.docStoreConn.to_delete = [1, 2] - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "chunk_ids": ["c1", "c2", "c3"]}) - res = _run(module.rm()) - assert res["code"] == 0, res - assert module.DocumentService.decrement_calls, "decrement_chunk_num should be called" - assert len(module.settings.STORAGE_IMPL.rm_calls) >= 1 + monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"chunk_ids": []})) + res = _run(_route_core(module.switch_chunks)("tenant-1", "kb-1", "doc-1")) + assert res["message"] == "`chunk_ids` is required.", res - module.settings.docStoreConn.to_delete = [1] - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "chunk_ids": "c1"}) - res = _run(module.rm()) - assert res["code"] == 0, res - - async def _raise_thread_pool(_func): - raise RuntimeError("rm tp boom") - - monkeypatch.setattr(module, "thread_pool_exec", _raise_thread_pool) - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "chunk_ids": ["c1"]}) - res = _run(module.rm()) - assert res["code"] == module.RetCode.EXCEPTION_ERROR, res - assert "rm tp boom" in res["message"], res - - -@pytest.mark.p2 -def test_create_chunk_guards_pagerank_and_success_unit(monkeypatch): - module = _load_chunk_module(monkeypatch) - module.request = SimpleNamespace(headers={"X-Request-ID": "req-1"}, args={}) - - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "content_with_weight": "chunk", "important_kwd": "bad"}) - res = _run(module.create()) - assert res["message"] == "`important_kwd` is required to be a list", res - - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "content_with_weight": "chunk", "question_kwd": "bad"}) - res = 
_run(module.create()) - assert res["message"] == "`question_kwd` is required to be a list", res - - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (False, None)) - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "content_with_weight": "chunk"}) - res = _run(module.create()) - assert res["message"] == "Document not found!", res - - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (True, _DummyDoc(doc_id="doc-1"))) - monkeypatch.setattr(module.DocumentService, "get_tenant_id", lambda _doc_id: "") - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "content_with_weight": "chunk"}) - res = _run(module.create()) - assert res["message"] == "Tenant not found!", res - - monkeypatch.setattr(module.DocumentService, "get_tenant_id", lambda _doc_id: "tenant-1") - monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (False, None)) - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "content_with_weight": "chunk"}) - res = _run(module.create()) - assert res["message"] == "Knowledgebase not found!", res - - monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (True, SimpleNamespace(pagerank=0.8))) - _set_request_json( - monkeypatch, - module, - {"doc_id": "doc-1", "content_with_weight": "chunk", "tag_feas": [0.2]}, - ) - res = _run(module.create()) - assert "`tag_feas` must be an object mapping string tags to finite numeric scores" in res["message"], res - - _set_request_json( - monkeypatch, - module, - { - "doc_id": "doc-1", - "content_with_weight": "chunk", - "important_kwd": ["i1"], - "question_kwd": ["q1"], - "tag_feas": {"tag": 0.2}, - }, - ) - res = _run(module.create()) - assert res["code"] == 0, res - assert res["data"]["chunk_id"], res - assert module.settings.docStoreConn.inserted, "insert should be called" - inserted = module.settings.docStoreConn.inserted[-1] - assert "pagerank_flt" in inserted - assert module.DocumentService.increment_calls, 
"increment_chunk_num should be called" - - async def _raise_thread_pool(_func): - raise RuntimeError("create tp boom") - - monkeypatch.setattr(module, "thread_pool_exec", _raise_thread_pool) - _set_request_json(monkeypatch, module, {"doc_id": "doc-1", "content_with_weight": "chunk"}) - res = _run(module.create()) - assert res["code"] == module.RetCode.EXCEPTION_ERROR, res - assert "create tp boom" in res["message"], res + monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"chunk_ids": ["chunk-1"]})) + res = _run(_route_core(module.switch_chunks)("tenant-1", "kb-1", "doc-1")) + assert res["message"] == "`available_int` or `available` is required.", res @pytest.mark.p2 diff --git a/test/testcases/test_web_api/test_chunk_app/test_create_chunk.py b/test/testcases/test_web_api/test_chunk_app/test_create_chunk.py index 38331af20..f9e6f7607 100644 --- a/test/testcases/test_web_api/test_chunk_app/test_create_chunk.py +++ b/test/testcases/test_web_api/test_chunk_app/test_create_chunk.py @@ -16,24 +16,28 @@ from concurrent.futures import ThreadPoolExecutor, as_completed import pytest -from test_common import add_chunk, delete_document, get_chunk, list_chunks from configs import INVALID_API_TOKEN from libs.auth import RAGFlowWebApiAuth +from test_common import add_chunk, delete_document, get_chunk, list_chunks -def validate_chunk_details(auth, kb_id, doc_id, payload, res): - chunk_id = res["data"]["chunk_id"] - res = get_chunk(auth, {"chunk_id": chunk_id}) - assert res["code"] == 0, res - chunk = res["data"] - assert chunk["doc_id"] == doc_id - assert chunk["kb_id"] == kb_id - assert chunk["content_with_weight"] == payload["content_with_weight"] - if "important_kwd" in payload: - assert chunk["important_kwd"] == payload["important_kwd"] - if "question_kwd" in payload: - expected = [str(q).strip() for q in payload.get("question_kwd", [])] - assert chunk["question_kwd"] == expected +def validate_chunk_details(auth, dataset_id, document_id, payload, res): + 
chunk = res["data"]["chunk"] + assert chunk["dataset_id"] == dataset_id + assert chunk["document_id"] == document_id + assert chunk["content"] == payload["content"] + if "important_keywords" in payload: + assert chunk["important_keywords"] == payload["important_keywords"] + if "questions" in payload: + expected = [str(q).strip() for q in payload.get("questions", []) if str(q).strip()] + assert chunk["questions"] == expected + if "tag_kwd" in payload: + assert chunk["tag_kwd"] == payload["tag_kwd"] + + fetched = get_chunk(auth, dataset_id, document_id, chunk["id"]) + assert fetched["code"] == 0, fetched + assert fetched["data"]["id"] == chunk["id"] + assert fetched["data"]["doc_id"] == document_id @pytest.mark.p2 @@ -46,7 +50,7 @@ class TestAuthorization: ], ) def test_invalid_auth(self, invalid_auth, expected_code, expected_message): - res = add_chunk(invalid_auth) + res = add_chunk(invalid_auth, "dataset_id", "document_id", {"content": "chunk test"}) assert res["code"] == expected_code, res assert res["message"] == expected_message, res @@ -56,33 +60,22 @@ class TestAddChunk: @pytest.mark.parametrize( "payload, expected_code, expected_message", [ - ({"content_with_weight": None}, 100, """TypeError("unsupported operand type(s) for +: 'NoneType' and 'str'")"""), - ({"content_with_weight": ""}, 100, """Exception('Error: 413 - {"error":"Input validation error: `inputs` cannot be empty","error_type":"Validation"}')"""), - pytest.param( - {"content_with_weight": 1}, - 100, - """TypeError("unsupported operand type(s) for +: 'int' and 'str'")""", - marks=pytest.mark.skip, - ), - ({"content_with_weight": "a"}, 0, ""), - ({"content_with_weight": " "}, 0, ""), - ({"content_with_weight": "\n!?。;!?\"'"}, 0, ""), + ({"content": None}, 102, "`content` is required"), + ({"content": ""}, 102, "`content` is required"), + ({"content": "a"}, 0, ""), + ({"content": " "}, 102, "`content` is required"), + ({"content": "\n!?。;!?\"'"}, 0, ""), ], ) def test_content(self, WebApiAuth, 
add_document, payload, expected_code, expected_message): - kb_id, doc_id = add_document - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - if res["code"] == 0: - chunks_count = res["data"]["doc"]["chunk_num"] - else: - chunks_count = 0 - res = add_chunk(WebApiAuth, {**payload, "doc_id": doc_id}) + dataset_id, document_id = add_document + chunks_count = list_chunks(WebApiAuth, dataset_id, document_id)["data"]["doc"]["chunk_count"] + res = add_chunk(WebApiAuth, dataset_id, document_id, payload) assert res["code"] == expected_code, res if expected_code == 0: - validate_chunk_details(WebApiAuth, kb_id, doc_id, payload, res) - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - assert res["code"] == 0, res - assert res["data"]["doc"]["chunk_num"] == chunks_count + 1, res + validate_chunk_details(WebApiAuth, dataset_id, document_id, payload, res) + res = list_chunks(WebApiAuth, dataset_id, document_id) + assert res["data"]["doc"]["chunk_count"] == chunks_count + 1, res else: assert res["message"] == expected_message, res @@ -90,32 +83,20 @@ class TestAddChunk: @pytest.mark.parametrize( "payload, expected_code, expected_message", [ - ({"content_with_weight": "chunk test", "important_kwd": ["a", "b", "c"]}, 0, ""), - ({"content_with_weight": "chunk test", "important_kwd": [""]}, 0, ""), - ( - {"content_with_weight": "chunk test", "important_kwd": [1]}, - 100, - "TypeError('sequence item 0: expected str instance, int found')", - ), - ({"content_with_weight": "chunk test", "important_kwd": ["a", "a"]}, 0, ""), - ({"content_with_weight": "chunk test", "important_kwd": "abc"}, 102, "`important_kwd` is required to be a list"), - ({"content_with_weight": "chunk test", "important_kwd": 123}, 102, "`important_kwd` is required to be a list"), + ({"content": "chunk test", "important_keywords": ["a", "b", "c"]}, 0, ""), + ({"content": "chunk test", "important_keywords": [""]}, 0, ""), + ({"content": "chunk test", "important_keywords": [1]}, 100, "TypeError('sequence item 0: 
expected str instance, int found')"), + ({"content": "chunk test", "important_keywords": ["a", "a"]}, 0, ""), + ({"content": "chunk test", "important_keywords": "abc"}, 102, "`important_keywords` is required to be a list"), + ({"content": "chunk test", "important_keywords": 123}, 102, "`important_keywords` is required to be a list"), ], ) def test_important_keywords(self, WebApiAuth, add_document, payload, expected_code, expected_message): - kb_id, doc_id = add_document - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - if res["code"] == 0: - chunks_count = res["data"]["doc"]["chunk_num"] - else: - chunks_count = 0 - res = add_chunk(WebApiAuth, {**payload, "doc_id": doc_id}) + dataset_id, document_id = add_document + res = add_chunk(WebApiAuth, dataset_id, document_id, payload) assert res["code"] == expected_code, res if expected_code == 0: - validate_chunk_details(WebApiAuth, kb_id, doc_id, payload, res) - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - assert res["code"] == 0, res - assert res["data"]["doc"]["chunk_num"] == chunks_count + 1, res + validate_chunk_details(WebApiAuth, dataset_id, document_id, payload, res) else: assert res["message"] == expected_message, res @@ -123,130 +104,95 @@ class TestAddChunk: @pytest.mark.parametrize( "payload, expected_code, expected_message", [ - ({"content_with_weight": "chunk test", "question_kwd": ["a", "b", "c"]}, 0, ""), - ({"content_with_weight": "chunk test", "question_kwd": [""]}, 100, """Exception('Error: 413 - {"error":"Input validation error: `inputs` cannot be empty","error_type":"Validation"}')"""), - ({"content_with_weight": "chunk test", "question_kwd": [1]}, 100, "TypeError('sequence item 0: expected str instance, int found')"), - ({"content_with_weight": "chunk test", "question_kwd": ["a", "a"]}, 0, ""), - ({"content_with_weight": "chunk test", "question_kwd": "abc"}, 102, "`question_kwd` is required to be a list"), - ({"content_with_weight": "chunk test", "question_kwd": 123}, 102, "`question_kwd` 
is required to be a list"), + ({"content": "chunk test", "questions": ["a", "b", "c"]}, 0, ""), + ({"content": "chunk test", "questions": [""]}, 0, ""), + ({"content": "chunk test", "questions": [1]}, 100, "TypeError('sequence item 0: expected str instance, int found')"), + ({"content": "chunk test", "questions": ["a", "a"]}, 0, ""), + ({"content": "chunk test", "questions": "abc"}, 102, "`questions` is required to be a list"), + ({"content": "chunk test", "questions": 123}, 102, "`questions` is required to be a list"), ], ) def test_questions(self, WebApiAuth, add_document, payload, expected_code, expected_message): - kb_id, doc_id = add_document - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - if res["code"] == 0: - chunks_count = res["data"]["doc"]["chunk_num"] - else: - chunks_count = 0 - res = add_chunk(WebApiAuth, {**payload, "doc_id": doc_id}) + dataset_id, document_id = add_document + res = add_chunk(WebApiAuth, dataset_id, document_id, payload) assert res["code"] == expected_code, res if expected_code == 0: - validate_chunk_details(WebApiAuth, kb_id, doc_id, payload, res) - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - assert res["code"] == 0, res - assert res["data"]["doc"]["chunk_num"] == chunks_count + 1, res + validate_chunk_details(WebApiAuth, dataset_id, document_id, payload, res) else: assert res["message"] == expected_message, res @pytest.mark.p2 - def test_get_chunk_not_found(self, WebApiAuth): - res = get_chunk(WebApiAuth, {"chunk_id": "missing_chunk_id"}) - assert res["code"] != 0, res - assert "Chunk not found" in res["message"], res + def test_add_chunk_with_tag_fields(self, WebApiAuth, add_document): + dataset_id, document_id = add_document + payload = { + "content": "chunk with tags", + "tag_kwd": ["tag1", "tag2"], + "important_keywords": ["tag"], + "questions": ["question"], + } + res = add_chunk(WebApiAuth, dataset_id, document_id, payload) + assert res["code"] == 0, res + validate_chunk_details(WebApiAuth, dataset_id, 
document_id, payload, res) @pytest.mark.p2 - def test_create_chunk_with_tag_fields(self, WebApiAuth, add_document): - _, doc_id = add_document - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - if res["code"] == 0: - chunks_count = res["data"]["doc"]["chunk_num"] - else: - chunks_count = 0 - - payload = { - "doc_id": doc_id, - "content_with_weight": "chunk with tags", - "tag_feas": {"tag1": 0.1, "tag2": 0.2}, - "important_kwd": ["tag"], - "question_kwd": ["question"], - } - res = add_chunk(WebApiAuth, payload) - assert res["code"] == 0, res - assert res["data"]["chunk_id"], res - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - assert res["code"] == 0, res - assert res["data"]["doc"]["chunk_num"] == chunks_count + 1, res + def test_get_chunk_not_found(self, WebApiAuth, add_document): + dataset_id, document_id = add_document + res = get_chunk(WebApiAuth, dataset_id, document_id, "missing_chunk_id") + assert res["code"] == 102, res + assert "Chunk not found" in res["message"], res @pytest.mark.p3 @pytest.mark.parametrize( - "doc_id, expected_code, expected_message", + "document_id, expected_code, expected_message", [ - ("", 102, "Document not found!"), - ("invalid_document_id", 102, "Document not found!"), + ("invalid_document_id", 102, "You don't own the document invalid_document_id."), ], ) - def test_invalid_document_id(self, WebApiAuth, add_document, doc_id, expected_code, expected_message): - _, _ = add_document - res = add_chunk(WebApiAuth, {"doc_id": doc_id, "content_with_weight": "chunk test"}) + def test_invalid_document_id(self, WebApiAuth, add_document, document_id, expected_code, expected_message): + dataset_id, _ = add_document + res = add_chunk(WebApiAuth, dataset_id, document_id, {"content": "chunk test"}) assert res["code"] == expected_code, res assert res["message"] == expected_message, res @pytest.mark.p3 def test_repeated_add_chunk(self, WebApiAuth, add_document): - payload = {"content_with_weight": "chunk test"} - kb_id, doc_id = 
add_document - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - if res["code"] != 0: - assert False, res - chunks_count = res["data"]["doc"]["chunk_num"] + payload = {"content": "chunk test"} + dataset_id, document_id = add_document + chunks_count = list_chunks(WebApiAuth, dataset_id, document_id)["data"]["doc"]["chunk_count"] - res = add_chunk(WebApiAuth, {**payload, "doc_id": doc_id}) + res = add_chunk(WebApiAuth, dataset_id, document_id, payload) assert res["code"] == 0, res - validate_chunk_details(WebApiAuth, kb_id, doc_id, payload, res) - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - if res["code"] != 0: - assert False, res - assert res["data"]["doc"]["chunk_num"] == chunks_count + 1, res + validate_chunk_details(WebApiAuth, dataset_id, document_id, payload, res) - res = add_chunk(WebApiAuth, {**payload, "doc_id": doc_id}) + res = add_chunk(WebApiAuth, dataset_id, document_id, payload) assert res["code"] == 0, res - validate_chunk_details(WebApiAuth, kb_id, doc_id, payload, res) - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - if res["code"] != 0: - assert False, res - assert res["data"]["doc"]["chunk_num"] == chunks_count + 2, res + validate_chunk_details(WebApiAuth, dataset_id, document_id, payload, res) + + res = list_chunks(WebApiAuth, dataset_id, document_id) + assert res["data"]["doc"]["chunk_count"] == chunks_count + 2, res @pytest.mark.p2 def test_add_chunk_to_deleted_document(self, WebApiAuth, add_document): - kb_id, doc_id = add_document - delete_document(WebApiAuth, kb_id, {"ids": [doc_id]}) - res = add_chunk(WebApiAuth, {"doc_id": doc_id, "content_with_weight": "chunk test"}) + dataset_id, document_id = add_document + delete_document(WebApiAuth, dataset_id, {"ids": [document_id]}) + res = add_chunk(WebApiAuth, dataset_id, document_id, {"content": "chunk test"}) assert res["code"] == 102, res - assert res["message"] == "Document not found!", res + assert res["message"] == f"You don't own the document {document_id}.", res 
@pytest.mark.skip(reason="issues/6411") @pytest.mark.p3 def test_concurrent_add_chunk(self, WebApiAuth, add_document): count = 50 - _, doc_id = add_document - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - if res["code"] == 0: - chunks_count = res["data"]["doc"]["chunk_num"] - else: - chunks_count = 0 + dataset_id, document_id = add_document + chunks_count = list_chunks(WebApiAuth, dataset_id, document_id)["data"]["doc"]["chunk_count"] with ThreadPoolExecutor(max_workers=5) as executor: futures = [ - executor.submit( - add_chunk, - WebApiAuth, - {"doc_id": doc_id, "content_with_weight": f"chunk test {i}"}, - ) + executor.submit(add_chunk, WebApiAuth, dataset_id, document_id, {"content": f"chunk test {i}"}) for i in range(count) ] responses = list(as_completed(futures)) assert len(responses) == count, responses assert all(future.result()["code"] == 0 for future in futures) - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - assert res["code"] == 0, res - assert res["data"]["doc"]["chunk_num"] == chunks_count + count + res = list_chunks(WebApiAuth, dataset_id, document_id) + assert res["data"]["doc"]["chunk_count"] == chunks_count + count diff --git a/test/testcases/test_web_api/test_chunk_app/test_list_chunks.py b/test/testcases/test_web_api/test_chunk_app/test_list_chunks.py index 75b6082a5..1b381499f 100644 --- a/test/testcases/test_web_api/test_chunk_app/test_list_chunks.py +++ b/test/testcases/test_web_api/test_chunk_app/test_list_chunks.py @@ -17,9 +17,9 @@ import os from concurrent.futures import ThreadPoolExecutor, as_completed import pytest -from test_common import batch_add_chunks, list_chunks, update_chunk from configs import INVALID_API_TOKEN from libs.auth import RAGFlowWebApiAuth +from test_common import batch_add_chunks, list_chunks, update_chunk @pytest.mark.p2 @@ -32,7 +32,7 @@ class TestAuthorization: ], ) def test_invalid_auth(self, invalid_auth, expected_code, expected_message): - res = list_chunks(invalid_auth, {"doc_id": "document_id"}) + 
res = list_chunks(invalid_auth, "dataset_id", "document_id") assert res["code"] == expected_code, res assert res["message"] == expected_message, res @@ -42,21 +42,18 @@ class TestChunksList: @pytest.mark.parametrize( "params, expected_code, expected_page_size, expected_message", [ - pytest.param({"page": None, "size": 2}, 100, 0, """TypeError("int() argument must be a string, a bytes-like object or a real number, not 'NoneType'")""", marks=pytest.mark.skip), - pytest.param({"page": 0, "size": 2}, 100, 0, "ValueError('Search does not support negative slicing.')", marks=pytest.mark.skip), - ({"page": 2, "size": 2}, 0, 2, ""), - ({"page": 3, "size": 2}, 0, 1, ""), - ({"page": "3", "size": 2}, 0, 1, ""), - pytest.param({"page": -1, "size": 2}, 100, 0, "ValueError('Search does not support negative slicing.')", marks=pytest.mark.skip), - pytest.param({"page": "a", "size": 2}, 100, 0, """ValueError("invalid literal for int() with base 10: \'a\'")""", marks=pytest.mark.skip), + ({"page": None, "page_size": 2}, 0, 2, ""), + pytest.param({"page": 0, "page_size": 2}, 100, 0, "ValueError('Search does not support negative slicing.')", marks=pytest.mark.skip), + ({"page": 2, "page_size": 2}, 0, 2, ""), + ({"page": 3, "page_size": 2}, 0, 1, ""), + ({"page": "3", "page_size": 2}, 0, 1, ""), + pytest.param({"page": -1, "page_size": 2}, 100, 0, "ValueError('Search does not support negative slicing.')", marks=pytest.mark.skip), + pytest.param({"page": "a", "page_size": 2}, 100, 0, """ValueError("invalid literal for int() with base 10: 'a'")""", marks=pytest.mark.skip), ], ) def test_page(self, WebApiAuth, add_chunks, params, expected_code, expected_page_size, expected_message): - _, doc_id, _ = add_chunks - payload = {"doc_id": doc_id} - if params: - payload.update(params) - res = list_chunks(WebApiAuth, payload) + dataset_id, document_id, _ = add_chunks + res = list_chunks(WebApiAuth, dataset_id, document_id, params=params) assert res["code"] == expected_code, res if expected_code 
== 0: assert len(res["data"]["chunks"]) == expected_page_size, res @@ -67,21 +64,18 @@ class TestChunksList: @pytest.mark.parametrize( "params, expected_code, expected_page_size, expected_message", [ - ({"size": None}, 100, 0, """TypeError("int() argument must be a string, a bytes-like object or a real number, not 'NoneType'")"""), - pytest.param({"size": 0}, 0, 5, ""), - ({"size": 1}, 0, 1, ""), - ({"size": 6}, 0, 5, ""), - ({"size": "1"}, 0, 1, ""), - pytest.param({"size": -1}, 0, 5, "", marks=pytest.mark.skip), - pytest.param({"size": "a"}, 100, 0, """ValueError("invalid literal for int() with base 10: \'a\'")""", marks=pytest.mark.skip), + ({"page_size": None}, 0, 5, ""), + pytest.param({"page_size": 0}, 0, 5, ""), + ({"page_size": 1}, 0, 1, ""), + ({"page_size": 6}, 0, 5, ""), + ({"page_size": "1"}, 0, 1, ""), + pytest.param({"page_size": -1}, 0, 5, "", marks=pytest.mark.skip), + pytest.param({"page_size": "a"}, 100, 0, """ValueError("invalid literal for int() with base 10: 'a'")""", marks=pytest.mark.skip), ], ) def test_page_size(self, WebApiAuth, add_chunks, params, expected_code, expected_page_size, expected_message): - _, doc_id, _ = add_chunks - payload = {"doc_id": doc_id} - if params: - payload.update(params) - res = list_chunks(WebApiAuth, payload) + dataset_id, document_id, _ = add_chunks + res = list_chunks(WebApiAuth, dataset_id, document_id, params=params) assert res["code"] == expected_code, res if expected_code == 0: assert len(res["data"]["chunks"]) == expected_page_size, res @@ -89,29 +83,22 @@ class TestChunksList: assert res["message"] == expected_message, res @pytest.mark.p2 - def test_available_int_filter(self, WebApiAuth, add_chunks): - _, doc_id, chunk_ids = add_chunks + def test_available_filter(self, WebApiAuth, add_chunks): + dataset_id, document_id, chunk_ids = add_chunks chunk_id = chunk_ids[0] - res = update_chunk( - WebApiAuth, - {"doc_id": doc_id, "chunk_id": chunk_id, "content_with_weight": "unchanged content", "available_int": 
0}, - ) + res = update_chunk(WebApiAuth, dataset_id, document_id, chunk_id, {"content": "unchanged content", "available": False}) assert res["code"] == 0, res from time import sleep sleep(1) - res = list_chunks(WebApiAuth, {"doc_id": doc_id, "available_int": 0}) + res = list_chunks(WebApiAuth, dataset_id, document_id, params={"available": "false"}) assert res["code"] == 0, res assert len(res["data"]["chunks"]) >= 1, res - assert all(chunk["available_int"] == 0 for chunk in res["data"]["chunks"]), res + assert all(chunk["available"] is False for chunk in res["data"]["chunks"]), res - # Restore the class-scoped fixture state for subsequent keyword cases. - res = update_chunk( - WebApiAuth, - {"doc_id": doc_id, "chunk_id": chunk_id, "content_with_weight": "chunk test 0", "available_int": 1}, - ) + res = update_chunk(WebApiAuth, dataset_id, document_id, chunk_id, {"content": "chunk test 0", "available": True}) assert res["code"] == 0, res sleep(1) @@ -123,49 +110,44 @@ class TestChunksList: ({"keywords": ""}, 5), ({"keywords": "1"}, 1), pytest.param({"keywords": "chunk"}, 4, marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="issues/6509")), - ({"keywords": "content"}, 1), ({"keywords": "unknown"}, 0), ], ) def test_keywords(self, WebApiAuth, add_chunks, params, expected_page_size): - _, doc_id, _ = add_chunks - payload = {"doc_id": doc_id} - if params: - payload.update(params) - res = list_chunks(WebApiAuth, payload) + dataset_id, document_id, _ = add_chunks + res = list_chunks(WebApiAuth, dataset_id, document_id, params=params) assert res["code"] == 0, res assert len(res["data"]["chunks"]) == expected_page_size, res @pytest.mark.p3 def test_invalid_params(self, WebApiAuth, add_chunks): - _, doc_id, _ = add_chunks - payload = {"doc_id": doc_id, "a": "b"} - res = list_chunks(WebApiAuth, payload) + dataset_id, document_id, _ = add_chunks + res = list_chunks(WebApiAuth, dataset_id, document_id, params={"a": "b"}) assert res["code"] == 0, res assert 
len(res["data"]["chunks"]) == 5, res @pytest.mark.p3 def test_concurrent_list(self, WebApiAuth, add_chunks): - _, doc_id, _ = add_chunks + dataset_id, document_id, _ = add_chunks count = 100 with ThreadPoolExecutor(max_workers=5) as executor: - futures = [executor.submit(list_chunks, WebApiAuth, {"doc_id": doc_id}) for i in range(count)] + futures = [executor.submit(list_chunks, WebApiAuth, dataset_id, document_id) for _ in range(count)] responses = list(as_completed(futures)) assert len(responses) == count, responses assert all(len(future.result()["data"]["chunks"]) == 5 for future in futures) @pytest.mark.p1 def test_default(self, WebApiAuth, add_document): - _, doc_id = add_document + dataset_id, document_id = add_document + + res = list_chunks(WebApiAuth, dataset_id, document_id) + chunks_count = res["data"]["doc"]["chunk_count"] + batch_add_chunks(WebApiAuth, dataset_id, document_id, 31) - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - chunks_count = res["data"]["doc"]["chunk_num"] - batch_add_chunks(WebApiAuth, doc_id, 31) - # issues/6487 from time import sleep sleep(3) - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) + res = list_chunks(WebApiAuth, dataset_id, document_id) assert res["code"] == 0 assert len(res["data"]["chunks"]) == 30 - assert res["data"]["doc"]["chunk_num"] == chunks_count + 31 + assert res["data"]["doc"]["chunk_count"] == chunks_count + 31 diff --git a/test/testcases/test_web_api/test_chunk_app/test_rm_chunks.py b/test/testcases/test_web_api/test_chunk_app/test_rm_chunks.py index 45be9a732..6979ef041 100644 --- a/test/testcases/test_web_api/test_chunk_app/test_rm_chunks.py +++ b/test/testcases/test_web_api/test_chunk_app/test_rm_chunks.py @@ -16,9 +16,9 @@ from concurrent.futures import ThreadPoolExecutor, as_completed import pytest -from test_common import batch_add_chunks, delete_chunks, list_chunks from configs import INVALID_API_TOKEN from libs.auth import RAGFlowWebApiAuth +from test_common import batch_add_chunks, 
delete_chunks, list_chunks @pytest.mark.p2 @@ -31,7 +31,7 @@ class TestAuthorization: ], ) def test_invalid_auth(self, invalid_auth, expected_code, expected_message): - res = delete_chunks(invalid_auth, {"doc_id": "document_id", "chunk_ids": ["1"]}) + res = delete_chunks(invalid_auth, "dataset_id", "document_id", {"chunk_ids": ["1"]}) assert res["code"] == expected_code assert res["message"] == expected_message @@ -39,17 +39,16 @@ class TestAuthorization: class TestChunksDeletion: @pytest.mark.p3 @pytest.mark.parametrize( - "doc_id, expected_code, expected_message", + "document_id, expected_code, expected_message", [ - ("", 102, "Document not found!"), - ("invalid_document_id", 102, "Document not found!"), + ("invalid_document_id", 100, "Can't find the document with ID invalid_document_id!"), ], ) - def test_invalid_document_id(self, WebApiAuth, add_chunks_func, doc_id, expected_code, expected_message): - _, _, chunk_ids = add_chunks_func - res = delete_chunks(WebApiAuth, {"doc_id": doc_id, "chunk_ids": chunk_ids}) + def test_invalid_document_id(self, WebApiAuth, add_chunks_func, document_id, expected_code, expected_message): + dataset_id, _, chunk_ids = add_chunks_func + res = delete_chunks(WebApiAuth, dataset_id, document_id, {"chunk_ids": chunk_ids}) assert res["code"] == expected_code, res - assert res["message"] == expected_message, res + assert expected_message in res["message"], res @pytest.mark.parametrize( "payload", @@ -60,61 +59,41 @@ class TestChunksDeletion: ], ) def test_delete_partial_invalid_id(self, WebApiAuth, add_chunks_func, payload): - _, doc_id, chunk_ids = add_chunks_func - if callable(payload): - payload = payload(chunk_ids) - payload["doc_id"] = doc_id - res = delete_chunks(WebApiAuth, payload) - assert res["code"] == 0, res - - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - assert res["code"] == 0, res - assert len(res["data"]["chunks"]) == 0, res - assert res["data"]["total"] == 0, res + dataset_id, document_id, chunk_ids = 
add_chunks_func + payload = payload(chunk_ids) + res = delete_chunks(WebApiAuth, dataset_id, document_id, payload) + assert res["code"] == 102, res + assert "rm_chunk deleted chunks" in res["message"], res @pytest.mark.p3 def test_repeated_deletion(self, WebApiAuth, add_chunks_func): - _, doc_id, chunk_ids = add_chunks_func - payload = {"chunk_ids": chunk_ids, "doc_id": doc_id} - res = delete_chunks(WebApiAuth, payload) + dataset_id, document_id, chunk_ids = add_chunks_func + payload = {"chunk_ids": chunk_ids} + res = delete_chunks(WebApiAuth, dataset_id, document_id, payload) assert res["code"] == 0, res - res = delete_chunks(WebApiAuth, payload) + res = delete_chunks(WebApiAuth, dataset_id, document_id, payload) assert res["code"] == 102, res - assert res["message"] == "Index updating failure", res + assert res["message"] == f"rm_chunk deleted chunks 0, expect {len(chunk_ids)}", res @pytest.mark.p3 def test_duplicate_deletion(self, WebApiAuth, add_chunks_func): - _, doc_id, chunk_ids = add_chunks_func - payload = {"chunk_ids": chunk_ids * 2, "doc_id": doc_id} - res = delete_chunks(WebApiAuth, payload) + dataset_id, document_id, chunk_ids = add_chunks_func + res = delete_chunks(WebApiAuth, dataset_id, document_id, {"chunk_ids": chunk_ids * 2}) assert res["code"] == 0, res - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) + res = list_chunks(WebApiAuth, dataset_id, document_id) assert res["code"] == 0, res assert len(res["data"]["chunks"]) == 0, res assert res["data"]["total"] == 0, res - @pytest.mark.p2 - def test_delete_scalar_chunk_id_payload(self, WebApiAuth, add_chunks_func): - _, doc_id, chunk_ids = add_chunks_func - payload = {"chunk_ids": chunk_ids[0], "doc_id": doc_id} - res = delete_chunks(WebApiAuth, payload) - assert res["code"] == 0, res - - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - assert res["code"] == 0, res - assert len(res["data"]["chunks"]) == 3, res - assert res["data"]["total"] == 3, res - @pytest.mark.p2 def 
test_delete_duplicate_ids_dedup_behavior(self, WebApiAuth, add_chunks_func): - _, doc_id, chunk_ids = add_chunks_func - payload = {"chunk_ids": [chunk_ids[0], chunk_ids[0]], "doc_id": doc_id} - res = delete_chunks(WebApiAuth, payload) + dataset_id, document_id, chunk_ids = add_chunks_func + res = delete_chunks(WebApiAuth, dataset_id, document_id, {"chunk_ids": [chunk_ids[0], chunk_ids[0]]}) assert res["code"] == 0, res - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) + res = list_chunks(WebApiAuth, dataset_id, document_id) assert res["code"] == 0, res assert len(res["data"]["chunks"]) == 3, res assert res["data"]["total"] == 3, res @@ -122,16 +101,12 @@ class TestChunksDeletion: @pytest.mark.p3 def test_concurrent_deletion(self, WebApiAuth, add_document): count = 100 - _, doc_id = add_document - chunk_ids = batch_add_chunks(WebApiAuth, doc_id, count) + dataset_id, document_id = add_document + chunk_ids = batch_add_chunks(WebApiAuth, dataset_id, document_id, count) with ThreadPoolExecutor(max_workers=5) as executor: futures = [ - executor.submit( - delete_chunks, - WebApiAuth, - {"doc_id": doc_id, "chunk_ids": chunk_ids[i : i + 1]}, - ) + executor.submit(delete_chunks, WebApiAuth, dataset_id, document_id, {"chunk_ids": chunk_ids[i : i + 1]}) for i in range(count) ] responses = list(as_completed(futures)) @@ -141,45 +116,40 @@ class TestChunksDeletion: @pytest.mark.p3 def test_delete_1k(self, WebApiAuth, add_document): chunks_num = 1_000 - _, doc_id = add_document - chunk_ids = batch_add_chunks(WebApiAuth, doc_id, chunks_num) + dataset_id, document_id = add_document + chunk_ids = batch_add_chunks(WebApiAuth, dataset_id, document_id, chunks_num) from time import sleep sleep(1) - res = delete_chunks(WebApiAuth, {"doc_id": doc_id, "chunk_ids": chunk_ids}) + res = delete_chunks(WebApiAuth, dataset_id, document_id, {"chunk_ids": chunk_ids}) assert res["code"] == 0 - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - if res["code"] != 0: - assert False, res + res = 
list_chunks(WebApiAuth, dataset_id, document_id) + assert res["code"] == 0, res assert len(res["data"]["chunks"]) == 0, res assert res["data"]["total"] == 0, res @pytest.mark.parametrize( "payload, expected_code, expected_message, remaining", [ - pytest.param(None, 100, """TypeError("argument of type \'NoneType\' is not iterable")""", 5, marks=pytest.mark.skip), - pytest.param({"chunk_ids": ["invalid_id"]}, 102, "Index updating failure", 4, marks=pytest.mark.p3), - pytest.param("not json", 100, """UnboundLocalError("local variable \'duplicate_messages\' referenced before assignment")""", 5, marks=pytest.mark.skip(reason="pull/6376")), + pytest.param({"chunk_ids": ["invalid_id"]}, 102, "rm_chunk deleted chunks 0, expect 1", 4, marks=pytest.mark.p3), pytest.param(lambda r: {"chunk_ids": r[:1]}, 0, "", 3, marks=pytest.mark.p3), pytest.param(lambda r: {"chunk_ids": r}, 0, "", 0, marks=pytest.mark.p1), pytest.param({"chunk_ids": []}, 0, "", 4, marks=pytest.mark.p3), ], ) def test_basic_scenarios(self, WebApiAuth, add_chunks_func, payload, expected_code, expected_message, remaining): - _, doc_id, chunk_ids = add_chunks_func + dataset_id, document_id, chunk_ids = add_chunks_func if callable(payload): payload = payload(chunk_ids) - payload["doc_id"] = doc_id - res = delete_chunks(WebApiAuth, payload) + res = delete_chunks(WebApiAuth, dataset_id, document_id, payload) assert res["code"] == expected_code, res if res["code"] != 0: assert res["message"] == expected_message, res - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - if res["code"] != 0: - assert False, res + res = list_chunks(WebApiAuth, dataset_id, document_id) + assert res["code"] == 0, res assert len(res["data"]["chunks"]) == remaining, res assert res["data"]["total"] == remaining, res diff --git a/test/testcases/test_web_api/test_chunk_app/test_update_chunk.py b/test/testcases/test_web_api/test_chunk_app/test_update_chunk.py index 84df26dc2..e94fc9b18 100644 --- 
a/test/testcases/test_web_api/test_chunk_app/test_update_chunk.py +++ b/test/testcases/test_web_api/test_chunk_app/test_update_chunk.py @@ -13,16 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import base64 import os from concurrent.futures import ThreadPoolExecutor, as_completed from random import randint from time import sleep import pytest -from test_common import delete_document, list_chunks, update_chunk from configs import INVALID_API_TOKEN from libs.auth import RAGFlowWebApiAuth +from test_common import delete_document, list_chunks, update_chunk @pytest.mark.p2 @@ -35,178 +34,144 @@ class TestAuthorization: ], ) def test_invalid_auth(self, invalid_auth, expected_code, expected_message): - res = update_chunk(invalid_auth, {"doc_id": "doc_id", "chunk_id": "chunk_id", "content_with_weight": "test"}) + res = update_chunk(invalid_auth, "dataset_id", "document_id", "chunk_id", {"content": "test"}) assert res["code"] == expected_code, res assert res["message"] == expected_message, res +def _find_chunk(auth, dataset_id, document_id, chunk_id): + res = list_chunks(auth, dataset_id, document_id, params={"id": chunk_id}) + assert res["code"] == 0, res + return res["data"]["chunks"][0] + + class TestUpdateChunk: @pytest.mark.p1 @pytest.mark.parametrize( "payload, expected_code, expected_message", [ - ({"content_with_weight": None}, 100, "TypeError('expected string or bytes-like object')"), - ({"content_with_weight": ""}, 102, "`content_with_weight` is required"), - ({"content_with_weight": 1}, 100, "TypeError('expected string or bytes-like object')"), - ({"content_with_weight": "update chunk"}, 0, ""), - ({"content_with_weight": " "}, 102, "`content_with_weight` is required"), - ({"content_with_weight": "\n!?。;!?\"'"}, 0, ""), + ({"content": None}, 0, ""), + ({"content": ""}, 102, "`content` is required"), + pytest.param({"content": 1}, 100, "TypeError('expected string or bytes-like object')", 
marks=pytest.mark.skip), + ({"content": "update chunk"}, 0, ""), + ({"content": " "}, 102, "`content` is required"), + ({"content": "\n!?。;!?\"'"}, 0, ""), ], ) def test_content(self, WebApiAuth, add_chunks, payload, expected_code, expected_message): - _, doc_id, chunk_ids = add_chunks + dataset_id, document_id, chunk_ids = add_chunks chunk_id = chunk_ids[0] - update_payload = {"doc_id": doc_id, "chunk_id": chunk_id} - if payload: - update_payload.update(payload) - res = update_chunk(WebApiAuth, update_payload) + res = update_chunk(WebApiAuth, dataset_id, document_id, chunk_id, payload) assert res["code"] == expected_code, res if expected_code != 0: assert res["message"] == expected_message, res else: sleep(1) - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - for chunk in res["data"]["chunks"]: - if chunk["chunk_id"] == chunk_id: - assert chunk["content_with_weight"] == payload["content_with_weight"] + chunk = _find_chunk(WebApiAuth, dataset_id, document_id, chunk_id) + if payload["content"] is not None: + assert chunk["content"] == payload["content"] @pytest.mark.p2 @pytest.mark.parametrize( "payload, expected_code, expected_message", [ - ({"important_kwd": ["a", "b", "c"]}, 0, ""), - ({"important_kwd": [""]}, 0, ""), - ({"important_kwd": [1]}, 100, "TypeError('sequence item 0: expected str instance, int found')"), - ({"important_kwd": ["a", "a"]}, 0, ""), - ({"important_kwd": "abc"}, 102, "`important_kwd` should be a list"), - ({"important_kwd": 123}, 102, "`important_kwd` should be a list"), + ({"important_keywords": ["a", "b", "c"]}, 0, ""), + ({"important_keywords": [""]}, 0, ""), + ({"important_keywords": [1]}, 100, "TypeError('sequence item 0: expected str instance, int found')"), + ({"important_keywords": ["a", "a"]}, 0, ""), + ({"important_keywords": "abc"}, 102, "`important_keywords` should be a list"), + ({"important_keywords": 123}, 102, "`important_keywords` should be a list"), ], ) def test_important_keywords(self, WebApiAuth, add_chunks, payload, 
expected_code, expected_message): - _, doc_id, chunk_ids = add_chunks + dataset_id, document_id, chunk_ids = add_chunks chunk_id = chunk_ids[0] - update_payload = {"doc_id": doc_id, "chunk_id": chunk_id, "content_with_weight": "unchanged content"} # Add content_with_weight as it's required - if payload: - update_payload.update(payload) - res = update_chunk(WebApiAuth, update_payload) + res = update_chunk(WebApiAuth, dataset_id, document_id, chunk_id, payload) assert res["code"] == expected_code, res if expected_code != 0: assert res["message"] == expected_message, res else: sleep(1) - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - for chunk in res["data"]["chunks"]: - if chunk["chunk_id"] == chunk_id: - assert chunk["important_kwd"] == payload["important_kwd"] + chunk = _find_chunk(WebApiAuth, dataset_id, document_id, chunk_id) + assert chunk["important_keywords"] == payload["important_keywords"] @pytest.mark.p2 @pytest.mark.parametrize( "payload, expected_code, expected_message", [ - ({"question_kwd": ["a", "b", "c"]}, 0, ""), - ({"question_kwd": [""]}, 100, """Exception('Error: 413 - {"error":"Input validation error: `inputs` cannot be empty","error_type":"Validation"}')"""), - ({"question_kwd": [1]}, 100, "TypeError('sequence item 0: expected str instance, int found')"), - ({"question_kwd": ["a", "a"]}, 0, ""), - ({"question_kwd": "abc"}, 102, "`question_kwd` should be a list"), - ({"question_kwd": 123}, 102, "`question_kwd` should be a list"), + ({"questions": ["a", "b", "c"]}, 0, ""), + ({"questions": [""]}, 0, ""), + ({"questions": [1]}, 100, "TypeError('sequence item 0: expected str instance, int found')"), + ({"questions": ["a", "a"]}, 0, ""), + ({"questions": "abc"}, 102, "`questions` should be a list"), + ({"questions": 123}, 102, "`questions` should be a list"), ], ) def test_questions(self, WebApiAuth, add_chunks, payload, expected_code, expected_message): - _, doc_id, chunk_ids = add_chunks + dataset_id, document_id, chunk_ids = add_chunks 
chunk_id = chunk_ids[0] - update_payload = {"doc_id": doc_id, "chunk_id": chunk_id, "content_with_weight": "unchanged content"} # Add content_with_weight as it's required - if payload: - update_payload.update(payload) - - res = update_chunk(WebApiAuth, update_payload) + res = update_chunk(WebApiAuth, dataset_id, document_id, chunk_id, payload) assert res["code"] == expected_code, res if expected_code != 0: assert res["message"] == expected_message, res else: sleep(1) - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - for chunk in res["data"]["chunks"]: - if chunk["chunk_id"] == chunk_id: - assert chunk["question_kwd"] == payload["question_kwd"] + chunk = _find_chunk(WebApiAuth, dataset_id, document_id, chunk_id) + assert chunk["questions"] == [str(q).strip() for q in payload["questions"] if str(q).strip()] @pytest.mark.p2 @pytest.mark.parametrize( "payload, expected_code, expected_message", [ - ({"available_int": 1}, 0, ""), - ({"available_int": 0}, 0, ""), + ({"available": True}, 0, ""), + ({"available": 1}, 0, ""), + ({"available": False}, 0, ""), + ({"available": 0}, 0, ""), ], ) def test_available(self, WebApiAuth, add_chunks, payload, expected_code, expected_message): - _, doc_id, chunk_ids = add_chunks + dataset_id, document_id, chunk_ids = add_chunks chunk_id = chunk_ids[0] - update_payload = {"doc_id": doc_id, "chunk_id": chunk_id, "content_with_weight": "unchanged content"} - if payload: - update_payload.update(payload) - - res = update_chunk(WebApiAuth, update_payload) + res = update_chunk(WebApiAuth, dataset_id, document_id, chunk_id, payload) assert res["code"] == expected_code, res if expected_code != 0: assert res["message"] == expected_message, res else: sleep(1) - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - for chunk in res["data"]["chunks"]: - if chunk["chunk_id"] == chunk_id: - assert chunk["available_int"] == payload["available_int"] + chunk = _find_chunk(WebApiAuth, dataset_id, document_id, chunk_id) + assert chunk["available"] == 
bool(payload["available"]) @pytest.mark.p2 def test_update_chunk_qa_multiline_content(self, WebApiAuth, add_chunks): - _, doc_id, chunk_ids = add_chunks - payload = {"doc_id": doc_id, "chunk_id": chunk_ids[0], "content_with_weight": "Question line\nAnswer line"} - res = update_chunk(WebApiAuth, payload) + dataset_id, document_id, chunk_ids = add_chunks + payload = {"content": "Question line\nAnswer line"} + res = update_chunk(WebApiAuth, dataset_id, document_id, chunk_ids[0], payload) assert res["code"] == 0, res sleep(1) - res = list_chunks(WebApiAuth, {"doc_id": doc_id}) - assert res["code"] == 0, res - chunk = next(chunk for chunk in res["data"]["chunks"] if chunk["chunk_id"] == chunk_ids[0]) - assert chunk["content_with_weight"] == payload["content_with_weight"], res - - @pytest.mark.p2 - def test_update_chunk_with_image_payload(self, WebApiAuth, add_chunks): - _, doc_id, chunk_ids = add_chunks - payload = { - "doc_id": doc_id, - "chunk_id": chunk_ids[0], - "content_with_weight": "content with image", - "image_base64": base64.b64encode(b"img").decode("utf-8"), - "img_id": "bucket-name", - } - res = update_chunk(WebApiAuth, payload) - assert res["code"] == 0, res + chunk = _find_chunk(WebApiAuth, dataset_id, document_id, chunk_ids[0]) + assert chunk["content"] == payload["content"], chunk @pytest.mark.p3 @pytest.mark.parametrize( - "doc_id_param, expected_code, expected_message", + "document_id, expected_code, expected_message", [ - ("", 102, "Tenant not found!"), - ("invalid_doc_id", 102, "Tenant not found!"), + ("invalid_doc_id", 102, "You don't own the document invalid_doc_id."), ], ) - def test_invalid_document_id_for_update(self, WebApiAuth, add_chunks, doc_id_param, expected_code, expected_message): - _, _, chunk_ids = add_chunks - chunk_id = chunk_ids[0] - - payload = {"doc_id": doc_id_param, "chunk_id": chunk_id, "content_with_weight": "test content"} - res = update_chunk(WebApiAuth, payload) + def test_invalid_document_id_for_update(self, WebApiAuth, 
add_chunks, document_id, expected_code, expected_message): + dataset_id, _, chunk_ids = add_chunks + res = update_chunk(WebApiAuth, dataset_id, document_id, chunk_ids[0], {"content": "test content"}) assert res["code"] == expected_code assert expected_message in res["message"] @pytest.mark.p3 def test_repeated_update_chunk(self, WebApiAuth, add_chunks): - _, doc_id, chunk_ids = add_chunks - payload1 = {"doc_id": doc_id, "chunk_id": chunk_ids[0], "content_with_weight": "chunk test 1"} - res = update_chunk(WebApiAuth, payload1) + dataset_id, document_id, chunk_ids = add_chunks + res = update_chunk(WebApiAuth, dataset_id, document_id, chunk_ids[0], {"content": "chunk test 1"}) assert res["code"] == 0 - payload2 = {"doc_id": doc_id, "chunk_id": chunk_ids[0], "content_with_weight": "chunk test 2"} - res = update_chunk(WebApiAuth, payload2) + res = update_chunk(WebApiAuth, dataset_id, document_id, chunk_ids[0], {"content": "chunk test 2"}) assert res["code"] == 0 @pytest.mark.p3 @@ -215,17 +180,11 @@ class TestUpdateChunk: [ ({"unknown_key": "unknown_value"}, 0, ""), ({}, 0, ""), - pytest.param(None, 100, """TypeError("int() argument must be a string, a bytes-like object or a real number, not 'NoneType'")""", marks=pytest.mark.skip), ], ) def test_invalid_params(self, WebApiAuth, add_chunks, payload, expected_code, expected_message): - _, doc_id, chunk_ids = add_chunks - chunk_id = chunk_ids[0] - update_payload = {"doc_id": doc_id, "chunk_id": chunk_id, "content_with_weight": "unchanged content"} - if payload is not None: - update_payload.update(payload) - - res = update_chunk(WebApiAuth, update_payload) + dataset_id, document_id, chunk_ids = add_chunks + res = update_chunk(WebApiAuth, dataset_id, document_id, chunk_ids[0], payload) assert res["code"] == expected_code, res if expected_code != 0: assert res["message"] == expected_message, res @@ -234,14 +193,17 @@ class TestUpdateChunk: @pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="issues/6554") def 
test_concurrent_update_chunk(self, WebApiAuth, add_chunks): count = 50 - _, doc_id, chunk_ids = add_chunks + dataset_id, document_id, chunk_ids = add_chunks with ThreadPoolExecutor(max_workers=5) as executor: futures = [ executor.submit( update_chunk, WebApiAuth, - {"doc_id": doc_id, "chunk_id": chunk_ids[randint(0, 3)], "content_with_weight": f"update chunk test {i}"}, + dataset_id, + document_id, + chunk_ids[randint(0, 3)], + {"content": f"update chunk test {i}"}, ) for i in range(count) ] @@ -251,9 +213,8 @@ class TestUpdateChunk: @pytest.mark.p3 def test_update_chunk_to_deleted_document(self, WebApiAuth, add_chunks): - kb_id, doc_id, chunk_ids = add_chunks - delete_document(WebApiAuth, kb_id, {"ids": [doc_id]}) - payload = {"doc_id": doc_id, "chunk_id": chunk_ids[0], "content_with_weight": "test content"} - res = update_chunk(WebApiAuth, payload) + dataset_id, document_id, chunk_ids = add_chunks + delete_document(WebApiAuth, dataset_id, {"ids": [document_id]}) + res = update_chunk(WebApiAuth, dataset_id, document_id, chunk_ids[0], {"content": "test content"}) assert res["code"] == 102, res - assert res["message"] == "Tenant not found!", res + assert res["message"] in [f"You don't own the document {document_id}.", f"Can't find this chunk {chunk_ids[0]}"] diff --git a/test/testcases/test_web_api/test_common.py b/test/testcases/test_web_api/test_common.py index ab5ce042d..d81d3736e 100644 --- a/test/testcases/test_web_api/test_common.py +++ b/test/testcases/test_web_api/test_common.py @@ -28,7 +28,8 @@ HEADERS = {"Content-Type": "application/json"} KB_APP_URL = f"/{VERSION}/kb" DATASETS_URL = f"/api/{VERSION}/datasets" DOCUMENT_APP_URL = f"/{VERSION}/document" -CHUNK_API_URL = f"/{VERSION}/chunk" +CHUNK_APP_URL = f"/{VERSION}/chunk" +CHUNK_API_URL = f"/api/{VERSION}/datasets/{{dataset_id}}/documents/{{document_id}}/chunks" # SESSION_WITH_CHAT_ASSISTANT_API_URL = "/api/v1/chats/{chat_id}/sessions" # SESSION_WITH_AGENT_API_URL = "/api/v1/agents/{agent_id}/sessions" 
MEMORY_API_URL = f"/api/{VERSION}/memories" @@ -441,47 +442,53 @@ def bulk_upload_documents(auth, kb_id, num, tmp_path): return document_ids -# CHUNK APP -def add_chunk(auth, payload=None, *, headers=HEADERS, data=None): - res = requests.post(url=f"{HOST_ADDRESS}{CHUNK_API_URL}/create", headers=headers, auth=auth, json=payload, data=data) +# CHUNK MANAGEMENT +def add_chunk(auth, dataset_id, document_id, payload=None, *, headers=HEADERS, data=None): + url = f"{HOST_ADDRESS}{CHUNK_API_URL}".format(dataset_id=dataset_id, document_id=document_id) + res = requests.post(url=url, headers=headers, auth=auth, json=payload, data=data) return res.json() -def list_chunks(auth, payload=None, *, headers=HEADERS): - res = requests.post(url=f"{HOST_ADDRESS}{CHUNK_API_URL}/list", headers=headers, auth=auth, json=payload) +def list_chunks(auth, dataset_id, document_id, params=None, *, headers=HEADERS): + url = f"{HOST_ADDRESS}{CHUNK_API_URL}".format(dataset_id=dataset_id, document_id=document_id) + res = requests.get(url=url, headers=headers, auth=auth, params=params) return res.json() -def get_chunk(auth, params=None, *, headers=HEADERS): - res = requests.get(url=f"{HOST_ADDRESS}{CHUNK_API_URL}/get", headers=headers, auth=auth, params=params) +def get_chunk(auth, dataset_id, document_id, chunk_id, *, headers=HEADERS): + url = f"{HOST_ADDRESS}{CHUNK_API_URL}/{chunk_id}".format(dataset_id=dataset_id, document_id=document_id) + res = requests.get(url=url, headers=headers, auth=auth) return res.json() -def update_chunk(auth, payload=None, *, headers=HEADERS): - res = requests.post(url=f"{HOST_ADDRESS}{CHUNK_API_URL}/set", headers=headers, auth=auth, json=payload) +def update_chunk(auth, dataset_id, document_id, chunk_id, payload=None, *, headers=HEADERS): + url = f"{HOST_ADDRESS}{CHUNK_API_URL}/{chunk_id}".format(dataset_id=dataset_id, document_id=document_id) + res = requests.patch(url=url, headers=headers, auth=auth, json=payload) return res.json() -def switch_chunks(auth, 
payload=None, *, headers=HEADERS): - res = requests.post(url=f"{HOST_ADDRESS}{CHUNK_API_URL}/switch", headers=headers, auth=auth, json=payload) +def switch_chunks(auth, dataset_id, document_id, payload=None, *, headers=HEADERS): + url = f"{HOST_ADDRESS}{CHUNK_API_URL}".format(dataset_id=dataset_id, document_id=document_id) + res = requests.patch(url=url, headers=headers, auth=auth, json=payload) return res.json() -def delete_chunks(auth, payload=None, *, headers=HEADERS): - res = requests.post(url=f"{HOST_ADDRESS}{CHUNK_API_URL}/rm", headers=headers, auth=auth, json=payload) +def delete_chunks(auth, dataset_id, document_id, payload=None, *, headers=HEADERS): + url = f"{HOST_ADDRESS}{CHUNK_API_URL}".format(dataset_id=dataset_id, document_id=document_id) + res = requests.delete(url=url, headers=headers, auth=auth, json=payload) return res.json() def retrieval_chunks(auth, payload=None, *, headers=HEADERS): - res = requests.post(url=f"{HOST_ADDRESS}{CHUNK_API_URL}/retrieval_test", headers=headers, auth=auth, json=payload) + res = requests.post(url=f"{HOST_ADDRESS}{CHUNK_APP_URL}/retrieval_test", headers=headers, auth=auth, json=payload) return res.json() -def batch_add_chunks(auth, doc_id, num): +def batch_add_chunks(auth, dataset_id, document_id, num): chunk_ids = [] for i in range(num): - res = add_chunk(auth, {"doc_id": doc_id, "content_with_weight": f"chunk test {i}"}) - chunk_ids.append(res["data"]["chunk_id"]) + res = add_chunk(auth, dataset_id, document_id, {"content": f"chunk test {i}"}) + chunk_ids.append(res["data"]["chunk"]["id"]) return chunk_ids diff --git a/test/testcases/test_web_api/test_kb_app/test_kb_tags_meta.py b/test/testcases/test_web_api/test_kb_app/test_kb_tags_meta.py index 2fbe67f42..aed597e24 100644 --- a/test/testcases/test_web_api/test_kb_app/test_kb_tags_meta.py +++ b/test/testcases/test_web_api/test_kb_app/test_kb_tags_meta.py @@ -60,10 +60,11 @@ def _seed_tag(auth, kb_id, document_id, chunk_id): tag = f"tag_{uuid.uuid4().hex[:8]}" res = 
update_chunk( auth, + kb_id, + document_id, + chunk_id, { - "doc_id": document_id, - "chunk_id": chunk_id, - "content_with_weight": f"tag seed {tag}", + "content": f"tag seed {tag}", "tag_kwd": [tag], }, ) diff --git a/test/unit_test/common/test_delete_query_construction.py b/test/unit_test/common/test_delete_query_construction.py index eed2a5489..52e24cf80 100644 --- a/test/unit_test/common/test_delete_query_construction.py +++ b/test/unit_test/common/test_delete_query_construction.py @@ -212,18 +212,17 @@ class TestDeleteQueryConstruction: assert len(kw_terms) == 1 -class TestChunkAppDeleteCondition: +class TestChunkApiDeleteCondition: """ - Tests that verify the chunk_app.py rm endpoint passes the correct + Tests that verify the RESTful chunk DELETE endpoint passes the correct condition to docStoreConn.delete. """ def test_rm_endpoint_includes_doc_id_in_condition(self): """ - The /chunk/rm endpoint MUST include doc_id in the condition + The /api/v1/datasets//documents//chunks endpoint + MUST include doc_id in the condition passed to settings.docStoreConn.delete. - - This is the fix applied to api/apps/chunk_app.py """ # Simulate what the rm endpoint should construct req = { @@ -248,7 +247,7 @@ class TestChunkAppDeleteCondition: class TestSDKDocDeleteCondition: """ - Tests that verify the SDK doc.py rm_chunk endpoint constructs + Tests that verify the RESTful chunk delete endpoint constructs the correct deletion condition. 
""" @@ -261,7 +260,7 @@ class TestSDKDocDeleteCondition: document_id = "doc456" chunk_ids = ["chunk1", "chunk2"] - # The CORRECT condition construction (from sdk/doc.py): + # The CORRECT condition construction (from restful_apis/chunk_api.py): condition = {"doc_id": document_id} if chunk_ids: condition["id"] = chunk_ids diff --git a/web/src/hooks/route-hook.ts b/web/src/hooks/route-hook.ts index 1962e5385..12738bb76 100644 --- a/web/src/hooks/route-hook.ts +++ b/web/src/hooks/route-hook.ts @@ -2,6 +2,7 @@ import { KnowledgeRouteKey, KnowledgeSearchParams, } from '@/constants/knowledge'; +import { Routes } from '@/routes'; import { useCallback } from 'react'; import { useLocation, useNavigate, useSearchParams } from 'react-router'; @@ -27,13 +28,16 @@ export const useThirdPathName = () => { export const useGetKnowledgeSearchParams = () => { const [currentQueryParameters] = useSearchParams(); + const { pathname } = useLocation(); + const isDataflowResultPage = pathname === Routes.DataflowResult; return { type: currentQueryParameters.get(KnowledgeSearchParams.Type) || '', documentId: currentQueryParameters.get(KnowledgeSearchParams.DocumentId) || '', - knowledgeId: - currentQueryParameters.get(KnowledgeSearchParams.KnowledgeId) || '', + knowledgeId: isDataflowResultPage + ? 
currentQueryParameters.get('knowledgeId') || '' + : currentQueryParameters.get(KnowledgeSearchParams.KnowledgeId) || '', }; }; diff --git a/web/src/hooks/use-chunk-request.ts b/web/src/hooks/use-chunk-request.ts index d5024ef09..ed4050512 100644 --- a/web/src/hooks/use-chunk-request.ts +++ b/web/src/hooks/use-chunk-request.ts @@ -40,6 +40,7 @@ export const useSelectChunkList = () => { export const useDeleteChunk = () => { const queryClient = useQueryClient(); const { setPaginationParams } = useSetPaginationParams(); + const { knowledgeId } = useGetKnowledgeSearchParams(); const { data, isPending: loading, @@ -47,7 +48,10 @@ export const useDeleteChunk = () => { } = useMutation({ mutationKey: ['deleteChunk'], mutationFn: async (params: { chunkIds: string[]; doc_id: string }) => { - const { data } = await kbService.rmChunk(params); + const { data } = await kbService.rmChunk({ + ...params, + kb_id: knowledgeId, + }); if (data.code === 0) { setPaginationParams(1); queryClient.invalidateQueries({ queryKey: ['fetchChunkList'] }); @@ -62,6 +66,7 @@ export const useDeleteChunk = () => { export const useCreateChunk = () => { const { t } = useTranslation(); const queryClient = useQueryClient(); + const { knowledgeId } = useGetKnowledgeSearchParams(); const { data, @@ -74,7 +79,10 @@ export const useCreateChunk = () => { if (payload.chunk_id) { service = kbService.setChunk; } - const { data } = await service(payload); + const { data } = await service({ + ...payload, + kb_id: payload.kb_id || knowledgeId, + }); if (data.code === 0) { message.success(t('message.created')); setTimeout(() => { @@ -88,14 +96,20 @@ export const useCreateChunk = () => { return { data, loading, createChunk: mutateAsync }; }; -export const useFetchChunk = (chunkId?: string): ResponseType => { +export const useFetchChunk = ( + chunkId?: string, + documentId?: string, +): ResponseType => { + const { knowledgeId } = useGetKnowledgeSearchParams(); const { data } = useQuery({ - queryKey: ['fetchChunk'], - 
enabled: !!chunkId, + queryKey: ['fetchChunk', knowledgeId, documentId, chunkId], + enabled: !!chunkId && !!documentId && !!knowledgeId, initialData: {}, gcTime: 0, queryFn: async () => { const data = await kbService.getChunk({ + kb_id: knowledgeId, + doc_id: documentId, chunk_id: chunkId, }); @@ -115,7 +129,7 @@ export const useFetchNextChunkList = ( }> & IChunkListResult => { const { pagination, setPagination } = useGetPaginationWithRouter(); - const { documentId } = useGetKnowledgeSearchParams(); + const { documentId, knowledgeId } = useGetKnowledgeSearchParams(); const { searchString, handleInputChange } = useHandleSearchChange(); const [available, setAvailable] = useState(); const debouncedSearchString = useDebounce(searchString, { wait: 500 }); @@ -127,6 +141,7 @@ export const useFetchNextChunkList = ( } = useQuery({ queryKey: [ 'fetchChunkList', + knowledgeId, documentId, pagination.current, pagination.pageSize, @@ -136,9 +151,10 @@ export const useFetchNextChunkList = ( placeholderData: (previousData: any) => previousData ?? 
{ data: [], total: 0, documentInfo: {} }, // https://github.com/TanStack/query/issues/8183 gcTime: 0, - enabled, + enabled: enabled && !!knowledgeId && !!documentId, queryFn: async () => { const { data } = await kbService.chunkList({ + kb_id: knowledgeId, doc_id: documentId, page: pagination.current, size: pagination.pageSize, @@ -195,6 +211,7 @@ export const useFetchNextChunkList = ( export const useSwitchChunk = () => { const { t } = useTranslation(); + const { knowledgeId } = useGetKnowledgeSearchParams(); const { data, isPending: loading, @@ -206,7 +223,10 @@ export const useSwitchChunk = () => { available_int?: number; doc_id: string; }) => { - const { data } = await kbService.switchChunk(params); + const { data } = await kbService.switchChunk({ + ...params, + kb_id: knowledgeId, + }); if (data.code === 0) { message.success(t('message.modified')); } diff --git a/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/chunk-creating-modal/index.tsx b/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/chunk-creating-modal/index.tsx index 5b3d65e67..5a36d76b5 100644 --- a/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/chunk-creating-modal/index.tsx +++ b/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/chunk-creating-modal/index.tsx @@ -26,7 +26,6 @@ import type { ChunkDocType } from '@/interfaces/database/knowledge'; import React, { useCallback, useEffect, useState } from 'react'; import { FieldValues, FormProvider, useForm } from 'react-hook-form'; import { useTranslation } from 'react-i18next'; -import { useDeleteChunkByIds } from '../../hooks'; import { transformTagFeaturesArrayToObject, transformTagFeaturesObjectToArray, @@ -75,8 +74,7 @@ const ChunkCreatingModal: React.FC & kFProps> = ({ }, }); const [checked, setChecked] = useState(false); - const { removeChunk } = useDeleteChunkByIds(); - const { data } = 
useFetchChunk(chunkId); + const { data } = useFetchChunk(chunkId, doc_id); const { t } = useTranslation(); const isEditMode = !!chunkId; @@ -99,12 +97,6 @@ const ChunkCreatingModal: React.FC & kFProps> = ({ const handleOk = form.handleSubmit(onSubmit); - const handleRemove = useCallback(() => { - if (chunkId) { - return removeChunk([chunkId], doc_id); - } - }, [chunkId, doc_id, removeChunk]); - const handleCheck = useCallback(() => { setChecked(!checked); }, [checked]); diff --git a/web/src/pages/dataflow-result/components/chunk-creating-modal/index.tsx b/web/src/pages/dataflow-result/components/chunk-creating-modal/index.tsx index 7cad7eec1..e415c2b97 100644 --- a/web/src/pages/dataflow-result/components/chunk-creating-modal/index.tsx +++ b/web/src/pages/dataflow-result/components/chunk-creating-modal/index.tsx @@ -57,7 +57,7 @@ const ChunkCreatingModal: React.FC & kFProps> = ({ }); const [checked, setChecked] = useState(false); const { removeChunk } = useDeleteChunkByIds(); - const { data } = useFetchChunk(chunkId); + const { data } = useFetchChunk(chunkId, doc_id); const { t } = useTranslation(); const isTagParser = parserId === 'tag'; diff --git a/web/src/pages/dataset/dataset-overview/overview-table.tsx b/web/src/pages/dataset/dataset-overview/overview-table.tsx index de92a53ef..0ddf676ed 100644 --- a/web/src/pages/dataset/dataset-overview/overview-table.tsx +++ b/web/src/pages/dataset/dataset-overview/overview-table.tsx @@ -51,7 +51,7 @@ import { DocumentLog, FileLogsTableProps, IFileLogItem } from './interface'; export const getFileLogsTableColumns = ( t: TFunction<'translation', string>, showLog: (row: Row, active: LogTabs) => void, - kowledgeId: string, + knowledgeId: string, navigateToDataflowResult: ( props: NavigateToDataflowResultProps, ) => () => void, @@ -210,7 +210,8 @@ export const getFileLogsTableColumns = ( size="icon-sm" onClick={navigateToDataflowResult({ id: row.original.id, - [PipelineResultSearchParams.KnowledgeId]: kowledgeId, + 
[PipelineResultSearchParams.KnowledgeId]: + row.original.kb_id || knowledgeId, [PipelineResultSearchParams.DocumentId]: row.original.document_id, [PipelineResultSearchParams.IsReadOnly]: 'false', @@ -358,7 +359,7 @@ const FileLogsTable: FC = ({ const [isModalVisible, setIsModalVisible] = useState(false); const { navigateToDataflowResult } = useNavigatePage(); const [logInfo, setLogInfo] = useState(); - const kowledgeId = useParams().id; + const knowledgeId = useParams().id; const showLog = (row: Row) => { const logDetail = { taskId: row.original?.dsl?.task_id, @@ -382,7 +383,7 @@ const FileLogsTable: FC = ({ ? getFileLogsTableColumns( t, showLog, - kowledgeId || '', + knowledgeId || '', navigateToDataflowResult, dataSourceInfo, ) diff --git a/web/src/services/knowledge-service.ts b/web/src/services/knowledge-service.ts index 9d64e43e8..f1df2e0b2 100644 --- a/web/src/services/knowledge-service.ts +++ b/web/src/services/knowledge-service.ts @@ -21,12 +21,6 @@ const { documentCreate, documentChangeParser, documentThumbnails, - chunkList, - createChunk, - setChunk, - getChunk, - switchChunk, - rmChunk, retrievalTest, documentRun, documentUpload, @@ -97,31 +91,6 @@ const methods = { url: setMeta, method: 'post', }, - // chunk管理 - chunkList: { - url: chunkList, - method: 'post', - }, - createChunk: { - url: createChunk, - method: 'post', - }, - setChunk: { - url: setChunk, - method: 'post', - }, - getChunk: { - url: getChunk, - method: 'get', - }, - switchChunk: { - url: switchChunk, - method: 'post', - }, - rmChunk: { - url: rmChunk, - method: 'post', - }, retrievalTest: { url: retrievalTest, method: 'post', @@ -178,7 +147,139 @@ const methods = { }, }; -const kbService = registerServer(methods, request); +const baseKbService = registerServer(methods, request); + +const getDatasetId = (params: Record) => + params.dataset_id || params.kb_id || params.knowledge_id; + +const getDocumentId = (params: Record) => + params.document_id || params.doc_id; + +const 
mapChunkToLegacy = (chunk: Record) => ({ + ...chunk, + chunk_id: chunk.chunk_id || chunk.id, + content_with_weight: chunk.content_with_weight || chunk.content, + doc_id: chunk.doc_id || chunk.document_id, + doc_name: chunk.doc_name || chunk.docnm_kwd, + image_id: chunk.image_id || chunk.img_id, + important_kwd: chunk.important_kwd || chunk.important_keywords || [], + question_kwd: chunk.question_kwd || chunk.questions || [], + available_int: chunk.available_int ?? (chunk.available === false ? 0 : 1), + positions: chunk.positions || chunk.position_int || [], +}); + +const mapDocumentToLegacy = (doc: Record) => ({ + ...doc, + chunk_num: doc.chunk_num ?? doc.chunk_count, + kb_id: doc.kb_id || doc.dataset_id, +}); + +const mapChunkPayloadToRest = (payload: Record) => ({ + content: payload.content ?? payload.content_with_weight, + important_keywords: payload.important_keywords ?? payload.important_kwd, + questions: payload.questions ?? payload.question_kwd, + tag_kwd: payload.tag_kwd, + tag_feas: payload.tag_feas, + positions: payload.positions, + available: + payload.available ?? + (payload.available_int === undefined + ? undefined + : payload.available_int === 1), + image_base64: payload.image_base64, +}); + +const getAvailableParam = (available?: number) => { + if (available === undefined) { + return undefined; + } + return available === 1 ? 
'true' : 'false'; +}; + +const chunkService = { + chunkList: async (params: Record) => { + const datasetId = getDatasetId(params); + const documentId = getDocumentId(params); + const response = await request.get(api.chunkList(datasetId, documentId), { + params: { + page: params.page, + page_size: params.page_size || params.size, + keywords: params.keywords, + available: getAvailableParam(params.available_int), + }, + }); + + if (response.data?.code === 0) { + response.data.data = { + ...response.data.data, + chunks: (response.data.data?.chunks || []).map(mapChunkToLegacy), + doc: mapDocumentToLegacy(response.data.data?.doc || {}), + }; + } + + return response; + }, + createChunk: async (payload: Record) => { + const datasetId = getDatasetId(payload); + const documentId = getDocumentId(payload); + const response = await request.post(api.chunkList(datasetId, documentId), { + data: mapChunkPayloadToRest(payload), + }); + + if (response.data?.code === 0 && response.data.data?.chunk) { + response.data.data.chunk = mapChunkToLegacy(response.data.data.chunk); + } + + return response; + }, + setChunk: (payload: Record) => { + const datasetId = getDatasetId(payload); + const documentId = getDocumentId(payload); + const chunkId = payload.chunk_id || payload.id; + return request.patch(api.chunkDetail(datasetId, documentId, chunkId), { + data: mapChunkPayloadToRest(payload), + }); + }, + getChunk: async (params: Record) => { + const datasetId = getDatasetId(params); + const documentId = getDocumentId(params); + const chunkId = params.chunk_id || params.id; + const response = await request.get( + api.chunkDetail(datasetId, documentId, chunkId), + ); + + if (response.data?.code === 0) { + response.data.data = mapChunkToLegacy(response.data.data || {}); + } + + return response; + }, + switchChunk: (params: Record) => { + const datasetId = getDatasetId(params); + const documentId = getDocumentId(params); + return request.patch(api.chunkList(datasetId, documentId), { + data: { + 
chunk_ids: params.chunk_ids || params.chunkIds, + available_int: params.available_int, + }, + }); + }, + rmChunk: (params: Record) => { + const datasetId = getDatasetId(params); + const documentId = getDocumentId(params); + return request.delete(api.chunkList(datasetId, documentId), { + data: { + chunk_ids: params.chunk_ids || params.chunkIds, + delete_all: params.delete_all, + }, + }); + }, +}; + +const kbService = { + ...baseKbService, + ...chunkService, +}; export const listTag = (knowledgeId: string) => request.get(api.listTag(knowledgeId)); diff --git a/web/src/utils/api.ts b/web/src/utils/api.ts index d89712cdf..462384f2f 100644 --- a/web/src/utils/api.ts +++ b/web/src/utils/api.ts @@ -99,12 +99,10 @@ export default { renameTag: (knowledgeId: string) => `${webAPI}/kb/${knowledgeId}/rename_tag`, // chunk - chunkList: `${webAPI}/chunk/list`, - createChunk: `${webAPI}/chunk/create`, - setChunk: `${webAPI}/chunk/set`, - getChunk: `${webAPI}/chunk/get`, - switchChunk: `${webAPI}/chunk/switch`, - rmChunk: `${webAPI}/chunk/rm`, + chunkList: (datasetId: string, documentId: string) => + `${restAPIv1}/datasets/${datasetId}/documents/${documentId}/chunks`, + chunkDetail: (datasetId: string, documentId: string, chunkId: string) => + `${restAPIv1}/datasets/${datasetId}/documents/${documentId}/chunks/${chunkId}`, retrievalTest: `${webAPI}/chunk/retrieval_test`, knowledgeGraph: `${webAPI}/chunk/knowledge_graph`,