diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py index d52d1e733..5e06d872a 100644 --- a/api/apps/chunk_app.py +++ b/api/apps/chunk_app.py @@ -155,6 +155,10 @@ async def set(): d["question_kwd"] = req["question_kwd"] d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["question_kwd"])) if "tag_kwd" in req: + if not isinstance(req["tag_kwd"], list): + return get_data_error_result(message="`tag_kwd` should be a list") + if not all(isinstance(t, str) for t in req["tag_kwd"]): + return get_data_error_result(message="`tag_kwd` must be a list of strings") d["tag_kwd"] = req["tag_kwd"] if "tag_feas" in req: d["tag_feas"] = req["tag_feas"] @@ -317,6 +321,12 @@ async def create(): d["question_tks"] = rag_tokenizer.tokenize("\n".join(d["question_kwd"])) d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19] d["create_timestamp_flt"] = datetime.datetime.now().timestamp() + if "tag_kwd" in req: + if not isinstance(req["tag_kwd"], list): + return get_data_error_result(message="`tag_kwd` is required to be a list") + if not all(isinstance(t, str) for t in req["tag_kwd"]): + return get_data_error_result(message="`tag_kwd` must be a list of strings") + d["tag_kwd"] = req["tag_kwd"] if "tag_feas" in req: d["tag_feas"] = req["tag_feas"] image_base64 = req.get("image_base64", None) diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py index 2da815774..77f89de23 100644 --- a/api/apps/sdk/doc.py +++ b/api/apps/sdk/doc.py @@ -58,6 +58,7 @@ class Chunk(BaseModel): document_id: str = "" docnm_kwd: str = "" important_keywords: list = Field(default_factory=list) + tag_kwd: list = Field(default_factory=list) questions: list = Field(default_factory=list) question_tks: str = "" image_id: str = "" @@ -1048,6 +1049,11 @@ async def list_chunks(tenant_id, dataset_id, document_id): items: type: string description: Important keywords. + tag_kwd: + type: array + items: + type: string + description: Tag keywords. image_id: type: string description: Image ID associated with the chunk. @@ -1137,6 +1143,7 @@ async def list_chunks(tenant_id, dataset_id, document_id): "document_id": sres.field[id]["doc_id"], "docnm_kwd": sres.field[id]["docnm_kwd"], "important_keywords": sres.field[id].get("important_kwd", []), + "tag_kwd": sres.field[id].get("tag_kwd", []), "questions": sres.field[id].get("question_kwd", []), "dataset_id": sres.field[id].get("kb_id", sres.field[id].get("dataset_id")), "image_id": sres.field[id].get("img_id", ""), @@ -1251,6 +1258,10 @@ async def add_chunk(tenant_id, dataset_id, document_id): d["docnm_kwd"] = doc.name d["doc_id"] = document_id if "tag_kwd" in req: + if not isinstance(req["tag_kwd"], list): + return get_error_data_result("`tag_kwd` is required to be a list") + if not all(isinstance(t, str) for t in req["tag_kwd"]): + return get_error_data_result("`tag_kwd` must be a list of strings") d["tag_kwd"] = req["tag_kwd"] if "tag_feas" in req: d["tag_feas"] = req["tag_feas"] @@ -1283,6 +1294,7 @@ async def add_chunk(tenant_id, dataset_id, document_id): "content_with_weight": "content", "doc_id": "document_id", "important_kwd": "important_keywords", + "tag_kwd": "tag_kwd", "question_kwd": "questions", "kb_id": "dataset_id", "create_timestamp_flt": "create_timestamp", @@ -1432,6 +1444,11 @@ async def update_chunk(tenant_id, dataset_id, document_id, chunk_id): items: type: string description: Updated important keywords. + tag_kwd: + type: array + items: + type: string + description: Updated tag keywords. available: type: boolean description: Availability status of the chunk. @@ -1480,6 +1497,10 @@ async def update_chunk(tenant_id, dataset_id, document_id, chunk_id): return get_error_data_result("`positions` should be a list") d["position_int"] = req["positions"] if "tag_kwd" in req: + if not isinstance(req["tag_kwd"], list): + return get_error_data_result("`tag_kwd` should be a list") + if not all(isinstance(t, str) for t in req["tag_kwd"]): + return get_error_data_result("`tag_kwd` must be a list of strings") d["tag_kwd"] = req["tag_kwd"] if "tag_feas" in req: d["tag_feas"] = req["tag_feas"] diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md index 1573a3cda..df237c255 100644 --- a/docs/references/http_api_reference.md +++ b/docs/references/http_api_reference.md @@ -2005,6 +2005,7 @@ Adds a chunk to a specified document in a specified dataset. - Body: - `"content"`: `string` - `"important_keywords"`: `list[string]` + - `"tag_kwd"`: `list[string]` - `"image_base64"`: `string` ##### Request example @@ -2031,6 +2032,8 @@ curl --request POST \ The text content of the chunk. - `"important_keywords`(*Body parameter*), `list[string]` The key terms or phrases to tag with the chunk. +- `"tag_kwd"`: (*Body parameter*), `list[string]` + Tag keywords to associate with the chunk. - `"questions"`(*Body parameter*), `list[string]` If there is a given question, the embedded chunks will be based on them - `"image_base64"`: (*Body parameter*), `string` @@ -2053,6 +2056,7 @@ Success: "id": "12ccdc56e59837e5", "image_id": "", "important_keywords": [], + "tag_kwd": [], "questions": [] } } @@ -2123,6 +2127,7 @@ Success: "id": "b48c170e90f70af998485c1065490726", "image_id": "", "important_keywords": "", + "tag_kwd": [], "positions": [ "" ] @@ -2267,6 +2272,7 @@ Updates content or configurations for a specified chunk. - Body: - `"content"`: `string` - `"important_keywords"`: `list[string]` + - `"tag_kwd"`: `list[string]` - `"available"`: `boolean` ##### Request example @@ -2295,6 +2301,8 @@ curl --request PUT \ The text content of the chunk. - `"important_keywords"`: (*Body parameter*), `list[string]` A list of key terms or phrases to tag with the chunk. +- `"tag_kwd"`: (*Body parameter*), `list[string]` + Updated tag keywords. - `"available"`: (*Body parameter*) `boolean` The chunk's availability status in the dataset. Value options: - `true`: Available (default) @@ -2696,6 +2704,7 @@ Success: "important_keywords": [ "" ], + "tag_kwd": [], "kb_id": "c7ee74067a2c11efb21c0242ac120006", "positions": [ "" diff --git a/docs/references/python_api_reference.md b/docs/references/python_api_reference.md index cd24cf252..a03c7c7c4 100644 --- a/docs/references/python_api_reference.md +++ b/docs/references/python_api_reference.md @@ -855,7 +855,7 @@ print("Async bulk parsing cancelled.") ### Add chunk ```python -Document.add_chunk(content:str, important_keywords:list[str] = [], image_base64:str = None) -> Chunk +Document.add_chunk(content:str, important_keywords:list[str] = [], image_base64:str = None, *, tag_kwd:list[str] = []) -> Chunk ``` Adds a chunk to the current document. @@ -874,6 +874,10 @@ The key terms or phrases to tag with the chunk. A base64-encoded image to associate with the chunk. If the chunk already has an image, the new image will be vertically concatenated below the existing one. +##### tag_kwd: `list[str]` + +Tag keywords to associate with the chunk. + #### Returns - Success: A `Chunk` object. @@ -884,6 +888,7 @@ A `Chunk` object contains the following attributes: - `id`: `str`: The chunk ID. - `content`: `str` The text content of the chunk. - `important_keywords`: `list[str]` A list of key terms or phrases tagged with the chunk. +- `tag_kwd`: `list[str]` A list of tag keywords associated with the chunk. - `image_id`: `str` The image ID associated with the chunk (empty string if no image). - `create_time`: `str` The time when the chunk was created (added to the document). - `create_timestamp`: `float` The timestamp representing the creation time of the chunk, expressed in seconds since January 1, 1970. @@ -1024,6 +1029,7 @@ A dictionary representing the attributes to update, with the following keys: - `"content"`: `str` The text content of the chunk. - `"important_keywords"`: `list[str]` A list of key terms or phrases to tag with the chunk. +- `"tag_kwd"`: `list[str]` A list of tag keywords to associate with the chunk. - `"available"`: `bool` The chunk's availability status in the dataset. Value options: - `False`: Unavailable - `True`: Available (default) diff --git a/internal/service/chunk.go b/internal/service/chunk.go index 1221eeca5..86c87ca33 100644 --- a/internal/service/chunk.go +++ b/internal/service/chunk.go @@ -532,6 +532,9 @@ func buildRetrievalTestResults(filteredChunks []map[string]interface{}) []map[st } else if v, ok := chunk["important_keywords"]; ok { result["important_kwd"] = v } + if v, ok := chunk["tag_kwd"]; ok { + result["tag_kwd"] = v + } if v, ok := chunk["similarity"]; ok { result["similarity"] = v } diff --git a/rag/nlp/search.py b/rag/nlp/search.py index 3cf70b6d9..d45c87d4f 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -477,6 +477,7 @@ class Dealer: "docnm_kwd": dnm, "kb_id": chunk["kb_id"], "important_kwd": chunk.get("important_kwd", []), + "tag_kwd": chunk.get("tag_kwd", []), "image_id": chunk.get("img_id", ""), "similarity": float(sim_np[i]), "vector_similarity": float(vsim[i]), diff --git a/sdk/python/ragflow_sdk/modules/chunk.py b/sdk/python/ragflow_sdk/modules/chunk.py index 609cb2745..6ea9c1a8e 100644 --- a/sdk/python/ragflow_sdk/modules/chunk.py +++ b/sdk/python/ragflow_sdk/modules/chunk.py @@ -28,6 +28,7 @@ class Chunk(Base): self.id = "" self.content = "" self.important_keywords = [] + self.tag_kwd = [] self.questions = [] self.create_time = "" self.create_timestamp = 0.0 diff --git a/sdk/python/ragflow_sdk/modules/document.py b/sdk/python/ragflow_sdk/modules/document.py index e410fa9cb..4aa3dce07 100644 --- a/sdk/python/ragflow_sdk/modules/document.py +++ b/sdk/python/ragflow_sdk/modules/document.py @@ -87,8 +87,8 @@ class Document(Base): return chunks raise Exception(res.get("message")) - def add_chunk(self, content: str, important_keywords: list[str] = [], questions: list[str] = [], image_base64: str | None = None): - body = {"content": content, "important_keywords": important_keywords, "questions": questions} + def add_chunk(self, content: str, important_keywords: list[str] = [], questions: list[str] = [], image_base64: str | None = None, *, tag_kwd: list[str] = []): + body = {"content": content, "important_keywords": important_keywords, "tag_kwd": tag_kwd, "questions": questions} if image_base64 is not None: body["image_base64"] = image_base64 res = self.post(f"/datasets/{self.dataset_id}/documents/{self.id}/chunks", body) diff --git a/test/testcases/conftest.py b/test/testcases/conftest.py index 158c11f50..16efb00b6 100644 --- a/test/testcases/conftest.py +++ b/test/testcases/conftest.py @@ -165,7 +165,7 @@ def token(auth): response = requests.post(url=url, headers=auth) res = response.json() if res.get("code") != 0: - error_msg = f"access: {url}, POST method, error code: {res.get("code")}, message: {res.get('message')}" + error_msg = f"access: {url}, POST method, error code: {res.get('code')}, message: {res.get('message')}" raise Exception(error_msg) return res["data"].get("token") diff --git a/test/testcases/test_http_api/test_chunk_management_within_dataset/test_add_chunk.py b/test/testcases/test_http_api/test_chunk_management_within_dataset/test_add_chunk.py index c08d44b2a..34ef23920 100644 --- a/test/testcases/test_http_api/test_chunk_management_within_dataset/test_add_chunk.py +++ b/test/testcases/test_http_api/test_chunk_management_within_dataset/test_add_chunk.py @@ -30,6 +30,8 @@ def validate_chunk_details(dataset_id, document_id, payload, res): assert chunk["important_keywords"] == payload["important_keywords"] if "questions" in payload: assert chunk["questions"] == [str(q).strip() for q in payload.get("questions", []) if str(q).strip()] + if "tag_kwd" in payload: + assert chunk["tag_kwd"] == payload["tag_kwd"] @pytest.mark.p1 @@ -76,7 +78,7 @@ class TestAddChunk: assert False, res chunks_count = res["data"]["doc"]["chunk_count"] res = add_chunk(HttpApiAuth, dataset_id, document_id, payload) - assert res["code"] == expected_code + assert res["code"] == expected_code, res if expected_code == 0: validate_chunk_details(dataset_id, document_id, payload, res) res = list_chunks(HttpApiAuth, dataset_id, document_id) @@ -109,7 +111,9 @@ class TestAddChunk: assert False, res chunks_count = res["data"]["doc"]["chunk_count"] res = add_chunk(HttpApiAuth, dataset_id, document_id, payload) - assert res["code"] == expected_code + assert res["code"] == expected_code, ( + f"Expected code: {expected_code}, got: {res['code']}, message: {res.get('message')}" + ) if expected_code == 0: validate_chunk_details(dataset_id, document_id, payload, res) res = list_chunks(HttpApiAuth, dataset_id, document_id) @@ -138,6 +142,35 @@ class TestAddChunk: assert False, res chunks_count = res["data"]["doc"]["chunk_count"] res = add_chunk(HttpApiAuth, dataset_id, document_id, payload) + assert res["code"] == expected_code, res + if expected_code == 0: + validate_chunk_details(dataset_id, document_id, payload, res) + res = list_chunks(HttpApiAuth, dataset_id, document_id) + assert res["data"]["doc"]["chunk_count"] == chunks_count + 1 + else: + assert res["message"] == expected_message + + @pytest.mark.p2 + @pytest.mark.parametrize( + "payload, expected_code, expected_message", + [ + ({"content": "chunk test", "tag_kwd": ["tag1", "tag2"]}, 0, ""), + ({"content": "chunk test", "tag_kwd": [""]}, 0, ""), + ({"content": "chunk test", "tag_kwd": [1]}, 102, "`tag_kwd` must be a list of strings"), + ({"content": "chunk test", "tag_kwd": ["tag", "tag"]}, 0, ""), + ({"content": "chunk test", "tag_kwd": "abc"}, 102, "`tag_kwd` is required to be a list"), + ({"content": "chunk test", "tag_kwd": 123}, 102, "`tag_kwd` is required to be a list"), + ], + ) + def test_tag_kwd(self, HttpApiAuth, add_document, payload, expected_code, expected_message): + dataset_id, document_id = add_document + res = list_chunks(HttpApiAuth, dataset_id, document_id) + if res["code"] != 0: + assert False, res + chunks_count = res["data"]["doc"]["chunk_count"] + res = add_chunk(HttpApiAuth, dataset_id, document_id, payload) + if res["code"] != expected_code: + print(f"\nFAILED! Expected code: {expected_code}, got: {res['code']}, message: {res.get('message')}") assert res["code"] == expected_code if expected_code == 0: validate_chunk_details(dataset_id, document_id, payload, res) diff --git a/test/testcases/test_http_api/test_chunk_management_within_dataset/test_update_chunk.py b/test/testcases/test_http_api/test_chunk_management_within_dataset/test_update_chunk.py index 76d73b4bd..96f70a7bc 100644 --- a/test/testcases/test_http_api/test_chunk_management_within_dataset/test_update_chunk.py +++ b/test/testcases/test_http_api/test_chunk_management_within_dataset/test_update_chunk.py @@ -115,6 +115,25 @@ class TestUpdatedChunk: if expected_code != 0: assert res["message"] == expected_message + @pytest.mark.p2 + @pytest.mark.parametrize( + "payload, expected_code, expected_message", + [ + ({"tag_kwd": ["tag1", "tag2"]}, 0, ""), + ({"tag_kwd": [""]}, 0, ""), + ({"tag_kwd": [1]}, 102, "`tag_kwd` must be a list of strings"), + ({"tag_kwd": ["tag", "tag"]}, 0, ""), + ({"tag_kwd": "tag"}, 102, "`tag_kwd` should be a list"), + ({"tag_kwd": 123}, 102, "`tag_kwd` should be a list"), + ], + ) + def test_tag_kwd(self, HttpApiAuth, add_chunks, payload, expected_code, expected_message): + dataset_id, document_id, chunk_ids = add_chunks + res = update_chunk(HttpApiAuth, dataset_id, document_id, chunk_ids[0], payload) + assert res["code"] == expected_code + if expected_code != 0: + assert res["message"] == expected_message + @pytest.mark.p2 @pytest.mark.parametrize( "payload, expected_code, expected_message", diff --git a/test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_add_chunk.py b/test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_add_chunk.py index fb6d17ed2..838cf6f36 100644 --- a/test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_add_chunk.py +++ b/test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_add_chunk.py @@ -28,6 +28,8 @@ def validate_chunk_details(dataset_id: str, document_id: str, payload: dict, chu assert chunk.important_keywords == payload["important_keywords"] if "questions" in payload: assert chunk.questions == [str(q).strip() for q in payload.get("questions", []) if str(q).strip()] + if "tag_kwd" in payload: + assert chunk.tag_kwd == payload["tag_kwd"] class TestAddChunk: @@ -115,6 +117,34 @@ class TestAddChunk: chunks = document.list_chunks() assert len(chunks) == chunks_count + 1, str(chunks) + @pytest.mark.p2 + @pytest.mark.parametrize( + "payload, expected_message", + [ + ({"content": "chunk test test_tag_kwd 1", "tag_kwd": ["tag1", "tag2"]}, ""), + ({"content": "chunk test test_tag_kwd 2", "tag_kwd": [""]}, ""), + ({"content": "chunk test test_tag_kwd 3", "tag_kwd": [1]}, "not instance of"), + ({"content": "chunk test test_tag_kwd 4", "tag_kwd": ["tag", "tag"]}, ""), + ({"content": "chunk test test_tag_kwd 5", "tag_kwd": "abc"}, "not instance of"), + ({"content": "chunk test test_tag_kwd 6", "tag_kwd": 123}, "not instance of"), + ], + ) + def test_tag_kwd(self, add_document, payload, expected_message): + dataset, document = add_document + chunks_count = len(document.list_chunks()) + + if expected_message: + with pytest.raises(Exception) as exception_info: + document.add_chunk(**payload) + assert expected_message in str(exception_info.value), str(exception_info.value) + else: + chunk = document.add_chunk(**payload) + validate_chunk_details(dataset.id, document.id, payload, chunk) + + sleep(1) + chunks = document.list_chunks() + assert len(chunks) == chunks_count + 1, str(chunks) + @pytest.mark.p3 def test_repeated_add_chunk(self, add_document): payload = {"content": "chunk test repeated_add_chunk"} diff --git a/test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_update_chunk.py b/test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_update_chunk.py index 93cc3eff7..fda87745c 100644 --- a/test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_update_chunk.py +++ b/test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_update_chunk.py @@ -102,6 +102,29 @@ class TestUpdatedChunk: else: chunk.update(payload) + @pytest.mark.p2 + @pytest.mark.parametrize( + "payload, expected_message", + [ + ({"tag_kwd": ["tag1", "tag2"]}, ""), + ({"tag_kwd": [""]}, ""), + ({"tag_kwd": [1]}, "`tag_kwd` must be a list of strings"), + ({"tag_kwd": ["tag", "tag"]}, ""), + ({"tag_kwd": "tag"}, "`tag_kwd` should be a list"), + ({"tag_kwd": 123}, "`tag_kwd` should be a list"), + ], + ) + def test_tag_kwd(self, add_chunks, payload, expected_message): + _, _, chunks = add_chunks + chunk = chunks[0] + + if expected_message: + with pytest.raises(Exception) as exception_info: + chunk.update(payload) + assert expected_message in str(exception_info.value), str(exception_info.value) + else: + chunk.update(payload) + @pytest.mark.p2 @pytest.mark.parametrize( "payload, expected_message",