diff --git a/api/apps/kb_app.py b/api/apps/kb_app.py
index 5ffc3040e..a35345feb 100644
--- a/api/apps/kb_app.py
+++ b/api/apps/kb_app.py
@@ -82,6 +82,20 @@ async def update():
         return get_data_error_result(
             message=f"Dataset name length is {len(req['name'])} which is large than {DATASET_NAME_LIMIT}")
     req["name"] = req["name"].strip()
+    if settings.DOC_ENGINE_INFINITY:
+        parser_id = req.get("parser_id")
+        if isinstance(parser_id, str) and parser_id.lower() == "tag":
+            return get_json_result(
+                code=RetCode.OPERATING_ERROR,
+                message="The chunking method Tag has not been supported by Infinity yet.",
+                data=False,
+            )
+        if "pagerank" in req:
+            return get_json_result(
+                code=RetCode.DATA_ERROR,
+                message="'pagerank' can only be set when doc_engine is elasticsearch",
+                data=False,
+            )
 
     if not KnowledgebaseService.accessible4deletion(req["kb_id"], current_user.id):
         return get_json_result(
diff --git a/common/doc_store/infinity_conn_base.py b/common/doc_store/infinity_conn_base.py
index 82650f81d..c8679c31c 100644
--- a/common/doc_store/infinity_conn_base.py
+++ b/common/doc_store/infinity_conn_base.py
@@ -367,7 +367,10 @@ class InfinityConnectionBase(DocStoreConnection):
         num_rows = len(res)
         column_id = res["id"]
         if field_name not in res:
-            return {}
+            if field_name == "content_with_weight" and "content" in res:
+                field_name = "content"
+            else:
+                return {}
         for i in range(num_rows):
             id = column_id[i]
             txt = res[field_name][i]
diff --git a/conf/infinity_mapping.json b/conf/infinity_mapping.json
index de2dd3a17..94909f8ff 100644
--- a/conf/infinity_mapping.json
+++ b/conf/infinity_mapping.json
@@ -9,6 +9,7 @@
     "docnm": {"type": "varchar", "default": "", "analyzer": ["rag-coarse", "rag-fine"], "comment": "docnm_kwd, title_tks, title_sm_tks"},
     "name_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
     "tag_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
+    "important_kwd_empty_count": {"type": "integer", "default": 0},
     "important_keywords": {"type": "varchar", "default": "", "analyzer": ["rag-coarse", "rag-fine"], "comment": "important_kwd, important_tks"},
     "questions": {"type": "varchar", "default": "", "analyzer": ["rag-coarse", "rag-fine"], "comment": "question_kwd, question_tks"},
     "content": {"type": "varchar", "default": "", "analyzer": ["rag-coarse", "rag-fine"], "comment": "content_with_weight, content_ltks, content_sm_ltks"},
diff --git a/rag/utils/infinity_conn.py b/rag/utils/infinity_conn.py
index ac5129735..f65ae3eaf 100644
--- a/rag/utils/infinity_conn.py
+++ b/rag/utils/infinity_conn.py
@@ -42,6 +42,7 @@ class InfinityConnection(InfinityConnectionBase):
         return False
 
     def convert_select_fields(self, output_fields: list[str]) -> list[str]:
+        need_empty_count = "important_kwd" in output_fields
        for i, field in enumerate(output_fields):
            if field in ["docnm_kwd", "title_tks", "title_sm_tks"]:
                output_fields[i] = "docnm"
@@ -53,6 +54,8 @@
                output_fields[i] = "content"
            elif field in ["authors_tks", "authors_sm_tks"]:
                output_fields[i] = "authors"
+        if need_empty_count and "important_kwd_empty_count" not in output_fields:
+            output_fields.append("important_kwd_empty_count")
        return list(set(output_fields))
 
    @staticmethod
@@ -340,7 +343,13 @@
            if not d.get("docnm_kwd"):
                d["docnm"] = self.list2str(v)
        elif k == "important_kwd":
-            d["important_keywords"] = self.list2str(v, ",")
+            if isinstance(v, list):
+                empty_count = sum(1 for kw in v if kw == "")
+                tokens = [kw for kw in v if kw != ""]
+                d["important_keywords"] = self.list2str(tokens, ",")
+                d["important_kwd_empty_count"] = empty_count
+            else:
+                d["important_keywords"] = self.list2str(v, ",")
        elif k == "important_tks":
            if not d.get("important_kwd"):
                d["important_keywords"] = v
@@ -429,7 +438,13 @@
            if not new_value.get("docnm_kwd"):
                new_value["docnm"] = v
        elif k == "important_kwd":
-            new_value["important_keywords"] = self.list2str(v, ",")
+            if isinstance(v, list):
+                empty_count = sum(1 for kw in v if kw == "")
+                tokens = [kw for kw in v if kw != ""]
+                new_value["important_keywords"] = self.list2str(tokens, ",")
+                new_value["important_kwd_empty_count"] = empty_count
+            else:
+                new_value["important_keywords"] = self.list2str(v, ",")
        elif k == "important_tks":
            if not new_value.get("important_kwd"):
                new_value["important_keywords"] = v
@@ -532,7 +547,15 @@
                res[field] = res["docnm"]
        if "important_keywords" in res.columns:
            if "important_kwd" in fields_all:
-                res["important_kwd"] = res["important_keywords"].apply(lambda v: v.split(",") if v else [])
+                if "important_kwd_empty_count" in res.columns:
+                    base = res["important_keywords"].apply(lambda raw: raw.split(",") if raw else [])
+                    counts = res["important_kwd_empty_count"].fillna(0).astype(int)
+                    res["important_kwd"] = [
+                        tokens + [""] * empty_count
+                        for tokens, empty_count in zip(base.tolist(), counts.tolist())
+                    ]
+                else:
+                    res["important_kwd"] = res["important_keywords"].apply(lambda v: v.split(",") if v else [])
            if "important_tks" in fields_all:
                res["important_tks"] = res["important_keywords"]
            if "questions" in res.columns:
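Note on the write/read path above: Infinity stores important_kwd as a single comma-joined varchar (important_keywords), so empty keyword entries would otherwise be lost on a round trip. The diff strips them before joining, records how many were stripped in the new important_kwd_empty_count column, and re-appends that many empty strings when reading back. A minimal standalone sketch of that round trip follows; the helper names encode_important_kwd and decode_important_kwd are illustrative only, not part of the codebase.

def encode_important_kwd(keywords: list[str]) -> tuple[str, int]:
    # Write side: drop empty entries, join the rest with commas, count what was dropped.
    empty_count = sum(1 for kw in keywords if kw == "")
    tokens = [kw for kw in keywords if kw != ""]
    return ",".join(tokens), empty_count

def decode_important_kwd(raw: str, empty_count: int) -> list[str]:
    # Read side: split the varchar back into tokens, then append the recorded number of empties.
    tokens = raw.split(",") if raw else []
    return tokens + [""] * empty_count

stored, count = encode_important_kwd(["alpha", "", "beta", ""])
assert stored == "alpha,beta" and count == 2
# Empty entries come back, although at the end rather than in their original positions.
assert decode_important_kwd(stored, count) == ["alpha", "beta", "", ""]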
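For the DataFrame path changed in rag/utils/infinity_conn.py, here is a self-contained sketch of the same reconstruction under the assumption that result rows carry important_keywords and important_kwd_empty_count columns as declared in the mapping; the fillna(0) is presumably what keeps rows stored before the new column existed (null count) working.

import pandas as pd

# Rows as they might come back from Infinity: keywords comma-joined, empties counted separately.
res = pd.DataFrame({
    "important_keywords": ["alpha,beta", "", "gamma"],
    "important_kwd_empty_count": [1, 2, None],  # None models a row written before the column existed
})

# Same pattern as the diff: split the varchar, then pad each row with its recorded empties.
base = res["important_keywords"].apply(lambda raw: raw.split(",") if raw else [])
counts = res["important_kwd_empty_count"].fillna(0).astype(int)
res["important_kwd"] = [
    tokens + [""] * empty_count
    for tokens, empty_count in zip(base.tolist(), counts.tolist())
]

print(res["important_kwd"].tolist())
# [['alpha', 'beta', ''], ['', ''], ['gamma']]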