mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-01-19 03:35:11 +08:00
Fix: Infinity keyword round-trip, highlight fallback, and KB update guards (#12660)
### What problem does this PR solve?

Fixes Infinity-specific API regressions: preserves ```important_kwd``` round‑trip for ```[""]```, restores the required highlight key in retrieval responses, and enforces Infinity guards for unsupported ```parser_id=tag``` and pagerank in ```/v1/kb/update```. Also removes a slow/buggy pandas row-wise apply that was throwing ```ValueError``` and causing flakiness.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -82,6 +82,20 @@ async def update():
|
||||
return get_data_error_result(
|
||||
message=f"Dataset name length is {len(req['name'])} which is large than {DATASET_NAME_LIMIT}")
|
||||
req["name"] = req["name"].strip()
|
||||
if settings.DOC_ENGINE_INFINITY:
|
||||
parser_id = req.get("parser_id")
|
||||
if isinstance(parser_id, str) and parser_id.lower() == "tag":
|
||||
return get_json_result(
|
||||
code=RetCode.OPERATING_ERROR,
|
||||
message="The chunking method Tag has not been supported by Infinity yet.",
|
||||
data=False,
|
||||
)
|
||||
if "pagerank" in req:
|
||||
return get_json_result(
|
||||
code=RetCode.DATA_ERROR,
|
||||
message="'pagerank' can only be set when doc_engine is elasticsearch",
|
||||
data=False,
|
||||
)
|
||||
|
||||
if not KnowledgebaseService.accessible4deletion(req["kb_id"], current_user.id):
|
||||
return get_json_result(
|
||||
|
||||
@ -367,6 +367,9 @@ class InfinityConnectionBase(DocStoreConnection):
|
||||
num_rows = len(res)
|
||||
column_id = res["id"]
|
||||
if field_name not in res:
|
||||
if field_name == "content_with_weight" and "content" in res:
|
||||
field_name = "content"
|
||||
else:
|
||||
return {}
|
||||
for i in range(num_rows):
|
||||
id = column_id[i]
|
||||
|
||||
@ -9,6 +9,7 @@
|
||||
"docnm": {"type": "varchar", "default": "", "analyzer": ["rag-coarse", "rag-fine"], "comment": "docnm_kwd, title_tks, title_sm_tks"},
|
||||
"name_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
||||
"tag_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
||||
"important_kwd_empty_count": {"type": "integer", "default": 0},
|
||||
"important_keywords": {"type": "varchar", "default": "", "analyzer": ["rag-coarse", "rag-fine"], "comment": "important_kwd, important_tks"},
|
||||
"questions": {"type": "varchar", "default": "", "analyzer": ["rag-coarse", "rag-fine"], "comment": "question_kwd, question_tks"},
|
||||
"content": {"type": "varchar", "default": "", "analyzer": ["rag-coarse", "rag-fine"], "comment": "content_with_weight, content_ltks, content_sm_ltks"},
|
||||
|
||||
@ -42,6 +42,7 @@ class InfinityConnection(InfinityConnectionBase):
|
||||
return False
|
||||
|
||||
def convert_select_fields(self, output_fields: list[str]) -> list[str]:
|
||||
need_empty_count = "important_kwd" in output_fields
|
||||
for i, field in enumerate(output_fields):
|
||||
if field in ["docnm_kwd", "title_tks", "title_sm_tks"]:
|
||||
output_fields[i] = "docnm"
|
||||
@ -53,6 +54,8 @@ class InfinityConnection(InfinityConnectionBase):
|
||||
output_fields[i] = "content"
|
||||
elif field in ["authors_tks", "authors_sm_tks"]:
|
||||
output_fields[i] = "authors"
|
||||
if need_empty_count and "important_kwd_empty_count" not in output_fields:
|
||||
output_fields.append("important_kwd_empty_count")
|
||||
return list(set(output_fields))
|
||||
|
||||
@staticmethod
|
||||
@ -340,6 +343,12 @@ class InfinityConnection(InfinityConnectionBase):
|
||||
if not d.get("docnm_kwd"):
|
||||
d["docnm"] = self.list2str(v)
|
||||
elif k == "important_kwd":
|
||||
if isinstance(v, list):
|
||||
empty_count = sum(1 for kw in v if kw == "")
|
||||
tokens = [kw for kw in v if kw != ""]
|
||||
d["important_keywords"] = self.list2str(tokens, ",")
|
||||
d["important_kwd_empty_count"] = empty_count
|
||||
else:
|
||||
d["important_keywords"] = self.list2str(v, ",")
|
||||
elif k == "important_tks":
|
||||
if not d.get("important_kwd"):
|
||||
@ -429,6 +438,12 @@ class InfinityConnection(InfinityConnectionBase):
|
||||
if not new_value.get("docnm_kwd"):
|
||||
new_value["docnm"] = v
|
||||
elif k == "important_kwd":
|
||||
if isinstance(v, list):
|
||||
empty_count = sum(1 for kw in v if kw == "")
|
||||
tokens = [kw for kw in v if kw != ""]
|
||||
new_value["important_keywords"] = self.list2str(tokens, ",")
|
||||
new_value["important_kwd_empty_count"] = empty_count
|
||||
else:
|
||||
new_value["important_keywords"] = self.list2str(v, ",")
|
||||
elif k == "important_tks":
|
||||
if not new_value.get("important_kwd"):
|
||||
@ -532,6 +547,14 @@ class InfinityConnection(InfinityConnectionBase):
|
||||
res[field] = res["docnm"]
|
||||
if "important_keywords" in res.columns:
|
||||
if "important_kwd" in fields_all:
|
||||
if "important_kwd_empty_count" in res.columns:
|
||||
base = res["important_keywords"].apply(lambda raw: raw.split(",") if raw else [])
|
||||
counts = res["important_kwd_empty_count"].fillna(0).astype(int)
|
||||
res["important_kwd"] = [
|
||||
tokens + [""] * empty_count
|
||||
for tokens, empty_count in zip(base.tolist(), counts.tolist())
|
||||
]
|
||||
else:
|
||||
res["important_kwd"] = res["important_keywords"].apply(lambda v: v.split(",") if v else [])
|
||||
if "important_tks" in fields_all:
|
||||
res["important_tks"] = res["important_keywords"]
|
||||
|
||||
Reference in New Issue
Block a user