Fix missmatch docnm_kwd in raptor chunks (#13451)

### What problem does this PR solve?

issue #13393 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Idriss Sbaaoui
2026-03-10 14:24:33 +08:00
committed by GitHub
parent 185ab0d4ef
commit 249b78561b

View File

@ -778,6 +778,14 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
res = []
tk_count = 0
max_errors = int(os.environ.get("RAPTOR_MAX_ERRORS", 3))
doc_name_by_id = {}
for doc_id in set(doc_ids):
ok, source_doc = DocumentService.get_by_id(doc_id)
if not ok or not source_doc:
continue
source_name = getattr(source_doc, "name", "")
if source_name:
doc_name_by_id[doc_id] = source_name
async def generate(chunks, did):
nonlocal tk_count, res
@ -792,11 +800,12 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
)
original_length = len(chunks)
chunks = await raptor(chunks, kb_parser_config["raptor"]["random_seed"], callback, row["id"])
effective_doc_name = row["name"] if did == fake_doc_id else doc_name_by_id.get(did, row["name"])
doc = {
"doc_id": did,
"kb_id": [str(row["kb_id"])],
"docnm_kwd": row["name"],
"title_tks": rag_tokenizer.tokenize(row["name"]),
"docnm_kwd": effective_doc_name,
"title_tks": rag_tokenizer.tokenize(effective_doc_name),
"raptor_kwd": "raptor"
}
if row["pagerank"]:
@ -1047,7 +1056,7 @@ async def do_handle_task(task):
return
# bind LLM for raptor
chat_model_config = get_model_config_by_type_and_name(task_dataset_id, LLMType.CHAT, kb_task_llm_id)
chat_model_config = get_model_config_by_type_and_name(task_tenant_id, LLMType.CHAT, kb_task_llm_id)
chat_model = LLMBundle(task_tenant_id, chat_model_config, lang=task_language)
# run RAPTOR
async with kg_limiter: