From 9070408b04eaa2a5cc2fbf17932402e4e2abf23d Mon Sep 17 00:00:00 2001 From: Idriss Sbaaoui <112825897+6ba3i@users.noreply.github.com> Date: Wed, 18 Mar 2026 17:28:20 +0800 Subject: [PATCH] Fix: model-specific handling (#13675) ### What problem does this PR solve? Add a handler for GPT-5 models that do not accept certain generation parameters, dropping those parameters, and centralize all model-specific parameter handling into a single helper function. Solves issue #13639 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [x] Refactoring --- rag/llm/chat_model.py | 103 ++++++++++++++++++++++++++++++------------ 1 file changed, 75 insertions(+), 28 deletions(-) diff --git a/rag/llm/chat_model.py b/rag/llm/chat_model.py index 4476ccbbd..8880d4d61 100644 --- a/rag/llm/chat_model.py +++ b/rag/llm/chat_model.py @@ -60,6 +60,58 @@ LENGTH_NOTIFICATION_CN = "······\n由于大模型的上下文窗口大小 LENGTH_NOTIFICATION_EN = "...\nThe answer is truncated by your chosen LLM due to its limitation on context length." +def _apply_model_family_policies( + model_name: str, + *, + backend: str, + provider: SupportedLiteLLMProvider | str | None = None, + gen_conf: dict | None = None, + request_kwargs: dict | None = None, +): + model_name_lower = (model_name or "").lower() + sanitized_gen_conf = deepcopy(gen_conf) if gen_conf else {} + sanitized_kwargs = dict(request_kwargs) if request_kwargs else {} + + # Qwen3 family disables thinking by extra_body on non-stream chat requests. + if "qwen3" in model_name_lower: + sanitized_kwargs["extra_body"] = {"enable_thinking": False} + + if backend == "base": + # GPT-5 and GPT-5.1 endpoints in this path have inconsistent generation-param support. 
+ if "gpt-5" in model_name_lower: + sanitized_gen_conf = {} + return sanitized_gen_conf, sanitized_kwargs + + if backend == "litellm": + if provider in {SupportedLiteLLMProvider.OpenAI, SupportedLiteLLMProvider.Azure_OpenAI} and "gpt-5" in model_name_lower: + for key in ("temperature", "top_p", "logprobs", "top_logprobs"): + sanitized_gen_conf.pop(key, None) + sanitized_kwargs.pop(key, None) + + if provider == SupportedLiteLLMProvider.HunYuan: + for key in ("presence_penalty", "frequency_penalty"): + sanitized_gen_conf.pop(key, None) + elif "kimi-k2.5" in model_name_lower: + reasoning = sanitized_gen_conf.pop("reasoning", None) + thinking = {"type": "enabled"} + if reasoning is not None: + thinking = {"type": "enabled"} if reasoning else {"type": "disabled"} + elif not isinstance(thinking, dict) or thinking.get("type") not in {"enabled", "disabled"}: + thinking = {"type": "disabled"} + sanitized_gen_conf["thinking"] = thinking + + thinking_enabled = thinking.get("type") == "enabled" + sanitized_gen_conf["temperature"] = 1.0 if thinking_enabled else 0.6 + sanitized_gen_conf["top_p"] = 0.95 + sanitized_gen_conf["n"] = 1 + sanitized_gen_conf["presence_penalty"] = 0.0 + sanitized_gen_conf["frequency_penalty"] = 0.0 + + return sanitized_gen_conf, sanitized_kwargs + + return sanitized_gen_conf, sanitized_kwargs + + class Base(ABC): def __init__(self, key, model_name, base_url, **kwargs): timeout = int(os.environ.get("LLM_TIMEOUT_SECONDS", 600)) @@ -100,9 +152,12 @@ class Base(ABC): def _clean_conf(self, gen_conf): model_name_lower = (self.model_name or "").lower() - # gpt-5 and gpt-5.1 endpoints have inconsistent parameter support, clear custom generation params to prevent unexpected issues + gen_conf, _ = _apply_model_family_policies( + self.model_name, + backend="base", + gen_conf=gen_conf, + ) if "gpt-5" in model_name_lower: - gen_conf = {} return gen_conf if "max_tokens" in gen_conf: @@ -461,8 +516,11 @@ class Base(ABC): return final_ans.strip(), tol_token - if 
self.model_name.lower().find("qwen3") >= 0: - kwargs["extra_body"] = {"enable_thinking": False} + _, kwargs = _apply_model_family_policies( + self.model_name, + backend="base", + request_kwargs=kwargs, + ) response = await self.async_client.chat.completions.create(model=self.model_name, messages=history, **gen_conf, **kwargs) @@ -1193,28 +1251,12 @@ class LiteLLMBase(ABC): return LLMErrorCode.ERROR_GENERIC def _clean_conf(self, gen_conf): - gen_conf = deepcopy(gen_conf) if gen_conf else {} - - if self.provider == SupportedLiteLLMProvider.HunYuan: - unsupported = ["presence_penalty", "frequency_penalty"] - for key in unsupported: - gen_conf.pop(key, None) - - elif "kimi-k2.5" in self.model_name.lower(): - reasoning = gen_conf.pop("reasoning", None) # will never get one here, handle this later - thinking = {"type": "enabled"} # enable thinking by default - if reasoning is not None: - thinking = {"type": "enabled"} if reasoning else {"type": "disabled"} - elif not isinstance(thinking, dict) or thinking.get("type") not in {"enabled", "disabled"}: - thinking = {"type": "disabled"} - gen_conf["thinking"] = thinking - - thinking_enabled = thinking.get("type") == "enabled" - gen_conf["temperature"] = 1.0 if thinking_enabled else 0.6 - gen_conf["top_p"] = 0.95 - gen_conf["n"] = 1 - gen_conf["presence_penalty"] = 0.0 - gen_conf["frequency_penalty"] = 0.0 + gen_conf, _ = _apply_model_family_policies( + self.model_name, + backend="litellm", + provider=self.provider, + gen_conf=gen_conf, + ) gen_conf.pop("max_tokens", None) return gen_conf @@ -1226,8 +1268,13 @@ class LiteLLMBase(ABC): hist.insert(0, {"role": "system", "content": system}) logging.info("[HISTORY]" + json.dumps(hist, ensure_ascii=False, indent=2)) - if self.model_name.lower().find("qwen3") >= 0: - kwargs["extra_body"] = {"enable_thinking": False} + gen_conf = self._clean_conf(gen_conf) + _, kwargs = _apply_model_family_policies( + self.model_name, + backend="litellm", + provider=self.provider, + 
request_kwargs=kwargs, + ) completion_args = self._construct_completion_args(history=hist, stream=False, tools=False, **{**gen_conf, **kwargs})