From 5aefd6ac3169b7b56023549cfa9614274d6e15f0 Mon Sep 17 00:00:00 2001 From: daniel-salib Date: Tue, 25 Mar 2025 22:29:54 -0700 Subject: [PATCH] Fix raw_request extraction in load_aware_call decorator (#15382) Signed-off-by: Daniel Salib --- vllm/entrypoints/utils.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index 60cbb58af3..773f52fa38 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -68,13 +68,20 @@ def decrement_server_load(request: Request): def load_aware_call(func): @functools.wraps(func) - async def wrapper(*args, raw_request: Request, **kwargs): + async def wrapper(*args, **kwargs): + raw_request = kwargs.get("raw_request", + args[1] if len(args) > 1 else None) + + if raw_request is None: + raise ValueError( + "raw_request required when server load tracking is enabled") + if not raw_request.app.state.enable_server_load_tracking: - return await func(*args, raw_request=raw_request, **kwargs) + return await func(*args, **kwargs) raw_request.app.state.server_load_metrics += 1 try: - response = await func(*args, raw_request=raw_request, **kwargs) + response = await func(*args, **kwargs) except Exception: raw_request.app.state.server_load_metrics -= 1 raise