[Bugfix] Fix SHM cache initialization (#26427)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung, 2025-10-09 17:48:04 +08:00, committed by GitHub
parent dc7976dd9f
commit 4bdf7ac593
30 changed files with 357 additions and 417 deletions
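Across the test diffs below, one pattern repeats: model_config is no longer passed explicitly to OpenAIServingModels or the OpenAIServing* constructors; the serving layer now reads it (along with processor and io_processor) off the engine client. The following is a minimal sketch of the new fixture shape, pieced together from these hunks; the import paths and BaseModelPath usage are assumptions based on the vLLM module layout of this era, MODEL_NAME is a hypothetical placeholder, and the constructor signatures are inferred from the diffs rather than documented API:

from unittest.mock import MagicMock

from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels

MODEL_NAME = "some-model"  # hypothetical placeholder
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]

# New-style setup: everything the serving layer needs hangs off the engine mock.
mock_engine = MagicMock()
mock_engine.errored = False
mock_engine.model_config = MagicMock()  # was a separate constructor argument before this commit
mock_engine.processor = MagicMock()
mock_engine.io_processor = MagicMock()

# model_config= is gone from both constructor calls.
models = OpenAIServingModels(
    engine_client=mock_engine,
    base_model_paths=BASE_MODEL_PATHS,
)
serving_completion = OpenAIServingCompletion(mock_engine, models, request_logger=None)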

View File

@@ -113,15 +113,17 @@ def mock_serving_setup():
     mock_engine.generate.reset_mock()
     mock_engine.add_lora.reset_mock()

-    mock_model_config = MockModelConfig()
+    mock_engine.model_config = MockModelConfig()
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()

     models = OpenAIServingModels(
         engine_client=mock_engine,
         base_model_paths=BASE_MODEL_PATHS,
-        model_config=mock_model_config,
     )

     serving_completion = OpenAIServingCompletion(
-        mock_engine, mock_model_config, models, request_logger=None
+        mock_engine, models, request_logger=None
     )
     serving_completion._process_inputs = AsyncMock(

View File

@@ -245,17 +245,13 @@ class MockModelConfig:
         return self.diff_sampling_param or {}


-def _build_serving_chat(
-    engine: AsyncLLM, model_config: MockModelConfig
-) -> OpenAIServingChat:
+def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
     models = OpenAIServingModels(
         engine_client=engine,
         base_model_paths=BASE_MODEL_PATHS,
-        model_config=model_config,
     )
     serving_chat = OpenAIServingChat(
         engine,
-        model_config,
         models,
         response_role="assistant",
         chat_template=CHAT_TEMPLATE,
@@ -280,18 +276,17 @@ def _build_serving_chat(
 @dataclass
 class MockEngine:
-    async def get_model_config(self):
-        return MockModelConfig()
+    model_config: MockModelConfig = field(default_factory=MockModelConfig)
+    processor: MagicMock = field(default_factory=MagicMock)
+    io_processor: MagicMock = field(default_factory=MagicMock)


 async def _async_serving_chat_init():
     engine = MockEngine()
-    model_config = await engine.get_model_config()
-    models = OpenAIServingModels(engine, model_config, BASE_MODEL_PATHS)
+    models = OpenAIServingModels(engine, BASE_MODEL_PATHS)

     serving_completion = OpenAIServingChat(
         engine,
-        model_config,
         models,
         response_role="assistant",
         chat_template=CHAT_TEMPLATE,
@@ -311,8 +306,11 @@ async def test_serving_chat_returns_correct_model_name():
     mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
+    mock_engine.model_config = MockModelConfig()
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()

-    serving_chat = _build_serving_chat(mock_engine, MockModelConfig())
+    serving_chat = _build_serving_chat(mock_engine)

     messages = [{"role": "user", "content": "what is 1+1?"}]

     async def return_model_name(*args):
@@ -338,8 +336,11 @@ async def test_serving_chat_should_set_correct_max_tokens():
     mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
+    mock_engine.model_config = MockModelConfig()
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()

-    serving_chat = _build_serving_chat(mock_engine, MockModelConfig())
+    serving_chat = _build_serving_chat(mock_engine)

     req = ChatCompletionRequest(
         model=MODEL_NAME,
@@ -368,9 +369,12 @@ async def test_serving_chat_should_set_correct_max_tokens():
     mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
+    mock_engine.model_config = mock_model_config
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()

     # Initialize the serving chat
-    serving_chat = _build_serving_chat(mock_engine, mock_model_config)
+    serving_chat = _build_serving_chat(mock_engine)

     # Test Case 1: No max_tokens specified in request
     req = ChatCompletionRequest(
@@ -410,9 +414,12 @@ async def test_serving_chat_should_set_correct_max_tokens():
     mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
+    mock_engine.model_config = mock_model_config
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()

     # Initialize the serving chat
-    serving_chat = _build_serving_chat(mock_engine, mock_model_config)
+    serving_chat = _build_serving_chat(mock_engine)

     # Test case 1: No max_tokens specified, defaults to context_window
     req = ChatCompletionRequest(
@@ -453,9 +460,12 @@ async def test_serving_chat_could_load_correct_generation_config():
     mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
+    mock_engine.model_config = mock_model_config
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()

     # Initialize the serving chat
-    serving_chat = _build_serving_chat(mock_engine, mock_model_config)
+    serving_chat = _build_serving_chat(mock_engine)

     req = ChatCompletionRequest(
         model=MODEL_NAME,
@@ -496,8 +506,11 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
     mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
+    mock_engine.model_config = mock_model_config
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()

-    serving_chat = _build_serving_chat(mock_engine, mock_model_config)
+    serving_chat = _build_serving_chat(mock_engine)

     # Test cache_salt
     req = ChatCompletionRequest(
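Pieced together, the simplified helper from this file now looks roughly like the sketch below. MockModelConfig, CHAT_TEMPLATE, BASE_MODEL_PATHS, AsyncLLM, OpenAIServingChat, and OpenAIServingModels are names from the test module itself; the OpenAIServingChat call is truncated in the hunks above, so its remaining keyword arguments are elided here too:

from dataclasses import dataclass, field
from unittest.mock import MagicMock


@dataclass
class MockEngine:
    # Attributes the serving layer now reads directly off the engine client,
    # replacing the old async get_model_config() accessor.
    model_config: MockModelConfig = field(default_factory=MockModelConfig)
    processor: MagicMock = field(default_factory=MagicMock)
    io_processor: MagicMock = field(default_factory=MagicMock)


def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
    models = OpenAIServingModels(
        engine_client=engine,
        base_model_paths=BASE_MODEL_PATHS,
    )
    serving_chat = OpenAIServingChat(
        engine,
        models,
        response_role="assistant",
        chat_template=CHAT_TEMPLATE,
        # ...remaining arguments unchanged (not shown in the hunk)
    )
    return serving_chat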

View File

@@ -22,10 +22,12 @@ def serving() -> OpenAIServing:
     model_config = Mock(spec=ModelConfig)
     model_config.max_model_len = 32768
     models = Mock(spec=OpenAIServingModels)
+    models.model_config = model_config
+    models.processor = Mock()
+    models.io_processor = Mock()

     serving = OpenAIServing(
         engine_client=engine_client,
-        model_config=model_config,
         models=models,
         request_logger=None,
     )
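One detail worth noting: unlike the other fixtures, this one attaches model_config, processor, and io_processor to the OpenAIServingModels mock rather than to the engine client, which suggests the base OpenAIServing class resolves them through its models object. A minimal sketch of the updated fixture, assuming ModelConfig, OpenAIServing, and OpenAIServingModels are imported as in the test module and that engine_client is the mock created earlier in the fixture:

from unittest.mock import Mock

engine_client = Mock()  # stands in for the fixture's earlier engine mock

model_config = Mock(spec=ModelConfig)
model_config.max_model_len = 32768

# The serving base class reads these off the models object, not the engine client.
# Plain spec= (unlike spec_set=) still allows setting new attributes like these.
models = Mock(spec=OpenAIServingModels)
models.model_config = model_config
models.processor = Mock()
models.io_processor = Mock()

serving = OpenAIServing(
    engine_client=engine_client,
    models=models,
    request_logger=None,
)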

View File

@@ -25,15 +25,17 @@ LORA_UNLOADING_SUCCESS_MESSAGE = (
 async def _async_serving_models_init() -> OpenAIServingModels:
-    mock_model_config = MagicMock(spec=ModelConfig)
     mock_engine_client = MagicMock(spec=EngineClient)

     # Set the max_model_len attribute to avoid missing attribute
+    mock_model_config = MagicMock(spec=ModelConfig)
     mock_model_config.max_model_len = 2048
+    mock_engine_client.model_config = mock_model_config
+    mock_engine_client.processor = MagicMock()
+    mock_engine_client.io_processor = MagicMock()

     serving_models = OpenAIServingModels(
         engine_client=mock_engine_client,
         base_model_paths=BASE_MODEL_PATHS,
-        model_config=mock_model_config,
         lora_modules=None,
     )

     await serving_models.init_static_loras()

View File

@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from contextlib import AsyncExitStack
-from unittest.mock import AsyncMock, MagicMock
+from unittest.mock import MagicMock

 import pytest
 import pytest_asyncio
@@ -70,11 +70,14 @@ class TestInitializeToolSessions:
         """Create a real OpenAIServingResponses instance for testing"""
         # Create minimal mocks for required dependencies
         engine_client = MagicMock()
-        engine_client.get_model_config = AsyncMock()

         model_config = MagicMock()
         model_config.hf_config.model_type = "test"
         model_config.get_diff_sampling_param.return_value = {}
+        engine_client.model_config = model_config
+        engine_client.processor = MagicMock()
+        engine_client.io_processor = MagicMock()

         models = MagicMock()
@@ -83,7 +86,6 @@
         # Create the actual instance
         instance = OpenAIServingResponses(
             engine_client=engine_client,
-            model_config=model_config,
             models=models,
             request_logger=None,
             chat_template=None,
@@ -132,18 +134,20 @@ class TestValidateGeneratorInput:
         """Create a real OpenAIServingResponses instance for testing"""
         # Create minimal mocks for required dependencies
        engine_client = MagicMock()
-        engine_client.get_model_config = AsyncMock()

         model_config = MagicMock()
         model_config.hf_config.model_type = "test"
         model_config.get_diff_sampling_param.return_value = {}
+        engine_client.model_config = model_config
+        engine_client.processor = MagicMock()
+        engine_client.io_processor = MagicMock()

         models = MagicMock()

         # Create the actual instance
         instance = OpenAIServingResponses(
             engine_client=engine_client,
-            model_config=model_config,
             models=models,
             request_logger=None,
             chat_template=None,
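In both test classes the mock setup converges on the same shape; roughly the following, assuming OpenAIServingResponses is imported from vllm.entrypoints.openai.serving_responses as in the test module (the constructor call is truncated in the hunks, so its remaining keyword arguments are elided):

from unittest.mock import MagicMock

engine_client = MagicMock()

model_config = MagicMock()
model_config.hf_config.model_type = "test"
model_config.get_diff_sampling_param.return_value = {}

# get_model_config() is no longer awaited; the config is a plain attribute now,
# which is why the AsyncMock import could be dropped from this module.
engine_client.model_config = model_config
engine_client.processor = MagicMock()
engine_client.io_processor = MagicMock()

models = MagicMock()

instance = OpenAIServingResponses(
    engine_client=engine_client,
    models=models,
    request_logger=None,
    chat_template=None,
    # ...remaining keyword arguments as in the original tests (not shown in the hunks)
)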