[Bugfix] Fix SHM cache initialization (#26427)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
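
The test updates below all follow one refactoring pattern: the OpenAI-compatible serving classes no longer accept a model_config constructor argument and instead read model_config (plus processor and io_processor) off the engine client. A minimal, self-contained sketch of that pattern, using hypothetical Fake* names rather than the real vLLM classes:

    # A minimal sketch (hypothetical Fake* names, not the vLLM source) of the
    # pattern these test changes exercise: serving objects derive model_config
    # from the engine client instead of taking it as a constructor argument.
    from dataclasses import dataclass, field
    from unittest.mock import MagicMock


    @dataclass
    class FakeEngineClient:
        # The engine client now carries the attributes the serving layer reads.
        model_config: MagicMock = field(default_factory=MagicMock)
        processor: MagicMock = field(default_factory=MagicMock)
        io_processor: MagicMock = field(default_factory=MagicMock)


    class FakeServing:
        def __init__(self, engine_client, models, request_logger=None):
            # model_config is pulled off the engine client, not passed in.
            self.model_config = engine_client.model_config
            self.models = models
            self.request_logger = request_logger


    engine = FakeEngineClient()
    serving = FakeServing(engine, models=MagicMock())
    assert serving.model_config is engine.model_config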
@@ -113,15 +113,17 @@ def mock_serving_setup():
     mock_engine.generate.reset_mock()
     mock_engine.add_lora.reset_mock()
 
-    mock_model_config = MockModelConfig()
+    mock_engine.model_config = MockModelConfig()
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()
 
     models = OpenAIServingModels(
         engine_client=mock_engine,
         base_model_paths=BASE_MODEL_PATHS,
-        model_config=mock_model_config,
     )
 
     serving_completion = OpenAIServingCompletion(
-        mock_engine, mock_model_config, models, request_logger=None
+        mock_engine, models, request_logger=None
     )
 
     serving_completion._process_inputs = AsyncMock(
@@ -245,17 +245,13 @@ class MockModelConfig:
         return self.diff_sampling_param or {}
 
 
-def _build_serving_chat(
-    engine: AsyncLLM, model_config: MockModelConfig
-) -> OpenAIServingChat:
+def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
     models = OpenAIServingModels(
         engine_client=engine,
         base_model_paths=BASE_MODEL_PATHS,
-        model_config=model_config,
     )
     serving_chat = OpenAIServingChat(
         engine,
-        model_config,
         models,
         response_role="assistant",
         chat_template=CHAT_TEMPLATE,
@@ -280,18 +276,17 @@ def _build_serving_chat(
 
 @dataclass
 class MockEngine:
-    async def get_model_config(self):
-        return MockModelConfig()
+    model_config: MockModelConfig = field(default_factory=MockModelConfig)
+    processor: MagicMock = field(default_factory=MagicMock)
+    io_processor: MagicMock = field(default_factory=MagicMock)
 
 
 async def _async_serving_chat_init():
     engine = MockEngine()
-    model_config = await engine.get_model_config()
 
-    models = OpenAIServingModels(engine, model_config, BASE_MODEL_PATHS)
+    models = OpenAIServingModels(engine, BASE_MODEL_PATHS)
     serving_completion = OpenAIServingChat(
         engine,
-        model_config,
         models,
         response_role="assistant",
         chat_template=CHAT_TEMPLATE,
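
A side note on the MockEngine rewrite above: dataclass fields defaulting to mutable objects should use field(default_factory=...) so that each instance gets its own object; a single class-level default would be shared by every instance. A small sketch:

    # Sketch: why MockEngine uses field(default_factory=...). The factory runs
    # once per instance, so each Engine gets independent mocks.
    from dataclasses import dataclass, field
    from unittest.mock import MagicMock


    @dataclass
    class Engine:
        processor: MagicMock = field(default_factory=MagicMock)


    a, b = Engine(), Engine()
    assert a.processor is not b.processor  # independent mocks per instance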
@@ -311,8 +306,11 @@ async def test_serving_chat_returns_correct_model_name():
     mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
+    mock_engine.model_config = MockModelConfig()
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()
 
-    serving_chat = _build_serving_chat(mock_engine, MockModelConfig())
+    serving_chat = _build_serving_chat(mock_engine)
     messages = [{"role": "user", "content": "what is 1+1?"}]
 
     async def return_model_name(*args):
@@ -338,8 +336,11 @@ async def test_serving_chat_should_set_correct_max_tokens():
     mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
+    mock_engine.model_config = MockModelConfig()
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()
 
-    serving_chat = _build_serving_chat(mock_engine, MockModelConfig())
+    serving_chat = _build_serving_chat(mock_engine)
 
     req = ChatCompletionRequest(
         model=MODEL_NAME,
@@ -368,9 +369,12 @@ async def test_serving_chat_should_set_correct_max_tokens():
     mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
+    mock_engine.model_config = mock_model_config
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()
 
     # Initialize the serving chat
-    serving_chat = _build_serving_chat(mock_engine, mock_model_config)
+    serving_chat = _build_serving_chat(mock_engine)
 
     # Test Case 1: No max_tokens specified in request
     req = ChatCompletionRequest(
@@ -410,9 +414,12 @@ async def test_serving_chat_should_set_correct_max_tokens():
     mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
+    mock_engine.model_config = mock_model_config
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()
 
     # Initialize the serving chat
-    serving_chat = _build_serving_chat(mock_engine, mock_model_config)
+    serving_chat = _build_serving_chat(mock_engine)
 
     # Test case 1: No max_tokens specified, defaults to context_window
     req = ChatCompletionRequest(
@@ -453,9 +460,12 @@ async def test_serving_chat_could_load_correct_generation_config():
     mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
+    mock_engine.model_config = mock_model_config
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()
 
     # Initialize the serving chat
-    serving_chat = _build_serving_chat(mock_engine, mock_model_config)
+    serving_chat = _build_serving_chat(mock_engine)
 
     req = ChatCompletionRequest(
         model=MODEL_NAME,
@@ -496,8 +506,11 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
     mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
+    mock_engine.model_config = mock_model_config
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()
 
-    serving_chat = _build_serving_chat(mock_engine, mock_model_config)
+    serving_chat = _build_serving_chat(mock_engine)
 
     # Test cache_salt
     req = ChatCompletionRequest(
@@ -22,10 +22,12 @@ def serving() -> OpenAIServing:
     model_config = Mock(spec=ModelConfig)
     model_config.max_model_len = 32768
     models = Mock(spec=OpenAIServingModels)
+    models.model_config = model_config
+    models.processor = Mock()
+    models.io_processor = Mock()
 
     serving = OpenAIServing(
         engine_client=engine_client,
-        model_config=model_config,
         models=models,
         request_logger=None,
     )
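
Why the hunk above assigns models.model_config and friends explicitly: Mock(spec=SomeClass) only auto-creates attributes that exist on the spec'd class, so anything the code under test reads beyond that must be set by hand. Assignment is still permitted; only attribute lookup is restricted. A small illustration with a hypothetical Engine class:

    # Sketch of spec'd-mock behavior (hypothetical Engine class).
    from unittest.mock import Mock


    class Engine:
        def generate(self):
            pass


    m = Mock(spec=Engine)
    m.generate()  # fine: generate exists on the spec

    try:
        m.model_config  # not defined on Engine, so lookup fails
    except AttributeError:
        pass

    m.model_config = Mock()  # explicit assignment is still allowed
    assert m.model_config is not None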
@@ -25,15 +25,17 @@ LORA_UNLOADING_SUCCESS_MESSAGE = (
 
 
 async def _async_serving_models_init() -> OpenAIServingModels:
-    mock_model_config = MagicMock(spec=ModelConfig)
     mock_engine_client = MagicMock(spec=EngineClient)
     # Set the max_model_len attribute to avoid missing attribute
+    mock_model_config = MagicMock(spec=ModelConfig)
     mock_model_config.max_model_len = 2048
+    mock_engine_client.model_config = mock_model_config
+    mock_engine_client.processor = MagicMock()
+    mock_engine_client.io_processor = MagicMock()
 
     serving_models = OpenAIServingModels(
         engine_client=mock_engine_client,
         base_model_paths=BASE_MODEL_PATHS,
-        model_config=mock_model_config,
         lora_modules=None,
     )
     await serving_models.init_static_loras()
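
For context, a hedged sketch of how an async helper like _async_serving_models_init above can be driven; the actual tests likely await it inside pytest-asyncio test functions, so the explicit asyncio.run here is an assumption for illustration:

    # Hypothetical driver for the async initializer above;
    # _async_serving_models_init comes from the surrounding test module.
    import asyncio


    async def test_init_static_loras():
        serving_models = await _async_serving_models_init()
        assert serving_models is not None


    asyncio.run(test_init_static_loras())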
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import AsyncExitStack
-from unittest.mock import AsyncMock, MagicMock
+from unittest.mock import MagicMock
 
 import pytest
 import pytest_asyncio
@@ -70,11 +70,14 @@ class TestInitializeToolSessions:
         """Create a real OpenAIServingResponses instance for testing"""
         # Create minimal mocks for required dependencies
        engine_client = MagicMock()
-        engine_client.get_model_config = AsyncMock()
 
         model_config = MagicMock()
         model_config.hf_config.model_type = "test"
         model_config.get_diff_sampling_param.return_value = {}
+        engine_client.model_config = model_config
+
+        engine_client.processor = MagicMock()
+        engine_client.io_processor = MagicMock()
 
         models = MagicMock()
 
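
One detail worth calling out in the fixture above: a bare MagicMock() auto-creates intermediate attributes on access, which is what lets the test set model_config.hf_config.model_type in a single statement. A minimal sketch:

    # Sketch: MagicMock builds nested attributes on the fly.
    from unittest.mock import MagicMock

    model_config = MagicMock()
    model_config.hf_config.model_type = "test"  # hf_config auto-created
    model_config.get_diff_sampling_param.return_value = {}

    assert model_config.hf_config.model_type == "test"
    assert model_config.get_diff_sampling_param() == {}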
@@ -83,7 +86,6 @@ class TestInitializeToolSessions:
         # Create the actual instance
         instance = OpenAIServingResponses(
             engine_client=engine_client,
-            model_config=model_config,
             models=models,
             request_logger=None,
             chat_template=None,
@@ -132,18 +134,20 @@ class TestValidateGeneratorInput:
         """Create a real OpenAIServingResponses instance for testing"""
         # Create minimal mocks for required dependencies
         engine_client = MagicMock()
-        engine_client.get_model_config = AsyncMock()
 
         model_config = MagicMock()
         model_config.hf_config.model_type = "test"
         model_config.get_diff_sampling_param.return_value = {}
+        engine_client.model_config = model_config
+
+        engine_client.processor = MagicMock()
+        engine_client.io_processor = MagicMock()
 
         models = MagicMock()
 
         # Create the actual instance
         instance = OpenAIServingResponses(
             engine_client=engine_client,
-            model_config=model_config,
             models=models,
             request_logger=None,
             chat_template=None,