[Frontend] Require flag for loading text and image embeds (#27204)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
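This change gates raw-embedding inputs behind explicit opt-in flags: text (prompt) embeds are rejected unless `--enable-prompt-embeds` is set, and image (multimodal) embeds unless `--enable-mm-embeds` is set. A minimal sketch of the new behavior from the `LLM` entrypoint, mirroring the tests below (the model name and tensor shape are illustrative, and the flag is assumed to default to off):

    import torch
    from vllm import LLM

    # Without opting in, tensor image embeds in multi_modal_data are rejected.
    llm = LLM(model="llava-hf/llava-1.5-7b-hf", enforce_eager=True)
    try:
        llm.generate(
            {
                "prompt": "<image>",
                "multi_modal_data": {"image": torch.empty(1, 1, 1)},
            }
        )
    except ValueError as e:
        print(e)  # error message points at --enable-mm-embeds

    # Opting in explicitly restores loading of image embeds.
    llm = LLM(
        model="llava-hf/llava-1.5-7b-hf",
        enforce_eager=True,
        enable_mm_embeds=True,
    )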
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import pytest
+import torch

 from vllm import LLM

@@ -12,8 +13,22 @@ def test_empty_prompt():
     llm.generate([""])


 @pytest.mark.skip_v1
 def test_out_of_vocab_token():
     llm = LLM(model="openai-community/gpt2", enforce_eager=True)
     with pytest.raises(ValueError, match="out of vocabulary"):
         llm.generate({"prompt_token_ids": [999999]})
+
+
+def test_require_mm_embeds():
+    llm = LLM(
+        model="llava-hf/llava-1.5-7b-hf",
+        enforce_eager=True,
+        enable_mm_embeds=False,
+    )
+    with pytest.raises(ValueError, match="--enable-mm-embeds"):
+        llm.generate(
+            {
+                "prompt": "<image>",
+                "multi_modal_data": {"image": torch.empty(1, 1, 1)},
+            }
+        )
@@ -292,3 +292,16 @@ async def test_prompt_logprobs_raises_error(
         temperature=0.0,
         extra_body={"prompt_embeds": encoded_embeds, "prompt_logprobs": True},
     )
+
+
+@pytest.mark.asyncio
+async def test_empty_prompt_embeds(
+    client_with_prompt_embeds: openai.AsyncOpenAI,
+) -> None:
+    await client_with_prompt_embeds.completions.create(
+        model=MODEL_NAME,
+        prompt="Hello",
+        max_tokens=5,
+        temperature=0.0,
+        extra_body={"prompt_embeds": []},
+    )
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import io
+from unittest.mock import Mock

 # imports for structured outputs tests
 import openai
@@ -10,7 +11,8 @@ import pytest
 import regex as re
 import torch

-from vllm.entrypoints.renderer import BaseRenderer
+from vllm.config import ModelConfig
+from vllm.entrypoints.renderer import CompletionRenderer

 from ...utils import RemoteOpenAIServer

@@ -59,6 +61,10 @@ async def test_out_of_vocab_token_ids():
 def test_load_prompt_embeds(
     dtype: torch.dtype, layout: torch.layout, seq_len: int, hidden_size: int
 ):
+    model_config = Mock(spec=ModelConfig)
+    model_config.enable_prompt_embeds = True
+    renderer = CompletionRenderer(model_config, tokenizer=None)
+
     # construct arbitrary tensors of various dtypes, layouts, and sizes.
     # We need to check against different layouts to make sure that if a user
     # uses sparse tensors to reduce the transmission size of prompt embeddings,
@@ -83,7 +89,7 @@ def test_load_prompt_embeds(
     buffer.seek(0)
     encoded_tensor = pybase64.b64encode(buffer.getvalue())

-    loaded_prompt_embeds = BaseRenderer.load_prompt_embeds(encoded_tensor)
+    loaded_prompt_embeds = renderer.load_prompt_embeds(encoded_tensor)
     assert len(loaded_prompt_embeds) == 1
     loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"]
     assert loaded_tensor.device.type == "cpu"
@@ -91,3 +97,22 @@ def test_load_prompt_embeds(
     torch.testing.assert_close(
         loaded_tensor, tensor.to("cpu").to_dense(), equal_nan=True
     )
+
+
+@pytest.mark.parametrize("dtype", [torch.float32])
+@pytest.mark.parametrize("seq_len", [2])
+@pytest.mark.parametrize("hidden_size", [2])
+def test_disable_prompt_embeds(dtype: torch.dtype, seq_len: int, hidden_size: int):
+    model_config = Mock(spec=ModelConfig)
+    model_config.enable_prompt_embeds = False
+    renderer = CompletionRenderer(model_config, tokenizer=None)
+
+    tensor = torch.randn((seq_len, hidden_size), dtype=dtype)
+
+    buffer = io.BytesIO()
+    torch.save(tensor, buffer)
+    buffer.seek(0)
+    encoded_tensor = pybase64.b64encode(buffer.getvalue())
+
+    with pytest.raises(ValueError, match="--enable-prompt-embeds"):
+        renderer.load_prompt_embeds(encoded_tensor)
@@ -15,30 +15,7 @@ MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
 DTYPE = "float16"


-@pytest.fixture(scope="module")
-def server():
-    args = [
-        "--runner",
-        "pooling",
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        DTYPE,
-        "--enforce-eager",
-        "--trust-remote-code",
-        "--skip-tokenizer-init",
-        "--max-num-seqs",
-        "32",
-        "--model-impl",
-        "terratorch",
-    ]
-
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        yield remote_server
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_single_request(server: RemoteOpenAIServer, model_name: str):
+def _terratorch_dummy_inputs(model_name: str):
     pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
     location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)

@@ -54,7 +31,7 @@ async def test_single_request(server: RemoteOpenAIServer, model_name: str):
     binary_data = buffer_coord.read()
     base64_coord_embedding = base64.b64encode(binary_data).decode("utf-8")

-    prompt = {
+    return {
         "model": model_name,
         "additional_data": {"prompt_token_ids": [1]},
         "encoding_format": "base64",
@@ -74,12 +51,33 @@
         ],
     }

-    # test single pooling
-    response = requests.post(server.url_for("pooling"), json=prompt)
-    response.raise_for_status()
-
-    output = response.json()["data"][0]["data"]
-
-    np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)
-
-    assert len(np_response) == 524288
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_request(model_name: str):
+    args = [
+        "--runner",
+        "pooling",
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        DTYPE,
+        "--enforce-eager",
+        "--trust-remote-code",
+        "--max-num-seqs",
+        "32",
+        "--model-impl",
+        "terratorch",
+        "--skip-tokenizer-init",
+        "--enable-mm-embeds",
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as server:
+        prompt = _terratorch_dummy_inputs(model_name)
+
+        # test single pooling
+        response = requests.post(server.url_for("pooling"), json=prompt)
+        response.raise_for_status()
+
+        output = response.json()["data"][0]["data"]
+
+        np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)
+        assert len(np_response) == 524288
@@ -73,6 +73,19 @@ def phi3v_model_config_mm_interleaved():
     )


+@pytest.fixture(scope="function")
+def phi3v_model_config_image_embeds():
+    return ModelConfig(
+        PHI3V_MODEL_ID,
+        runner="generate",
+        trust_remote_code=True,
+        limit_mm_per_prompt={
+            "image": 2,
+        },
+        enable_mm_embeds=True,
+    )
+
+
 @pytest.fixture(scope="module")
 def phi3v_tokenizer():
     return get_tokenizer(PHI3V_MODEL_ID)
@@ -799,7 +812,7 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(


 def test_parse_chat_messages_empty_image_embeds_with_uuid(
-    phi3v_model_config,
+    phi3v_model_config_image_embeds,
     phi3v_tokenizer,
 ):
     uuid = "abcd"
@@ -813,7 +826,7 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
                 ],
             }
         ],
-        phi3v_model_config,
+        phi3v_model_config_image_embeds,
         phi3v_tokenizer,
         content_format="string",
     )
@@ -832,7 +845,7 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(

 @pytest.mark.asyncio
 async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
-    phi3v_model_config,
+    phi3v_model_config_image_embeds,
     phi3v_tokenizer,
 ):
     uuid = "abcd"
@@ -846,7 +859,7 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
                 ],
             }
         ],
-        phi3v_model_config,
+        phi3v_model_config_image_embeds,
         phi3v_tokenizer,
         content_format="string",
     )
@@ -17,6 +17,7 @@ from vllm.inputs.data import is_embeds_prompt
 class MockModelConfig:
     max_model_len: int = 100
     encoder_config: dict | None = None
+    enable_prompt_embeds: bool = True


 class MockTokenizerResult:
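For reference, the text-embeds counterpart of the guard above, as exercised by the renderer test (a sketch reusing the test's own Mock-based setup; not the production configuration path):

    import io
    from unittest.mock import Mock

    import pybase64
    import torch

    from vllm.config import ModelConfig
    from vllm.entrypoints.renderer import CompletionRenderer

    # Config with prompt embeds disabled (mocked, as in the test).
    model_config = Mock(spec=ModelConfig)
    model_config.enable_prompt_embeds = False
    renderer = CompletionRenderer(model_config, tokenizer=None)

    # Serialize a dummy tensor the way a client would for `prompt_embeds`.
    buffer = io.BytesIO()
    torch.save(torch.randn(2, 2), buffer)
    buffer.seek(0)
    encoded = pybase64.b64encode(buffer.getvalue())

    # Raises ValueError whose message mentions --enable-prompt-embeds.
    renderer.load_prompt_embeds(encoded)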