[Frontend] Require flag for loading text and image embeds (#27204)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Russell Bryant
2025-10-22 11:52:02 -04:00
committed by GitHub
parent db6f28d898
commit 58fab50d82
25 changed files with 203 additions and 64 deletions
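Context for the diffs below: this change makes loading externally supplied embeddings an explicit opt-in. Multi-modal embedding inputs are gated behind --enable-mm-embeds (offline: enable_mm_embeds=True on LLM), and base64-encoded prompt embeddings behind --enable-prompt-embeds; without the flag the frontend raises a ValueError that names the flag. A minimal sketch of the offline opt-in, using the kwargs exercised by the tests below (model choice and the assumed off-by-default behavior are illustrative, not stated requirements):

from vllm import LLM

# Sketch only: mirrors the kwargs used in test_require_mm_embeds below.
# With enable_mm_embeds left off (assumed default), passing tensors in
# multi_modal_data raises a ValueError pointing at --enable-mm-embeds.
llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",
    enforce_eager=True,
    enable_mm_embeds=True,  # opt in to pre-computed image embeddings
)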

View File

@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm import LLM
@@ -12,8 +13,22 @@ def test_empty_prompt():
        llm.generate([""])
@pytest.mark.skip_v1
def test_out_of_vocab_token():
    llm = LLM(model="openai-community/gpt2", enforce_eager=True)
    with pytest.raises(ValueError, match="out of vocabulary"):
        llm.generate({"prompt_token_ids": [999999]})
def test_require_mm_embeds():
    llm = LLM(
        model="llava-hf/llava-1.5-7b-hf",
        enforce_eager=True,
        enable_mm_embeds=False,
    )
    with pytest.raises(ValueError, match="--enable-mm-embeds"):
        llm.generate(
            {
                "prompt": "<image>",
                "multi_modal_data": {"image": torch.empty(1, 1, 1)},
            }
        )

View File

@@ -292,3 +292,16 @@ async def test_prompt_logprobs_raises_error(
            temperature=0.0,
            extra_body={"prompt_embeds": encoded_embeds, "prompt_logprobs": True},
        )
@pytest.mark.asyncio
async def test_empty_prompt_embeds(
    client_with_prompt_embeds: openai.AsyncOpenAI,
) -> None:
    await client_with_prompt_embeds.completions.create(
        model=MODEL_NAME,
        prompt="Hello",
        max_tokens=5,
        temperature=0.0,
        extra_body={"prompt_embeds": []},
    )
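For completeness, a hedged sketch of the client-side encoding that produces the prompt_embeds payload used above: the tensor is serialized with torch.save and base64-encoded, matching the encode path in the renderer tests below. The tensor shape and dtype here are placeholders, not requirements stated by this diff.

import io

import pybase64
import torch

# Placeholder embedding tensor; real embeddings come from the model's
# input embedding layer and must match its hidden size.
embeds = torch.randn(8, 4096, dtype=torch.float32)

buffer = io.BytesIO()
torch.save(embeds, buffer)
buffer.seek(0)
encoded_embeds = pybase64.b64encode(buffer.getvalue()).decode("utf-8")

# Sent through the completions API as in the tests above, e.g.
# extra_body={"prompt_embeds": encoded_embeds}; the server rejects it
# unless it was started with --enable-prompt-embeds.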

View File

@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import io
from unittest.mock import Mock
# imports for structured outputs tests
import openai
@@ -10,7 +11,8 @@ import pytest
import regex as re
import torch
from vllm.entrypoints.renderer import BaseRenderer
from vllm.config import ModelConfig
from vllm.entrypoints.renderer import CompletionRenderer
from ...utils import RemoteOpenAIServer
@@ -59,6 +61,10 @@ async def test_out_of_vocab_token_ids():
def test_load_prompt_embeds(
    dtype: torch.dtype, layout: torch.layout, seq_len: int, hidden_size: int
):
    model_config = Mock(spec=ModelConfig)
    model_config.enable_prompt_embeds = True
    renderer = CompletionRenderer(model_config, tokenizer=None)
    # construct arbitrary tensors of various dtypes, layouts, and sizes.
    # We need to check against different layouts to make sure that if a user
    # uses sparse tensors to reduce the transmission size of prompt embeddings,
@@ -83,7 +89,7 @@ def test_load_prompt_embeds(
    buffer.seek(0)
    encoded_tensor = pybase64.b64encode(buffer.getvalue())
    loaded_prompt_embeds = BaseRenderer.load_prompt_embeds(encoded_tensor)
    loaded_prompt_embeds = renderer.load_prompt_embeds(encoded_tensor)
    assert len(loaded_prompt_embeds) == 1
    loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"]
    assert loaded_tensor.device.type == "cpu"
@@ -91,3 +97,22 @@ def test_load_prompt_embeds(
    torch.testing.assert_close(
        loaded_tensor, tensor.to("cpu").to_dense(), equal_nan=True
    )
@pytest.mark.parametrize("dtype", [torch.float32])
@pytest.mark.parametrize("seq_len", [2])
@pytest.mark.parametrize("hidden_size", [2])
def test_disable_prompt_embeds(dtype: torch.dtype, seq_len: int, hidden_size: int):
model_config = Mock(spec=ModelConfig)
model_config.enable_prompt_embeds = False
renderer = CompletionRenderer(model_config, tokenizer=None)
tensor = torch.randn((seq_len, hidden_size), dtype=dtype)
buffer = io.BytesIO()
torch.save(tensor, buffer)
buffer.seek(0)
encoded_tensor = pybase64.b64encode(buffer.getvalue())
with pytest.raises(ValueError, match="--enable-prompt-embeds"):
renderer.load_prompt_embeds(encoded_tensor)
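The decode side that these renderer tests exercise is not shown in this excerpt. A minimal sketch of the assumed behavior follows; apart from the load_prompt_embeds name used in the tests, the function name, signature, and error text here are assumptions, and the real logic lives in CompletionRenderer.

import io

import pybase64
import torch

def decode_prompt_embeds(encoded: bytes, enable_prompt_embeds: bool) -> torch.Tensor:
    # Illustrative only: reject encoded embeddings unless the flag is set,
    # which is the condition test_disable_prompt_embeds checks for.
    if not enable_prompt_embeds:
        raise ValueError(
            "Prompt embeddings are disabled; start the server with "
            "--enable-prompt-embeds to accept them."
        )
    tensor = torch.load(io.BytesIO(pybase64.b64decode(encoded)), weights_only=True)
    # The tests above expect a dense CPU tensor regardless of input layout.
    return tensor.to("cpu").to_dense()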

View File

@@ -15,30 +15,7 @@ MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
DTYPE = "float16"
@pytest.fixture(scope="module")
def server():
    args = [
        "--runner",
        "pooling",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        DTYPE,
        "--enforce-eager",
        "--trust-remote-code",
        "--skip-tokenizer-init",
        "--max-num-seqs",
        "32",
        "--model-impl",
        "terratorch",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_request(server: RemoteOpenAIServer, model_name: str):
def _terratorch_dummy_inputs(model_name: str):
    pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
    location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)
@@ -54,7 +31,7 @@ async def test_single_request(server: RemoteOpenAIServer, model_name: str):
    binary_data = buffer_coord.read()
    base64_coord_embedding = base64.b64encode(binary_data).decode("utf-8")
    prompt = {
    return {
        "model": model_name,
        "additional_data": {"prompt_token_ids": [1]},
        "encoding_format": "base64",
@@ -74,12 +51,33 @@ async def test_single_request(server: RemoteOpenAIServer, model_name: str):
        ],
    }
    # test single pooling
    response = requests.post(server.url_for("pooling"), json=prompt)
    response.raise_for_status()
    output = response.json()["data"][0]["data"]
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_request(model_name: str):
    args = [
        "--runner",
        "pooling",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        DTYPE,
        "--enforce-eager",
        "--trust-remote-code",
        "--max-num-seqs",
        "32",
        "--model-impl",
        "terratorch",
        "--skip-tokenizer-init",
        "--enable-mm-embeds",
    ]
    np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)
    with RemoteOpenAIServer(MODEL_NAME, args) as server:
        prompt = _terratorch_dummy_inputs(model_name)
    assert len(np_response) == 524288
        # test single pooling
        response = requests.post(server.url_for("pooling"), json=prompt)
        response.raise_for_status()
        output = response.json()["data"][0]["data"]
        np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)
        assert len(np_response) == 524288

View File

@@ -73,6 +73,19 @@ def phi3v_model_config_mm_interleaved():
    )
@pytest.fixture(scope="function")
def phi3v_model_config_image_embeds():
    return ModelConfig(
        PHI3V_MODEL_ID,
        runner="generate",
        trust_remote_code=True,
        limit_mm_per_prompt={
            "image": 2,
        },
        enable_mm_embeds=True,
    )
@pytest.fixture(scope="module")
def phi3v_tokenizer():
    return get_tokenizer(PHI3V_MODEL_ID)
@@ -799,7 +812,7 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(
def test_parse_chat_messages_empty_image_embeds_with_uuid(
    phi3v_model_config,
    phi3v_model_config_image_embeds,
    phi3v_tokenizer,
):
    uuid = "abcd"
@@ -813,7 +826,7 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
                ],
            }
        ],
        phi3v_model_config,
        phi3v_model_config_image_embeds,
        phi3v_tokenizer,
        content_format="string",
    )
@@ -832,7 +845,7 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
@pytest.mark.asyncio
async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
    phi3v_model_config,
    phi3v_model_config_image_embeds,
    phi3v_tokenizer,
):
    uuid = "abcd"
@@ -846,7 +859,7 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
                ],
            }
        ],
        phi3v_model_config,
        phi3v_model_config_image_embeds,
        phi3v_tokenizer,
        content_format="string",
    )

View File

@@ -17,6 +17,7 @@ from vllm.inputs.data import is_embeds_prompt
class MockModelConfig:
    max_model_len: int = 100
    encoder_config: dict | None = None
    enable_prompt_embeds: bool = True
class MockTokenizerResult: