Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Harry Mellor
Date: 2025-10-05 15:06:22 +01:00
Committed by: GitHub
Parent: 17edd8a807
Commit: d6953beb91
1,508 changed files with 115,244 additions and 94,146 deletions
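The diff below is almost entirely mechanical: `ruff format` reflows code that yapf wrapped at 80 columns into Black-compatible 88-column style (magic trailing commas, one argument per line at long call sites), normalizes string quotes to double quotes, and ruff's lint rules take over import sorting from isort. As a rough sketch of the pyproject.toml change such a migration involves (the exact configuration adopted by this commit is not shown in this excerpt, so the section contents below are assumptions):

```toml
# Hypothetical sketch only; the rule selection actually adopted by this
# commit may differ.

# Removed: the old formatter sections, e.g.
#   [tool.yapf]
#   column_limit = 80
#   [tool.isort]
#   ...

[tool.ruff]
line-length = 88              # ruff format follows Black's default width

[tool.ruff.lint]
select = [
    "E",  # pycodestyle errors
    "F",  # pyflakes
    "I",  # isort-compatible import sorting, replacing the isort tool
]

[tool.ruff.format]
docstring-code-format = true  # also reformat code embedded in docstrings
```

With the config in place, the tree is typically rewritten in one pass with `ruff format .` followed by `ruff check --fix .`, which is what produces a bulk diff of this shape.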


@@ -51,8 +51,9 @@ AITER_MODEL_LIST = [
pytest.param(
"google/gemma-1.1-2b-it", # gemma
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
pytest.mark.core_model,
pytest.mark.cpu_model,
pytest.mark.slow_test,
],
),
pytest.param(
@@ -65,8 +66,7 @@ AITER_MODEL_LIST = [
pytest.param(
"openbmb/MiniCPM3-4B",
# fused_moe not supported on CPU
marks=[pytest.mark.core_model,
large_gpu_mark(min_gb=32)],
marks=[pytest.mark.core_model, large_gpu_mark(min_gb=32)],
),
pytest.param(
"facebook/opt-125m", # opt
@@ -82,8 +82,9 @@ AITER_MODEL_LIST = [
pytest.param(
"Qwen/Qwen2.5-0.5B-Instruct", # qwen2
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
pytest.mark.core_model,
pytest.mark.cpu_model,
pytest.mark.slow_test,
],
),
pytest.param(
@@ -100,16 +101,25 @@ AITER_MODEL_LIST = [
marks=[pytest.mark.cpu_model],
),
pytest.param("swiss-ai/Apertus-8B-2509"), # apertus
])
],
)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
)
@pytest.mark.parametrize("use_prompt_embeds", [True, False])
def test_models(hf_runner, vllm_runner, example_prompts, model: str,
max_tokens: int, num_logprobs: int, use_rocm_aiter: bool,
use_prompt_embeds: bool, monkeypatch) -> None:
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
max_tokens: int,
num_logprobs: int,
use_rocm_aiter: bool,
use_prompt_embeds: bool,
monkeypatch,
) -> None:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
@@ -125,34 +135,37 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
prompt_embeds: Optional[list[torch.Tensor]] = ([] if use_prompt_embeds
else None)
prompt_embeds: Optional[list[torch.Tensor]] = [] if use_prompt_embeds else None
prompt_token_ids = []
for prompt in example_prompts:
token_ids = hf_model.tokenizer(prompt,
return_tensors="pt").input_ids.to(
hf_model.model.device)
token_ids = hf_model.tokenizer(prompt, return_tensors="pt").input_ids.to(
hf_model.model.device
)
prompt_token_ids.append(token_ids)
if prompt_embeds is not None:
prompt_embeds.append(hf_model.model.get_input_embeddings()(
token_ids).squeeze(0))
prompt_embeds.append(
hf_model.model.get_input_embeddings()(token_ids).squeeze(0)
)
with vllm_runner(
model,
tokenizer_name=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
max_num_seqs=2,
enable_prompt_embeds=use_prompt_embeds,
model,
tokenizer_name=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
max_num_seqs=2,
enable_prompt_embeds=use_prompt_embeds,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
if prompt_embeds is not None:
vllm_outputs_from_embeds = vllm_model.generate_greedy_logprobs(
prompt_embeds, max_tokens, num_logprobs)
prompt_embeds, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,


@@ -11,17 +11,17 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
with monkeypatch.context() as m:
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
with vllm_runner(
model,
load_format="dummy",
model,
load_format="dummy",
) as llm:
if model == "google/gemma-3-4b-it":
normalizers = llm.llm.collective_rpc(
lambda self: self.model_runner.model.language_model.model.
normalizer.cpu().item())
lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item()
)
config = llm.llm.llm_engine.model_config.hf_config.text_config
else:
normalizers = llm.llm.collective_rpc(
lambda self: self.model_runner.model.model.normalizer.cpu(
).item())
lambda self: self.model_runner.model.model.normalizer.cpu().item()
)
config = llm.llm.llm_engine.model_config.hf_config
assert np.allclose(normalizers, config.hidden_size**0.5, rtol=2e-3)


@@ -26,11 +26,13 @@ def test_models(
) -> None:
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,


@@ -24,7 +24,7 @@ SSM_MODELS = [
"tiiuae/falcon-mamba-tiny-dev",
# mamba2-codestral in transformers is broken pending:
# https://github.com/huggingface/transformers/pull/40861
#"yujiepan/mamba2-codestral-v0.1-tiny-random",
# "yujiepan/mamba2-codestral-v0.1-tiny-random",
]
HYBRID_MODELS = [
@@ -65,7 +65,6 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -75,11 +74,13 @@ def test_models(
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -109,13 +110,14 @@ def test_batching(
for_loop_outputs = []
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
for prompt in example_prompts:
single_output, = vllm_model.generate_greedy_logprobs([prompt],
max_tokens,
num_logprobs)
(single_output,) = vllm_model.generate_greedy_logprobs(
[prompt], max_tokens, num_logprobs
)
for_loop_outputs.append(single_output)
batched_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=for_loop_outputs,
@@ -134,8 +136,8 @@ def test_chunked_prefill_with_parallel_sampling(
max_tokens: int,
) -> None:
"""
Tests chunked prefill in conjunction with n > 1.
Tests chunked prefill in conjunction with n > 1.
In this case, prefill is populated with decoding tokens and
we test that it doesn't fail.
@@ -143,16 +145,13 @@ def test_chunked_prefill_with_parallel_sampling(
decoding steps inside a chunked prefill forward pass
(where we have both prefill and decode together)
"""
sampling_params = SamplingParams(n=3,
temperature=1,
seed=0,
max_tokens=max_tokens)
sampling_params = SamplingParams(n=3, temperature=1, seed=0, max_tokens=max_tokens)
with vllm_runner(
model,
enable_chunked_prefill=True,
# forces prefill chunks with decoding
max_num_batched_tokens=MAX_NUM_SEQS * 3,
max_num_seqs=MAX_NUM_SEQS,
model,
enable_chunked_prefill=True,
# forces prefill chunks with decoding
max_num_batched_tokens=MAX_NUM_SEQS * 3,
max_num_seqs=MAX_NUM_SEQS,
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@@ -170,10 +169,8 @@ def test_mamba_cache_cg_padding(
batch size. If it's not, a torch RuntimeError will be raised because
tensor dimensions aren't compatible.
"""
vllm_config = EngineArgs(model=model,
trust_remote_code=True).create_engine_config()
while len(example_prompts) == vllm_config.pad_for_cudagraph(
len(example_prompts)):
vllm_config = EngineArgs(model=model, trust_remote_code=True).create_engine_config()
while len(example_prompts) == vllm_config.pad_for_cudagraph(len(example_prompts)):
example_prompts.append(example_prompts[0])
try:
@@ -183,7 +180,8 @@ def test_mamba_cache_cg_padding(
pytest.fail(
"Couldn't run batch size which is not equal to a Cuda Graph "
"captured batch size. "
"Could be related to mamba cache not padded correctly")
"Could be related to mamba cache not padded correctly"
)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@@ -205,8 +203,10 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
except ValueError:
pytest.fail("Hybrid inner state wasn't cleaned up properly between"
"steps finished requests registered unnecessarily ")
pytest.fail(
"Hybrid inner state wasn't cleaned up properly between"
"steps finished requests registered unnecessarily "
)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@@ -215,10 +215,10 @@ def test_state_cleanup(
example_prompts,
model: str,
) -> None:
"""
"""
This test is for verifying that the Hybrid state is cleaned up between
steps.
If it's not cleaned, an error would be expected.
"""
try:
@@ -226,8 +226,10 @@ def test_state_cleanup(
for _ in range(10):
vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
except ValueError:
pytest.fail("Hybrid inner state wasn't cleaned up between states, "
"could be related to finished_requests_ids")
pytest.fail(
"Hybrid inner state wasn't cleaned up between states, "
"could be related to finished_requests_ids"
)
@multi_gpu_test(num_gpus=2)
@ -241,15 +243,19 @@ def test_distributed_correctness(
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(model, tensor_parallel_size=1,
max_num_seqs=MAX_NUM_SEQS) as vllm_model:
with vllm_runner(
model, tensor_parallel_size=1, max_num_seqs=MAX_NUM_SEQS
) as vllm_model:
vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, tensor_parallel_size=2,
max_num_seqs=MAX_NUM_SEQS) as vllm_model:
with vllm_runner(
model, tensor_parallel_size=2, max_num_seqs=MAX_NUM_SEQS
) as vllm_model:
vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=vllm_outputs_tp_1,
@@ -271,7 +277,6 @@ def test_full_cuda_graph(
max_tokens: int,
num_logprobs: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -281,11 +286,13 @@ def test_full_cuda_graph(
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -298,8 +305,9 @@ def test_full_cuda_graph(
@pytest.mark.parametrize("model", FP32_STATE_MODELS)
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("cache_dtype_param",
["mamba_ssm_cache_dtype", "mamba_cache_dtype"])
@pytest.mark.parametrize(
"cache_dtype_param", ["mamba_ssm_cache_dtype", "mamba_cache_dtype"]
)
def test_fp32_cache_state(
hf_runner,
vllm_runner,
@@ -310,7 +318,6 @@ def test_fp32_cache_state(
num_logprobs: int,
cache_dtype_param: str,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -320,13 +327,15 @@ def test_fp32_cache_state(
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model,
max_num_seqs=MAX_NUM_SEQS,
**{cache_dtype_param: "float32"}) as vllm_model:
with vllm_runner(
model, max_num_seqs=MAX_NUM_SEQS, **{cache_dtype_param: "float32"}
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -339,21 +348,23 @@ def test_fp32_cache_state(
# Helper functions for the APC tests
def _get_vllm_runner_params(model, max_model_len, tensor_parallel_size=1):
return {
'model_name': model,
'enable_prefix_caching': False,
'max_model_len': max_model_len,
'tensor_parallel_size': tensor_parallel_size,
'gpu_memory_utilization': 0.4
"model_name": model,
"enable_prefix_caching": False,
"max_model_len": max_model_len,
"tensor_parallel_size": tensor_parallel_size,
"gpu_memory_utilization": 0.4,
}
def _get_vLLM_output(vllm_runner,
kwargs,
prompts,
max_tokens,
num_logprobs,
num_repetitions=1,
vllm_model=None):
def _get_vLLM_output(
vllm_runner,
kwargs,
prompts,
max_tokens,
num_logprobs,
num_repetitions=1,
vllm_model=None,
):
outs = []
if vllm_model is None:
vllm_model = vllm_runner(**kwargs)
@@ -362,7 +373,8 @@ def _get_vLLM_output(vllm_runner,
vllm_output = vllm_model.generate_greedy(prompts, max_tokens)
else:
vllm_output = vllm_model.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs)
prompts, max_tokens, num_logprobs
)
outs.append(vllm_output)
return outs, vllm_model
@@ -387,7 +399,6 @@ def test_apc_single_prompt(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -395,29 +406,33 @@ def test_apc_single_prompt(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts.
generated_prompts = [MULTIPLE * example_prompts[0]]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs, n_repetitions)
vllm_runner_kwargs["enable_prefix_caching"] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
# In the first repetition, the caches are filled
@@ -450,7 +465,6 @@ def test_apc_single_prompt_block_align_alignment(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -458,30 +472,29 @@ def test_apc_single_prompt_block_align_alignment(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts. This custom prompt is used, as it causes the most issues
generated_prompts = ["The president of the United States is " * MULTIPLE]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_runner_kwargs["enable_prefix_caching"] = True
with vllm_runner(**vllm_runner_kwargs) as vllm_model:
# Retrieve the default mamba state block size
mamba_block_size = vllm_model.llm.llm_engine.cache_config. \
mamba_block_size
mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
# In case the hybrid model does not have the
# "mamba_block_size" assume a fixed constant
@@ -489,18 +502,18 @@ def test_apc_single_prompt_block_align_alignment(
mamba_block_size = 512
mamba_block_size_multiplier = 10
for offsets in [
-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3
]:
vllm_runner_kwargs[
'max_num_batched_tokens'] = mamba_block_size_multiplier * \
mamba_block_size - offsets
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens, num_logprobs,
n_repetitions)
for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
vllm_runner_kwargs["max_num_batched_tokens"] = (
mamba_block_size_multiplier * mamba_block_size - offsets
)
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
# Check alignment of the output logits when using APC
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
@@ -534,7 +547,6 @@ def test_apc_multiple_prompts_all_cached_outputs(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -542,30 +554,34 @@ def test_apc_multiple_prompts_all_cached_outputs(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts.
generated_prompts = [MULTIPLE * prompt for prompt in example_prompts]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs, n_repetitions)
vllm_runner_kwargs["enable_prefix_caching"] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
# In the first repetition, the caches are filled
@@ -598,7 +614,6 @@ def test_apc_multiple_prompts_block_align_alignment(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -606,34 +621,31 @@ def test_apc_multiple_prompts_block_align_alignment(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts. This custom prompt is used, as it causes the most issues
prompt_text = "The president of the United States is "
prompt_offsets = [0, 3, 7, 13, 17, 22, 25, 31]
generated_prompts = [
prompt_text[offset:] * MULTIPLE for offset in prompt_offsets
]
generated_prompts = [prompt_text[offset:] * MULTIPLE for offset in prompt_offsets]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(model, max_model_len,
tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_runner_kwargs["enable_prefix_caching"] = True
with vllm_runner(**vllm_runner_kwargs) as vllm_model:
# Retrieve the default mamba state block size
mamba_block_size = vllm_model.llm.llm_engine.cache_config. \
mamba_block_size
mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
# In case the hybrid model does not have the
# "mamba_block_size" assume a fixed constant
@@ -641,18 +653,18 @@ def test_apc_multiple_prompts_block_align_alignment(
mamba_block_size = 512
mamba_block_size_multiplier = 10
for offsets in [
-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3
]:
vllm_runner_kwargs[
'max_num_batched_tokens'] = mamba_block_size_multiplier * \
mamba_block_size - offsets
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens, num_logprobs,
n_repetitions)
for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
vllm_runner_kwargs["max_num_batched_tokens"] = (
mamba_block_size_multiplier * mamba_block_size - offsets
)
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
# Check alignment of the output logits when using APC
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
@@ -686,7 +698,6 @@ def test_apc_multiple_prompts_partial_cached_outputs(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -694,30 +705,30 @@ def test_apc_multiple_prompts_partial_cached_outputs(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts.
generated_prompts = [MULTIPLE * prompt for prompt in example_prompts]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
# Cache only part of all the prompts
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_runner_kwargs["enable_prefix_caching"] = True
vllm_outputs_partial_cache, vllm_model = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts[:3], max_tokens,
num_logprobs)
vllm_runner, vllm_runner_kwargs, generated_prompts[:3], max_tokens, num_logprobs
)
compare_operator(
outputs_0_lst=vllm_outputs_no_cache[0][:3],
@@ -726,13 +737,15 @@ def test_apc_multiple_prompts_partial_cached_outputs(
name_1="vllm_partial_cache",
)
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
vllm_model=vllm_model)
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
vllm_model=vllm_model,
)
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
# In the first repetition, the caches are filled


@@ -6,7 +6,9 @@ import json
import pytest
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
MistralToolCall, MistralToolParser)
MistralToolCall,
MistralToolParser,
)
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer import MistralTokenizer
@@ -33,136 +35,114 @@ SYMBOLIC_LANG_PROMPTS = [
]
# for function calling
TOOLS = [{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type":
"string",
"description":
"The city to find the weather for, e.g. 'San Francisco'"
TOOLS = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'San Francisco'",
},
"state": {
"type": "string",
"description": "the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"state": {
"type":
"string",
"description":
"the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'"
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"]
}
"required": ["city", "state", "unit"],
},
"required": ["city", "state", "unit"]
}
},
},
}, {
"type": "function",
"function": {
"name": "rewrite",
"description": "Rewrites text",
"parameters": {
"type": "object",
"required": [],
"properties": {
"text": {
"type": "string",
"description": "The input text to rewrite."
}
}
}
}
}]
{
"type": "function",
"function": {
"name": "rewrite",
"description": "Rewrites text",
"parameters": {
"type": "object",
"required": [],
"properties": {
"text": {
"type": "string",
"description": "The input text to rewrite.",
}
},
},
},
},
]
MSGS = [
{"role": "system", "content": "You are an assistant."},
{
"role": "system",
"content": "You are an assistant."
},
{
"role":
"user",
"content":
"Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors." # noqa
},
{
"role":
"assistant",
"content":
"",
"tool_calls": [{
"id": "bbc5b7ede",
"type": "function",
"function": {
"name":
"rewrite",
"arguments":
'{\"text\":\"My English needs improvving, maybe I make errors.\"}' # noqa
}
}]
},
{
"role": "tool",
"content":
"{\"action\":\"rewrite\",\"outcome\":\"My English needs improving, maybe I make errors.\"}", # noqa
"tool_call_id": "bbc5b7ede",
"name": "rewrite"
"role": "user",
"content": "Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors.", # noqa
},
{
"role": "assistant",
"content": "---\n\nMy English needs improving, maybe I make errors"
"content": "",
"tool_calls": [
{
"id": "bbc5b7ede",
"type": "function",
"function": {
"name": "rewrite",
"arguments": '{"text":"My English needs improvving, maybe I make errors."}', # noqa
},
}
],
},
{
"role":
"user",
"content": ("Can you tell me what the temperate"
" will be in Dallas, in fahrenheit?")
}
"role": "tool",
"content": '{"action":"rewrite","outcome":"My English needs improving, maybe I make errors."}', # noqa
"tool_call_id": "bbc5b7ede",
"name": "rewrite",
},
{
"role": "assistant",
"content": "---\n\nMy English needs improving, maybe I make errors",
},
{
"role": "user",
"content": (
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
),
},
]
SAMPLE_JSON_SCHEMA = {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"age": {
"type": "integer"
},
"name": {"type": "string"},
"age": {"type": "integer"},
"skills": {
"type": "array",
"items": {
"type": "string",
"maxLength": 10
},
"minItems": 3
"items": {"type": "string", "maxLength": 10},
"minItems": 3,
},
"work_history": {
"type": "array",
"items": {
"type": "object",
"properties": {
"company": {
"type": "string"
},
"duration": {
"type": "number"
},
"position": {
"type": "string"
}
"company": {"type": "string"},
"duration": {"type": "number"},
"position": {"type": "string"},
},
"required": ["company", "position"]
}
}
"required": ["company", "position"],
},
},
},
"required": ["name", "age", "skills", "work_history"]
"required": ["name", "age", "skills", "work_history"],
}
@@ -170,17 +150,25 @@ SAMPLE_JSON_SCHEMA = {
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, example_prompts, model: str,
dtype: str, max_tokens: int, num_logprobs: int) -> None:
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
# TODO(sang): Sliding window should be tested separately.
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, dtype=dtype,
tokenizer_mode="mistral") as vllm_model:
with vllm_runner(model, dtype=dtype, tokenizer_mode="mistral") as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -194,27 +182,35 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str,
max_tokens: int, num_logprobs: int) -> None:
def test_mistral_format(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
) as mistral_format_model:
mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="auto",
load_format="safetensors",
config_format="hf",
model,
dtype=dtype,
tokenizer_mode="auto",
load_format="safetensors",
config_format="hf",
) as hf_format_model:
hf_format_outputs = hf_format_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_format_outputs,
@@ -226,34 +222,35 @@ def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str,
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_mistral_symbolic_languages(vllm_runner, model: str,
dtype: str) -> None:
with vllm_runner(model,
dtype=dtype,
max_model_len=8192,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral") as vllm_model:
def test_mistral_symbolic_languages(vllm_runner, model: str, dtype: str) -> None:
with vllm_runner(
model,
dtype=dtype,
max_model_len=8192,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral",
) as vllm_model:
for prompt in SYMBOLIC_LANG_PROMPTS:
msg = {"role": "user", "content": prompt}
outputs = vllm_model.llm.chat([msg],
sampling_params=SAMPLING_PARAMS)
outputs = vllm_model.llm.chat([msg], sampling_params=SAMPLING_PARAMS)
assert "�" not in outputs[0].outputs[0].text.strip()
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
with vllm_runner(model,
dtype=dtype,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral") as vllm_model:
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral",
) as vllm_model:
msgs = copy.deepcopy(MSGS)
outputs = vllm_model.llm.chat(msgs,
tools=TOOLS,
sampling_params=SAMPLING_PARAMS)
outputs = vllm_model.llm.chat(
msgs, tools=TOOLS, sampling_params=SAMPLING_PARAMS
)
tokenizer = vllm_model.llm.get_tokenizer()
tool_parser = MistralToolParser(tokenizer)
@@ -265,10 +262,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
assert parsed_message.tools_called
assert MistralToolCall.is_valid_id(parsed_message.tool_calls[0].id)
assert parsed_message.tool_calls[
0].function.name == "get_current_weather"
assert parsed_message.tool_calls[
0].function.arguments == '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}' # noqa
assert parsed_message.tool_calls[0].function.name == "get_current_weather"
assert (
parsed_message.tool_calls[0].function.arguments
== '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}'
) # noqa
assert parsed_message.content is None
@@ -297,17 +295,10 @@ def test_mistral_function_call_nested_json():
"city": "Dallas",
"state": "TX",
"unit": "fahrenheit",
"sub_dict": {
"foo": "bar",
"inner": {
"x": 1,
"y": 2
}
},
"sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}},
}
model_output = (
f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}")
model_output = f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}"
parsed = parser.extract_tool_calls(model_output, None)


@@ -15,62 +15,56 @@ MODELS = [
def test_phimoe_routing_function():
from vllm.model_executor.models.phimoe import phimoe_routing_function
test_case = {
0: {
"hidden_states":
torch.tensor([1, 2, 3, 4, 5, 6, 7, 8],
dtype=torch.float32,
requires_grad=False).view(4, 2),
"gating_output":
torch.tensor([0.1, 0.2, 0.3, 0.4],
dtype=torch.float32,
requires_grad=False),
"topk":
2,
"renormalize":
False,
"hidden_states": torch.tensor(
[1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
).view(4, 2),
"gating_output": torch.tensor(
[0.1, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
),
"topk": 2,
"renormalize": False,
},
1: {
"hidden_states":
torch.tensor([1, 2, 3, 4, 5, 6, 7, 8],
dtype=torch.float32,
requires_grad=False).view(4, 2),
"gating_output":
torch.tensor([0.4, 0.2, 0.3, 0.4],
dtype=torch.float32,
requires_grad=False),
"topk":
2,
"renormalize":
False,
}
"hidden_states": torch.tensor(
[1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
).view(4, 2),
"gating_output": torch.tensor(
[0.4, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
),
"topk": 2,
"renormalize": False,
},
}
ground_truth = {
0: {
"topk_weights":
torch.tensor([1., 1.], dtype=torch.float32, requires_grad=False),
"topk_ids":
torch.tensor([3, 2], dtype=torch.long, requires_grad=False),
"topk_weights": torch.tensor(
[1.0, 1.0], dtype=torch.float32, requires_grad=False
),
"topk_ids": torch.tensor([3, 2], dtype=torch.long, requires_grad=False),
},
1: {
"topk_weights":
torch.tensor([0.5, 1.], dtype=torch.float32, requires_grad=False),
"topk_ids":
torch.tensor([0, 3], dtype=torch.long, requires_grad=False),
}
"topk_weights": torch.tensor(
[0.5, 1.0], dtype=torch.float32, requires_grad=False
),
"topk_ids": torch.tensor([0, 3], dtype=torch.long, requires_grad=False),
},
}
for test_id in test_case:
topk_weights, topk_ids = phimoe_routing_function(**test_case[test_id])
assert torch.allclose(topk_weights,
ground_truth[test_id]["topk_weights"])
assert torch.allclose(topk_weights, ground_truth[test_id]["topk_weights"])
assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
@pytest.mark.skipif(condition=current_platform.is_cpu(),
reason="This test takes a lot time to run on CPU, "
"and vllm CI's disk space is not enough for this model.")
@pytest.mark.skipif(
condition=current_platform.is_cpu(),
reason="This test takes a lot time to run on CPU, "
"and vllm CI's disk space is not enough for this model.",
)
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@@ -87,11 +81,13 @@ def test_models(
) -> None:
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,


@@ -8,8 +8,7 @@ import torch
from datasets import load_dataset
import tests.ci_envs as ci_envs
from tests.models.utils import (GenerateModelInfo,
TokensTextLogprobsPromptLogprobs)
from tests.models.utils import GenerateModelInfo, TokensTextLogprobsPromptLogprobs
from vllm.logprobs import Logprob
# See #24485
@@ -18,13 +17,14 @@ MAX_LENGTH = 1024
@torch.inference_mode
def wikitext_ppl_test(hf_runner,
vllm_runner,
model_info: GenerateModelInfo,
max_length=MAX_LENGTH,
vllm_extra_kwargs=None,
atol=PPL_TOL):
def wikitext_ppl_test(
hf_runner,
vllm_runner,
model_info: GenerateModelInfo,
max_length=MAX_LENGTH,
vllm_extra_kwargs=None,
atol=PPL_TOL,
):
# A model family has many models with the same architecture,
# and we don't need to test each one.
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
@@ -44,15 +44,16 @@ def wikitext_ppl_test(hf_runner,
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
if "hf_overrides" not in vllm_extra_kwargs:
vllm_extra_kwargs["hf_overrides"] = {}
vllm_extra_kwargs["hf_overrides"][
"head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
with vllm_runner(model_info.name,
gpu_memory_utilization=0.7,
max_model_len=max_length,
max_num_seqs=1,
enforce_eager=True,
**vllm_extra_kwargs) as vllm_model:
with vllm_runner(
model_info.name,
gpu_memory_utilization=0.7,
max_model_len=max_length,
max_num_seqs=1,
enforce_eager=True,
**vllm_extra_kwargs,
) as vllm_model:
# Use max_num_seqs=1 to avoid OOM,
# and avoid batch different requests together.
@@ -60,7 +61,7 @@ def wikitext_ppl_test(hf_runner,
# Confirm whether vllm is using the correct architecture
if model_info.architecture:
assert (model_info.architecture in model_config.architectures)
assert model_info.architecture in model_config.architectures
max_length = min(model_config.max_model_len - 1, max_length)
stride = max_length
@@ -74,12 +75,14 @@ def wikitext_ppl_test(hf_runner,
end_loc = min(begin_loc + max_length, n_tokens)
chunks.append(tokens[begin_loc:end_loc])
outputs = vllm_model.generate_greedy_logprobs(prompts=chunks,
max_tokens=1,
num_logprobs=None,
num_prompt_logprobs=0,
use_tqdm=False)
nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu")
outputs = vllm_model.generate_greedy_logprobs(
prompts=chunks,
max_tokens=1,
num_logprobs=None,
num_prompt_logprobs=0,
use_tqdm=False,
)
nll_sum = torch.tensor(0.0, dtype=torch.float32, device="cpu")
n_tokens = 0
for output in outputs:
output = cast(TokensTextLogprobsPromptLogprobs, output)
@@ -94,7 +97,8 @@ def wikitext_ppl_test(hf_runner,
token_log_probs.append(token_log_prob)
neg_log_likelihood = -torch.tensor(
token_log_probs, dtype=torch.float32, device="cpu").sum()
token_log_probs, dtype=torch.float32, device="cpu"
).sum()
nll_sum += neg_log_likelihood
n_tokens += len(token_log_probs)
vllm_ppl = float(torch.exp(nll_sum / n_tokens))
@@ -104,14 +108,13 @@ def wikitext_ppl_test(hf_runner,
# Accelerate ppl test by setting Transformers ppl score to a constant
if model_info.hf_ppl is None:
with hf_runner(
model_info.name,
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
model_info.name,
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
) as hf_model:
nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu")
nll_sum = torch.tensor(0.0, dtype=torch.float32, device="cpu")
n_tokens = 0
for chunk in chunks:
inputs = hf_model.wrap_device(
{"input_ids": torch.tensor([chunk])})
inputs = hf_model.wrap_device({"input_ids": torch.tensor([chunk])})
input_ids = inputs["input_ids"]
outputs = hf_model.model(input_ids, labels=input_ids)
neg_log_likelihood = outputs.loss


@@ -6,8 +6,7 @@ from typing import Optional
import pytest
from tests.conftest import HfRunner
from tests.models.utils import (EmbedModelInfo, check_embeddings_close,
matryoshka_fy)
from tests.models.utils import EmbedModelInfo, check_embeddings_close, matryoshka_fy
def run_embedding_correctness_test(
@@ -29,12 +28,14 @@ def run_embedding_correctness_test(
)
def correctness_test_embed_models(hf_runner,
vllm_runner,
model_info: EmbedModelInfo,
example_prompts,
vllm_extra_kwargs=None,
hf_model_callback=None):
def correctness_test_embed_models(
hf_runner,
vllm_runner,
model_info: EmbedModelInfo,
example_prompts,
vllm_extra_kwargs=None,
hf_model_callback=None,
):
pytest.skip("Debug only, ci prefers to use mteb test.")
# The example_prompts has ending "\n", for example:
@@ -51,18 +52,16 @@ def correctness_test_embed_models(hf_runner,
if model_info.hf_overrides is not None:
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,
**vllm_extra_kwargs) as vllm_model:
with vllm_runner(
model_info.name, runner="pooling", max_model_len=None, **vllm_extra_kwargs
) as vllm_model:
vllm_outputs = vllm_model.embed(example_prompts)
with hf_runner(
model_info.name,
dtype=model_info.hf_dtype,
is_sentence_transformer=True,
model_info.name,
dtype=model_info.hf_dtype,
is_sentence_transformer=True,
) as hf_model:
if hf_model_callback is not None:
hf_model_callback(hf_model)


@@ -4,8 +4,7 @@ import pytest
import torch
from transformers import AutoModelForSequenceClassification
from tests.models.language.pooling.embed_utils import (
run_embedding_correctness_test)
from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
@pytest.mark.parametrize(
@@ -20,28 +19,27 @@ def test_classify_models(
model: str,
dtype: str,
) -> None:
example_prompts = example_prompts * 2
with vllm_runner(model,
max_model_len=512,
dtype=dtype,
enable_prefix_caching=True) as vllm_model:
with vllm_runner(
model, max_model_len=512, dtype=dtype, enable_prefix_caching=True
) as vllm_model:
cache_config = vllm_model.llm.llm_engine.cache_config
assert cache_config.enable_prefix_caching
vllm_outputs = vllm_model.classify(example_prompts)
with hf_runner(model,
dtype=dtype,
auto_cls=AutoModelForSequenceClassification) as hf_model:
with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
) as hf_model:
hf_outputs = hf_model.classify(example_prompts)
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = torch.tensor(hf_output)
vllm_output = torch.tensor(vllm_output)
assert torch.allclose(hf_output, vllm_output,
1e-3 if dtype == "float" else 1e-2)
assert torch.allclose(
hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
)
@pytest.mark.parametrize(
@@ -59,18 +57,18 @@ def test_embed_models(
example_prompts = [str(s).strip() for s in example_prompts] * 2
with vllm_runner(
model,
runner="pooling",
max_model_len=None,
enable_prefix_caching=True,
model,
runner="pooling",
max_model_len=None,
enable_prefix_caching=True,
) as vllm_model:
cache_config = vllm_model.llm.llm_engine.cache_config
assert cache_config.enable_prefix_caching
vllm_outputs = vllm_model.embed(example_prompts)
with hf_runner(
model,
is_sentence_transformer=True,
model,
is_sentence_transformer=True,
) as hf_model:
run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
@@ -81,13 +79,14 @@ def test_embed_models(
"intfloat/e5-small",
"Alibaba-NLP/gte-Qwen2-1.5B-instruct", # is_causal == False
"papluca/xlm-roberta-base-language-detection",
])
],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_non_causal_models(hf_runner, vllm_runner, example_prompts, model: str,
dtype: str) -> None:
with vllm_runner(model,
max_model_len=512,
dtype=dtype,
enable_prefix_caching=True) as vllm_model:
def test_non_causal_models(
hf_runner, vllm_runner, example_prompts, model: str, dtype: str
) -> None:
with vllm_runner(
model, max_model_len=512, dtype=dtype, enable_prefix_caching=True
) as vllm_model:
cache_config = vllm_model.llm.llm_engine.cache_config
assert not cache_config.enable_prefix_caching


@@ -10,15 +10,17 @@ from vllm.platforms import current_platform
@pytest.mark.parametrize(
"model",
[
pytest.param("jason9693/Qwen2.5-1.5B-apeach",
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
]),
pytest.param(
"jason9693/Qwen2.5-1.5B-apeach",
marks=[
pytest.mark.core_model,
pytest.mark.cpu_model,
pytest.mark.slow_test,
],
),
],
)
@pytest.mark.parametrize("dtype",
["half"] if current_platform.is_rocm() else ["float"])
@pytest.mark.parametrize("dtype", ["half"] if current_platform.is_rocm() else ["float"])
def test_models(
hf_runner,
vllm_runner,
@@ -35,9 +37,9 @@ def test_models(
with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.classify(example_prompts)
with hf_runner(model,
dtype=dtype,
auto_cls=AutoModelForSequenceClassification) as hf_model:
with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
) as hf_model:
hf_outputs = hf_model.classify(example_prompts)
# check logits difference
@@ -48,5 +50,6 @@ def test_models(
# the tolerance value of 1e-2 is selected based on the
# half datatype tests in
# tests/models/language/pooling/test_embedding.py
assert torch.allclose(hf_output, vllm_output,
1e-3 if dtype == "float" else 1e-2)
assert torch.allclose(
hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
)


@@ -18,20 +18,25 @@ from ...utils import check_embeddings_close
# case won't pass because gte-Qwen2-1.5B-instruct will cache custom
# model code with bidirectional attention.
# [Decoder-only]
pytest.param("BAAI/bge-multilingual-gemma2",
marks=[pytest.mark.core_model, pytest.mark.slow_test]),
pytest.param(
"BAAI/bge-multilingual-gemma2",
marks=[pytest.mark.core_model, pytest.mark.slow_test],
),
pytest.param(
"intfloat/e5-mistral-7b-instruct",
# CPU v1 doesn't support sliding window
marks=[pytest.mark.core_model]),
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
marks=[pytest.mark.cpu_model]),
marks=[pytest.mark.core_model],
),
pytest.param(
"ssmits/Qwen2-7B-Instruct-embed-base", marks=[pytest.mark.cpu_model]
),
# [Encoder-only]
pytest.param(
"BAAI/bge-base-en-v1.5",
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
pytest.mark.core_model,
pytest.mark.cpu_model,
pytest.mark.slow_test,
],
),
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
@@ -50,7 +55,6 @@ def test_models(
model,
monkeypatch,
) -> None:
if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
# ROCm Triton FA does not currently support sliding window attention
# switch to use ROCm CK FA backend
@@ -58,13 +62,14 @@ def test_models(
vllm_extra_kwargs = {}
if model == "ssmits/Qwen2-7B-Instruct-embed-base":
vllm_extra_kwargs["pooler_config"] = \
PoolerConfig(pooling_type="MEAN", normalize=False)
vllm_extra_kwargs["pooler_config"] = PoolerConfig(
pooling_type="MEAN", normalize=False
)
max_model_len: Optional[int] = 512
if model in [
"sentence-transformers/all-MiniLM-L12-v2",
"sentence-transformers/stsb-roberta-base-v2"
"sentence-transformers/all-MiniLM-L12-v2",
"sentence-transformers/stsb-roberta-base-v2",
]:
max_model_len = None
@@ -79,10 +84,9 @@ def test_models(
with hf_runner(model, is_sentence_transformer=True) as hf_model:
hf_outputs = hf_model.encode(example_prompts)
with vllm_runner(model,
runner="pooling",
max_model_len=max_model_len,
**vllm_extra_kwargs) as vllm_model:
with vllm_runner(
model, runner="pooling", max_model_len=max_model_len, **vllm_extra_kwargs
) as vllm_model:
vllm_outputs = vllm_model.embed(example_prompts)
check_embeddings_close(


@@ -70,8 +70,9 @@ async def run_client_embeddings(
def gritlm_instruction(instruction):
return ("<|user|>\n" + instruction +
"\n<|embed|>\n" if instruction else "<|embed|>\n")
return (
"<|user|>\n" + instruction + "\n<|embed|>\n" if instruction else "<|embed|>\n"
)
def get_test_data():
@@ -80,7 +81,8 @@ def get_test_data():
README.md in https://github.com/ContextualAI/gritlm
"""
q_instruction = gritlm_instruction(
"Given a scientific paper title, retrieve the paper's abstract", )
"Given a scientific paper title, retrieve the paper's abstract",
)
queries = [
"Bitcoin: A Peer-to-Peer Electronic Cash System",
"Generative Representational Instruction Tuning",
@@ -114,9 +116,9 @@ def test_gritlm_offline_embedding(vllm_runner):
queries, q_instruction, documents, d_instruction = get_test_data()
with vllm_runner(
MODEL_NAME,
runner="pooling",
max_model_len=MAX_MODEL_LEN,
MODEL_NAME,
runner="pooling",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.llm
@@ -161,9 +163,9 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
with vllm_runner(
MODEL_NAME,
runner="generate",
max_model_len=MAX_MODEL_LEN,
MODEL_NAME,
runner="generate",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.llm


@@ -21,16 +21,18 @@ def test_idefics_multimodal(
"The future of AI is",
]
with vllm_runner(model_name="HuggingFaceM4/Idefics3-8B-Llama3",
runner="pooling",
task="classify",
convert="classify",
load_format="dummy",
max_model_len=512,
enforce_eager=True,
tensor_parallel_size=1,
disable_log_stats=True,
dtype="bfloat16") as vllm_model:
with vllm_runner(
model_name="HuggingFaceM4/Idefics3-8B-Llama3",
runner="pooling",
task="classify",
convert="classify",
load_format="dummy",
max_model_len=512,
enforce_eager=True,
tensor_parallel_size=1,
disable_log_stats=True,
dtype="bfloat16",
) as vllm_model:
llm = vllm_model.get_llm()
outputs = llm.classify(prompts)
for output in outputs:
@@ -38,19 +40,20 @@ def test_idefics_multimodal(
def update_config(config):
config.text_config.update({
"architectures": ["Gemma3ForSequenceClassification"],
"classifier_from_token": ["A", "B", "C", "D", "E"],
"method":
"no_post_processing",
"id2label": {
"A": "Chair",
"B": "Couch",
"C": "Table",
"D": "Bed",
"E": "Cupboard"
},
})
config.text_config.update(
{
"architectures": ["Gemma3ForSequenceClassification"],
"classifier_from_token": ["A", "B", "C", "D", "E"],
"method": "no_post_processing",
"id2label": {
"A": "Chair",
"B": "Couch",
"C": "Table",
"D": "Bed",
"E": "Cupboard",
},
}
)
return config
@@ -63,11 +66,10 @@ def test_gemma_multimodal(
# switch to use ROCm CK FA backend
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
messages = [{
"role":
"system",
"content":
"""
messages = [
{
"role": "system",
"content": """
You are a helpful assistant. You will be given a product description
which may also include an image. Classify the following product into
one of the categories:
@@ -78,38 +80,39 @@ def test_gemma_multimodal(
D = bed
E = cupboard
You'll answer with exactly one letter (A, B, C, D, or E)."""
}, {
"role":
"user",
"content": [{
"type": "image_url",
"image_url": {
"url":
"https://upload.wikimedia.org/wikipedia/commons/c/c6/Set_of_fourteen_side_chairs_MET_DP110780.jpg"
}
}, {
"type": "text",
"text": "A fine 19th century piece of furniture."
}]
}]
with vllm_runner(model_name="google/gemma-3-4b-it",
runner="pooling",
task="classify",
convert="classify",
load_format="auto",
hf_overrides=update_config,
pooler_config=PoolerConfig(pooling_type="LAST"),
max_model_len=512,
enforce_eager=True,
tensor_parallel_size=1,
disable_log_stats=True,
dtype="bfloat16") as vllm_model:
You'll answer with exactly one letter (A, B, C, D, or E).""",
},
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/c/c6/Set_of_fourteen_side_chairs_MET_DP110780.jpg"
},
},
{"type": "text", "text": "A fine 19th century piece of furniture."},
],
},
]
with vllm_runner(
model_name="google/gemma-3-4b-it",
runner="pooling",
task="classify",
convert="classify",
load_format="auto",
hf_overrides=update_config,
pooler_config=PoolerConfig(pooling_type="LAST"),
max_model_len=512,
enforce_eager=True,
tensor_parallel_size=1,
disable_log_stats=True,
dtype="bfloat16",
) as vllm_model:
llm = vllm_model.get_llm()
prompts = llm.preprocess_chat(messages)
result = llm.classify(prompts)
assert result[0].outputs.probs[0] > 0.95
assert all(c < 0.05 for c in result[0].outputs.probs[1:])
assert all(c < 0.05 for c in result[0].outputs.probs[1:])


@@ -20,14 +20,15 @@ def test_classify_models(
with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.classify(example_prompts)
with hf_runner(model,
dtype=dtype,
auto_cls=AutoModelForSequenceClassification) as hf_model:
with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
) as hf_model:
hf_outputs = hf_model.classify(example_prompts)
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = torch.tensor(hf_output)
vllm_output = torch.tensor(vllm_output)
assert torch.allclose(hf_output, vllm_output,
1e-3 if dtype == "float" else 1e-2)
assert torch.allclose(
hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
)


@@ -7,10 +7,10 @@ from ...utils import EmbedModelInfo
MODELS = [
EmbedModelInfo("nomic-ai/nomic-embed-text-v1"),
#EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"),
#EmbedModelInfo("nomic-ai/CodeRankEmbed"),
# EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"),
# EmbedModelInfo("nomic-ai/CodeRankEmbed"),
EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe"),
#EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"),
# EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"),
]
rope_theta = 1000
@@ -21,23 +21,24 @@ max_model_len = int(original_max_position_embeddings * factor)
@pytest.mark.parametrize("model_info", MODELS)
def test_default(model_info, vllm_runner):
with vllm_runner(model_info.name, runner="pooling",
max_model_len=None) as vllm_model:
with vllm_runner(
model_info.name, runner="pooling", max_model_len=None
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
# For nomic-embed-text-v2-moe the length is set to 512
# by sentence_bert_config.json.
assert model_config.max_model_len == 512
else:
assert (
model_config.max_model_len == original_max_position_embeddings)
assert model_config.max_model_len == original_max_position_embeddings
@pytest.mark.parametrize("model_info", MODELS)
def test_set_max_model_len_legal(model_info, vllm_runner):
# set max_model_len <= 512
with vllm_runner(model_info.name, runner="pooling",
max_model_len=256) as vllm_model:
with vllm_runner(
model_info.name, runner="pooling", max_model_len=256
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 256
@@ -46,13 +47,12 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
# For nomic-embed-text-v2-moe the length is set to 512
# by sentence_bert_config.json.
with pytest.raises(ValueError):
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=1024):
with vllm_runner(model_info.name, runner="pooling", max_model_len=1024):
pass
else:
with vllm_runner(model_info.name, runner="pooling",
max_model_len=1024) as vllm_model:
with vllm_runner(
model_info.name, runner="pooling", max_model_len=1024
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 1024
@@ -61,17 +61,18 @@ def test_set_max_model_len_illegal(model_info, vllm_runner):
def test_set_max_model_len_illegal(model_info, vllm_runner):
# set max_model_len > 2048
with pytest.raises(ValueError):
with vllm_runner(model_info.name, runner="pooling",
max_model_len=4096):
with vllm_runner(model_info.name, runner="pooling", max_model_len=4096):
pass
# set max_model_len > 2048 by hf_overrides
hf_overrides = {"max_model_len": 4096}
with pytest.raises(ValueError):
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides):
with vllm_runner(
model_info.name,
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides,
):
pass
@ -82,16 +83,14 @@ def test_use_rope_scaling_legal(model_info, vllm_runner):
"rope_scaling": {
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings":
original_max_position_embeddings
"original_max_position_embeddings": original_max_position_embeddings,
},
"max_model_len": max_model_len
"max_model_len": max_model_len,
}
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides):
with vllm_runner(
model_info.name, runner="pooling", max_model_len=None, hf_overrides=hf_overrides
):
pass
@ -102,16 +101,17 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
"rope_scaling": {
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings":
original_max_position_embeddings
}
"original_max_position_embeddings": original_max_position_embeddings,
},
}
# illegal max_model_len
with pytest.raises(ValueError):
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=max_model_len + 1,
hf_overrides=hf_overrides):
with vllm_runner(
model_info.name,
runner="pooling",
max_model_len=max_model_len + 1,
hf_overrides=hf_overrides,
):
pass
hf_overrides = {
@ -119,15 +119,16 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
"rope_scaling": {
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings":
original_max_position_embeddings
"original_max_position_embeddings": original_max_position_embeddings,
},
"max_model_len": max_model_len + 1
"max_model_len": max_model_len + 1,
}
# illegal max_model_len by hf_overrides
with pytest.raises(ValueError):
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides):
with vllm_runner(
model_info.name,
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides,
):
pass

View File

@ -10,10 +10,7 @@ from vllm.config import PoolerConfig
@pytest.mark.parametrize(
"model",
[
"jason9693/Qwen2.5-1.5B-apeach",
"papluca/xlm-roberta-base-language-detection"
],
["jason9693/Qwen2.5-1.5B-apeach", "papluca/xlm-roberta-base-language-detection"],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_classify_models_using_activation(
@ -23,30 +20,32 @@ def test_classify_models_using_activation(
model: str,
dtype: str,
) -> None:
with vllm_runner(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(activation=False)) as vllm_model:
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(activation=False),
) as vllm_model:
wo_activation_out = vllm_model.classify(example_prompts)
with vllm_runner(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(activation=True)) as vllm_model:
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(activation=True),
) as vllm_model:
w_activation_out = vllm_model.classify(example_prompts)
for wo_activation, w_activation in zip(wo_activation_out,
w_activation_out):
for wo_activation, w_activation in zip(wo_activation_out, w_activation_out):
wo_activation = torch.tensor(wo_activation)
w_activation = torch.tensor(w_activation)
assert not torch.allclose(wo_activation, w_activation,
atol=1e-2), "pooler_config is not working"
assert torch.allclose(softmax(wo_activation), w_activation,
1e-3 if dtype == "float" else 1e-2)
assert not torch.allclose(wo_activation, w_activation, atol=1e-2), (
"pooler_config is not working"
)
assert torch.allclose(
softmax(wo_activation), w_activation, 1e-3 if dtype == "float" else 1e-2
)
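The two runs differ only in whether the classification head applies its final activation, so softmax over the raw scores must reproduce the activated output. The identity on toy numbers:

import torch

logits = torch.tensor([2.0, 0.5, -1.0])      # activation=False output
probs = torch.softmax(logits, dim=-1)        # activation=True output
assert not torch.allclose(logits, probs, atol=1e-2)
assert torch.allclose(probs.sum(), torch.tensor(1.0))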
@pytest.mark.parametrize(
@ -63,26 +62,28 @@ def test_embed_models_using_normalize(
model: str,
dtype: str,
) -> None:
with vllm_runner(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=False)) as vllm_model:
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=False),
) as vllm_model:
wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
with vllm_runner(model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=True)) as vllm_model:
with vllm_runner(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=True),
) as vllm_model:
w_normalize = torch.tensor(vllm_model.embed(example_prompts))
assert not torch.allclose(
wo_normalize, w_normalize,
atol=1e-2), "pooler_config normalize is not working"
assert not torch.allclose(wo_normalize, w_normalize, atol=1e-2), (
"pooler_config normalize is not working"
)
assert torch.allclose(
F.normalize(wo_normalize, p=2, dim=-1), w_normalize,
atol=1e-2), "w_normal should be close to normal(wo_normal)."
F.normalize(wo_normalize, p=2, dim=-1), w_normalize, atol=1e-2
), "w_normal should be close to normal(wo_normal)."
@pytest.mark.parametrize(
@ -99,25 +100,26 @@ def test_reward_models_using_softmax(
model: str,
dtype: str,
) -> None:
with vllm_runner(model,
max_model_len=1024,
dtype=dtype,
pooler_config=PoolerConfig(softmax=False)) as vllm_model:
with vllm_runner(
model,
max_model_len=1024,
dtype=dtype,
pooler_config=PoolerConfig(softmax=False),
) as vllm_model:
wo_softmax = vllm_model.encode(example_prompts)
with vllm_runner(model,
max_model_len=1024,
dtype=dtype,
pooler_config=PoolerConfig(softmax=True)) as vllm_model:
with vllm_runner(
model, max_model_len=1024, dtype=dtype, pooler_config=PoolerConfig(softmax=True)
) as vllm_model:
w_softmax = vllm_model.encode(example_prompts)
for wo, w in zip(wo_softmax, w_softmax):
wo = torch.tensor(wo)
w = torch.tensor(w)
assert not torch.allclose(
wo, w, atol=1e-2), "pooler_config softmax is not working"
assert torch.allclose(
softmax(wo), w,
atol=1e-2), "w_softmax should be close to softmax(wo_softmax)."
assert not torch.allclose(wo, w, atol=1e-2), (
"pooler_config softmax is not working"
)
assert torch.allclose(softmax(wo), w, atol=1e-2), (
"w_softmax should be close to softmax(wo_softmax)."
)
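And for reward models, the softmax=True encoding is just softmax applied to the softmax=False encoding:

import torch

wo = torch.tensor([1.0, 3.0])                # softmax=False encoding
w = torch.softmax(wo, dim=-1)                # ~[0.119, 0.881]
assert not torch.allclose(wo, w, atol=1e-2)
assert torch.allclose(w.sum(), torch.tensor(1.0))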

View File

@ -16,10 +16,8 @@ from ...utils import check_transformers_version
def math_step_prompts():
# ruff: noqa: E501
data = {
"system":
"Please reason step by step, and put your final answer within \\boxed{}. ",
"query":
"Sue lives in a fun neighborhood. One weekend, the neighbors decided to play a prank on Sue. On Friday morning, the neighbors placed 18 pink plastic flamingos out on Sue's front yard. On Saturday morning, the neighbors took back one third of the flamingos, painted them white, and put these newly painted white flamingos back out on Sue's front yard. Then, on Sunday morning, they added another 18 pink plastic flamingos to the collection. At noon on Sunday, how many more pink plastic flamingos were out than white plastic flamingos?",
"system": "Please reason step by step, and put your final answer within \\boxed{}. ",
"query": "Sue lives in a fun neighborhood. One weekend, the neighbors decided to play a prank on Sue. On Friday morning, the neighbors placed 18 pink plastic flamingos out on Sue's front yard. On Saturday morning, the neighbors took back one third of the flamingos, painted them white, and put these newly painted white flamingos back out on Sue's front yard. Then, on Sunday morning, they added another 18 pink plastic flamingos to the collection. At noon on Sunday, how many more pink plastic flamingos were out than white plastic flamingos?",
"response": [
"To find out how many more pink plastic flamingos were out than white plastic flamingos at noon on Sunday, we can break down the problem into steps. First, on Friday, the neighbors start with 18 pink plastic flamingos.",
"On Saturday, they take back one third of the flamingos. Since there were 18 flamingos, (1/3 \\times 18 = 6) flamingos are taken back. So, they have (18 - 6 = 12) flamingos left in their possession. Then, they paint these 6 flamingos white and put them back out on Sue's front yard. Now, Sue has the original 12 pink flamingos plus the 6 new white ones. Thus, by the end of Saturday, Sue has (12 + 6 = 18) pink flamingos and 6 white flamingos.",
@ -27,16 +25,16 @@ def math_step_prompts():
"To find the difference, subtract the number of white flamingos from the number of pink flamingos: (36 - 6 = 30). Therefore, at noon on Sunday, there were 30 more pink plastic flamingos out than white plastic flamingos. The answer is (\\boxed{30}).",
],
}
answer = "<extra_0>".join(data['response']) + "<extra_0>"
answer = "<extra_0>".join(data["response"]) + "<extra_0>"
prompt = f"<im_start>system\n{data['system']}<im_end>\n<im_start>user\n{data['query']}<im_end>\n<im_start>assistant\n{answer}<im_end><|endoftext|>"
return [prompt]
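The prompt builder above appends the <extra_0> separator after every reasoning step so the PRM emits one reward per step. The construction in isolation, on toy strings:

steps = ["step one", "step two"]
answer = "<extra_0>".join(steps) + "<extra_0>"
assert answer == "step one<extra_0>step two<extra_0>"
assert answer.count("<extra_0>") == len(steps)  # one reward site per step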
def step_reward_patch_hf_model(hf_model: HfRunner):
# Patch the hf_runner to use the step reward function
def make_step_rewards(logits: torch.Tensor,
token_masks: torch.Tensor) -> list[list[float]]:
def make_step_rewards(
logits: torch.Tensor, token_masks: torch.Tensor
) -> list[list[float]]:
probabilities = F.softmax(logits, dim=-1)
probabilities = probabilities * token_masks.unsqueeze(-1)
@ -54,7 +52,7 @@ def step_reward_patch_hf_model(hf_model: HfRunner):
outputs = hf_model.model(input_ids=input_ids)
step_sep_id = hf_model.tokenizer.encode("<extra_0>")[0]
token_masks = (input_ids == step_sep_id)
token_masks = input_ids == step_sep_id
return make_step_rewards(outputs[0], token_masks)
hf_model.reward = reward # type: ignore[attr-defined]
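A minimal sketch of the masking trick used by make_step_rewards, assuming a binary PRM head where index 1 is the "step is correct" class (the gather step is elided by this diff, so treat the last lines as illustrative):

import torch
import torch.nn.functional as F

logits = torch.randn(1, 6, 2)                        # (batch, seq, num_labels)
token_masks = torch.tensor([[0, 0, 1, 0, 0, 1]], dtype=torch.bool)
probs = F.softmax(logits, dim=-1) * token_masks.unsqueeze(-1)
step_rewards = probs[0, token_masks[0], 1].tolist()  # one score per <extra_0>
assert len(step_rewards) == int(token_masks.sum())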
@ -65,8 +63,10 @@ def step_reward_patch_hf_model(hf_model: HfRunner):
@pytest.mark.parametrize(
"model",
[
pytest.param("Qwen/Qwen2.5-Math-PRM-7B",
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
pytest.param(
"Qwen/Qwen2.5-Math-PRM-7B",
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
],
)
@pytest.mark.parametrize("dtype", ["half"])
@ -78,8 +78,9 @@ def test_prm_models(
dtype: str,
monkeypatch,
) -> None:
check_transformers_version("Qwen/Qwen2.5-Math-PRM-7B",
max_transformers_version="4.53.2")
check_transformers_version(
"Qwen/Qwen2.5-Math-PRM-7B", max_transformers_version="4.53.2"
)
if current_platform.is_cpu():
pytest.skip("CPU only supports V1")

View File

@ -37,10 +37,9 @@ def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name):
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict([text_pair]).tolist()
with vllm_runner(model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
with vllm_runner(
model_name, runner="pooling", dtype=DTYPE, max_model_len=None
) as vllm_model:
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
assert len(vllm_outputs) == 1
@ -58,10 +57,9 @@ def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name):
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict(text_pairs).tolist()
with vllm_runner(model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
with vllm_runner(
model_name, runner="pooling", dtype=DTYPE, max_model_len=None
) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
assert len(vllm_outputs) == 2
@ -80,10 +78,9 @@ def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name):
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict(text_pairs).tolist()
with vllm_runner(model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
with vllm_runner(
model_name, runner="pooling", dtype=DTYPE, max_model_len=None
) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
assert len(vllm_outputs) == 2
@ -101,17 +98,15 @@ def emb_model_name(request):
def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name):
text_pair = [TEXTS_1[0], TEXTS_2[0]]
with hf_runner(emb_model_name, dtype=DTYPE,
is_sentence_transformer=True) as hf_model:
with hf_runner(
emb_model_name, dtype=DTYPE, is_sentence_transformer=True
) as hf_model:
hf_embeddings = hf_model.encode(text_pair)
hf_outputs = [
F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)
]
hf_outputs = [F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)]
with vllm_runner(emb_model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
with vllm_runner(
emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
) as vllm_model:
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
assert len(vllm_outputs) == 1
@ -126,20 +121,18 @@ def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name):
[TEXTS_1[0], TEXTS_2[1]],
]
with hf_runner(emb_model_name, dtype=DTYPE,
is_sentence_transformer=True) as hf_model:
hf_embeddings = [
hf_model.encode(text_pair) for text_pair in text_pairs
]
with hf_runner(
emb_model_name, dtype=DTYPE, is_sentence_transformer=True
) as hf_model:
hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs]
hf_outputs = [
F.cosine_similarity(*map(torch.tensor, pair), dim=0)
for pair in hf_embeddings
]
with vllm_runner(emb_model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
with vllm_runner(
emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
assert len(vllm_outputs) == 2
@ -155,20 +148,18 @@ def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name):
[TEXTS_1[1], TEXTS_2[1]],
]
with hf_runner(emb_model_name, dtype=DTYPE,
is_sentence_transformer=True) as hf_model:
hf_embeddings = [
hf_model.encode(text_pair) for text_pair in text_pairs
]
with hf_runner(
emb_model_name, dtype=DTYPE, is_sentence_transformer=True
) as hf_model:
hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs]
hf_outputs = [
F.cosine_similarity(*map(torch.tensor, pair), dim=0)
for pair in hf_embeddings
]
with vllm_runner(emb_model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
with vllm_runner(
emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
assert len(vllm_outputs) == 2

View File

@ -21,9 +21,9 @@ def test_models(
with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.encode(example_prompts)
with hf_runner(model,
dtype=dtype,
auto_cls=AutoModelForTokenClassification) as hf_model:
with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForTokenClassification
) as hf_model:
tokenizer = hf_model.tokenizer
hf_outputs = []
for prompt in example_prompts:

View File

@ -20,51 +20,57 @@ calculus, each contributing unique perspectives that would shape this new
field."""
def test_smaller_truncation_size(vllm_runner,
model_name=MODEL_NAME,
input_str=input_str):
def test_smaller_truncation_size(
vllm_runner, model_name=MODEL_NAME, input_str=input_str
):
truncate_prompt_tokens = 10
with vllm_runner(model_name, runner="pooling",
max_model_len=max_model_len) as vllm_model:
with vllm_runner(
model_name, runner="pooling", max_model_len=max_model_len
) as vllm_model:
vllm_output = vllm_model.llm.embed(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
input_str, truncate_prompt_tokens=truncate_prompt_tokens
)
prompt_tokens = vllm_output[0].prompt_token_ids
assert len(prompt_tokens) == truncate_prompt_tokens
def test_max_truncation_size(vllm_runner,
model_name=MODEL_NAME,
input_str=input_str):
def test_max_truncation_size(vllm_runner, model_name=MODEL_NAME, input_str=input_str):
truncate_prompt_tokens = -1
with vllm_runner(model_name, runner="pooling",
max_model_len=max_model_len) as vllm_model:
with vllm_runner(
model_name, runner="pooling", max_model_len=max_model_len
) as vllm_model:
vllm_output = vllm_model.llm.embed(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
input_str, truncate_prompt_tokens=truncate_prompt_tokens
)
prompt_tokens = vllm_output[0].prompt_token_ids
assert len(prompt_tokens) == max_model_len
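Together these two tests pin down the truncation contract: a positive truncate_prompt_tokens keeps that many tokens, and -1 truncates to max_model_len. The rule on plain lists (a sketch, not vLLM internals):

max_model_len = 128

def truncate(token_ids: list[int], truncate_prompt_tokens: int) -> list[int]:
    limit = max_model_len if truncate_prompt_tokens == -1 else truncate_prompt_tokens
    return token_ids[-limit:]

assert len(truncate(list(range(512)), 10)) == 10
assert len(truncate(list(range(512)), -1)) == max_model_len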
def test_bigger_truncation_size(vllm_runner,
model_name=MODEL_NAME,
input_str=input_str):
def test_bigger_truncation_size(
vllm_runner, model_name=MODEL_NAME, input_str=input_str
):
truncate_prompt_tokens = max_model_len + 1
with pytest.raises(ValueError), vllm_runner(
model_name, runner="pooling",
max_model_len=max_model_len) as vllm_model:
with (
pytest.raises(ValueError),
vllm_runner(
model_name, runner="pooling", max_model_len=max_model_len
) as vllm_model,
):
llm_output = vllm_model.llm.embed(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
input_str, truncate_prompt_tokens=truncate_prompt_tokens
)
assert llm_output == f"""truncate_prompt_tokens value
assert (
llm_output
== f"""truncate_prompt_tokens value
({truncate_prompt_tokens}) is greater than
max_model_len ({max_model_len}). Please, select
a smaller truncation size."""
)

View File

@ -12,8 +12,7 @@ import requests
import torch
import tests.ci_envs as ci_envs
from tests.models.utils import (EmbedModelInfo, RerankModelInfo,
check_embeddings_close)
from tests.models.utils import EmbedModelInfo, RerankModelInfo, check_embeddings_close
# Most embedding models on the STS12 task (See #17175):
# - Model implementation and minor changes in tensor dtype
@ -30,7 +29,6 @@ MTEB_RERANK_TOL = 2e-3
class VllmMtebEncoder(mteb.Encoder):
def __init__(self, vllm_model):
super().__init__()
self.llm = vllm_model
@ -53,8 +51,7 @@ class VllmMtebEncoder(mteb.Encoder):
def predict(
self,
sentences: list[tuple[str, str,
Optional[str]]], # query, corpus, prompt
sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt
*args,
**kwargs,
) -> np.ndarray:
@ -64,17 +61,15 @@ class VllmMtebEncoder(mteb.Encoder):
queries = [s[0] for s in sentences]
corpus = [s[1] for s in sentences]
outputs = self.llm.score(queries,
corpus,
truncate_prompt_tokens=-1,
use_tqdm=False)
outputs = self.llm.score(
queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
)
scores = np.array(outputs)
scores = scores[np.argsort(r)]
return scores
class OpenAIClientMtebEncoder(mteb.Encoder):
def __init__(self, model_name: str, client):
super().__init__()
self.model_name = model_name
@ -87,8 +82,9 @@ class OpenAIClientMtebEncoder(mteb.Encoder):
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
embeddings = self.client.embeddings.create(model=self.model_name,
input=sentences)
embeddings = self.client.embeddings.create(
model=self.model_name, input=sentences
)
outputs = [d.embedding for d in embeddings.data]
embeds = np.array(outputs)
embeds = embeds[np.argsort(r)]
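The permutation dance above (shuffle the inputs, run the model, invert with argsort) returns results aligned with the original input order. The round trip in isolation:

import numpy as np

rng = np.random.default_rng(0)
sentences = ["a", "b", "c", "d"]
r = rng.permutation(len(sentences))
outputs = np.array([sentences[i].upper() for i in r])  # stand-in for the model
restored = outputs[np.argsort(r)]
assert list(restored) == ["A", "B", "C", "D"]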
@ -96,7 +92,6 @@ class OpenAIClientMtebEncoder(mteb.Encoder):
class ScoreClientMtebEncoder(mteb.Encoder):
def __init__(self, model_name: str, url):
super().__init__()
self.model_name = model_name
@ -105,8 +100,7 @@ class ScoreClientMtebEncoder(mteb.Encoder):
def predict(
self,
sentences: list[tuple[str, str,
Optional[str]]], # query, corpus, prompt
sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt
*args,
**kwargs,
) -> np.ndarray:
@ -122,27 +116,30 @@ class ScoreClientMtebEncoder(mteb.Encoder):
return scores
def get_score(self, query, corpus):
response = requests.post(self.url,
json={
"model": self.model_name,
"text_1": query,
"text_2": corpus,
"truncate_prompt_tokens": -1,
}).json()
return response['data'][0]["score"]
response = requests.post(
self.url,
json={
"model": self.model_name,
"text_1": query,
"text_2": corpus,
"truncate_prompt_tokens": -1,
},
).json()
return response["data"][0]["score"]
class RerankClientMtebEncoder(ScoreClientMtebEncoder):
def get_score(self, query, corpus):
response = requests.post(self.url,
json={
"model": self.model_name,
"query": query,
"documents": [corpus],
"truncate_prompt_tokens": -1,
}).json()
return response['results'][0]["relevance_score"]
response = requests.post(
self.url,
json={
"model": self.model_name,
"query": query,
"documents": [corpus],
"truncate_prompt_tokens": -1,
},
).json()
return response["results"][0]["relevance_score"]
def run_mteb_embed_task(encoder, tasks):
@ -161,12 +158,14 @@ def run_mteb_embed_task(encoder, tasks):
return main_score
def mteb_test_embed_models(hf_runner,
vllm_runner,
model_info: EmbedModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None,
atol=MTEB_EMBED_TOL):
def mteb_test_embed_models(
hf_runner,
vllm_runner,
model_info: EmbedModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None,
atol=MTEB_EMBED_TOL,
):
# A model family has many models with the same architecture,
# and we don't need to test each one.
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
@ -187,15 +186,15 @@ def mteb_test_embed_models(hf_runner,
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
if "hf_overrides" not in vllm_extra_kwargs:
vllm_extra_kwargs["hf_overrides"] = {}
vllm_extra_kwargs["hf_overrides"][
"head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,
enforce_eager=True,
**vllm_extra_kwargs) as vllm_model:
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
with vllm_runner(
model_info.name,
runner="pooling",
max_model_len=None,
enforce_eager=True,
**vllm_extra_kwargs,
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
# Confirm whether vllm is using the correct architecture
@ -204,28 +203,29 @@ def mteb_test_embed_models(hf_runner,
# Confirm whether vllm uses the correct default_pooling_type, which
# relates to whether chunked prefill and prefix caching are enabled
assert (model_config._model_info.default_pooling_type ==
model_info.default_pooling_type)
assert (
model_config._model_info.default_pooling_type
== model_info.default_pooling_type
)
vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
MTEB_EMBED_TASKS)
vllm_main_score = run_mteb_embed_task(
VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
)
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
head_dtype = model_config.head_dtype
# Test embed dims, NaN outputs, and whether normalization is applied
vllm_outputs = vllm_model.embed(example_prompts,
truncate_prompt_tokens=-1)
vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
assert not torch.any(torch.isnan(torch.tensor(vllm_outputs)))
# Accelerate mteb test by setting
# SentenceTransformers mteb score to a constant
if model_info.mteb_score is None:
with hf_runner(
model_info.name,
is_sentence_transformer=True,
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
model_info.name,
is_sentence_transformer=True,
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
) as hf_model:
# e.g. setting default parameters for the encode method of hf_runner
if hf_model_callback is not None:
hf_model_callback(hf_model)
@ -247,8 +247,7 @@ def mteb_test_embed_models(hf_runner,
st_dtype = "Constant"
print("Model:", model_info.name)
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}",
vllm_main_score)
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
print("SentenceTransformers:", st_dtype, st_main_score)
print("Difference:", st_main_score - vllm_main_score)
@ -282,26 +281,21 @@ def run_mteb_rerank(cross_encoder, tasks, languages):
top_k=10,
save_predictions=True,
output_folder=f"{results_folder}/stage2",
previous_results=
f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json",
previous_results=f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json",
encode_kwargs={"show_progress_bar": False},
)
main_score = results[0].scores["test"][0]["main_score"]
return main_score
def mteb_test_rerank_models_hf(hf_runner,
model_name,
hf_dtype="float32",
hf_model_callback=None):
with hf_runner(model_name, is_cross_encoder=True,
dtype=hf_dtype) as hf_model:
def mteb_test_rerank_models_hf(
hf_runner, model_name, hf_dtype="float32", hf_model_callback=None
):
with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model:
original_predict = hf_model.predict
def _predict(
sentences: list[tuple[str, str,
Optional[str]]], # query, corpus, prompt
sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt
*args,
**kwargs,
):
@ -315,20 +309,22 @@ def mteb_test_rerank_models_hf(hf_runner,
if hf_model_callback is not None:
hf_model_callback(hf_model)
st_main_score = run_mteb_rerank(hf_model,
tasks=MTEB_RERANK_TASKS,
languages=MTEB_RERANK_LANGS)
st_main_score = run_mteb_rerank(
hf_model, tasks=MTEB_RERANK_TASKS, languages=MTEB_RERANK_LANGS
)
st_dtype = next(hf_model.model.model.parameters()).dtype
return st_main_score, st_dtype
def mteb_test_rerank_models(hf_runner,
vllm_runner,
model_info: RerankModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None,
vllm_mteb_encoder=VllmMtebEncoder,
atol=MTEB_RERANK_TOL):
def mteb_test_rerank_models(
hf_runner,
vllm_runner,
model_info: RerankModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None,
vllm_mteb_encoder=VllmMtebEncoder,
atol=MTEB_RERANK_TOL,
):
# A model family has many models with the same architecture,
# and we don't need to test each one.
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
@ -346,33 +342,37 @@ def mteb_test_rerank_models(hf_runner,
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
if "hf_overrides" not in vllm_extra_kwargs:
vllm_extra_kwargs["hf_overrides"] = {}
vllm_extra_kwargs["hf_overrides"][
"head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,
max_num_seqs=8,
enforce_eager=True,
**vllm_extra_kwargs) as vllm_model:
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
with vllm_runner(
model_info.name,
runner="pooling",
max_model_len=None,
max_num_seqs=8,
enforce_eager=True,
**vllm_extra_kwargs,
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
# Confirm whether vllm is using the correct architecture
if model_info.architecture:
assert (model_info.architecture in model_config.architectures)
assert model_info.architecture in model_config.architectures
# Score API is only enabled for num_labels == 1
assert model_config.hf_config.num_labels == 1
# Confirm whether vllm uses the correct default_pooling_type, which
# relates to whether chunked prefill and prefix caching are enabled
assert (model_config._model_info.default_pooling_type ==
model_info.default_pooling_type)
assert (
model_config._model_info.default_pooling_type
== model_info.default_pooling_type
)
vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
tasks=MTEB_RERANK_TASKS,
languages=MTEB_RERANK_LANGS)
vllm_main_score = run_mteb_rerank(
vllm_mteb_encoder(vllm_model),
tasks=MTEB_RERANK_TASKS,
languages=MTEB_RERANK_LANGS,
)
vllm_dtype = model_config.dtype
head_dtype = model_config.head_dtype
@ -380,14 +380,14 @@ def mteb_test_rerank_models(hf_runner,
# SentenceTransformers mteb score to a constant
if model_info.mteb_score is None:
st_main_score, st_dtype = mteb_test_rerank_models_hf(
hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback)
hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback
)
else:
st_main_score = model_info.mteb_score
st_dtype = "Constant"
print("Model:", model_info.name)
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}",
vllm_main_score)
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
print("SentenceTransformers:", st_dtype, st_main_score)
print("Difference:", st_main_score - vllm_main_score)

View File

@ -2,67 +2,76 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.models.language.pooling.embed_utils import (
correctness_test_embed_models)
from tests.models.utils import (CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo, RerankModelInfo)
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo("BAAI/bge-base-en",
architecture="BertModel",
mteb_score=0.779336792,
enable_test=True),
CLSPoolingEmbedModelInfo("BAAI/bge-base-zh",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-small-en",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-small-zh",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-large-en",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-large-zh",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-large-zh-noinstruct",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-base-en-v1.5",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-base-zh-v1.5",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-small-en-v1.5",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-small-zh-v1.5",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-large-en-v1.5",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-large-zh-v1.5",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo(
"BAAI/bge-base-en",
architecture="BertModel",
mteb_score=0.779336792,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-base-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-en", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-en", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False
),
########## XLMRobertaModel
CLSPoolingEmbedModelInfo("BAAI/bge-m3",
architecture="XLMRobertaModel",
mteb_score=0.787343078,
enable_test=True),
CLSPoolingEmbedModelInfo(
"BAAI/bge-m3",
architecture="XLMRobertaModel",
mteb_score=0.787343078,
enable_test=True,
),
########## Qwen2Model
LASTPoolingEmbedModelInfo("BAAI/bge-code-v1",
architecture="Qwen2Model",
mteb_score=0.75724465,
dtype="float32",
enable_test=True),
LASTPoolingEmbedModelInfo(
"BAAI/bge-code-v1",
architecture="Qwen2Model",
mteb_score=0.75724465,
dtype="float32",
enable_test=True,
),
]
RERANK_MODELS = [
@ -71,33 +80,35 @@ RERANK_MODELS = [
"BAAI/bge-reranker-base",
architecture="XLMRobertaForSequenceClassification",
mteb_score=0.32398,
enable_test=True),
enable_test=True,
),
CLSPoolingRerankModelInfo(
"BAAI/bge-reranker-large",
architecture="XLMRobertaForSequenceClassification",
enable_test=False),
enable_test=False,
),
CLSPoolingRerankModelInfo(
"BAAI/bge-reranker-v2-m3",
architecture="XLMRobertaForSequenceClassification",
enable_test=False)
enable_test=False,
),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
example_prompts)
def test_embed_models_correctness(
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner,
model_info: RerankModelInfo) -> None:
def test_rerank_models_mteb(
hf_runner, vllm_runner, model_info: RerankModelInfo
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)

View File

@ -8,53 +8,50 @@ import torch
from tests.conftest import HfRunner
from tests.models.language.pooling_mteb_test.mteb_utils import (
VllmMtebEncoder, mteb_test_rerank_models)
VllmMtebEncoder,
mteb_test_rerank_models,
)
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
RERANK_MODELS = [
LASTPoolingRerankModelInfo("BAAI/bge-reranker-v2-gemma",
architecture="GemmaForSequenceClassification",
mteb_score=0.33757,
hf_overrides={
"architectures":
["GemmaForSequenceClassification"],
"classifier_from_token": ["Yes"],
"method":
"no_post_processing",
}),
LASTPoolingRerankModelInfo(
"BAAI/bge-reranker-v2-gemma",
architecture="GemmaForSequenceClassification",
mteb_score=0.33757,
hf_overrides={
"architectures": ["GemmaForSequenceClassification"],
"classifier_from_token": ["Yes"],
"method": "no_post_processing",
},
),
]
PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." # noqa: E501
class GemmaRerankerHfRunner(HfRunner):
def __init__(self,
model_name: str,
dtype: str = "auto",
*args: Any,
**kwargs: Any) -> None:
def __init__(
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
) -> None:
from transformers import AutoModelForCausalLM, AutoTokenizer
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
self.tokenizer = AutoTokenizer.from_pretrained(model_name,
padding_side='left')
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
self.yes_loc = self.tokenizer.convert_tokens_to_ids("Yes")
@torch.no_grad()
def predict(self, prompts: list[list[str]], *args,
**kwargs) -> torch.Tensor:
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
def get_inputs(pairs, tokenizer, prompt=None):
if prompt is None:
prompt = PROMPT
sep = "\n"
prompt_inputs = tokenizer(prompt,
return_tensors=None,
add_special_tokens=False)["input_ids"]
sep_inputs = tokenizer(sep,
return_tensors=None,
add_special_tokens=False)["input_ids"]
prompt_inputs = tokenizer(
prompt, return_tensors=None, add_special_tokens=False
)["input_ids"]
sep_inputs = tokenizer(sep, return_tensors=None, add_special_tokens=False)[
"input_ids"
]
inputs = []
for query, passage in pairs:
query_inputs = tokenizer(
@ -78,8 +75,7 @@ class GemmaRerankerHfRunner(HfRunner):
return_token_type_ids=False,
add_special_tokens=False,
)
item["input_ids"] = item[
"input_ids"] + sep_inputs + prompt_inputs
item["input_ids"] = item["input_ids"] + sep_inputs + prompt_inputs
item["attention_mask"] = [1] * len(item["input_ids"])
inputs.append(item)
return tokenizer.pad(
@ -95,14 +91,19 @@ class GemmaRerankerHfRunner(HfRunner):
inputs = inputs.to(self.model.device)
_n_tokens = inputs["input_ids"].shape[1]
logits = self.model(**inputs, return_dict=True).logits
_scores = (logits[:, -1,
self.yes_loc].view(-1, ).float().sigmoid())
_scores = (
logits[:, -1, self.yes_loc]
.view(
-1,
)
.float()
.sigmoid()
)
scores.append(_scores[0].item())
return torch.Tensor(scores)
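The reranker above reads relevance straight off the LM head: the logit of the "Yes" token at the last position, squashed with a sigmoid. The same reduction on a toy tensor:

import torch

yes_loc = 3                                  # vocab id of "Yes" (toy value)
logits = torch.randn(2, 5, 10)               # (batch, seq, vocab)
scores = logits[:, -1, yes_loc].view(-1).float().sigmoid()
assert scores.shape == (2,)
assert bool(((scores >= 0) & (scores <= 1)).all())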
class GemmaMtebEncoder(VllmMtebEncoder):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.query_template = "A: {query}\n"
@ -110,12 +111,10 @@ class GemmaMtebEncoder(VllmMtebEncoder):
def predict(
self,
sentences: list[tuple[str, str,
Optional[str]]], # query, corpus, prompt
sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt
*args,
**kwargs,
) -> np.ndarray:
_sentences = []
for query, corpus, prompt in sentences:
query = self.query_template.format(query=query)
@ -127,8 +126,9 @@ class GemmaMtebEncoder(VllmMtebEncoder):
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(GemmaRerankerHfRunner,
vllm_runner,
model_info,
vllm_mteb_encoder=GemmaMtebEncoder)
mteb_test_rerank_models(
GemmaRerankerHfRunner,
vllm_runner,
model_info,
vllm_mteb_encoder=GemmaMtebEncoder,
)

View File

@ -2,22 +2,30 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.models.utils import (CLSPoolingRerankModelInfo,
LASTPoolingRerankModelInfo, RerankModelInfo)
from tests.models.utils import (
CLSPoolingRerankModelInfo,
LASTPoolingRerankModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_rerank_models
RERANK_MODELS = [
CLSPoolingRerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2",
mteb_score=0.32898,
architecture="BertForSequenceClassification"),
LASTPoolingRerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
mteb_score=0.25736,
architecture="Qwen3ForSequenceClassification")
CLSPoolingRerankModelInfo(
"cross-encoder/ms-marco-TinyBERT-L-2-v2",
mteb_score=0.32898,
architecture="BertForSequenceClassification",
),
LASTPoolingRerankModelInfo(
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
mteb_score=0.25736,
architecture="Qwen3ForSequenceClassification",
),
]
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner,
model_info: RerankModelInfo) -> None:
def test_rerank_models_mteb(
hf_runner, vllm_runner, model_info: RerankModelInfo
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)

View File

@ -3,74 +3,93 @@
import pytest
from tests.models.language.pooling.embed_utils import (
correctness_test_embed_models)
from tests.models.utils import (CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo, RerankModelInfo)
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo("thenlper/gte-large",
mteb_score=0.76807651,
architecture="BertModel",
enable_test=True),
CLSPoolingEmbedModelInfo("thenlper/gte-base",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("thenlper/gte-small",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("thenlper/gte-large-zh",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("thenlper/gte-base-zh",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("thenlper/gte-small-zh",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo(
"thenlper/gte-large",
mteb_score=0.76807651,
architecture="BertModel",
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-base", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-small", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-large-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-base-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-small-zh", architecture="BertModel", enable_test=False
),
########### NewModel
# These three architectures are almost the same, but not exactly the same.
# For example,
# - whether to use token_type_embeddings
# - whether to use context expansion
# So only test one (the most widely used) model
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
architecture="GteNewModel",
mteb_score=0.775074696,
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=True),
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False),
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False),
CLSPoolingEmbedModelInfo(
"Alibaba-NLP/gte-multilingual-base",
architecture="GteNewModel",
mteb_score=0.775074696,
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"Alibaba-NLP/gte-base-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False,
),
CLSPoolingEmbedModelInfo(
"Alibaba-NLP/gte-large-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False,
),
########### Qwen2ForCausalLM
LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
mteb_score=0.758473459018872,
architecture="Qwen2ForCausalLM",
enable_test=True),
LASTPoolingEmbedModelInfo(
"Alibaba-NLP/gte-Qwen2-1.5B-instruct",
mteb_score=0.758473459018872,
architecture="Qwen2ForCausalLM",
enable_test=True,
),
########## ModernBertModel
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
mteb_score=0.748193353,
architecture="ModernBertModel",
enable_test=True),
CLSPoolingEmbedModelInfo(
"Alibaba-NLP/gte-modernbert-base",
mteb_score=0.748193353,
architecture="ModernBertModel",
enable_test=True,
),
########## Qwen3ForCausalLM
LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-0.6B",
mteb_score=0.771163695,
architecture="Qwen3ForCausalLM",
dtype="float32",
enable_test=True),
LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-4B",
architecture="Qwen3ForCausalLM",
dtype="float32",
enable_test=False),
LASTPoolingEmbedModelInfo(
"Qwen/Qwen3-Embedding-0.6B",
mteb_score=0.771163695,
architecture="Qwen3ForCausalLM",
dtype="float32",
enable_test=True,
),
LASTPoolingEmbedModelInfo(
"Qwen/Qwen3-Embedding-4B",
architecture="Qwen3ForCausalLM",
dtype="float32",
enable_test=False,
),
]
RERANK_MODELS = [
@ -79,31 +98,32 @@ RERANK_MODELS = [
"Alibaba-NLP/gte-reranker-modernbert-base",
mteb_score=0.33386,
architecture="ModernBertForSequenceClassification",
enable_test=True),
enable_test=True,
),
CLSPoolingRerankModelInfo(
"Alibaba-NLP/gte-multilingual-reranker-base",
mteb_score=0.33062,
architecture="GteNewForSequenceClassification",
hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
enable_test=True),
enable_test=True,
),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
example_prompts)
def test_embed_models_correctness(
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner,
model_info: RerankModelInfo) -> None:
def test_rerank_models_mteb(
hf_runner, vllm_runner, model_info: RerankModelInfo
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)

View File

@ -2,50 +2,55 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.models.language.pooling.embed_utils import (
correctness_test_embed_models)
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo("intfloat/e5-small",
architecture="BertModel",
mteb_score=0.742285423,
enable_test=True),
CLSPoolingEmbedModelInfo("intfloat/e5-base",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("intfloat/e5-large",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-small",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo(
"intfloat/e5-small",
architecture="BertModel",
mteb_score=0.742285423,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"intfloat/e5-base", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"intfloat/e5-large", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False
),
########## XLMRobertaModel
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-base",
architecture="XLMRobertaModel",
mteb_score=0.779325955,
enable_test=True),
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large",
architecture="XLMRobertaModel",
enable_test=False),
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large-instruct",
architecture="XLMRobertaModel",
enable_test=False),
CLSPoolingEmbedModelInfo(
"intfloat/multilingual-e5-base",
architecture="XLMRobertaModel",
mteb_score=0.779325955,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"intfloat/multilingual-e5-large",
architecture="XLMRobertaModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
"intfloat/multilingual-e5-large-instruct",
architecture="XLMRobertaModel",
enable_test=False,
),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
example_prompts)
def test_embed_models_correctness(
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)

View File

@ -5,60 +5,68 @@ from functools import partial
import pytest
from tests.models.language.pooling.embed_utils import (
check_embeddings_close, correctness_test_embed_models, matryoshka_fy)
from tests.models.utils import (CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo, EmbedModelInfo,
RerankModelInfo)
check_embeddings_close,
correctness_test_embed_models,
matryoshka_fy,
)
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo,
RerankModelInfo,
)
from vllm import PoolingParams
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
EMBEDDING_MODELS = [
CLSPoolingEmbedModelInfo("jinaai/jina-embeddings-v3",
mteb_score=0.824413164,
architecture="XLMRobertaModel",
is_matryoshka=True)
CLSPoolingEmbedModelInfo(
"jinaai/jina-embeddings-v3",
mteb_score=0.824413164,
architecture="XLMRobertaModel",
is_matryoshka=True,
)
]
RERANK_MODELS = [
CLSPoolingRerankModelInfo(
"jinaai/jina-reranker-v2-base-multilingual",
mteb_score=0.33643,
architecture="XLMRobertaForSequenceClassification")
architecture="XLMRobertaForSequenceClassification",
)
]
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
def hf_model_callback(model):
model.encode = partial(model.encode, task="text-matching")
mteb_test_embed_models(hf_runner,
vllm_runner,
model_info,
hf_model_callback=hf_model_callback)
mteb_test_embed_models(
hf_runner, vllm_runner, model_info, hf_model_callback=hf_model_callback
)
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
def test_embed_models_correctness(
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
) -> None:
def hf_model_callback(model):
model.encode = partial(model.encode, task="text-matching")
correctness_test_embed_models(hf_runner,
vllm_runner,
model_info,
example_prompts,
hf_model_callback=hf_model_callback)
correctness_test_embed_models(
hf_runner,
vllm_runner,
model_info,
example_prompts,
hf_model_callback=hf_model_callback,
)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner,
model_info: RerankModelInfo) -> None:
def test_rerank_models_mteb(
hf_runner, vllm_runner, model_info: RerankModelInfo
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
@ -81,32 +89,32 @@ def test_matryoshka(
example_prompts = [str(s).strip() for s in example_prompts]
with hf_runner(
model_info.name,
dtype=dtype,
is_sentence_transformer=True,
model_info.name,
dtype=dtype,
is_sentence_transformer=True,
) as hf_model:
hf_outputs = hf_model.encode(example_prompts, task="text-matching")
hf_outputs = matryoshka_fy(hf_outputs, dimensions)
with vllm_runner(model_info.name,
runner="pooling",
dtype=dtype,
max_model_len=None) as vllm_model:
with vllm_runner(
model_info.name, runner="pooling", dtype=dtype, max_model_len=None
) as vllm_model:
assert vllm_model.llm.llm_engine.model_config.is_matryoshka
matryoshka_dimensions = (
vllm_model.llm.llm_engine.model_config.matryoshka_dimensions)
vllm_model.llm.llm_engine.model_config.matryoshka_dimensions
)
assert matryoshka_dimensions is not None
if dimensions not in matryoshka_dimensions:
with pytest.raises(ValueError):
vllm_model.embed(
example_prompts,
pooling_params=PoolingParams(dimensions=dimensions))
example_prompts, pooling_params=PoolingParams(dimensions=dimensions)
)
else:
vllm_outputs = vllm_model.embed(
example_prompts,
pooling_params=PoolingParams(dimensions=dimensions))
example_prompts, pooling_params=PoolingParams(dimensions=dimensions)
)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
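matryoshka_fy mirrors the Matryoshka property under test: the first k dimensions of an embedding, re-normalized, are themselves a valid embedding. A sketch of that operation, assuming matryoshka_fy does essentially this:

import torch
import torch.nn.functional as F

def matryoshka_fy_sketch(embeddings: torch.Tensor, dimensions: int) -> torch.Tensor:
    return F.normalize(embeddings[..., :dimensions], p=2, dim=-1)

full = F.normalize(torch.randn(4, 1024), dim=-1)
small = matryoshka_fy_sketch(full, 256)
assert small.shape == (4, 256)
assert torch.allclose(small.norm(dim=-1), torch.ones(4), atol=1e-5)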

View File

@ -17,46 +17,45 @@ mxbai_rerank_hf_overrides = {
}
RERANK_MODELS = [
LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,
mteb_score=0.273,
enable_test=True),
LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,
enable_test=False)
LASTPoolingRerankModelInfo(
"mixedbread-ai/mxbai-rerank-base-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,
mteb_score=0.273,
enable_test=True,
),
LASTPoolingRerankModelInfo(
"mixedbread-ai/mxbai-rerank-large-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,
enable_test=False,
),
]
class MxbaiRerankerHfRunner(HfRunner):
def __init__(self,
model_name: str,
dtype: str = "auto",
*args: Any,
**kwargs: Any) -> None:
def __init__(
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
) -> None:
from transformers import AutoModelForCausalLM, AutoTokenizer
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
self.tokenizer = AutoTokenizer.from_pretrained(model_name,
padding_side='left')
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
self.yes_loc = self.tokenizer.convert_tokens_to_ids("1")
self.no_loc = self.tokenizer.convert_tokens_to_ids("0")
def predict(self, prompts: list[list[str]], *args,
**kwargs) -> torch.Tensor:
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
def process_inputs(pairs):
inputs = self.tokenizer(pairs,
padding=False,
truncation='longest_first',
return_attention_mask=False)
for i, ele in enumerate(inputs['input_ids']):
inputs['input_ids'][i] = ele
inputs = self.tokenizer.pad(inputs,
padding=True,
return_tensors="pt")
inputs = self.tokenizer(
pairs,
padding=False,
truncation="longest_first",
return_attention_mask=False,
)
for i, ele in enumerate(inputs["input_ids"]):
inputs["input_ids"][i] = ele
inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt")
for key in inputs:
inputs[key] = inputs[key].to(self.model.device)
return inputs

View File

@ -3,39 +3,42 @@
import pytest
from tests.models.language.pooling.embed_utils import (
correctness_test_embed_models)
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
MODELS = [
CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1",
architecture="NomicBertModel",
mteb_score=0.737568559,
enable_test=True),
CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1.5",
architecture="NomicBertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("nomic-ai/CodeRankEmbed",
architecture="NomicBertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe",
architecture="NomicBertModel",
mteb_score=0.715488912,
enable_test=True)
CLSPoolingEmbedModelInfo(
"nomic-ai/nomic-embed-text-v1",
architecture="NomicBertModel",
mteb_score=0.737568559,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"nomic-ai/nomic-embed-text-v1.5",
architecture="NomicBertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
"nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"nomic-ai/nomic-embed-text-v2-moe",
architecture="NomicBertModel",
mteb_score=0.715488912,
enable_test=True,
),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
example_prompts)
def test_embed_models_correctness(
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)

View File

@ -18,46 +18,45 @@ qwen3_reranker_hf_overrides = {
}
RERANK_MODELS = [
LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-0.6B",
architecture="Qwen3ForSequenceClassification",
mteb_score=0.25736,
hf_overrides=qwen3_reranker_hf_overrides,
enable_test=True),
LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-4B",
architecture="Qwen3ForSequenceClassification",
hf_overrides=qwen3_reranker_hf_overrides,
enable_test=False)
LASTPoolingRerankModelInfo(
"Qwen/Qwen3-Reranker-0.6B",
architecture="Qwen3ForSequenceClassification",
mteb_score=0.25736,
hf_overrides=qwen3_reranker_hf_overrides,
enable_test=True,
),
LASTPoolingRerankModelInfo(
"Qwen/Qwen3-Reranker-4B",
architecture="Qwen3ForSequenceClassification",
hf_overrides=qwen3_reranker_hf_overrides,
enable_test=False,
),
]
class Qwen3RerankerHfRunner(HfRunner):
def __init__(self,
model_name: str,
dtype: str = "auto",
*args: Any,
**kwargs: Any) -> None:
def __init__(
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
) -> None:
from transformers import AutoModelForCausalLM, AutoTokenizer
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
self.tokenizer = AutoTokenizer.from_pretrained(model_name,
padding_side='left')
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
def predict(self, prompts: list[list[str]], *args,
**kwargs) -> torch.Tensor:
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
def process_inputs(pairs):
inputs = self.tokenizer(pairs,
padding=False,
truncation='longest_first',
return_attention_mask=False)
for i, ele in enumerate(inputs['input_ids']):
inputs['input_ids'][i] = ele
inputs = self.tokenizer.pad(inputs,
padding=True,
return_tensors="pt")
inputs = self.tokenizer(
pairs,
padding=False,
truncation="longest_first",
return_attention_mask=False,
)
for i, ele in enumerate(inputs["input_ids"]):
inputs["input_ids"][i] = ele
inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt")
for key in inputs:
inputs[key] = inputs[key].to(self.model.device)
return inputs
@ -82,20 +81,18 @@ class Qwen3RerankerHfRunner(HfRunner):
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
@multi_gpu_test(num_gpus=2)
def test_rerank_models_mteb_tp(vllm_runner,
model_info: RerankModelInfo) -> None:
def test_rerank_models_mteb_tp(vllm_runner, model_info: RerankModelInfo) -> None:
assert model_info.architecture == "Qwen3ForSequenceClassification"
vllm_extra_kwargs: dict[str, Any] = {
"tensor_parallel_size": 2,
}
mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info,
vllm_extra_kwargs)
mteb_test_rerank_models(
Qwen3RerankerHfRunner, vllm_runner, model_info, vllm_extra_kwargs
)

View File

@ -3,62 +3,75 @@
import pytest
from tests.models.language.pooling.embed_utils import (
correctness_test_embed_models)
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
MODELS = [
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-xs",
is_matryoshka=False,
architecture="BertModel",
mteb_score=0.714927797,
enable_test=True),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-s",
is_matryoshka=False,
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m",
is_matryoshka=False,
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long",
is_matryoshka=False,
architecture="NomicBertModel",
mteb_score=0.681146831,
enable_test=True),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l",
is_matryoshka=False,
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5",
is_matryoshka=True,
architecture="BertModel",
mteb_score=0.649088363,
enable_test=True),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l-v2.0",
is_matryoshka=True,
architecture="XLMRobertaModel",
mteb_score=0.712258299,
enable_test=True),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
is_matryoshka=True,
architecture="GteModel",
mteb_score=0.706622444,
enable_test=True),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-xs",
is_matryoshka=False,
architecture="BertModel",
mteb_score=0.714927797,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-s",
is_matryoshka=False,
architecture="BertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m",
is_matryoshka=False,
architecture="BertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-long",
is_matryoshka=False,
architecture="NomicBertModel",
mteb_score=0.681146831,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-l",
is_matryoshka=False,
architecture="BertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-v1.5",
is_matryoshka=True,
architecture="BertModel",
mteb_score=0.649088363,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-l-v2.0",
is_matryoshka=True,
architecture="XLMRobertaModel",
mteb_score=0.712258299,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-v2.0",
is_matryoshka=True,
architecture="GteModel",
mteb_score=0.706622444,
enable_test=True,
),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
example_prompts)
def test_embed_models_correctness(
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)

View File

@ -2,8 +2,11 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.models.utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo)
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
)
from .mteb_utils import mteb_test_embed_models
@ -15,15 +18,15 @@ ST_PROJECTOR_MODELS = [
mteb_score=0.688611955,
enable_test=True,
),
LASTPoolingEmbedModelInfo("google/embeddinggemma-300m",
architecture="Gemma3TextModel",
mteb_score=0.7473819294684156,
enable_test=True)
LASTPoolingEmbedModelInfo(
"google/embeddinggemma-300m",
architecture="Gemma3TextModel",
mteb_score=0.7473819294684156,
enable_test=True,
),
]
@pytest.mark.parametrize("model_info", ST_PROJECTOR_MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info)

View File

@ -3,27 +3,40 @@
"""Common tests for testing .generate() functionality for single / multiple
image, embedding, and video support for different VLMs in vLLM.
"""
import math
import os
from collections import defaultdict
from pathlib import PosixPath
import pytest
from transformers import (AutoModel, AutoModelForImageTextToText,
AutoModelForTextToWaveform)
from transformers import (
AutoModel,
AutoModelForImageTextToText,
AutoModelForTextToWaveform,
)
from vllm.platforms import current_platform
from vllm.utils import identity
from ....conftest import (IMAGE_ASSETS, AudioTestAssets, HfRunner,
ImageTestAssets, VideoTestAssets, VllmRunner)
from ....utils import (create_new_process_for_each_test, large_gpu_mark,
multi_gpu_marks)
from ....conftest import (
IMAGE_ASSETS,
AudioTestAssets,
HfRunner,
ImageTestAssets,
VideoTestAssets,
VllmRunner,
)
from ....utils import create_new_process_for_each_test, large_gpu_mark, multi_gpu_marks
from ...utils import check_outputs_equal
from .vlm_utils import custom_inputs, model_utils, runners
from .vlm_utils.case_filtering import get_parametrized_options
from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
VLMTestInfo, VLMTestType)
from .vlm_utils.types import (
CustomTestOptions,
ExpandableVLMTestArgs,
VLMTestInfo,
VLMTestType,
)
# This hack is needed for phi3v & paligemma models
# ROCm Triton FA can run into shared memory issues with these models,
@ -828,7 +841,7 @@ def _mark_splits(
new_test_settings = dict[str, VLMTestInfo]()
for i in range(num_groups):
models_in_group = models[i * split_size:(i + 1) * split_size]
models_in_group = models[i * split_size : (i + 1) * split_size]
for model in models_in_group:
for info in test_infos_by_model[model]:
@ -859,7 +872,8 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
VLM_TEST_SETTINGS,
test_type=VLMTestType.IMAGE,
create_new_process_for_each_test=False,
))
),
)
def test_single_image_models(
tmp_path: PosixPath,
model_type: str,
@ -885,7 +899,8 @@ def test_single_image_models(
VLM_TEST_SETTINGS,
test_type=VLMTestType.MULTI_IMAGE,
create_new_process_for_each_test=False,
))
),
)
def test_multi_image_models(
tmp_path: PosixPath,
model_type: str,
@ -911,7 +926,8 @@ def test_multi_image_models(
VLM_TEST_SETTINGS,
test_type=VLMTestType.EMBEDDING,
create_new_process_for_each_test=False,
))
),
)
def test_image_embedding_models(
model_type: str,
test_case: ExpandableVLMTestArgs,
@ -935,7 +951,8 @@ def test_image_embedding_models(
VLM_TEST_SETTINGS,
test_type=VLMTestType.VIDEO,
create_new_process_for_each_test=False,
))
),
)
def test_video_models(
model_type: str,
test_case: ExpandableVLMTestArgs,
@ -959,7 +976,8 @@ def test_video_models(
VLM_TEST_SETTINGS,
test_type=VLMTestType.AUDIO,
create_new_process_for_each_test=False,
))
),
)
def test_audio_models(
model_type: str,
test_case: ExpandableVLMTestArgs,
@ -983,7 +1001,8 @@ def test_audio_models(
VLM_TEST_SETTINGS,
test_type=VLMTestType.CUSTOM_INPUTS,
create_new_process_for_each_test=False,
))
),
)
def test_custom_inputs_models(
model_type: str,
test_case: ExpandableVLMTestArgs,
@ -1006,7 +1025,8 @@ def test_custom_inputs_models(
VLM_TEST_SETTINGS,
test_type=VLMTestType.IMAGE,
create_new_process_for_each_test=True,
))
),
)
@create_new_process_for_each_test()
def test_single_image_models_heavy(
tmp_path: PosixPath,
@ -1033,7 +1053,8 @@ def test_single_image_models_heavy(
VLM_TEST_SETTINGS,
test_type=VLMTestType.MULTI_IMAGE,
create_new_process_for_each_test=True,
))
),
)
@create_new_process_for_each_test()
def test_multi_image_models_heavy(
tmp_path: PosixPath,
@ -1060,7 +1081,8 @@ def test_multi_image_models_heavy(
VLM_TEST_SETTINGS,
test_type=VLMTestType.EMBEDDING,
create_new_process_for_each_test=True,
))
),
)
@create_new_process_for_each_test()
def test_image_embedding_models_heavy(
model_type: str,
@ -1085,7 +1107,8 @@ def test_image_embedding_models_heavy(
VLM_TEST_SETTINGS,
test_type=VLMTestType.VIDEO,
create_new_process_for_each_test=True,
))
),
)
def test_video_models_heavy(
model_type: str,
test_case: ExpandableVLMTestArgs,
@ -1109,7 +1132,8 @@ def test_video_models_heavy(
VLM_TEST_SETTINGS,
test_type=VLMTestType.AUDIO,
create_new_process_for_each_test=True,
))
),
)
def test_audio_models_heavy(
model_type: str,
test_case: ExpandableVLMTestArgs,
@ -1133,7 +1157,8 @@ def test_audio_models_heavy(
VLM_TEST_SETTINGS,
test_type=VLMTestType.CUSTOM_INPUTS,
create_new_process_for_each_test=True,
))
),
)
@create_new_process_for_each_test()
def test_custom_inputs_models_heavy(
model_type: str,

View File

@ -10,8 +10,7 @@ from transformers import AutoModelForSpeechSeq2Seq
from vllm.logprobs import SampleLogprobs
from vllm.lora.request import LoRARequest
from ....conftest import (AudioTestAssets, HfRunner, PromptAudioInput,
VllmRunner)
from ....conftest import AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner
from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close
@ -64,50 +63,49 @@ def run_test(
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with vllm_runner(
model,
runner="generate",
max_model_len=max_model_len,
max_num_seqs=1,
dtype=dtype,
limit_mm_per_prompt={"audio": 1},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=64,
enforce_eager=True,
model,
runner="generate",
max_model_len=max_model_len,
max_num_seqs=1,
dtype=dtype,
limit_mm_per_prompt={"audio": 1},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=64,
enforce_eager=True,
) as vllm_model:
lora_request = LoRARequest("audio", 1, audio_lora_path)
vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=audios,
lora_request=lora_request)
vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=audios,
lora_request=lora_request,
)
for prompts, audios in inputs
]
with hf_runner(model, dtype=dtype,
auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
hf_processor = hf_model.processor
eos_token_id = hf_processor.tokenizer.eos_token_id
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=[audios],
eos_token_id=eos_token_id)
hf_model.generate_greedy_logprobs_limit(
prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=[audios],
eos_token_id=eos_token_id,
)
for prompts, audios in inputs
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
vllm_outputs_per_case):
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=[
vllm_to_hf_output(output) for output in vllm_outputs
],
outputs_1_lst=[vllm_to_hf_output(output) for output in vllm_outputs],
name_0="hf",
name_1="vllm",
)
@ -118,9 +116,16 @@ def run_test(
@pytest.mark.parametrize("max_model_len", [2048])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, model: str,
audio_assets: AudioTestAssets, dtype: str, max_model_len: int,
max_tokens: int, num_logprobs: int) -> None:
def test_models(
hf_runner,
vllm_runner,
model: str,
audio_assets: AudioTestAssets,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")

View File

@ -28,8 +28,7 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
give the same result.
"""
image_cherry = convert_image_mode(
ImageAsset("cherry_blossom").pil_image, "RGB")
image_cherry = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
image_stop = convert_image_mode(ImageAsset("stop_sign").pil_image, "RGB")
images = [image_cherry, image_stop]
video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays
@ -47,29 +46,30 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
),
]
with vllm_runner(model,
runner="generate",
dtype=dtype,
limit_mm_per_prompt={"image": 2},
max_model_len=32768,
max_num_seqs=2,
tensor_parallel_size=1,
enforce_eager=True) as vllm_model:
with vllm_runner(
model,
runner="generate",
dtype=dtype,
limit_mm_per_prompt={"image": 2},
max_model_len=32768,
max_num_seqs=2,
tensor_parallel_size=1,
enforce_eager=True,
) as vllm_model:
vllm_outputs_per_case = [
vllm_model.generate_greedy(prompts,
max_tokens,
images=images,
videos=videos)
vllm_model.generate_greedy(
prompts, max_tokens, images=images, videos=videos
)
for prompts, images, videos in inputs
]
all_results = [output[0][1] for output in vllm_outputs_per_case]
outputs = [(total_str, total_str.find("assistant\n") + len("assistant\n"))
for total_str in all_results]
prompt_lengths = [prompt_len for _, prompt_len in outputs]
generated_strs = [
total_str[prompt_len:] for total_str, prompt_len in outputs
outputs = [
(total_str, total_str.find("assistant\n") + len("assistant\n"))
for total_str in all_results
]
prompt_lengths = [prompt_len for _, prompt_len in outputs]
generated_strs = [total_str[prompt_len:] for total_str, prompt_len in outputs]
interleaved_prompt_len, noninterleaved_prompt_len = prompt_lengths
interleaved_output_str, noninterleaved_output_str = generated_strs
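The slicing above relies on the chat template emitting a literal "assistant\n" marker before the response; a tiny self-contained check of that split (made-up string):

total_str = "system: ...\nuser: ...\nassistant\nA cherry blossom tree in bloom."
prompt_len = total_str.find("assistant\n") + len("assistant\n")
assert total_str[prompt_len:] == "A cherry blossom tree in bloom."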

View File

@ -18,13 +18,11 @@ from typing import Any
import pytest
import torch
from safetensors.torch import save_file
from transformers import (AutoConfig, AutoProcessor, AutoTokenizer,
GenerationConfig)
from transformers import AutoConfig, AutoProcessor, AutoTokenizer, GenerationConfig
from vllm import LLM, SamplingParams
from vllm.v1.executor.abstract import Executor
from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
FullAttentionSpec)
from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, FullAttentionSpec
from ....utils import multi_gpu_test
@ -93,8 +91,7 @@ def get_rope_layers_config(model_path: str) -> list[int]:
def create_reduced_maverick_model(
original_model_name:
str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
original_model_name: str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
output_dir: str = "/tmp/reduced_maverick",
text_layers: int = 4,
num_experts: int = 4,
@ -118,7 +115,8 @@ def create_reduced_maverick_model(
print(
f"Creating reduced Maverick model with {text_layers} text layers and "
f"{vision_layers} vision layers...")
f"{vision_layers} vision layers..."
)
# Create output directory
output_path = Path(output_dir)
@ -126,19 +124,23 @@ def create_reduced_maverick_model(
if force_recreate:
shutil.rmtree(output_path)
else:
print(f"Output directory {output_dir} already exists. "
"Use --force-recreate to overwrite.")
print(
f"Output directory {output_dir} already exists. "
"Use --force-recreate to overwrite."
)
return str(output_path)
output_path.mkdir(parents=True, exist_ok=True)
try:
print("Loading original model configuration...")
original_config = AutoConfig.from_pretrained(original_model_name,
trust_remote_code=True)
original_config = AutoConfig.from_pretrained(
original_model_name, trust_remote_code=True
)
print("Creating reduced configuration...")
reduced_config = create_reduced_config(original_config, text_layers,
num_experts, vision_layers)
reduced_config = create_reduced_config(
original_config, text_layers, num_experts, vision_layers
)
config_path = output_path / "config.json"
with open(config_path, "w") as f:
@ -149,8 +151,7 @@ def create_reduced_maverick_model(
copy_tokenizer_files(original_model_name, output_path)
print("Creating reduced safetensors files...")
create_reduced_safetensors(original_config, reduced_config,
output_path)
create_reduced_safetensors(original_config, reduced_config, output_path)
print("Creating preprocessor config...")
create_preprocessor_config(original_config, output_path)
@ -173,9 +174,9 @@ def create_reduced_maverick_model(
raise
def create_reduced_config(original_config: Any, text_layers: int,
num_experts: int,
vision_layers: int) -> dict[str, Any]:
def create_reduced_config(
original_config: Any, text_layers: int, num_experts: int, vision_layers: int
) -> dict[str, Any]:
"""Create a reduced configuration based on the original."""
# Convert config to dictionary
@ -185,23 +186,18 @@ def create_reduced_config(original_config: Any, text_layers: int,
if "text_config" in config_dict:
original_text_layers = config_dict["text_config"]["num_hidden_layers"]
config_dict["text_config"]["num_hidden_layers"] = text_layers
print(
f"Reduced text layers from {original_text_layers} to {text_layers}"
)
print(f"Reduced text layers from {original_text_layers} to {text_layers}")
original_num_experts = config_dict["text_config"]["num_local_experts"]
config_dict["text_config"]["num_local_experts"] = num_experts
print(
f"Reduced num experts from {original_num_experts} to {num_experts}"
)
print(f"Reduced num experts from {original_num_experts} to {num_experts}")
hidden_dim_divisor = 4
original_hidden_size = config_dict["text_config"]["hidden_size"]
new_hidden_size = original_hidden_size // hidden_dim_divisor
config_dict["text_config"]["hidden_size"] = new_hidden_size
print(f"Reduced hidden size from {original_hidden_size} to "
f"{new_hidden_size}")
print(f"Reduced hidden size from {original_hidden_size} to {new_hidden_size}")
original_head_dim = config_dict["text_config"]["head_dim"]
new_head_dim = original_head_dim // hidden_dim_divisor
@ -210,15 +206,12 @@ def create_reduced_config(original_config: Any, text_layers: int,
# Reduce vision layers
if "vision_config" in config_dict:
original_vision_layers = config_dict["vision_config"][
"num_hidden_layers"]
original_vision_layers = config_dict["vision_config"]["num_hidden_layers"]
config_dict["vision_config"]["num_hidden_layers"] = vision_layers
print(f"Reduced vision layers from {original_vision_layers} "
f"to {vision_layers}")
print(f"Reduced vision layers from {original_vision_layers} to {vision_layers}")
# Update model name to indicate it's a reduced version
config_dict["_name_or_path"] = (
f"reduced_maverick_{text_layers}t_{vision_layers}v")
config_dict["_name_or_path"] = f"reduced_maverick_{text_layers}t_{vision_layers}v"
return config_dict
@ -227,16 +220,16 @@ def copy_tokenizer_files(original_model_name: str, output_path: Path) -> None:
"""Copy tokenizer files from the original model."""
try:
tokenizer = AutoTokenizer.from_pretrained(original_model_name,
trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(
original_model_name, trust_remote_code=True
)
tokenizer.save_pretrained(output_path)
print("Tokenizer files copied successfully")
except Exception as e:
print(f"Warning: Could not copy tokenizer files: {e}")
def create_preprocessor_config(original_config: Any,
output_path: Path) -> None:
def create_preprocessor_config(original_config: Any, output_path: Path) -> None:
"""Create preprocessor_config.json for multimodal model."""
# Try to load the original preprocessor config
@ -254,9 +247,9 @@ def create_preprocessor_config(original_config: Any,
raise
def create_reduced_safetensors(original_config: Any, reduced_config: dict[str,
Any],
output_path: Path) -> None:
def create_reduced_safetensors(
original_config: Any, reduced_config: dict[str, Any], output_path: Path
) -> None:
"""Create safetensors files with weights for the reduced model."""
print("Generating synthetic weights for reduced model...")
@ -279,8 +272,7 @@ def create_reduced_safetensors(original_config: Any, reduced_config: dict[str,
save_weights_to_safetensors(weights, output_path)
def create_text_model_weights(
text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
def create_text_model_weights(text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
"""Create synthetic weights for the text model with MoE structure."""
weights = {}
@ -291,19 +283,18 @@ def create_text_model_weights(
intermediate_size_mlp = text_config["intermediate_size_mlp"]
num_layers = text_config["num_hidden_layers"]
num_attention_heads = text_config["num_attention_heads"]
num_key_value_heads = text_config.get("num_key_value_heads",
num_attention_heads)
num_key_value_heads = text_config.get("num_key_value_heads", num_attention_heads)
# MoE specific parameters
num_experts = text_config.get("num_local_experts")
assert (num_experts
is not None), "num_local_experts must be specified for MoE"
assert num_experts is not None, "num_local_experts must be specified for MoE"
head_dim = hidden_size // num_attention_heads
# Embedding layers
weights["language_model.model.embed_tokens.weight"] = torch.randn(
vocab_size, hidden_size, dtype=torch.float16)
vocab_size, hidden_size, dtype=torch.float16
)
# Transformer layers
for layer_idx in range(num_layers):
@ -312,95 +303,105 @@ def create_text_model_weights(
# Self-attention weights (separate q, k, v projections)
weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16)
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16)
hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16)
num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16)
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16
)
print("Self-attention weights created.")
# Feed-forward weights - MoE pattern based on interleave_moe_layer_step
# For interleave_moe_layer_step=2: layers 1,3,5,... are MoE, layers
# 0,2,4,... are dense
interleave_step = text_config.get("interleave_moe_layer_step", 1)
is_moe_layer = (interleave_step > 0
and (layer_idx + 1) % interleave_step == 0)
is_moe_layer = interleave_step > 0 and (layer_idx + 1) % interleave_step == 0
if is_moe_layer:
# MoE layer structure
# 1. Router weights
weights[
f"{layer_prefix}.feed_forward.router.weight"] = torch.randn(
num_experts, hidden_size, dtype=torch.float16)
weights[f"{layer_prefix}.feed_forward.router.weight"] = torch.randn(
num_experts, hidden_size, dtype=torch.float16
)
# 2. Individual expert weights (not fused)
for expert_idx in range(num_experts):
expert_prefix = (
f"{layer_prefix}.feed_forward.experts.{expert_idx}")
expert_prefix = f"{layer_prefix}.feed_forward.experts.{expert_idx}"
weights[f"{expert_prefix}.gate_proj.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16)
intermediate_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{expert_prefix}.up_proj.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16)
intermediate_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{expert_prefix}.down_proj.weight"] = torch.randn(
hidden_size, intermediate_size, dtype=torch.bfloat16)
hidden_size, intermediate_size, dtype=torch.bfloat16
)
# Expert weight scales (FP8 quantization)
weights[
f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
intermediate_size, 1, dtype=torch.bfloat16)
weights[f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
intermediate_size, 1, dtype=torch.bfloat16
)
weights[f"{expert_prefix}.up_proj.weight_scale"] = torch.ones(
intermediate_size, 1, dtype=torch.bfloat16)
weights[
f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
hidden_size, 1, dtype=torch.bfloat16)
intermediate_size, 1, dtype=torch.bfloat16
)
weights[f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
hidden_size, 1, dtype=torch.bfloat16
)
# 3. Shared expert weights
shared_expert_prefix = f"{layer_prefix}.feed_forward.shared_expert"
weights[f"{shared_expert_prefix}.gate_proj.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16)
intermediate_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{shared_expert_prefix}.up_proj.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16)
intermediate_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{shared_expert_prefix}.down_proj.weight"] = torch.randn(
hidden_size, intermediate_size, dtype=torch.bfloat16)
hidden_size, intermediate_size, dtype=torch.bfloat16
)
print(f"MoE feed-forward weights created for layer {layer_idx}.")
else:
# Dense layer structure
weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = (
torch.randn(intermediate_size_mlp,
hidden_size,
dtype=torch.bfloat16))
weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = (
torch.randn(intermediate_size_mlp,
hidden_size,
dtype=torch.bfloat16))
weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = (
torch.randn(hidden_size,
intermediate_size_mlp,
dtype=torch.bfloat16))
weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = torch.randn(
intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = torch.randn(
intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = torch.randn(
hidden_size, intermediate_size_mlp, dtype=torch.bfloat16
)
print(f"Dense feed-forward weights created for layer {layer_idx}.")
# Layer norms
weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16)
weights[
f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16
)
print("Layer norms created.")
# Final layer norm and output projection
weights["language_model.model.norm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights["language_model.lm_head.weight"] = torch.randn(
vocab_size, hidden_size, dtype=torch.bfloat16)
vocab_size, hidden_size, dtype=torch.bfloat16
)
return weights
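A worked example of the interleave rule used above: with `interleave_moe_layer_step = 2`, layers 1, 3, ... receive MoE weights and layers 0, 2, ... stay dense.

interleave_step = 2
num_layers = 4
moe_layers = [
    i for i in range(num_layers)
    if interleave_step > 0 and (i + 1) % interleave_step == 0
]
assert moe_layers == [1, 3]  # matches the comment in the loop above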
def create_vision_model_weights(
vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
vision_config: dict[str, Any],
) -> dict[str, torch.Tensor]:
"""Create synthetic weights for the vision model."""
weights = {}
@ -414,47 +415,62 @@ def create_vision_model_weights(
layer_prefix = f"vision_model.model.layers.{layer_idx}"
weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
hidden_size, hidden_size, dtype=torch.bfloat16)
hidden_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.q_proj.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
hidden_size, hidden_size, dtype=torch.bfloat16)
hidden_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.k_proj.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
hidden_size, hidden_size, dtype=torch.bfloat16)
hidden_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.v_proj.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
hidden_size, hidden_size, dtype=torch.bfloat16)
hidden_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.o_proj.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.mlp.fc1.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16)
intermediate_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.mlp.fc1.bias"] = torch.zeros(
intermediate_size, dtype=torch.bfloat16)
intermediate_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.mlp.fc2.weight"] = torch.randn(
hidden_size, intermediate_size, dtype=torch.bfloat16)
hidden_size, intermediate_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.mlp.fc2.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.input_layernorm.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
weights[
f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.post_attention_layernorm.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
return weights
def create_shared_weights(
text_config: dict[str, Any],
vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
text_config: dict[str, Any], vision_config: dict[str, Any]
) -> dict[str, torch.Tensor]:
"""Create weights for shared components (vision-language connector)"""
weights = {}
@ -464,13 +480,15 @@ def create_shared_weights(
# Vision-language connector (projects vision features to text space)
weights["multi_modal_projector.linear_1.weight"] = torch.randn(
text_hidden_size, projector_input_dim, dtype=torch.bfloat16)
text_hidden_size, projector_input_dim, dtype=torch.bfloat16
)
return weights
def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
output_path: Path) -> None:
def save_weights_to_safetensors(
weights: dict[str, torch.Tensor], output_path: Path
) -> None:
"""Save weights to safetensors files and create index."""
# Determine how to shard the weights
@ -507,18 +525,18 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
else:
# Multiple shards
for i, shard in enumerate(shards):
filename = f"model-{i+1:05d}-of-{len(shards):05d}.safetensors"
filename = f"model-{i + 1:05d}-of-{len(shards):05d}.safetensors"
save_file(shard, output_path / filename)
for name in shard:
weight_map[name] = filename
print(f"Saved shard {i+1}/{len(shards)}: {filename}")
print(f"Saved shard {i + 1}/{len(shards)}: {filename}")
# Create index file
index_data = {
"metadata": {
"total_size":
sum(tensor.numel() * tensor.element_size()
for tensor in weights.values())
"total_size": sum(
tensor.numel() * tensor.element_size() for tensor in weights.values()
)
},
"weight_map": weight_map,
}
@ -528,8 +546,9 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
json.dump(index_data, f, indent=2)
print(f"Created index file: {index_path}")
print(f"Total model size: "
f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB")
print(
f"Total model size: {index_data['metadata']['total_size'] / (1024**3):.2f} GB"
)
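The index written above follows the standard `model.safetensors.index.json` layout that `transformers` and `safetensors` consumers expect; schematically (entries illustrative):

# {
#   "metadata": {"total_size": 123456789},
#   "weight_map": {
#     "language_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors",
#     "language_model.model.norm.weight": "model-00002-of-00002.safetensors"
#   }
# }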
def check_attention_spec_interleaved_rope(
@ -540,8 +559,7 @@ def check_attention_spec_interleaved_rope(
):
"""Check that the attention spec is correct."""
assert isinstance(llm.llm_engine.model_executor, Executor)
kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs(
)
kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs()
for rank in range(num_ranks):
kv_cache_specs = kv_cache_specs_per_rank[rank]
assert len(kv_cache_specs.keys()) == num_attention_layers
@ -551,16 +569,14 @@ def check_attention_spec_interleaved_rope(
else:
expected_spec = ChunkedLocalAttentionSpec
assert isinstance(
kv_cache_specs[
f"language_model.model.layers.{i}.self_attn.attn"],
expected_spec)
kv_cache_specs[f"language_model.model.layers.{i}.self_attn.attn"],
expected_spec,
)
def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
"""Test the created reduced model with vLLM."""
sampling_params = SamplingParams(temperature=0.8,
top_p=0.95,
max_tokens=50)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50)
if should_profile:
llm.start_profile()
@ -571,15 +587,15 @@ def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
print("Test generation successful!")
for output in outputs:
print(f"Prompt: {output.prompt}")
print(f"Output: "
f"{output.outputs[0].text}")
print(f"Output: {output.outputs[0].text}")
print("-" * 40)
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
"original_model_name,text_layers,num_experts,vision_layers,",
[("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)])
[("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)],
)
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("tp,ep", [(2, True)])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@ -640,7 +656,8 @@ def main():
import argparse
parser = argparse.ArgumentParser(
description="Create a reduced-layer Maverick model")
description="Create a reduced-layer Maverick model"
)
parser.add_argument(
"--output-dir",
default="/tmp/reduced_maverick",
@ -652,10 +669,7 @@ def main():
default=4,
help="Number of text transformer layers",
)
parser.add_argument("--num-experts",
type=int,
default=4,
help="Number of experts")
parser.add_argument("--num-experts", type=int, default=4, help="Number of experts")
parser.add_argument(
"--vision-layers",
type=int,
@ -667,12 +681,12 @@ def main():
action="store_true",
help="Force recreation if output directory exists",
)
parser.add_argument("--test",
action="store_true",
help="Test the created model with vLLM")
parser.add_argument("--profile",
action="store_true",
help="Profile the created model with vLLM")
parser.add_argument(
"--test", action="store_true", help="Test the created model with vLLM"
)
parser.add_argument(
"--profile", action="store_true", help="Profile the created model with vLLM"
)
parser.add_argument(
"--test-original",
action="store_true",
@ -687,16 +701,18 @@ def main():
args = parser.parse_args()
if args.test:
test_dummy_maverick(original_model_name=args.original_model,
output_dir=args.output_dir,
text_layers=args.text_layers,
num_experts=args.num_experts,
vision_layers=args.vision_layers,
force_recreate=args.force_recreate,
tp=2,
ep=True,
enforce_eager=True,
profile=args.profile)
test_dummy_maverick(
original_model_name=args.original_model,
output_dir=args.output_dir,
text_layers=args.text_layers,
num_experts=args.num_experts,
vision_layers=args.vision_layers,
force_recreate=args.force_recreate,
tp=2,
ep=True,
enforce_eager=True,
profile=args.profile,
)
if args.test_original:
run_maverick_serving(args.original_model)

View File

@ -14,26 +14,35 @@ from vllm.lora.request import LoRARequest
from vllm.multimodal.image import rescale_image_size
from vllm.platforms import current_platform
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
PromptImageInput, VllmRunner)
from ....conftest import (
IMAGE_ASSETS,
HfRunner,
PromptAudioInput,
PromptImageInput,
VllmRunner,
)
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom":
"<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
})
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom": "<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
}
)
HF_MULTIIMAGE_IMAGE_PROMPT = (
"<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
)
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct",
revision="refs/pr/70")
model_path = snapshot_download(
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
)
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
speech_question = os.path.join(model_path, "examples",
"what_is_shown_in_this_image.wav")
speech_question = os.path.join(
model_path, "examples", "what_is_shown_in_this_image.wav"
)
models = [model_path]
target_dtype = "half"
@ -48,8 +57,7 @@ if current_platform.is_rocm():
def run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
inputs: Sequence[tuple[list[str], PromptImageInput,
Optional[PromptAudioInput]]],
inputs: Sequence[tuple[list[str], PromptImageInput, Optional[PromptAudioInput]]],
model: str,
*,
max_model_len: int,
@ -75,28 +83,30 @@ def run_test(
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with vllm_runner(
model,
task="generate",
max_model_len=max_model_len,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=320,
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
enforce_eager=True,
trust_remote_code=False,
model,
task="generate",
max_model_len=max_model_len,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=320,
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
enforce_eager=True,
trust_remote_code=False,
) as vllm_model:
lora_request = LoRARequest("vision", 1, vision_lora_path)
vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
lora_request=lora_request)
vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
lora_request=lora_request,
)
for prompts, images, audios in inputs
]
@ -108,17 +118,18 @@ def run_test(
hf_processor = hf_model.processor
eos_token_id = hf_processor.tokenizer.eos_token_id
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
eos_token_id=eos_token_id)
hf_model.generate_greedy_logprobs_limit(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
eos_token_id=eos_token_id,
)
for prompts, images, audios in inputs
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
vllm_outputs_per_case):
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
@ -145,16 +156,27 @@ def run_test(
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
dtype: str, max_model_len: int, max_tokens: int,
num_logprobs: int) -> None:
def test_models(
hf_runner,
vllm_runner,
image_assets,
model,
size_factors,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
None,
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
inputs_per_image = [
(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
None,
)
for image, prompt in zip(images, HF_IMAGE_PROMPTS)
]
run_test(
hf_runner,
@ -189,16 +211,26 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
@pytest.mark.parametrize("max_model_len", [25600])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
size_factors, dtype: str, max_model_len: int,
max_tokens: int, num_logprobs: int) -> None:
def test_multi_images_models(
hf_runner,
vllm_runner,
image_assets,
model,
size_factors,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case = [
(
[HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
[[rescale_image_size(image, factor) for image in images]
for factor in size_factors],
[
[rescale_image_size(image, factor) for image in images]
for factor in size_factors
],
None,
),
]
@ -222,10 +254,15 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
max_model_len: int, max_tokens: int,
num_logprobs: int) -> None:
def test_vision_speech_models(
hf_runner,
vllm_runner,
model,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
# use the example speech question so that the model outputs are reasonable
audio = librosa.load(speech_question, sr=16000)
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
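Worth noting for the line above: `librosa.load` returns a `(waveform, sample_rate)` tuple and resamples to the requested rate, so `audio` is passed downstream as that pair.

# waveform, sample_rate = librosa.load(speech_question, sr=16000)
# assert sample_rate == 16000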

View File

@ -17,31 +17,39 @@ from vllm.lora.request import LoRARequest
from vllm.multimodal.image import convert_image_mode, rescale_image_size
from vllm.platforms import current_platform
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
PromptImageInput, VllmRunner)
from ....conftest import (
IMAGE_ASSETS,
HfRunner,
PromptAudioInput,
PromptImageInput,
VllmRunner,
)
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom":
"<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
})
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom": "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
}
)
HF_MULTIIMAGE_IMAGE_PROMPT = (
"<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
)
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
speech_question = os.path.join(model_path, "examples",
"what_is_shown_in_this_image.wav")
speech_question = os.path.join(
model_path, "examples", "what_is_shown_in_this_image.wav"
)
models = [model_path]
def vllm_to_hf_output(vllm_output: tuple[list[int], str,
Optional[SampleLogprobs]],
model: str):
def vllm_to_hf_output(
vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], model: str
):
"""Sanitize vllm output to be comparable with hf output."""
_, output_str, out_logprobs = vllm_output
@ -71,8 +79,7 @@ if current_platform.is_rocm():
def run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
inputs: Sequence[tuple[list[str], PromptImageInput,
Optional[PromptAudioInput]]],
inputs: Sequence[tuple[list[str], PromptImageInput, Optional[PromptAudioInput]]],
model: str,
*,
max_model_len: int,
@ -98,27 +105,29 @@ def run_test(
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with vllm_runner(
model,
runner="generate",
max_model_len=max_model_len,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=320,
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
enforce_eager=True,
model,
runner="generate",
max_model_len=max_model_len,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=320,
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
enforce_eager=True,
) as vllm_model:
lora_request = LoRARequest("vision", 1, vision_lora_path)
vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
lora_request=lora_request)
vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
lora_request=lora_request,
)
for prompts, images, audios in inputs
]
@ -127,42 +136,36 @@ def run_test(
pytest.skip("HF impl is not compatible with current transformers")
hf_model_kwargs = {"_attn_implementation": "sdpa"}
with hf_runner(model, dtype=dtype,
model_kwargs=hf_model_kwargs) as hf_model:
with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model:
hf_processor = hf_model.processor
eos_token_id = hf_processor.tokenizer.eos_token_id
def patch_hf_processor(*args,
text="",
images=None,
audio=None,
sampling_rate=None,
**kwargs):
def patch_hf_processor(
*args, text="", images=None, audio=None, sampling_rate=None, **kwargs
):
audios = None
if audio is not None and sampling_rate is not None:
audios = [(audio, sampling_rate)]
return hf_processor(*args,
text=text,
images=images,
audios=audios,
**kwargs)
return hf_processor(
*args, text=text, images=images, audios=audios, **kwargs
)
hf_model.processor = patch_hf_processor
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
eos_token_id=eos_token_id,
num_logits_to_keep=0)
hf_model.generate_greedy_logprobs_limit(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
eos_token_id=eos_token_id,
num_logits_to_keep=0,
)
for prompts, images, audios in inputs
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
vllm_outputs_per_case):
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
@ -189,16 +192,27 @@ def run_test(
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
dtype: str, max_model_len: int, max_tokens: int,
num_logprobs: int) -> None:
def test_models(
hf_runner,
vllm_runner,
image_assets,
model,
size_factors,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
None,
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
inputs_per_image = [
(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
None,
)
for image, prompt in zip(images, HF_IMAGE_PROMPTS)
]
run_test(
hf_runner,
@ -233,16 +247,26 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
@pytest.mark.parametrize("max_model_len", [25600])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
size_factors, dtype: str, max_model_len: int,
max_tokens: int, num_logprobs: int) -> None:
def test_multi_images_models(
hf_runner,
vllm_runner,
image_assets,
model,
size_factors,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case = [
(
[HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
[[rescale_image_size(image, factor) for image in images]
for factor in size_factors],
[
[rescale_image_size(image, factor) for image in images]
for factor in size_factors
],
None,
),
]
@ -266,10 +290,15 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
max_model_len: int, max_tokens: int,
num_logprobs: int) -> None:
def test_vision_speech_models(
hf_runner,
vllm_runner,
model,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
# use the example speech question so that the model outputs are reasonable
audio = librosa.load(speech_question, sr=None)
image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")

View File

@ -37,33 +37,33 @@ PROMPT = "Describe each image in one short sentence."
def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]:
return [{
"role":
"user",
"content": [{
"type": "text",
"text": PROMPT,
}] + [{
"type": "image_url",
"image_url": {
"url": url
}
} for url in urls],
}]
return [
{
"role": "user",
"content": [
{
"type": "text",
"text": PROMPT,
}
]
+ [{"type": "image_url", "image_url": {"url": url}} for url in urls],
}
]
def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]:
return [{
"role":
"user",
"content": [{
"type": "text",
"content": PROMPT,
}, *({
"type": "image",
"image": download_image(url)
} for url in urls)],
}]
return [
{
"role": "user",
"content": [
{
"type": "text",
"content": PROMPT,
},
*({"type": "image", "image": download_image(url)} for url in urls),
],
}
]
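A quick usage check of `_create_msg_format` above (URL illustrative):

msgs = _create_msg_format(["https://example.com/img.png"])
assert msgs[0]["role"] == "user"
assert msgs[0]["content"][0] == {"type": "text", "text": PROMPT}
assert msgs[0]["content"][1] == {
    "type": "image_url",
    "image_url": {"url": "https://example.com/img.png"},
}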
def _create_engine_inputs(urls: list[str]) -> TokensPrompt:
@ -125,11 +125,17 @@ def _dump_outputs_w_logprobs(
outputs: OutputsLogprobs,
filename: "StrPath",
) -> None:
json_data = [(tokens, text, [{
k: asdict(v)
for k, v in token_logprobs.items()
} for token_logprobs in (logprobs or [])])
for tokens, text, logprobs in outputs]
json_data = [
(
tokens,
text,
[
{k: asdict(v) for k, v in token_logprobs.items()}
for token_logprobs in (logprobs or [])
],
)
for tokens, text, logprobs in outputs
]
with open(filename, "w") as f:
json.dump(json_data, f)
@ -139,28 +145,35 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
with open(filename, "rb") as f:
json_data = json.load(f)
return [(tokens, text, [{
int(k): Logprob(**v)
for k, v in token_logprobs.items()
} for token_logprobs in logprobs]) for tokens, text, logprobs in json_data]
return [
(
tokens,
text,
[
{int(k): Logprob(**v) for k, v in token_logprobs.items()}
for token_logprobs in logprobs
],
)
for tokens, text, logprobs in json_data
]
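One subtlety in the loader above: JSON object keys are always strings, hence the `int(k)` when rebuilding the per-token maps. Schematically (field name assumes `Logprob` is a dataclass with a `logprob` attribute):

# dumped to JSON:  {"42": {"logprob": -0.5, ...}}
# loaded back as:  {42: Logprob(logprob=-0.5, ...)}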
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
local_asset_server) -> None:
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(
FIXTURE_LOGPROBS_CHAT[model])
def test_chat(
vllm_runner, max_model_len: int, model: str, dtype: str, local_asset_server
) -> None:
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
max_model_len=max_model_len,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
max_model_len=max_model_len,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
) as vllm_model:
outputs = []
@ -180,7 +193,9 @@ def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
for i in range(len(logprobs)):
assert logprobs[i][-1] is None
logprobs[i] = logprobs[i][:-1]
check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
outputs_1_lst=logprobs,
name_0="h100_ref",
name_1="output")
check_logprobs_close(
outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
outputs_1_lst=logprobs,
name_0="h100_ref",
name_1="output",
)

View File

@ -17,14 +17,15 @@ def qwen2_5_vl_chat_template(*query):
return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501
VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
"baby_reading":
qwen2_5_vl_chat_template(
VIDEO_PLACEHOLDER,
"Describe this video with a short sentence ",
"(no more than 20 words)",
),
})
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
{
"baby_reading": qwen2_5_vl_chat_template(
VIDEO_PLACEHOLDER,
"Describe this video with a short sentence ",
"(no more than 20 words)",
),
}
)
@pytest.mark.core_model
@ -33,10 +34,15 @@ VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
video_pruning_rate: float,
num_frames: int, dtype: str,
max_tokens: int) -> None:
def test_qwen2_5_vl_evs_functionality(
vllm_runner,
video_assets,
model,
video_pruning_rate: float,
num_frames: int,
dtype: str,
max_tokens: int,
) -> None:
"""Test EVS (Efficient Video Sampling) functionality with different
pruning rates.
"""
@ -51,19 +57,18 @@ def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
videos = [sampled_vids[0]]
# Initialize model with EVS configuration
with vllm_runner(model,
runner="generate",
max_model_len=4000,
max_num_seqs=1,
dtype=dtype,
limit_mm_per_prompt={"video": 1},
tensor_parallel_size=1,
video_pruning_rate=video_pruning_rate) as vllm_model:
with vllm_runner(
model,
runner="generate",
max_model_len=4000,
max_num_seqs=1,
dtype=dtype,
limit_mm_per_prompt={"video": 1},
tensor_parallel_size=1,
video_pruning_rate=video_pruning_rate,
) as vllm_model:
# Generate output - this should not crash
outputs = vllm_model.generate_greedy(prompts,
max_tokens,
videos=videos)
outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
# Basic validation that we got a response
assert len(outputs) == 1
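Assumed semantics of `video_pruning_rate` (not confirmed here): the fraction of video tokens EVS drops, so a hypothetical rate of 0.75 keeps only about a quarter of the video tokens, which is consistent with the test validating that generation merely succeeds rather than matching exact outputs.

retained_fraction = 1.0 - 0.75  # hypothetical pruning rate
assert retained_fraction == 0.25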
@ -83,10 +88,15 @@ def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
def test_qwen2_5_vl_evs_batched_videos(vllm_runner, video_assets, model,
video_pruning_rate: float,
num_frames: int, dtype: str,
max_tokens: int) -> None:
def test_qwen2_5_vl_evs_batched_videos(
vllm_runner,
video_assets,
model,
video_pruning_rate: float,
num_frames: int,
dtype: str,
max_tokens: int,
) -> None:
"""Test EVS functionality with batched videos.
This test validates that:
@ -102,23 +112,21 @@ def test_qwen2_5_vl_evs_batched_videos(vllm_runner, video_assets, model,
# Test batched videos
prompts = [VIDEO_PROMPTS[0], VIDEO_PROMPTS[0]]
videos = [sampled_vids[0],
sampled_vids[0]] # Use same video twice for testing
videos = [sampled_vids[0], sampled_vids[0]] # Use same video twice for testing
# Initialize model with EVS configuration
with vllm_runner(model,
runner="generate",
max_model_len=4000,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"video": 2},
tensor_parallel_size=1,
video_pruning_rate=video_pruning_rate) as vllm_model:
with vllm_runner(
model,
runner="generate",
max_model_len=4000,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"video": 2},
tensor_parallel_size=1,
video_pruning_rate=video_pruning_rate,
) as vllm_model:
# Generate output - this should not crash
outputs = vllm_model.generate_greedy(prompts,
max_tokens,
videos=videos)
outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
# Basic validation that we got responses for both videos
assert len(outputs) == 2

View File

@ -11,8 +11,13 @@ from PIL import Image
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
PromptVideoInput, VllmRunner)
from ....conftest import (
IMAGE_ASSETS,
VIDEO_ASSETS,
PromptImageInput,
PromptVideoInput,
VllmRunner,
)
from ...utils import check_logprobs_close
@ -34,28 +39,29 @@ def qwen2_vl_chat_template(*query):
return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501
IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
"What is the biggest text's content in this image?",
),
"cherry_blossom":
qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
"What is the season shown in this image? ",
"Reply with a short sentence (no more than 20 words)",
),
})
IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
"What is the biggest text's content in this image?",
),
"cherry_blossom": qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
"What is the season shown in this image? ",
"Reply with a short sentence (no more than 20 words)",
),
}
)
VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
"baby_reading":
qwen2_vl_chat_template(
VIDEO_PLACEHOLDER,
"Describe this video with a short sentence ",
"(no more than 20 words)",
),
})
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
{
"baby_reading": qwen2_vl_chat_template(
VIDEO_PLACEHOLDER,
"Describe this video with a short sentence ",
"(no more than 20 words)",
),
}
)
MULTIIMAGE_PROMPT = qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
@ -77,17 +83,19 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
def batch_make_image_embeddings(
image_batches: list[Union[Image.Image, list[Image.Image]]], processor,
llm: VllmRunner) -> list[Qwen2VLPromptImageEmbeddingInput]:
image_batches: list[Union[Image.Image, list[Image.Image]]],
processor,
llm: VllmRunner,
) -> list[Qwen2VLPromptImageEmbeddingInput]:
"""batched image embeddings for Qwen2-VL
This will infer all images' embeddings in a single batch,
and split the result according to input batches.
image_batches:
- Single-image batches: `list[Image.Image]`
- Multiple-image batches: `list[list[Image.Image]]`
returns: `list[Qwen2VLPromptImageEmbeddingInput]`
"""
@ -108,9 +116,9 @@ def batch_make_image_embeddings(
# image to pixel values
image_processor = processor.image_processor
preprocess_result = image_processor \
.preprocess(images=images, return_tensors="pt") \
.data
preprocess_result = image_processor.preprocess(
images=images, return_tensors="pt"
).data
pixel_values = preprocess_result["pixel_values"]
image_grid_thw = preprocess_result["image_grid_thw"]
@ -119,12 +127,13 @@ def batch_make_image_embeddings(
with torch.no_grad():
visual = model.visual
pixel_values_on_device = pixel_values.to(visual.device,
dtype=visual.dtype)
image_grid_thw_on_device = image_grid_thw.to(visual.device,
dtype=torch.int64)
return visual(pixel_values_on_device,
grid_thw=image_grid_thw_on_device).cpu()
pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
image_grid_thw_on_device = image_grid_thw.to(
visual.device, dtype=torch.int64
)
return visual(
pixel_values_on_device, grid_thw=image_grid_thw_on_device
).cpu()
image_embeds = torch.concat(llm.apply_model(get_image_embeds))
@ -137,16 +146,21 @@ def batch_make_image_embeddings(
merge_size = image_processor.merge_size
cur_batch_embed_len = sum(
grid_thw.prod(-1) // merge_size // merge_size
for grid_thw in image_grid_thw[image_counter:image_counter +
cur_batch_image_count])
for grid_thw in image_grid_thw[
image_counter : image_counter + cur_batch_image_count
]
)
result.append({
"image_embeds":
image_embeds[embed_counter:embed_counter + cur_batch_embed_len],
"image_grid_thw":
image_grid_thw[image_counter:image_counter +
cur_batch_image_count],
})
result.append(
{
"image_embeds": image_embeds[
embed_counter : embed_counter + cur_batch_embed_len
],
"image_grid_thw": image_grid_thw[
image_counter : image_counter + cur_batch_image_count
],
}
)
embed_counter += cur_batch_embed_len
image_counter += cur_batch_image_count
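The slice bookkeeping above works because each image contributes exactly prod(grid_thw) // merge_size**2 rows to image_embeds; the video variant later in this file uses the same arithmetic. A small numeric sketch (grid values are illustrative, not from a real asset):

import torch

image_grid_thw = torch.tensor([[1, 28, 28], [1, 56, 28]])  # (t, h, w) per image
merge_size = 2  # Qwen2-VL merges 2x2 patches into one embedding token

lengths = [int(g.prod(-1)) // merge_size // merge_size for g in image_grid_thw]
assert lengths == [196, 392]  # embedding rows consumed per image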
@ -160,13 +174,13 @@ def batch_make_image_embeddings(
def batch_make_video_embeddings(
video_batches: PromptVideoInput, processor,
llm: VllmRunner) -> list[Qwen2VLPromptVideoEmbeddingInput]:
video_batches: PromptVideoInput, processor, llm: VllmRunner
) -> list[Qwen2VLPromptVideoEmbeddingInput]:
"""batched video embeddings for Qwen2-VL
An NDArray represents all frames of a single video.
This will infer all videos' embeddings in a single batch,
and split the result according to input batches.
video_batches:
@ -191,9 +205,9 @@ def batch_make_video_embeddings(
# video to pixel values
image_processor = processor.image_processor
preprocess_result = image_processor \
.preprocess(images=None, videos=videos, return_tensors="pt") \
.data
preprocess_result = image_processor.preprocess(
images=None, videos=videos, return_tensors="pt"
).data
pixel_values = preprocess_result["pixel_values_videos"]
video_grid_thw = preprocess_result["video_grid_thw"]
@ -202,12 +216,13 @@ def batch_make_video_embeddings(
with torch.no_grad():
visual = model.visual
pixel_values_on_device = pixel_values.to(visual.device,
dtype=visual.dtype)
video_grid_thw_on_device = video_grid_thw.to(visual.device,
dtype=torch.int64)
return visual(pixel_values_on_device,
grid_thw=video_grid_thw_on_device).cpu()
pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
video_grid_thw_on_device = video_grid_thw.to(
visual.device, dtype=torch.int64
)
return visual(
pixel_values_on_device, grid_thw=video_grid_thw_on_device
).cpu()
video_embeds = torch.concat(llm.apply_model(get_image_embeds))
@ -220,16 +235,21 @@ def batch_make_video_embeddings(
merge_size = image_processor.merge_size
cur_batch_embed_len = sum(
grid_thw.prod(-1) // merge_size // merge_size
for grid_thw in video_grid_thw[video_counter:video_counter +
cur_batch_video_count])
for grid_thw in video_grid_thw[
video_counter : video_counter + cur_batch_video_count
]
)
result.append({
"video_embeds":
video_embeds[embed_counter:embed_counter + cur_batch_embed_len],
"video_grid_thw":
video_grid_thw[video_counter:video_counter +
cur_batch_video_count],
})
result.append(
{
"video_embeds": video_embeds[
embed_counter : embed_counter + cur_batch_embed_len
],
"video_grid_thw": video_grid_thw[
video_counter : video_counter + cur_batch_video_count
],
}
)
embed_counter += cur_batch_embed_len
video_counter += cur_batch_video_count
@ -263,25 +283,24 @@ def run_embedding_input_test(
# max_model_len should be greater than image_feature_size
with vllm_runner(
model,
runner="generate",
max_model_len=4000,
max_num_seqs=3,
dtype=dtype,
limit_mm_per_prompt={
"image": mm_limit,
"video": mm_limit
},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
default_torch_num_threads=1,
model,
runner="generate",
max_model_len=4000,
max_num_seqs=3,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit, "video": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
default_torch_num_threads=1,
) as vllm_model:
outputs_per_case_for_original_input = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images or None,
videos=videos or None)
vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images or None,
videos=videos or None,
)
for prompts, images, videos in inputs
]
@ -290,17 +309,19 @@ def run_embedding_input_test(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=batch_make_image_embeddings(
images, processor, vllm_model) if images else None,
videos=batch_make_video_embeddings(
videos, processor, vllm_model) if videos else None)
images=batch_make_image_embeddings(images, processor, vllm_model)
if images
else None,
videos=batch_make_video_embeddings(videos, processor, vllm_model)
if videos
else None,
)
for prompts, images, videos in inputs
]
for outputs_for_original_input, \
outputs_for_embeddings_input \
in zip(outputs_per_case_for_original_input,
outputs_per_case_for_embeddings_input):
for outputs_for_original_input, outputs_for_embeddings_input in zip(
outputs_per_case_for_original_input, outputs_per_case_for_embeddings_input
):
check_logprobs_close(
outputs_0_lst=outputs_for_original_input,
outputs_1_lst=outputs_for_embeddings_input,
@ -325,17 +346,26 @@ def run_embedding_input_test(
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
size_factors, dtype, max_tokens,
num_logprobs, monkeypatch) -> None:
def test_qwen2_vl_image_embeddings_input(
vllm_runner,
image_assets,
model,
size_factors,
dtype,
max_tokens,
num_logprobs,
monkeypatch,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case: list[tuple[
list[str], PromptImageInput, PromptVideoInput]] = [(
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
[],
) for image, prompt in zip(images, IMAGE_PROMPTS)]
)
for image, prompt in zip(images, IMAGE_PROMPTS)
]
run_embedding_input_test(
vllm_runner,
@ -366,21 +396,27 @@ def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
model, size_factors,
dtype: str, max_tokens: int,
num_logprobs: int) -> None:
def test_qwen2_vl_multiple_image_embeddings_input(
vllm_runner,
image_assets,
model,
size_factors,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case: list[tuple[list[str], PromptImageInput,
PromptVideoInput]] = [(
[MULTIIMAGE_PROMPT for _ in size_factors],
[[
rescale_image_size(image, factor)
for image in images
] for factor in size_factors],
[],
)]
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
(
[MULTIIMAGE_PROMPT for _ in size_factors],
[
[rescale_image_size(image, factor) for image in images]
for factor in size_factors
],
[],
)
]
run_embedding_input_test(
vllm_runner,
@ -410,22 +446,29 @@ def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model,
size_factors, dtype: str,
max_tokens: int,
num_logprobs: int) -> None:
def test_qwen2_vl_video_embeddings_input(
vllm_runner,
video_assets,
model,
size_factors,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
num_frames = 4
sampled_vids = [
sample_frames_from_video(asset.np_ndarrays, num_frames)
for asset in video_assets
]
inputs_per_case: list[tuple[
list[str], PromptImageInput, PromptVideoInput]] = [(
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
(
[prompt for _ in size_factors],
[],
[rescale_video_size(video, factor) for factor in size_factors],
) for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)]
)
for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)
]
run_embedding_input_test(
vllm_runner,

View File

@ -15,12 +15,12 @@ from ...registry import HF_EXAMPLE_MODELS
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
AUDIO_PROMPTS = AUDIO_ASSETS.prompts({
"mary_had_lamb":
"Transcribe this into English.",
"winning_call":
"What is happening in this audio clip?",
})
AUDIO_PROMPTS = AUDIO_ASSETS.prompts(
{
"mary_had_lamb": "Transcribe this into English.",
"winning_call": "What is happening in this audio clip?",
}
)
MULTI_AUDIO_PROMPT = "Describe each of the audios above."
@ -33,7 +33,7 @@ CHUNKED_PREFILL_KWARGS = {
"enable_chunked_prefill": True,
"max_num_seqs": 2,
# Use a very small limit to exercise chunked prefill.
"max_num_batched_tokens": 16
"max_num_batched_tokens": 16,
}
@ -43,27 +43,33 @@ def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
for key, value in params_kwargs.items():
if isinstance(value, bool):
if value:
args.append(f"--{key.replace('_','-')}")
args.append(f"--{key.replace('_', '-')}")
else:
args.append(f"--{key.replace('_','-')}={value}")
args.append(f"--{key.replace('_', '-')}={value}")
return args
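For concreteness, this is what the helper yields for the CHUNKED_PREFILL_KWARGS defined above (the helper body is repeated so the snippet runs on its own):

def params_kwargs_to_cli_args(params_kwargs: dict) -> list[str]:
    # Mirror of the helper above: true bools become bare flags, the rest key=value.
    args = []
    for key, value in params_kwargs.items():
        if isinstance(value, bool):
            if value:
                args.append(f"--{key.replace('_', '-')}")
        else:
            args.append(f"--{key.replace('_', '-')}={value}")
    return args

kwargs = {"enable_chunked_prefill": True, "max_num_seqs": 2, "max_num_batched_tokens": 16}
assert params_kwargs_to_cli_args(kwargs) == [
    "--enable-chunked-prefill",
    "--max-num-seqs=2",
    "--max-num-batched-tokens=16",
]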
@pytest.fixture(params=[
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
])
@pytest.fixture(
params=[
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
]
)
def server(request, audio_assets: AudioTestAssets):
args = [
"--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager",
"--dtype",
"bfloat16",
"--max-model-len",
"4096",
"--enforce-eager",
"--limit-mm-per-prompt",
json.dumps({"audio": len(audio_assets)}), "--trust-remote-code"
json.dumps({"audio": len(audio_assets)}),
"--trust-remote-code",
] + params_kwargs_to_cli_args(request.param)
with RemoteOpenAIServer(MODEL_NAME,
args,
env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
"30"}) as remote_server:
with RemoteOpenAIServer(
MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
) as remote_server:
yield remote_server
@ -77,12 +83,11 @@ def _get_prompt(audio_count, question, placeholder):
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
placeholder = f"{placeholder}\n" * audio_count
return tokenizer.apply_chat_template([{
'role': 'user',
'content': f"{placeholder}{question}"
}],
tokenize=False,
add_generation_prompt=True)
return tokenizer.apply_chat_template(
[{"role": "user", "content": f"{placeholder}{question}"}],
tokenize=False,
add_generation_prompt=True,
)
def run_multi_audio_test(
@ -99,19 +104,21 @@ def run_multi_audio_test(
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
with vllm_runner(model,
dtype=dtype,
enforce_eager=True,
limit_mm_per_prompt={
"audio":
max((len(audio) for _, audio in prompts_and_audios))
},
**kwargs) as vllm_model:
with vllm_runner(
model,
dtype=dtype,
enforce_eager=True,
limit_mm_per_prompt={
"audio": max((len(audio) for _, audio in prompts_and_audios))
},
**kwargs,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
[prompt for prompt, _ in prompts_and_audios],
max_tokens,
num_logprobs=num_logprobs,
audios=[audios for _, audios in prompts_and_audios])
audios=[audios for _, audios in prompts_and_audios],
)
# The HuggingFace model doesn't support multiple audios yet, so
# just assert that some tokens were generated.
@ -122,21 +129,25 @@ def run_multi_audio_test(
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("vllm_kwargs", [
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
])
def test_models_with_multiple_audios(vllm_runner,
audio_assets: AudioTestAssets, dtype: str,
max_tokens: int, num_logprobs: int,
vllm_kwargs: dict) -> None:
vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT,
VLLM_PLACEHOLDER)
@pytest.mark.parametrize(
"vllm_kwargs",
[
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
],
)
def test_models_with_multiple_audios(
vllm_runner,
audio_assets: AudioTestAssets,
dtype: str,
max_tokens: int,
num_logprobs: int,
vllm_kwargs: dict,
) -> None:
vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT, VLLM_PLACEHOLDER)
run_multi_audio_test(
vllm_runner,
[(vllm_prompt, [audio.audio_and_sample_rate
for audio in audio_assets])],
[(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
MODEL_NAME,
dtype=dtype,
max_tokens=max_tokens,
@ -149,28 +160,25 @@ def test_models_with_multiple_audios(vllm_runner,
async def test_online_serving(client, audio_assets: AudioTestAssets):
"""Exercises online serving with/without chunked prefill enabled."""
messages = [{
"role":
"user",
"content": [
*[{
"type": "audio_url",
"audio_url": {
"url": audio.url
}
} for audio in audio_assets],
{
"type":
"text",
"text":
f"What's happening in these {len(audio_assets)} audio clips?"
},
],
}]
messages = [
{
"role": "user",
"content": [
*[
{"type": "audio_url", "audio_url": {"url": audio.url}}
for audio in audio_assets
],
{
"type": "text",
"text": f"What's happening in these {len(audio_assets)} audio clips?",
},
],
}
]
chat_completion = await client.chat.completions.create(model=MODEL_NAME,
messages=messages,
max_tokens=10)
chat_completion = await client.chat.completions.create(
model=MODEL_NAME, messages=messages, max_tokens=10
)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]

View File

@ -6,8 +6,12 @@ import json
import pytest
import pytest_asyncio
from mistral_common.audio import Audio
from mistral_common.protocol.instruct.messages import (AudioChunk, RawAudio,
TextChunk, UserMessage)
from mistral_common.protocol.instruct.messages import (
AudioChunk,
RawAudio,
TextChunk,
UserMessage,
)
from vllm.transformers_utils.tokenizer import MistralTokenizer
@ -17,8 +21,12 @@ from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test
MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"
MISTRAL_FORMAT_ARGS = [
"--tokenizer_mode", "mistral", "--config_format", "mistral",
"--load_format", "mistral"
"--tokenizer_mode",
"mistral",
"--config_format",
"mistral",
"--load_format",
"mistral",
]
@ -30,10 +38,9 @@ def server(request, audio_assets: AudioTestAssets):
json.dumps({"audio": len(audio_assets)}),
] + MISTRAL_FORMAT_ARGS
with RemoteOpenAIServer(MODEL_NAME,
args,
env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
"30"}) as remote_server:
with RemoteOpenAIServer(
MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
) as remote_server:
yield remote_server
@ -64,15 +71,17 @@ def _get_prompt(audio_assets, question):
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_with_multiple_audios(vllm_runner,
audio_assets: AudioTestAssets, dtype: str,
max_tokens: int,
num_logprobs: int) -> None:
def test_models_with_multiple_audios(
vllm_runner,
audio_assets: AudioTestAssets,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
run_multi_audio_test(
vllm_runner,
[(vllm_prompt, [audio.audio_and_sample_rate
for audio in audio_assets])],
[(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
MODEL_NAME,
dtype=dtype,
max_tokens=max_tokens,
@ -92,23 +101,22 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
return audio_dict
audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
messages = [{
"role":
"user",
"content": [
*audio_chunks,
{
"type":
"text",
"text":
f"What's happening in these {len(audio_assets)} audio clips?"
},
],
}]
messages = [
{
"role": "user",
"content": [
*audio_chunks,
{
"type": "text",
"text": f"What's happening in these {len(audio_assets)} audio clips?",
},
],
}
]
chat_completion = await client.chat.completions.create(model=MODEL_NAME,
messages=messages,
max_tokens=10)
chat_completion = await client.chat.completions.create(
model=MODEL_NAME, messages=messages, max_tokens=10
)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]

View File

@ -12,8 +12,7 @@ from ....utils import create_new_process_for_each_test, multi_gpu_test
PROMPTS = [
{
"prompt":
"<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
"prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
"multi_modal_data": {
"audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
},
@ -25,9 +24,8 @@ PROMPTS = [
"audio": AudioAsset("winning_call").audio_and_sample_rate,
},
},
"decoder_prompt":
"<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
}
"decoder_prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
},
]
EXPECTED = {
@ -41,7 +39,7 @@ EXPECTED = {
" is June and the third base. They're going to wave him in. The throw"
" to the plate will be late. The Mariners are going to play for the"
" American League Championship. I don't believe it. It just continues"
" by all five."
" by all five.",
],
"openai/whisper-small": [
" The first words I spoke in the original pornograph. A little piece"
@ -51,7 +49,7 @@ EXPECTED = {
" comes joy. Here is Junior to third base. They're gonna wave him"
" in. The throw to the plate will be late. The Mariners are going to"
" play for the American League Championship. I don't believe it. It"
" just continues. My, oh my."
" just continues. My, oh my.",
],
"openai/whisper-medium": [
" The first words I spoke in the original phonograph, a little piece"
@ -62,7 +60,7 @@ EXPECTED = {
" Jorgen at third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh"
" my."
" my.",
],
"openai/whisper-large-v3": [
" The first words I spoke in the original phonograph, a little piece"
@ -73,7 +71,7 @@ EXPECTED = {
" Junior to third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh,"
" my."
" my.",
],
"openai/whisper-large-v3-turbo": [
" The first words I spoke in the original phonograph, a little piece"
@ -84,8 +82,8 @@ EXPECTED = {
" Junior to third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh,"
" my."
]
" my.",
],
}
@ -100,11 +98,11 @@ def run_test(
expected_list = EXPECTED[model] * 10
with vllm_runner(
model,
dtype="half",
max_model_len=448,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
model,
dtype="half",
max_model_len=448,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
llm = vllm_model.llm

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Helpers for building inputs that can be leveraged for different test types.
"""
"""Helpers for building inputs that can be leveraged for different test types."""
from collections.abc import Iterable
from pathlib import PosixPath
from typing import Callable, Optional, Union
@ -10,20 +10,30 @@ import torch
from vllm.multimodal.audio import AudioResampler
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (rescale_video_size, resize_video,
sample_frames_from_video)
from vllm.multimodal.video import (
rescale_video_size,
resize_video,
sample_frames_from_video,
)
from .....conftest import AudioTestAssets, ImageTestAssets, VideoTestAssets
from .types import (SINGLE_AUDIO_BASE_PROMPT, SINGLE_IMAGE_BASE_PROMPTS,
TEST_AUDIO_PLACEHOLDER, TEST_IMG_PLACEHOLDER,
TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT,
ImageSizeWrapper, PromptWithMultiModalInput, SizeType,
VLMTestInfo)
from .types import (
SINGLE_AUDIO_BASE_PROMPT,
SINGLE_IMAGE_BASE_PROMPTS,
TEST_AUDIO_PLACEHOLDER,
TEST_IMG_PLACEHOLDER,
TEST_VIDEO_PLACEHOLDER,
VIDEO_BASE_PROMPT,
ImageSizeWrapper,
PromptWithMultiModalInput,
SizeType,
VLMTestInfo,
)
def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int],
str],
test_placeholder: str) -> str:
def replace_test_placeholder(
prompt: str, mm_idx_to_prompt: Callable[[int], str], test_placeholder: str
) -> str:
"""Given a prompt, replaces each test placeholder with the
model-specific tag.
"""
@ -35,11 +45,13 @@ def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int],
return img_prompt
def get_model_prompts(base_prompts: Iterable[str],
img_idx_to_prompt: Optional[Callable[[int], str]],
video_idx_to_prompt: Optional[Callable[[int], str]],
audio_idx_to_prompt: Optional[Callable[[int], str]],
prompt_formatter: Callable[[str], str]) -> list[str]:
def get_model_prompts(
base_prompts: Iterable[str],
img_idx_to_prompt: Optional[Callable[[int], str]],
video_idx_to_prompt: Optional[Callable[[int], str]],
audio_idx_to_prompt: Optional[Callable[[int], str]],
prompt_formatter: Callable[[str], str],
) -> list[str]:
"""Given a model-agnostic base prompt and test configuration for a model(s)
to be tested, update the media placeholders and apply the prompt formatting
to get the test prompt string for this model.
@ -56,19 +68,19 @@ def get_model_prompts(base_prompts: Iterable[str],
# Replace the multimodal placeholders in the base prompt with
# the correct ones for the model that we are testing
if img_idx_to_prompt:
base_prompt = replace_test_placeholder(base_prompt,
img_idx_to_prompt,
TEST_IMG_PLACEHOLDER)
base_prompt = replace_test_placeholder(
base_prompt, img_idx_to_prompt, TEST_IMG_PLACEHOLDER
)
if video_idx_to_prompt:
base_prompt = replace_test_placeholder(base_prompt,
video_idx_to_prompt,
TEST_VIDEO_PLACEHOLDER)
base_prompt = replace_test_placeholder(
base_prompt, video_idx_to_prompt, TEST_VIDEO_PLACEHOLDER
)
if audio_idx_to_prompt:
base_prompt = replace_test_placeholder(base_prompt,
audio_idx_to_prompt,
TEST_AUDIO_PLACEHOLDER)
base_prompt = replace_test_placeholder(
base_prompt, audio_idx_to_prompt, TEST_AUDIO_PLACEHOLDER
)
# Apply the prompt formatter to wrap the base prompt with
# the correct media placeholders to get the model test prompt
@ -84,14 +96,15 @@ def build_single_image_inputs_from_test_info(
tmp_path: Optional[PosixPath] = None,
) -> list[PromptWithMultiModalInput]:
if test_info.prompt_formatter is None:
raise ValueError(
"Prompt formatter must be set to build single image inputs")
raise ValueError("Prompt formatter must be set to build single image inputs")
model_prompts = get_model_prompts(test_info.single_image_prompts,
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter)
model_prompts = get_model_prompts(
test_info.single_image_prompts,
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
# For models that require a local path / URL encoded in the image; export
# assets and encode into tmp_path for this test. This should be avoided
@ -110,8 +123,8 @@ def build_single_image_inputs_from_test_info(
def build_single_image_inputs(
images, model_prompts,
size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
images, model_prompts, size_wrapper: ImageSizeWrapper
) -> list[PromptWithMultiModalInput]:
# For every image / prompt pair, get a pair containing two lists of
# length size_factors, where the first contains duplicates of the model
# prompt [str], and the second contains copies of the image after being
@ -125,7 +138,8 @@ def build_single_image_inputs(
apply_image_size_scaling(image, size, size_wrapper.type)
for size in size_wrapper.data
],
) for image, prompt in zip(images, model_prompts)
)
for image, prompt in zip(images, model_prompts)
]
@ -136,14 +150,15 @@ def build_multi_image_inputs_from_test_info(
tmp_path: Optional[PosixPath] = None,
) -> list[PromptWithMultiModalInput]:
if test_info.prompt_formatter is None:
raise ValueError(
"Prompt formatter must be set to build multi image inputs")
raise ValueError("Prompt formatter must be set to build multi image inputs")
model_prompts = get_model_prompts([test_info.multi_image_prompt],
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter)
model_prompts = get_model_prompts(
[test_info.multi_image_prompt],
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
if test_info.prompt_path_encoder is not None:
if tmp_path is None:
@ -164,16 +179,20 @@ def build_multi_image_inputs_from_test_info(
def build_multi_image_inputs(
image_lists, model_prompts,
size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
image_lists, model_prompts, size_wrapper: ImageSizeWrapper
) -> list[PromptWithMultiModalInput]:
return [
PromptWithMultiModalInput(
prompts=[prompt for _ in size_wrapper.data],
image_data=[[
apply_image_size_scaling(image, size, size_wrapper.type)
for image in images
] for size in size_wrapper.data],
) for images, prompt in zip(image_lists, model_prompts)
image_data=[
[
apply_image_size_scaling(image, size, size_wrapper.type)
for image in images
]
for size in size_wrapper.data
],
)
for images, prompt in zip(image_lists, model_prompts)
]
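Shape-wise, the comprehension above yields one input per (images, prompt) pair, with the prompt duplicated once per size and every image scaled at every size. A shape-only sketch with the scaling swapped for a tuple (toy names, not the real builder):

def toy_multi_image_inputs(image_lists, model_prompts, sizes):
    # Mirrors only the nesting of build_multi_image_inputs, not the scaling.
    return [
        {
            "prompts": [prompt for _ in sizes],
            "image_data": [[(image, size) for image in images] for size in sizes],
        }
        for images, prompt in zip(image_lists, model_prompts)
    ]

out = toy_multi_image_inputs([["img_a", "img_b"]], ["prompt"], [0.5, 1.0])
assert len(out[0]["prompts"]) == 2        # one prompt copy per size
assert len(out[0]["image_data"][0]) == 2  # every image present at each size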
@ -185,10 +204,10 @@ def build_embedding_inputs_from_test_info(
# These conditions will always be true if invoked through filtering,
# but we still check them in case this is ever called directly
if test_info.prompt_formatter is None:
raise ValueError(
"Prompt formatter must be set to build image embedding inputs")
if size_wrapper.type != SizeType.SIZE_FACTOR or not \
all(factor == 1.0 for factor in size_wrapper.data):
raise ValueError("Prompt formatter must be set to build image embedding inputs")
if size_wrapper.type != SizeType.SIZE_FACTOR or not all(
factor == 1.0 for factor in size_wrapper.data
):
raise ValueError("Embedding tests require constant (1.0) size factors")
if test_info.convert_assets_to_embeddings is None:
raise ValueError("No conversion func for getting embeddings found")
@ -209,8 +228,7 @@ def build_embedding_inputs_from_test_info(
assert len(images) == len(model_prompts)
inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
vllm_embeddings = build_single_image_inputs(embeds, model_prompts,
size_wrapper)
vllm_embeddings = build_single_image_inputs(embeds, model_prompts, size_wrapper)
return inputs, vllm_embeddings
@ -235,21 +253,22 @@ def build_video_inputs_from_test_info(
for asset in video_assets
]
video_scaler = (resize_video if size_wrapper.type == SizeType.FIXED_SIZE
else rescale_video_size)
video_scaler = (
resize_video if size_wrapper.type == SizeType.FIXED_SIZE else rescale_video_size
)
return [
PromptWithMultiModalInput(
prompts=[prompt for _ in size_wrapper.data],
video_data=[
video_scaler(video, size) for size in size_wrapper.data
],
) for video, prompt in zip(sampled_vids, model_prompts)
video_data=[video_scaler(video, size) for size in size_wrapper.data],
)
for video, prompt in zip(sampled_vids, model_prompts)
]
def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
size_type: SizeType):
def apply_image_size_scaling(
image, size: Union[float, tuple[int, int]], size_type: SizeType
):
"""Applies a size scaler to one image; this can be an image size factor,
which scales the image while maintaining the aspect ratio"""
# Special case for embeddings; if it's a tensor, it's only valid if we
@ -285,13 +304,16 @@ def build_audio_inputs_from_test_info(
method="librosa",
)
audios = [asset.audio_and_sample_rate for asset in audio_assets]
resampled_audios = [(
resampler.resample(
audio,
orig_sr=sr,
),
int(resampler.target_sr),
) for audio, sr in audios]
resampled_audios = [
(
resampler.resample(
audio,
orig_sr=sr,
),
int(resampler.target_sr),
)
for audio, sr in audios
]
return [
PromptWithMultiModalInput(

View File

@ -4,19 +4,28 @@
modality, getting all combinations (similar to pytest's parametrization),
handling multimodal placeholder substitution, and so on.
"""
import itertools
from collections import OrderedDict
from collections.abc import Iterable
import pytest
from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
from .types import (
EMBEDDING_SIZE_FACTORS,
ExpandableVLMTestArgs,
ImageSizeWrapper,
SizeType,
VLMTestInfo,
VLMTestType,
)
def get_filtered_test_settings(
test_settings: dict[str, VLMTestInfo], test_type: VLMTestType,
new_proc_per_test: bool) -> dict[str, VLMTestInfo]:
test_settings: dict[str, VLMTestInfo],
test_type: VLMTestType,
new_proc_per_test: bool,
) -> dict[str, VLMTestInfo]:
"""Given the dict of potential test settings to run, return a subdict
of tests that have the current test type enabled and a matching value for
fork_per_test.
@ -25,7 +34,8 @@ def get_filtered_test_settings(
def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType):
return test_info.test_type == test_type or (
isinstance(test_info.test_type, Iterable)
and test_type in test_info.test_type)
and test_type in test_info.test_type
)
matching_tests = {}
for test_name, test_info in test_settings.items():
@ -36,62 +46,69 @@ def get_filtered_test_settings(
assert test_info.convert_assets_to_embeddings is not None
# Custom test inputs need to explicitly define the mm limit/inputs
if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS):
assert (test_info.custom_test_opts is not None
and isinstance(test_info.custom_test_opts, Iterable))
assert test_info.custom_test_opts is not None and isinstance(
test_info.custom_test_opts, Iterable
)
# For all types besides custom inputs, we need a prompt formatter
else:
assert test_info.prompt_formatter is not None
# Everything looks okay; keep if this is correct proc handling
if (test_info.distributed_executor_backend
is not None) == new_proc_per_test:
if (
test_info.distributed_executor_backend is not None
) == new_proc_per_test:
matching_tests[test_name] = test_info
return matching_tests
def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
test_type: VLMTestType,
create_new_process_for_each_test: bool):
def get_parametrized_options(
test_settings: dict[str, VLMTestInfo],
test_type: VLMTestType,
create_new_process_for_each_test: bool,
):
"""Converts all of our VLMTestInfo into an expanded list of parameters.
This is similar to nesting pytest parametrize calls, but done directly
through an itertools product so that each test can set things like
size factors etc., while still running in isolated test cases.
"""
matching_tests = get_filtered_test_settings(
test_settings, test_type, create_new_process_for_each_test)
test_settings, test_type, create_new_process_for_each_test
)
# Ensure that something is wrapped as an iterable if it's not already
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,)
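ensure_wrapped normalizes scalar settings into singleton tuples so the parameter expansion below can treat every field uniformly; a quick check of its behavior:

ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,)

assert ensure_wrapped("half") == ("half",)                     # scalar -> singleton
assert ensure_wrapped(["half", "float"]) == ["half", "float"]  # iterable unchanged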
def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
# This is essentially the same as nesting a bunch of mark.parametrize
# decorators, but we do it programmatically to allow overrides on
# a per-model basis, while still being able to execute each of these
# as individual test cases in pytest.
iter_kwargs = OrderedDict([
("model", ensure_wrapped(test_info.models)),
("max_tokens", ensure_wrapped(test_info.max_tokens)),
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
("dtype", ensure_wrapped(test_info.dtype)),
("distributed_executor_backend",
ensure_wrapped(test_info.distributed_executor_backend)),
])
iter_kwargs = OrderedDict(
[
("model", ensure_wrapped(test_info.models)),
("max_tokens", ensure_wrapped(test_info.max_tokens)),
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
("dtype", ensure_wrapped(test_info.dtype)),
(
"distributed_executor_backend",
ensure_wrapped(test_info.distributed_executor_backend),
),
]
)
# num_frames is video only
if test_type == VLMTestType.VIDEO:
iter_kwargs["num_video_frames"] = ensure_wrapped(
test_info.num_video_frames)
iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
# No sizes passed for custom inputs, since inputs are directly provided
if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO):
wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
if wrapped_sizes is None:
raise ValueError(
f"Sizes must be set for test type {test_type}")
raise ValueError(f"Sizes must be set for test type {test_type}")
iter_kwargs["size_wrapper"] = wrapped_sizes
#Otherwise expand the custom test options instead
# Otherwise expand the custom test options instead
elif test_type == VLMTestType.CUSTOM_INPUTS:
if test_info.custom_test_opts is None:
raise ValueError("Test has type CUSTOM_INPUTS, but none given")
@ -121,8 +138,8 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
def get_wrapped_test_sizes(
test_info: VLMTestInfo,
test_type: VLMTestType) -> tuple[ImageSizeWrapper, ...]:
test_info: VLMTestInfo, test_type: VLMTestType
) -> tuple[ImageSizeWrapper, ...]:
"""Given a test info which may have size factors or fixed sizes, wrap them
and combine them into an iterable, each of which will be used in parameter
expansion.
@ -133,18 +150,18 @@ def get_wrapped_test_sizes(
"""
# If it is an embedding test, we always use the EMBEDDING_SIZE_FACTORS
if test_type == VLMTestType.EMBEDDING:
return tuple([
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
for factor in EMBEDDING_SIZE_FACTORS
])
return tuple(
[
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
for factor in EMBEDDING_SIZE_FACTORS
]
)
# Audio and Custom inputs have preprocessed inputs
elif test_type in (VLMTestType.AUDIO, VLMTestType.CUSTOM_INPUTS):
return tuple()
size_factors = test_info.image_size_factors \
if test_info.image_size_factors else []
fixed_sizes = test_info.image_sizes \
if test_info.image_sizes else []
size_factors = test_info.image_size_factors if test_info.image_size_factors else []
fixed_sizes = test_info.image_sizes if test_info.image_sizes else []
wrapped_factors = [
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
@ -152,8 +169,7 @@ def get_wrapped_test_sizes(
]
wrapped_sizes = [
ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size)
for size in fixed_sizes
ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size) for size in fixed_sizes
]
return tuple(wrapped_factors + wrapped_sizes)
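As a quick illustration of the wrapping above, with minimal stand-ins for the ImageSizeWrapper and SizeType defined in .types (the factor and size values are hypothetical):

from dataclasses import dataclass
from enum import Enum
from typing import Union

class SizeType(Enum):  # minimal stand-in for .types.SizeType
    SIZE_FACTOR = "size_factor"
    FIXED_SIZE = "fixed_size"

@dataclass
class ImageSizeWrapper:  # minimal stand-in for .types.ImageSizeWrapper
    type: SizeType
    data: Union[float, tuple[int, int]]

size_factors = [0.25, 1.0]  # hypothetical per-test scale factors
fixed_sizes = [(488, 183)]  # hypothetical fixed (width, height)

wrapped = tuple(
    [ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=f) for f in size_factors]
    + [ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=s) for s in fixed_sizes]
)
assert len(wrapped) == 3  # each wrapper expands into one parametrized case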

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Core test implementation to be shared across modalities."""
from typing import Any, Callable, Optional
import torch
@ -70,22 +71,23 @@ def run_test(
if model_info.hf_overrides:
vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
if model_info.skip_tokenizer_init:
vllm_runner_kwargs_[
"skip_tokenizer_init"] = model_info.skip_tokenizer_init
vllm_runner_kwargs_["skip_tokenizer_init"] = model_info.skip_tokenizer_init
if vllm_runner_kwargs:
vllm_runner_kwargs_.update(vllm_runner_kwargs)
with vllm_runner(model,
max_model_len=max_model_len,
max_num_seqs=max_num_seqs,
dtype=dtype,
limit_mm_per_prompt=limit_mm_per_prompt,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=enforce_eager,
runner=runner,
**vllm_runner_kwargs_) as vllm_model:
with vllm_runner(
model,
max_model_len=max_model_len,
max_num_seqs=max_num_seqs,
dtype=dtype,
limit_mm_per_prompt=limit_mm_per_prompt,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=enforce_eager,
runner=runner,
**vllm_runner_kwargs_,
) as vllm_model:
tokenizer = vllm_model.llm.get_tokenizer()
vllm_kwargs: dict[str, Any] = {}
@ -95,21 +97,19 @@ def run_test(
vllm_kwargs["stop"] = stop_str
for prompts, image_data, video_data, audio_data in vllm_inputs:
mm_data = dict(images=image_data,
videos=video_data,
audios=audio_data)
mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
vllm_kwargs_with_mm_data = vllm_kwargs | mm_data
vllm_output = vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
**vllm_kwargs_with_mm_data)
**vllm_kwargs_with_mm_data,
)
vllm_outputs_per_mm.append(vllm_output)
hf_model = hf_runner(model,
dtype=dtype,
auto_cls=auto_cls,
model_kwargs=hf_model_kwargs)
hf_model = hf_runner(
model, dtype=dtype, auto_cls=auto_cls, model_kwargs=hf_model_kwargs
)
# Some models need to patch things like the model processor, e.g., internvl
if patch_hf_runner is not None:
@ -129,16 +129,15 @@ def run_test(
hf_kwargs["stop_strings"] = stop_str
for prompts, image_data, video_data, audio_data in inputs:
mm_data = dict(images=image_data,
videos=video_data,
audios=audio_data)
mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
hf_kwargs_with_mm_data = hf_kwargs | mm_data
hf_output = hf_model.generate_greedy_logprobs_limit(
prompts,
max_tokens,
num_logprobs=num_logprobs,
tokenizer=tokenizer,
**hf_kwargs_with_mm_data)
**hf_kwargs_with_mm_data,
)
hf_outputs_per_mm.append(hf_output)
# Apply output processing / sanitation to the vLLM and HF runner results
@ -150,8 +149,7 @@ def run_test(
second_runner_processor=vllm_output_post_proc,
)
for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm,
vllm_outputs_per_mm):
for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm, vllm_outputs_per_mm):
# This is usually check_logprobs_close, but it's passed through to
# allow things like check_outputs_equal where needed
comparator(
@ -171,15 +169,19 @@ def process_runner_outputs(
):
"""Applies the runner processor(s) to the runner outputs, if any."""
if first_runner_processor is not None:
first_runner_outputs = process_outputs(first_runner_processor, model,
first_runner_outputs)
first_runner_outputs = process_outputs(
first_runner_processor, model, first_runner_outputs
)
if second_runner_processor is not None:
second_runner_outputs = process_outputs(second_runner_processor, model,
second_runner_outputs)
second_runner_outputs = process_outputs(
second_runner_processor, model, second_runner_outputs
)
return first_runner_outputs, second_runner_outputs
def process_outputs(output_processor, model, outputs_per_image):
"""Applies a model specific post-processor function to a runner's output"""
return [[output_processor(res, model) for res in outputs]
for outputs in outputs_per_image]
return [
[output_processor(res, model) for res in outputs]
for outputs in outputs_per_image
]
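The nested comprehension keeps the per-image grouping intact while applying the processor to every result; a toy run (the upper-casing processor is hypothetical):

def process_outputs(output_processor, model, outputs_per_image):
    # Same shape-preserving map as the function above.
    return [
        [output_processor(res, model) for res in outputs]
        for outputs in outputs_per_image
    ]

to_upper = lambda res, model: res.upper()  # toy processor
assert process_outputs(to_upper, "m", [["a", "b"], ["c"]]) == [["A", "B"], ["C"]]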

View File

@ -1,12 +1,16 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Custom input builders for edge-cases in different models."""
from typing import Callable
from vllm.assets.image import ImageAsset
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (rescale_video_size, resize_video,
sample_frames_from_video)
from vllm.multimodal.video import (
rescale_video_size,
resize_video,
sample_frames_from_video,
)
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
from .builders import build_multi_image_inputs, build_single_image_inputs
@ -15,7 +19,7 @@ from .types import ImageSizeWrapper, PromptWithMultiModalInput, SizeType
def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
"""Builds inputs for multi-image (varied sizes/aspect ratio) testing.
Args:
formatter: model-specific prompt formatter.
"""
@ -41,7 +45,7 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
stop_sign,
rescale_image_size(stop_sign, 0.25),
cherry_blossom.resize((183, 488)),
cherry_blossom.resize((488, 183))
cherry_blossom.resize((488, 183)),
],
cherry_blossom,
]
@ -54,10 +58,11 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
]
def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
num_frames: int = 16):
def multi_video_multi_aspect_ratio_inputs(
formatter: Callable[[str], str], num_frames: int = 16
):
"""Builds inputs for multi-video (varied sizes/aspect ratio) testing.
Args:
formatter: model-specific prompt formatter.
"""
@ -81,7 +86,7 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
video,
rescale_video_size(video, 0.25),
resize_video(video, (183, 488)),
resize_video(video, (488, 183))
resize_video(video, (488, 183)),
],
video,
]
@ -96,7 +101,9 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
def different_patch_input_cases_internvl():
images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
formatter = lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
formatter = (
lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n"
) # noqa: E501
single_img_prompts = [
"<image>\nWhat's the content in the center of the image?",
"<image>\nWhat is the season?",
@ -115,14 +122,14 @@ def different_patch_input_cases_internvl():
def windows_attention_image_qwen2_5_vl():
# image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501
image = ImageAsset("hato").pil_image
question = "Describe the image."
img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
prompt = (f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n")
prompt = (
f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n<|im_start|>assistant\n"
)
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
return build_single_image_inputs([image], [prompt], wrapped_sf)
@ -136,8 +143,9 @@ def video_with_metadata_glm4_1v():
formatted_prompt = f"<|user|>\n{video_prompt}{question}<|assistant|>\n"
scales = [0.1, 0.2, 0.25]
video_input = [[(rescale_video_size(video_array, scale), metadata)]
for scale in scales]
video_input = [
[(rescale_video_size(video_array, scale), metadata)] for scale in scales
]
prompts = [formatted_prompt] * len(video_input)
return [

View File

@ -4,6 +4,7 @@
for manipulating the input / output of HF & vLLM test runners, which are
typically specific to a small subset of models.
"""
import types
from pathlib import PosixPath
from typing import Optional, Union
@ -15,8 +16,13 @@ import pytest
import regex as re
import torch
from PIL.Image import Image
from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
GenerationConfig, GenerationMixin)
from transformers import (
AutoConfig,
AutoTokenizer,
BatchFeature,
GenerationConfig,
GenerationMixin,
)
from transformers.video_utils import VideoMetadata
from vllm.logprobs import SampleLogprobs
@ -27,8 +33,7 @@ from .types import RunnerOutput
####### vLLM output processors functions
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
"""Sanitize vllm output [blip2 models] to be comparable with hf output."""
_, output_str, out_logprobs = vllm_output
@ -42,8 +47,7 @@ def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
return hf_output_ids, hf_output_str, out_logprobs
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
"""Sanitize vllm output [fuyu models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@ -53,8 +57,8 @@ def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
def qwen_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
vllm_output: RunnerOutput, model: str
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
"""Sanitize vllm output [qwen models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@ -64,8 +68,8 @@ def qwen_vllm_to_hf_output(
def qwen2_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
vllm_output: RunnerOutput, model: str
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
"""Sanitize vllm output [qwen2 models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@ -75,8 +79,8 @@ def qwen2_vllm_to_hf_output(
def kimiv_vl_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
vllm_output: RunnerOutput, model: str
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
"""Sanitize vllm output [kimi_vl models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@ -85,23 +89,25 @@ def kimiv_vl_vllm_to_hf_output(
return output_ids, hf_output_str, out_logprobs
def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def llava_image_vllm_to_hf_output(
vllm_output: RunnerOutput, model: str
) -> RunnerOutput:
config = AutoConfig.from_pretrained(model)
mm_token_id = config.image_token_index
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
def llava_video_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
vllm_output: RunnerOutput, model: str
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
config = AutoConfig.from_pretrained(model)
mm_token_id = config.video_token_index
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
mm_token_id: int) -> RunnerOutput:
def _llava_vllm_to_hf_output(
vllm_output: RunnerOutput, model: str, mm_token_id: int
) -> RunnerOutput:
"""Sanitize vllm output [Llava models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@ -109,7 +115,8 @@ def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
eos_token_id = tokenizer.eos_token_id
hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
token_id
for idx, token_id in enumerate(output_ids)
if token_id != mm_token_id or output_ids[idx - 1] != mm_token_id
]
@ -128,8 +135,9 @@ def llava_onevision_hf_model_kwargs(model: str) -> dict:
return config.to_dict()
def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def llava_onevision_vllm_to_hf_output(
vllm_output: RunnerOutput, model: str
) -> RunnerOutput:
"""Sanitize vllm output [llava-onevision] to compare with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@ -140,7 +148,8 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
eos_token_id = tokenizer.eos_token_id
hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
token_id
for idx, token_id in enumerate(output_ids)
if token_id != video_token_id or output_ids[idx - 1] != video_token_id
]
@ -151,8 +160,7 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
return hf_output_ids, hf_output_str, out_logprobs
def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def mantis_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
"""Sanitize vllm output [mantis] to compare with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@ -161,8 +169,7 @@ def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
return output_ids, hf_output_str, out_logprobs
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
"""Sanitize vllm output [phi3v] to be comparable with hf output."""
_, output_str, out_logprobs = vllm_output
@ -180,8 +187,7 @@ def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
return hf_output_ids, hf_output_str, out_logprobs
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
"""Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@ -192,7 +198,8 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
eos_token_id = tokenizer.eos_token_id
hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
token_id
for idx, token_id in enumerate(output_ids)
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
]
@ -205,46 +212,40 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
####### Post-processors for HF outputs
def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
if output_str.endswith("<end▁of▁sentence>"):
output_str = output_str.split("<end▁of▁sentence>")[0]
return output_ids, output_str, out_logprobs
def idefics3_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def idefics3_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
if output_str.endswith("<end_of_utterance>"):
output_str = output_str.split("<end_of_utterance>")[0]
return output_ids, output_str, out_logprobs
def smolvlm_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def smolvlm_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
# Based on Idefics3
return idefics3_trunc_hf_output(hf_output, model)
def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def minicpmv_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
if output_str.endswith("<|eot_id|>"):
output_str = output_str.split("<|eot_id|>")[0]
return output_ids, output_str, out_logprobs
def minimax_vl_01_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def minimax_vl_01_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
if output_str.endswith("<end_of_sentence>"):
output_str = output_str.split("<end_of_sentence>")[0]
return output_ids, output_str, out_logprobs
def ultravox_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def ultravox_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
tokenizer = AutoTokenizer.from_pretrained(model)
@ -262,8 +263,8 @@ def get_llava_embeddings(image_assets: ImageTestAssets):
####### Prompt path encoders for models that need models on disk
def qwen_prompt_path_encoder(
tmp_path: PosixPath, prompt: str,
assets: Union[list[ImageAsset], ImageTestAssets]) -> str:
tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset], ImageTestAssets]
) -> str:
"""Given a temporary dir path, export one or more image assets into the
tempdir & replace the image references in the prompt with their local paths so that
the HF version of Qwen-VL can resolve the path and load the image in its
@ -313,8 +314,9 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return BatchFeature(data=inputs, tensor_type="pt")
hf_model.processor = processor
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language.model.embed_tokens
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.language.model.embed_tokens
)
return hf_model
@ -357,11 +359,10 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
assert len(contents) == len(images)
return hf_processor.apply_chat_template(
[{
"role": "user",
"image": image,
"content": content
} for image, content in zip(images, contents)],
[
{"role": "user", "image": image, "content": content}
for image, content in zip(images, contents)
],
add_generation_prompt=True,
tokenize=True,
return_dict=True,
@ -369,8 +370,9 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
)
hf_model.processor = processor
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.transformer.output_layer
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.transformer.output_layer
)
return hf_model
@ -387,10 +389,9 @@ def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
else:
video_metadata = None
return hf_processor(*args,
videos=videos,
video_metadata=video_metadata,
**kwargs)
return hf_processor(
*args, videos=videos, video_metadata=video_metadata, **kwargs
)
hf_model.processor = processor
return hf_model
@ -406,8 +407,9 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.num_image_token = hf_runner.model.num_image_token
self.tokenizer = hf_runner.tokenizer
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
trust_remote_code=True)
self.config = AutoConfig.from_pretrained(
hf_runner.model_name, trust_remote_code=True
)
self.vision_config = self.config.vision_config
self.use_thumbnail = self.config.use_thumbnail
self.use_msac = self.config.use_msac
@ -415,11 +417,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.max_num = self.config.max_dynamic_patch
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Union[Image, list[Image]],
**kwargs):
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
# yapf: disable
from vllm.model_executor.models.h2ovl import (
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl)
IMG_CONTEXT,
IMG_END,
IMG_START,
image_to_pixel_values_h2ovl,
)
# yapf: enable
images = [images] if isinstance(images, Image) else images
@ -431,29 +436,26 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
max_num=self.max_num,
use_thumbnail=self.use_thumbnail,
use_msac=self.use_msac,
) for image in images
]
num_patches_list = [
pixel_value.shape[0] for pixel_value in pixel_values
)
for image in images
]
num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
pixel_values = torch.cat(pixel_values, dim=0)
for num_patches in num_patches_list:
context_tokens = IMG_CONTEXT * self.num_image_token \
* num_patches
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
image_tokens = IMG_START + context_tokens + IMG_END
text = text.replace('<image>', image_tokens, 1)
text = text.replace("<image>", image_tokens, 1)
prompt = self.tokenizer(text, return_tensors="pt")
prompt.update({"pixel_values": pixel_values})
return prompt
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
"<IMG_CONTEXT>")
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
hf_model.model.img_context_token_id = img_context_token_id
hf_model.processor = H2OVLProcessor(hf_model)
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language_model.get_output_embeddings()
hf_model.model.generate = types.MethodType(_internvl_generate,
hf_model.model)
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.language_model.get_output_embeddings()
)
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
return hf_model
@ -467,19 +469,23 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.num_image_token = hf_runner.model.num_image_token
self.tokenizer = hf_runner.tokenizer
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
trust_remote_code=True)
self.config = AutoConfig.from_pretrained(
hf_runner.model_name, trust_remote_code=True
)
self.vision_config = self.config.vision_config
self.use_thumbnail = self.config.use_thumbnail
self.min_num = self.config.min_dynamic_patch
self.max_num = self.config.max_dynamic_patch
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Union[Image, list[Image]],
**kwargs):
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
from vllm.model_executor.models.skyworkr1v import (
IMG_CONTEXT, IMG_END, IMG_START,
image_to_pixel_values_skyworkr1v)
IMG_CONTEXT,
IMG_END,
IMG_START,
image_to_pixel_values_skyworkr1v,
)
images = [images] if isinstance(images, Image) else images
pixel_values = [
image_to_pixel_values_skyworkr1v(
@ -488,29 +494,26 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
min_num=self.min_num,
max_num=self.max_num,
use_thumbnail=self.use_thumbnail,
) for image in images
]
num_patches_list = [
pixel_value.shape[0] for pixel_value in pixel_values
)
for image in images
]
num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
pixel_values = torch.cat(pixel_values, dim=0)
for num_patches in num_patches_list:
context_tokens = IMG_CONTEXT * self.num_image_token \
* num_patches
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
image_tokens = IMG_START + context_tokens + IMG_END
text = text.replace('<image>', image_tokens, 1)
text = text.replace("<image>", image_tokens, 1)
prompt = self.tokenizer(text, return_tensors="pt")
prompt.update({"pixel_values": pixel_values})
return prompt
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
"<IMG_CONTEXT>")
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
hf_model.model.img_context_token_id = img_context_token_id
hf_model.processor = SkyworkR1VProcessor(hf_model)
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language_model.get_output_embeddings()
hf_model.model.generate = types.MethodType(_internvl_generate,
hf_model.model)
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.language_model.get_output_embeddings()
)
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
return hf_model
@ -524,8 +527,9 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.num_image_token = hf_runner.model.num_image_token
self.tokenizer = hf_runner.tokenizer
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
trust_remote_code=True)
self.config = AutoConfig.from_pretrained(
hf_runner.model_name, trust_remote_code=True
)
self.vision_config = self.config.vision_config
self.use_thumbnail = self.config.use_thumbnail
self.min_num = self.config.min_dynamic_patch
@ -540,8 +544,13 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
**kwargs,
):
from vllm.model_executor.models.internvl import (
IMG_CONTEXT, IMG_END, IMG_START,
image_to_pixel_values_internvl, video_to_pixel_values_internvl)
IMG_CONTEXT,
IMG_END,
IMG_START,
image_to_pixel_values_internvl,
video_to_pixel_values_internvl,
)
images = [images] if isinstance(images, Image) else images
videos = [videos] if isinstance(videos, np.ndarray) else videos
if images is not None:
@ -552,7 +561,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
min_num=self.min_num,
max_num=self.max_num,
use_thumbnail=self.use_thumbnail,
) for image in images
)
for image in images
]
num_patches_images = [
pixel_value.shape[0] for pixel_value in pixel_values_images
@ -568,7 +578,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
min_num=1,
max_num=1,
use_thumbnail=False,
) for video in videos
)
for video in videos
]
num_patches_videos = [
pixel_value.shape[0] for pixel_value in pixel_values_videos
@ -580,38 +591,37 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
while ("<image>" in text) or ("<video>" in text):
image_index = text.find("<image>")
video_index = text.find("<video>")
if image_index == -1 or (video_index > -1
and video_index < image_index):
if image_index == -1 or (
video_index > -1 and video_index < image_index
):
num_patches = num_patches_videos.pop(0)
pixel_values.append(pixel_values_videos.pop(0))
context_tokens = IMG_START + \
IMG_CONTEXT * self.num_image_token + IMG_END
video_tokens = ''.join([
f'Frame{i+1}: {context_tokens}'
for i in range(num_patches)
])
text = text.replace('<video>', video_tokens, 1)
context_tokens = (
IMG_START + IMG_CONTEXT * self.num_image_token + IMG_END
)
video_tokens = "".join(
[f"Frame{i + 1}: {context_tokens}" for i in range(num_patches)]
)
text = text.replace("<video>", video_tokens, 1)
else:
num_patches = num_patches_images.pop(0)
pixel_values.append(pixel_values_images.pop(0))
context_tokens = IMG_CONTEXT * self.num_image_token \
* num_patches
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
image_tokens = IMG_START + context_tokens + IMG_END
text = text.replace('<image>', image_tokens, 1)
text = text.replace("<image>", image_tokens, 1)
pixel_values = torch.cat(pixel_values, dim=0)
prompt = self.tokenizer(text, return_tensors="pt")
prompt.update({"pixel_values": pixel_values})
return prompt
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
"<IMG_CONTEXT>")
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
hf_model.model.img_context_token_id = img_context_token_id
hf_model.processor = InternVLProcessor(hf_model)
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language_model.get_output_embeddings()
hf_model.model.generate = types.MethodType(_internvl_generate,
hf_model.model)
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.language_model.get_output_embeddings()
)
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
return hf_model
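The `<image>` expansion these patched processors perform is plain string arithmetic; a minimal standalone sketch, with made-up token strings and counts (the real values come from the model config):

IMG_START, IMG_CONTEXT, IMG_END = "<img>", "<IMG_CONTEXT>", "</img>"
num_image_token, num_patches = 4, 2  # illustrative: tokens per patch, patches

context_tokens = IMG_CONTEXT * num_image_token * num_patches
image_tokens = IMG_START + context_tokens + IMG_END
text = "Describe <image> briefly.".replace("<image>", image_tokens, 1)
assert text.count(IMG_CONTEXT) == num_image_token * num_patches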
@ -631,7 +641,7 @@ def _internvl_generate(
input_embeds = input_embeds.reshape(B * N, C)
input_ids = input_ids.reshape(B * N)
selected = (input_ids == self.img_context_token_id)
selected = input_ids == self.img_context_token_id
assert selected.sum() != 0
input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
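The masked scatter in `_internvl_generate` can be reproduced in isolation; a sketch with dummy shapes (all sizes and ids here are invented):

import torch

B, N, C = 1, 6, 8                      # batch, sequence length, hidden size
img_context_token_id = 7               # hypothetical placeholder id
input_embeds = torch.zeros(B * N, C)
input_ids = torch.tensor([0, 7, 7, 0, 0, 0])
vit_embeds = torch.ones(2, C)          # one vision embedding per placeholder

selected = input_ids == img_context_token_id
input_embeds[selected] = vit_embeds.reshape(-1, C)
assert input_embeds.sum().item() == 2 * C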
@ -778,8 +788,9 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for Ovis2."""
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.llm.get_output_embeddings()
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.llm.get_output_embeddings()
)
def processor(*args, text="", images=None, **kwargs):
text_tokenizer = hf_model.model.get_text_tokenizer()
@ -787,8 +798,7 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
prompt_start_and_end = {
"qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
"llama":
("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
"llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
"gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
}
for start, end in prompt_start_and_end.values():
@ -797,7 +807,8 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
break
prompt, input_ids, pixel_values = hf_model.model.preprocess_inputs(
text_or_conversations=text, images=images)
text_or_conversations=text, images=images
)
attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)
inputs = {
@ -813,8 +824,9 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for Ovis2."""
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.llm.get_output_embeddings()
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.llm.get_output_embeddings()
)
def processor(*args, text="", images=None, videos=None, **kwargs):
if images is None:
@ -825,13 +837,11 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
videos = []
else:
videos = [videos] if isinstance(videos, np.ndarray) else videos
videos = [[PIL.Image.fromarray(frame) for frame in vid]
for vid in videos]
videos = [[PIL.Image.fromarray(frame) for frame in vid] for vid in videos]
prompt_start_and_end = {
"qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
"llama":
("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
"llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
"gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
}
for start, end in prompt_start_and_end.values():
@ -842,21 +852,20 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
images_message = [{"type": "image", "image": img} for img in images]
videos_message = [{"type": "video", "video": vid} for vid in videos]
messages = [{
"role":
"user",
"content": [
*images_message,
*videos_message,
{
"type": "text",
"text": text
},
],
}]
messages = [
{
"role": "user",
"content": [
*images_message,
*videos_message,
{"type": "text", "text": text},
],
}
]
input_ids, pixel_values, grid_thws = hf_model.model.preprocess_inputs(
messages=messages, enable_thinking=True)
messages=messages, enable_thinking=True
)
inputs = {
"inputs": input_ids,
"pixel_values": pixel_values,

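Both Ovis patches strip the chat-template role markers before re-encoding; a self-contained sketch of that loop, using the qwen2 marker pair from the table above:

prompt_start_and_end = {
    "qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
}
text = "<|im_start|>user\nDescribe the image.<|im_end|>\n"
for start, end in prompt_start_and_end.values():
    if text.startswith(start) and text.endswith(end):
        text = text[len(start):-len(end)]
        break
assert text == "Describe the image."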
View File

@ -3,23 +3,34 @@
"""Entrypoints for wrapping the core run_test implementation for specific test
types / modalities.
"""
from pathlib import PosixPath
from .....conftest import (AudioTestAssets, HfRunner, ImageTestAssets,
VideoTestAssets, VllmRunner)
from .....conftest import (
AudioTestAssets,
HfRunner,
ImageTestAssets,
VideoTestAssets,
VllmRunner,
)
from . import builders, core
from .types import ExpandableVLMTestArgs, VLMTestInfo
####### Entrypoints for running different test types
def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets):
def run_single_image_test(
*,
tmp_path: PosixPath,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
assert test_case.size_wrapper is not None
inputs = builders.build_single_image_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper, tmp_path)
model_test_info, image_assets, test_case.size_wrapper, tmp_path
)
core.run_test(
hf_runner=hf_runner,
@ -31,17 +42,23 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": 1},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets):
def run_multi_image_test(
*,
tmp_path: PosixPath,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
assert test_case.size_wrapper is not None
inputs = builders.build_multi_image_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper, tmp_path)
model_test_info, image_assets, test_case.size_wrapper, tmp_path
)
core.run_test(
hf_runner=hf_runner,
@ -53,17 +70,22 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": len(image_assets)},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_embedding_test(*, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets):
def run_embedding_test(
*,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
assert test_case.size_wrapper is not None
inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper)
model_test_info, image_assets, test_case.size_wrapper
)
core.run_test(
hf_runner=hf_runner,
@ -76,7 +98,8 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
limit_mm_per_prompt={"image": 1},
vllm_embeddings=vllm_embeddings,
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_video_test(
@ -90,8 +113,11 @@ def run_video_test(
assert test_case.size_wrapper is not None
assert test_case.num_video_frames is not None
inputs = builders.build_video_inputs_from_test_info(
model_test_info, video_assets, test_case.size_wrapper,
test_case.num_video_frames)
model_test_info,
video_assets,
test_case.size_wrapper,
test_case.num_video_frames,
)
core.run_test(
hf_runner=hf_runner,
@ -103,7 +129,8 @@ def run_video_test(
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"video": len(video_assets)},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_audio_test(
@ -114,8 +141,7 @@ def run_audio_test(
vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets,
):
inputs = builders.build_audio_inputs_from_test_info(
model_test_info, audio_assets)
inputs = builders.build_audio_inputs_from_test_info(model_test_info, audio_assets)
core.run_test(
hf_runner=hf_runner,
@ -127,13 +153,17 @@ def run_audio_test(
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"audio": 1},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner]):
def run_custom_inputs_test(
*,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
):
# Custom test cases can provide inputs directly, but they need to
# explicitly provide a CustomTestConfig, which wraps the inputs and
# the limit_mm_per_prompt
@ -155,4 +185,5 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt=limit_mm_per_prompt,
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Types for writing multimodal model tests."""
from collections.abc import Iterable
from enum import Enum
from pathlib import PosixPath
@ -15,9 +16,16 @@ from vllm.config import RunnerOption
from vllm.logprobs import SampleLogprobs
from vllm.transformers_utils.tokenizer import AnyTokenizer
from .....conftest import (AUDIO_ASSETS, IMAGE_ASSETS, HfRunner, ImageAsset,
ImageTestAssets, PromptAudioInput, PromptImageInput,
PromptVideoInput)
from .....conftest import (
AUDIO_ASSETS,
IMAGE_ASSETS,
HfRunner,
ImageAsset,
ImageTestAssets,
PromptAudioInput,
PromptImageInput,
PromptVideoInput,
)
from ....utils import check_logprobs_close
# meta image tag; will be replaced by the appropriate tag for the model
@ -47,6 +55,7 @@ RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
class PromptWithMultiModalInput(NamedTuple):
"""Holds the multimodal input for a single test case."""
prompts: list[str]
image_data: Optional[PromptImageInput] = None
video_data: Optional[PromptVideoInput] = None
@ -100,8 +109,9 @@ class VLMTestInfo(NamedTuple):
# Function for converting ImageAssets to image embeddings;
# We need to define this explicitly for embedding tests
convert_assets_to_embeddings: Optional[Callable[[ImageTestAssets],
list[torch.Tensor]]] = None
convert_assets_to_embeddings: Optional[
Callable[[ImageTestAssets], list[torch.Tensor]]
] = None
# Exposed options for vLLM runner; we change these in several tests,
# but the defaults are derived from VllmRunner & the engine defaults
@ -156,8 +166,8 @@ class VLMTestInfo(NamedTuple):
# for Qwen-VL, which requires encoding the image path / url into the prompt
# for HF runner
prompt_path_encoder: Optional[
Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]],
str]] = None # noqa: E501
Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]], str]
] = None # noqa: E501
# Allows configuring a test to run with custom inputs
custom_test_opts: Optional[list[CustomTestOptions]] = None
@ -190,6 +200,7 @@ class VLMTestInfo(NamedTuple):
class ExpandableVLMTestArgs(NamedTuple):
"""The expanded kwargs which correspond to a single test case."""
model: str
max_tokens: int
num_logprobs: int

View File

@ -12,10 +12,12 @@ HF_TEXT_PROMPTS = [
"a photo of a cherry blossom",
]
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign": "",
"cherry_blossom": "",
})
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "",
"cherry_blossom": "",
}
)
MODELS = ["openai/clip-vit-base-patch32"]
@ -33,11 +35,9 @@ def _run_test(
# vLLM needs a fresh process without CUDA initialization.
# If we run HF first, CUDA will already be initialized, which breaks the
# fork-based multiprocessing backend (the default method).
with vllm_runner(model,
runner="pooling",
dtype=dtype,
enforce_eager=True,
max_model_len=77) as vllm_model:
with vllm_runner(
model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=77
) as vllm_model:
vllm_outputs = vllm_model.embed(input_texts, images=input_images)
with hf_runner(model, dtype=dtype, auto_cls=CLIPModel) as hf_model:
@ -48,10 +48,12 @@ def _run_test(
if "pixel_values" in inputs:
inputs.pop("input_ids")
pooled_output = hf_model.model.get_image_features(
**hf_model.wrap_device(inputs)).squeeze(0)
**hf_model.wrap_device(inputs)
).squeeze(0)
else:
pooled_output = hf_model.model.get_text_features(
**hf_model.wrap_device(inputs)).squeeze(0)
**hf_model.wrap_device(inputs)
).squeeze(0)
all_outputs.append(pooled_output.tolist())
@ -98,8 +100,7 @@ def test_models_image(
dtype: str,
) -> None:
input_texts_images = [
(text, asset.pil_image)
for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
(text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]
@ -125,11 +126,9 @@ def test_models_text_image_no_crash(
texts = [HF_TEXT_PROMPTS[0]]
images = [image_assets[0].pil_image]
with vllm_runner(model,
runner="pooling",
dtype=dtype,
enforce_eager=True,
max_model_len=77) as vllm_model:
with vllm_runner(
model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=77
) as vllm_model:
with pytest.raises(ValueError, match="not both"):
vllm_model.embed(texts, images=images)

View File

@ -17,18 +17,21 @@ HF_TEXT_PROMPTS = [
# T -> X
(
"Query: Find me an everyday image that matches the given caption: The label of the object is stop sign", # noqa: E501,
Image.new("RGB", (56, 56))),
Image.new("RGB", (56, 56)),
),
# T -> X
("Query: Retrieve an image of this caption: cherry blossom",
Image.new("RGB", (56, 56))),
(
"Query: Retrieve an image of this caption: cherry blossom",
Image.new("RGB", (56, 56)),
),
]
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"What is shown in this image?",
"cherry_blossom":
"What is shown in this image?"
})
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "What is shown in this image?",
"cherry_blossom": "What is shown in this image?",
}
)
MODELS = ["MrLight/dse-qwen2-2b-mrl-v1"]
@ -36,34 +39,30 @@ MODELS = ["MrLight/dse-qwen2-2b-mrl-v1"]
def get_messages(image: Image.Image, text: str, embed_text: bool):
# NOTE: remember to wrap the messages in an outer list, as required.
if embed_text:
messages = [{
"role":
"user",
"content": [
{
"type": "image",
"image": Image.new("RGB", (56, 56)),
"resized_height": 1,
"resized_width": 1
}, # a dummy image is needed here to simplify processing.
{
"type": "text",
"text": text
},
]
}]
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": Image.new("RGB", (56, 56)),
"resized_height": 1,
"resized_width": 1,
}, # a dummy image is needed here to simplify processing.
{"type": "text", "text": text},
],
}
]
else:
messages = [{
"role":
"user",
"content": [{
"type": "image",
"image": image
}, {
"type": "text",
"text": text
}]
}]
messages = [
{
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": text},
],
}
]
return messages
@ -71,8 +70,10 @@ def apply_chat_template_and_add_eos(
messages: list[dict],
apply_chat_template_fn: Callable,
):
prompt = apply_chat_template_fn(
messages, tokenize=False, add_generation_prompt=True) + "<|endoftext|>"
prompt = (
apply_chat_template_fn(messages, tokenize=False, add_generation_prompt=True)
+ "<|endoftext|>"
)
return prompt
@ -86,16 +87,14 @@ def _run_test(
*,
dtype: str,
) -> None:
'''SET PYTHONPATH'''
"""SET PYTHONPATH"""
# NOTE: take care of the order: run vLLM first, then HF.
# vLLM needs a fresh process without CUDA initialization.
# If we run HF first, CUDA will already be initialized, which breaks the
# fork-based multiprocessing backend (the default method).
with vllm_runner(model,
runner="pooling",
dtype=dtype,
enforce_eager=True,
max_model_len=8192) as vllm_model:
with vllm_runner(
model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=8192
) as vllm_model:
tokenizer = vllm_model.llm.get_tokenizer()
texts = [
# this is necessary because vllm_model.embed will not apply any
@ -105,25 +104,25 @@ def _run_test(
apply_chat_template_and_add_eos(
get_messages(image, text, False),
apply_chat_template_fn=tokenizer.apply_chat_template,
) for text, image in zip(input_texts, input_images)
)
for text, image in zip(input_texts, input_images)
# vLLM will later replace the pad token with the actual image
# (which may be a placeholder image).
]
vllm_outputs = vllm_model.embed(texts, images=input_images)
hf_outputs = []
with hf_runner(model,
dtype=dtype,
auto_cls=Qwen2VLForConditionalGeneration) as hf_model:
with hf_runner(
model, dtype=dtype, auto_cls=Qwen2VLForConditionalGeneration
) as hf_model:
prompts = []
for text, image, embed_text in zip(input_texts, input_images,
embed_texts):
for text, image, embed_text in zip(input_texts, input_images, embed_texts):
# dse requires non-standard input processing
# because it needs an image_pad token
messages = get_messages(image, text, embed_text)
prompt = apply_chat_template_and_add_eos(
messages, hf_model.processor.apply_chat_template)
messages, hf_model.processor.apply_chat_template
)
prompts.append(prompt)
@ -145,9 +144,9 @@ def _run_test(
return_dict=True,
output_hidden_states=True,
)
pooled_output = F.normalize(outputs.hidden_states[-1][0, -1],
p=2,
dim=-1)
pooled_output = F.normalize(
outputs.hidden_states[-1][0, -1], p=2, dim=-1
)
all_outputs.append(pooled_output.tolist())
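The pooling on the HF side is last-token selection followed by L2 normalisation; a minimal sketch with a random tensor standing in for real hidden states:

import torch
import torch.nn.functional as F

hidden_states = torch.randn(1, 10, 16)  # (batch, seq_len, hidden), dummy values
pooled = F.normalize(hidden_states[0, -1], p=2, dim=-1)
assert torch.allclose(pooled.norm(), torch.tensor(1.0), atol=1e-5)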
@ -170,8 +169,9 @@ def test_models_text(
model: str,
dtype: str,
) -> None:
input_texts_images = [(text, image_placeholder)
for text, image_placeholder in HF_TEXT_PROMPTS]
input_texts_images = [
(text, image_placeholder) for text, image_placeholder in HF_TEXT_PROMPTS
]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]
embed_texts = [True] * len(input_texts)
@ -198,8 +198,7 @@ def test_models_image(
dtype: str,
) -> None:
input_texts_images = [
(text, asset.pil_image)
for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
(text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]

View File

@ -29,7 +29,7 @@ def run_intern_vit_test(
img_processor = CLIPImageProcessor.from_pretrained(model)
images = [asset.pil_image for asset in image_assets]
pixel_values = [
img_processor(images, return_tensors='pt').pixel_values.to(torch_dtype)
img_processor(images, return_tensors="pt").pixel_values.to(torch_dtype)
for images in images
]
@ -37,15 +37,16 @@ def run_intern_vit_test(
if not getattr(config, "norm_type", None):
config.norm_type = "rms_norm"
hf_model = AutoModel.from_pretrained(model,
torch_dtype=torch_dtype,
trust_remote_code=True).to("cuda")
hf_model = AutoModel.from_pretrained(
model, torch_dtype=torch_dtype, trust_remote_code=True
).to("cuda")
hf_outputs_per_image = [
hf_model(pixel_value.to("cuda")).last_hidden_state
for pixel_value in pixel_values
]
from vllm.model_executor.models.intern_vit import InternVisionModel
vllm_model = InternVisionModel(config)
vllm_model.load_weights(hf_model.state_dict().items())
@ -54,22 +55,23 @@ def run_intern_vit_test(
vllm_model = vllm_model.to("cuda", torch_dtype)
vllm_outputs_per_image = [
vllm_model(pixel_values=pixel_value.to("cuda"))
for pixel_value in pixel_values
vllm_model(pixel_values=pixel_value.to("cuda")) for pixel_value in pixel_values
]
del vllm_model
cleanup_dist_env_and_memory()
cos_similar = nn.CosineSimilarity(dim=-1)
for vllm_output, hf_output in zip(vllm_outputs_per_image,
hf_outputs_per_image):
for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image):
assert cos_similar(vllm_output, hf_output).mean() > 0.99
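As a sanity check on the metric itself: `nn.CosineSimilarity(dim=-1)` yields 1.0 for identical inputs, which is what the 0.99 threshold above is probing. A tiny sketch:

import torch
import torch.nn as nn

cos = nn.CosineSimilarity(dim=-1)
x = torch.randn(2, 4, 8)
assert torch.allclose(cos(x, x), torch.ones(2, 4), atol=1e-6)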
@pytest.mark.parametrize("model_id", [
"OpenGVLab/InternViT-300M-448px",
"OpenGVLab/InternViT-6B-448px-V1-5",
])
@pytest.mark.parametrize(
"model_id",
[
"OpenGVLab/InternViT-300M-448px",
"OpenGVLab/InternViT-6B-448px-V1-5",
],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_models(dist_init, image_assets, model_id, dtype: str) -> None:
run_intern_vit_test(

View File

@ -29,7 +29,6 @@ def vllm_reranker(
query_type: str = "text",
doc_type: str = "text",
):
def create_image_param(url: str) -> ChatCompletionContentPartImageParam:
return {"type": "image_url", "image_url": {"url": f"{url}"}}
@ -38,23 +37,25 @@ def vllm_reranker(
query = query_strs
elif query_type == "image":
query = ScoreMultiModalParam(
content=[create_image_param(url) for url in query_strs])
content=[create_image_param(url) for url in query_strs]
)
documents: Union[list[str], ScoreMultiModalParam]
if doc_type == "text":
documents = document_strs
elif doc_type == "image":
documents = ScoreMultiModalParam(
content=[create_image_param(url) for url in document_strs])
content=[create_image_param(url) for url in document_strs]
)
with vllm_runner(
model_name,
runner="pooling",
dtype=dtype,
max_num_seqs=2,
max_model_len=2048,
mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt=limit_mm_per_prompt,
model_name,
runner="pooling",
dtype=dtype,
max_num_seqs=2,
max_model_len=2048,
mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt=limit_mm_per_prompt,
) as vllm_model:
outputs = vllm_model.llm.score(query, documents)
@ -78,16 +79,15 @@ def hf_reranker(
data_pairs = [[query_strs[0], d] for d in document_strs]
with hf_runner(
model_name,
dtype=dtype,
trust_remote_code=True,
auto_cls=AutoModel,
model_kwargs={"key_mapping": checkpoint_to_hf_mapper},
model_name,
dtype=dtype,
trust_remote_code=True,
auto_cls=AutoModel,
model_kwargs={"key_mapping": checkpoint_to_hf_mapper},
) as hf_model:
return hf_model.model.compute_score(data_pairs,
max_length=2048,
query_type=query_type,
doc_type=doc_type)
return hf_model.model.compute_score(
data_pairs, max_length=2048, query_type=query_type, doc_type=doc_type
)
# Visual Documents Reranking
@ -100,10 +100,12 @@ def test_model_text_image(hf_runner, vllm_runner, model_name, dtype):
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
]
hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
"text", "image")
vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
documents, "text", "image")
hf_outputs = hf_reranker(
hf_runner, model_name, dtype, query, documents, "text", "image"
)
vllm_outputs = vllm_reranker(
vllm_runner, model_name, dtype, query, documents, "text", "image"
)
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
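`pytest.approx(..., rel=0.02)` accepts up to a 2% relative difference from the expected value; illustrated on dummy scores:

import pytest

assert 1.00 == pytest.approx(1.01, rel=0.02)        # 1% off: accepted
assert not (1.00 == pytest.approx(1.05, rel=0.02))  # 5% off: rejected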
@ -127,10 +129,12 @@ def test_model_text_text(hf_runner, vllm_runner, model_name, dtype):
lower computational requirements.""", # noqa: E501
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
]
hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
"text", "text")
vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
documents, "text", "text")
hf_outputs = hf_reranker(
hf_runner, model_name, dtype, query, documents, "text", "text"
)
vllm_outputs = vllm_reranker(
vllm_runner, model_name, dtype, query, documents, "text", "text"
)
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
@ -157,10 +161,12 @@ def test_model_image_text(hf_runner, vllm_runner, model_name, dtype):
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
]
hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
"image", "text")
vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
documents, "image", "text")
hf_outputs = hf_reranker(
hf_runner, model_name, dtype, query, documents, "image", "text"
)
vllm_outputs = vllm_reranker(
vllm_runner, model_name, dtype, query, documents, "image", "text"
)
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
@ -178,10 +184,12 @@ def test_model_image_image(hf_runner, vllm_runner, model_name, dtype):
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
]
hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
"image", "image")
vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
documents, "image", "image")
hf_outputs = hf_reranker(
hf_runner, model_name, dtype, query, documents, "image", "image"
)
vllm_outputs = vllm_reranker(
vllm_runner, model_name, dtype, query, documents, "image", "image"
)
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)

View File

@ -24,9 +24,10 @@ from ...utils import check_embeddings_close
# built with LAPACK support.
pytestmark = pytest.mark.skipif(
not current_platform.is_cuda(),
reason="Llava Next model uses op that is only supported in CUDA")
reason="Llava Next model uses op that is only supported in CUDA",
)
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501
llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501
HF_TEXT_PROMPTS = [
# T -> X
@ -34,18 +35,21 @@ HF_TEXT_PROMPTS = [
"The label of the object is stop sign\nSummary above sentence in one word: " # noqa: E501
),
# T -> X
llama3_template.format(
"cherry blossom\nSummary above sentence in one word: "),
llama3_template.format("cherry blossom\nSummary above sentence in one word: "),
]
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
# I -> X
"stop_sign":
llama3_template.format("<image>\nSummary above image in one word: "),
# I -> X
"cherry_blossom":
llama3_template.format("<image>\nSummary above image in one word: "),
})
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
# I -> X
"stop_sign": llama3_template.format(
"<image>\nSummary above image in one word: "
),
# I -> X
"cherry_blossom": llama3_template.format(
"<image>\nSummary above image in one word: "
),
}
)
MODELS = ["royokong/e5-v"]
@ -63,23 +67,22 @@ def _run_test(
# vLLM needs a fresh process without CUDA initialization.
# If we run HF first, CUDA will already be initialized, which breaks the
# fork-based multiprocessing backend (the default method).
with vllm_runner(model,
runner="pooling",
dtype=dtype,
max_model_len=4096,
enforce_eager=True) as vllm_model:
with vllm_runner(
model, runner="pooling", dtype=dtype, max_model_len=4096, enforce_eager=True
) as vllm_model:
vllm_outputs = vllm_model.embed(input_texts, images=input_images)
with hf_runner(model, dtype=dtype,
auto_cls=AutoModelForImageTextToText) as hf_model:
with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForImageTextToText
) as hf_model:
# Patch the issue where generation_config.json is missing
hf_model.processor.patch_size = \
hf_model.model.config.vision_config.patch_size
hf_model.processor.patch_size = hf_model.model.config.vision_config.patch_size
# Patch the issue where image_token_id
# exceeds the maximum allowed vocab size
hf_model.model.resize_token_embeddings(
hf_model.model.language_model.vocab_size + 1)
hf_model.model.language_model.vocab_size + 1
)
all_inputs = hf_model.get_inputs(input_texts, images=input_images)
@ -91,8 +94,7 @@ def _run_test(
return_dict=True,
output_hidden_states=True,
)
pooled_output = F.normalize(outputs.hidden_states[-1][0, -1, :],
dim=-1)
pooled_output = F.normalize(outputs.hidden_states[-1][0, -1, :], dim=-1)
all_outputs.append(pooled_output.tolist())
@ -142,8 +144,7 @@ def test_models_image(
dtype: str,
) -> None:
input_texts_images = [
(text, asset.pil_image)
for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
(text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]

View File

@ -19,14 +19,14 @@ HF_TEXT_PROMPTS = [
"Retrieve an image of this caption: cherry blossom",
]
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
# T + I -> X
"stop_sign":
"<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign", # noqa: E501
# I -> X
"cherry_blossom":
"<|image_1|> Represent the given image for classification", # noqa: E501
})
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
# T + I -> X
"stop_sign": "<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign", # noqa: E501
# I -> X
"cherry_blossom": "<|image_1|> Represent the given image for classification", # noqa: E501
}
)
MODELS = ["TIGER-Lab/VLM2Vec-Full"]
@ -44,14 +44,14 @@ def _run_test(
# vLLM needs a fresh process without CUDA initialization.
# If we run HF first, CUDA will already be initialized, which breaks the
# fork-based multiprocessing backend (the default method).
with vllm_runner(model, runner="pooling", dtype=dtype,
enforce_eager=True) as vllm_model:
with vllm_runner(
model, runner="pooling", dtype=dtype, enforce_eager=True
) as vllm_model:
vllm_outputs = vllm_model.embed(input_texts, images=input_images)
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
hf_model_kwargs = {"_attn_implementation": "eager"}
with hf_runner(model, dtype=dtype,
model_kwargs=hf_model_kwargs) as hf_model:
with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model:
all_inputs = hf_model.get_inputs(input_texts, images=input_images)
all_outputs = []
@ -114,18 +114,21 @@ def test_models_image(
dtype: str,
) -> None:
input_texts_images = [
(text, asset.pil_image)
for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
(text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
]
# add cases for special_tokens
input_texts_images.append((
"\n<s><|user|>\n <|image_1|>\n\t <s>"
"Represent the given image for classification<|end|>"
"\n<|assistant|>\n",
Image.open(
get_vllm_public_assets(filename="cherry_blossom.jpg",
s3_prefix=VLM_IMAGES_DIR)),
))
input_texts_images.append(
(
"\n<s><|user|>\n <|image_1|>\n\t <s>"
"Represent the given image for classification<|end|>"
"\n<|assistant|>\n",
Image.open(
get_vllm_public_assets(
filename="cherry_blossom.jpg", s3_prefix=VLM_IMAGES_DIR
)
),
)
)
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]

View File

@ -19,25 +19,25 @@ def _run_test(
vllm_runner: type[VllmRunner],
model: str,
) -> None:
prompt = [
{
# This model deals with no text input
"prompt_token_ids": [1],
"multi_modal_data": generate_test_mm_data(),
} for _ in range(10)
}
for _ in range(10)
]
with vllm_runner(
model,
runner="pooling",
dtype="half",
enforce_eager=True,
skip_tokenizer_init=True,
# Limit the maximum number of sequences to avoid the
# test going OOM during the warmup run
max_num_seqs=32,
default_torch_num_threads=1,
model,
runner="pooling",
dtype="half",
enforce_eager=True,
skip_tokenizer_init=True,
# Limit the maximum number of sequences to avoid the
# test going OOM during the warmup run
max_num_seqs=32,
default_torch_num_threads=1,
) as vllm_model:
vllm_model.encode(prompt)
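The requests above carry no real text, just a single dummy token id beside the multimodal payload; a sketch of that request shape, with a hypothetical stand-in for `generate_test_mm_data`:

def make_prompts(mm_data: dict, n: int = 10) -> list[dict]:
    # One dummy token id per request; the signal is in multi_modal_data.
    return [{"prompt_token_ids": [1], "multi_modal_data": mm_data} for _ in range(n)]

prompts = make_prompts({"image": "tensor-placeholder"})
assert len(prompts) == 10 and prompts[0]["prompt_token_ids"] == [1]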

View File

@ -34,9 +34,9 @@ def run_radio_test(
# Using `self.get_nearest_supported_resolution`, for assets 432x642 the
# nearest supported resolution is 432x640.
pixel_values = [
img_processor(
image,
return_tensors='pt').pixel_values.to(torch_dtype)[:, :, :, :640]
img_processor(image, return_tensors="pt").pixel_values.to(torch_dtype)[
:, :, :, :640
]
for image in images
]
@ -51,32 +51,33 @@ def run_radio_test(
hf_model.eval()
hf_outputs_per_image = [
hf_model(pixel_value.to("cuda")).features
for pixel_value in pixel_values
hf_model(pixel_value.to("cuda")).features for pixel_value in pixel_values
]
radio_config = RadioConfig(model_name=config.args["model"],
reg_tokens=config.args["register_multiple"])
radio_config = RadioConfig(
model_name=config.args["model"], reg_tokens=config.args["register_multiple"]
)
vllm_model = RadioModel(radio_config)
vllm_model.load_weights(hf_model.state_dict())
vllm_model = vllm_model.to("cuda", torch_dtype)
vllm_outputs_per_image = [
vllm_model(pixel_values=pixel_value.to("cuda"))
for pixel_value in pixel_values
vllm_model(pixel_values=pixel_value.to("cuda")) for pixel_value in pixel_values
]
del vllm_model, hf_model
cleanup_dist_env_and_memory()
cos_similar = nn.CosineSimilarity(dim=-1)
for vllm_output, hf_output in zip(vllm_outputs_per_image,
hf_outputs_per_image):
for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image):
assert cos_similar(vllm_output, hf_output).mean() > 0.99
@pytest.mark.parametrize("model_id", [
"nvidia/C-RADIOv2-H",
])
@pytest.mark.parametrize(
"model_id",
[
"nvidia/C-RADIOv2-H",
],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_radio(dist_init, image_assets, model_id, dtype: str) -> None:
run_radio_test(

View File

@ -6,22 +6,27 @@ from typing import Optional, Union
import numpy as np
import pytest
from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
UserMessage)
from mistral_common.protocol.instruct.messages import ImageChunk, TextChunk, UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from PIL import Image
from vllm.config import ModelConfig
from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions,
ImageDummyOptions, VideoDummyOptions)
from vllm.config.multimodal import (
AudioDummyOptions,
BaseDummyOptions,
ImageDummyOptions,
VideoDummyOptions,
)
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
from vllm.multimodal.inputs import MultiModalInputs
from vllm.multimodal.processing import (BaseMultiModalProcessor,
InputProcessingContext)
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
cached_tokenizer_from_config,
encode_tokens)
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.transformers_utils.tokenizer import (
AnyTokenizer,
MistralTokenizer,
cached_tokenizer_from_config,
encode_tokens,
)
from ....multimodal.utils import random_audio, random_image, random_video
from ...registry import HF_EXAMPLE_MODELS
@ -36,14 +41,17 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
# GLM4.1V doesn't support multiple videos
video = mm_data["video"]
num_frames = len(video)
mm_data["video"] = (video, {
"total_num_frames": num_frames,
"fps": num_frames,
"duration": 1,
"frames_indices": [i for i in range(num_frames)],
"video_backend": "opencv",
"do_sample_frames": True,
})
mm_data["video"] = (
video,
{
"total_num_frames": num_frames,
"fps": num_frames,
"duration": 1,
"frames_indices": [i for i in range(num_frames)],
"video_backend": "opencv",
"do_sample_frames": True,
},
)
return mm_data
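The attached metadata is internally consistent by construction: with `fps` equal to the frame count and `duration` equal to 1, the product recovers `total_num_frames`. A standalone sketch:

import numpy as np

video = np.zeros((4, 32, 32, 3), dtype=np.uint8)  # four dummy RGB frames
num_frames = len(video)
metadata = {
    "total_num_frames": num_frames,
    "fps": num_frames,        # a one-second clip by construction
    "duration": 1,
    "frames_indices": list(range(num_frames)),
    "video_backend": "opencv",
    "do_sample_frames": True,
}
assert metadata["fps"] * metadata["duration"] == metadata["total_num_frames"]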
@ -102,7 +110,8 @@ def _test_processing_correctness(
mm_processor_cache_gb=2048,
skip_tokenizer_init=model_info.skip_tokenizer_init,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype)
dtype=model_info.dtype,
)
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
@ -145,27 +154,22 @@ def _test_processing_correctness(
input_to_hit = {
"image": Image.new("RGB", size=(128, 128)),
"video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
"audio": (np.zeros((512, )), 16000),
"audio": (np.zeros((512,)), 16000),
}
input_factory = {
"image":
partial(random_image, rng, min_wh=128, max_wh=256),
"video":
partial(random_video,
rng,
min_frames=2,
max_frames=16,
min_wh=128,
max_wh=256),
"audio":
partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
"image": partial(random_image, rng, min_wh=128, max_wh=256),
"video": partial(
random_video, rng, min_frames=2, max_frames=16, min_wh=128, max_wh=256
),
"audio": partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
}
for batch_idx in range(num_batches):
mm_data = {
k:
[(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
for _ in range(rng.randint(limit + 1))]
k: [
(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
for _ in range(rng.randint(limit + 1))
]
for k, limit in limit_mm_per_prompt_ints.items()
}
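The hit/miss sampling above reuses the fixed `input_to_hit` item with probability `hit_rate` and draws a fresh random input otherwise; a quick check that the scheme tracks its target rate (illustrative value):

import numpy as np

rng = np.random.RandomState(0)
hit_rate = 0.3                        # illustrative; the test parametrises this
draws = [rng.rand() < hit_rate for _ in range(10_000)]
assert abs(sum(draws) / len(draws) - hit_rate) < 0.02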
@ -174,12 +178,16 @@ def _test_processing_correctness(
# Mistral chat outputs tokens directly, rather than text prompts
if isinstance(tokenizer, MistralTokenizer):
images = mm_data.get("image", [])
request = ChatCompletionRequest(messages=[
UserMessage(content=[
TextChunk(text=""),
*(ImageChunk(image=image) for image in images),
]),
])
request = ChatCompletionRequest(
messages=[
UserMessage(
content=[
TextChunk(text=""),
*(ImageChunk(image=image) for image in images),
]
),
]
)
res = tokenizer.mistral.encode_chat_completion(request)
prompt = res.tokens
else:
@ -303,16 +311,14 @@ def _test_processing_correctness_one(
baseline_text_result,
baseline_tokenized_result,
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {text_prompt=}, "
f"{token_prompt=}, {mm_data=})",
msg=f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})",
)
_assert_inputs_equal(
cached_text_result,
cached_tokenized_result,
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {text_prompt=}, "
f"{token_prompt=}, {mm_data=})",
msg=f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})",
)

View File

@ -24,7 +24,8 @@ from ...utils import build_model_context
# post-sampled frames (expected behavior)
(-1, 1, 5),
(-1, 2, 10),
])
],
)
def test_processor_override(
model_id: str,
expected_toks_per_frame: int,
@ -55,10 +56,8 @@ def test_processor_override(
# Ensure we have the right number of placeholders per num_crops size
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token)
video_tok_count = processed_inputs["prompt_token_ids"].count(
video_token_id)
grid_t, _, _ = processed_inputs["mm_kwargs"].get_data(
)["video_grid_thw"][0]
video_tok_count = processed_inputs["prompt_token_ids"].count(video_token_id)
grid_t, _, _ = processed_inputs["mm_kwargs"].get_data()["video_grid_thw"][0]
assert grid_t == expected_grid_t
assert video_tok_count == expected_toks_per_frame * grid_t
@ -71,7 +70,7 @@ def test_video_loader_consistency(
fps: int,
):
"""
Ensure the dynamic video loader (pre-sampled by the loader) and the normal video
loader (post-sampled by the processor) produce the same video processing outputs.
"""
ctx = build_model_context(
@ -91,7 +90,8 @@ def test_video_loader_consistency(
static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
video_bytes, fps=fps)
video_bytes, fps=fps
)
# pre-sampled loader shouldn't read all frames
assert len(dynamic_video) < len(static_video)
@ -99,12 +99,11 @@ def test_video_loader_consistency(
static_mm_data = {"video": [(static_video, static_metadata)]}
dynamic_mm_data = {"video": [(dynamic_video, dynamic_metadata)]}
static_outputs = processor.apply(prompt, static_mm_data,
hf_processor_mm_kwargs)
dynamic_outputs = processor.apply(prompt, dynamic_mm_data,
hf_processor_mm_kwargs)
static_outputs = processor.apply(prompt, static_mm_data, hf_processor_mm_kwargs)
dynamic_outputs = processor.apply(prompt, dynamic_mm_data, hf_processor_mm_kwargs)
assert static_outputs["prompt_token_ids"] == dynamic_outputs[
"prompt_token_ids"]
assert static_outputs["mm_kwargs"].get_data(
) == dynamic_outputs["mm_kwargs"].get_data()
assert static_outputs["prompt_token_ids"] == dynamic_outputs["prompt_token_ids"]
assert (
static_outputs["mm_kwargs"].get_data()
== dynamic_outputs["mm_kwargs"].get_data()
)

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for H2OVL's multimodal preprocessing kwargs."""
from collections.abc import Mapping
from typing import Optional
@ -23,8 +24,10 @@ def _get_expected_num_patches(
min_num: int,
max_num: int,
):
from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
get_h2ovl_target_ratios)
from vllm.model_executor.models.h2ovl import (
calculate_h2ovl_targets,
get_h2ovl_target_ratios,
)
width, height = image.size
@ -101,24 +104,27 @@ def _run_check(
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images)
for image in images
)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"].get_data(
)["pixel_values_flat"].shape
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
assert img_tok_count == 256 * total_expected_num_patches
assert pixel_shape[0] == total_expected_num_patches
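Where the 256 in the assertion comes from, assuming InternVL-style defaults (448-pixel tiles, 14-pixel ViT patches, 0.5 pixel-unshuffle downsampling per axis; treat these numbers as assumptions, not values read from this file):

image_size, patch_size, downsample = 448, 14, 0.5
tokens_per_tile = int((image_size // patch_size) ** 2 * downsample**2)
assert tokens_per_tile == 256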
@pytest.mark.parametrize("model_id", [
"h2oai/h2ovl-mississippi-800m",
"h2oai/h2ovl-mississippi-2b",
])
@pytest.mark.parametrize(
"model_id",
[
"h2oai/h2ovl-mississippi-800m",
"h2oai/h2ovl-mississippi-2b",
],
)
@pytest.mark.parametrize(
"size_factors",
[
@ -165,10 +171,7 @@ def test_processor_override(
_run_check(
processor,
[
rescale_image_size(image_assets[0].pil_image, f)
for f in size_factors
],
[rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
min_num,
max_num,
hf_processor_mm_kwargs,

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for Idefics3's multimodal preprocessing kwargs."""
import pytest
from transformers import Idefics3Config
@ -17,7 +18,8 @@ from ...utils import build_model_context
[
({"size": {"longest_edge": 364}}, 169),
({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)),
])
],
)
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
@ -42,8 +44,11 @@ def test_processor_override(
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
placeholders = "<image>" if num_imgs == 1 else "\n".join(
f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
placeholders = (
"<image>"
if num_imgs == 1
else "\n".join(f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
)
prompt = f"<|begin_of_text|>User:{placeholders}\n<end_of_utterance>\nAssistant:" # noqa: E501
# Build mm_data
@ -57,8 +62,7 @@ def test_processor_override(
# Ensure the placeholders format are correct
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
"input_ids"][0]
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]
# Ensure we have the right number of placeholders per num_crops size
image_token_id = ctx.get_hf_config().image_token_id

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for InternVL's multimodal preprocessing kwargs."""
from collections.abc import Mapping
from typing import Optional
@ -24,7 +25,9 @@ def _get_expected_num_patches(
max_num: int,
):
from vllm.model_executor.models.internvl import (
calculate_internvl_targets, get_internvl_target_ratios)
calculate_internvl_targets,
get_internvl_target_ratios,
)
width, height = image.size
@ -61,15 +64,15 @@ def _run_check(
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images)
for image in images
)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"].get_data(
)["pixel_values_flat"].shape
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
assert img_tok_count == 256 * total_expected_num_patches
assert pixel_shape[0] == total_expected_num_patches
@ -122,10 +125,7 @@ def test_processor_override(
_run_check(
processor,
[
rescale_image_size(image_assets[0].pil_image, f)
for f in size_factors
],
[rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
min_num,
max_num,
hf_processor_mm_kwargs,

View File

@ -11,8 +11,7 @@ from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id",
["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
@pytest.mark.parametrize("model_id", ["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
@pytest.mark.parametrize("mm_processor_kwargs", [{}])
@pytest.mark.parametrize("num_imgs", [1, 5])
@pytest.mark.parametrize("mm_processor_cache_gb", [0, 4])
@ -38,13 +37,14 @@ def test_processor_override(
hf_processor = processor.info.get_hf_processor()
vocab = tokenizer.get_vocab()
prompt = "<|begin_of_text|><|header_start|>user<|header_end|>" \
+ "<|image|>" * num_imgs \
prompt = (
"<|begin_of_text|><|header_start|>user<|header_end|>"
+ "<|image|>" * num_imgs
+ "<|eot|><|header_start|>assistant<|header_end|>"
)
mm_data = {
"image": [
image_assets[(i % len(image_assets))].pil_image
for i in range(num_imgs)
image_assets[(i % len(image_assets))].pil_image for i in range(num_imgs)
]
}
if tokenized_prompt:
@ -64,22 +64,23 @@ def test_processor_override(
if tiles_x * tiles_y > 1:
num_x_separators += (tiles_x - 1) * tiles_y
num_y_separators += tiles_y
assert prompt_token_ids.count(vocab[hf_processor.tile_token]) \
== num_x_separators
assert prompt_token_ids.count(vocab[hf_processor.tile_global_token]) \
== num_y_separators
assert prompt_token_ids.count(vocab[hf_processor.tile_token]) == num_x_separators
assert (
prompt_token_ids.count(vocab[hf_processor.tile_global_token])
== num_y_separators
)
# image token offsets
img_locs = processed_inputs["mm_placeholders"].get("image", [])
assert len(img_locs) == num_imgs
assert [img_loc.offset for img_loc in img_locs] == \
[i for i, v in enumerate(prompt_token_ids) \
if v == config.boi_token_index]
assert [img_loc.offset for img_loc in img_locs] == [
i for i, v in enumerate(prompt_token_ids) if v == config.boi_token_index
]
# patch sizes and masks
num_patches_per_chunk = processor.info.get_patch_per_chunk(
config.vision_config)
assert prompt_token_ids.count(config.image_token_index) \
== sum(mm_data["patches_per_image"]) * num_patches_per_chunk
assert len(mm_data["pixel_values"]) \
== sum(mm_data["patches_per_image"])
num_patches_per_chunk = processor.info.get_patch_per_chunk(config.vision_config)
assert (
prompt_token_ids.count(config.image_token_index)
== sum(mm_data["patches_per_image"]) * num_patches_per_chunk
)
assert len(mm_data["pixel_values"]) == sum(mm_data["patches_per_image"])
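The separator counts asserted above follow directly from the tile grid; for a single hypothetical 3x2-tile image:

tiles_x, tiles_y = 3, 2
num_x_separators = (tiles_x - 1) * tiles_y  # tile_token between columns, per row
num_y_separators = tiles_y                  # tile_global_token once per tile row
assert (num_x_separators, num_y_separators) == (4, 2)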

View File

@ -22,8 +22,9 @@ def _validate_image_max_tokens_one(
image_size: ImageSize,
) -> None:
info = processor.info
feature_size = info.get_num_image_tokens(image_width=image_size.width,
image_height=image_size.height)
feature_size = info.get_num_image_tokens(
image_width=image_size.width, image_height=image_size.height
)
try:
assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
@ -31,8 +32,9 @@ def _validate_image_max_tokens_one(
failed_size_excs.append((image_size, exc))
@pytest.mark.skip("This test takes around 5 minutes to run. "
"Comment this out to run it manually.")
@pytest.mark.skip(
"This test takes around 5 minutes to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
def test_processor_max_tokens(model_id):
ctx = build_model_context(
@ -66,9 +68,9 @@ def test_processor_max_tokens(model_id):
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
if failed_size_excs:
msg = "Found failing image sizes:" \
+ "\n========\n".join(f"[{size}]\n{exc}"
for size, exc in failed_size_excs)
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
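The aggregated failure message joins every offending size with a divider; on dummy data:

failed_size_excs = [((333, 296), "too many tokens"), ((488, 183), "mismatch")]
msg = "Found failing image sizes:" + "\n========\n".join(
    f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
assert msg.count("========") == len(failed_size_excs) - 1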
@ -94,8 +96,10 @@ def _validate_image_prompt_replacements_one(
# NOTE: There is a BOS token
assert first_placeholder.offset == 1
assert first_placeholder.length == (
len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
assert (
first_placeholder.length
== (len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
)
except Exception as exc:
failed_size_excs.append((image_size, exc))
@ -122,9 +126,9 @@ def _test_image_prompt_replacements(
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
if failed_size_excs:
msg = "Found failing image sizes:" \
+ "\n========\n".join(f"[{size}]\n{exc}"
for size, exc in failed_size_excs)
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
@ -138,11 +142,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
(488, 183), (2560, 1669)]
image_ratios = [
(171, 152),
(184, 161),
(198, 176),
(333, 296),
(369, 328),
(488, 183),
(2560, 1669),
]
image_sizes = [
size for w, h in image_ratios
for size in [ImageSize(w, h), ImageSize(h, w)]
size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
]
_test_image_prompt_replacements(
@ -152,8 +162,9 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
@pytest.mark.skip("This test takes around 2 hours to run. "
"Comment this out to run it manually.")
@pytest.mark.skip(
"This test takes around 2 hours to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):


@ -22,8 +22,9 @@ def _validate_image_max_tokens_one(
image_size: ImageSize,
) -> None:
info = processor.info
feature_size = info.get_num_image_tokens(image_width=image_size.width,
image_height=image_size.height)
feature_size = info.get_num_image_tokens(
image_width=image_size.width, image_height=image_size.height
)
try:
assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
@ -31,10 +32,10 @@ def _validate_image_max_tokens_one(
failed_size_excs.append((image_size, exc))
@pytest.mark.skip("This test takes around 5 minutes to run. "
"Comment this out to run it manually.")
@pytest.mark.parametrize("model_id",
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.skip(
"This test takes around 5 minutes to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_processor_max_tokens(model_id):
ctx = build_model_context(
model_id,
@ -67,9 +68,9 @@ def test_processor_max_tokens(model_id):
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
if failed_size_excs:
msg = "Found failing image sizes:" \
+ "\n========\n".join(f"[{size}]\n{exc}"
for size, exc in failed_size_excs)
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
@ -94,8 +95,10 @@ def _validate_image_prompt_replacements_one(
first_placeholder = image_placeholders[0]
assert first_placeholder.offset == 0
assert first_placeholder.length == len(
processed_inputs["prompt_token_ids"]) // num_imgs
assert (
first_placeholder.length
== len(processed_inputs["prompt_token_ids"]) // num_imgs
)
except Exception as exc:
failed_size_excs.append((image_size, exc))
@ -121,14 +124,13 @@ def _test_image_prompt_replacements(
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
if failed_size_excs:
msg = "Found failing image sizes:" \
+ "\n========\n".join(f"[{size}]\n{exc}"
for size, exc in failed_size_excs)
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
@pytest.mark.parametrize("model_id",
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
ctx = build_model_context(
@ -138,11 +140,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
(488, 183), (2560, 1669)]
image_ratios = [
(171, 152),
(184, 161),
(198, 176),
(333, 296),
(369, 328),
(488, 183),
(2560, 1669),
]
image_sizes = [
size for w, h in image_ratios
for size in [ImageSize(w, h), ImageSize(h, w)]
size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
]
_test_image_prompt_replacements(
@ -152,10 +160,10 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
@pytest.mark.skip("This test takes around 2 hours to run. "
"Comment this out to run it manually.")
@pytest.mark.parametrize("model_id",
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.skip(
"This test takes around 2 hours to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):
ctx = build_model_context(


@ -61,17 +61,17 @@ def _test_image_prompt_replacements(
num_imgs: int,
image_sizes: list[ImageSize],
) -> None:
failed_size_excs = list[tuple[ImageSize, Exception]]()
for size in image_sizes:
_validate_image_prompt_replacements_one(processor, num_imgs,
failed_size_excs, size)
_validate_image_prompt_replacements_one(
processor, num_imgs, failed_size_excs, size
)
if failed_size_excs:
msg = "Found failing image sizes:" \
+ "\n========\n".join(f"[{size}]\n{exc}"
for size, exc in failed_size_excs)
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
@ -85,11 +85,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
(488, 183), (2560, 1669)]
image_ratios = [
(171, 152),
(184, 161),
(198, 176),
(333, 296),
(369, 328),
(488, 183),
(2560, 1669),
]
image_sizes = [
size for w, h in image_ratios
for size in [ImageSize(w, h), ImageSize(h, w)]
size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
]
_test_image_prompt_replacements(


@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for mllama's multimodal preprocessing and profiling."""
import pytest
from torch import prod
from transformers import Llama4Config
@ -47,14 +48,17 @@ def test_profiling(model_id: str, max_model_len: int):
image_size = hf_config.vision_config.image_size
patch_size = hf_config.vision_config.patch_size
downsample_ratio = int(
round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2)))
tokens_per_patch = ((image_size // patch_size)**2) // downsample_ratio
round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2))
)
tokens_per_patch = ((image_size // patch_size) ** 2) // downsample_ratio
chunks_per_image = prod(mm_data["patches_per_image"])
total_num_patches = chunks_per_image * tokens_per_patch
num_tiles = mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][
1] # x-y separator tokens
total_tokens = total_num_patches.item() + num_tiles.item(
) + 3 # image start, image, image end
num_tiles = (
mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][1]
) # x-y separator tokens
total_tokens = (
total_num_patches.item() + num_tiles.item() + 3
) # image start, image, image end
profiled_tokens = profiler.get_mm_max_contiguous_tokens(
max_model_len,
@ -63,5 +67,6 @@ def test_profiling(model_id: str, max_model_len: int):
assert total_tokens == profiled_tokens["image"]
assert total_tokens == sum(
placeholder.length for placeholder in
decoder_dummy_data.multi_modal_placeholders["image"])
placeholder.length
for placeholder in decoder_dummy_data.multi_modal_placeholders["image"]
)


@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for Nemotron-Nano-VL's multimodal preprocessing kwargs."""
from collections.abc import Mapping
from typing import Optional
@ -24,7 +25,9 @@ def _get_expected_num_patches(
max_num: int,
):
from vllm.model_executor.models.nemotron_vl import (
calculate_nemotron_vl_targets, get_nemotron_vl_target_ratios)
calculate_nemotron_vl_targets,
get_nemotron_vl_target_ratios,
)
width, height = image.size
@ -63,22 +66,21 @@ def _run_check(
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images)
for image in images
)
print(total_expected_num_patches)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<image>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"].get_data(
)["pixel_values_flat"].shape
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape)
assert img_tok_count == 256 * total_expected_num_patches
assert pixel_shape[0] == total_expected_num_patches
@pytest.mark.parametrize("model_id",
["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
@pytest.mark.parametrize("model_id", ["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
@pytest.mark.parametrize(
"size_factors",
[
@ -125,10 +127,7 @@ def test_processor_override(
_run_check(
processor,
[
rescale_image_size(image_assets[0].pil_image, f)
for f in size_factors
],
[rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
min_num,
max_num,
hf_processor_mm_kwargs,


@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for phi3v's multimodal preprocessing kwargs."""
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
@ -18,7 +19,8 @@ from ...utils import build_model_context
({"num_crops": 16}, 1921),
# the default num_crops of phi-3.5-vision is 4
({}, 757),
])
],
)
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
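
A recurring detail in these hunks: the `# yapf: disable` / `# yapf: enable` guards around parametrize tables survive the conversion but are inert once yapf is gone. Ruff's formatter honors `# fmt: off` / `# fmt: on` instead; a sketch of the equivalent guard (the values here are illustrative, not from the PR):

# fmt: off
EXPECTED = [
    ({"num_crops": 16}, 1921),
    ({},                757),   # aligned on purpose; ruff leaves this block alone
]
# fmt: on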


@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for phi4mm's multimodal preprocessing kwargs."""
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
@ -18,7 +19,8 @@ from ...utils import build_model_context
({"dynamic_hd": 16}, 4433),
# the default num_crops of phi-4-multimodal is 36
({}, 9585),
])
],
)
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
@ -46,8 +48,7 @@ def test_processor_override(
img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
image_size = ctx.get_hf_config(
).embd_layer["image_embd_layer"]["crop_size"]
image_size = ctx.get_hf_config().embd_layer["image_embd_layer"]["crop_size"]
dummy_image_size = (image_size * 7, image_size * 7)
dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
mm_data = {"image": [dummy_image] * num_imgs}
@ -56,5 +57,6 @@ def test_processor_override(
# Ensure we have the right number of placeholders per num_crops size
img_tok_count = processed_inputs["prompt_token_ids"].count(
_IMAGE_PLACEHOLDER_TOKEN_ID)
_IMAGE_PLACEHOLDER_TOKEN_ID
)
assert img_tok_count == expected_toks_per_img * num_imgs


@ -12,10 +12,12 @@ from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
# yapf: disable
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"), [
("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"),
[
({}, 1426, (5704, 1176)),
({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)),
])
],
)
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
@ -48,8 +50,7 @@ def test_processor_override(
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"].get_data(
)["pixel_values"].shape
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values"].shape
assert img_tok_count == expected_toks_per_img * num_imgs
assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs


@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for smolvlm's multimodal preprocessing kwargs."""
import pytest
from transformers import SmolVLMConfig
@ -17,7 +18,8 @@ from ...utils import build_model_context
[
({"max_image_size": {"longest_edge": 384}}, 1377),
({"max_image_size": {"longest_edge": 768}}, 405),
])
],
)
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
@ -42,8 +44,11 @@ def test_processor_override(
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
placeholders = "<image>" if num_imgs == 1 else "\n".join(
f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
placeholders = (
"<image>"
if num_imgs == 1
else "\n".join(f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
)
prompt = f"<|im_start|>User:{placeholders}\n<end_of_utterance>\nAssistant:" # noqa: E501
# Build mm_data
@ -57,8 +62,7 @@ def test_processor_override(
# Ensure the placeholder format is correct
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
"input_ids"][0]
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]
# Ensure we have the right number of placeholders per num_crops size
image_token_id = ctx.get_hf_config().image_token_id


@ -9,23 +9,29 @@ from typing import Any, Union
import numpy as np
import pytest
import torch.nn as nn
from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
UserMessage)
from mistral_common.protocol.instruct.messages import ImageChunk, TextChunk, UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from PIL import Image
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions,
ImageDummyOptions, VideoDummyOptions)
from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment,
initialize_model_parallel)
from vllm.config.multimodal import (
AudioDummyOptions,
BaseDummyOptions,
ImageDummyOptions,
VideoDummyOptions,
)
from vllm.distributed import (
cleanup_dist_env_and_memory,
init_distributed_environment,
initialize_model_parallel,
)
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
from vllm.model_executor.models.interfaces import (SupportsMultiModal,
supports_multimodal)
from vllm.model_executor.models.interfaces import (
SupportsMultiModal,
supports_multimodal,
)
from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
from vllm.multimodal.processing import (BaseMultiModalProcessor,
InputProcessingContext)
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.multimodal.utils import group_mm_kwargs_by_modality
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from vllm.utils import is_list_of
@ -48,13 +54,15 @@ REPO_ID_TO_SKIP = {
}
ImageInput = list[Image.Image]
VideoInput = Union[list[Image.Image], list[np.ndarray],
list[tuple[np.ndarray, dict[str, Any]]]]
VideoInput = Union[
list[Image.Image], list[np.ndarray], list[tuple[np.ndarray, dict[str, Any]]]
]
AudioInput = list[tuple[np.ndarray, int]]
def _resize_data(_data: Union[Image.Image, np.ndarray],
size_factor: float) -> Union[Image.Image, np.ndarray]:
def _resize_data(
_data: Union[Image.Image, np.ndarray], size_factor: float
) -> Union[Image.Image, np.ndarray]:
assert size_factor <= 1, "Size factor must be less than 1"
# Image input
if isinstance(_data, Image.Image):
@ -74,20 +82,18 @@ def _resize_data(_data: Union[Image.Image, np.ndarray],
return _data[..., :T, :H, :W, :C]
# Audio input
elif isinstance(_data, np.ndarray) and _data.ndim == 1:
return _data[:int(len(_data) * size_factor)]
return _data[: int(len(_data) * size_factor)]
raise AssertionError("This line should be unreachable.")
def resize_mm_data(
data: Union[ImageInput, VideoInput, AudioInput],
size_factors: tuple[float,
...]) -> Union[ImageInput, VideoInput, AudioInput]:
size_factors = size_factors[:len(data)]
data: Union[ImageInput, VideoInput, AudioInput], size_factors: tuple[float, ...]
) -> Union[ImageInput, VideoInput, AudioInput]:
size_factors = size_factors[: len(data)]
if is_list_of(data, (Image.Image, np.ndarray, list)):
return [_resize_data(d, s) for d, s in zip(data, size_factors)]
elif is_list_of(data, tuple):
return [(_resize_data(d, s), meta)
for (d, meta), s in zip(data, size_factors)]
return [(_resize_data(d, s), meta) for (d, meta), s in zip(data, size_factors)]
raise ValueError("Unsupported multimodal data type.")
@ -116,12 +122,16 @@ def create_batched_mm_kwargs(
# Mistral chat outputs tokens directly, rather than text prompts
if model_config.tokenizer_mode == "mistral":
images = resized_mm_data.get("image", [])
request = ChatCompletionRequest(messages=[
UserMessage(content=[
TextChunk(text=""),
*(ImageChunk(image=image) for image in images),
]),
])
request = ChatCompletionRequest(
messages=[
UserMessage(
content=[
TextChunk(text=""),
*(ImageChunk(image=image) for image in images),
]
),
]
)
tokenizer = processing_info.get_tokenizer()
res = tokenizer.mistral.encode_chat_completion(request)
prompt = res.tokens
@ -133,10 +143,7 @@ def create_batched_mm_kwargs(
hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
tokenization_kwargs=processor_inputs.tokenization_kwargs,
)["mm_kwargs"].require_data()
items = [
item for modality in supported_mm_limits
for item in mm_kwargs[modality]
]
items = [item for modality in supported_mm_limits for item in mm_kwargs[modality]]
return group_mm_kwargs_by_modality(
items,
merge_by_field_config=model_cls.merge_by_field_config,
@ -167,15 +174,17 @@ def initialize_dummy_model(
cleanup_dist_env_and_memory()
def get_model_id_to_test(
model_arch_list: Iterable[str]) -> list[tuple[str, str]]:
def get_model_id_to_test(model_arch_list: Iterable[str]) -> list[tuple[str, str]]:
filtered_results = []
for model_arch in model_arch_list:
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
if model_info.extras and model_arch in ARCH_NEEDS_EXTRAS:
available_repos = list(
map(lambda model_id: (model_arch, model_id),
[model_info.default, *model_info.extras.values()]))
map(
lambda model_id: (model_arch, model_id),
[model_info.default, *model_info.extras.values()],
)
)
filtered_results.extend(available_repos)
else:
filtered_results.append((model_arch, model_info.default))
@ -183,8 +192,8 @@ def get_model_id_to_test(
@pytest.mark.parametrize(
"model_arch, model_id",
get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys()))
"model_arch, model_id", get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys())
)
def test_model_tensor_schema(model_arch: str, model_id: str):
if model_arch in ARCH_TO_SKIP:
pytest.skip(f"Skipping {model_arch} due to {ARCH_TO_SKIP[model_arch]}")
@ -193,12 +202,13 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip",
check_max_version=False)
model_info.check_transformers_version(on_fail="skip", check_max_version=False)
hf_overrides_fn = partial(dummy_hf_overrides,
model_arch=model_arch,
exist_overrides=model_info.hf_overrides)
hf_overrides_fn = partial(
dummy_hf_overrides,
model_arch=model_arch,
exist_overrides=model_info.hf_overrides,
)
model_config = ModelConfig(
model_id,
@ -256,8 +266,11 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
with initialize_dummy_model(model_cls, model_config) as model:
for modality, _, mm_kwargs in create_batched_mm_kwargs(
model_cls, model_config, processor):
model_cls, model_config, processor
):
for method_name in inputs_parse_methods:
print(f"Testing `{method_name}` with modality={modality} "
f"and mm_kwargs{list(mm_kwargs.keys())}")
print(
f"Testing `{method_name}` with modality={modality} "
f"and mm_kwargs{list(mm_kwargs.keys())}"
)
getattr(model, method_name)(modality=modality, **mm_kwargs)


@ -19,7 +19,7 @@ def create_repo_dummy_weights(repo: str) -> Iterable[tuple[str, torch.Tensor]]:
"""Create weights from safetensors checkpoint metadata"""
metadata = try_get_safetensors_metadata(repo)
weight_names = list(metadata.weight_map.keys())
with torch.device('meta'):
with torch.device("meta"):
return ((name, torch.empty(0)) for name in weight_names)
@ -61,7 +61,8 @@ def test_hf_model_weights_mapper(model_arch: str):
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.skip_tokenizer_init,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype)
dtype=model_info.dtype,
)
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
original_weights = create_repo_dummy_weights(model_id)
@ -83,6 +84,7 @@ def test_hf_model_weights_mapper(model_arch: str):
weights_missing = ref_weight_names - weight_names
weights_unmapped = weight_names - ref_weight_names
assert (not weights_missing and not weights_unmapped), (
assert not weights_missing and not weights_unmapped, (
f"Following weights are not mapped correctly: {weights_unmapped}, "
f"Missing expected weights: {weights_missing}.")
f"Missing expected weights: {weights_missing}."
)
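
create_repo_dummy_weights above enumerates checkpoint weight names via zero-byte tensors on the meta device. A standalone sketch of that pattern, assuming only that torch is installed; the weight names are illustrative.

import torch

with torch.device("meta"):
    # Tensors created here carry shape/dtype metadata but allocate no storage,
    # so checkpoint-sized name maps can be built without touching memory.
    dummy_weights = {name: torch.empty(0) for name in ("a.weight", "b.bias")}

assert all(t.device.type == "meta" for t in dummy_weights.values())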


@ -11,12 +11,12 @@ from vllm.multimodal.image import rescale_image_size
from ...conftest import IMAGE_ASSETS, ImageTestAssets, VllmRunner
from ..utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"<|im_start|>User\n<image>\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
"cherry_blossom":
"<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
})
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "<|im_start|>User\n<image>\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
"cherry_blossom": "<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
}
)
def run_awq_test(
@ -34,10 +34,13 @@ def run_awq_test(
):
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
inputs_per_image = [
(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
)
for image, prompt in zip(images, HF_IMAGE_PROMPTS)
]
# NOTE: take care of the order. Run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
@ -46,42 +49,41 @@ def run_awq_test(
# max_model_len should be greater than image_feature_size
with vllm_runner(
source_model,
max_model_len=4096,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
default_torch_num_threads=1,
source_model,
max_model_len=4096,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
default_torch_num_threads=1,
) as vllm_model:
source_outputs_per_image = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
vllm_model.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs=num_logprobs, images=images
)
for prompts, images in inputs_per_image
]
with vllm_runner(
quant_model,
quantization="awq",
max_model_len=4096,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
default_torch_num_threads=1,
quant_model,
quantization="awq",
max_model_len=4096,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
default_torch_num_threads=1,
) as vllm_model:
quant_outputs_per_image = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
vllm_model.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs=num_logprobs, images=images
)
for prompts, images in inputs_per_image
]
for source_outputs, quant_outputs in zip(source_outputs_per_image,
quant_outputs_per_image):
for source_outputs, quant_outputs in zip(
source_outputs_per_image, quant_outputs_per_image
):
# TODO: Check whether using original CLIPVisionModel can improve
# consistency against HF
check_logprobs_close(
@ -113,9 +115,16 @@ def run_awq_test(
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@torch.inference_mode()
def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
size_factors, dtype, max_tokens, num_logprobs) -> None:
def test_awq_models(
vllm_runner,
image_assets,
source_model,
quant_model,
size_factors,
dtype,
max_tokens,
num_logprobs,
) -> None:
run_awq_test(
vllm_runner,
image_assets,


@ -7,9 +7,10 @@ As a result, in this test, we just confirm that the top selected tokens of the
bitblas/GPTQ models are in the top 3 selections of each other.
Note: bitblas internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for bitblas. As a result, we re-run the
test up to 3 times to see if we pass.
"""
from dataclasses import dataclass
import pytest
@ -24,8 +25,10 @@ class ModelPair:
model_pairs = [
ModelPair(model_bitblas="hxbgsyxh/opt-125m-4bit-128g-bitblas",
model_gptq="hxbgsyxh/opt-125m-4bit-128g"),
ModelPair(
model_bitblas="hxbgsyxh/opt-125m-4bit-128g-bitblas",
model_gptq="hxbgsyxh/opt-125m-4bit-128g",
),
]
@ -43,16 +46,19 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(model_pair.model_bitblas,
dtype=dtype,
quantization="bitblas") as bitblas_model:
with vllm_runner(
model_pair.model_bitblas, dtype=dtype, quantization="bitblas"
) as bitblas_model:
bitblas_outputs = bitblas_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model_pair.model_gptq, dtype=dtype,
quantization="gptq") as gptq_model:
with vllm_runner(
model_pair.model_gptq, dtype=dtype, quantization="gptq"
) as gptq_model:
gptq_outputs = gptq_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=gptq_outputs,
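
Per the docstring at the top of this file, bitblas and GPTQ outputs are compared by mutual top-k membership rather than exact equality. A toy illustration of that acceptance criterion; the helper below is hypothetical, not vLLM's actual check_logprobs_close.

def mutually_in_topk(greedy_a, topk_b, greedy_b, topk_a) -> bool:
    # Each model's greedy token must appear among the other's top-k candidates.
    a_in_b = all(tok in cands for tok, cands in zip(greedy_a, topk_b))
    b_in_a = all(tok in cands for tok, cands in zip(greedy_b, topk_a))
    return a_in_b and b_in_a


assert mutually_in_topk([5, 2], [{5, 1, 9}, {2, 4, 8}], [5, 4], [{5, 7}, {4, 2, 6}])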


@ -1,9 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
'''Tests whether bitsandbytes computation is enabled correctly.
"""Tests whether bitsandbytes computation is enabled correctly.
Run `pytest tests/quantization/test_bitsandbytes.py`.
'''
"""
import pytest
from transformers import BitsAndBytesConfig
@ -15,8 +15,10 @@ from ..utils import check_embeddings_close, check_logprobs_close
models_4bit_to_test = [
("facebook/opt-125m", "quantize opt model inflight"),
("mistralai/Mistral-7B-Instruct-v0.3",
"quantize inflight model with both HF and Mistral format weights")
(
"mistralai/Mistral-7B-Instruct-v0.3",
"quantize inflight model with both HF and Mistral format weights",
),
]
models_4bit_to_embedding_test = [
@ -28,72 +30,84 @@ models_4bit_to_moe_test = [
]
models_pre_qaunt_4bit_to_test = [
('PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed',
'read pre-quantized 4-bit FP4 model'),
('poedator/opt-125m-bnb-4bit', 'read pre-quantized 4-bit NF4 opt model'),
(
"PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed",
"read pre-quantized 4-bit FP4 model",
),
("poedator/opt-125m-bnb-4bit", "read pre-quantized 4-bit NF4 opt model"),
]
models_pre_quant_8bit_to_test = [
('meta-llama/Llama-Guard-3-8B-INT8',
'read pre-quantized llama 8-bit model'),
("meta-llama/Llama-Guard-3-8B-INT8", "read pre-quantized llama 8-bit model"),
("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
]
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
load_in_4bit=True))
validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
model_name, False, hf_model_kwargs)
def test_load_4bit_bnb_model(
hf_runner, vllm_runner, example_prompts, model_name, description
) -> None:
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
validate_generated_texts(
hf_runner, vllm_runner, example_prompts[:1], model_name, False, hf_model_kwargs
)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
models_pre_qaunt_4bit_to_test)
def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
model_name, True)
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_pre_qaunt_4bit_to_test)
def test_load_pre_quant_4bit_bnb_model(
hf_runner, vllm_runner, example_prompts, model_name, description
) -> None:
validate_generated_texts(
hf_runner, vllm_runner, example_prompts[:1], model_name, True
)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
models_pre_quant_8bit_to_test)
def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
model_name, True)
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_pre_quant_8bit_to_test)
def test_load_8bit_bnb_model(
hf_runner, vllm_runner, example_prompts, model_name, description
) -> None:
validate_generated_texts(
hf_runner, vllm_runner, example_prompts[:1], model_name, True
)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@multi_gpu_test(num_gpus=2)
def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
load_in_4bit=True))
validate_generated_texts(hf_runner,
vllm_runner,
example_prompts[:1],
model_name,
False,
hf_model_kwargs,
vllm_tp_size=2)
def test_load_tp_4bit_bnb_model(
hf_runner, vllm_runner, example_prompts, model_name, description
) -> None:
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
validate_generated_texts(
hf_runner,
vllm_runner,
example_prompts[:1],
model_name,
False,
hf_model_kwargs,
vllm_tp_size=2,
)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@multi_gpu_test(num_gpus=2)
def test_load_pp_4bit_bnb_model(model_name, description) -> None:
@ -115,30 +129,37 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None:
compare_two_settings(model_name, common_args, pp_args)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_4bit_to_moe_test)
def test_4bit_bnb_moe_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
def test_4bit_bnb_moe_model(
hf_runner, vllm_runner, example_prompts, model_name, description
) -> None:
hf_model_kwargs = dict(
quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
)
)
with vllm_runner(
model_name,
quantization="bitsandbytes",
enforce_eager=False,
default_torch_num_threads=1,
) as llm:
vllm_outputs = llm.generate_greedy_logprobs(
example_prompts, max_tokens=32, num_logprobs=5
)
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
))
with vllm_runner(model_name,
quantization='bitsandbytes',
enforce_eager=False,
default_torch_num_threads=1) as llm:
vllm_outputs = llm.generate_greedy_logprobs(example_prompts,
max_tokens=32,
num_logprobs=5)
with hf_runner(model_name,
model_kwargs=hf_model_kwargs,
default_torch_num_threads=1) as llm:
with hf_runner(
model_name, model_kwargs=hf_model_kwargs, default_torch_num_threads=1
) as llm:
transformers_outputs = llm.generate_greedy_logprobs_limit(
example_prompts, max_tokens=32, num_logprobs=5)
example_prompts, max_tokens=32, num_logprobs=5
)
check_logprobs_close(
outputs_0_lst=transformers_outputs,
outputs_1_lst=vllm_outputs,
@ -147,10 +168,11 @@ def test_4bit_bnb_moe_model(hf_runner, vllm_runner, example_prompts,
)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
models_4bit_to_embedding_test)
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_4bit_to_embedding_test)
@pytest.mark.parametrize("dtype", ["half"])
def test_4bit_bnb_embedding_model(
model_name,
@ -160,7 +182,6 @@ def test_4bit_bnb_embedding_model(
example_prompts,
dtype: str,
) -> None:
# The example_prompts has ending "\n", for example:
# "Write a short story about a robot that dreams for the first time.\n"
# sentence_transformers will strip the input texts, see:
@ -170,22 +191,23 @@ def test_4bit_bnb_embedding_model(
example_prompts = [str(s).strip() for s in example_prompts]
# Inflight 4bit quantization
with vllm_runner(model_name,
runner="pooling",
dtype=dtype,
gpu_memory_utilization=0.5,
quantization="bitsandbytes",
default_torch_num_threads=1) as vllm_model:
with vllm_runner(
model_name,
runner="pooling",
dtype=dtype,
gpu_memory_utilization=0.5,
quantization="bitsandbytes",
default_torch_num_threads=1,
) as vllm_model:
vllm_outputs = vllm_model.embed(example_prompts)
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
load_in_4bit=True))
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
with hf_runner(
model_name,
dtype=dtype,
model_kwargs=hf_model_kwargs,
is_sentence_transformer=True,
default_torch_num_threads=1,
model_name,
dtype=dtype,
model_kwargs=hf_model_kwargs,
is_sentence_transformer=True,
default_torch_num_threads=1,
) as hf_model:
hf_outputs = hf_model.encode(example_prompts)
@ -210,23 +232,25 @@ def log_generated_texts(prompts, outputs, runner_name):
return logged_texts
def validate_generated_texts(hf_runner,
vllm_runner,
prompts,
model_name,
pre_quant=False,
hf_model_kwargs=None,
vllm_tp_size=1,
max_tokens=8):
def validate_generated_texts(
hf_runner,
vllm_runner,
prompts,
model_name,
pre_quant=False,
hf_model_kwargs=None,
vllm_tp_size=1,
max_tokens=8,
):
# NOTE: run vLLM first, as it requires a clean process
# when using distributed inference
with vllm_runner(model_name,
quantization=None if pre_quant else 'bitsandbytes',
tensor_parallel_size=vllm_tp_size,
enforce_eager=False,
default_torch_num_threads=1) as llm:
with vllm_runner(
model_name,
quantization=None if pre_quant else "bitsandbytes",
tensor_parallel_size=vllm_tp_size,
enforce_eager=False,
default_torch_num_threads=1,
) as llm:
vllm_outputs = llm.generate_greedy(prompts, max_tokens)
vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")
@ -234,9 +258,9 @@ def validate_generated_texts(hf_runner,
hf_model_kwargs = {}
# Run with HF runner
with hf_runner(model_name,
model_kwargs=hf_model_kwargs,
default_torch_num_threads=1) as llm:
with hf_runner(
model_name, model_kwargs=hf_model_kwargs, default_torch_num_threads=1
) as llm:
hf_outputs = llm.generate_greedy(prompts, max_tokens)
hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
@ -245,8 +269,10 @@ def validate_generated_texts(hf_runner,
hf_str = hf_log["generated_text"]
vllm_str = vllm_log["generated_text"]
prompt = hf_log["prompt"]
assert hf_str == vllm_str, (f"Model: {model_name}"
f"Mismatch between HF and vLLM outputs:\n"
f"Prompt: {prompt}\n"
f"HF Output: '{hf_str}'\n"
f"vLLM Output: '{vllm_str}'")
assert hf_str == vllm_str, (
f"Model: {model_name}"
f"Mismatch between HF and vLLM outputs:\n"
f"Prompt: {prompt}\n"
f"HF Output: '{hf_str}'\n"
f"vLLM Output: '{vllm_str}'"
)
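
These tests mirror the same quantization on the HF side via BitsAndBytesConfig. A minimal sketch of that configuration outside the test harness, assuming the transformers and bitsandbytes packages are installed; the model id is illustrative.

from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",        # NF4 4-bit quantization
    bnb_4bit_use_double_quant=True,   # also quantize the quantization constants
)
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m", quantization_config=quant_config
)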


@ -5,6 +5,7 @@
"""Tests fp8 models against ground truth generation
Note: these tests will only pass on L4 GPU.
"""
import pytest
from tests.quantization.utils import is_quant_method_supported
@ -14,21 +15,33 @@ from vllm.utils import STR_BACKEND_ENV_VAR
from ..utils import check_logprobs_close
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.")
@pytest.mark.skipif(
not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.",
)
@pytest.mark.parametrize(
"kv_cache_dtype,base_model,test_model",
[
# Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
"nm-testing/Llama-3.2-1B-Instruct-FP8-KV"),
(
"fp8_e4m3",
"meta-llama/Llama-3.2-1B-Instruct",
"nm-testing/Llama-3.2-1B-Instruct-FP8-KV",
),
# Test BF16 checkpoint w. fp8_e5m2 kv-cache.
("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct"),
(
"fp8_e5m2",
"meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct",
),
# Test BF16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct")
])
(
"fp8_e4m3",
"meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct",
),
],
)
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("enforce_eager", [True])
@ -54,38 +67,39 @@ def test_models(
"""
if kv_cache_dtype == "fp8_e5m2" and current_platform.is_rocm():
pytest.skip(
f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")
pytest.skip(f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")
if not current_platform.is_kv_cache_dtype_supported(kv_cache_dtype, None):
pytest.skip(f"{kv_cache_dtype} is not supported on this platform.")
with monkeypatch.context() as m:
m.setenv("TOKENIZERS_PARALLELISM", 'true')
m.setenv("TOKENIZERS_PARALLELISM", "true")
m.setenv(STR_BACKEND_ENV_VAR, backend)
MAX_MODEL_LEN = 1024
NUM_LOG_PROBS = 8
with vllm_runner(
base_model,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype="auto",
base_model,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype="auto",
) as vllm_model:
baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
example_prompts, max_tokens, NUM_LOG_PROBS
)
with vllm_runner(
test_model,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
test_model,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
) as vllm_model:
test_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
example_prompts, max_tokens, NUM_LOG_PROBS
)
check_logprobs_close(
outputs_0_lst=baseline_outputs,
@ -96,15 +110,18 @@ def test_models(
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(),
reason="test for the CPU backend.")
@pytest.mark.skipif(not current_platform.is_cpu(), reason="test for the CPU backend.")
@pytest.mark.parametrize(
"kv_cache_dtype,base_model,test_model",
[
# Test BF16 checkpoint w. fp8_e5m2 kv-cache.
("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct"),
])
(
"fp8_e5m2",
"meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct",
),
],
)
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
def test_cpu_models(
@ -121,28 +138,30 @@ def test_cpu_models(
numerically sensitive kernels.
"""
with monkeypatch.context() as m:
m.setenv("TOKENIZERS_PARALLELISM", 'true')
m.setenv("TOKENIZERS_PARALLELISM", "true")
MAX_MODEL_LEN = 1024
NUM_LOG_PROBS = 8
with vllm_runner(
base_model,
max_model_len=MAX_MODEL_LEN,
dtype="bfloat16",
kv_cache_dtype="auto",
base_model,
max_model_len=MAX_MODEL_LEN,
dtype="bfloat16",
kv_cache_dtype="auto",
) as vllm_model:
baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
example_prompts, max_tokens, NUM_LOG_PROBS
)
with vllm_runner(
test_model,
max_model_len=MAX_MODEL_LEN,
dtype="bfloat16",
kv_cache_dtype=kv_cache_dtype,
test_model,
max_model_len=MAX_MODEL_LEN,
dtype="bfloat16",
kv_cache_dtype=kv_cache_dtype,
) as vllm_model:
test_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
example_prompts, max_tokens, NUM_LOG_PROBS
)
check_logprobs_close(
outputs_0_lst=baseline_outputs,
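
Outside the test harness, the fp8 KV-cache setup exercised above reduces to a single engine argument. A hypothetical minimal reproduction, assuming vllm is installed and the device supports the chosen dtype:

from vllm import LLM, SamplingParams

llm = LLM(
    "meta-llama/Llama-3.2-1B-Instruct",
    max_model_len=1024,
    kv_cache_dtype="fp8_e5m2",  # "auto" keeps the model dtype for the KV cache
)
outputs = llm.generate(
    ["The capital of France is"], SamplingParams(temperature=0, max_tokens=4)
)
print(outputs[0].outputs[0].text)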


@ -100,35 +100,37 @@ def check_model_outputs(
):
tokenizer = AutoTokenizer.from_pretrained(model.original_model)
if tokenizer.chat_template is not None:
messages = [[{
'role': 'user',
'content': prompt
}] for prompt in prompts]
prompts = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
messages = [[{"role": "user", "content": prompt}] for prompt in prompts]
prompts = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
# Run gguf model.
with vllm_runner(model_name=model.gguf_model,
enforce_eager=True,
tokenizer_name=model.original_model,
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tp_size) as gguf_model:
with vllm_runner(
model_name=model.gguf_model,
enforce_eager=True,
tokenizer_name=model.original_model,
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tp_size,
) as gguf_model:
gguf_outputs = gguf_model.generate_greedy_logprobs(
prompts[:-1], max_tokens, num_logprobs)
prompts[:-1], max_tokens, num_logprobs
)
# Run unquantized model.
# Should run with tp=1, otherwise the test will get stuck at
# nccl initialization.
with vllm_runner(
model_name=model.original_model,
enforce_eager=True, # faster tests
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1) as original_model:
model_name=model.original_model,
enforce_eager=True, # faster tests
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1,
) as original_model:
original_outputs = original_model.generate_greedy_logprobs(
prompts[:-1], max_tokens, num_logprobs)
prompts[:-1], max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=original_outputs,
@ -138,12 +140,14 @@ def check_model_outputs(
)
@pytest.mark.skipif(not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.")
@pytest.mark.parametrize("model", [
pytest.param(test_config, marks=test_config.marks)
for test_config in MODELS
])
@pytest.mark.skipif(
not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.",
)
@pytest.mark.parametrize(
"model",
[pytest.param(test_config, marks=test_config.marks) for test_config in MODELS],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@ -157,12 +161,15 @@ def test_models(
num_logprobs: int,
tp_size: int,
) -> None:
check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
num_logprobs, tp_size)
check_model_outputs(
vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size
)
@pytest.mark.skipif(not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.")
@pytest.mark.skipif(
not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.",
)
@pytest.mark.parametrize("model", [LLAMA_CONFIG])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [8])
@ -178,5 +185,6 @@ def test_distributed(
num_logprobs: int,
tp_size: int,
) -> None:
check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
num_logprobs, tp_size)
check_model_outputs(
vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size
)


@ -7,9 +7,10 @@ As a result, in this test, we just confirm that the top selected tokens of the
bitblas/GPTQ models are in the top 3 selections of each other.
Note: bitblas internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for bitblas. As a result, we re-run the
test up to 3 times to see if we pass.
"""
from dataclasses import dataclass
import pytest
@ -41,16 +42,19 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(model_pair.model_gptq,
dtype=dtype,
quantization="bitblas") as bitblas_model:
with vllm_runner(
model_pair.model_gptq, dtype=dtype, quantization="bitblas"
) as bitblas_model:
bitblas_outputs = bitblas_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model_pair.model_gptq, dtype=dtype,
quantization="gptq") as gptq_model:
with vllm_runner(
model_pair.model_gptq, dtype=dtype, quantization="gptq"
) as gptq_model:
gptq_outputs = gptq_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=gptq_outputs,


@ -9,6 +9,7 @@ Note: Marlin internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for Marlin. As a result, we re-run the test
up to 3 times to see if we pass.
"""
import os
import pytest
@ -26,20 +27,20 @@ MAX_MODEL_LEN = 1024
MODELS = [
# act_order==True, group_size=128
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
# 8-bit, act_order==True, group_size=channelwise
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
# 4-bit, act_order==True, group_size=128
("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")
("TechxGenus/gemma-1.1-2b-it-GPTQ", "main"),
]
@pytest.mark.flaky(reruns=3)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin")
or current_platform.is_rocm()
or not current_platform.is_cuda(),
reason="gptq_marlin is not supported on this GPU type.")
@pytest.mark.skipif(
not is_quant_method_supported("gptq_marlin")
or current_platform.is_rocm()
or not current_platform.is_cuda(),
reason="gptq_marlin is not supported on this GPU type.",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@ -55,29 +56,34 @@ def test_models(
model_name, revision = model
# Run marlin.
with vllm_runner(model_name=model_name,
revision=revision,
dtype=dtype,
quantization="marlin",
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1) as gptq_marlin_model:
with vllm_runner(
model_name=model_name,
revision=revision,
dtype=dtype,
quantization="marlin",
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1,
) as gptq_marlin_model:
gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
example_prompts[:-1], max_tokens, num_logprobs)
example_prompts[:-1], max_tokens, num_logprobs
)
_ROPE_DICT.clear() # clear rope cache to avoid rope dtype error
# Run gptq.
# The naive gptq kernel doesn't support bf16 yet.
# Here we always compare fp16/bf16 gpt marlin kernel
# to fp16 gptq kernel.
with vllm_runner(model_name=model_name,
revision=revision,
dtype="half",
quantization="gptq",
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1) as gptq_model:
with vllm_runner(
model_name=model_name,
revision=revision,
dtype="half",
quantization="gptq",
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1,
) as gptq_model:
gptq_outputs = gptq_model.generate_greedy_logprobs(
example_prompts[:-1], max_tokens, num_logprobs)
example_prompts[:-1], max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=gptq_outputs,


@ -6,6 +6,7 @@ Note: GPTQ and Marlin_24 do not have bitwise correctness.
As a result, in this test, we just confirm that the top selected tokens of the
Marlin/GPTQ models are in the top 3 selections of each other.
"""
from dataclasses import dataclass
import pytest
@ -24,15 +25,18 @@ class ModelPair:
model_pairs = [
# 4-bit, group_size == 128
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
ModelPair(
model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128",
),
# # 4-bit, group_size == channelwise
# ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
# model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
# 8-bit, group_size == 128
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
ModelPair(
model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128",
),
# # 8-bit, group_size == channelwise
# ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
# model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
@ -40,10 +44,12 @@ model_pairs = [
@pytest.mark.flaky(reruns=2)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24")
or current_platform.is_rocm()
or not current_platform.is_cuda(),
reason="Marlin24 is not supported on this GPU type.")
@pytest.mark.skipif(
not is_quant_method_supported("gptq_marlin_24")
or current_platform.is_rocm()
or not current_platform.is_cuda(),
reason="Marlin24 is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_pair", model_pairs)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [8])
@ -56,16 +62,19 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(model_pair.model_marlin,
dtype=dtype,
quantization="gptq_marlin_24") as marlin_24_model:
with vllm_runner(
model_pair.model_marlin, dtype=dtype, quantization="gptq_marlin_24"
) as marlin_24_model:
marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model_pair.model_gptq, dtype=dtype,
quantization="gptq") as gptq_model:
with vllm_runner(
model_pair.model_gptq, dtype=dtype, quantization="gptq"
) as gptq_model:
gptq_outputs = gptq_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=gptq_outputs,


@ -5,6 +5,7 @@
"""Tests Model Optimizer fp8 models against ground truth generation
Note: these tests will only pass on H100
"""
import os
import pytest
@ -22,13 +23,13 @@ MODELS = ["nvidia/Llama-3.1-8B-Instruct-FP8"]
EXPECTED_STRS_MAP = {
"nvidia/Llama-3.1-8B-Instruct-FP8": [
"You're referring to VLLM, a high-performance Large Language Model (LLM) inference and",
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
'The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and',
"Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ",
"The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and",
'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
'**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir',
'The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to',
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
'Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる'
"**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir",
"The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to",
"The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of",
"Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる",
]
}
@ -39,10 +40,12 @@ EXPECTED_STRS_MAP = {
# the hardware being run on.
# Disabled to prevent it from breaking the build
@pytest.mark.skip(
reason=
"Prevent unstable test based on golden strings from breaking the build.")
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.")
reason="Prevent unstable test based on golden strings from breaking the build."
)
@pytest.mark.skipif(
not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
llm = LLM(
@ -55,12 +58,11 @@ def test_models(example_prompts, model_name) -> None:
tokenizer = AutoTokenizer.from_pretrained(model_name)
formatted_prompts = [
tokenizer.apply_chat_template([{
"role": "user",
"content": prompt
}],
tokenize=False,
add_generation_prompt=True)
tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
tokenize=False,
add_generation_prompt=True,
)
for prompt in example_prompts
]
params = SamplingParams(max_tokens=20, temperature=0)
@ -78,4 +80,5 @@ def test_models(example_prompts, model_name) -> None:
generated_str = generations[i]
expected_str = expected_strs[i]
assert expected_str == generated_str, (
f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}"
)
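
End to end, the golden-string flow above reduces to: render the chat template, decode greedily (temperature=0), compare against pinned text. A compact, hypothetical sketch assuming vllm and transformers; the model id and prompt are illustrative, not from the test.

from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is vLLM?"}],
    tokenize=False,
    add_generation_prompt=True,
)
llm = LLM(model_id)
out = llm.generate([prompt], SamplingParams(max_tokens=20, temperature=0))
print(out[0].outputs[0].text)  # the real test pins this against a golden string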


@ -1,8 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# flake8: noqa
"""Tests Quark mxfp4 models against ground truth generation
"""
"""Tests Quark mxfp4 models against ground truth generation"""
import pytest
from vllm import LLM, SamplingParams
@ -11,13 +11,13 @@ MODELS = ["amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8"]
EXPECTED_STRS_MAP = {
"amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8": [
'\n### Key Features\n\n* **High-throughput Inference**: vLL',
'\nArtificial intelligence (AI) has evolved significantly since its inception in the 1',
'Artificial intelligence (AI) and human intelligence (HI) are two distinct concepts that have been',
'A neural network is a machine learning model inspired by the structure of the human brain. It consists of',
'\nTitle: The Dreaming Robot\n\nAs the sun set on the bustling metropol',
'\nThe COVID-19 pandemic has had a profound impact on global economic structures and business',
'The Mona Lisa painting, created by Leonardo da Vinci in the early 16th',
"\n### Key Features\n\n* **High-throughput Inference**: vLL",
"\nArtificial intelligence (AI) has evolved significantly since its inception in the 1",
"Artificial intelligence (AI) and human intelligence (HI) are two distinct concepts that have been",
"A neural network is a machine learning model inspired by the structure of the human brain. It consists of",
"\nTitle: The Dreaming Robot\n\nAs the sun set on the bustling metropol",
"\nThe COVID-19 pandemic has had a profound impact on global economic structures and business",
"The Mona Lisa painting, created by Leonardo da Vinci in the early 16th",
" everybody knows this proverbial saying, but did you know that it's not entirely accurate?",
]
}
@ -38,4 +38,5 @@ def test_models(example_prompts, model_name) -> None:
output_str = output.outputs[0].text
expected_str = EXPECTED_STRS_MAP[model_name][i]
assert expected_str == output_str, (
f"Expected: {expected_str!r}\nvLLM: {output_str!r}")
f"Expected: {expected_str!r}\nvLLM: {output_str!r}"
)

View File

@ -4,6 +4,7 @@
"""Tests Model Optimizer nvfp4 models against ground truth generation
Note: these tests will only pass on B200
"""
import os
from typing import List
@ -21,14 +22,14 @@ MODELS = ["nvidia/Llama-3.3-70B-Instruct-FP4"]
EXPECTED_STRS_MAP = {
"nvidia/Llama-3.3-70B-Instruct-FP4": [
'vLLM (Vectorized Large Language Model) is indeed a high-throughput and memory-efficient inference',
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
'Artificial intelligence (AI) and human intelligence (HI) are two distinct forms of intelligence that process',
'A neural network is a type of machine learning model inspired by the structure and function of the human brain',
'In the heart of a cutting-edge robotics lab, a team of engineers had been working tirelessly to push',
'The COVID-19 pandemic has had a profound impact on global economic structures and future business models, leading',
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
'Here are the translations:\n\n* Japanese: (Sasuga no tori ga miwa o ts'
"vLLM (Vectorized Large Language Model) is indeed a high-throughput and memory-efficient inference",
"Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ",
"Artificial intelligence (AI) and human intelligence (HI) are two distinct forms of intelligence that process",
"A neural network is a type of machine learning model inspired by the structure and function of the human brain",
"In the heart of a cutting-edge robotics lab, a team of engineers had been working tirelessly to push",
"The COVID-19 pandemic has had a profound impact on global economic structures and future business models, leading",
"The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of",
"Here are the translations:\n\n* Japanese: (Sasuga no tori ga miwa o ts",
]
}
@ -39,11 +40,13 @@ EXPECTED_STRS_MAP = {
# the hardware being run on.
# Disabled to prevent it from breaking the build
@pytest.mark.skip(
reason=
"Prevent unstable test based on golden strings from breaking the build "
" and test input model being too large and hanging the system.")
@pytest.mark.skipif(not is_quant_method_supported("modelopt_fp4"),
reason="modelopt_fp4 is not supported on this GPU type.")
reason="Prevent unstable test based on golden strings from breaking the build "
" and test input model being too large and hanging the system."
)
@pytest.mark.skipif(
not is_quant_method_supported("modelopt_fp4"),
reason="modelopt_fp4 is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
llm = LLM(
@ -56,12 +59,11 @@ def test_models(example_prompts, model_name) -> None:
tokenizer = AutoTokenizer.from_pretrained(model_name)
formatted_prompts = [
tokenizer.apply_chat_template([{
"role": "user",
"content": prompt
}],
tokenize=False,
add_generation_prompt=True)
tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
tokenize=False,
add_generation_prompt=True,
)
for prompt in example_prompts
]
params = SamplingParams(max_tokens=20, temperature=0)
@ -79,4 +81,5 @@ def test_models(example_prompts, model_name) -> None:
generated_str = generations[i]
expected_str = expected_strs[i]
assert expected_str == generated_str, (
f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}"
)

View File

@ -114,8 +114,10 @@ class _HfExamplesInfo:
If the installed transformers version does not meet the requirements,
perform the given action.
"""
if (self.min_transformers_version is None
and self.max_transformers_version is None):
if (
self.min_transformers_version is None
and self.max_transformers_version is None
):
return None
current_version = TRANSFORMERS_VERSION
@ -125,11 +127,17 @@ class _HfExamplesInfo:
msg = f"`transformers=={current_version}` installed, but `transformers"
# Only check the base version for the min/max version, otherwise preview
# models cannot be run because `x.yy.0.dev0`<`x.yy.0`
if (check_min_version and min_version
and Version(cur_base_version) < Version(min_version)):
if (
check_min_version
and min_version
and Version(cur_base_version) < Version(min_version)
):
msg += f">={min_version}` is required to run this model."
elif (check_max_version and max_version
and Version(cur_base_version) > Version(max_version)):
elif (
check_max_version
and max_version
and Version(cur_base_version) > Version(max_version)
):
msg += f"<={max_version}` is required to run this model."
else:
return None
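# A small sketch of the base-version comparison used above: stripping the
# pre-release suffix lets a preview build satisfy a minimum-version check that
# a full comparison would fail (version numbers here are illustrative).
from packaging.version import Version

dev = Version("4.57.0.dev0")
assert dev < Version("4.57.0")        # full compare: preview build looks "too old"
assert dev.base_version == "4.57.0"   # base_version drops the ".dev0" suffix
assert Version(dev.base_version) >= Version("4.57.0")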

View File

@ -8,13 +8,19 @@ import pytest
from vllm import LLM
from vllm.utils import GiB_bytes
from vllm.v1.core.kv_cache_utils import (generate_scheduler_kv_cache_config,
get_kv_cache_configs)
from vllm.v1.core.kv_cache_utils import (
generate_scheduler_kv_cache_config,
get_kv_cache_configs,
)
from vllm.v1.engine.core import EngineCore as V1EngineCore
from ..utils import create_new_process_for_each_test
from .registry import (_TRANSFORMERS_BACKEND_MODELS, AUTO_EXAMPLE_MODELS,
HF_EXAMPLE_MODELS, HfExampleModels)
from .registry import (
_TRANSFORMERS_BACKEND_MODELS,
AUTO_EXAMPLE_MODELS,
HF_EXAMPLE_MODELS,
HfExampleModels,
)
from .utils import dummy_hf_overrides
# This minimal list of model architectures is smaller than the total list of
@ -24,23 +30,32 @@ from .utils import dummy_hf_overrides
# generation, sequence classification, causal LM, ranking, chat, reward model,
# multimodal, geospatial, voice, embedding, MTP)
MINIMAL_MODEL_ARCH_LIST = [
"LlavaForConditionalGeneration", "Llama4ForConditionalGeneration",
"BertForSequenceClassification", "Gemma3nForCausalLM", "JinaVLForRanking",
"InternVLChatModel", "InternLM2ForRewardModel",
"TransformersForMultimodalLM", "PrithviGeoSpatialMAE", "UltravoxModel",
"DeepSeekMTPModel", "XLMRobertaModel"
"LlavaForConditionalGeneration",
"Llama4ForConditionalGeneration",
"BertForSequenceClassification",
"Gemma3nForCausalLM",
"JinaVLForRanking",
"InternVLChatModel",
"InternLM2ForRewardModel",
"TransformersForMultimodalLM",
"PrithviGeoSpatialMAE",
"UltravoxModel",
"DeepSeekMTPModel",
"XLMRobertaModel",
]
# This list is the complement of the minimal list above. The intention is that
# this list of models is only tested in a "special case" i.e. most PRs should
# not test these models
OTHER_MODEL_ARCH_LIST = (set(HF_EXAMPLE_MODELS.get_supported_archs()) -
set(MINIMAL_MODEL_ARCH_LIST))
OTHER_MODEL_ARCH_LIST = set(HF_EXAMPLE_MODELS.get_supported_archs()) - set(
MINIMAL_MODEL_ARCH_LIST
)
@create_new_process_for_each_test()
def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
EXAMPLE_MODELS: HfExampleModels):
def can_initialize(
model_arch: str, monkeypatch: pytest.MonkeyPatch, EXAMPLE_MODELS: HfExampleModels
):
"""The reason for using create_new_process_for_each_test is to avoid
the WARNING:
"We must use the 'spawn' multiprocessing start method. Overriding
@ -53,12 +68,12 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
hf_overrides_fn = partial(dummy_hf_overrides,
model_arch=model_arch,
exist_overrides=model_info.hf_overrides,
use_original_num_layers=getattr(
model_info, 'use_original_num_layers',
False))
hf_overrides_fn = partial(
dummy_hf_overrides,
model_arch=model_arch,
exist_overrides=model_info.hf_overrides,
use_original_num_layers=getattr(model_info, "use_original_num_layers", False),
)
# Avoid calling model.forward()
def _initialize_kv_caches_v1(self, vllm_config):
@ -68,14 +83,15 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
kv_cache_specs,
[10 * GiB_bytes],
)
scheduler_kv_cache_config = generate_scheduler_kv_cache_config(
kv_cache_configs)
scheduler_kv_cache_config = generate_scheduler_kv_cache_config(kv_cache_configs)
# gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
return 1, 0, scheduler_kv_cache_config
with (patch.object(V1EngineCore, "_initialize_kv_caches",
_initialize_kv_caches_v1), monkeypatch.context() as m):
with (
patch.object(V1EngineCore, "_initialize_kv_caches", _initialize_kv_caches_v1),
monkeypatch.context() as m,
):
if model_info.v0_only:
# NOTE(woosuk): skip the test for V0-only models
return
@ -97,21 +113,24 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
speculative_config={
"model": model_info.speculative_model,
"num_speculative_tokens": 1,
} if model_info.speculative_model else None,
}
if model_info.speculative_model
else None,
trust_remote_code=model_info.trust_remote_code,
max_model_len=model_info.max_model_len,
# these tests seem to produce leftover memory
gpu_memory_utilization=0.80,
load_format="dummy",
model_impl="transformers"
if model_arch in _TRANSFORMERS_BACKEND_MODELS else "vllm",
if model_arch in _TRANSFORMERS_BACKEND_MODELS
else "vllm",
hf_overrides=hf_overrides_fn,
max_num_seqs=model_info.max_num_seqs)
max_num_seqs=model_info.max_num_seqs,
)
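# A minimal sketch of the stubbing pattern used above: patch.object swaps in a
# cheap _initialize_kv_caches for the duration of the context so no real KV
# cache memory is allocated (the class and return values here are illustrative).
from unittest.mock import patch

class FakeEngine:
    def _initialize_kv_caches(self):
        raise RuntimeError("would allocate GPU memory")

def _fake_init(self):
    return 1, 0, None  # gpu_blocks, cpu_blocks, scheduler kv-cache config stand-in

with patch.object(FakeEngine, "_initialize_kv_caches", _fake_init):
    assert FakeEngine()._initialize_kv_caches() == (1, 0, None)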
@pytest.mark.parametrize("model_arch", MINIMAL_MODEL_ARCH_LIST)
def test_can_initialize_small_subset(model_arch: str,
monkeypatch: pytest.MonkeyPatch):
def test_can_initialize_small_subset(model_arch: str, monkeypatch: pytest.MonkeyPatch):
"""Test initializing small subset of supported models"""
if model_arch == "Lfm2ForCausalLM":
pytest.skip("Skipping until test supports V1-only models")
@ -119,10 +138,9 @@ def test_can_initialize_small_subset(model_arch: str,
@pytest.mark.parametrize("model_arch", OTHER_MODEL_ARCH_LIST)
def test_can_initialize_large_subset(model_arch: str,
monkeypatch: pytest.MonkeyPatch):
def test_can_initialize_large_subset(model_arch: str, monkeypatch: pytest.MonkeyPatch):
"""Test initializing large subset of supported models
This test covers the complement of the tests covered in the "small subset"
test.
"""
@ -131,8 +149,6 @@ def test_can_initialize_large_subset(model_arch: str,
can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)
@pytest.mark.parametrize("model_arch",
AUTO_EXAMPLE_MODELS.get_supported_archs())
def test_implicit_converted_models(model_arch: str,
monkeypatch: pytest.MonkeyPatch):
@pytest.mark.parametrize("model_arch", AUTO_EXAMPLE_MODELS.get_supported_archs())
def test_implicit_converted_models(model_arch: str, monkeypatch: pytest.MonkeyPatch):
can_initialize(model_arch, monkeypatch, AUTO_EXAMPLE_MODELS)

View File

@ -50,9 +50,9 @@ def test_oot_registration_embedding(
with monkeypatch.context() as m:
m.setenv("VLLM_PLUGINS", "register_dummy_model")
prompts = ["Hello, my name is", "The text does not matter"]
llm = LLM(model=dummy_gemma2_embedding_path,
load_format="dummy",
max_model_len=2048)
llm = LLM(
model=dummy_gemma2_embedding_path, load_format="dummy", max_model_len=2048
)
outputs = llm.embed(prompts)
for output in outputs:
@ -69,27 +69,28 @@ def test_oot_registration_multimodal(
):
with monkeypatch.context() as m:
m.setenv("VLLM_PLUGINS", "register_dummy_model")
prompts = [{
"prompt": "What's in the image?<image>",
"multi_modal_data": {
"image": image
prompts = [
{
"prompt": "What's in the image?<image>",
"multi_modal_data": {"image": image},
},
}, {
"prompt": "Describe the image<image>",
"multi_modal_data": {
"image": image
{
"prompt": "Describe the image<image>",
"multi_modal_data": {"image": image},
},
}]
]
sampling_params = SamplingParams(temperature=0)
llm = LLM(model=dummy_llava_path,
load_format="dummy",
max_num_seqs=1,
trust_remote_code=True,
gpu_memory_utilization=0.98,
max_model_len=4096,
enforce_eager=True,
limit_mm_per_prompt={"image": 1})
llm = LLM(
model=dummy_llava_path,
load_format="dummy",
max_num_seqs=1,
trust_remote_code=True,
gpu_memory_utilization=0.98,
max_model_len=4096,
enforce_eager=True,
limit_mm_per_prompt={"image": 1},
)
first_token = llm.get_tokenizer().decode(0)
outputs = llm.generate(prompts, sampling_params)

View File

@ -6,16 +6,22 @@ import warnings
import pytest
import torch.cuda
from vllm.model_executor.models import (is_pooling_model,
is_text_generation_model,
supports_multimodal)
from vllm.model_executor.models.adapters import (as_embedding_model,
as_reward_model,
as_seq_cls_model)
from vllm.model_executor.models.registry import (_MULTIMODAL_MODELS,
_SPECULATIVE_DECODING_MODELS,
_TEXT_GENERATION_MODELS,
ModelRegistry)
from vllm.model_executor.models import (
is_pooling_model,
is_text_generation_model,
supports_multimodal,
)
from vllm.model_executor.models.adapters import (
as_embedding_model,
as_reward_model,
as_seq_cls_model,
)
from vllm.model_executor.models.registry import (
_MULTIMODAL_MODELS,
_SPECULATIVE_DECODING_MODELS,
_TEXT_GENERATION_MODELS,
ModelRegistry,
)
from vllm.platforms import current_platform
from ..utils import create_new_process_for_each_test
@ -34,8 +40,7 @@ def test_registry_imports(model_arch):
if model_arch in _SPECULATIVE_DECODING_MODELS:
return # Ignore these models which do not have a unified format
if (model_arch in _TEXT_GENERATION_MODELS
or model_arch in _MULTIMODAL_MODELS):
if model_arch in _TEXT_GENERATION_MODELS or model_arch in _MULTIMODAL_MODELS:
assert is_text_generation_model(model_cls)
# All vLLM models should be convertible to a pooling model
@ -48,13 +53,16 @@ def test_registry_imports(model_arch):
@create_new_process_for_each_test()
@pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [
("LlamaForCausalLM", False, False, False),
("LlavaForConditionalGeneration", True, True, False),
("BertForSequenceClassification", False, False, True),
("RobertaForSequenceClassification", False, False, True),
("XLMRobertaForSequenceClassification", False, False, True),
])
@pytest.mark.parametrize(
"model_arch,is_mm,init_cuda,is_ce",
[
("LlamaForCausalLM", False, False, False),
("LlavaForConditionalGeneration", True, True, False),
("BertForSequenceClassification", False, False, True),
("RobertaForSequenceClassification", False, False, True),
("XLMRobertaForSequenceClassification", False, False, True),
],
)
def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
model_info = ModelRegistry._try_inspect_model_cls(model_arch)
assert model_info is not None
@ -70,7 +78,8 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
warnings.warn(
"This model no longer initializes CUDA on import. "
"Please test using a different one.",
stacklevel=2)
stacklevel=2,
)
@create_new_process_for_each_test()
@ -82,7 +91,8 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
# ("MLPSpeculatorPreTrainedModel", False, False),
("DeepseekV2ForCausalLM", True, False),
("Qwen2VLForConditionalGeneration", True, True),
])
],
)
def test_registry_is_pp(model_arch, is_pp, init_cuda):
model_info = ModelRegistry._try_inspect_model_cls(model_arch)
assert model_info is not None
@ -97,13 +107,16 @@ def test_registry_is_pp(model_arch, is_pp, init_cuda):
warnings.warn(
"This model no longer initializes CUDA on import. "
"Please test using a different one.",
stacklevel=2)
stacklevel=2,
)
def test_hf_registry_coverage():
untested_archs = (ModelRegistry.get_supported_archs() -
HF_EXAMPLE_MODELS.get_supported_archs())
untested_archs = (
ModelRegistry.get_supported_archs() - HF_EXAMPLE_MODELS.get_supported_archs()
)
assert not untested_archs, (
"Please add the following architectures to "
f"`tests/models/registry.py`: {untested_archs}")
f"`tests/models/registry.py`: {untested_archs}"
)

View File

@ -11,32 +11,33 @@ from tests.conftest import VllmRunner
"model",
[
"ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
"mgazz/Prithvi_v2_eo_300_tl_unet_agb"
"mgazz/Prithvi_v2_eo_300_tl_unet_agb",
],
)
def test_inference(
vllm_runner: type[VllmRunner],
model: str,
) -> None:
pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)
prompt = dict(prompt_token_ids=[1],
multi_modal_data=dict(pixel_values=pixel_values,
location_coords=location_coords))
prompt = dict(
prompt_token_ids=[1],
multi_modal_data=dict(
pixel_values=pixel_values, location_coords=location_coords
),
)
with vllm_runner(
model,
runner="pooling",
dtype="half",
enforce_eager=True,
skip_tokenizer_init=True,
# Limit the maximum number of sequences to avoid the
# test going OOM during the warmup run
max_num_seqs=32,
default_torch_num_threads=1,
model,
runner="pooling",
dtype="half",
enforce_eager=True,
skip_tokenizer_init=True,
# Limit the maximum number of sequences to avoid the
# test going OOM during the warmup run
max_num_seqs=32,
default_torch_num_threads=1,
) as vllm_model:
vllm_output = vllm_model.llm.encode(prompt)
assert torch.equal(
torch.isnan(vllm_output[0].outputs.data).any(),
torch.tensor(False))
torch.isnan(vllm_output[0].outputs.data).any(), torch.tensor(False)
)

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test the functionality of the Transformers backend."""
from typing import Any, Optional, Union
import pytest
@ -60,14 +61,16 @@ def check_implementation(
@pytest.mark.skipif(
current_platform.is_rocm(),
reason="Llama-3.2-1B-Instruct, Ilama-3.2-1B produce memory access fault.")
reason="Llama-3.2-1B-Instruct, Ilama-3.2-1B produce memory access fault.",
)
@pytest.mark.parametrize(
"model,model_impl",
[
("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
("hmellor/Ilama-3.2-1B", "auto"), # CUSTOM CODE
("allenai/OLMoE-1B-7B-0924", "transformers"), # MoE
]) # trust_remote_code=True by default
],
) # trust_remote_code=True by default
def test_models(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
@ -77,29 +80,32 @@ def test_models(
) -> None:
import transformers
from packaging.version import Version
installed = Version(transformers.__version__)
required = Version("4.57.0.dev0")
if model == "allenai/OLMoE-1B-7B-0924" and installed < required:
pytest.skip("MoE models with the Transformers backend require "
f"transformers>={required}, but got {installed}")
pytest.skip(
"MoE models with the Transformers backend require "
f"transformers>={required}, but got {installed}"
)
check_implementation(hf_runner,
vllm_runner,
example_prompts,
model,
model_impl=model_impl)
check_implementation(
hf_runner, vllm_runner, example_prompts, model, model_impl=model_impl
)
def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None:
prompts, _, _ = prep_prompts(4, (800, 801))
kwargs_ref = {"max_model_len": 8192, "enforce_eager": True}
kwargs_test = {"model_impl": "transformers", **kwargs_ref}
check_implementation(vllm_runner,
vllm_runner,
prompts,
model="hmellor/tiny-random-Gemma2ForCausalLM",
kwargs_ref=kwargs_ref,
kwargs_test=kwargs_test)
check_implementation(
vllm_runner,
vllm_runner,
prompts,
model="hmellor/tiny-random-Gemma2ForCausalLM",
kwargs_ref=kwargs_ref,
kwargs_test=kwargs_test,
)
@multi_gpu_test(num_gpus=2)
@ -109,23 +115,28 @@ def test_distributed(
example_prompts,
):
kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
check_implementation(hf_runner,
vllm_runner,
example_prompts,
"meta-llama/Llama-3.2-1B-Instruct",
kwargs_test=kwargs)
@pytest.mark.parametrize("model, quantization_kwargs", [
("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {}),
("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {}),
(
check_implementation(
hf_runner,
vllm_runner,
example_prompts,
"meta-llama/Llama-3.2-1B-Instruct",
{
"quantization": "bitsandbytes",
},
),
])
kwargs_test=kwargs,
)
@pytest.mark.parametrize(
"model, quantization_kwargs",
[
("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {}),
("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {}),
(
"meta-llama/Llama-3.2-1B-Instruct",
{
"quantization": "bitsandbytes",
},
),
],
)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_quantization(
@ -136,27 +147,34 @@ def test_quantization(
max_tokens: int,
num_logprobs: int,
) -> None:
if (current_platform.is_rocm()
and quantization_kwargs.get("quantization", "") == "bitsandbytes"):
pytest.skip(
"bitsandbytes quantization is currently not supported in rocm.")
if (
current_platform.is_rocm()
and quantization_kwargs.get("quantization", "") == "bitsandbytes"
):
pytest.skip("bitsandbytes quantization is currently not supported in rocm.")
with vllm_runner(
model, model_impl="auto", enforce_eager=True,
**quantization_kwargs) as vllm_model: # type: ignore[arg-type]
model,
model_impl="auto",
enforce_eager=True,
**quantization_kwargs, # type: ignore[arg-type]
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs
)
with vllm_runner(
model,
model_impl="transformers",
enforce_eager=True,
**quantization_kwargs) as vllm_model: # type: ignore[arg-type]
model,
model_impl="transformers",
enforce_eager=True,
**quantization_kwargs, # type: ignore[arg-type]
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.using_transformers_backend()
transformers_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs
)
check_logprobs_close(
outputs_0_lst=transformers_outputs,
@ -172,22 +190,24 @@ def test_quantization(
# Layers live in `layers`
"Qwen/Qwen3-Embedding-0.6B",
# Layers live in `model.layers`
"meta-llama/Llama-3.2-1B-Instruct"
"meta-llama/Llama-3.2-1B-Instruct",
],
)
def test_embed_loading(vllm_runner, model):
with vllm_runner(model,
max_model_len=1024,
enforce_eager=True,
runner="pooling",
model_impl="transformers") as model_test:
with vllm_runner(
model,
max_model_len=1024,
enforce_eager=True,
runner="pooling",
model_impl="transformers",
) as model_test:
model_config = model_test.llm.llm_engine.model_config
assert model_config.using_transformers_backend()
@pytest.mark.parametrize(
"arch",
["TransformersEmbeddingModel", "TransformersForSequenceClassification"])
"arch", ["TransformersEmbeddingModel", "TransformersForSequenceClassification"]
)
def test_pooling(hf_runner, vllm_runner, example_prompts, arch):
model = get_model(arch)
@ -202,6 +222,7 @@ def test_pooling(hf_runner, vllm_runner, example_prompts, arch):
hf_kwargs["is_sentence_transformer"] = True
elif arch == "TransformersForSequenceClassification":
from transformers import AutoModelForSequenceClassification
hf_kwargs["auto_cls"] = AutoModelForSequenceClassification
# The example_prompts end with "\n", for example:
@ -212,8 +233,10 @@ def test_pooling(hf_runner, vllm_runner, example_prompts, arch):
# So we need to strip the input texts to avoid the test failing.
example_prompts = [str(s).strip() for s in example_prompts]
with (vllm_runner(model, **vllm_kwargs) as
vllm_model, hf_runner(model, **hf_kwargs) as hf_model):
with (
vllm_runner(model, **vllm_kwargs) as vllm_model,
hf_runner(model, **hf_kwargs) as hf_model,
):
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.using_transformers_backend()

View File

@ -10,7 +10,6 @@ pytestmark = pytest.mark.cpu_test
class ModuleWithBatchNorm(torch.nn.Module):
def __init__(self):
super().__init__()
self.bn = torch.nn.BatchNorm1d(2)
@ -20,7 +19,6 @@ class ModuleWithBatchNorm(torch.nn.Module):
class ModuleWithNestedBatchNorm(torch.nn.Module):
def __init__(self):
super().__init__()
self.nested_mod = ModuleWithBatchNorm()
@ -67,9 +65,11 @@ def test_module_with_child_containing_batchnorm_can_autoload():
new_mod = ModuleWithNestedBatchNorm()
assert not torch.all(
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean
)
assert not torch.all(
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var
)
assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0
loader = AutoWeightsLoader(new_mod)
@ -77,9 +77,9 @@ def test_module_with_child_containing_batchnorm_can_autoload():
# Ensure the stats are updated
assert torch.all(
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
assert torch.all(
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean
)
assert torch.all(new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
@ -101,9 +101,11 @@ def test_module_skip_prefix():
new_mod = ModuleWithNestedBatchNorm()
assert not torch.all(
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean
)
assert not torch.all(
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var
)
assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0
loader = AutoWeightsLoader(new_mod, skip_prefixes=["prefix."])
@ -111,9 +113,9 @@ def test_module_skip_prefix():
# Ensure the stats are updated
assert torch.all(
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
assert torch.all(
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean
)
assert torch.all(new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
@ -137,9 +139,11 @@ def test_module_skip_substr():
new_mod = ModuleWithNestedBatchNorm()
assert not torch.all(
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean
)
assert not torch.all(
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var
)
assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0
loader = AutoWeightsLoader(new_mod, skip_substrs=["substr."])
@ -147,7 +151,7 @@ def test_module_skip_substr():
# Ensure the stats are updated
assert torch.all(
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
assert torch.all(
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean
)
assert torch.all(new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
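# A small sketch of the BatchNorm bookkeeping these tests rely on: a module in
# training mode updates its running statistics and batch counter on every
# forward pass, which is exactly what AutoWeightsLoader must restore on load.
import torch

bn = torch.nn.BatchNorm1d(2)  # starts with zeroed stats, counter at 0
assert bn.num_batches_tracked.item() == 0
bn(torch.randn(4, 2))  # one training-mode forward
assert bn.num_batches_tracked.item() == 1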

View File

@ -8,11 +8,16 @@ import torch.multiprocessing as mp
from tests.utils import multi_gpu_test
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.distributed.parallel_state import (init_distributed_environment,
initialize_model_parallel)
from vllm.distributed.parallel_state import (
init_distributed_environment,
initialize_model_parallel,
)
from vllm.model_executor.models.vision import (
get_load_balance_assignment, resolve_visual_encoder_outputs,
run_dp_sharded_mrope_vision_model, run_dp_sharded_vision_model)
get_load_balance_assignment,
resolve_visual_encoder_outputs,
run_dp_sharded_mrope_vision_model,
run_dp_sharded_vision_model,
)
from vllm.platforms import current_platform
from vllm.utils import get_open_port, update_environment_variables
@ -20,8 +25,7 @@ pytestmark = pytest.mark.cpu_test
@pytest.mark.parametrize(
("select_layers", "num_layers_loaded", "max_possible_layers",
"expected_features"),
("select_layers", "num_layers_loaded", "max_possible_layers", "expected_features"),
[
# All layers loaded
([1, 10], 10, 10, [1, 10]),
@ -29,16 +33,15 @@ pytestmark = pytest.mark.cpu_test
# Some layers not loaded
([1, 10], 10, 20, [1, 10]),
([-20, -11], 10, 20, [1, 10]),
])
def test_resolve_visual_encoder_outputs(select_layers, num_layers_loaded,
max_possible_layers,
expected_features):
],
)
def test_resolve_visual_encoder_outputs(
select_layers, num_layers_loaded, max_possible_layers, expected_features
):
"""
Test that offsets are correctly handled for vision feature layers.
"""
encoder_outputs = [
torch.tensor([idx]) for idx in range(num_layers_loaded + 1)
]
encoder_outputs = [torch.tensor([idx]) for idx in range(num_layers_loaded + 1)]
output_tensor = resolve_visual_encoder_outputs(
encoder_outputs=encoder_outputs,
post_layer_norm=None,
@ -85,10 +88,11 @@ def test_run_dp_sharded_vision_model(batch_size: int):
)
def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
batch_size: int, master_port: int):
def run_dp_sharded_vision_model_vs_direct(
local_rank: int, world_size: int, batch_size: int, master_port: int
):
"""
Test that run_dp_sharded_vision_model produces the same results as
Test that run_dp_sharded_vision_model produces the same results as
calling the model directly.
"""
@ -99,13 +103,15 @@ def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
current_platform.set_device(device)
torch.set_default_device(device)
update_environment_variables({
'RANK': str(local_rank),
'LOCAL_RANK': str(local_rank),
'WORLD_SIZE': str(world_size),
'MASTER_ADDR': 'localhost',
'MASTER_PORT': str(master_port),
})
update_environment_variables(
{
"RANK": str(local_rank),
"LOCAL_RANK": str(local_rank),
"WORLD_SIZE": str(world_size),
"MASTER_ADDR": "localhost",
"MASTER_PORT": str(master_port),
}
)
# initialize distributed
init_distributed_environment()
@ -141,28 +147,45 @@ def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
[
# Empty input
([], 2, [], [0, 0], [0, 0], "empty input"),
# Fewer samples than GPUs
([100, 200], 4, [1, 0], [1, 1, 0, 0], [200, 100, 0, 0
], "fewer samples than GPUs"),
(
[100, 200],
4,
[1, 0],
[1, 1, 0, 0],
[200, 100, 0, 0],
"fewer samples than GPUs",
),
# Single GPU
([100, 200, 300], 1, [2, 1, 0], [3], [600], "single GPU"),
# Balanced assignment
([100, 100, 100, 100
], 2, [0, 2, 1, 3], [2, 2], [200, 200], "balanced assignment"),
(
[100, 100, 100, 100],
2,
[0, 2, 1, 3],
[2, 2],
[200, 200],
"balanced assignment",
),
# Unbalanced sizes - this one is trickier since the algorithm is greedy
([1000, 100, 200, 50], 2, [0, 2, 1, 3
], [1, 3], [1000, 350], "unbalanced sizes"),
(
[1000, 100, 200, 50],
2,
[0, 2, 1, 3],
[1, 3],
[1000, 350],
"unbalanced sizes",
),
],
)
def test_get_load_balance_assignment_cases(sizes, num_gpus,
expected_shuffle_indices,
expected_gpu_sample_counts,
expected_grouped_sizes_per_gpu,
test_description):
def test_get_load_balance_assignment_cases(
sizes,
num_gpus,
expected_shuffle_indices,
expected_gpu_sample_counts,
expected_grouped_sizes_per_gpu,
test_description,
):
"""Test get_load_balance_assignment with various input cases."""
result = get_load_balance_assignment(sizes, num_gpus=num_gpus)
(shuffle_indices, gpu_sample_counts, grouped_sizes_per_gpu) = result
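# A minimal sketch of the greedy strategy the "unbalanced sizes" case above
# exercises (assumed behavior: visit samples largest-first, always placing the
# next one on the currently least-loaded GPU; this is not the vLLM source).
import heapq

def greedy_assign(sizes: list[int], num_gpus: int) -> list[int]:
    heap = [(0, gpu) for gpu in range(num_gpus)]  # (load, gpu_id)
    heapq.heapify(heap)
    assignment = [0] * len(sizes)
    for idx in sorted(range(len(sizes)), key=lambda i: -sizes[i]):
        load, gpu = heapq.heappop(heap)
        assignment[idx] = gpu
        heapq.heappush(heap, (load + sizes[idx], gpu))
    return assignment

# Reproduces the expected split above: GPU 0 takes [1000], GPU 1 takes 350 total.
assert greedy_assign([1000, 100, 200, 50], 2) == [0, 1, 1, 1]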
@ -188,8 +211,7 @@ class SimpleMRopeVisionModel(torch.nn.Module):
self.out_hidden_size = out_hidden_size
self.linear = torch.nn.Linear(768, out_hidden_size)
def forward(self, pixel_values: torch.Tensor,
grid_thw_list: list[list[int]]):
def forward(self, pixel_values: torch.Tensor, grid_thw_list: list[list[int]]):
"""Simple forward pass that simulates spatial merging."""
# Apply linear transformation
embeddings = self.linear(pixel_values)
@ -212,8 +234,9 @@ class SimpleMRopeVisionModel(torch.nn.Module):
merged_patches = num_patches // merge_factor
if merged_patches > 0:
# Reshape and average to simulate merging
reshaped = image_patches[:merged_patches * merge_factor].view(
merged_patches, merge_factor, -1)
reshaped = image_patches[: merged_patches * merge_factor].view(
merged_patches, merge_factor, -1
)
merged = reshaped.mean(dim=1)
merged_embeddings.append(merged)
@ -222,9 +245,11 @@ class SimpleMRopeVisionModel(torch.nn.Module):
if merged_embeddings:
return torch.cat(merged_embeddings, dim=0)
else:
return torch.empty((0, self.out_hidden_size),
device=pixel_values.device,
dtype=pixel_values.dtype)
return torch.empty(
(0, self.out_hidden_size),
device=pixel_values.device,
dtype=pixel_values.dtype,
)
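# A tiny numeric sketch of the reshape-and-average merge above: 8 patches with
# a merge factor of 4 collapse to 2 merged embeddings (values are illustrative).
import torch

patches = torch.arange(8, dtype=torch.float32).view(8, 1)
merged = patches.view(2, 4, -1).mean(dim=1)  # mean over each group of 4 patches
assert torch.equal(merged.squeeze(1), torch.tensor([1.5, 5.5]))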
@multi_gpu_test(num_gpus=2)
@ -250,12 +275,11 @@ def test_run_dp_sharded_mrope_vision_model(batch_size: int):
)
def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
world_size: int,
batch_size: int,
master_port: int):
def run_dp_sharded_mrope_vision_model_vs_direct(
local_rank: int, world_size: int, batch_size: int, master_port: int
):
"""
Test that run_dp_sharded_mrope_vision_model produces the same results as
Test that run_dp_sharded_mrope_vision_model produces the same results as
calling the model directly.
"""
# Set random seed for reproducibility
@ -264,13 +288,15 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
current_platform.set_device(device)
torch.set_default_device(device)
update_environment_variables({
'RANK': str(local_rank),
'LOCAL_RANK': str(local_rank),
'WORLD_SIZE': str(world_size),
'MASTER_ADDR': 'localhost',
'MASTER_PORT': str(master_port),
})
update_environment_variables(
{
"RANK": str(local_rank),
"LOCAL_RANK": str(local_rank),
"WORLD_SIZE": str(world_size),
"MASTER_ADDR": "localhost",
"MASTER_PORT": str(master_port),
}
)
# initialize distributed
init_distributed_environment()
@ -303,10 +329,9 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
# Run the model through the sharded function
with torch.inference_mode():
sharded_output = run_dp_sharded_mrope_vision_model(vision_model,
pixel_values,
grid_thw_list,
rope_type="rope_3d")
sharded_output = run_dp_sharded_mrope_vision_model(
vision_model, pixel_values, grid_thw_list, rope_type="rope_3d"
)
sharded_output = torch.cat(sharded_output, dim=0)
# Check that the world size is set up correctly
@ -317,10 +342,7 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
# Check that the outputs have the same shape
assert direct_output.shape == sharded_output.shape
# Check that the outputs are close (they should be identical)
assert torch.allclose(direct_output,
sharded_output,
rtol=1e-5,
atol=1e-5)
assert torch.allclose(direct_output, sharded_output, rtol=1e-5, atol=1e-5)
@multi_gpu_test(num_gpus=2)
@ -334,20 +356,23 @@ def test_run_dp_sharded_mrope_vision_model_empty_input():
def run_dp_sharded_mrope_vision_model_empty_input_worker(
local_rank: int, world_size: int, master_port: int):
local_rank: int, world_size: int, master_port: int
):
"""Test run_dp_sharded_mrope_vision_model with empty input."""
# Set up distributed environment
device = f"{current_platform.device_name}:{local_rank}"
current_platform.set_device(device)
torch.set_default_device(device)
update_environment_variables({
'RANK': str(local_rank),
'LOCAL_RANK': str(local_rank),
'WORLD_SIZE': str(world_size),
'MASTER_ADDR': 'localhost',
'MASTER_PORT': str(master_port),
})
update_environment_variables(
{
"RANK": str(local_rank),
"LOCAL_RANK": str(local_rank),
"WORLD_SIZE": str(world_size),
"MASTER_ADDR": "localhost",
"MASTER_PORT": str(master_port),
}
)
init_distributed_environment()
initialize_model_parallel(tensor_model_parallel_size=world_size)
@ -360,10 +385,9 @@ def run_dp_sharded_mrope_vision_model_empty_input_worker(
# Should handle empty input gracefully
with torch.inference_mode():
output = run_dp_sharded_mrope_vision_model(vision_model,
pixel_values,
grid_thw_list,
rope_type="rope_3d")
output = run_dp_sharded_mrope_vision_model(
vision_model, pixel_values, grid_thw_list, rope_type="rope_3d"
)
assert len(output) == 0
@ -379,7 +403,8 @@ def test_run_dp_sharded_mrope_vision_model_uneven_load():
def run_dp_sharded_mrope_vision_model_uneven_load_worker(
local_rank: int, world_size: int, master_port: int):
local_rank: int, world_size: int, master_port: int
):
"""Test run_dp_sharded_mrope_vision_model with uneven load distribution."""
# Set up distributed environment
current_platform.seed_everything(123)
@ -387,13 +412,15 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(
current_platform.set_device(device)
torch.set_default_device(device)
update_environment_variables({
'RANK': str(local_rank),
'LOCAL_RANK': str(local_rank),
'WORLD_SIZE': str(world_size),
'MASTER_ADDR': 'localhost',
'MASTER_PORT': str(master_port),
})
update_environment_variables(
{
"RANK": str(local_rank),
"LOCAL_RANK": str(local_rank),
"WORLD_SIZE": str(world_size),
"MASTER_ADDR": "localhost",
"MASTER_PORT": str(master_port),
}
)
init_distributed_environment()
initialize_model_parallel(tensor_model_parallel_size=world_size)
@ -401,7 +428,7 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(
# Create images with very different sizes
grid_thw_list = [
[1, 2, 2], # Small: 4 patches
[1, 8, 8], # Large: 64 patches
[1, 8, 8], # Large: 64 patches
[1, 3, 3], # Medium: 9 patches
]
@ -416,15 +443,15 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(
# Should handle uneven distribution without errors
with torch.inference_mode():
output_tuple = run_dp_sharded_mrope_vision_model(vision_model,
pixel_values,
grid_thw_list,
rope_type="rope_3d")
output_tuple = run_dp_sharded_mrope_vision_model(
vision_model, pixel_values, grid_thw_list, rope_type="rope_3d"
)
# Verify output shape is reasonable
merge_factor = vision_model.spatial_merge_size**2
expected_output_patches = list(
math.prod(grid_thw) // merge_factor for grid_thw in grid_thw_list)
math.prod(grid_thw) // merge_factor for grid_thw in grid_thw_list
)
for i, output in enumerate(output_tuple):
assert output.shape[0] == expected_output_patches[i]
@ -445,8 +472,9 @@ def test_simple_mrope_vision_model_spatial_merge(spatial_merge_size: int):
pixel_values_list.append(image_pixels)
pixel_values = torch.cat(pixel_values_list, dim=0)
vision_model = SimpleMRopeVisionModel(
spatial_merge_size=spatial_merge_size).to(device)
vision_model = SimpleMRopeVisionModel(spatial_merge_size=spatial_merge_size).to(
device
)
with torch.inference_mode():
output = vision_model(pixel_values, grid_thw_list)

View File

@ -33,16 +33,18 @@ def check_outputs_equal(
"""
assert len(outputs_0_lst) == len(outputs_1_lst)
for prompt_idx, (outputs_0,
outputs_1) in enumerate(zip(outputs_0_lst,
outputs_1_lst)):
for prompt_idx, (outputs_0, outputs_1) in enumerate(
zip(outputs_0_lst, outputs_1_lst)
):
output_ids_0, output_str_0 = outputs_0
output_ids_1, output_str_1 = outputs_1
# The text and token outputs should exactly match
fail_msg = (f"Test{prompt_idx}:"
f"\n{name_0}:\t{output_str_0!r}"
f"\n{name_1}:\t{output_str_1!r}")
fail_msg = (
f"Test{prompt_idx}:"
f"\n{name_0}:\t{output_str_0!r}"
f"\n{name_1}:\t{output_str_1!r}"
)
assert output_str_0 == output_str_1, fail_msg
assert output_ids_0 == output_ids_1, fail_msg
@ -54,9 +56,9 @@ def check_outputs_equal(
# * List of top sample logprobs for each sampled token
#
# Assumes prompt logprobs were not requested.
TokensTextLogprobs = tuple[list[int], str, Optional[Union[list[dict[int,
float]],
SampleLogprobs]]]
TokensTextLogprobs = tuple[
list[int], str, Optional[Union[list[dict[int, float]], SampleLogprobs]]
]
# Allow for tokens to be represented as str's rather than IDs;
# tuple of
@ -65,9 +67,9 @@ TokensTextLogprobs = tuple[list[int], str, Optional[Union[list[dict[int,
# * Optional list of top sample logprobs for each sampled token
#
# Assumes prompt logprobs were not requested.
TextTextLogprobs = tuple[list[str], str, Optional[Union[list[dict[str, float]],
list[dict[str,
Logprob]]]]]
TextTextLogprobs = tuple[
list[str], str, Optional[Union[list[dict[str, float]], list[dict[str, Logprob]]]]
]
# Representation of generated sequence as a tuple of
# * Token ID list
@ -77,18 +79,21 @@ TextTextLogprobs = tuple[list[str], str, Optional[Union[list[dict[str, float]],
#
# Allows prompt logprobs to be requested.
TokensTextLogprobsPromptLogprobs = tuple[
list[int], str, Optional[Union[list[dict[int, float]], SampleLogprobs]],
Optional[Union[list[Optional[dict[int, float]]], PromptLogprobs]]]
list[int],
str,
Optional[Union[list[dict[int, float]], SampleLogprobs]],
Optional[Union[list[Optional[dict[int, float]]], PromptLogprobs]],
]
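# A concrete value matching the TokensTextLogprobs shape above (numbers made up):
example = (
    [101, 7592],                  # sampled token IDs
    "hello",                      # decoded text
    [{101: -0.1}, {7592: -0.5}],  # top logprobs per sampled token (or None)
)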
def check_logprobs_close(
*,
outputs_0_lst: Sequence[Union[TokensTextLogprobs,
TokensTextLogprobsPromptLogprobs,
TextTextLogprobs]],
outputs_1_lst: Sequence[Union[TokensTextLogprobs,
TokensTextLogprobsPromptLogprobs,
TextTextLogprobs]],
outputs_0_lst: Sequence[
Union[TokensTextLogprobs, TokensTextLogprobsPromptLogprobs, TextTextLogprobs]
],
outputs_1_lst: Sequence[
Union[TokensTextLogprobs, TokensTextLogprobsPromptLogprobs, TextTextLogprobs]
],
name_0: str,
name_1: str,
num_outputs_0_skip_tokens: int = 0,
@ -128,9 +133,9 @@ def check_logprobs_close(
assert len(outputs_0_lst) == len(outputs_1_lst)
# Loop through responses to each prompt.
for prompt_idx, (outputs_0,
outputs_1) in enumerate(zip(outputs_0_lst,
outputs_1_lst)):
for prompt_idx, (outputs_0, outputs_1) in enumerate(
zip(outputs_0_lst, outputs_1_lst)
):
assert len(outputs_0) == len(outputs_1)
if len(outputs_0) == 3:
assert len(outputs_1) == 3
@ -155,17 +160,18 @@ def check_logprobs_close(
) = outputs_1
# Test prompt logprobs closeness
if (prompt_logprobs_0 is not None
and prompt_logprobs_1 is not None):
if prompt_logprobs_0 is not None and prompt_logprobs_1 is not None:
# Both sequences' prompt logprobs lists are not `None``
# (although individual list elements may be `None`);
# for each token's logprobs:
for idx, (logprobs_elem_0, logprobs_elem_1) in enumerate(
zip(prompt_logprobs_0, prompt_logprobs_1)):
zip(prompt_logprobs_0, prompt_logprobs_1)
):
fail_msg = (
f"Prompt logprobs test:"
f"\n{name_0}:\tPrompt index {idx}\t{logprobs_elem_0}"
f"\n{name_1}:\tPrompt index {idx}\t{logprobs_elem_1}")
f"\n{name_1}:\tPrompt index {idx}\t{logprobs_elem_1}"
)
if logprobs_elem_0 is None:
# If the seq 0 token's logprobs are `None`,
@ -176,20 +182,24 @@ def check_logprobs_close(
# the seq 1 token's logprobs must not be `None`
assert logprobs_elem_1 is not None, fail_msg
# Logprobs check: top-k token choices must be the same
assert (set(logprobs_elem_0.keys()) == set(
logprobs_elem_1.keys())), fail_msg
assert set(logprobs_elem_0.keys()) == set(
logprobs_elem_1.keys()
), fail_msg
else:
# Both sequence logprobs lists must be `None`
fail_msg = (f"Prompt logprobs test:"
f"\n{name_0}:\tlogprobs\t{prompt_logprobs_0}"
f"\n{name_1}:\tlogprobs\t{prompt_logprobs_1}")
fail_msg = (
f"Prompt logprobs test:"
f"\n{name_0}:\tlogprobs\t{prompt_logprobs_0}"
f"\n{name_1}:\tlogprobs\t{prompt_logprobs_1}"
)
assert (prompt_logprobs_0 is None
and prompt_logprobs_1 is None), fail_msg
assert prompt_logprobs_0 is None and prompt_logprobs_1 is None, fail_msg
else:
raise ValueError(f"Outputs tuple must have 3 or 4 elements but "
f"{len(outputs_0)} elements were provided: "
f"{outputs_0}")
raise ValueError(
f"Outputs tuple must have 3 or 4 elements but "
f"{len(outputs_0)} elements were provided: "
f"{outputs_0}"
)
if logprobs_0 is None:
logprobs_0 = [None] * len(output_ids_0)
@ -206,9 +216,9 @@ def check_logprobs_close(
logprobs_0 = logprobs_0[num_outputs_0_skip_tokens:]
# Loop through generated tokens.
for idx, (output_id_0,
output_id_1) in enumerate(zip(output_ids_0, output_ids_1)):
for idx, (output_id_0, output_id_1) in enumerate(
zip(output_ids_0, output_ids_1)
):
is_tok_mismatch = output_id_0 != output_id_1
# If generated tokens don't match
@ -223,7 +233,8 @@ def check_logprobs_close(
f"Test{prompt_idx}:"
f"\nMatched tokens:\t{output_ids_0[:idx]}"
f"\n{name_0}:\t{output_str_0!r}\t{logprobs_elem_0}"
f"\n{name_1}:\t{output_str_1!r}\t{logprobs_elem_1}")
f"\n{name_1}:\t{output_str_1!r}\t{logprobs_elem_1}"
)
assert logprobs_elem_0 is not None, fail_msg
assert logprobs_elem_1 is not None, fail_msg
@ -244,9 +255,11 @@ def check_logprobs_close(
if output_str_0 != output_str_1 and warn_on_mismatch:
# The token outputs exactly match,
# so the text outputs should exactly match as well
fail_msg = (f"Test{prompt_idx}:"
f"\n{name_0}:\t{output_str_0!r}"
f"\n{name_1}:\t{output_str_1!r}")
fail_msg = (
f"Test{prompt_idx}:"
f"\n{name_0}:\t{output_str_0!r}"
f"\n{name_1}:\t{output_str_1!r}"
)
with warnings.catch_warnings():
# This ensures that repeated warnings are shown
@ -317,18 +330,22 @@ def check_embeddings_close(
assert len(embeddings_0_lst) == len(embeddings_1_lst)
for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
zip(embeddings_0_lst, embeddings_1_lst)):
zip(embeddings_0_lst, embeddings_1_lst)
):
assert len(embeddings_0) == len(embeddings_1), (
f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")
f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}"
)
sim = F.cosine_similarity(torch.tensor(embeddings_0),
torch.tensor(embeddings_1),
dim=0)
sim = F.cosine_similarity(
torch.tensor(embeddings_0), torch.tensor(embeddings_1), dim=0
)
fail_msg = (f"Test{prompt_idx}:"
f"\nCosine similarity: \t{sim:.4f}"
f"\n{name_0}:\t{embeddings_0[:16]!r}"
f"\n{name_1}:\t{embeddings_1[:16]!r}")
fail_msg = (
f"Test{prompt_idx}:"
f"\nCosine similarity: \t{sim:.4f}"
f"\n{name_0}:\t{embeddings_0[:16]!r}"
f"\n{name_1}:\t{embeddings_1[:16]!r}"
)
assert sim >= 1 - tol, fail_msg
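# A minimal sketch of the closeness criterion above: two embeddings count as
# matching when their cosine similarity is within `tol` of 1 (vectors made up).
import torch
import torch.nn.functional as F

a = torch.tensor([1.0, 0.0, 0.0])
b = torch.tensor([0.999, 0.01, 0.0])
sim = F.cosine_similarity(a, b, dim=0)
assert sim >= 1 - 1e-2  # nearly parallel vectors pass the check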
@ -413,20 +430,19 @@ def dummy_hf_overrides(
# Ensure at least 2 experts per group
# Since `grouped_topk` assumes top-2
n_group = getattr(text_config, 'n_group', None)
n_group = getattr(text_config, "n_group", None)
num_experts = n_group * 2 if n_group is not None else 2
# we use three layers for Gemma-3n to check
# both normal layer and kv_shared_layer
if use_original_num_layers:
# Use the original number of layers from the config
num_layers = getattr(text_config, 'num_layers', 1)
num_hidden_layers = getattr(text_config, 'num_hidden_layers', 1)
num_layers = getattr(text_config, "num_layers", 1)
num_hidden_layers = getattr(text_config, "num_hidden_layers", 1)
else:
# Use minimal layers for testing
num_layers = 1
num_hidden_layers = (3 if model_arch
== "Gemma3nForConditionalGeneration" else 1)
num_hidden_layers = 3 if model_arch == "Gemma3nForConditionalGeneration" else 1
update_dict = {
"num_layers": num_layers,
@ -440,53 +456,63 @@ def dummy_hf_overrides(
# Only set MoE related config when the model has MoE layers.
# Otherwise all models detected as MoE by _get_transformers_backend_cls.
if ModelConfig.get_num_experts(DummyConfig) > 0:
update_dict.update({
"num_experts": num_experts,
"num_experts_per_tok": 2,
"num_local_experts": num_experts,
# Otherwise there will not be any expert layers
"first_k_dense_replace": 0,
# To avoid OOM on DeepSeek-V3
"n_routed_experts": num_experts,
})
update_dict.update(
{
"num_experts": num_experts,
"num_experts_per_tok": 2,
"num_local_experts": num_experts,
# Otherwise there will not be any expert layers
"first_k_dense_replace": 0,
# To avoid OOM on DeepSeek-V3
"n_routed_experts": num_experts,
}
)
# Update num_hidden_layers for non-Longcat architectures
if model_arch != "LongcatFlashForCausalLM" \
and model_arch != "LongCatFlashMTPModel":
if model_arch != "LongcatFlashForCausalLM" and model_arch != "LongCatFlashMTPModel":
update_dict["num_hidden_layers"] = num_hidden_layers
text_config.update(update_dict)
if hasattr(hf_config, "vision_config"):
hf_config.vision_config.update({
"num_layers": 1,
"num_hidden_layers": 1,
})
hf_config.vision_config.update(
{
"num_layers": 1,
"num_hidden_layers": 1,
}
)
# e.g.: ibm-granite/granite-speech-3.3-2b
if hasattr(hf_config, "encoder_config"):
hf_config.encoder_config.update({
"num_layers": 1,
"num_hidden_layers": 1,
})
hf_config.encoder_config.update(
{
"num_layers": 1,
"num_hidden_layers": 1,
}
)
# e.g.: Qwen/Qwen2-Audio-7B-Instruct
if hasattr(hf_config, "audio_config"):
hf_config.audio_config.update({
"num_layers": 1,
"num_hidden_layers": 1,
"encoder_layers": 1,
})
hf_config.audio_config.update(
{
"num_layers": 1,
"num_hidden_layers": 1,
"encoder_layers": 1,
}
)
return hf_config
def check_transformers_version(model: str,
min_transformers_version: Optional[str] = None,
max_transformers_version: Optional[str] = None):
def check_transformers_version(
model: str,
min_transformers_version: Optional[str] = None,
max_transformers_version: Optional[str] = None,
):
from .registry import _HfExamplesInfo
return _HfExamplesInfo(model,
min_transformers_version=min_transformers_version,
max_transformers_version=max_transformers_version
).check_transformers_version(on_fail="skip")
return _HfExamplesInfo(
model,
min_transformers_version=min_transformers_version,
max_transformers_version=max_transformers_version,
).check_transformers_version(on_fail="skip")