diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py
index f7106f016b..d29a199c5d 100644
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@@ -12,9 +12,6 @@ from functools import reduce
from typing import Optional, Union
import jinja2
-
-# yapf conflicts with isort for this block
-# yapf: disable
from vllm_cutlass_library_extension import (
DataType,
EpilogueScheduleTag,
@@ -31,8 +28,6 @@ from vllm_cutlass_library_extension import (
VLLMKernelScheduleTag,
)
-# yapf: enable
-
#
# Generator templating
#
diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py
index 2b7f0beab2..acbfd8cda4 100644
--- a/examples/others/tensorize_vllm_model.py
+++ b/examples/others/tensorize_vllm_model.py
@@ -21,8 +21,6 @@ from vllm.utils import FlexibleArgumentParser
logger = logging.getLogger()
-# yapf conflicts with isort for this docstring
-# yapf: disable
"""
tensorize_vllm_model.py is a script that can be used to serialize and
deserialize vLLM models. These models can be loaded using tensorizer
@@ -132,7 +130,8 @@ def get_parser():
"can be loaded using tensorizer directly to the GPU "
"extremely quickly. Tensor encryption and decryption is "
"also supported, although libsodium must be installed to "
- "use it.")
+ "use it."
+ )
parser = EngineArgs.add_cli_args(parser)
parser.add_argument(
@@ -144,13 +143,14 @@ def get_parser():
"along with the model by instantiating a TensorizerConfig object, "
"creating a dict from it with TensorizerConfig.to_serializable(), "
"and passing it to LoRARequest's initializer with the kwarg "
- "tensorizer_config_dict."
+ "tensorizer_config_dict.",
)
- subparsers = parser.add_subparsers(dest='command', required=True)
+ subparsers = parser.add_subparsers(dest="command", required=True)
serialize_parser = subparsers.add_parser(
- 'serialize', help="Serialize a model to `--serialized-directory`")
+ "serialize", help="Serialize a model to `--serialized-directory`"
+ )
serialize_parser.add_argument(
"--suffix",
@@ -163,7 +163,9 @@ def get_parser():
"`--suffix` is `v1`, the serialized model tensors will be "
"saved to "
"`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
- "If none is provided, a random UUID will be used."))
+ "If none is provided, a random UUID will be used."
+ ),
+ )
serialize_parser.add_argument(
"--serialized-directory",
type=str,
@@ -175,108 +177,127 @@ def get_parser():
"and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
"be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
"where `suffix` is given by `--suffix` or a random UUID if not "
- "provided.")
+ "provided.",
+ )
serialize_parser.add_argument(
"--serialization-kwargs",
type=tensorizer_kwargs_arg,
required=False,
- help=("A JSON string containing additional keyword arguments to "
- "pass to Tensorizer's TensorSerializer during "
- "serialization."))
+ help=(
+ "A JSON string containing additional keyword arguments to "
+ "pass to Tensorizer's TensorSerializer during "
+ "serialization."
+ ),
+ )
serialize_parser.add_argument(
"--keyfile",
type=str,
required=False,
- help=("Encrypt the model weights with a randomly-generated binary key,"
- " and save the key at this path"))
+ help=(
+ "Encrypt the model weights with a randomly-generated binary key,"
+ " and save the key at this path"
+ ),
+ )
deserialize_parser = subparsers.add_parser(
- 'deserialize',
- help=("Deserialize a model from `--path-to-tensors`"
- " to verify it can be loaded and used."))
+ "deserialize",
+ help=(
+ "Deserialize a model from `--path-to-tensors`"
+ " to verify it can be loaded and used."
+ ),
+ )
deserialize_parser.add_argument(
"--path-to-tensors",
type=str,
required=False,
- help="The local path or S3 URI to the model tensors to deserialize. ")
+ help="The local path or S3 URI to the model tensors to deserialize. ",
+ )
deserialize_parser.add_argument(
"--serialized-directory",
type=str,
required=False,
help="Directory with model artifacts for loading. Assumes a "
- "model.tensors file exists therein. Can supersede "
- "--path-to-tensors.")
+ "model.tensors file exists therein. Can supersede "
+ "--path-to-tensors.",
+ )
deserialize_parser.add_argument(
"--keyfile",
type=str,
required=False,
- help=("Path to a binary key to use to decrypt the model weights,"
- " if the model was serialized with encryption"))
+ help=(
+ "Path to a binary key to use to decrypt the model weights,"
+ " if the model was serialized with encryption"
+ ),
+ )
deserialize_parser.add_argument(
"--deserialization-kwargs",
type=tensorizer_kwargs_arg,
required=False,
- help=("A JSON string containing additional keyword arguments to "
- "pass to Tensorizer's `TensorDeserializer` during "
- "deserialization."))
+ help=(
+ "A JSON string containing additional keyword arguments to "
+ "pass to Tensorizer's `TensorDeserializer` during "
+ "deserialization."
+ ),
+ )
TensorizerArgs.add_cli_args(deserialize_parser)
return parser
-def merge_extra_config_with_tensorizer_config(extra_cfg: dict,
- cfg: TensorizerConfig):
+
+def merge_extra_config_with_tensorizer_config(extra_cfg: dict, cfg: TensorizerConfig):
for k, v in extra_cfg.items():
if hasattr(cfg, k):
setattr(cfg, k, v)
logger.info(
"Updating TensorizerConfig with %s from "
- "--model-loader-extra-config provided", k
+ "--model-loader-extra-config provided",
+ k,
)
+
def deserialize(args, tensorizer_config):
if args.lora_path:
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
- llm = LLM(model=args.model,
- load_format="tensorizer",
- tensor_parallel_size=args.tensor_parallel_size,
- model_loader_extra_config=tensorizer_config,
- enable_lora=True,
+ llm = LLM(
+ model=args.model,
+ load_format="tensorizer",
+ tensor_parallel_size=args.tensor_parallel_size,
+ model_loader_extra_config=tensorizer_config,
+ enable_lora=True,
)
sampling_params = SamplingParams(
- temperature=0,
- max_tokens=256,
- stop=["[/assistant]"]
+ temperature=0, max_tokens=256, stop=["[/assistant]"]
)
# Truncating this as the extra text isn't necessary
- prompts = [
- "[user] Write a SQL query to answer the question based on ..."
- ]
+ prompts = ["[user] Write a SQL query to answer the question based on ..."]
# Test LoRA load
print(
llm.generate(
- prompts,
- sampling_params,
- lora_request=LoRARequest("sql-lora",
- 1,
- args.lora_path,
- tensorizer_config_dict = tensorizer_config
- .to_serializable())
+ prompts,
+ sampling_params,
+ lora_request=LoRARequest(
+ "sql-lora",
+ 1,
+ args.lora_path,
+ tensorizer_config_dict=tensorizer_config.to_serializable(),
+ ),
)
)
else:
- llm = LLM(model=args.model,
- load_format="tensorizer",
- tensor_parallel_size=args.tensor_parallel_size,
- model_loader_extra_config=tensorizer_config
+ llm = LLM(
+ model=args.model,
+ load_format="tensorizer",
+ tensor_parallel_size=args.tensor_parallel_size,
+ model_loader_extra_config=tensorizer_config,
)
return llm
@@ -285,17 +306,20 @@ def main():
parser = get_parser()
args = parser.parse_args()
- s3_access_key_id = (getattr(args, 's3_access_key_id', None)
- or os.environ.get("S3_ACCESS_KEY_ID", None))
- s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
- or os.environ.get("S3_SECRET_ACCESS_KEY", None))
- s3_endpoint = (getattr(args, 's3_endpoint', None)
- or os.environ.get("S3_ENDPOINT_URL", None))
+ s3_access_key_id = getattr(args, "s3_access_key_id", None) or os.environ.get(
+ "S3_ACCESS_KEY_ID", None
+ )
+ s3_secret_access_key = getattr(
+ args, "s3_secret_access_key", None
+ ) or os.environ.get("S3_SECRET_ACCESS_KEY", None)
+ s3_endpoint = getattr(args, "s3_endpoint", None) or os.environ.get(
+ "S3_ENDPOINT_URL", None
+ )
credentials = {
"s3_access_key_id": s3_access_key_id,
"s3_secret_access_key": s3_secret_access_key,
- "s3_endpoint": s3_endpoint
+ "s3_endpoint": s3_endpoint,
}
model_ref = args.model
@@ -309,25 +333,25 @@ def main():
if args.model_loader_extra_config:
extra_config = json.loads(args.model_loader_extra_config)
-
- tensorizer_dir = (args.serialized_directory or
- extra_config.get("tensorizer_dir"))
- tensorizer_uri = (getattr(args, "path_to_tensors", None)
- or extra_config.get("tensorizer_uri"))
+ tensorizer_dir = args.serialized_directory or extra_config.get("tensorizer_dir")
+ tensorizer_uri = getattr(args, "path_to_tensors", None) or extra_config.get(
+ "tensorizer_uri"
+ )
if tensorizer_dir and tensorizer_uri:
- parser.error("--serialized-directory and --path-to-tensors "
- "cannot both be provided")
+ parser.error(
+ "--serialized-directory and --path-to-tensors cannot both be provided"
+ )
if not tensorizer_dir and not tensorizer_uri:
- parser.error("Either --serialized-directory or --path-to-tensors "
- "must be provided")
-
+ parser.error(
+ "Either --serialized-directory or --path-to-tensors must be provided"
+ )
if args.command == "serialize":
engine_args = EngineArgs.from_cli_args(args)
- input_dir = tensorizer_dir.rstrip('/')
+ input_dir = tensorizer_dir.rstrip("/")
suffix = args.suffix if args.suffix else uuid.uuid4().hex
base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
if engine_args.tensor_parallel_size > 1:
@@ -339,15 +363,14 @@ def main():
tensorizer_uri=model_path,
encryption_keyfile=keyfile,
serialization_kwargs=args.serialization_kwargs or {},
- **credentials
+ **credentials,
)
if args.lora_path:
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
tensorize_lora_adapter(args.lora_path, tensorizer_config)
- merge_extra_config_with_tensorizer_config(extra_config,
- tensorizer_config)
+ merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
tensorize_vllm_model(engine_args, tensorizer_config)
elif args.command == "deserialize":
@@ -356,11 +379,10 @@ def main():
tensorizer_dir=args.serialized_directory,
encryption_keyfile=keyfile,
deserialization_kwargs=args.deserialization_kwargs or {},
- **credentials
+ **credentials,
)
- merge_extra_config_with_tensorizer_config(extra_config,
- tensorizer_config)
+ merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
deserialize(args, tensorizer_config)
else:
raise ValueError("Either serialize or deserialize must be specified.")
diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py
index 7e3a230b5f..16a4271655 100644
--- a/tests/compile/test_silu_mul_quant_fusion.py
+++ b/tests/compile/test_silu_mul_quant_fusion.py
@@ -8,16 +8,11 @@ import torch
import vllm.envs as envs
from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
-
-# yapf conflicts with isort for this block
-# yapf: disable
from vllm.compilation.activation_quant_fusion import (
FUSED_OPS,
SILU_MUL_OP,
ActivationQuantFusionPass,
)
-
-# yapf: enable
from vllm.compilation.fusion import QUANT_OPS
from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass
diff --git a/tests/distributed/test_expert_parallel.py b/tests/distributed/test_expert_parallel.py
index 68ac52af30..94f0ece497 100644
--- a/tests/distributed/test_expert_parallel.py
+++ b/tests/distributed/test_expert_parallel.py
@@ -107,10 +107,8 @@ class EPTestSettings:
# NOTE: You can adjust tp_base locally to fit the model in GPU
# The values displayed here are only a rough indicator of the size of the model
-# yapf: disable
TEST_MODELS = {
- "deepseek-ai/DeepSeek-V2-Lite-Chat": EPTestSettings.fast(
- trust_remote_code=True),
+ "deepseek-ai/DeepSeek-V2-Lite-Chat": EPTestSettings.fast(trust_remote_code=True),
"mistralai/Mixtral-8x7B-Instruct-v0.1": EPTestSettings.fast(tp_base=4),
}
@@ -192,22 +190,24 @@ def _compare_tp(
]
try:
- compare_two_settings(model_name,
- ep_args,
- tp_args,
- ep_env,
- tp_env,
- method=method,
- max_wait_seconds=360)
+ compare_two_settings(
+ model_name,
+ ep_args,
+ tp_args,
+ ep_env,
+ tp_env,
+ method=method,
+ max_wait_seconds=360,
+ )
except Exception:
raise
@pytest.mark.parametrize(
- ("model_name", "parallel_setup", "distributed_backend", "runner",
- "test_options"),
+ ("model_name", "parallel_setup", "distributed_backend", "runner", "test_options"),
[
- params for model_name, settings in TEST_MODELS.items()
+ params
+ for model_name, settings in TEST_MODELS.items()
for params in settings.iter_params(model_name)
],
)
@@ -220,10 +220,12 @@ def test_ep(
test_options: EPTestOptions,
num_gpus_available,
):
- _compare_tp(model_name,
- parallel_setup,
- distributed_backend,
- runner,
- test_options,
- num_gpus_available,
- method="generate")
+ _compare_tp(
+ model_name,
+ parallel_setup,
+ distributed_backend,
+ runner,
+ test_options,
+ num_gpus_available,
+ method="generate",
+ )
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index 26ee60c150..119e8e7621 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -100,7 +100,6 @@ class PPTestSettings:
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
# The values displayed here are only a rough indicator of the size of the model
-# yapf: disable
TEXT_GENERATION_MODELS = {
# [Decoder-only]
# Uses Llama
@@ -150,7 +149,9 @@ TEXT_GENERATION_MODELS = {
"adept/persimmon-8b-chat": PPTestSettings.fast(),
"microsoft/phi-2": PPTestSettings.fast(),
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
- "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(multi_node_only=True, load_format="dummy"), # noqa: E501
+ "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(
+ multi_node_only=True, load_format="dummy"
+ ), # noqa: E501
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
"Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
@@ -196,7 +197,6 @@ MULTIMODAL_MODELS = {
"Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
"fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
}
-# yapf: enable
# NOTE: You can update this on your local machine to run specific tests
TEST_MODELS = [
diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index c0eb0e5ac5..9d367349fc 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -287,29 +287,15 @@ def test_prefix_cache_default():
assert not engine_args.enable_prefix_caching
-# yapf: disable
-@pytest.mark.parametrize(("arg", "expected", "option"), [
- (None, None, "mm-processor-kwargs"),
- ("{}", {}, "mm-processor-kwargs"),
- (
- '{"num_crops": 4}',
- {
- "num_crops": 4
- },
- "mm-processor-kwargs"
- ),
- (
- '{"foo": {"bar": "baz"}}',
- {
- "foo":
- {
- "bar": "baz"
- }
- },
- "mm-processor-kwargs"
- ),
-])
-# yapf: enable
+@pytest.mark.parametrize(
+ ("arg", "expected", "option"),
+ [
+ (None, None, "mm-processor-kwargs"),
+ ("{}", {}, "mm-processor-kwargs"),
+ ('{"num_crops": 4}', {"num_crops": 4}, "mm-processor-kwargs"),
+ ('{"foo": {"bar": "baz"}}', {"foo": {"bar": "baz"}}, "mm-processor-kwargs"),
+ ],
+)
def test_composite_arg_parser(arg, expected, option):
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
if arg is None:
@@ -321,8 +307,7 @@ def test_composite_arg_parser(arg, expected, option):
def test_human_readable_model_len():
# `exit_on_error` disabled to test invalid values below
- parser = EngineArgs.add_cli_args(
- FlexibleArgumentParser(exit_on_error=False))
+ parser = EngineArgs.add_cli_args(FlexibleArgumentParser(exit_on_error=False))
args = parser.parse_args([])
assert args.max_model_len is None
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 7ddad4d513..975ca53a3a 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -15,6 +15,7 @@ from vllm.assets.video import VideoAsset
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import (
_try_extract_ast,
+ apply_mistral_chat_template,
load_chat_template,
parse_chat_messages,
parse_chat_messages_futures,
@@ -1855,17 +1856,17 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa
# NOTE: Qwen2-Audio default chat template is specially defined inside
# processor class instead of using `tokenizer_config.json`
-# yapf: disable
@pytest.mark.parametrize(
("model", "expected_format"),
- [(PHI3V_MODEL_ID, "string"),
- (QWEN2VL_MODEL_ID, "openai"),
- (QWEN25VL_MODEL_ID, "openai"),
- (ULTRAVOX_MODEL_ID, "string"),
- (QWEN2AUDIO_MODEL_ID, "openai"),
- (LLAMA_GUARD_MODEL_ID, "openai")],
+ [
+ (PHI3V_MODEL_ID, "string"),
+ (QWEN2VL_MODEL_ID, "openai"),
+ (QWEN25VL_MODEL_ID, "openai"),
+ (ULTRAVOX_MODEL_ID, "string"),
+ (QWEN2AUDIO_MODEL_ID, "openai"),
+ (LLAMA_GUARD_MODEL_ID, "openai"),
+ ],
)
-# yapf: enable
def test_resolve_content_format_hf_defined(model, expected_format):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -1879,7 +1880,8 @@ def test_resolve_content_format_hf_defined(model, expected_format):
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.skip_tokenizer_init,
enforce_eager=model_info.enforce_eager,
- dtype=model_info.dtype)
+ dtype=model_info.dtype,
+ )
tokenizer = get_tokenizer(
model,
@@ -1911,18 +1913,18 @@ def test_resolve_content_format_hf_defined(model, expected_format):
assert resolved_format == expected_format
-# yapf: disable
@pytest.mark.parametrize(
("model", "expected_format"),
- [("Salesforce/blip2-opt-2.7b", "string"),
- ("facebook/chameleon-7b", "string"),
- ("deepseek-ai/deepseek-vl2-tiny", "string"),
- ("adept/fuyu-8b", "string"),
- ("google/paligemma-3b-mix-224", "string"),
- ("Qwen/Qwen-VL", "string"),
- ("Qwen/Qwen-VL-Chat", "string")],
+ [
+ ("Salesforce/blip2-opt-2.7b", "string"),
+ ("facebook/chameleon-7b", "string"),
+ ("deepseek-ai/deepseek-vl2-tiny", "string"),
+ ("adept/fuyu-8b", "string"),
+ ("google/paligemma-3b-mix-224", "string"),
+ ("Qwen/Qwen-VL", "string"),
+ ("Qwen/Qwen-VL-Chat", "string"),
+ ],
)
-# yapf: enable
def test_resolve_content_format_fallbacks(model, expected_format):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -1936,7 +1938,8 @@ def test_resolve_content_format_fallbacks(model, expected_format):
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.skip_tokenizer_init,
enforce_eager=model_info.enforce_eager,
- dtype=model_info.dtype)
+ dtype=model_info.dtype,
+ )
tokenizer = get_tokenizer(
model_config.tokenizer,
@@ -1968,30 +1971,30 @@ def test_resolve_content_format_fallbacks(model, expected_format):
assert resolved_format == expected_format
-# yapf: disable
@pytest.mark.parametrize(
("template_path", "expected_format"),
- [("template_alpaca.jinja", "string"),
- ("template_baichuan.jinja", "string"),
- ("template_chatglm.jinja", "string"),
- ("template_chatglm2.jinja", "string"),
- ("template_chatml.jinja", "string"),
- ("template_dse_qwen2_vl.jinja", "openai"),
- ("template_falcon_180b.jinja", "string"),
- ("template_falcon.jinja", "string"),
- ("template_inkbot.jinja", "string"),
- ("template_teleflm.jinja", "string"),
- ("template_vlm2vec_phi3v.jinja", "openai"),
- ("template_vlm2vec_qwen2vl.jinja", "openai"),
- ("tool_chat_template_granite_20b_fc.jinja", "string"),
- ("tool_chat_template_hermes.jinja", "string"),
- ("tool_chat_template_internlm2_tool.jinja", "string"),
- ("tool_chat_template_llama3.1_json.jinja", "openai"),
- ("tool_chat_template_llama3.2_json.jinja", "openai"),
- ("tool_chat_template_mistral_parallel.jinja", "string"),
- ("tool_chat_template_mistral.jinja", "string")],
+ [
+ ("template_alpaca.jinja", "string"),
+ ("template_baichuan.jinja", "string"),
+ ("template_chatglm.jinja", "string"),
+ ("template_chatglm2.jinja", "string"),
+ ("template_chatml.jinja", "string"),
+ ("template_dse_qwen2_vl.jinja", "openai"),
+ ("template_falcon_180b.jinja", "string"),
+ ("template_falcon.jinja", "string"),
+ ("template_inkbot.jinja", "string"),
+ ("template_teleflm.jinja", "string"),
+ ("template_vlm2vec_phi3v.jinja", "openai"),
+ ("template_vlm2vec_qwen2vl.jinja", "openai"),
+ ("tool_chat_template_granite_20b_fc.jinja", "string"),
+ ("tool_chat_template_hermes.jinja", "string"),
+ ("tool_chat_template_internlm2_tool.jinja", "string"),
+ ("tool_chat_template_llama3.1_json.jinja", "openai"),
+ ("tool_chat_template_llama3.2_json.jinja", "openai"),
+ ("tool_chat_template_mistral_parallel.jinja", "string"),
+ ("tool_chat_template_mistral.jinja", "string"),
+ ],
)
-# yapf: enable
def test_resolve_content_format_examples(template_path, expected_format):
model_config = ModelConfig(
PHI3V_MODEL_ID, # Dummy
@@ -2024,40 +2027,34 @@ def test_resolve_content_format_examples(template_path, expected_format):
assert resolved_format == expected_format
-def test_parse_chat_messages_include_thinking_chunk(mistral_model_config,
- mistral_tokenizer):
- messages = [{
- "role":
- "system",
- "content": [{
- "type": "text",
- "text": "You are a helpful assistant."
- }, {
- "type":
- "thinking",
- "closed":
- True,
- "thinking":
- "Only return the answer when you are confident."
- }]
- }, {
- "role": "user",
- "content": "What is 2+2?"
- }, {
- "role":
- "assistant",
- "content": [{
- "type": "text",
- "text": "Let me think about it."
- }, {
- "type": "thinking",
- "closed": True,
- "thinking": "2+2 = 4"
- }, {
- "type": "text",
- "text": "The answer is 4.",
- }],
- }]
+def test_parse_chat_messages_include_thinking_chunk(
+ mistral_model_config, mistral_tokenizer
+):
+ messages = [
+ {
+ "role": "system",
+ "content": [
+ {"type": "text", "text": "You are a helpful assistant."},
+ {
+ "type": "thinking",
+ "closed": True,
+ "thinking": "Only return the answer when you are confident.",
+ },
+ ],
+ },
+ {"role": "user", "content": "What is 2+2?"},
+ {
+ "role": "assistant",
+ "content": [
+ {"type": "text", "text": "Let me think about it."},
+ {"type": "thinking", "closed": True, "thinking": "2+2 = 4"},
+ {
+ "type": "text",
+ "text": "The answer is 4.",
+ },
+ ],
+ },
+ ]
conversation_with_thinking, _, _ = parse_chat_messages(
messages,
@@ -2066,122 +2063,105 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config,
content_format="openai",
)
- expected_conversation = [{
- "role":
- "system",
- "content": [{
- "type": "text",
- "text": "You are a helpful assistant."
- }, {
- "type": "text",
- "text": "Only return the answer when you are confident."
- }],
- }, {
- "role":
- "user",
- "content": [{
- "type": "text",
- "text": "What is 2+2?"
- }],
- }, {
- "role":
- "assistant",
- "content": [
- {
- "type": "text",
- "text": "Let me think about it."
- },
- {
- "type": "text",
- "text": "2+2 = 4"
- },
- {
- "type": "text",
- "text": "The answer is 4."
- },
- ]
- }]
+ expected_conversation = [
+ {
+ "role": "system",
+ "content": [
+ {"type": "text", "text": "You are a helpful assistant."},
+ {
+ "type": "text",
+ "text": "Only return the answer when you are confident.",
+ },
+ ],
+ },
+ {
+ "role": "user",
+ "content": [{"type": "text", "text": "What is 2+2?"}],
+ },
+ {
+ "role": "assistant",
+ "content": [
+ {"type": "text", "text": "Let me think about it."},
+ {"type": "text", "text": "2+2 = 4"},
+ {"type": "text", "text": "The answer is 4."},
+ ],
+ },
+ ]
assert conversation_with_thinking == expected_conversation
def test_apply_mistral_chat_template_thinking_chunk():
- # Moved import here to avoid yapf and isort conflicts
- from vllm.entrypoints.chat_utils import apply_mistral_chat_template
- messages = [{
- "role":
- "system",
- "content": [{
- "type": "text",
- "text": "You are a helpful assistant."
- }, {
- "type":
- "thinking",
- "closed":
- True,
- "thinking":
- "Only return the answer when you are confident."
- }]
- }, {
- "role": "user",
- "content": "What is 2+2?"
- }, {
- "role":
- "assistant",
- "content": [{
- "type": "text",
- "text": "Let me think about it."
- }, {
- "type": "thinking",
- "closed": True,
- "thinking": "2+2 = 4"
- }, {
- "type": "text",
- "text": "The answer is 4.",
- }],
- }, {
- "role": "user",
- "content": "Thanks, what is 3+3?"
- }]
+ messages = [
+ {
+ "role": "system",
+ "content": [
+ {"type": "text", "text": "You are a helpful assistant."},
+ {
+ "type": "thinking",
+ "closed": True,
+ "thinking": "Only return the answer when you are confident.",
+ },
+ ],
+ },
+ {"role": "user", "content": "What is 2+2?"},
+ {
+ "role": "assistant",
+ "content": [
+ {"type": "text", "text": "Let me think about it."},
+ {"type": "thinking", "closed": True, "thinking": "2+2 = 4"},
+ {
+ "type": "text",
+ "text": "The answer is 4.",
+ },
+ ],
+ },
+ {"role": "user", "content": "Thanks, what is 3+3?"},
+ ]
# TODO(Julien): upon model release change to a tokenizer already configured.
# =================================================================
mistral_tokenizer = MistralTokenizer.from_pretrained(
- "mistralai/Devstral-Small-2507")
+ "mistralai/Devstral-Small-2507"
+ )
assert isinstance(mistral_tokenizer.tokenizer, Tekkenizer)
# Add think special tokens to the tokenizer
mistral_tokenizer.tokenizer._all_special_tokens[35] = SpecialTokenInfo(
- rank=35, is_control=True, token_str=SpecialTokens.begin_think.value)
+ rank=35, is_control=True, token_str=SpecialTokens.begin_think.value
+ )
mistral_tokenizer.tokenizer._all_special_tokens[36] = SpecialTokenInfo(
- rank=36, is_control=True, token_str=SpecialTokens.end_think.value)
+ rank=36, is_control=True, token_str=SpecialTokens.end_think.value
+ )
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab = {
k: v
- for k, v in
- mistral_tokenizer.tokenizer._special_tokens_reverse_vocab.items()
+ for k, v in mistral_tokenizer.tokenizer._special_tokens_reverse_vocab.items()
if v not in {35, 36}
}
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[
- SpecialTokens.begin_think.value] = 35
+ SpecialTokens.begin_think.value
+ ] = 35
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[
- SpecialTokens.end_think.value] = 36
+ SpecialTokens.end_think.value
+ ] = 36
mistral_tokenizer.instruct.BEGIN_THINK = 35
mistral_tokenizer.instruct.END_THINK = 36
# =================================================================
- tokens_ids = apply_mistral_chat_template(mistral_tokenizer,
- messages,
- chat_template=None,
- tools=None)
+ tokens_ids = apply_mistral_chat_template(
+ mistral_tokenizer, messages, chat_template=None, tools=None
+ )
string_tokens = mistral_tokenizer.mistral.decode(
- tokens_ids, special_token_policy=SpecialTokenPolicy.KEEP)
+ tokens_ids, special_token_policy=SpecialTokenPolicy.KEEP
+ )
expected_tokens = (
r"[SYSTEM_PROMPT]You are a helpful assistant.[THINK]Only return the"
r" answer when you are confident.[/THINK][/SYSTEM_PROMPT]"
r"[INST]What is 2+2?[/INST]"
r"Let me think about it.[THINK]2+2 = 4[/THINK]The answer is 4."
- r"[INST]Thanks, what is 3+3?[/INST]")
+ r"[INST]Thanks, what is 3+3?[/INST]"
+ )
assert string_tokens == expected_tokens
@@ -2192,37 +2172,32 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
):
audio_uuid = "abcd"
conversation, mm_data, mm_uuids = parse_chat_messages(
- [{
- "role":
- "user",
- "content": [
- {
- "type": "input_audio",
- "input_audio": {},
- "uuid": audio_uuid,
- },
- {
- "type": "text",
- "text": "What does the audio say?"
- },
- ],
- }],
+ [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "input_audio",
+ "input_audio": {},
+ "uuid": audio_uuid,
+ },
+ {"type": "text", "text": "What does the audio say?"},
+ ],
+ }
+ ],
qwen2_audio_model_config,
qwen2_audio_tokenizer,
content_format="string",
)
- assert conversation == [{
- "role":
- "user",
- "content":
- "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?"
- }]
+ assert conversation == [
+ {
+ "role": "user",
+ "content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?",
+ }
+ ]
_assert_mm_data_inputs(mm_data, {"audio": 1})
- _assert_mm_uuids(mm_uuids,
- 1,
- modality="audio",
- expected_uuids=[audio_uuid])
+ _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])
@pytest.mark.asyncio
@@ -2232,34 +2207,29 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
):
audio_uuid = "abcd"
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
- [{
- "role":
- "user",
- "content": [
- {
- "type": "input_audio",
- "input_audio": {},
- "uuid": audio_uuid,
- },
- {
- "type": "text",
- "text": "What does the audio say?"
- },
- ],
- }],
+ [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "input_audio",
+ "input_audio": {},
+ "uuid": audio_uuid,
+ },
+ {"type": "text", "text": "What does the audio say?"},
+ ],
+ }
+ ],
qwen2_audio_model_config,
qwen2_audio_tokenizer,
content_format="string",
)
- assert conversation == [{
- "role":
- "user",
- "content":
- "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?"
- }]
+ assert conversation == [
+ {
+ "role": "user",
+ "content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?",
+ }
+ ]
_assert_mm_data_inputs(await mm_future, {"audio": 1})
- _assert_mm_uuids(mm_uuids,
- 1,
- modality="audio",
- expected_uuids=[audio_uuid])
+ _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index 7c29a85298..695e06e7c1 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -12,9 +12,6 @@ import torch
import torch.nn.functional as F
from vllm.config.lora import LoRAConfig
-
-# yapf conflicts with isort for this block
-# yapf: disable
from vllm.lora.layers import (
BaseLayerWithLoRA,
ColumnParallelLinearWithLoRA,
@@ -32,8 +29,6 @@ from vllm.lora.layers import (
RowParallelLinearWithShardedLoRA,
VocabParallelEmbeddingWithLoRA,
)
-
-# yapf: enable
from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights
from vllm.lora.punica_wrapper import get_punica_wrapper
from vllm.model_executor.layers.linear import (
diff --git a/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py b/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py
index ed86a6b8b1..57db1f98ba 100644
--- a/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py
+++ b/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py
@@ -17,8 +17,6 @@ import vllm.model_executor.model_loader.tensorizer
from tests.utils import VLLM_PATH, RemoteOpenAIServer
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
-
-# yapf: disable
from vllm.model_executor.model_loader.tensorizer import (
TensorizerConfig,
TensorSerializer,
@@ -29,8 +27,6 @@ from vllm.model_executor.model_loader.tensorizer import (
from vllm.model_executor.model_loader.tensorizer_loader import (
BLACKLISTED_TENSORIZER_ARGS,
)
-
-# yapf: enable
from vllm.utils import PlaceholderModule
from .conftest import DummyExecutor, assert_from_collective_rpc
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index 143d60fbf9..9168778a16 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -45,18 +45,17 @@ from .vlm_utils.types import (
if current_platform.is_rocm():
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
-# yapf: disable
COMMON_BROADCAST_SETTINGS = {
"test_type": VLMTestType.IMAGE,
"dtype": "half",
"max_tokens": 5,
"tensor_parallel_size": 2,
"hf_model_kwargs": {"device_map": "auto"},
- "image_size_factors": [(.25, 0.5, 1.0)],
+ "image_size_factors": [(0.25, 0.5, 1.0)],
"distributed_executor_backend": (
"ray",
"mp",
- )
+ ),
}
### Test configuration for specific models
@@ -96,22 +95,20 @@ VLM_TEST_SETTINGS = {
#### Core tests to always run in the CI
"llava": VLMTestInfo(
models=["llava-hf/llava-1.5-7b-hf"],
- test_type=(
- VLMTestType.EMBEDDING,
- VLMTestType.IMAGE,
- VLMTestType.CUSTOM_INPUTS
- ),
+ test_type=(VLMTestType.EMBEDDING, VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
convert_assets_to_embeddings=model_utils.get_llava_embeddings,
max_model_len=4096,
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
- custom_test_opts=[CustomTestOptions(
- inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
- formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
- ),
- limit_mm_per_prompt={"image": 4},
- )],
+ custom_test_opts=[
+ CustomTestOptions(
+ inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
+ formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
+ ),
+ limit_mm_per_prompt={"image": 4},
+ )
+ ],
# TODO: Revert to "auto" when CPU backend can use torch > 2.6
dtype="bfloat16" if current_platform.is_cpu() else "auto",
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
@@ -120,27 +117,27 @@ VLM_TEST_SETTINGS = {
models=["google/paligemma-3b-mix-224"],
test_type=VLMTestType.IMAGE,
prompt_formatter=identity,
- img_idx_to_prompt = lambda idx: "",
+ img_idx_to_prompt=lambda idx: "",
# Paligemma uses its own sample prompts because the default one fails
- single_image_prompts=IMAGE_ASSETS.prompts({
- "stop_sign": "caption es",
- "cherry_blossom": "What is in the picture?",
- }),
+ single_image_prompts=IMAGE_ASSETS.prompts(
+ {
+ "stop_sign": "caption es",
+ "cherry_blossom": "What is in the picture?",
+ }
+ ),
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
dtype="bfloat16",
- marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")], # noqa: E501
+ marks=[
+ pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
+ ], # noqa: E501
),
"qwen2_5_vl": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
- test_type=(
- VLMTestType.IMAGE,
- VLMTestType.MULTI_IMAGE,
- VLMTestType.VIDEO
- ),
- prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
- img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
- video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
+ test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
+ prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
+ img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
+ video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
@@ -150,17 +147,13 @@ VLM_TEST_SETTINGS = {
),
"qwen2_5_omni": VLMTestInfo(
models=["Qwen/Qwen2.5-Omni-3B"],
- test_type=(
- VLMTestType.IMAGE,
- VLMTestType.MULTI_IMAGE,
- VLMTestType.VIDEO
- ),
- prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
- img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501
- video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501
+ test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
+ prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
+ img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501
+ video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
- num_logprobs= 6 if current_platform.is_cpu() else 5,
+ num_logprobs=6 if current_platform.is_cpu() else 5,
auto_cls=AutoModelForTextToWaveform,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
@@ -168,9 +161,9 @@ VLM_TEST_SETTINGS = {
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
"ultravox": VLMTestInfo(
- models = ["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
+ models=["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
test_type=VLMTestType.AUDIO,
- prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
+ prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
audio_idx_to_prompt=lambda idx: "<|audio|>",
max_model_len=4096,
max_num_seqs=2,
@@ -184,9 +177,11 @@ VLM_TEST_SETTINGS = {
"llava-onevision-transformers": VLMTestInfo(
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
test_type=VLMTestType.IMAGE,
- prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
+ prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
max_model_len=16384,
- hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501
+ hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
+ "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
+ ), # noqa: E501
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
image_size_factors=[(0.25, 0.5, 1.0)],
@@ -201,7 +196,7 @@ VLM_TEST_SETTINGS = {
"idefics3-transformers": VLMTestInfo(
models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
- prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}\nAssistant:", # noqa: E501
+ prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}\nAssistant:", # noqa: E501
img_idx_to_prompt=lambda idx: "",
max_model_len=8192,
max_num_seqs=2,
@@ -217,8 +212,8 @@ VLM_TEST_SETTINGS = {
"qwen2_5_vl-transformers": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
test_type=VLMTestType.IMAGE,
- prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
- img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
+ prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
+ img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
@@ -228,23 +223,24 @@ VLM_TEST_SETTINGS = {
"model_impl": "transformers",
},
# FIXME: Investigate mrope issue
- marks=[large_gpu_mark(min_gb=32),
- pytest.mark.skip(reason="Mrope issue")],
+ marks=[large_gpu_mark(min_gb=32), pytest.mark.skip(reason="Mrope issue")],
),
#### Extended model tests
"aria": VLMTestInfo(
models=["rhymes-ai/Aria"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
- prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
+ prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
img_idx_to_prompt=lambda idx: "<|img|>\n",
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
- single_image_prompts=IMAGE_ASSETS.prompts({
- "stop_sign": "Please describe the image shortly.",
- "cherry_blossom": "Please infer the season with reason.", # noqa: E501
- }),
- multi_image_prompt="Describe the two images shortly.", # noqa: E501
+ single_image_prompts=IMAGE_ASSETS.prompts(
+ {
+ "stop_sign": "Please describe the image shortly.",
+ "cherry_blossom": "Please infer the season with reason.", # noqa: E501
+ }
+ ),
+ multi_image_prompt="Describe the two images shortly.", # noqa: E501
stop_str=["<|im_end|>"],
image_size_factors=[(0.10, 0.15)],
max_tokens=64,
@@ -253,11 +249,13 @@ VLM_TEST_SETTINGS = {
"aya_vision": VLMTestInfo(
models=["CohereForAI/aya-vision-8b"],
test_type=(VLMTestType.IMAGE),
- prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
- single_image_prompts=IMAGE_ASSETS.prompts({
- "stop_sign": "What's the content in the center of the image?", # noqa: E501
- "cherry_blossom": "What is the season?", # noqa: E501
- }),
+ prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
+ single_image_prompts=IMAGE_ASSETS.prompts(
+ {
+ "stop_sign": "What's the content in the center of the image?", # noqa: E501
+ "cherry_blossom": "What is the season?", # noqa: E501
+ }
+ ),
multi_image_prompt="Describe the two images in detail.", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
@@ -267,11 +265,13 @@ VLM_TEST_SETTINGS = {
"aya_vision-multi_image": VLMTestInfo(
models=["CohereForAI/aya-vision-8b"],
test_type=(VLMTestType.MULTI_IMAGE),
- prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
- single_image_prompts=IMAGE_ASSETS.prompts({
- "stop_sign": "What's the content in the center of the image?", # noqa: E501
- "cherry_blossom": "What is the season?", # noqa: E501
- }),
+ prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
+ single_image_prompts=IMAGE_ASSETS.prompts(
+ {
+ "stop_sign": "What's the content in the center of the image?", # noqa: E501
+ "cherry_blossom": "What is the season?", # noqa: E501
+ }
+ ),
multi_image_prompt="Describe the two images in detail.", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
@@ -297,27 +297,29 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
# For chameleon, we only compare the sequences
- vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
- hf_output_post_proc = lambda hf_output, model: hf_output[:2],
+ vllm_output_post_proc=lambda vllm_output, model: vllm_output[:2],
+ hf_output_post_proc=lambda hf_output, model: hf_output[:2],
comparator=check_outputs_equal,
max_tokens=8,
dtype="bfloat16",
),
"deepseek_vl_v2": VLMTestInfo(
- models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module
+ models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
- prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
+ prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
- single_image_prompts=IMAGE_ASSETS.prompts({
- "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501
- "cherry_blossom": "\nPlease infer the season with reason in details.", # noqa: E501
- }),
- multi_image_prompt="image_1:\nimage_2:\nWhich image can we see the car and the tower?", # noqa: E501
+ single_image_prompts=IMAGE_ASSETS.prompts(
+ {
+ "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501
+ "cherry_blossom": "\nPlease infer the season with reason in details.", # noqa: E501
+ }
+ ),
+ multi_image_prompt="image_1:\nimage_2:\nWhich image can we see the car and the tower?", # noqa: E501
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501
- image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
+ image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
),
"fuyu": VLMTestInfo(
models=["adept/fuyu-8b"],
@@ -336,11 +338,13 @@ VLM_TEST_SETTINGS = {
"gemma3": VLMTestInfo(
models=["google/gemma-3-4b-it"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
- prompt_formatter=lambda img_prompt: f"user\n{img_prompt}\nmodel\n", # noqa: E501
- single_image_prompts=IMAGE_ASSETS.prompts({
- "stop_sign": "What's the content in the center of the image?", # noqa: E501
- "cherry_blossom": "What is the season?", # noqa: E501
- }),
+ prompt_formatter=lambda img_prompt: f"user\n{img_prompt}\nmodel\n", # noqa: E501
+ single_image_prompts=IMAGE_ASSETS.prompts(
+ {
+ "stop_sign": "What's the content in the center of the image?", # noqa: E501
+ "cherry_blossom": "What is the season?", # noqa: E501
+ }
+ ),
multi_image_prompt="Describe the two images in detail.", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
@@ -353,10 +357,12 @@ VLM_TEST_SETTINGS = {
models=["zai-org/glm-4v-9b"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
- single_image_prompts=IMAGE_ASSETS.prompts({
- "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501
- "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?", # noqa: E501
- }),
+ single_image_prompts=IMAGE_ASSETS.prompts(
+ {
+ "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501
+ "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?", # noqa: E501
+ }
+ ),
max_model_len=2048,
max_num_seqs=2,
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
@@ -372,8 +378,8 @@ VLM_TEST_SETTINGS = {
models=["zai-org/GLM-4.1V-9B-Thinking"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
- img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501
- video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501
+ img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501
+ video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501
max_model_len=2048,
max_num_seqs=2,
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
@@ -390,23 +396,27 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
patch_hf_runner=model_utils.glm4_1v_patch_hf_runner,
- custom_test_opts=[CustomTestOptions(
- inputs=custom_inputs.video_with_metadata_glm4_1v(),
- limit_mm_per_prompt={"video": 1},
- )],
+ custom_test_opts=[
+ CustomTestOptions(
+ inputs=custom_inputs.video_with_metadata_glm4_1v(),
+ limit_mm_per_prompt={"video": 1},
+ )
+ ],
marks=[large_gpu_mark(min_gb=32)],
),
"h2ovl": VLMTestInfo(
- models = [
+ models=[
"h2oai/h2ovl-mississippi-800m",
"h2oai/h2ovl-mississippi-2b",
],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
- prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
- single_image_prompts=IMAGE_ASSETS.prompts({
- "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501
- "cherry_blossom": "\nWhat is the season?",
- }),
+ prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
+ single_image_prompts=IMAGE_ASSETS.prompts(
+ {
+ "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501
+ "cherry_blossom": "\nWhat is the season?",
+ }
+ ),
multi_image_prompt="Image-1: \nImage-2: \nDescribe the two images in short.", # noqa: E501
max_model_len=8192,
use_tokenizer_eos=True,
@@ -416,7 +426,7 @@ VLM_TEST_SETTINGS = {
"idefics3": VLMTestInfo(
models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
- prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}\nAssistant:", # noqa: E501
+ prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}\nAssistant:", # noqa: E501
img_idx_to_prompt=lambda idx: "",
max_model_len=8192,
max_num_seqs=2,
@@ -431,11 +441,13 @@ VLM_TEST_SETTINGS = {
# "OpenGVLab/Mono-InternVL-2B",
],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
- prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
- single_image_prompts=IMAGE_ASSETS.prompts({
- "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501
- "cherry_blossom": "\nWhat is the season?",
- }),
+ prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
+ single_image_prompts=IMAGE_ASSETS.prompts(
+ {
+ "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501
+ "cherry_blossom": "\nWhat is the season?",
+ }
+ ),
multi_image_prompt="Image-1: \nImage-2: \nDescribe the two images in short.", # noqa: E501
max_model_len=4096,
use_tokenizer_eos=True,
@@ -446,7 +458,7 @@ VLM_TEST_SETTINGS = {
"OpenGVLab/InternVL3-1B",
],
test_type=VLMTestType.VIDEO,
- prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
+ prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
video_idx_to_prompt=lambda idx: "