diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index f7106f016b..d29a199c5d 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -12,9 +12,6 @@ from functools import reduce from typing import Optional, Union import jinja2 - -# yapf conflicts with isort for this block -# yapf: disable from vllm_cutlass_library_extension import ( DataType, EpilogueScheduleTag, @@ -31,8 +28,6 @@ from vllm_cutlass_library_extension import ( VLLMKernelScheduleTag, ) -# yapf: enable - # # Generator templating # diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py index 2b7f0beab2..acbfd8cda4 100644 --- a/examples/others/tensorize_vllm_model.py +++ b/examples/others/tensorize_vllm_model.py @@ -21,8 +21,6 @@ from vllm.utils import FlexibleArgumentParser logger = logging.getLogger() -# yapf conflicts with isort for this docstring -# yapf: disable """ tensorize_vllm_model.py is a script that can be used to serialize and deserialize vLLM models. These models can be loaded using tensorizer @@ -132,7 +130,8 @@ def get_parser(): "can be loaded using tensorizer directly to the GPU " "extremely quickly. Tensor encryption and decryption is " "also supported, although libsodium must be installed to " - "use it.") + "use it." + ) parser = EngineArgs.add_cli_args(parser) parser.add_argument( @@ -144,13 +143,14 @@ def get_parser(): "along with the model by instantiating a TensorizerConfig object, " "creating a dict from it with TensorizerConfig.to_serializable(), " "and passing it to LoRARequest's initializer with the kwarg " - "tensorizer_config_dict." + "tensorizer_config_dict.", ) - subparsers = parser.add_subparsers(dest='command', required=True) + subparsers = parser.add_subparsers(dest="command", required=True) serialize_parser = subparsers.add_parser( - 'serialize', help="Serialize a model to `--serialized-directory`") + "serialize", help="Serialize a model to `--serialized-directory`" + ) serialize_parser.add_argument( "--suffix", @@ -163,7 +163,9 @@ def get_parser(): "`--suffix` is `v1`, the serialized model tensors will be " "saved to " "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. " - "If none is provided, a random UUID will be used.")) + "If none is provided, a random UUID will be used." + ), + ) serialize_parser.add_argument( "--serialized-directory", type=str, @@ -175,108 +177,127 @@ def get_parser(): "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will " "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, " "where `suffix` is given by `--suffix` or a random UUID if not " - "provided.") + "provided.", + ) serialize_parser.add_argument( "--serialization-kwargs", type=tensorizer_kwargs_arg, required=False, - help=("A JSON string containing additional keyword arguments to " - "pass to Tensorizer's TensorSerializer during " - "serialization.")) + help=( + "A JSON string containing additional keyword arguments to " + "pass to Tensorizer's TensorSerializer during " + "serialization." + ), + ) serialize_parser.add_argument( "--keyfile", type=str, required=False, - help=("Encrypt the model weights with a randomly-generated binary key," - " and save the key at this path")) + help=( + "Encrypt the model weights with a randomly-generated binary key," + " and save the key at this path" + ), + ) deserialize_parser = subparsers.add_parser( - 'deserialize', - help=("Deserialize a model from `--path-to-tensors`" - " to verify it can be loaded and used.")) + "deserialize", + help=( + "Deserialize a model from `--path-to-tensors`" + " to verify it can be loaded and used." + ), + ) deserialize_parser.add_argument( "--path-to-tensors", type=str, required=False, - help="The local path or S3 URI to the model tensors to deserialize. ") + help="The local path or S3 URI to the model tensors to deserialize. ", + ) deserialize_parser.add_argument( "--serialized-directory", type=str, required=False, help="Directory with model artifacts for loading. Assumes a " - "model.tensors file exists therein. Can supersede " - "--path-to-tensors.") + "model.tensors file exists therein. Can supersede " + "--path-to-tensors.", + ) deserialize_parser.add_argument( "--keyfile", type=str, required=False, - help=("Path to a binary key to use to decrypt the model weights," - " if the model was serialized with encryption")) + help=( + "Path to a binary key to use to decrypt the model weights," + " if the model was serialized with encryption" + ), + ) deserialize_parser.add_argument( "--deserialization-kwargs", type=tensorizer_kwargs_arg, required=False, - help=("A JSON string containing additional keyword arguments to " - "pass to Tensorizer's `TensorDeserializer` during " - "deserialization.")) + help=( + "A JSON string containing additional keyword arguments to " + "pass to Tensorizer's `TensorDeserializer` during " + "deserialization." + ), + ) TensorizerArgs.add_cli_args(deserialize_parser) return parser -def merge_extra_config_with_tensorizer_config(extra_cfg: dict, - cfg: TensorizerConfig): + +def merge_extra_config_with_tensorizer_config(extra_cfg: dict, cfg: TensorizerConfig): for k, v in extra_cfg.items(): if hasattr(cfg, k): setattr(cfg, k, v) logger.info( "Updating TensorizerConfig with %s from " - "--model-loader-extra-config provided", k + "--model-loader-extra-config provided", + k, ) + def deserialize(args, tensorizer_config): if args.lora_path: tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir - llm = LLM(model=args.model, - load_format="tensorizer", - tensor_parallel_size=args.tensor_parallel_size, - model_loader_extra_config=tensorizer_config, - enable_lora=True, + llm = LLM( + model=args.model, + load_format="tensorizer", + tensor_parallel_size=args.tensor_parallel_size, + model_loader_extra_config=tensorizer_config, + enable_lora=True, ) sampling_params = SamplingParams( - temperature=0, - max_tokens=256, - stop=["[/assistant]"] + temperature=0, max_tokens=256, stop=["[/assistant]"] ) # Truncating this as the extra text isn't necessary - prompts = [ - "[user] Write a SQL query to answer the question based on ..." - ] + prompts = ["[user] Write a SQL query to answer the question based on ..."] # Test LoRA load print( llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest("sql-lora", - 1, - args.lora_path, - tensorizer_config_dict = tensorizer_config - .to_serializable()) + prompts, + sampling_params, + lora_request=LoRARequest( + "sql-lora", + 1, + args.lora_path, + tensorizer_config_dict=tensorizer_config.to_serializable(), + ), ) ) else: - llm = LLM(model=args.model, - load_format="tensorizer", - tensor_parallel_size=args.tensor_parallel_size, - model_loader_extra_config=tensorizer_config + llm = LLM( + model=args.model, + load_format="tensorizer", + tensor_parallel_size=args.tensor_parallel_size, + model_loader_extra_config=tensorizer_config, ) return llm @@ -285,17 +306,20 @@ def main(): parser = get_parser() args = parser.parse_args() - s3_access_key_id = (getattr(args, 's3_access_key_id', None) - or os.environ.get("S3_ACCESS_KEY_ID", None)) - s3_secret_access_key = (getattr(args, 's3_secret_access_key', None) - or os.environ.get("S3_SECRET_ACCESS_KEY", None)) - s3_endpoint = (getattr(args, 's3_endpoint', None) - or os.environ.get("S3_ENDPOINT_URL", None)) + s3_access_key_id = getattr(args, "s3_access_key_id", None) or os.environ.get( + "S3_ACCESS_KEY_ID", None + ) + s3_secret_access_key = getattr( + args, "s3_secret_access_key", None + ) or os.environ.get("S3_SECRET_ACCESS_KEY", None) + s3_endpoint = getattr(args, "s3_endpoint", None) or os.environ.get( + "S3_ENDPOINT_URL", None + ) credentials = { "s3_access_key_id": s3_access_key_id, "s3_secret_access_key": s3_secret_access_key, - "s3_endpoint": s3_endpoint + "s3_endpoint": s3_endpoint, } model_ref = args.model @@ -309,25 +333,25 @@ def main(): if args.model_loader_extra_config: extra_config = json.loads(args.model_loader_extra_config) - - tensorizer_dir = (args.serialized_directory or - extra_config.get("tensorizer_dir")) - tensorizer_uri = (getattr(args, "path_to_tensors", None) - or extra_config.get("tensorizer_uri")) + tensorizer_dir = args.serialized_directory or extra_config.get("tensorizer_dir") + tensorizer_uri = getattr(args, "path_to_tensors", None) or extra_config.get( + "tensorizer_uri" + ) if tensorizer_dir and tensorizer_uri: - parser.error("--serialized-directory and --path-to-tensors " - "cannot both be provided") + parser.error( + "--serialized-directory and --path-to-tensors cannot both be provided" + ) if not tensorizer_dir and not tensorizer_uri: - parser.error("Either --serialized-directory or --path-to-tensors " - "must be provided") - + parser.error( + "Either --serialized-directory or --path-to-tensors must be provided" + ) if args.command == "serialize": engine_args = EngineArgs.from_cli_args(args) - input_dir = tensorizer_dir.rstrip('/') + input_dir = tensorizer_dir.rstrip("/") suffix = args.suffix if args.suffix else uuid.uuid4().hex base_path = f"{input_dir}/vllm/{model_ref}/{suffix}" if engine_args.tensor_parallel_size > 1: @@ -339,15 +363,14 @@ def main(): tensorizer_uri=model_path, encryption_keyfile=keyfile, serialization_kwargs=args.serialization_kwargs or {}, - **credentials + **credentials, ) if args.lora_path: tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir tensorize_lora_adapter(args.lora_path, tensorizer_config) - merge_extra_config_with_tensorizer_config(extra_config, - tensorizer_config) + merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config) tensorize_vllm_model(engine_args, tensorizer_config) elif args.command == "deserialize": @@ -356,11 +379,10 @@ def main(): tensorizer_dir=args.serialized_directory, encryption_keyfile=keyfile, deserialization_kwargs=args.deserialization_kwargs or {}, - **credentials + **credentials, ) - merge_extra_config_with_tensorizer_config(extra_config, - tensorizer_config) + merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config) deserialize(args, tensorizer_config) else: raise ValueError("Either serialize or deserialize must be specified.") diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py index 7e3a230b5f..16a4271655 100644 --- a/tests/compile/test_silu_mul_quant_fusion.py +++ b/tests/compile/test_silu_mul_quant_fusion.py @@ -8,16 +8,11 @@ import torch import vllm.envs as envs from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant - -# yapf conflicts with isort for this block -# yapf: disable from vllm.compilation.activation_quant_fusion import ( FUSED_OPS, SILU_MUL_OP, ActivationQuantFusionPass, ) - -# yapf: enable from vllm.compilation.fusion import QUANT_OPS from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.post_cleanup import PostCleanupPass diff --git a/tests/distributed/test_expert_parallel.py b/tests/distributed/test_expert_parallel.py index 68ac52af30..94f0ece497 100644 --- a/tests/distributed/test_expert_parallel.py +++ b/tests/distributed/test_expert_parallel.py @@ -107,10 +107,8 @@ class EPTestSettings: # NOTE: You can adjust tp_base locally to fit the model in GPU # The values displayed here are only a rough indicator of the size of the model -# yapf: disable TEST_MODELS = { - "deepseek-ai/DeepSeek-V2-Lite-Chat": EPTestSettings.fast( - trust_remote_code=True), + "deepseek-ai/DeepSeek-V2-Lite-Chat": EPTestSettings.fast(trust_remote_code=True), "mistralai/Mixtral-8x7B-Instruct-v0.1": EPTestSettings.fast(tp_base=4), } @@ -192,22 +190,24 @@ def _compare_tp( ] try: - compare_two_settings(model_name, - ep_args, - tp_args, - ep_env, - tp_env, - method=method, - max_wait_seconds=360) + compare_two_settings( + model_name, + ep_args, + tp_args, + ep_env, + tp_env, + method=method, + max_wait_seconds=360, + ) except Exception: raise @pytest.mark.parametrize( - ("model_name", "parallel_setup", "distributed_backend", "runner", - "test_options"), + ("model_name", "parallel_setup", "distributed_backend", "runner", "test_options"), [ - params for model_name, settings in TEST_MODELS.items() + params + for model_name, settings in TEST_MODELS.items() for params in settings.iter_params(model_name) ], ) @@ -220,10 +220,12 @@ def test_ep( test_options: EPTestOptions, num_gpus_available, ): - _compare_tp(model_name, - parallel_setup, - distributed_backend, - runner, - test_options, - num_gpus_available, - method="generate") + _compare_tp( + model_name, + parallel_setup, + distributed_backend, + runner, + test_options, + num_gpus_available, + method="generate", + ) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 26ee60c150..119e8e7621 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -100,7 +100,6 @@ class PPTestSettings: # NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU # The values displayed here are only a rough indicator of the size of the model -# yapf: disable TEXT_GENERATION_MODELS = { # [Decoder-only] # Uses Llama @@ -150,7 +149,9 @@ TEXT_GENERATION_MODELS = { "adept/persimmon-8b-chat": PPTestSettings.fast(), "microsoft/phi-2": PPTestSettings.fast(), "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(), - "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(multi_node_only=True, load_format="dummy"), # noqa: E501 + "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed( + multi_node_only=True, load_format="dummy" + ), # noqa: E501 "Qwen/Qwen-7B-Chat": PPTestSettings.fast(), "Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(), "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(), @@ -196,7 +197,6 @@ MULTIMODAL_MODELS = { "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(), "fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(), } -# yapf: enable # NOTE: You can update this on your local machine to run specific tests TEST_MODELS = [ diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index c0eb0e5ac5..9d367349fc 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -287,29 +287,15 @@ def test_prefix_cache_default(): assert not engine_args.enable_prefix_caching -# yapf: disable -@pytest.mark.parametrize(("arg", "expected", "option"), [ - (None, None, "mm-processor-kwargs"), - ("{}", {}, "mm-processor-kwargs"), - ( - '{"num_crops": 4}', - { - "num_crops": 4 - }, - "mm-processor-kwargs" - ), - ( - '{"foo": {"bar": "baz"}}', - { - "foo": - { - "bar": "baz" - } - }, - "mm-processor-kwargs" - ), -]) -# yapf: enable +@pytest.mark.parametrize( + ("arg", "expected", "option"), + [ + (None, None, "mm-processor-kwargs"), + ("{}", {}, "mm-processor-kwargs"), + ('{"num_crops": 4}', {"num_crops": 4}, "mm-processor-kwargs"), + ('{"foo": {"bar": "baz"}}', {"foo": {"bar": "baz"}}, "mm-processor-kwargs"), + ], +) def test_composite_arg_parser(arg, expected, option): parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) if arg is None: @@ -321,8 +307,7 @@ def test_composite_arg_parser(arg, expected, option): def test_human_readable_model_len(): # `exit_on_error` disabled to test invalid values below - parser = EngineArgs.add_cli_args( - FlexibleArgumentParser(exit_on_error=False)) + parser = EngineArgs.add_cli_args(FlexibleArgumentParser(exit_on_error=False)) args = parser.parse_args([]) assert args.max_model_len is None diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 7ddad4d513..975ca53a3a 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -15,6 +15,7 @@ from vllm.assets.video import VideoAsset from vllm.config import ModelConfig from vllm.entrypoints.chat_utils import ( _try_extract_ast, + apply_mistral_chat_template, load_chat_template, parse_chat_messages, parse_chat_messages_futures, @@ -1855,17 +1856,17 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa # NOTE: Qwen2-Audio default chat template is specially defined inside # processor class instead of using `tokenizer_config.json` -# yapf: disable @pytest.mark.parametrize( ("model", "expected_format"), - [(PHI3V_MODEL_ID, "string"), - (QWEN2VL_MODEL_ID, "openai"), - (QWEN25VL_MODEL_ID, "openai"), - (ULTRAVOX_MODEL_ID, "string"), - (QWEN2AUDIO_MODEL_ID, "openai"), - (LLAMA_GUARD_MODEL_ID, "openai")], + [ + (PHI3V_MODEL_ID, "string"), + (QWEN2VL_MODEL_ID, "openai"), + (QWEN25VL_MODEL_ID, "openai"), + (ULTRAVOX_MODEL_ID, "string"), + (QWEN2AUDIO_MODEL_ID, "openai"), + (LLAMA_GUARD_MODEL_ID, "openai"), + ], ) -# yapf: enable def test_resolve_content_format_hf_defined(model, expected_format): model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") @@ -1879,7 +1880,8 @@ def test_resolve_content_format_hf_defined(model, expected_format): hf_overrides=model_info.hf_overrides, skip_tokenizer_init=model_info.skip_tokenizer_init, enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype) + dtype=model_info.dtype, + ) tokenizer = get_tokenizer( model, @@ -1911,18 +1913,18 @@ def test_resolve_content_format_hf_defined(model, expected_format): assert resolved_format == expected_format -# yapf: disable @pytest.mark.parametrize( ("model", "expected_format"), - [("Salesforce/blip2-opt-2.7b", "string"), - ("facebook/chameleon-7b", "string"), - ("deepseek-ai/deepseek-vl2-tiny", "string"), - ("adept/fuyu-8b", "string"), - ("google/paligemma-3b-mix-224", "string"), - ("Qwen/Qwen-VL", "string"), - ("Qwen/Qwen-VL-Chat", "string")], + [ + ("Salesforce/blip2-opt-2.7b", "string"), + ("facebook/chameleon-7b", "string"), + ("deepseek-ai/deepseek-vl2-tiny", "string"), + ("adept/fuyu-8b", "string"), + ("google/paligemma-3b-mix-224", "string"), + ("Qwen/Qwen-VL", "string"), + ("Qwen/Qwen-VL-Chat", "string"), + ], ) -# yapf: enable def test_resolve_content_format_fallbacks(model, expected_format): model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") @@ -1936,7 +1938,8 @@ def test_resolve_content_format_fallbacks(model, expected_format): hf_overrides=model_info.hf_overrides, skip_tokenizer_init=model_info.skip_tokenizer_init, enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype) + dtype=model_info.dtype, + ) tokenizer = get_tokenizer( model_config.tokenizer, @@ -1968,30 +1971,30 @@ def test_resolve_content_format_fallbacks(model, expected_format): assert resolved_format == expected_format -# yapf: disable @pytest.mark.parametrize( ("template_path", "expected_format"), - [("template_alpaca.jinja", "string"), - ("template_baichuan.jinja", "string"), - ("template_chatglm.jinja", "string"), - ("template_chatglm2.jinja", "string"), - ("template_chatml.jinja", "string"), - ("template_dse_qwen2_vl.jinja", "openai"), - ("template_falcon_180b.jinja", "string"), - ("template_falcon.jinja", "string"), - ("template_inkbot.jinja", "string"), - ("template_teleflm.jinja", "string"), - ("template_vlm2vec_phi3v.jinja", "openai"), - ("template_vlm2vec_qwen2vl.jinja", "openai"), - ("tool_chat_template_granite_20b_fc.jinja", "string"), - ("tool_chat_template_hermes.jinja", "string"), - ("tool_chat_template_internlm2_tool.jinja", "string"), - ("tool_chat_template_llama3.1_json.jinja", "openai"), - ("tool_chat_template_llama3.2_json.jinja", "openai"), - ("tool_chat_template_mistral_parallel.jinja", "string"), - ("tool_chat_template_mistral.jinja", "string")], + [ + ("template_alpaca.jinja", "string"), + ("template_baichuan.jinja", "string"), + ("template_chatglm.jinja", "string"), + ("template_chatglm2.jinja", "string"), + ("template_chatml.jinja", "string"), + ("template_dse_qwen2_vl.jinja", "openai"), + ("template_falcon_180b.jinja", "string"), + ("template_falcon.jinja", "string"), + ("template_inkbot.jinja", "string"), + ("template_teleflm.jinja", "string"), + ("template_vlm2vec_phi3v.jinja", "openai"), + ("template_vlm2vec_qwen2vl.jinja", "openai"), + ("tool_chat_template_granite_20b_fc.jinja", "string"), + ("tool_chat_template_hermes.jinja", "string"), + ("tool_chat_template_internlm2_tool.jinja", "string"), + ("tool_chat_template_llama3.1_json.jinja", "openai"), + ("tool_chat_template_llama3.2_json.jinja", "openai"), + ("tool_chat_template_mistral_parallel.jinja", "string"), + ("tool_chat_template_mistral.jinja", "string"), + ], ) -# yapf: enable def test_resolve_content_format_examples(template_path, expected_format): model_config = ModelConfig( PHI3V_MODEL_ID, # Dummy @@ -2024,40 +2027,34 @@ def test_resolve_content_format_examples(template_path, expected_format): assert resolved_format == expected_format -def test_parse_chat_messages_include_thinking_chunk(mistral_model_config, - mistral_tokenizer): - messages = [{ - "role": - "system", - "content": [{ - "type": "text", - "text": "You are a helpful assistant." - }, { - "type": - "thinking", - "closed": - True, - "thinking": - "Only return the answer when you are confident." - }] - }, { - "role": "user", - "content": "What is 2+2?" - }, { - "role": - "assistant", - "content": [{ - "type": "text", - "text": "Let me think about it." - }, { - "type": "thinking", - "closed": True, - "thinking": "2+2 = 4" - }, { - "type": "text", - "text": "The answer is 4.", - }], - }] +def test_parse_chat_messages_include_thinking_chunk( + mistral_model_config, mistral_tokenizer +): + messages = [ + { + "role": "system", + "content": [ + {"type": "text", "text": "You are a helpful assistant."}, + { + "type": "thinking", + "closed": True, + "thinking": "Only return the answer when you are confident.", + }, + ], + }, + {"role": "user", "content": "What is 2+2?"}, + { + "role": "assistant", + "content": [ + {"type": "text", "text": "Let me think about it."}, + {"type": "thinking", "closed": True, "thinking": "2+2 = 4"}, + { + "type": "text", + "text": "The answer is 4.", + }, + ], + }, + ] conversation_with_thinking, _, _ = parse_chat_messages( messages, @@ -2066,122 +2063,105 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config, content_format="openai", ) - expected_conversation = [{ - "role": - "system", - "content": [{ - "type": "text", - "text": "You are a helpful assistant." - }, { - "type": "text", - "text": "Only return the answer when you are confident." - }], - }, { - "role": - "user", - "content": [{ - "type": "text", - "text": "What is 2+2?" - }], - }, { - "role": - "assistant", - "content": [ - { - "type": "text", - "text": "Let me think about it." - }, - { - "type": "text", - "text": "2+2 = 4" - }, - { - "type": "text", - "text": "The answer is 4." - }, - ] - }] + expected_conversation = [ + { + "role": "system", + "content": [ + {"type": "text", "text": "You are a helpful assistant."}, + { + "type": "text", + "text": "Only return the answer when you are confident.", + }, + ], + }, + { + "role": "user", + "content": [{"type": "text", "text": "What is 2+2?"}], + }, + { + "role": "assistant", + "content": [ + {"type": "text", "text": "Let me think about it."}, + {"type": "text", "text": "2+2 = 4"}, + {"type": "text", "text": "The answer is 4."}, + ], + }, + ] assert conversation_with_thinking == expected_conversation def test_apply_mistral_chat_template_thinking_chunk(): - # Moved import here to avoid yapf and isort conflicts - from vllm.entrypoints.chat_utils import apply_mistral_chat_template - messages = [{ - "role": - "system", - "content": [{ - "type": "text", - "text": "You are a helpful assistant." - }, { - "type": - "thinking", - "closed": - True, - "thinking": - "Only return the answer when you are confident." - }] - }, { - "role": "user", - "content": "What is 2+2?" - }, { - "role": - "assistant", - "content": [{ - "type": "text", - "text": "Let me think about it." - }, { - "type": "thinking", - "closed": True, - "thinking": "2+2 = 4" - }, { - "type": "text", - "text": "The answer is 4.", - }], - }, { - "role": "user", - "content": "Thanks, what is 3+3?" - }] + messages = [ + { + "role": "system", + "content": [ + {"type": "text", "text": "You are a helpful assistant."}, + { + "type": "thinking", + "closed": True, + "thinking": "Only return the answer when you are confident.", + }, + ], + }, + {"role": "user", "content": "What is 2+2?"}, + { + "role": "assistant", + "content": [ + {"type": "text", "text": "Let me think about it."}, + {"type": "thinking", "closed": True, "thinking": "2+2 = 4"}, + { + "type": "text", + "text": "The answer is 4.", + }, + ], + }, + {"role": "user", "content": "Thanks, what is 3+3?"}, + ] # TODO(Julien): upon model release change to a tokenizer already configured. # ================================================================= mistral_tokenizer = MistralTokenizer.from_pretrained( - "mistralai/Devstral-Small-2507") + "mistralai/Devstral-Small-2507" + ) assert isinstance(mistral_tokenizer.tokenizer, Tekkenizer) # Add think special tokens to the tokenizer mistral_tokenizer.tokenizer._all_special_tokens[35] = SpecialTokenInfo( - rank=35, is_control=True, token_str=SpecialTokens.begin_think.value) + rank=35, is_control=True, token_str=SpecialTokens.begin_think.value + ) mistral_tokenizer.tokenizer._all_special_tokens[36] = SpecialTokenInfo( - rank=36, is_control=True, token_str=SpecialTokens.end_think.value) + rank=36, is_control=True, token_str=SpecialTokens.end_think.value + ) mistral_tokenizer.tokenizer._special_tokens_reverse_vocab = { k: v - for k, v in - mistral_tokenizer.tokenizer._special_tokens_reverse_vocab.items() + for k, v in mistral_tokenizer.tokenizer._special_tokens_reverse_vocab.items() if v not in {35, 36} } mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[ - SpecialTokens.begin_think.value] = 35 + SpecialTokens.begin_think.value + ] = 35 mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[ - SpecialTokens.end_think.value] = 36 + SpecialTokens.end_think.value + ] = 36 mistral_tokenizer.instruct.BEGIN_THINK = 35 mistral_tokenizer.instruct.END_THINK = 36 # ================================================================= - tokens_ids = apply_mistral_chat_template(mistral_tokenizer, - messages, - chat_template=None, - tools=None) + tokens_ids = apply_mistral_chat_template( + mistral_tokenizer, messages, chat_template=None, tools=None + ) string_tokens = mistral_tokenizer.mistral.decode( - tokens_ids, special_token_policy=SpecialTokenPolicy.KEEP) + tokens_ids, special_token_policy=SpecialTokenPolicy.KEEP + ) expected_tokens = ( r"[SYSTEM_PROMPT]You are a helpful assistant.[THINK]Only return the" r" answer when you are confident.[/THINK][/SYSTEM_PROMPT]" r"[INST]What is 2+2?[/INST]" r"Let me think about it.[THINK]2+2 = 4[/THINK]The answer is 4." - r"[INST]Thanks, what is 3+3?[/INST]") + r"[INST]Thanks, what is 3+3?[/INST]" + ) assert string_tokens == expected_tokens @@ -2192,37 +2172,32 @@ def test_parse_chat_messages_single_empty_audio_with_uuid( ): audio_uuid = "abcd" conversation, mm_data, mm_uuids = parse_chat_messages( - [{ - "role": - "user", - "content": [ - { - "type": "input_audio", - "input_audio": {}, - "uuid": audio_uuid, - }, - { - "type": "text", - "text": "What does the audio say?" - }, - ], - }], + [ + { + "role": "user", + "content": [ + { + "type": "input_audio", + "input_audio": {}, + "uuid": audio_uuid, + }, + {"type": "text", "text": "What does the audio say?"}, + ], + } + ], qwen2_audio_model_config, qwen2_audio_tokenizer, content_format="string", ) - assert conversation == [{ - "role": - "user", - "content": - "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?" - }] + assert conversation == [ + { + "role": "user", + "content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?", + } + ] _assert_mm_data_inputs(mm_data, {"audio": 1}) - _assert_mm_uuids(mm_uuids, - 1, - modality="audio", - expected_uuids=[audio_uuid]) + _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid]) @pytest.mark.asyncio @@ -2232,34 +2207,29 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async( ): audio_uuid = "abcd" conversation, mm_future, mm_uuids = parse_chat_messages_futures( - [{ - "role": - "user", - "content": [ - { - "type": "input_audio", - "input_audio": {}, - "uuid": audio_uuid, - }, - { - "type": "text", - "text": "What does the audio say?" - }, - ], - }], + [ + { + "role": "user", + "content": [ + { + "type": "input_audio", + "input_audio": {}, + "uuid": audio_uuid, + }, + {"type": "text", "text": "What does the audio say?"}, + ], + } + ], qwen2_audio_model_config, qwen2_audio_tokenizer, content_format="string", ) - assert conversation == [{ - "role": - "user", - "content": - "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?" - }] + assert conversation == [ + { + "role": "user", + "content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?", + } + ] _assert_mm_data_inputs(await mm_future, {"audio": 1}) - _assert_mm_uuids(mm_uuids, - 1, - modality="audio", - expected_uuids=[audio_uuid]) + _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid]) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 7c29a85298..695e06e7c1 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -12,9 +12,6 @@ import torch import torch.nn.functional as F from vllm.config.lora import LoRAConfig - -# yapf conflicts with isort for this block -# yapf: disable from vllm.lora.layers import ( BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, @@ -32,8 +29,6 @@ from vllm.lora.layers import ( RowParallelLinearWithShardedLoRA, VocabParallelEmbeddingWithLoRA, ) - -# yapf: enable from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.punica_wrapper import get_punica_wrapper from vllm.model_executor.layers.linear import ( diff --git a/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py b/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py index ed86a6b8b1..57db1f98ba 100644 --- a/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py +++ b/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py @@ -17,8 +17,6 @@ import vllm.model_executor.model_loader.tensorizer from tests.utils import VLLM_PATH, RemoteOpenAIServer from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs - -# yapf: disable from vllm.model_executor.model_loader.tensorizer import ( TensorizerConfig, TensorSerializer, @@ -29,8 +27,6 @@ from vllm.model_executor.model_loader.tensorizer import ( from vllm.model_executor.model_loader.tensorizer_loader import ( BLACKLISTED_TENSORIZER_ARGS, ) - -# yapf: enable from vllm.utils import PlaceholderModule from .conftest import DummyExecutor, assert_from_collective_rpc diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 143d60fbf9..9168778a16 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -45,18 +45,17 @@ from .vlm_utils.types import ( if current_platform.is_rocm(): os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0" -# yapf: disable COMMON_BROADCAST_SETTINGS = { "test_type": VLMTestType.IMAGE, "dtype": "half", "max_tokens": 5, "tensor_parallel_size": 2, "hf_model_kwargs": {"device_map": "auto"}, - "image_size_factors": [(.25, 0.5, 1.0)], + "image_size_factors": [(0.25, 0.5, 1.0)], "distributed_executor_backend": ( "ray", "mp", - ) + ), } ### Test configuration for specific models @@ -96,22 +95,20 @@ VLM_TEST_SETTINGS = { #### Core tests to always run in the CI "llava": VLMTestInfo( models=["llava-hf/llava-1.5-7b-hf"], - test_type=( - VLMTestType.EMBEDDING, - VLMTestType.IMAGE, - VLMTestType.CUSTOM_INPUTS - ), + test_type=(VLMTestType.EMBEDDING, VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS), prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", convert_assets_to_embeddings=model_utils.get_llava_embeddings, max_model_len=4096, auto_cls=AutoModelForImageTextToText, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, - custom_test_opts=[CustomTestOptions( - inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs( - formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:" - ), - limit_mm_per_prompt={"image": 4}, - )], + custom_test_opts=[ + CustomTestOptions( + inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs( + formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:" + ), + limit_mm_per_prompt={"image": 4}, + ) + ], # TODO: Revert to "auto" when CPU backend can use torch > 2.6 dtype="bfloat16" if current_platform.is_cpu() else "auto", marks=[pytest.mark.core_model, pytest.mark.cpu_model], @@ -120,27 +117,27 @@ VLM_TEST_SETTINGS = { models=["google/paligemma-3b-mix-224"], test_type=VLMTestType.IMAGE, prompt_formatter=identity, - img_idx_to_prompt = lambda idx: "", + img_idx_to_prompt=lambda idx: "", # Paligemma uses its own sample prompts because the default one fails - single_image_prompts=IMAGE_ASSETS.prompts({ - "stop_sign": "caption es", - "cherry_blossom": "What is in the picture?", - }), + single_image_prompts=IMAGE_ASSETS.prompts( + { + "stop_sign": "caption es", + "cherry_blossom": "What is in the picture?", + } + ), auto_cls=AutoModelForImageTextToText, vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, dtype="bfloat16", - marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")], # noqa: E501 + marks=[ + pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask") + ], # noqa: E501 ), "qwen2_5_vl": VLMTestInfo( models=["Qwen/Qwen2.5-VL-3B-Instruct"], - test_type=( - VLMTestType.IMAGE, - VLMTestType.MULTI_IMAGE, - VLMTestType.VIDEO - ), - prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 - img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 - video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO), + prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 + video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 max_model_len=4096, max_num_seqs=2, auto_cls=AutoModelForImageTextToText, @@ -150,17 +147,13 @@ VLM_TEST_SETTINGS = { ), "qwen2_5_omni": VLMTestInfo( models=["Qwen/Qwen2.5-Omni-3B"], - test_type=( - VLMTestType.IMAGE, - VLMTestType.MULTI_IMAGE, - VLMTestType.VIDEO - ), - prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 - img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501 - video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501 + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO), + prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501 + video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501 max_model_len=4096, max_num_seqs=2, - num_logprobs= 6 if current_platform.is_cpu() else 5, + num_logprobs=6 if current_platform.is_cpu() else 5, auto_cls=AutoModelForTextToWaveform, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner, @@ -168,9 +161,9 @@ VLM_TEST_SETTINGS = { marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), "ultravox": VLMTestInfo( - models = ["fixie-ai/ultravox-v0_5-llama-3_2-1b"], + models=["fixie-ai/ultravox-v0_5-llama-3_2-1b"], test_type=VLMTestType.AUDIO, - prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 + prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 audio_idx_to_prompt=lambda idx: "<|audio|>", max_model_len=4096, max_num_seqs=2, @@ -184,9 +177,11 @@ VLM_TEST_SETTINGS = { "llava-onevision-transformers": VLMTestInfo( models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"], test_type=VLMTestType.IMAGE, - prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 + prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 max_model_len=16384, - hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 + hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs( + "llava-hf/llava-onevision-qwen2-0.5b-ov-hf" + ), # noqa: E501 auto_cls=AutoModelForImageTextToText, vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, image_size_factors=[(0.25, 0.5, 1.0)], @@ -201,7 +196,7 @@ VLM_TEST_SETTINGS = { "idefics3-transformers": VLMTestInfo( models=["HuggingFaceTB/SmolVLM-256M-Instruct"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}\nAssistant:", # noqa: E501 + prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}\nAssistant:", # noqa: E501 img_idx_to_prompt=lambda idx: "", max_model_len=8192, max_num_seqs=2, @@ -217,8 +212,8 @@ VLM_TEST_SETTINGS = { "qwen2_5_vl-transformers": VLMTestInfo( models=["Qwen/Qwen2.5-VL-3B-Instruct"], test_type=VLMTestType.IMAGE, - prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 - img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 + prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 max_model_len=4096, max_num_seqs=2, auto_cls=AutoModelForImageTextToText, @@ -228,23 +223,24 @@ VLM_TEST_SETTINGS = { "model_impl": "transformers", }, # FIXME: Investigate mrope issue - marks=[large_gpu_mark(min_gb=32), - pytest.mark.skip(reason="Mrope issue")], + marks=[large_gpu_mark(min_gb=32), pytest.mark.skip(reason="Mrope issue")], ), #### Extended model tests "aria": VLMTestInfo( models=["rhymes-ai/Aria"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501 + prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501 img_idx_to_prompt=lambda idx: "<|img|>\n", max_model_len=4096, max_num_seqs=2, auto_cls=AutoModelForImageTextToText, - single_image_prompts=IMAGE_ASSETS.prompts({ - "stop_sign": "Please describe the image shortly.", - "cherry_blossom": "Please infer the season with reason.", # noqa: E501 - }), - multi_image_prompt="Describe the two images shortly.", # noqa: E501 + single_image_prompts=IMAGE_ASSETS.prompts( + { + "stop_sign": "Please describe the image shortly.", + "cherry_blossom": "Please infer the season with reason.", # noqa: E501 + } + ), + multi_image_prompt="Describe the two images shortly.", # noqa: E501 stop_str=["<|im_end|>"], image_size_factors=[(0.10, 0.15)], max_tokens=64, @@ -253,11 +249,13 @@ VLM_TEST_SETTINGS = { "aya_vision": VLMTestInfo( models=["CohereForAI/aya-vision-8b"], test_type=(VLMTestType.IMAGE), - prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501 - single_image_prompts=IMAGE_ASSETS.prompts({ - "stop_sign": "What's the content in the center of the image?", # noqa: E501 - "cherry_blossom": "What is the season?", # noqa: E501 - }), + prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501 + single_image_prompts=IMAGE_ASSETS.prompts( + { + "stop_sign": "What's the content in the center of the image?", # noqa: E501 + "cherry_blossom": "What is the season?", # noqa: E501 + } + ), multi_image_prompt="Describe the two images in detail.", # noqa: E501 max_model_len=4096, max_num_seqs=2, @@ -267,11 +265,13 @@ VLM_TEST_SETTINGS = { "aya_vision-multi_image": VLMTestInfo( models=["CohereForAI/aya-vision-8b"], test_type=(VLMTestType.MULTI_IMAGE), - prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501 - single_image_prompts=IMAGE_ASSETS.prompts({ - "stop_sign": "What's the content in the center of the image?", # noqa: E501 - "cherry_blossom": "What is the season?", # noqa: E501 - }), + prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501 + single_image_prompts=IMAGE_ASSETS.prompts( + { + "stop_sign": "What's the content in the center of the image?", # noqa: E501 + "cherry_blossom": "What is the season?", # noqa: E501 + } + ), multi_image_prompt="Describe the two images in detail.", # noqa: E501 max_model_len=4096, max_num_seqs=2, @@ -297,27 +297,29 @@ VLM_TEST_SETTINGS = { max_num_seqs=2, auto_cls=AutoModelForImageTextToText, # For chameleon, we only compare the sequences - vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2], - hf_output_post_proc = lambda hf_output, model: hf_output[:2], + vllm_output_post_proc=lambda vllm_output, model: vllm_output[:2], + hf_output_post_proc=lambda hf_output, model: hf_output[:2], comparator=check_outputs_equal, max_tokens=8, dtype="bfloat16", ), "deepseek_vl_v2": VLMTestInfo( - models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module + models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501 + prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501 max_model_len=4096, max_num_seqs=2, - single_image_prompts=IMAGE_ASSETS.prompts({ - "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501 - "cherry_blossom": "\nPlease infer the season with reason in details.", # noqa: E501 - }), - multi_image_prompt="image_1:\nimage_2:\nWhich image can we see the car and the tower?", # noqa: E501 + single_image_prompts=IMAGE_ASSETS.prompts( + { + "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501 + "cherry_blossom": "\nPlease infer the season with reason in details.", # noqa: E501 + } + ), + multi_image_prompt="image_1:\nimage_2:\nWhich image can we see the car and the tower?", # noqa: E501 patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner, hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output, stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501 - image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)], + image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)], ), "fuyu": VLMTestInfo( models=["adept/fuyu-8b"], @@ -336,11 +338,13 @@ VLM_TEST_SETTINGS = { "gemma3": VLMTestInfo( models=["google/gemma-3-4b-it"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - prompt_formatter=lambda img_prompt: f"user\n{img_prompt}\nmodel\n", # noqa: E501 - single_image_prompts=IMAGE_ASSETS.prompts({ - "stop_sign": "What's the content in the center of the image?", # noqa: E501 - "cherry_blossom": "What is the season?", # noqa: E501 - }), + prompt_formatter=lambda img_prompt: f"user\n{img_prompt}\nmodel\n", # noqa: E501 + single_image_prompts=IMAGE_ASSETS.prompts( + { + "stop_sign": "What's the content in the center of the image?", # noqa: E501 + "cherry_blossom": "What is the season?", # noqa: E501 + } + ), multi_image_prompt="Describe the two images in detail.", # noqa: E501 max_model_len=4096, max_num_seqs=2, @@ -353,10 +357,12 @@ VLM_TEST_SETTINGS = { models=["zai-org/glm-4v-9b"], test_type=VLMTestType.IMAGE, prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501 - single_image_prompts=IMAGE_ASSETS.prompts({ - "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501 - "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?", # noqa: E501 - }), + single_image_prompts=IMAGE_ASSETS.prompts( + { + "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501 + "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?", # noqa: E501 + } + ), max_model_len=2048, max_num_seqs=2, get_stop_token_ids=lambda tok: [151329, 151336, 151338], @@ -372,8 +378,8 @@ VLM_TEST_SETTINGS = { models=["zai-org/GLM-4.1V-9B-Thinking"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501 - img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501 - video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501 + img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501 + video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501 max_model_len=2048, max_num_seqs=2, get_stop_token_ids=lambda tok: [151329, 151336, 151338], @@ -390,23 +396,27 @@ VLM_TEST_SETTINGS = { max_num_seqs=2, auto_cls=AutoModelForImageTextToText, patch_hf_runner=model_utils.glm4_1v_patch_hf_runner, - custom_test_opts=[CustomTestOptions( - inputs=custom_inputs.video_with_metadata_glm4_1v(), - limit_mm_per_prompt={"video": 1}, - )], + custom_test_opts=[ + CustomTestOptions( + inputs=custom_inputs.video_with_metadata_glm4_1v(), + limit_mm_per_prompt={"video": 1}, + ) + ], marks=[large_gpu_mark(min_gb=32)], ), "h2ovl": VLMTestInfo( - models = [ + models=[ "h2oai/h2ovl-mississippi-800m", "h2oai/h2ovl-mississippi-2b", ], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501 - single_image_prompts=IMAGE_ASSETS.prompts({ - "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501 - "cherry_blossom": "\nWhat is the season?", - }), + prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501 + single_image_prompts=IMAGE_ASSETS.prompts( + { + "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501 + "cherry_blossom": "\nWhat is the season?", + } + ), multi_image_prompt="Image-1: \nImage-2: \nDescribe the two images in short.", # noqa: E501 max_model_len=8192, use_tokenizer_eos=True, @@ -416,7 +426,7 @@ VLM_TEST_SETTINGS = { "idefics3": VLMTestInfo( models=["HuggingFaceTB/SmolVLM-256M-Instruct"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}\nAssistant:", # noqa: E501 + prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}\nAssistant:", # noqa: E501 img_idx_to_prompt=lambda idx: "", max_model_len=8192, max_num_seqs=2, @@ -431,11 +441,13 @@ VLM_TEST_SETTINGS = { # "OpenGVLab/Mono-InternVL-2B", ], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 - single_image_prompts=IMAGE_ASSETS.prompts({ - "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501 - "cherry_blossom": "\nWhat is the season?", - }), + prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 + single_image_prompts=IMAGE_ASSETS.prompts( + { + "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501 + "cherry_blossom": "\nWhat is the season?", + } + ), multi_image_prompt="Image-1: \nImage-2: \nDescribe the two images in short.", # noqa: E501 max_model_len=4096, use_tokenizer_eos=True, @@ -446,7 +458,7 @@ VLM_TEST_SETTINGS = { "OpenGVLab/InternVL3-1B", ], test_type=VLMTestType.VIDEO, - prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 + prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 video_idx_to_prompt=lambda idx: "