Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Harry Mellor
Date: 2025-10-05 15:06:22 +01:00
Committed by: GitHub
Parent: 17edd8a807
Commit: d6953beb91
1,508 changed files with 115,244 additions and 94,146 deletions
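The diff below is almost entirely mechanical: `ruff format` reflows code that yapf wrapped at 80 columns into Black-compatible 88-column style (magic trailing commas, one argument per line at long call sites), normalizes string quotes to double quotes, and ruff's lint rules take over import sorting from isort. As a rough sketch of the pyproject.toml change such a migration involves (the exact configuration adopted by this commit is not shown in this excerpt, so the section contents below are assumptions):

```toml
# Hypothetical sketch only; the rule selection actually adopted by this
# commit may differ.

# Removed: the old formatter sections, e.g.
#   [tool.yapf]
#   column_limit = 80
#   [tool.isort]
#   ...

[tool.ruff]
line-length = 88              # ruff format follows Black's default width

[tool.ruff.lint]
select = [
    "E",  # pycodestyle errors
    "F",  # pyflakes
    "I",  # isort-compatible import sorting, replacing the isort tool
]

[tool.ruff.format]
docstring-code-format = true  # also reformat code embedded in docstrings
```

With the config in place, the tree is typically rewritten in one pass with `ruff format .` followed by `ruff check --fix .`, which is what produces a bulk diff of this shape.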


@@ -51,8 +51,9 @@ AITER_MODEL_LIST = [
pytest.param(
"google/gemma-1.1-2b-it", # gemma
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
pytest.mark.core_model,
pytest.mark.cpu_model,
pytest.mark.slow_test,
],
),
pytest.param(
@@ -65,8 +66,7 @@ AITER_MODEL_LIST = [
pytest.param(
"openbmb/MiniCPM3-4B",
# fused_moe not supported on CPU
marks=[pytest.mark.core_model,
large_gpu_mark(min_gb=32)],
marks=[pytest.mark.core_model, large_gpu_mark(min_gb=32)],
),
pytest.param(
"facebook/opt-125m", # opt
@@ -82,8 +82,9 @@ AITER_MODEL_LIST = [
pytest.param(
"Qwen/Qwen2.5-0.5B-Instruct", # qwen2
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
pytest.mark.core_model,
pytest.mark.cpu_model,
pytest.mark.slow_test,
],
),
pytest.param(
@@ -100,16 +101,25 @@ AITER_MODEL_LIST = [
marks=[pytest.mark.cpu_model],
),
pytest.param("swiss-ai/Apertus-8B-2509"), # apertus
])
],
)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
)
@pytest.mark.parametrize("use_prompt_embeds", [True, False])
def test_models(hf_runner, vllm_runner, example_prompts, model: str,
max_tokens: int, num_logprobs: int, use_rocm_aiter: bool,
use_prompt_embeds: bool, monkeypatch) -> None:
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
max_tokens: int,
num_logprobs: int,
use_rocm_aiter: bool,
use_prompt_embeds: bool,
monkeypatch,
) -> None:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
@@ -125,34 +135,37 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
prompt_embeds: Optional[list[torch.Tensor]] = ([] if use_prompt_embeds
else None)
prompt_embeds: Optional[list[torch.Tensor]] = [] if use_prompt_embeds else None
prompt_token_ids = []
for prompt in example_prompts:
token_ids = hf_model.tokenizer(prompt,
return_tensors="pt").input_ids.to(
hf_model.model.device)
token_ids = hf_model.tokenizer(prompt, return_tensors="pt").input_ids.to(
hf_model.model.device
)
prompt_token_ids.append(token_ids)
if prompt_embeds is not None:
prompt_embeds.append(hf_model.model.get_input_embeddings()(
token_ids).squeeze(0))
prompt_embeds.append(
hf_model.model.get_input_embeddings()(token_ids).squeeze(0)
)
with vllm_runner(
model,
tokenizer_name=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
max_num_seqs=2,
enable_prompt_embeds=use_prompt_embeds,
model,
tokenizer_name=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
max_num_seqs=2,
enable_prompt_embeds=use_prompt_embeds,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
if prompt_embeds is not None:
vllm_outputs_from_embeds = vllm_model.generate_greedy_logprobs(
prompt_embeds, max_tokens, num_logprobs)
prompt_embeds, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,


@@ -11,17 +11,17 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
with monkeypatch.context() as m:
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
with vllm_runner(
model,
load_format="dummy",
model,
load_format="dummy",
) as llm:
if model == "google/gemma-3-4b-it":
normalizers = llm.llm.collective_rpc(
lambda self: self.model_runner.model.language_model.model.
normalizer.cpu().item())
lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item()
)
config = llm.llm.llm_engine.model_config.hf_config.text_config
else:
normalizers = llm.llm.collective_rpc(
lambda self: self.model_runner.model.model.normalizer.cpu(
).item())
lambda self: self.model_runner.model.model.normalizer.cpu().item()
)
config = llm.llm.llm_engine.model_config.hf_config
assert np.allclose(normalizers, config.hidden_size**0.5, rtol=2e-3)


@@ -26,11 +26,13 @@ def test_models(
) -> None:
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,


@@ -24,7 +24,7 @@ SSM_MODELS = [
"tiiuae/falcon-mamba-tiny-dev",
# mamba2-codestral in transformers is broken pending:
# https://github.com/huggingface/transformers/pull/40861
#"yujiepan/mamba2-codestral-v0.1-tiny-random",
# "yujiepan/mamba2-codestral-v0.1-tiny-random",
]
HYBRID_MODELS = [
@@ -65,7 +65,6 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -75,11 +74,13 @@ def test_models(
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -109,13 +110,14 @@ def test_batching(
for_loop_outputs = []
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
for prompt in example_prompts:
single_output, = vllm_model.generate_greedy_logprobs([prompt],
max_tokens,
num_logprobs)
(single_output,) = vllm_model.generate_greedy_logprobs(
[prompt], max_tokens, num_logprobs
)
for_loop_outputs.append(single_output)
batched_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=for_loop_outputs,
@@ -134,8 +136,8 @@ def test_chunked_prefill_with_parallel_sampling(
max_tokens: int,
) -> None:
"""
Tests chunked prefill in conjunction with n > 1.
Tests chunked prefill in conjunction with n > 1.
In this case, prefill is populated with decoding tokens and
we test that it doesn't fail.
@@ -143,16 +145,13 @@ def test_chunked_prefill_with_parallel_sampling(
decoding steps inside a chunked prefill forward pass
(where we have both prefill and decode together)
"""
sampling_params = SamplingParams(n=3,
temperature=1,
seed=0,
max_tokens=max_tokens)
sampling_params = SamplingParams(n=3, temperature=1, seed=0, max_tokens=max_tokens)
with vllm_runner(
model,
enable_chunked_prefill=True,
# forces prefill chunks with decoding
max_num_batched_tokens=MAX_NUM_SEQS * 3,
max_num_seqs=MAX_NUM_SEQS,
model,
enable_chunked_prefill=True,
# forces prefill chunks with decoding
max_num_batched_tokens=MAX_NUM_SEQS * 3,
max_num_seqs=MAX_NUM_SEQS,
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@@ -170,10 +169,8 @@ def test_mamba_cache_cg_padding(
batch size. If it's not, a torch RuntimeError will be raised because
tensor dimensions aren't compatible.
"""
vllm_config = EngineArgs(model=model,
trust_remote_code=True).create_engine_config()
while len(example_prompts) == vllm_config.pad_for_cudagraph(
len(example_prompts)):
vllm_config = EngineArgs(model=model, trust_remote_code=True).create_engine_config()
while len(example_prompts) == vllm_config.pad_for_cudagraph(len(example_prompts)):
example_prompts.append(example_prompts[0])
try:
@@ -183,7 +180,8 @@ def test_mamba_cache_cg_padding(
pytest.fail(
"Couldn't run batch size which is not equal to a Cuda Graph "
"captured batch size. "
"Could be related to mamba cache not padded correctly")
"Could be related to mamba cache not padded correctly"
)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@@ -205,8 +203,10 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
except ValueError:
pytest.fail("Hybrid inner state wasn't cleaned up properly between"
"steps finished requests registered unnecessarily ")
pytest.fail(
"Hybrid inner state wasn't cleaned up properly between"
"steps finished requests registered unnecessarily "
)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@@ -215,10 +215,10 @@ def test_state_cleanup(
example_prompts,
model: str,
) -> None:
"""
"""
This test is for verifying that the Hybrid state is cleaned up between
steps.
If it's not cleaned, an error would be expected.
"""
try:
@@ -226,8 +226,10 @@ def test_state_cleanup(
for _ in range(10):
vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
except ValueError:
pytest.fail("Hybrid inner state wasn't cleaned up between states, "
"could be related to finished_requests_ids")
pytest.fail(
"Hybrid inner state wasn't cleaned up between states, "
"could be related to finished_requests_ids"
)
@multi_gpu_test(num_gpus=2)
@ -241,15 +243,19 @@ def test_distributed_correctness(
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(model, tensor_parallel_size=1,
max_num_seqs=MAX_NUM_SEQS) as vllm_model:
with vllm_runner(
model, tensor_parallel_size=1, max_num_seqs=MAX_NUM_SEQS
) as vllm_model:
vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, tensor_parallel_size=2,
max_num_seqs=MAX_NUM_SEQS) as vllm_model:
with vllm_runner(
model, tensor_parallel_size=2, max_num_seqs=MAX_NUM_SEQS
) as vllm_model:
vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=vllm_outputs_tp_1,
@@ -271,7 +277,6 @@ def test_full_cuda_graph(
max_tokens: int,
num_logprobs: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -281,11 +286,13 @@ def test_full_cuda_graph(
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -298,8 +305,9 @@ def test_full_cuda_graph(
@pytest.mark.parametrize("model", FP32_STATE_MODELS)
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("cache_dtype_param",
["mamba_ssm_cache_dtype", "mamba_cache_dtype"])
@pytest.mark.parametrize(
"cache_dtype_param", ["mamba_ssm_cache_dtype", "mamba_cache_dtype"]
)
def test_fp32_cache_state(
hf_runner,
vllm_runner,
@@ -310,7 +318,6 @@ def test_fp32_cache_state(
num_logprobs: int,
cache_dtype_param: str,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -320,13 +327,15 @@ def test_fp32_cache_state(
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model,
max_num_seqs=MAX_NUM_SEQS,
**{cache_dtype_param: "float32"}) as vllm_model:
with vllm_runner(
model, max_num_seqs=MAX_NUM_SEQS, **{cache_dtype_param: "float32"}
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -339,21 +348,23 @@ def test_fp32_cache_state(
# Helper functions for the APC tests
def _get_vllm_runner_params(model, max_model_len, tensor_parallel_size=1):
return {
'model_name': model,
'enable_prefix_caching': False,
'max_model_len': max_model_len,
'tensor_parallel_size': tensor_parallel_size,
'gpu_memory_utilization': 0.4
"model_name": model,
"enable_prefix_caching": False,
"max_model_len": max_model_len,
"tensor_parallel_size": tensor_parallel_size,
"gpu_memory_utilization": 0.4,
}
def _get_vLLM_output(vllm_runner,
kwargs,
prompts,
max_tokens,
num_logprobs,
num_repetitions=1,
vllm_model=None):
def _get_vLLM_output(
vllm_runner,
kwargs,
prompts,
max_tokens,
num_logprobs,
num_repetitions=1,
vllm_model=None,
):
outs = []
if vllm_model is None:
vllm_model = vllm_runner(**kwargs)
@@ -362,7 +373,8 @@ def _get_vLLM_output(vllm_runner,
vllm_output = vllm_model.generate_greedy(prompts, max_tokens)
else:
vllm_output = vllm_model.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs)
prompts, max_tokens, num_logprobs
)
outs.append(vllm_output)
return outs, vllm_model
@@ -387,7 +399,6 @@ def test_apc_single_prompt(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -395,29 +406,33 @@ def test_apc_single_prompt(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts.
generated_prompts = [MULTIPLE * example_prompts[0]]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs, n_repetitions)
vllm_runner_kwargs["enable_prefix_caching"] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
# In the first repetition, the caches are filled
@@ -450,7 +465,6 @@ def test_apc_single_prompt_block_align_alignment(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -458,30 +472,29 @@ def test_apc_single_prompt_block_align_alignment(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts. This custom prompt is used, as it causes the most issues
generated_prompts = ["The president of the United States is " * MULTIPLE]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_runner_kwargs["enable_prefix_caching"] = True
with vllm_runner(**vllm_runner_kwargs) as vllm_model:
# Retrieve the default mamba state block size
mamba_block_size = vllm_model.llm.llm_engine.cache_config. \
mamba_block_size
mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
# In case the hybrid model does not have the
# "mamba_block_size" assume a fixed constant
@@ -489,18 +502,18 @@ def test_apc_single_prompt_block_align_alignment(
mamba_block_size = 512
mamba_block_size_multiplier = 10
for offsets in [
-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3
]:
vllm_runner_kwargs[
'max_num_batched_tokens'] = mamba_block_size_multiplier * \
mamba_block_size - offsets
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens, num_logprobs,
n_repetitions)
for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
vllm_runner_kwargs["max_num_batched_tokens"] = (
mamba_block_size_multiplier * mamba_block_size - offsets
)
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
# Check alignment of the output logits when using APC
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
@@ -534,7 +547,6 @@ def test_apc_multiple_prompts_all_cached_outputs(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -542,30 +554,34 @@ def test_apc_multiple_prompts_all_cached_outputs(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts.
generated_prompts = [MULTIPLE * prompt for prompt in example_prompts]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs, n_repetitions)
vllm_runner_kwargs["enable_prefix_caching"] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
# In the first repetition, the caches are filled
@@ -598,7 +614,6 @@ def test_apc_multiple_prompts_block_align_alignment(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -606,34 +621,31 @@ def test_apc_multiple_prompts_block_align_alignment(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts. This custom prompt is used, as it causes the most issues
prompt_text = "The president of the United States is "
prompt_offsets = [0, 3, 7, 13, 17, 22, 25, 31]
generated_prompts = [
prompt_text[offset:] * MULTIPLE for offset in prompt_offsets
]
generated_prompts = [prompt_text[offset:] * MULTIPLE for offset in prompt_offsets]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(model, max_model_len,
tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_runner_kwargs["enable_prefix_caching"] = True
with vllm_runner(**vllm_runner_kwargs) as vllm_model:
# Retrieve the default mamba state block size
mamba_block_size = vllm_model.llm.llm_engine.cache_config. \
mamba_block_size
mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
# In case the hybrid model does not have the
# "mamba_block_size" assume a fixed constant
@@ -641,18 +653,18 @@ def test_apc_multiple_prompts_block_align_alignment(
mamba_block_size = 512
mamba_block_size_multiplier = 10
for offsets in [
-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3
]:
vllm_runner_kwargs[
'max_num_batched_tokens'] = mamba_block_size_multiplier * \
mamba_block_size - offsets
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens, num_logprobs,
n_repetitions)
for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
vllm_runner_kwargs["max_num_batched_tokens"] = (
mamba_block_size_multiplier * mamba_block_size - offsets
)
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
# Check alignment of the output logits when using APC
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
@@ -686,7 +698,6 @@ def test_apc_multiple_prompts_partial_cached_outputs(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -694,30 +705,30 @@ def test_apc_multiple_prompts_partial_cached_outputs(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts.
generated_prompts = [MULTIPLE * prompt for prompt in example_prompts]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
# Cache only part of all the prompts
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_runner_kwargs["enable_prefix_caching"] = True
vllm_outputs_partial_cache, vllm_model = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts[:3], max_tokens,
num_logprobs)
vllm_runner, vllm_runner_kwargs, generated_prompts[:3], max_tokens, num_logprobs
)
compare_operator(
outputs_0_lst=vllm_outputs_no_cache[0][:3],
@@ -726,13 +737,15 @@ def test_apc_multiple_prompts_partial_cached_outputs(
name_1="vllm_partial_cache",
)
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
vllm_model=vllm_model)
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
vllm_model=vllm_model,
)
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
# In the first repetition, the caches are filled


@@ -6,7 +6,9 @@ import json
import pytest
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
MistralToolCall, MistralToolParser)
MistralToolCall,
MistralToolParser,
)
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer import MistralTokenizer
@@ -33,136 +35,114 @@ SYMBOLIC_LANG_PROMPTS = [
]
# for function calling
TOOLS = [{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type":
"string",
"description":
"The city to find the weather for, e.g. 'San Francisco'"
TOOLS = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'San Francisco'",
},
"state": {
"type": "string",
"description": "the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"state": {
"type":
"string",
"description":
"the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'"
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"]
}
"required": ["city", "state", "unit"],
},
"required": ["city", "state", "unit"]
}
},
},
}, {
"type": "function",
"function": {
"name": "rewrite",
"description": "Rewrites text",
"parameters": {
"type": "object",
"required": [],
"properties": {
"text": {
"type": "string",
"description": "The input text to rewrite."
}
}
}
}
}]
{
"type": "function",
"function": {
"name": "rewrite",
"description": "Rewrites text",
"parameters": {
"type": "object",
"required": [],
"properties": {
"text": {
"type": "string",
"description": "The input text to rewrite.",
}
},
},
},
},
]
MSGS = [
{"role": "system", "content": "You are an assistant."},
{
"role": "system",
"content": "You are an assistant."
},
{
"role":
"user",
"content":
"Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors." # noqa
},
{
"role":
"assistant",
"content":
"",
"tool_calls": [{
"id": "bbc5b7ede",
"type": "function",
"function": {
"name":
"rewrite",
"arguments":
'{\"text\":\"My English needs improvving, maybe I make errors.\"}' # noqa
}
}]
},
{
"role": "tool",
"content":
"{\"action\":\"rewrite\",\"outcome\":\"My English needs improving, maybe I make errors.\"}", # noqa
"tool_call_id": "bbc5b7ede",
"name": "rewrite"
"role": "user",
"content": "Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors.", # noqa
},
{
"role": "assistant",
"content": "---\n\nMy English needs improving, maybe I make errors"
"content": "",
"tool_calls": [
{
"id": "bbc5b7ede",
"type": "function",
"function": {
"name": "rewrite",
"arguments": '{"text":"My English needs improvving, maybe I make errors."}', # noqa
},
}
],
},
{
"role":
"user",
"content": ("Can you tell me what the temperate"
" will be in Dallas, in fahrenheit?")
}
"role": "tool",
"content": '{"action":"rewrite","outcome":"My English needs improving, maybe I make errors."}', # noqa
"tool_call_id": "bbc5b7ede",
"name": "rewrite",
},
{
"role": "assistant",
"content": "---\n\nMy English needs improving, maybe I make errors",
},
{
"role": "user",
"content": (
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
),
},
]
SAMPLE_JSON_SCHEMA = {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"age": {
"type": "integer"
},
"name": {"type": "string"},
"age": {"type": "integer"},
"skills": {
"type": "array",
"items": {
"type": "string",
"maxLength": 10
},
"minItems": 3
"items": {"type": "string", "maxLength": 10},
"minItems": 3,
},
"work_history": {
"type": "array",
"items": {
"type": "object",
"properties": {
"company": {
"type": "string"
},
"duration": {
"type": "number"
},
"position": {
"type": "string"
}
"company": {"type": "string"},
"duration": {"type": "number"},
"position": {"type": "string"},
},
"required": ["company", "position"]
}
}
"required": ["company", "position"],
},
},
},
"required": ["name", "age", "skills", "work_history"]
"required": ["name", "age", "skills", "work_history"],
}
@@ -170,17 +150,25 @@ SAMPLE_JSON_SCHEMA = {
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, example_prompts, model: str,
dtype: str, max_tokens: int, num_logprobs: int) -> None:
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
# TODO(sang): Sliding window should be tested separately.
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, dtype=dtype,
tokenizer_mode="mistral") as vllm_model:
with vllm_runner(model, dtype=dtype, tokenizer_mode="mistral") as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -194,27 +182,35 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str,
max_tokens: int, num_logprobs: int) -> None:
def test_mistral_format(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
) as mistral_format_model:
mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="auto",
load_format="safetensors",
config_format="hf",
model,
dtype=dtype,
tokenizer_mode="auto",
load_format="safetensors",
config_format="hf",
) as hf_format_model:
hf_format_outputs = hf_format_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_format_outputs,
@@ -226,34 +222,35 @@ def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str,
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_mistral_symbolic_languages(vllm_runner, model: str,
dtype: str) -> None:
with vllm_runner(model,
dtype=dtype,
max_model_len=8192,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral") as vllm_model:
def test_mistral_symbolic_languages(vllm_runner, model: str, dtype: str) -> None:
with vllm_runner(
model,
dtype=dtype,
max_model_len=8192,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral",
) as vllm_model:
for prompt in SYMBOLIC_LANG_PROMPTS:
msg = {"role": "user", "content": prompt}
outputs = vllm_model.llm.chat([msg],
sampling_params=SAMPLING_PARAMS)
outputs = vllm_model.llm.chat([msg], sampling_params=SAMPLING_PARAMS)
assert "�" not in outputs[0].outputs[0].text.strip()
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
with vllm_runner(model,
dtype=dtype,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral") as vllm_model:
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral",
) as vllm_model:
msgs = copy.deepcopy(MSGS)
outputs = vllm_model.llm.chat(msgs,
tools=TOOLS,
sampling_params=SAMPLING_PARAMS)
outputs = vllm_model.llm.chat(
msgs, tools=TOOLS, sampling_params=SAMPLING_PARAMS
)
tokenizer = vllm_model.llm.get_tokenizer()
tool_parser = MistralToolParser(tokenizer)
@@ -265,10 +262,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
assert parsed_message.tools_called
assert MistralToolCall.is_valid_id(parsed_message.tool_calls[0].id)
assert parsed_message.tool_calls[
0].function.name == "get_current_weather"
assert parsed_message.tool_calls[
0].function.arguments == '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}' # noqa
assert parsed_message.tool_calls[0].function.name == "get_current_weather"
assert (
parsed_message.tool_calls[0].function.arguments
== '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}'
) # noqa
assert parsed_message.content is None
@@ -297,17 +295,10 @@ def test_mistral_function_call_nested_json():
"city": "Dallas",
"state": "TX",
"unit": "fahrenheit",
"sub_dict": {
"foo": "bar",
"inner": {
"x": 1,
"y": 2
}
},
"sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}},
}
model_output = (
f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}")
model_output = f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}"
parsed = parser.extract_tool_calls(model_output, None)


@@ -15,62 +15,56 @@ MODELS = [
def test_phimoe_routing_function():
from vllm.model_executor.models.phimoe import phimoe_routing_function
test_case = {
0: {
"hidden_states":
torch.tensor([1, 2, 3, 4, 5, 6, 7, 8],
dtype=torch.float32,
requires_grad=False).view(4, 2),
"gating_output":
torch.tensor([0.1, 0.2, 0.3, 0.4],
dtype=torch.float32,
requires_grad=False),
"topk":
2,
"renormalize":
False,
"hidden_states": torch.tensor(
[1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
).view(4, 2),
"gating_output": torch.tensor(
[0.1, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
),
"topk": 2,
"renormalize": False,
},
1: {
"hidden_states":
torch.tensor([1, 2, 3, 4, 5, 6, 7, 8],
dtype=torch.float32,
requires_grad=False).view(4, 2),
"gating_output":
torch.tensor([0.4, 0.2, 0.3, 0.4],
dtype=torch.float32,
requires_grad=False),
"topk":
2,
"renormalize":
False,
}
"hidden_states": torch.tensor(
[1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
).view(4, 2),
"gating_output": torch.tensor(
[0.4, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
),
"topk": 2,
"renormalize": False,
},
}
ground_truth = {
0: {
"topk_weights":
torch.tensor([1., 1.], dtype=torch.float32, requires_grad=False),
"topk_ids":
torch.tensor([3, 2], dtype=torch.long, requires_grad=False),
"topk_weights": torch.tensor(
[1.0, 1.0], dtype=torch.float32, requires_grad=False
),
"topk_ids": torch.tensor([3, 2], dtype=torch.long, requires_grad=False),
},
1: {
"topk_weights":
torch.tensor([0.5, 1.], dtype=torch.float32, requires_grad=False),
"topk_ids":
torch.tensor([0, 3], dtype=torch.long, requires_grad=False),
}
"topk_weights": torch.tensor(
[0.5, 1.0], dtype=torch.float32, requires_grad=False
),
"topk_ids": torch.tensor([0, 3], dtype=torch.long, requires_grad=False),
},
}
for test_id in test_case:
topk_weights, topk_ids = phimoe_routing_function(**test_case[test_id])
assert torch.allclose(topk_weights,
ground_truth[test_id]["topk_weights"])
assert torch.allclose(topk_weights, ground_truth[test_id]["topk_weights"])
assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
@pytest.mark.skipif(condition=current_platform.is_cpu(),
reason="This test takes a lot time to run on CPU, "
"and vllm CI's disk space is not enough for this model.")
@pytest.mark.skipif(
condition=current_platform.is_cpu(),
reason="This test takes a lot time to run on CPU, "
"and vllm CI's disk space is not enough for this model.",
)
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@@ -87,11 +81,13 @@ def test_models(
) -> None:
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,


@@ -8,8 +8,7 @@ import torch
from datasets import load_dataset
import tests.ci_envs as ci_envs
from tests.models.utils import (GenerateModelInfo,
TokensTextLogprobsPromptLogprobs)
from tests.models.utils import GenerateModelInfo, TokensTextLogprobsPromptLogprobs
from vllm.logprobs import Logprob
# See #24485
@@ -18,13 +17,14 @@ MAX_LENGTH = 1024
@torch.inference_mode
def wikitext_ppl_test(hf_runner,
vllm_runner,
model_info: GenerateModelInfo,
max_length=MAX_LENGTH,
vllm_extra_kwargs=None,
atol=PPL_TOL):
def wikitext_ppl_test(
hf_runner,
vllm_runner,
model_info: GenerateModelInfo,
max_length=MAX_LENGTH,
vllm_extra_kwargs=None,
atol=PPL_TOL,
):
# A model family has many models with the same architecture,
# and we don't need to test each one.
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
@@ -44,15 +44,16 @@ def wikitext_ppl_test(hf_runner,
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
if "hf_overrides" not in vllm_extra_kwargs:
vllm_extra_kwargs["hf_overrides"] = {}
vllm_extra_kwargs["hf_overrides"][
"head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
with vllm_runner(model_info.name,
gpu_memory_utilization=0.7,
max_model_len=max_length,
max_num_seqs=1,
enforce_eager=True,
**vllm_extra_kwargs) as vllm_model:
with vllm_runner(
model_info.name,
gpu_memory_utilization=0.7,
max_model_len=max_length,
max_num_seqs=1,
enforce_eager=True,
**vllm_extra_kwargs,
) as vllm_model:
# Use max_num_seqs=1 to avoid OOM,
# and avoid batch different requests together.
@@ -60,7 +61,7 @@ def wikitext_ppl_test(hf_runner,
# Confirm whether vllm is using the correct architecture
if model_info.architecture:
assert (model_info.architecture in model_config.architectures)
assert model_info.architecture in model_config.architectures
max_length = min(model_config.max_model_len - 1, max_length)
stride = max_length
@@ -74,12 +75,14 @@ def wikitext_ppl_test(hf_runner,
end_loc = min(begin_loc + max_length, n_tokens)
chunks.append(tokens[begin_loc:end_loc])
outputs = vllm_model.generate_greedy_logprobs(prompts=chunks,
max_tokens=1,
num_logprobs=None,
num_prompt_logprobs=0,
use_tqdm=False)
nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu")
outputs = vllm_model.generate_greedy_logprobs(
prompts=chunks,
max_tokens=1,
num_logprobs=None,
num_prompt_logprobs=0,
use_tqdm=False,
)
nll_sum = torch.tensor(0.0, dtype=torch.float32, device="cpu")
n_tokens = 0
for output in outputs:
output = cast(TokensTextLogprobsPromptLogprobs, output)
@@ -94,7 +97,8 @@ def wikitext_ppl_test(hf_runner,
token_log_probs.append(token_log_prob)
neg_log_likelihood = -torch.tensor(
token_log_probs, dtype=torch.float32, device="cpu").sum()
token_log_probs, dtype=torch.float32, device="cpu"
).sum()
nll_sum += neg_log_likelihood
n_tokens += len(token_log_probs)
vllm_ppl = float(torch.exp(nll_sum / n_tokens))
@@ -104,14 +108,13 @@ def wikitext_ppl_test(hf_runner,
# Accelerate ppl test by setting Transformers ppl score to a constant
if model_info.hf_ppl is None:
with hf_runner(
model_info.name,
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
model_info.name,
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
) as hf_model:
nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu")
nll_sum = torch.tensor(0.0, dtype=torch.float32, device="cpu")
n_tokens = 0
for chunk in chunks:
inputs = hf_model.wrap_device(
{"input_ids": torch.tensor([chunk])})
inputs = hf_model.wrap_device({"input_ids": torch.tensor([chunk])})
input_ids = inputs["input_ids"]
outputs = hf_model.model(input_ids, labels=input_ids)
neg_log_likelihood = outputs.loss


@@ -6,8 +6,7 @@ from typing import Optional
import pytest
from tests.conftest import HfRunner
from tests.models.utils import (EmbedModelInfo, check_embeddings_close,
matryoshka_fy)
from tests.models.utils import EmbedModelInfo, check_embeddings_close, matryoshka_fy
def run_embedding_correctness_test(
@@ -29,12 +28,14 @@ def run_embedding_correctness_test(
)
def correctness_test_embed_models(hf_runner,
vllm_runner,
model_info: EmbedModelInfo,
example_prompts,
vllm_extra_kwargs=None,
hf_model_callback=None):
def correctness_test_embed_models(
hf_runner,
vllm_runner,
model_info: EmbedModelInfo,
example_prompts,
vllm_extra_kwargs=None,
hf_model_callback=None,
):
pytest.skip("Debug only, ci prefers to use mteb test.")
# The example_prompts has ending "\n", for example:
@@ -51,18 +52,16 @@ def correctness_test_embed_models(hf_runner,
if model_info.hf_overrides is not None:
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,
**vllm_extra_kwargs) as vllm_model:
with vllm_runner(
model_info.name, runner="pooling", max_model_len=None, **vllm_extra_kwargs
) as vllm_model:
vllm_outputs = vllm_model.embed(example_prompts)
with hf_runner(
model_info.name,
dtype=model_info.hf_dtype,
is_sentence_transformer=True,
model_info.name,
dtype=model_info.hf_dtype,
is_sentence_transformer=True,
) as hf_model:
if hf_model_callback is not None:
hf_model_callback(hf_model)


@@ -4,8 +4,7 @@ import pytest
import torch
from transformers import AutoModelForSequenceClassification
from tests.models.language.pooling.embed_utils import (
run_embedding_correctness_test)
from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
@pytest.mark.parametrize(
@@ -20,28 +19,27 @@ def test_classify_models(
model: str,
dtype: str,
) -> None:
example_prompts = example_prompts * 2
with vllm_runner(model,
max_model_len=512,
dtype=dtype,
enable_prefix_caching=True) as vllm_model:
with vllm_runner(
model, max_model_len=512, dtype=dtype, enable_prefix_caching=True
) as vllm_model:
cache_config = vllm_model.llm.llm_engine.cache_config
assert cache_config.enable_prefix_caching
vllm_outputs = vllm_model.classify(example_prompts)
with hf_runner(model,
dtype=dtype,
auto_cls=AutoModelForSequenceClassification) as hf_model:
with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
) as hf_model:
hf_outputs = hf_model.classify(example_prompts)
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = torch.tensor(hf_output)
vllm_output = torch.tensor(vllm_output)
assert torch.allclose(hf_output, vllm_output,
1e-3 if dtype == "float" else 1e-2)
assert torch.allclose(
hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
)
@pytest.mark.parametrize(
@@ -59,18 +57,18 @@ def test_embed_models(
example_prompts = [str(s).strip() for s in example_prompts] * 2
with vllm_runner(
model,
runner="pooling",
max_model_len=None,
enable_prefix_caching=True,
model,
runner="pooling",
max_model_len=None,
enable_prefix_caching=True,
) as vllm_model:
cache_config = vllm_model.llm.llm_engine.cache_config
assert cache_config.enable_prefix_caching
vllm_outputs = vllm_model.embed(example_prompts)
with hf_runner(
model,
is_sentence_transformer=True,
model,
is_sentence_transformer=True,
) as hf_model:
run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
@@ -81,13 +79,14 @@ def test_embed_models(
"intfloat/e5-small",
"Alibaba-NLP/gte-Qwen2-1.5B-instruct", # is_causal == False
"papluca/xlm-roberta-base-language-detection",
])
],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_non_causal_models(hf_runner, vllm_runner, example_prompts, model: str,
dtype: str) -> None:
with vllm_runner(model,
max_model_len=512,
dtype=dtype,
enable_prefix_caching=True) as vllm_model:
def test_non_causal_models(
hf_runner, vllm_runner, example_prompts, model: str, dtype: str
) -> None:
with vllm_runner(
model, max_model_len=512, dtype=dtype, enable_prefix_caching=True
) as vllm_model:
cache_config = vllm_model.llm.llm_engine.cache_config
assert not cache_config.enable_prefix_caching


@@ -10,15 +10,17 @@ from vllm.platforms import current_platform
@pytest.mark.parametrize(
"model",
[
pytest.param("jason9693/Qwen2.5-1.5B-apeach",
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
]),
pytest.param(
"jason9693/Qwen2.5-1.5B-apeach",
marks=[
pytest.mark.core_model,
pytest.mark.cpu_model,
pytest.mark.slow_test,
],
),
],
)
@pytest.mark.parametrize("dtype",
["half"] if current_platform.is_rocm() else ["float"])
@pytest.mark.parametrize("dtype", ["half"] if current_platform.is_rocm() else ["float"])
def test_models(
hf_runner,
vllm_runner,
@@ -35,9 +37,9 @@ def test_models(
with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.classify(example_prompts)
with hf_runner(model,
dtype=dtype,
auto_cls=AutoModelForSequenceClassification) as hf_model:
with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
) as hf_model:
hf_outputs = hf_model.classify(example_prompts)
# check logits difference
@@ -48,5 +50,6 @@ def test_models(
# the tolerance value of 1e-2 is selected based on the
# half datatype tests in
# tests/models/language/pooling/test_embedding.py
assert torch.allclose(hf_output, vllm_output,
1e-3 if dtype == "float" else 1e-2)
assert torch.allclose(
hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
)


@@ -18,20 +18,25 @@ from ...utils import check_embeddings_close
# case won't pass because gte-Qwen2-1.5B-instruct will cache custom
# model code with bidirectional attention.
# [Decoder-only]
pytest.param("BAAI/bge-multilingual-gemma2",
marks=[pytest.mark.core_model, pytest.mark.slow_test]),
pytest.param(
"BAAI/bge-multilingual-gemma2",
marks=[pytest.mark.core_model, pytest.mark.slow_test],
),
pytest.param(
"intfloat/e5-mistral-7b-instruct",
# CPU v1 doesn't support sliding window
marks=[pytest.mark.core_model]),
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
marks=[pytest.mark.cpu_model]),
marks=[pytest.mark.core_model],
),
pytest.param(
"ssmits/Qwen2-7B-Instruct-embed-base", marks=[pytest.mark.cpu_model]
),
# [Encoder-only]
pytest.param(
"BAAI/bge-base-en-v1.5",
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
pytest.mark.core_model,
pytest.mark.cpu_model,
pytest.mark.slow_test,
],
),
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
@@ -50,7 +55,6 @@ def test_models(
model,
monkeypatch,
) -> None:
if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
# ROCm Triton FA does not currently support sliding window attention
# switch to use ROCm CK FA backend
@@ -58,13 +62,14 @@ def test_models(
vllm_extra_kwargs = {}
if model == "ssmits/Qwen2-7B-Instruct-embed-base":
vllm_extra_kwargs["pooler_config"] = \
PoolerConfig(pooling_type="MEAN", normalize=False)
vllm_extra_kwargs["pooler_config"] = PoolerConfig(
pooling_type="MEAN", normalize=False
)
max_model_len: Optional[int] = 512
if model in [
"sentence-transformers/all-MiniLM-L12-v2",
"sentence-transformers/stsb-roberta-base-v2"
"sentence-transformers/all-MiniLM-L12-v2",
"sentence-transformers/stsb-roberta-base-v2",
]:
max_model_len = None
@@ -79,10 +84,9 @@ def test_models(
with hf_runner(model, is_sentence_transformer=True) as hf_model:
hf_outputs = hf_model.encode(example_prompts)
with vllm_runner(model,
runner="pooling",
max_model_len=max_model_len,
**vllm_extra_kwargs) as vllm_model:
with vllm_runner(
model, runner="pooling", max_model_len=max_model_len, **vllm_extra_kwargs
) as vllm_model:
vllm_outputs = vllm_model.embed(example_prompts)
check_embeddings_close(


@@ -70,8 +70,9 @@ async def run_client_embeddings(
def gritlm_instruction(instruction):
return ("<|user|>\n" + instruction +
"\n<|embed|>\n" if instruction else "<|embed|>\n")
return (
"<|user|>\n" + instruction + "\n<|embed|>\n" if instruction else "<|embed|>\n"
)
def get_test_data():
@@ -80,7 +81,8 @@ def get_test_data():
README.md in https://github.com/ContextualAI/gritlm
"""
q_instruction = gritlm_instruction(
"Given a scientific paper title, retrieve the paper's abstract", )
"Given a scientific paper title, retrieve the paper's abstract",
)
queries = [
"Bitcoin: A Peer-to-Peer Electronic Cash System",
"Generative Representational Instruction Tuning",
@@ -114,9 +116,9 @@ def test_gritlm_offline_embedding(vllm_runner):
queries, q_instruction, documents, d_instruction = get_test_data()
with vllm_runner(
MODEL_NAME,
runner="pooling",
max_model_len=MAX_MODEL_LEN,
MODEL_NAME,
runner="pooling",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.llm
@@ -161,9 +163,9 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
with vllm_runner(
MODEL_NAME,
runner="generate",
max_model_len=MAX_MODEL_LEN,
MODEL_NAME,
runner="generate",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.llm


@@ -21,16 +21,18 @@ def test_idefics_multimodal(
"The future of AI is",
]
with vllm_runner(model_name="HuggingFaceM4/Idefics3-8B-Llama3",
runner="pooling",
task="classify",
convert="classify",
load_format="dummy",
max_model_len=512,
enforce_eager=True,
tensor_parallel_size=1,
disable_log_stats=True,
dtype="bfloat16") as vllm_model:
with vllm_runner(
model_name="HuggingFaceM4/Idefics3-8B-Llama3",
runner="pooling",
task="classify",
convert="classify",
load_format="dummy",
max_model_len=512,
enforce_eager=True,
tensor_parallel_size=1,
disable_log_stats=True,
dtype="bfloat16",
) as vllm_model:
llm = vllm_model.get_llm()
outputs = llm.classify(prompts)
for output in outputs:
@@ -38,19 +40,20 @@ def test_idefics_multimodal(
def update_config(config):
config.text_config.update({
"architectures": ["Gemma3ForSequenceClassification"],
"classifier_from_token": ["A", "B", "C", "D", "E"],
"method":
"no_post_processing",
"id2label": {
"A": "Chair",
"B": "Couch",
"C": "Table",
"D": "Bed",
"E": "Cupboard"
},
})
config.text_config.update(
{
"architectures": ["Gemma3ForSequenceClassification"],
"classifier_from_token": ["A", "B", "C", "D", "E"],
"method": "no_post_processing",
"id2label": {
"A": "Chair",
"B": "Couch",
"C": "Table",
"D": "Bed",
"E": "Cupboard",
},
}
)
return config
@@ -63,11 +66,10 @@ def test_gemma_multimodal(
# switch to use ROCm CK FA backend
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
messages = [{
"role":
"system",
"content":
"""
messages = [
{
"role": "system",
"content": """
You are a helpful assistant. You will be given a product description
which may also include an image. Classify the following product into
one of the categories:
@@ -78,38 +80,39 @@ def test_gemma_multimodal(
D = bed
E = cupboard
You'll answer with exactly one letter (A, B, C, D, or E)."""
}, {
"role":
"user",
"content": [{
"type": "image_url",
"image_url": {
"url":
"https://upload.wikimedia.org/wikipedia/commons/c/c6/Set_of_fourteen_side_chairs_MET_DP110780.jpg"
}
}, {
"type": "text",
"text": "A fine 19th century piece of furniture."
}]
}]
with vllm_runner(model_name="google/gemma-3-4b-it",
runner="pooling",
task="classify",
convert="classify",
load_format="auto",
hf_overrides=update_config,
pooler_config=PoolerConfig(pooling_type="LAST"),
max_model_len=512,
enforce_eager=True,
tensor_parallel_size=1,
disable_log_stats=True,
dtype="bfloat16") as vllm_model:
You'll answer with exactly one letter (A, B, C, D, or E).""",
},
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/c/c6/Set_of_fourteen_side_chairs_MET_DP110780.jpg"
},
},
{"type": "text", "text": "A fine 19th century piece of furniture."},
],
},
]
with vllm_runner(
model_name="google/gemma-3-4b-it",
runner="pooling",
task="classify",
convert="classify",
load_format="auto",
hf_overrides=update_config,
pooler_config=PoolerConfig(pooling_type="LAST"),
max_model_len=512,
enforce_eager=True,
tensor_parallel_size=1,
disable_log_stats=True,
dtype="bfloat16",
) as vllm_model:
llm = vllm_model.get_llm()
prompts = llm.preprocess_chat(messages)
result = llm.classify(prompts)
assert result[0].outputs.probs[0] > 0.95
assert all(c < 0.05 for c in result[0].outputs.probs[1:])
assert all(c < 0.05 for c in result[0].outputs.probs[1:])


@@ -20,14 +20,15 @@ def test_classify_models(
with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.classify(example_prompts)
with hf_runner(model,
dtype=dtype,
auto_cls=AutoModelForSequenceClassification) as hf_model:
with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
) as hf_model:
hf_outputs = hf_model.classify(example_prompts)
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = torch.tensor(hf_output)
vllm_output = torch.tensor(vllm_output)
assert torch.allclose(hf_output, vllm_output,
1e-3 if dtype == "float" else 1e-2)
assert torch.allclose(
hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
)


@@ -7,10 +7,10 @@ from ...utils import EmbedModelInfo
MODELS = [
EmbedModelInfo("nomic-ai/nomic-embed-text-v1"),
#EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"),
#EmbedModelInfo("nomic-ai/CodeRankEmbed"),
# EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"),
# EmbedModelInfo("nomic-ai/CodeRankEmbed"),
EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe"),
#EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"),
# EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"),
]
rope_theta = 1000
@@ -21,23 +21,24 @@ max_model_len = int(original_max_position_embeddings * factor)
@pytest.mark.parametrize("model_info", MODELS)
def test_default(model_info, vllm_runner):
with vllm_runner(model_info.name, runner="pooling",
max_model_len=None) as vllm_model:
with vllm_runner(
model_info.name, runner="pooling", max_model_len=None
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
# For nomic-embed-text-v2-moe the length is set to 512
# by sentence_bert_config.json.
assert model_config.max_model_len == 512
else:
assert (
model_config.max_model_len == original_max_position_embeddings)
assert model_config.max_model_len == original_max_position_embeddings
@pytest.mark.parametrize("model_info", MODELS)
def test_set_max_model_len_legal(model_info, vllm_runner):
# set max_model_len <= 512
with vllm_runner(model_info.name, runner="pooling",
max_model_len=256) as vllm_model:
with vllm_runner(
model_info.name, runner="pooling", max_model_len=256
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 256
@@ -46,13 +47,12 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
# For nomic-embed-text-v2-moe the length is set to 512
# by sentence_bert_config.json.
with pytest.raises(ValueError):
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=1024):
with vllm_runner(model_info.name, runner="pooling", max_model_len=1024):
pass
else:
with vllm_runner(model_info.name, runner="pooling",
max_model_len=1024) as vllm_model:
with vllm_runner(
model_info.name, runner="pooling", max_model_len=1024
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 1024
@@ -61,17 +61,18 @@ def test_set_max_model_len_illegal(model_info, vllm_runner):
def test_set_max_model_len_illegal(model_info, vllm_runner):
# set max_model_len > 2048
with pytest.raises(ValueError):
with vllm_runner(model_info.name, runner="pooling",
max_model_len=4096):
with vllm_runner(model_info.name, runner="pooling", max_model_len=4096):
pass
# set max_model_len > 2048 by hf_overrides
hf_overrides = {"max_model_len": 4096}
with pytest.raises(ValueError):
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides):
with vllm_runner(
model_info.name,
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides,
):
pass
@ -82,16 +83,14 @@ def test_use_rope_scaling_legal(model_info, vllm_runner):
"rope_scaling": {
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings":
original_max_position_embeddings
"original_max_position_embeddings": original_max_position_embeddings,
},
"max_model_len": max_model_len
"max_model_len": max_model_len,
}
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides):
with vllm_runner(
model_info.name, runner="pooling", max_model_len=None, hf_overrides=hf_overrides
):
pass
@ -102,16 +101,17 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
"rope_scaling": {
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings":
original_max_position_embeddings
}
"original_max_position_embeddings": original_max_position_embeddings,
},
}
# illegal max_model_len
with pytest.raises(ValueError):
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=max_model_len + 1,
hf_overrides=hf_overrides):
with vllm_runner(
model_info.name,
runner="pooling",
max_model_len=max_model_len + 1,
hf_overrides=hf_overrides,
):
pass
hf_overrides = {
@ -119,15 +119,16 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
"rope_scaling": {
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings":
original_max_position_embeddings
"original_max_position_embeddings": original_max_position_embeddings,
},
"max_model_len": max_model_len + 1
"max_model_len": max_model_len + 1,
}
# illegal max_model_len by hf_overrides
with pytest.raises(ValueError):
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides):
with vllm_runner(
model_info.name,
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides,
):
pass

View File

@ -10,10 +10,7 @@ from vllm.config import PoolerConfig
@pytest.mark.parametrize(
"model",
[
"jason9693/Qwen2.5-1.5B-apeach",
"papluca/xlm-roberta-base-language-detection"
],
["jason9693/Qwen2.5-1.5B-apeach", "papluca/xlm-roberta-base-language-detection"],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_classify_models_using_activation(
@ -23,30 +20,32 @@ def test_classify_models_using_activation(
model: str,
dtype: str,
) -> None:
with vllm_runner(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(activation=False)) as vllm_model:
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(activation=False),
) as vllm_model:
wo_activation_out = vllm_model.classify(example_prompts)
with vllm_runner(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(activation=True)) as vllm_model:
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(activation=True),
) as vllm_model:
w_activation_out = vllm_model.classify(example_prompts)
for wo_activation, w_activation in zip(wo_activation_out,
w_activation_out):
for wo_activation, w_activation in zip(wo_activation_out, w_activation_out):
wo_activation = torch.tensor(wo_activation)
w_activation = torch.tensor(w_activation)
assert not torch.allclose(wo_activation, w_activation,
atol=1e-2), "pooler_config is not working"
assert torch.allclose(softmax(wo_activation), w_activation,
1e-3 if dtype == "float" else 1e-2)
assert not torch.allclose(wo_activation, w_activation, atol=1e-2), (
"pooler_config is not working"
)
assert torch.allclose(
softmax(wo_activation), w_activation, 1e-3 if dtype == "float" else 1e-2
)
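The two runs differ only in whether the classification head applies its final activation, so softmax over the raw scores must reproduce the activated output. The identity on toy numbers:

import torch

logits = torch.tensor([2.0, 0.5, -1.0])      # activation=False output
probs = torch.softmax(logits, dim=-1)        # activation=True output
assert not torch.allclose(logits, probs, atol=1e-2)
assert torch.allclose(probs.sum(), torch.tensor(1.0))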
@pytest.mark.parametrize(
@ -63,26 +62,28 @@ def test_embed_models_using_normalize(
model: str,
dtype: str,
) -> None:
with vllm_runner(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=False)) as vllm_model:
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=False),
) as vllm_model:
wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
with vllm_runner(model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=True)) as vllm_model:
with vllm_runner(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=True),
) as vllm_model:
w_normalize = torch.tensor(vllm_model.embed(example_prompts))
assert not torch.allclose(
wo_normalize, w_normalize,
atol=1e-2), "pooler_config normalize is not working"
assert not torch.allclose(wo_normalize, w_normalize, atol=1e-2), (
"pooler_config normalize is not working"
)
assert torch.allclose(
F.normalize(wo_normalize, p=2, dim=-1), w_normalize,
atol=1e-2), "w_normal should be close to normal(wo_normal)."
F.normalize(wo_normalize, p=2, dim=-1), w_normalize, atol=1e-2
), "w_normal should be close to normal(wo_normal)."
@pytest.mark.parametrize(
@ -99,25 +100,26 @@ def test_reward_models_using_softmax(
model: str,
dtype: str,
) -> None:
with vllm_runner(model,
max_model_len=1024,
dtype=dtype,
pooler_config=PoolerConfig(softmax=False)) as vllm_model:
with vllm_runner(
model,
max_model_len=1024,
dtype=dtype,
pooler_config=PoolerConfig(softmax=False),
) as vllm_model:
wo_softmax = vllm_model.encode(example_prompts)
with vllm_runner(model,
max_model_len=1024,
dtype=dtype,
pooler_config=PoolerConfig(softmax=True)) as vllm_model:
with vllm_runner(
model, max_model_len=1024, dtype=dtype, pooler_config=PoolerConfig(softmax=True)
) as vllm_model:
w_softmax = vllm_model.encode(example_prompts)
for wo, w in zip(wo_softmax, w_softmax):
wo = torch.tensor(wo)
w = torch.tensor(w)
assert not torch.allclose(
wo, w, atol=1e-2), "pooler_config softmax is not working"
assert torch.allclose(
softmax(wo), w,
atol=1e-2), "w_softmax should be close to softmax(wo_softmax)."
assert not torch.allclose(wo, w, atol=1e-2), (
"pooler_config softmax is not working"
)
assert torch.allclose(softmax(wo), w, atol=1e-2), (
"w_softmax should be close to softmax(wo_softmax)."
)
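And for reward models, the softmax=True encoding is just softmax applied to the softmax=False encoding:

import torch

wo = torch.tensor([1.0, 3.0])                # softmax=False encoding
w = torch.softmax(wo, dim=-1)                # ~[0.119, 0.881]
assert not torch.allclose(wo, w, atol=1e-2)
assert torch.allclose(w.sum(), torch.tensor(1.0))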

View File

@ -16,10 +16,8 @@ from ...utils import check_transformers_version
def math_step_prompts():
# ruff: noqa: E501
data = {
"system":
"Please reason step by step, and put your final answer within \\boxed{}. ",
"query":
"Sue lives in a fun neighborhood. One weekend, the neighbors decided to play a prank on Sue. On Friday morning, the neighbors placed 18 pink plastic flamingos out on Sue's front yard. On Saturday morning, the neighbors took back one third of the flamingos, painted them white, and put these newly painted white flamingos back out on Sue's front yard. Then, on Sunday morning, they added another 18 pink plastic flamingos to the collection. At noon on Sunday, how many more pink plastic flamingos were out than white plastic flamingos?",
"system": "Please reason step by step, and put your final answer within \\boxed{}. ",
"query": "Sue lives in a fun neighborhood. One weekend, the neighbors decided to play a prank on Sue. On Friday morning, the neighbors placed 18 pink plastic flamingos out on Sue's front yard. On Saturday morning, the neighbors took back one third of the flamingos, painted them white, and put these newly painted white flamingos back out on Sue's front yard. Then, on Sunday morning, they added another 18 pink plastic flamingos to the collection. At noon on Sunday, how many more pink plastic flamingos were out than white plastic flamingos?",
"response": [
"To find out how many more pink plastic flamingos were out than white plastic flamingos at noon on Sunday, we can break down the problem into steps. First, on Friday, the neighbors start with 18 pink plastic flamingos.",
"On Saturday, they take back one third of the flamingos. Since there were 18 flamingos, (1/3 \\times 18 = 6) flamingos are taken back. So, they have (18 - 6 = 12) flamingos left in their possession. Then, they paint these 6 flamingos white and put them back out on Sue's front yard. Now, Sue has the original 12 pink flamingos plus the 6 new white ones. Thus, by the end of Saturday, Sue has (12 + 6 = 18) pink flamingos and 6 white flamingos.",
@ -27,16 +25,16 @@ def math_step_prompts():
"To find the difference, subtract the number of white flamingos from the number of pink flamingos: (36 - 6 = 30). Therefore, at noon on Sunday, there were 30 more pink plastic flamingos out than white plastic flamingos. The answer is (\\boxed{30}).",
],
}
answer = "<extra_0>".join(data['response']) + "<extra_0>"
answer = "<extra_0>".join(data["response"]) + "<extra_0>"
prompt = f"<im_start>system\n{data['system']}<im_end>\n<im_start>user\n{data['query']}<im_end>\n<im_start>assistant\n{answer}<im_end><|endoftext|>"
return [prompt]
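The prompt builder above appends the <extra_0> separator after every reasoning step so the PRM emits one reward per step. The construction in isolation, on toy strings:

steps = ["step one", "step two"]
answer = "<extra_0>".join(steps) + "<extra_0>"
assert answer == "step one<extra_0>step two<extra_0>"
assert answer.count("<extra_0>") == len(steps)  # one reward site per step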
def step_reward_patch_hf_model(hf_model: HfRunner):
# Patch the hf_runner to use the step reward function
def make_step_rewards(logits: torch.Tensor,
token_masks: torch.Tensor) -> list[list[float]]:
def make_step_rewards(
logits: torch.Tensor, token_masks: torch.Tensor
) -> list[list[float]]:
probabilities = F.softmax(logits, dim=-1)
probabilities = probabilities * token_masks.unsqueeze(-1)
@ -54,7 +52,7 @@ def step_reward_patch_hf_model(hf_model: HfRunner):
outputs = hf_model.model(input_ids=input_ids)
step_sep_id = hf_model.tokenizer.encode("<extra_0>")[0]
token_masks = (input_ids == step_sep_id)
token_masks = input_ids == step_sep_id
return make_step_rewards(outputs[0], token_masks)
hf_model.reward = reward # type: ignore[attr-defined]
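A minimal sketch of the masking trick used by make_step_rewards, assuming a binary PRM head where index 1 is the "step is correct" class (the gather step is elided by this diff, so treat the last lines as illustrative):

import torch
import torch.nn.functional as F

logits = torch.randn(1, 6, 2)                        # (batch, seq, num_labels)
token_masks = torch.tensor([[0, 0, 1, 0, 0, 1]], dtype=torch.bool)
probs = F.softmax(logits, dim=-1) * token_masks.unsqueeze(-1)
step_rewards = probs[0, token_masks[0], 1].tolist()  # one score per <extra_0>
assert len(step_rewards) == int(token_masks.sum())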
@ -65,8 +63,10 @@ def step_reward_patch_hf_model(hf_model: HfRunner):
@pytest.mark.parametrize(
"model",
[
pytest.param("Qwen/Qwen2.5-Math-PRM-7B",
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
pytest.param(
"Qwen/Qwen2.5-Math-PRM-7B",
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
],
)
@pytest.mark.parametrize("dtype", ["half"])
@ -78,8 +78,9 @@ def test_prm_models(
dtype: str,
monkeypatch,
) -> None:
check_transformers_version("Qwen/Qwen2.5-Math-PRM-7B",
max_transformers_version="4.53.2")
check_transformers_version(
"Qwen/Qwen2.5-Math-PRM-7B", max_transformers_version="4.53.2"
)
if current_platform.is_cpu():
pytest.skip("CPU only supports V1")

View File

@ -37,10 +37,9 @@ def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name):
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict([text_pair]).tolist()
with vllm_runner(model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
with vllm_runner(
model_name, runner="pooling", dtype=DTYPE, max_model_len=None
) as vllm_model:
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
assert len(vllm_outputs) == 1
@ -58,10 +57,9 @@ def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name):
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict(text_pairs).tolist()
with vllm_runner(model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
with vllm_runner(
model_name, runner="pooling", dtype=DTYPE, max_model_len=None
) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
assert len(vllm_outputs) == 2
@ -80,10 +78,9 @@ def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name):
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict(text_pairs).tolist()
with vllm_runner(model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
with vllm_runner(
model_name, runner="pooling", dtype=DTYPE, max_model_len=None
) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
assert len(vllm_outputs) == 2
@ -101,17 +98,15 @@ def emb_model_name(request):
def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name):
text_pair = [TEXTS_1[0], TEXTS_2[0]]
with hf_runner(emb_model_name, dtype=DTYPE,
is_sentence_transformer=True) as hf_model:
with hf_runner(
emb_model_name, dtype=DTYPE, is_sentence_transformer=True
) as hf_model:
hf_embeddings = hf_model.encode(text_pair)
hf_outputs = [
F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)
]
hf_outputs = [F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)]
with vllm_runner(emb_model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
with vllm_runner(
emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
) as vllm_model:
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
assert len(vllm_outputs) == 1
@ -126,20 +121,18 @@ def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name):
[TEXTS_1[0], TEXTS_2[1]],
]
with hf_runner(emb_model_name, dtype=DTYPE,
is_sentence_transformer=True) as hf_model:
hf_embeddings = [
hf_model.encode(text_pair) for text_pair in text_pairs
]
with hf_runner(
emb_model_name, dtype=DTYPE, is_sentence_transformer=True
) as hf_model:
hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs]
hf_outputs = [
F.cosine_similarity(*map(torch.tensor, pair), dim=0)
for pair in hf_embeddings
]
with vllm_runner(emb_model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
with vllm_runner(
emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
assert len(vllm_outputs) == 2
@ -155,20 +148,18 @@ def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name):
[TEXTS_1[1], TEXTS_2[1]],
]
with hf_runner(emb_model_name, dtype=DTYPE,
is_sentence_transformer=True) as hf_model:
hf_embeddings = [
hf_model.encode(text_pair) for text_pair in text_pairs
]
with hf_runner(
emb_model_name, dtype=DTYPE, is_sentence_transformer=True
) as hf_model:
hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs]
hf_outputs = [
F.cosine_similarity(*map(torch.tensor, pair), dim=0)
for pair in hf_embeddings
]
with vllm_runner(emb_model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
with vllm_runner(
emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
assert len(vllm_outputs) == 2

View File

@ -21,9 +21,9 @@ def test_models(
with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.encode(example_prompts)
with hf_runner(model,
dtype=dtype,
auto_cls=AutoModelForTokenClassification) as hf_model:
with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForTokenClassification
) as hf_model:
tokenizer = hf_model.tokenizer
hf_outputs = []
for prompt in example_prompts:

View File

@ -20,51 +20,57 @@ calculus, each contributing unique perspectives that would shape this new
field."""
def test_smaller_truncation_size(vllm_runner,
model_name=MODEL_NAME,
input_str=input_str):
def test_smaller_truncation_size(
vllm_runner, model_name=MODEL_NAME, input_str=input_str
):
truncate_prompt_tokens = 10
with vllm_runner(model_name, runner="pooling",
max_model_len=max_model_len) as vllm_model:
with vllm_runner(
model_name, runner="pooling", max_model_len=max_model_len
) as vllm_model:
vllm_output = vllm_model.llm.embed(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
input_str, truncate_prompt_tokens=truncate_prompt_tokens
)
prompt_tokens = vllm_output[0].prompt_token_ids
assert len(prompt_tokens) == truncate_prompt_tokens
def test_max_truncation_size(vllm_runner,
model_name=MODEL_NAME,
input_str=input_str):
def test_max_truncation_size(vllm_runner, model_name=MODEL_NAME, input_str=input_str):
truncate_prompt_tokens = -1
with vllm_runner(model_name, runner="pooling",
max_model_len=max_model_len) as vllm_model:
with vllm_runner(
model_name, runner="pooling", max_model_len=max_model_len
) as vllm_model:
vllm_output = vllm_model.llm.embed(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
input_str, truncate_prompt_tokens=truncate_prompt_tokens
)
prompt_tokens = vllm_output[0].prompt_token_ids
assert len(prompt_tokens) == max_model_len
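Together these two tests pin down the truncation contract: a positive truncate_prompt_tokens keeps that many tokens, and -1 truncates to max_model_len. The rule on plain lists (a sketch, not vLLM internals):

max_model_len = 128

def truncate(token_ids: list[int], truncate_prompt_tokens: int) -> list[int]:
    limit = max_model_len if truncate_prompt_tokens == -1 else truncate_prompt_tokens
    return token_ids[-limit:]

assert len(truncate(list(range(512)), 10)) == 10
assert len(truncate(list(range(512)), -1)) == max_model_len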
def test_bigger_truncation_size(vllm_runner,
model_name=MODEL_NAME,
input_str=input_str):
def test_bigger_truncation_size(
vllm_runner, model_name=MODEL_NAME, input_str=input_str
):
truncate_prompt_tokens = max_model_len + 1
with pytest.raises(ValueError), vllm_runner(
model_name, runner="pooling",
max_model_len=max_model_len) as vllm_model:
with (
pytest.raises(ValueError),
vllm_runner(
model_name, runner="pooling", max_model_len=max_model_len
) as vllm_model,
):
llm_output = vllm_model.llm.embed(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
input_str, truncate_prompt_tokens=truncate_prompt_tokens
)
assert llm_output == f"""truncate_prompt_tokens value
assert (
llm_output
== f"""truncate_prompt_tokens value
({truncate_prompt_tokens}) is greater than
max_model_len ({max_model_len}). Please, select
a smaller truncation size."""
)

View File

@ -12,8 +12,7 @@ import requests
import torch
import tests.ci_envs as ci_envs
from tests.models.utils import (EmbedModelInfo, RerankModelInfo,
check_embeddings_close)
from tests.models.utils import EmbedModelInfo, RerankModelInfo, check_embeddings_close
# Most embedding models on the STS12 task (See #17175):
# - Model implementation and minor changes in tensor dtype
@ -30,7 +29,6 @@ MTEB_RERANK_TOL = 2e-3
class VllmMtebEncoder(mteb.Encoder):
def __init__(self, vllm_model):
super().__init__()
self.llm = vllm_model
@ -53,8 +51,7 @@ class VllmMtebEncoder(mteb.Encoder):
def predict(
self,
sentences: list[tuple[str, str,
Optional[str]]], # query, corpus, prompt
sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt
*args,
**kwargs,
) -> np.ndarray:
@ -64,17 +61,15 @@ class VllmMtebEncoder(mteb.Encoder):
queries = [s[0] for s in sentences]
corpus = [s[1] for s in sentences]
outputs = self.llm.score(queries,
corpus,
truncate_prompt_tokens=-1,
use_tqdm=False)
outputs = self.llm.score(
queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
)
scores = np.array(outputs)
scores = scores[np.argsort(r)]
return scores
class OpenAIClientMtebEncoder(mteb.Encoder):
def __init__(self, model_name: str, client):
super().__init__()
self.model_name = model_name
@ -87,8 +82,9 @@ class OpenAIClientMtebEncoder(mteb.Encoder):
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
embeddings = self.client.embeddings.create(model=self.model_name,
input=sentences)
embeddings = self.client.embeddings.create(
model=self.model_name, input=sentences
)
outputs = [d.embedding for d in embeddings.data]
embeds = np.array(outputs)
embeds = embeds[np.argsort(r)]
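The permutation dance above (shuffle the inputs, run the model, invert with argsort) returns results aligned with the original input order. The round trip in isolation:

import numpy as np

rng = np.random.default_rng(0)
sentences = ["a", "b", "c", "d"]
r = rng.permutation(len(sentences))
outputs = np.array([sentences[i].upper() for i in r])  # stand-in for the model
restored = outputs[np.argsort(r)]
assert list(restored) == ["A", "B", "C", "D"]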
@ -96,7 +92,6 @@ class OpenAIClientMtebEncoder(mteb.Encoder):
class ScoreClientMtebEncoder(mteb.Encoder):
def __init__(self, model_name: str, url):
super().__init__()
self.model_name = model_name
@ -105,8 +100,7 @@ class ScoreClientMtebEncoder(mteb.Encoder):
def predict(
self,
sentences: list[tuple[str, str,
Optional[str]]], # query, corpus, prompt
sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt
*args,
**kwargs,
) -> np.ndarray:
@ -122,27 +116,30 @@ class ScoreClientMtebEncoder(mteb.Encoder):
return scores
def get_score(self, query, corpus):
response = requests.post(self.url,
json={
"model": self.model_name,
"text_1": query,
"text_2": corpus,
"truncate_prompt_tokens": -1,
}).json()
return response['data'][0]["score"]
response = requests.post(
self.url,
json={
"model": self.model_name,
"text_1": query,
"text_2": corpus,
"truncate_prompt_tokens": -1,
},
).json()
return response["data"][0]["score"]
class RerankClientMtebEncoder(ScoreClientMtebEncoder):
def get_score(self, query, corpus):
response = requests.post(self.url,
json={
"model": self.model_name,
"query": query,
"documents": [corpus],
"truncate_prompt_tokens": -1,
}).json()
return response['results'][0]["relevance_score"]
response = requests.post(
self.url,
json={
"model": self.model_name,
"query": query,
"documents": [corpus],
"truncate_prompt_tokens": -1,
},
).json()
return response["results"][0]["relevance_score"]
def run_mteb_embed_task(encoder, tasks):
@ -161,12 +158,14 @@ def run_mteb_embed_task(encoder, tasks):
return main_score
def mteb_test_embed_models(hf_runner,
vllm_runner,
model_info: EmbedModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None,
atol=MTEB_EMBED_TOL):
def mteb_test_embed_models(
hf_runner,
vllm_runner,
model_info: EmbedModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None,
atol=MTEB_EMBED_TOL,
):
# A model family has many models with the same architecture,
# and we don't need to test each one.
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
@ -187,15 +186,15 @@ def mteb_test_embed_models(hf_runner,
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
if "hf_overrides" not in vllm_extra_kwargs:
vllm_extra_kwargs["hf_overrides"] = {}
vllm_extra_kwargs["hf_overrides"][
"head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,
enforce_eager=True,
**vllm_extra_kwargs) as vllm_model:
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
with vllm_runner(
model_info.name,
runner="pooling",
max_model_len=None,
enforce_eager=True,
**vllm_extra_kwargs,
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
# Confirm whether vllm is using the correct architecture
@ -204,28 +203,29 @@ def mteb_test_embed_models(hf_runner,
# Confirm whether vllm uses the correct default_pooling_type, which
# relates to whether chunked prefill and prefix caching are enabled
assert (model_config._model_info.default_pooling_type ==
model_info.default_pooling_type)
assert (
model_config._model_info.default_pooling_type
== model_info.default_pooling_type
)
vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
MTEB_EMBED_TASKS)
vllm_main_score = run_mteb_embed_task(
VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
)
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
head_dtype = model_config.head_dtype
# Test embed dims, NaN outputs, and whether normalization is applied
vllm_outputs = vllm_model.embed(example_prompts,
truncate_prompt_tokens=-1)
vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
assert not torch.any(torch.isnan(torch.tensor(vllm_outputs)))
# Accelerate mteb test by setting
# SentenceTransformers mteb score to a constant
if model_info.mteb_score is None:
with hf_runner(
model_info.name,
is_sentence_transformer=True,
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
model_info.name,
is_sentence_transformer=True,
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
) as hf_model:
# e.g. setting default parameters for the encode method of hf_runner
if hf_model_callback is not None:
hf_model_callback(hf_model)
@ -247,8 +247,7 @@ def mteb_test_embed_models(hf_runner,
st_dtype = "Constant"
print("Model:", model_info.name)
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}",
vllm_main_score)
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
print("SentenceTransformers:", st_dtype, st_main_score)
print("Difference:", st_main_score - vllm_main_score)
@ -282,26 +281,21 @@ def run_mteb_rerank(cross_encoder, tasks, languages):
top_k=10,
save_predictions=True,
output_folder=f"{results_folder}/stage2",
previous_results=
f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json",
previous_results=f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json",
encode_kwargs={"show_progress_bar": False},
)
main_score = results[0].scores["test"][0]["main_score"]
return main_score
def mteb_test_rerank_models_hf(hf_runner,
model_name,
hf_dtype="float32",
hf_model_callback=None):
with hf_runner(model_name, is_cross_encoder=True,
dtype=hf_dtype) as hf_model:
def mteb_test_rerank_models_hf(
hf_runner, model_name, hf_dtype="float32", hf_model_callback=None
):
with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model:
original_predict = hf_model.predict
def _predict(
sentences: list[tuple[str, str,
Optional[str]]], # query, corpus, prompt
sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt
*args,
**kwargs,
):
@ -315,20 +309,22 @@ def mteb_test_rerank_models_hf(hf_runner,
if hf_model_callback is not None:
hf_model_callback(hf_model)
st_main_score = run_mteb_rerank(hf_model,
tasks=MTEB_RERANK_TASKS,
languages=MTEB_RERANK_LANGS)
st_main_score = run_mteb_rerank(
hf_model, tasks=MTEB_RERANK_TASKS, languages=MTEB_RERANK_LANGS
)
st_dtype = next(hf_model.model.model.parameters()).dtype
return st_main_score, st_dtype
def mteb_test_rerank_models(hf_runner,
vllm_runner,
model_info: RerankModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None,
vllm_mteb_encoder=VllmMtebEncoder,
atol=MTEB_RERANK_TOL):
def mteb_test_rerank_models(
hf_runner,
vllm_runner,
model_info: RerankModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None,
vllm_mteb_encoder=VllmMtebEncoder,
atol=MTEB_RERANK_TOL,
):
# A model family has many models with the same architecture,
# and we don't need to test each one.
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
@ -346,33 +342,37 @@ def mteb_test_rerank_models(hf_runner,
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
if "hf_overrides" not in vllm_extra_kwargs:
vllm_extra_kwargs["hf_overrides"] = {}
vllm_extra_kwargs["hf_overrides"][
"head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,
max_num_seqs=8,
enforce_eager=True,
**vllm_extra_kwargs) as vllm_model:
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
with vllm_runner(
model_info.name,
runner="pooling",
max_model_len=None,
max_num_seqs=8,
enforce_eager=True,
**vllm_extra_kwargs,
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
# Confirm whether vllm is using the correct architecture
if model_info.architecture:
assert (model_info.architecture in model_config.architectures)
assert model_info.architecture in model_config.architectures
# Score API is only enabled for num_labels == 1
assert model_config.hf_config.num_labels == 1
# Confirm whether vllm uses the correct default_pooling_type, which
# relates to whether chunked prefill and prefix caching are enabled
assert (model_config._model_info.default_pooling_type ==
model_info.default_pooling_type)
assert (
model_config._model_info.default_pooling_type
== model_info.default_pooling_type
)
vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
tasks=MTEB_RERANK_TASKS,
languages=MTEB_RERANK_LANGS)
vllm_main_score = run_mteb_rerank(
vllm_mteb_encoder(vllm_model),
tasks=MTEB_RERANK_TASKS,
languages=MTEB_RERANK_LANGS,
)
vllm_dtype = model_config.dtype
head_dtype = model_config.head_dtype
@ -380,14 +380,14 @@ def mteb_test_rerank_models(hf_runner,
# SentenceTransformers mteb score to a constant
if model_info.mteb_score is None:
st_main_score, st_dtype = mteb_test_rerank_models_hf(
hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback)
hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback
)
else:
st_main_score = model_info.mteb_score
st_dtype = "Constant"
print("Model:", model_info.name)
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}",
vllm_main_score)
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
print("SentenceTransformers:", st_dtype, st_main_score)
print("Difference:", st_main_score - vllm_main_score)

View File

@ -2,67 +2,76 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.models.language.pooling.embed_utils import (
correctness_test_embed_models)
from tests.models.utils import (CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo, RerankModelInfo)
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo("BAAI/bge-base-en",
architecture="BertModel",
mteb_score=0.779336792,
enable_test=True),
CLSPoolingEmbedModelInfo("BAAI/bge-base-zh",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-small-en",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-small-zh",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-large-en",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-large-zh",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-large-zh-noinstruct",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-base-en-v1.5",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-base-zh-v1.5",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-small-en-v1.5",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-small-zh-v1.5",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-large-en-v1.5",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-large-zh-v1.5",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo(
"BAAI/bge-base-en",
architecture="BertModel",
mteb_score=0.779336792,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-base-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-en", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-en", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False
),
########## XLMRobertaModel
CLSPoolingEmbedModelInfo("BAAI/bge-m3",
architecture="XLMRobertaModel",
mteb_score=0.787343078,
enable_test=True),
CLSPoolingEmbedModelInfo(
"BAAI/bge-m3",
architecture="XLMRobertaModel",
mteb_score=0.787343078,
enable_test=True,
),
########## Qwen2Model
LASTPoolingEmbedModelInfo("BAAI/bge-code-v1",
architecture="Qwen2Model",
mteb_score=0.75724465,
dtype="float32",
enable_test=True),
LASTPoolingEmbedModelInfo(
"BAAI/bge-code-v1",
architecture="Qwen2Model",
mteb_score=0.75724465,
dtype="float32",
enable_test=True,
),
]
RERANK_MODELS = [
@ -71,33 +80,35 @@ RERANK_MODELS = [
"BAAI/bge-reranker-base",
architecture="XLMRobertaForSequenceClassification",
mteb_score=0.32398,
enable_test=True),
enable_test=True,
),
CLSPoolingRerankModelInfo(
"BAAI/bge-reranker-large",
architecture="XLMRobertaForSequenceClassification",
enable_test=False),
enable_test=False,
),
CLSPoolingRerankModelInfo(
"BAAI/bge-reranker-v2-m3",
architecture="XLMRobertaForSequenceClassification",
enable_test=False)
enable_test=False,
),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
example_prompts)
def test_embed_models_correctness(
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner,
model_info: RerankModelInfo) -> None:
def test_rerank_models_mteb(
hf_runner, vllm_runner, model_info: RerankModelInfo
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)

View File

@ -8,53 +8,50 @@ import torch
from tests.conftest import HfRunner
from tests.models.language.pooling_mteb_test.mteb_utils import (
VllmMtebEncoder, mteb_test_rerank_models)
VllmMtebEncoder,
mteb_test_rerank_models,
)
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
RERANK_MODELS = [
LASTPoolingRerankModelInfo("BAAI/bge-reranker-v2-gemma",
architecture="GemmaForSequenceClassification",
mteb_score=0.33757,
hf_overrides={
"architectures":
["GemmaForSequenceClassification"],
"classifier_from_token": ["Yes"],
"method":
"no_post_processing",
}),
LASTPoolingRerankModelInfo(
"BAAI/bge-reranker-v2-gemma",
architecture="GemmaForSequenceClassification",
mteb_score=0.33757,
hf_overrides={
"architectures": ["GemmaForSequenceClassification"],
"classifier_from_token": ["Yes"],
"method": "no_post_processing",
},
),
]
PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." # noqa: E501
class GemmaRerankerHfRunner(HfRunner):
def __init__(self,
model_name: str,
dtype: str = "auto",
*args: Any,
**kwargs: Any) -> None:
def __init__(
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
) -> None:
from transformers import AutoModelForCausalLM, AutoTokenizer
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
self.tokenizer = AutoTokenizer.from_pretrained(model_name,
padding_side='left')
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
self.yes_loc = self.tokenizer.convert_tokens_to_ids("Yes")
@torch.no_grad()
def predict(self, prompts: list[list[str]], *args,
**kwargs) -> torch.Tensor:
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
def get_inputs(pairs, tokenizer, prompt=None):
if prompt is None:
prompt = PROMPT
sep = "\n"
prompt_inputs = tokenizer(prompt,
return_tensors=None,
add_special_tokens=False)["input_ids"]
sep_inputs = tokenizer(sep,
return_tensors=None,
add_special_tokens=False)["input_ids"]
prompt_inputs = tokenizer(
prompt, return_tensors=None, add_special_tokens=False
)["input_ids"]
sep_inputs = tokenizer(sep, return_tensors=None, add_special_tokens=False)[
"input_ids"
]
inputs = []
for query, passage in pairs:
query_inputs = tokenizer(
@ -78,8 +75,7 @@ class GemmaRerankerHfRunner(HfRunner):
return_token_type_ids=False,
add_special_tokens=False,
)
item["input_ids"] = item[
"input_ids"] + sep_inputs + prompt_inputs
item["input_ids"] = item["input_ids"] + sep_inputs + prompt_inputs
item["attention_mask"] = [1] * len(item["input_ids"])
inputs.append(item)
return tokenizer.pad(
@ -95,14 +91,19 @@ class GemmaRerankerHfRunner(HfRunner):
inputs = inputs.to(self.model.device)
_n_tokens = inputs["input_ids"].shape[1]
logits = self.model(**inputs, return_dict=True).logits
_scores = (logits[:, -1,
self.yes_loc].view(-1, ).float().sigmoid())
_scores = (
logits[:, -1, self.yes_loc]
.view(
-1,
)
.float()
.sigmoid()
)
scores.append(_scores[0].item())
return torch.Tensor(scores)
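The reranker above reads relevance straight off the LM head: the logit of the "Yes" token at the last position, squashed with a sigmoid. The same reduction on a toy tensor:

import torch

yes_loc = 3                                  # vocab id of "Yes" (toy value)
logits = torch.randn(2, 5, 10)               # (batch, seq, vocab)
scores = logits[:, -1, yes_loc].view(-1).float().sigmoid()
assert scores.shape == (2,)
assert bool(((scores >= 0) & (scores <= 1)).all())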
class GemmaMtebEncoder(VllmMtebEncoder):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.query_template = "A: {query}\n"
@ -110,12 +111,10 @@ class GemmaMtebEncoder(VllmMtebEncoder):
def predict(
self,
sentences: list[tuple[str, str,
Optional[str]]], # query, corpus, prompt
sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt
*args,
**kwargs,
) -> np.ndarray:
_sentences = []
for query, corpus, prompt in sentences:
query = self.query_template.format(query=query)
@ -127,8 +126,9 @@ class GemmaMtebEncoder(VllmMtebEncoder):
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(GemmaRerankerHfRunner,
vllm_runner,
model_info,
vllm_mteb_encoder=GemmaMtebEncoder)
mteb_test_rerank_models(
GemmaRerankerHfRunner,
vllm_runner,
model_info,
vllm_mteb_encoder=GemmaMtebEncoder,
)

View File

@ -2,22 +2,30 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.models.utils import (CLSPoolingRerankModelInfo,
LASTPoolingRerankModelInfo, RerankModelInfo)
from tests.models.utils import (
CLSPoolingRerankModelInfo,
LASTPoolingRerankModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_rerank_models
RERANK_MODELS = [
CLSPoolingRerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2",
mteb_score=0.32898,
architecture="BertForSequenceClassification"),
LASTPoolingRerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
mteb_score=0.25736,
architecture="Qwen3ForSequenceClassification")
CLSPoolingRerankModelInfo(
"cross-encoder/ms-marco-TinyBERT-L-2-v2",
mteb_score=0.32898,
architecture="BertForSequenceClassification",
),
LASTPoolingRerankModelInfo(
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
mteb_score=0.25736,
architecture="Qwen3ForSequenceClassification",
),
]
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner,
model_info: RerankModelInfo) -> None:
def test_rerank_models_mteb(
hf_runner, vllm_runner, model_info: RerankModelInfo
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)

View File

@ -3,74 +3,93 @@
import pytest
from tests.models.language.pooling.embed_utils import (
correctness_test_embed_models)
from tests.models.utils import (CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo, RerankModelInfo)
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo("thenlper/gte-large",
mteb_score=0.76807651,
architecture="BertModel",
enable_test=True),
CLSPoolingEmbedModelInfo("thenlper/gte-base",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("thenlper/gte-small",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("thenlper/gte-large-zh",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("thenlper/gte-base-zh",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("thenlper/gte-small-zh",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo(
"thenlper/gte-large",
mteb_score=0.76807651,
architecture="BertModel",
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-base", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-small", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-large-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-base-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-small-zh", architecture="BertModel", enable_test=False
),
########### NewModel
# These three architectures are almost the same, but not exactly the same.
# For example,
# - whether to use token_type_embeddings
# - whether to use context expansion
# So only test one (the most widely used) model
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
architecture="GteNewModel",
mteb_score=0.775074696,
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=True),
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False),
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False),
CLSPoolingEmbedModelInfo(
"Alibaba-NLP/gte-multilingual-base",
architecture="GteNewModel",
mteb_score=0.775074696,
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"Alibaba-NLP/gte-base-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False,
),
CLSPoolingEmbedModelInfo(
"Alibaba-NLP/gte-large-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False,
),
########### Qwen2ForCausalLM
LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
mteb_score=0.758473459018872,
architecture="Qwen2ForCausalLM",
enable_test=True),
LASTPoolingEmbedModelInfo(
"Alibaba-NLP/gte-Qwen2-1.5B-instruct",
mteb_score=0.758473459018872,
architecture="Qwen2ForCausalLM",
enable_test=True,
),
########## ModernBertModel
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
mteb_score=0.748193353,
architecture="ModernBertModel",
enable_test=True),
CLSPoolingEmbedModelInfo(
"Alibaba-NLP/gte-modernbert-base",
mteb_score=0.748193353,
architecture="ModernBertModel",
enable_test=True,
),
########## Qwen3ForCausalLM
LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-0.6B",
mteb_score=0.771163695,
architecture="Qwen3ForCausalLM",
dtype="float32",
enable_test=True),
LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-4B",
architecture="Qwen3ForCausalLM",
dtype="float32",
enable_test=False),
LASTPoolingEmbedModelInfo(
"Qwen/Qwen3-Embedding-0.6B",
mteb_score=0.771163695,
architecture="Qwen3ForCausalLM",
dtype="float32",
enable_test=True,
),
LASTPoolingEmbedModelInfo(
"Qwen/Qwen3-Embedding-4B",
architecture="Qwen3ForCausalLM",
dtype="float32",
enable_test=False,
),
]
RERANK_MODELS = [
@ -79,31 +98,32 @@ RERANK_MODELS = [
"Alibaba-NLP/gte-reranker-modernbert-base",
mteb_score=0.33386,
architecture="ModernBertForSequenceClassification",
enable_test=True),
enable_test=True,
),
CLSPoolingRerankModelInfo(
"Alibaba-NLP/gte-multilingual-reranker-base",
mteb_score=0.33062,
architecture="GteNewForSequenceClassification",
hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
enable_test=True),
enable_test=True,
),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
example_prompts)
def test_embed_models_correctness(
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner,
model_info: RerankModelInfo) -> None:
def test_rerank_models_mteb(
hf_runner, vllm_runner, model_info: RerankModelInfo
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)

View File

@ -2,50 +2,55 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.models.language.pooling.embed_utils import (
correctness_test_embed_models)
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo("intfloat/e5-small",
architecture="BertModel",
mteb_score=0.742285423,
enable_test=True),
CLSPoolingEmbedModelInfo("intfloat/e5-base",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("intfloat/e5-large",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-small",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo(
"intfloat/e5-small",
architecture="BertModel",
mteb_score=0.742285423,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"intfloat/e5-base", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"intfloat/e5-large", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False
),
########## XLMRobertaModel
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-base",
architecture="XLMRobertaModel",
mteb_score=0.779325955,
enable_test=True),
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large",
architecture="XLMRobertaModel",
enable_test=False),
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large-instruct",
architecture="XLMRobertaModel",
enable_test=False),
CLSPoolingEmbedModelInfo(
"intfloat/multilingual-e5-base",
architecture="XLMRobertaModel",
mteb_score=0.779325955,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"intfloat/multilingual-e5-large",
architecture="XLMRobertaModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
"intfloat/multilingual-e5-large-instruct",
architecture="XLMRobertaModel",
enable_test=False,
),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
example_prompts)
def test_embed_models_correctness(
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)

View File

@ -5,60 +5,68 @@ from functools import partial
import pytest
from tests.models.language.pooling.embed_utils import (
check_embeddings_close, correctness_test_embed_models, matryoshka_fy)
from tests.models.utils import (CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo, EmbedModelInfo,
RerankModelInfo)
check_embeddings_close,
correctness_test_embed_models,
matryoshka_fy,
)
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo,
RerankModelInfo,
)
from vllm import PoolingParams
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
EMBEDDING_MODELS = [
CLSPoolingEmbedModelInfo("jinaai/jina-embeddings-v3",
mteb_score=0.824413164,
architecture="XLMRobertaModel",
is_matryoshka=True)
CLSPoolingEmbedModelInfo(
"jinaai/jina-embeddings-v3",
mteb_score=0.824413164,
architecture="XLMRobertaModel",
is_matryoshka=True,
)
]
RERANK_MODELS = [
CLSPoolingRerankModelInfo(
"jinaai/jina-reranker-v2-base-multilingual",
mteb_score=0.33643,
architecture="XLMRobertaForSequenceClassification")
architecture="XLMRobertaForSequenceClassification",
)
]
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
def hf_model_callback(model):
model.encode = partial(model.encode, task="text-matching")
mteb_test_embed_models(hf_runner,
vllm_runner,
model_info,
hf_model_callback=hf_model_callback)
mteb_test_embed_models(
hf_runner, vllm_runner, model_info, hf_model_callback=hf_model_callback
)
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
def test_embed_models_correctness(
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
) -> None:
def hf_model_callback(model):
model.encode = partial(model.encode, task="text-matching")
correctness_test_embed_models(hf_runner,
vllm_runner,
model_info,
example_prompts,
hf_model_callback=hf_model_callback)
correctness_test_embed_models(
hf_runner,
vllm_runner,
model_info,
example_prompts,
hf_model_callback=hf_model_callback,
)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner,
model_info: RerankModelInfo) -> None:
def test_rerank_models_mteb(
hf_runner, vllm_runner, model_info: RerankModelInfo
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
@ -81,32 +89,32 @@ def test_matryoshka(
example_prompts = [str(s).strip() for s in example_prompts]
with hf_runner(
model_info.name,
dtype=dtype,
is_sentence_transformer=True,
model_info.name,
dtype=dtype,
is_sentence_transformer=True,
) as hf_model:
hf_outputs = hf_model.encode(example_prompts, task="text-matching")
hf_outputs = matryoshka_fy(hf_outputs, dimensions)
with vllm_runner(model_info.name,
runner="pooling",
dtype=dtype,
max_model_len=None) as vllm_model:
with vllm_runner(
model_info.name, runner="pooling", dtype=dtype, max_model_len=None
) as vllm_model:
assert vllm_model.llm.llm_engine.model_config.is_matryoshka
matryoshka_dimensions = (
vllm_model.llm.llm_engine.model_config.matryoshka_dimensions)
vllm_model.llm.llm_engine.model_config.matryoshka_dimensions
)
assert matryoshka_dimensions is not None
if dimensions not in matryoshka_dimensions:
with pytest.raises(ValueError):
vllm_model.embed(
example_prompts,
pooling_params=PoolingParams(dimensions=dimensions))
example_prompts, pooling_params=PoolingParams(dimensions=dimensions)
)
else:
vllm_outputs = vllm_model.embed(
example_prompts,
pooling_params=PoolingParams(dimensions=dimensions))
example_prompts, pooling_params=PoolingParams(dimensions=dimensions)
)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
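matryoshka_fy mirrors the Matryoshka property under test: the first k dimensions of an embedding, re-normalized, are themselves a valid embedding. A sketch of that operation, assuming matryoshka_fy does essentially this:

import torch
import torch.nn.functional as F

def matryoshka_fy_sketch(embeddings: torch.Tensor, dimensions: int) -> torch.Tensor:
    return F.normalize(embeddings[..., :dimensions], p=2, dim=-1)

full = F.normalize(torch.randn(4, 1024), dim=-1)
small = matryoshka_fy_sketch(full, 256)
assert small.shape == (4, 256)
assert torch.allclose(small.norm(dim=-1), torch.ones(4), atol=1e-5)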

View File

@ -17,46 +17,45 @@ mxbai_rerank_hf_overrides = {
}
RERANK_MODELS = [
LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,
mteb_score=0.273,
enable_test=True),
LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,
enable_test=False)
LASTPoolingRerankModelInfo(
"mixedbread-ai/mxbai-rerank-base-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,
mteb_score=0.273,
enable_test=True,
),
LASTPoolingRerankModelInfo(
"mixedbread-ai/mxbai-rerank-large-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,
enable_test=False,
),
]
class MxbaiRerankerHfRunner(HfRunner):
def __init__(self,
model_name: str,
dtype: str = "auto",
*args: Any,
**kwargs: Any) -> None:
def __init__(
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
) -> None:
from transformers import AutoModelForCausalLM, AutoTokenizer
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
self.tokenizer = AutoTokenizer.from_pretrained(model_name,
padding_side='left')
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
self.yes_loc = self.tokenizer.convert_tokens_to_ids("1")
self.no_loc = self.tokenizer.convert_tokens_to_ids("0")
def predict(self, prompts: list[list[str]], *args,
**kwargs) -> torch.Tensor:
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
def process_inputs(pairs):
inputs = self.tokenizer(pairs,
padding=False,
truncation='longest_first',
return_attention_mask=False)
for i, ele in enumerate(inputs['input_ids']):
inputs['input_ids'][i] = ele
inputs = self.tokenizer.pad(inputs,
padding=True,
return_tensors="pt")
inputs = self.tokenizer(
pairs,
padding=False,
truncation="longest_first",
return_attention_mask=False,
)
for i, ele in enumerate(inputs["input_ids"]):
inputs["input_ids"][i] = ele
inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt")
for key in inputs:
inputs[key] = inputs[key].to(self.model.device)
return inputs

View File

@ -3,39 +3,42 @@
import pytest
from tests.models.language.pooling.embed_utils import (
correctness_test_embed_models)
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
MODELS = [
CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1",
architecture="NomicBertModel",
mteb_score=0.737568559,
enable_test=True),
CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1.5",
architecture="NomicBertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("nomic-ai/CodeRankEmbed",
architecture="NomicBertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe",
architecture="NomicBertModel",
mteb_score=0.715488912,
enable_test=True)
CLSPoolingEmbedModelInfo(
"nomic-ai/nomic-embed-text-v1",
architecture="NomicBertModel",
mteb_score=0.737568559,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"nomic-ai/nomic-embed-text-v1.5",
architecture="NomicBertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
"nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"nomic-ai/nomic-embed-text-v2-moe",
architecture="NomicBertModel",
mteb_score=0.715488912,
enable_test=True,
),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
example_prompts)
def test_embed_models_correctness(
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)

View File

@ -18,46 +18,45 @@ qwen3_reranker_hf_overrides = {
}
RERANK_MODELS = [
LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-0.6B",
architecture="Qwen3ForSequenceClassification",
mteb_score=0.25736,
hf_overrides=qwen3_reranker_hf_overrides,
enable_test=True),
LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-4B",
architecture="Qwen3ForSequenceClassification",
hf_overrides=qwen3_reranker_hf_overrides,
enable_test=False)
LASTPoolingRerankModelInfo(
"Qwen/Qwen3-Reranker-0.6B",
architecture="Qwen3ForSequenceClassification",
mteb_score=0.25736,
hf_overrides=qwen3_reranker_hf_overrides,
enable_test=True,
),
LASTPoolingRerankModelInfo(
"Qwen/Qwen3-Reranker-4B",
architecture="Qwen3ForSequenceClassification",
hf_overrides=qwen3_reranker_hf_overrides,
enable_test=False,
),
]
class Qwen3RerankerHfRunner(HfRunner):
def __init__(self,
model_name: str,
dtype: str = "auto",
*args: Any,
**kwargs: Any) -> None:
def __init__(
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
) -> None:
from transformers import AutoModelForCausalLM, AutoTokenizer
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
self.tokenizer = AutoTokenizer.from_pretrained(model_name,
padding_side='left')
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
def predict(self, prompts: list[list[str]], *args,
**kwargs) -> torch.Tensor:
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
def process_inputs(pairs):
inputs = self.tokenizer(pairs,
padding=False,
truncation='longest_first',
return_attention_mask=False)
for i, ele in enumerate(inputs['input_ids']):
inputs['input_ids'][i] = ele
inputs = self.tokenizer.pad(inputs,
padding=True,
return_tensors="pt")
inputs = self.tokenizer(
pairs,
padding=False,
truncation="longest_first",
return_attention_mask=False,
)
for i, ele in enumerate(inputs["input_ids"]):
inputs["input_ids"][i] = ele
inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt")
for key in inputs:
inputs[key] = inputs[key].to(self.model.device)
return inputs
@ -82,20 +81,18 @@ class Qwen3RerankerHfRunner(HfRunner):
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
@multi_gpu_test(num_gpus=2)
def test_rerank_models_mteb_tp(vllm_runner,
model_info: RerankModelInfo) -> None:
def test_rerank_models_mteb_tp(vllm_runner, model_info: RerankModelInfo) -> None:
assert model_info.architecture == "Qwen3ForSequenceClassification"
vllm_extra_kwargs: dict[str, Any] = {
"tensor_parallel_size": 2,
}
mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info,
vllm_extra_kwargs)
mteb_test_rerank_models(
Qwen3RerankerHfRunner, vllm_runner, model_info, vllm_extra_kwargs
)

View File

@ -3,62 +3,75 @@
import pytest
from tests.models.language.pooling.embed_utils import (
correctness_test_embed_models)
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
MODELS = [
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-xs",
is_matryoshka=False,
architecture="BertModel",
mteb_score=0.714927797,
enable_test=True),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-s",
is_matryoshka=False,
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m",
is_matryoshka=False,
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long",
is_matryoshka=False,
architecture="NomicBertModel",
mteb_score=0.681146831,
enable_test=True),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l",
is_matryoshka=False,
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5",
is_matryoshka=True,
architecture="BertModel",
mteb_score=0.649088363,
enable_test=True),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l-v2.0",
is_matryoshka=True,
architecture="XLMRobertaModel",
mteb_score=0.712258299,
enable_test=True),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
is_matryoshka=True,
architecture="GteModel",
mteb_score=0.706622444,
enable_test=True),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-xs",
is_matryoshka=False,
architecture="BertModel",
mteb_score=0.714927797,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-s",
is_matryoshka=False,
architecture="BertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m",
is_matryoshka=False,
architecture="BertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-long",
is_matryoshka=False,
architecture="NomicBertModel",
mteb_score=0.681146831,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-l",
is_matryoshka=False,
architecture="BertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-v1.5",
is_matryoshka=True,
architecture="BertModel",
mteb_score=0.649088363,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-l-v2.0",
is_matryoshka=True,
architecture="XLMRobertaModel",
mteb_score=0.712258299,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-v2.0",
is_matryoshka=True,
architecture="GteModel",
mteb_score=0.706622444,
enable_test=True,
),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
example_prompts)
def test_embed_models_correctness(
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)

View File

@ -2,8 +2,11 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.models.utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo)
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
)
from .mteb_utils import mteb_test_embed_models
@ -15,15 +18,15 @@ ST_PROJECTOR_MODELS = [
mteb_score=0.688611955,
enable_test=True,
),
LASTPoolingEmbedModelInfo("google/embeddinggemma-300m",
architecture="Gemma3TextModel",
mteb_score=0.7473819294684156,
enable_test=True)
LASTPoolingEmbedModelInfo(
"google/embeddinggemma-300m",
architecture="Gemma3TextModel",
mteb_score=0.7473819294684156,
enable_test=True,
),
]
@pytest.mark.parametrize("model_info", ST_PROJECTOR_MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info)

View File

@ -3,27 +3,40 @@
"""Common tests for testing .generate() functionality for single / multiple
image, embedding, and video support for different VLMs in vLLM.
"""
import math
import os
from collections import defaultdict
from pathlib import PosixPath
import pytest
from transformers import (AutoModel, AutoModelForImageTextToText,
AutoModelForTextToWaveform)
from transformers import (
AutoModel,
AutoModelForImageTextToText,
AutoModelForTextToWaveform,
)
from vllm.platforms import current_platform
from vllm.utils import identity
from ....conftest import (IMAGE_ASSETS, AudioTestAssets, HfRunner,
ImageTestAssets, VideoTestAssets, VllmRunner)
from ....utils import (create_new_process_for_each_test, large_gpu_mark,
multi_gpu_marks)
from ....conftest import (
IMAGE_ASSETS,
AudioTestAssets,
HfRunner,
ImageTestAssets,
VideoTestAssets,
VllmRunner,
)
from ....utils import create_new_process_for_each_test, large_gpu_mark, multi_gpu_marks
from ...utils import check_outputs_equal
from .vlm_utils import custom_inputs, model_utils, runners
from .vlm_utils.case_filtering import get_parametrized_options
from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
VLMTestInfo, VLMTestType)
from .vlm_utils.types import (
CustomTestOptions,
ExpandableVLMTestArgs,
VLMTestInfo,
VLMTestType,
)
# This hack is needed for phi3v & paligemma models
# ROCm Triton FA can run into shared memory issues with these models,
@ -828,7 +841,7 @@ def _mark_splits(
new_test_settings = dict[str, VLMTestInfo]()
for i in range(num_groups):
models_in_group = models[i * split_size:(i + 1) * split_size]
models_in_group = models[i * split_size : (i + 1) * split_size]
for model in models_in_group:
for info in test_infos_by_model[model]:
@ -859,7 +872,8 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
VLM_TEST_SETTINGS,
test_type=VLMTestType.IMAGE,
create_new_process_for_each_test=False,
))
),
)
def test_single_image_models(
tmp_path: PosixPath,
model_type: str,
@ -885,7 +899,8 @@ def test_single_image_models(
VLM_TEST_SETTINGS,
test_type=VLMTestType.MULTI_IMAGE,
create_new_process_for_each_test=False,
))
),
)
def test_multi_image_models(
tmp_path: PosixPath,
model_type: str,
@ -911,7 +926,8 @@ def test_multi_image_models(
VLM_TEST_SETTINGS,
test_type=VLMTestType.EMBEDDING,
create_new_process_for_each_test=False,
))
),
)
def test_image_embedding_models(
model_type: str,
test_case: ExpandableVLMTestArgs,
@ -935,7 +951,8 @@ def test_image_embedding_models(
VLM_TEST_SETTINGS,
test_type=VLMTestType.VIDEO,
create_new_process_for_each_test=False,
))
),
)
def test_video_models(
model_type: str,
test_case: ExpandableVLMTestArgs,
@ -959,7 +976,8 @@ def test_video_models(
VLM_TEST_SETTINGS,
test_type=VLMTestType.AUDIO,
create_new_process_for_each_test=False,
))
),
)
def test_audio_models(
model_type: str,
test_case: ExpandableVLMTestArgs,
@ -983,7 +1001,8 @@ def test_audio_models(
VLM_TEST_SETTINGS,
test_type=VLMTestType.CUSTOM_INPUTS,
create_new_process_for_each_test=False,
))
),
)
def test_custom_inputs_models(
model_type: str,
test_case: ExpandableVLMTestArgs,
@ -1006,7 +1025,8 @@ def test_custom_inputs_models(
VLM_TEST_SETTINGS,
test_type=VLMTestType.IMAGE,
create_new_process_for_each_test=True,
))
),
)
@create_new_process_for_each_test()
def test_single_image_models_heavy(
tmp_path: PosixPath,
@ -1033,7 +1053,8 @@ def test_single_image_models_heavy(
VLM_TEST_SETTINGS,
test_type=VLMTestType.MULTI_IMAGE,
create_new_process_for_each_test=True,
))
),
)
@create_new_process_for_each_test()
def test_multi_image_models_heavy(
tmp_path: PosixPath,
@ -1060,7 +1081,8 @@ def test_multi_image_models_heavy(
VLM_TEST_SETTINGS,
test_type=VLMTestType.EMBEDDING,
create_new_process_for_each_test=True,
))
),
)
@create_new_process_for_each_test()
def test_image_embedding_models_heavy(
model_type: str,
@ -1085,7 +1107,8 @@ def test_image_embedding_models_heavy(
VLM_TEST_SETTINGS,
test_type=VLMTestType.VIDEO,
create_new_process_for_each_test=True,
))
),
)
def test_video_models_heavy(
model_type: str,
test_case: ExpandableVLMTestArgs,
@ -1109,7 +1132,8 @@ def test_video_models_heavy(
VLM_TEST_SETTINGS,
test_type=VLMTestType.AUDIO,
create_new_process_for_each_test=True,
))
),
)
def test_audio_models_heavy(
model_type: str,
test_case: ExpandableVLMTestArgs,
@ -1133,7 +1157,8 @@ def test_audio_models_heavy(
VLM_TEST_SETTINGS,
test_type=VLMTestType.CUSTOM_INPUTS,
create_new_process_for_each_test=True,
))
),
)
@create_new_process_for_each_test()
def test_custom_inputs_models_heavy(
model_type: str,

View File

@ -10,8 +10,7 @@ from transformers import AutoModelForSpeechSeq2Seq
from vllm.logprobs import SampleLogprobs
from vllm.lora.request import LoRARequest
from ....conftest import (AudioTestAssets, HfRunner, PromptAudioInput,
VllmRunner)
from ....conftest import AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner
from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close
@ -64,50 +63,49 @@ def run_test(
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with vllm_runner(
model,
runner="generate",
max_model_len=max_model_len,
max_num_seqs=1,
dtype=dtype,
limit_mm_per_prompt={"audio": 1},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=64,
enforce_eager=True,
model,
runner="generate",
max_model_len=max_model_len,
max_num_seqs=1,
dtype=dtype,
limit_mm_per_prompt={"audio": 1},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=64,
enforce_eager=True,
) as vllm_model:
lora_request = LoRARequest("audio", 1, audio_lora_path)
vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=audios,
lora_request=lora_request)
vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=audios,
lora_request=lora_request,
)
for prompts, audios in inputs
]
with hf_runner(model, dtype=dtype,
auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
hf_processor = hf_model.processor
eos_token_id = hf_processor.tokenizer.eos_token_id
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=[audios],
eos_token_id=eos_token_id)
hf_model.generate_greedy_logprobs_limit(
prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=[audios],
eos_token_id=eos_token_id,
)
for prompts, audios in inputs
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
vllm_outputs_per_case):
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=[
vllm_to_hf_output(output) for output in vllm_outputs
],
outputs_1_lst=[vllm_to_hf_output(output) for output in vllm_outputs],
name_0="hf",
name_1="vllm",
)
@ -118,9 +116,16 @@ def run_test(
@pytest.mark.parametrize("max_model_len", [2048])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, model: str,
audio_assets: AudioTestAssets, dtype: str, max_model_len: int,
max_tokens: int, num_logprobs: int) -> None:
def test_models(
hf_runner,
vllm_runner,
model: str,
audio_assets: AudioTestAssets,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")

View File

@ -28,8 +28,7 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
give the same result.
"""
image_cherry = convert_image_mode(
ImageAsset("cherry_blossom").pil_image, "RGB")
image_cherry = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
image_stop = convert_image_mode(ImageAsset("stop_sign").pil_image, "RGB")
images = [image_cherry, image_stop]
video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays
@ -47,29 +46,30 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
),
]
with vllm_runner(model,
runner="generate",
dtype=dtype,
limit_mm_per_prompt={"image": 2},
max_model_len=32768,
max_num_seqs=2,
tensor_parallel_size=1,
enforce_eager=True) as vllm_model:
with vllm_runner(
model,
runner="generate",
dtype=dtype,
limit_mm_per_prompt={"image": 2},
max_model_len=32768,
max_num_seqs=2,
tensor_parallel_size=1,
enforce_eager=True,
) as vllm_model:
vllm_outputs_per_case = [
vllm_model.generate_greedy(prompts,
max_tokens,
images=images,
videos=videos)
vllm_model.generate_greedy(
prompts, max_tokens, images=images, videos=videos
)
for prompts, images, videos in inputs
]
all_results = [output[0][1] for output in vllm_outputs_per_case]
outputs = [(total_str, total_str.find("assistant\n") + len("assistant\n"))
for total_str in all_results]
prompt_lengths = [prompt_len for _, prompt_len in outputs]
generated_strs = [
total_str[prompt_len:] for total_str, prompt_len in outputs
outputs = [
(total_str, total_str.find("assistant\n") + len("assistant\n"))
for total_str in all_results
]
prompt_lengths = [prompt_len for _, prompt_len in outputs]
generated_strs = [total_str[prompt_len:] for total_str, prompt_len in outputs]
interleaved_prompt_len, noninterleaved_prompt_len = prompt_lengths
interleaved_output_str, noninterleaved_output_str = generated_strs
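The slicing above relies on the chat template emitting a literal "assistant\n" marker before the response; a tiny self-contained check of that split (made-up string):

total_str = "system: ...\nuser: ...\nassistant\nA cherry blossom tree in bloom."
prompt_len = total_str.find("assistant\n") + len("assistant\n")
assert total_str[prompt_len:] == "A cherry blossom tree in bloom."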

View File

@ -18,13 +18,11 @@ from typing import Any
import pytest
import torch
from safetensors.torch import save_file
from transformers import (AutoConfig, AutoProcessor, AutoTokenizer,
GenerationConfig)
from transformers import AutoConfig, AutoProcessor, AutoTokenizer, GenerationConfig
from vllm import LLM, SamplingParams
from vllm.v1.executor.abstract import Executor
from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
FullAttentionSpec)
from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, FullAttentionSpec
from ....utils import multi_gpu_test
@ -93,8 +91,7 @@ def get_rope_layers_config(model_path: str) -> list[int]:
def create_reduced_maverick_model(
original_model_name:
str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
original_model_name: str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
output_dir: str = "/tmp/reduced_maverick",
text_layers: int = 4,
num_experts: int = 4,
@ -118,7 +115,8 @@ def create_reduced_maverick_model(
print(
f"Creating reduced Maverick model with {text_layers} text layers and "
f"{vision_layers} vision layers...")
f"{vision_layers} vision layers..."
)
# Create output directory
output_path = Path(output_dir)
@ -126,19 +124,23 @@ def create_reduced_maverick_model(
if force_recreate:
shutil.rmtree(output_path)
else:
print(f"Output directory {output_dir} already exists. "
"Use --force-recreate to overwrite.")
print(
f"Output directory {output_dir} already exists. "
"Use --force-recreate to overwrite."
)
return str(output_path)
output_path.mkdir(parents=True, exist_ok=True)
try:
print("Loading original model configuration...")
original_config = AutoConfig.from_pretrained(original_model_name,
trust_remote_code=True)
original_config = AutoConfig.from_pretrained(
original_model_name, trust_remote_code=True
)
print("Creating reduced configuration...")
reduced_config = create_reduced_config(original_config, text_layers,
num_experts, vision_layers)
reduced_config = create_reduced_config(
original_config, text_layers, num_experts, vision_layers
)
config_path = output_path / "config.json"
with open(config_path, "w") as f:
@ -149,8 +151,7 @@ def create_reduced_maverick_model(
copy_tokenizer_files(original_model_name, output_path)
print("Creating reduced safetensors files...")
create_reduced_safetensors(original_config, reduced_config,
output_path)
create_reduced_safetensors(original_config, reduced_config, output_path)
print("Creating preprocessor config...")
create_preprocessor_config(original_config, output_path)
@ -173,9 +174,9 @@ def create_reduced_maverick_model(
raise
def create_reduced_config(original_config: Any, text_layers: int,
num_experts: int,
vision_layers: int) -> dict[str, Any]:
def create_reduced_config(
original_config: Any, text_layers: int, num_experts: int, vision_layers: int
) -> dict[str, Any]:
"""Create a reduced configuration based on the original."""
# Convert config to dictionary
@ -185,23 +186,18 @@ def create_reduced_config(original_config: Any, text_layers: int,
if "text_config" in config_dict:
original_text_layers = config_dict["text_config"]["num_hidden_layers"]
config_dict["text_config"]["num_hidden_layers"] = text_layers
print(
f"Reduced text layers from {original_text_layers} to {text_layers}"
)
print(f"Reduced text layers from {original_text_layers} to {text_layers}")
original_num_experts = config_dict["text_config"]["num_local_experts"]
config_dict["text_config"]["num_local_experts"] = num_experts
print(
f"Reduced num experts from {original_num_experts} to {num_experts}"
)
print(f"Reduced num experts from {original_num_experts} to {num_experts}")
hidden_dim_divisor = 4
original_hidden_size = config_dict["text_config"]["hidden_size"]
new_hidden_size = original_hidden_size // hidden_dim_divisor
config_dict["text_config"]["hidden_size"] = new_hidden_size
print(f"Reduced hidden size from {original_hidden_size} to "
f"{new_hidden_size}")
print(f"Reduced hidden size from {original_hidden_size} to {new_hidden_size}")
original_head_dim = config_dict["text_config"]["head_dim"]
new_head_dim = original_head_dim // hidden_dim_divisor
@ -210,15 +206,12 @@ def create_reduced_config(original_config: Any, text_layers: int,
# Reduce vision layers
if "vision_config" in config_dict:
original_vision_layers = config_dict["vision_config"][
"num_hidden_layers"]
original_vision_layers = config_dict["vision_config"]["num_hidden_layers"]
config_dict["vision_config"]["num_hidden_layers"] = vision_layers
print(f"Reduced vision layers from {original_vision_layers} "
f"to {vision_layers}")
print(f"Reduced vision layers from {original_vision_layers} to {vision_layers}")
# Update model name to indicate it's a reduced version
config_dict["_name_or_path"] = (
f"reduced_maverick_{text_layers}t_{vision_layers}v")
config_dict["_name_or_path"] = f"reduced_maverick_{text_layers}t_{vision_layers}v"
return config_dict
@ -227,16 +220,16 @@ def copy_tokenizer_files(original_model_name: str, output_path: Path) -> None:
"""Copy tokenizer files from the original model."""
try:
tokenizer = AutoTokenizer.from_pretrained(original_model_name,
trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(
original_model_name, trust_remote_code=True
)
tokenizer.save_pretrained(output_path)
print("Tokenizer files copied successfully")
except Exception as e:
print(f"Warning: Could not copy tokenizer files: {e}")
def create_preprocessor_config(original_config: Any,
output_path: Path) -> None:
def create_preprocessor_config(original_config: Any, output_path: Path) -> None:
"""Create preprocessor_config.json for multimodal model."""
# Try to load the original preprocessor config
@ -254,9 +247,9 @@ def create_preprocessor_config(original_config: Any,
raise
def create_reduced_safetensors(original_config: Any, reduced_config: dict[str,
Any],
output_path: Path) -> None:
def create_reduced_safetensors(
original_config: Any, reduced_config: dict[str, Any], output_path: Path
) -> None:
"""Create safetensors files with weights for the reduced model."""
print("Generating synthetic weights for reduced model...")
@ -279,8 +272,7 @@ def create_reduced_safetensors(original_config: Any, reduced_config: dict[str,
save_weights_to_safetensors(weights, output_path)
def create_text_model_weights(
text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
def create_text_model_weights(text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
"""Create synthetic weights for the text model with MoE structure."""
weights = {}
@ -291,19 +283,18 @@ def create_text_model_weights(
intermediate_size_mlp = text_config["intermediate_size_mlp"]
num_layers = text_config["num_hidden_layers"]
num_attention_heads = text_config["num_attention_heads"]
num_key_value_heads = text_config.get("num_key_value_heads",
num_attention_heads)
num_key_value_heads = text_config.get("num_key_value_heads", num_attention_heads)
# MoE specific parameters
num_experts = text_config.get("num_local_experts")
assert (num_experts
is not None), "num_local_experts must be specified for MoE"
assert num_experts is not None, "num_local_experts must be specified for MoE"
head_dim = hidden_size // num_attention_heads
# Embedding layers
weights["language_model.model.embed_tokens.weight"] = torch.randn(
vocab_size, hidden_size, dtype=torch.float16)
vocab_size, hidden_size, dtype=torch.float16
)
# Transformer layers
for layer_idx in range(num_layers):
@ -312,95 +303,105 @@ def create_text_model_weights(
# Self-attention weights (separate q, k, v projections)
weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16)
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16)
hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16)
num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16)
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16
)
print("Self-attention weights created.")
# Feed-forward weights - MoE pattern based on interleave_moe_layer_step
# For interleave_moe_layer_step=2: layers 1,3,5,... are MoE, layers
# 0,2,4,... are dense
interleave_step = text_config.get("interleave_moe_layer_step", 1)
is_moe_layer = (interleave_step > 0
and (layer_idx + 1) % interleave_step == 0)
is_moe_layer = interleave_step > 0 and (layer_idx + 1) % interleave_step == 0
if is_moe_layer:
# MoE layer structure
# 1. Router weights
weights[
f"{layer_prefix}.feed_forward.router.weight"] = torch.randn(
num_experts, hidden_size, dtype=torch.float16)
weights[f"{layer_prefix}.feed_forward.router.weight"] = torch.randn(
num_experts, hidden_size, dtype=torch.float16
)
# 2. Individual expert weights (not fused)
for expert_idx in range(num_experts):
expert_prefix = (
f"{layer_prefix}.feed_forward.experts.{expert_idx}")
expert_prefix = f"{layer_prefix}.feed_forward.experts.{expert_idx}"
weights[f"{expert_prefix}.gate_proj.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16)
intermediate_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{expert_prefix}.up_proj.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16)
intermediate_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{expert_prefix}.down_proj.weight"] = torch.randn(
hidden_size, intermediate_size, dtype=torch.bfloat16)
hidden_size, intermediate_size, dtype=torch.bfloat16
)
# Expert weight scales (FP8 quantization)
weights[
f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
intermediate_size, 1, dtype=torch.bfloat16)
weights[f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
intermediate_size, 1, dtype=torch.bfloat16
)
weights[f"{expert_prefix}.up_proj.weight_scale"] = torch.ones(
intermediate_size, 1, dtype=torch.bfloat16)
weights[
f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
hidden_size, 1, dtype=torch.bfloat16)
intermediate_size, 1, dtype=torch.bfloat16
)
weights[f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
hidden_size, 1, dtype=torch.bfloat16
)
# 3. Shared expert weights
shared_expert_prefix = f"{layer_prefix}.feed_forward.shared_expert"
weights[f"{shared_expert_prefix}.gate_proj.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16)
intermediate_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{shared_expert_prefix}.up_proj.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16)
intermediate_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{shared_expert_prefix}.down_proj.weight"] = torch.randn(
hidden_size, intermediate_size, dtype=torch.bfloat16)
hidden_size, intermediate_size, dtype=torch.bfloat16
)
print(f"MoE feed-forward weights created for layer {layer_idx}.")
else:
# Dense layer structure
weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = (
torch.randn(intermediate_size_mlp,
hidden_size,
dtype=torch.bfloat16))
weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = (
torch.randn(intermediate_size_mlp,
hidden_size,
dtype=torch.bfloat16))
weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = (
torch.randn(hidden_size,
intermediate_size_mlp,
dtype=torch.bfloat16))
weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = torch.randn(
intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = torch.randn(
intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = torch.randn(
hidden_size, intermediate_size_mlp, dtype=torch.bfloat16
)
print(f"Dense feed-forward weights created for layer {layer_idx}.")
# Layer norms
weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16)
weights[
f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16
)
print("Layer norms created.")
# Final layer norm and output projection
weights["language_model.model.norm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights["language_model.lm_head.weight"] = torch.randn(
vocab_size, hidden_size, dtype=torch.bfloat16)
vocab_size, hidden_size, dtype=torch.bfloat16
)
return weights
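A worked example of the interleave rule used above: with `interleave_moe_layer_step = 2`, layers 1, 3, ... receive MoE weights and layers 0, 2, ... stay dense.

interleave_step = 2
num_layers = 4
moe_layers = [
    i for i in range(num_layers)
    if interleave_step > 0 and (i + 1) % interleave_step == 0
]
assert moe_layers == [1, 3]  # matches the comment in the loop above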
def create_vision_model_weights(
vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
vision_config: dict[str, Any],
) -> dict[str, torch.Tensor]:
"""Create synthetic weights for the vision model."""
weights = {}
@ -414,47 +415,62 @@ def create_vision_model_weights(
layer_prefix = f"vision_model.model.layers.{layer_idx}"
weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
hidden_size, hidden_size, dtype=torch.bfloat16)
hidden_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.q_proj.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
hidden_size, hidden_size, dtype=torch.bfloat16)
hidden_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.k_proj.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
hidden_size, hidden_size, dtype=torch.bfloat16)
hidden_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.v_proj.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
hidden_size, hidden_size, dtype=torch.bfloat16)
hidden_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.o_proj.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.mlp.fc1.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16)
intermediate_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.mlp.fc1.bias"] = torch.zeros(
intermediate_size, dtype=torch.bfloat16)
intermediate_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.mlp.fc2.weight"] = torch.randn(
hidden_size, intermediate_size, dtype=torch.bfloat16)
hidden_size, intermediate_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.mlp.fc2.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.input_layernorm.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
weights[
f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.post_attention_layernorm.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
return weights
def create_shared_weights(
text_config: dict[str, Any],
vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
text_config: dict[str, Any], vision_config: dict[str, Any]
) -> dict[str, torch.Tensor]:
"""Create weights for shared components (vision-language connector)"""
weights = {}
@ -464,13 +480,15 @@ def create_shared_weights(
# Vision-language connector (projects vision features to text space)
weights["multi_modal_projector.linear_1.weight"] = torch.randn(
text_hidden_size, projector_input_dim, dtype=torch.bfloat16)
text_hidden_size, projector_input_dim, dtype=torch.bfloat16
)
return weights
def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
output_path: Path) -> None:
def save_weights_to_safetensors(
weights: dict[str, torch.Tensor], output_path: Path
) -> None:
"""Save weights to safetensors files and create index."""
# Determine how to shard the weights
@ -507,18 +525,18 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
else:
# Multiple shards
for i, shard in enumerate(shards):
filename = f"model-{i+1:05d}-of-{len(shards):05d}.safetensors"
filename = f"model-{i + 1:05d}-of-{len(shards):05d}.safetensors"
save_file(shard, output_path / filename)
for name in shard:
weight_map[name] = filename
print(f"Saved shard {i+1}/{len(shards)}: {filename}")
print(f"Saved shard {i + 1}/{len(shards)}: {filename}")
# Create index file
index_data = {
"metadata": {
"total_size":
sum(tensor.numel() * tensor.element_size()
for tensor in weights.values())
"total_size": sum(
tensor.numel() * tensor.element_size() for tensor in weights.values()
)
},
"weight_map": weight_map,
}
@ -528,8 +546,9 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
json.dump(index_data, f, indent=2)
print(f"Created index file: {index_path}")
print(f"Total model size: "
f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB")
print(
f"Total model size: {index_data['metadata']['total_size'] / (1024**3):.2f} GB"
)
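The index written above follows the standard `model.safetensors.index.json` layout that `transformers` and `safetensors` consumers expect; schematically (entries illustrative):

# {
#   "metadata": {"total_size": 123456789},
#   "weight_map": {
#     "language_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors",
#     "language_model.model.norm.weight": "model-00002-of-00002.safetensors"
#   }
# }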
def check_attention_spec_interleaved_rope(
@ -540,8 +559,7 @@ def check_attention_spec_interleaved_rope(
):
"""Check that the attention spec is correct."""
assert isinstance(llm.llm_engine.model_executor, Executor)
kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs(
)
kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs()
for rank in range(num_ranks):
kv_cache_specs = kv_cache_specs_per_rank[rank]
assert len(kv_cache_specs.keys()) == num_attention_layers
@ -551,16 +569,14 @@ def check_attention_spec_interleaved_rope(
else:
expected_spec = ChunkedLocalAttentionSpec
assert isinstance(
kv_cache_specs[
f"language_model.model.layers.{i}.self_attn.attn"],
expected_spec)
kv_cache_specs[f"language_model.model.layers.{i}.self_attn.attn"],
expected_spec,
)
def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
"""Test the created reduced model with vLLM."""
sampling_params = SamplingParams(temperature=0.8,
top_p=0.95,
max_tokens=50)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50)
if should_profile:
llm.start_profile()
@ -571,15 +587,15 @@ def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
print("Test generation successful!")
for output in outputs:
print(f"Prompt: {output.prompt}")
print(f"Output: "
f"{output.outputs[0].text}")
print(f"Output: {output.outputs[0].text}")
print("-" * 40)
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
"original_model_name,text_layers,num_experts,vision_layers,",
[("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)])
[("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)],
)
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("tp,ep", [(2, True)])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@ -640,7 +656,8 @@ def main():
import argparse
parser = argparse.ArgumentParser(
description="Create a reduced-layer Maverick model")
description="Create a reduced-layer Maverick model"
)
parser.add_argument(
"--output-dir",
default="/tmp/reduced_maverick",
@ -652,10 +669,7 @@ def main():
default=4,
help="Number of text transformer layers",
)
parser.add_argument("--num-experts",
type=int,
default=4,
help="Number of experts")
parser.add_argument("--num-experts", type=int, default=4, help="Number of experts")
parser.add_argument(
"--vision-layers",
type=int,
@ -667,12 +681,12 @@ def main():
action="store_true",
help="Force recreation if output directory exists",
)
parser.add_argument("--test",
action="store_true",
help="Test the created model with vLLM")
parser.add_argument("--profile",
action="store_true",
help="Profile the created model with vLLM")
parser.add_argument(
"--test", action="store_true", help="Test the created model with vLLM"
)
parser.add_argument(
"--profile", action="store_true", help="Profile the created model with vLLM"
)
parser.add_argument(
"--test-original",
action="store_true",
@ -687,16 +701,18 @@ def main():
args = parser.parse_args()
if args.test:
test_dummy_maverick(original_model_name=args.original_model,
output_dir=args.output_dir,
text_layers=args.text_layers,
num_experts=args.num_experts,
vision_layers=args.vision_layers,
force_recreate=args.force_recreate,
tp=2,
ep=True,
enforce_eager=True,
profile=args.profile)
test_dummy_maverick(
original_model_name=args.original_model,
output_dir=args.output_dir,
text_layers=args.text_layers,
num_experts=args.num_experts,
vision_layers=args.vision_layers,
force_recreate=args.force_recreate,
tp=2,
ep=True,
enforce_eager=True,
profile=args.profile,
)
if args.test_original:
run_maverick_serving(args.original_model)

View File

@ -14,26 +14,35 @@ from vllm.lora.request import LoRARequest
from vllm.multimodal.image import rescale_image_size
from vllm.platforms import current_platform
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
PromptImageInput, VllmRunner)
from ....conftest import (
IMAGE_ASSETS,
HfRunner,
PromptAudioInput,
PromptImageInput,
VllmRunner,
)
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom":
"<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
})
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom": "<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
}
)
HF_MULTIIMAGE_IMAGE_PROMPT = (
"<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
)
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct",
revision="refs/pr/70")
model_path = snapshot_download(
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
)
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
speech_question = os.path.join(model_path, "examples",
"what_is_shown_in_this_image.wav")
speech_question = os.path.join(
model_path, "examples", "what_is_shown_in_this_image.wav"
)
models = [model_path]
target_dtype = "half"
@ -48,8 +57,7 @@ if current_platform.is_rocm():
def run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
inputs: Sequence[tuple[list[str], PromptImageInput,
Optional[PromptAudioInput]]],
inputs: Sequence[tuple[list[str], PromptImageInput, Optional[PromptAudioInput]]],
model: str,
*,
max_model_len: int,
@ -75,28 +83,30 @@ def run_test(
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with vllm_runner(
model,
task="generate",
max_model_len=max_model_len,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=320,
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
enforce_eager=True,
trust_remote_code=False,
model,
task="generate",
max_model_len=max_model_len,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=320,
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
enforce_eager=True,
trust_remote_code=False,
) as vllm_model:
lora_request = LoRARequest("vision", 1, vision_lora_path)
vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
lora_request=lora_request)
vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
lora_request=lora_request,
)
for prompts, images, audios in inputs
]
@ -108,17 +118,18 @@ def run_test(
hf_processor = hf_model.processor
eos_token_id = hf_processor.tokenizer.eos_token_id
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
eos_token_id=eos_token_id)
hf_model.generate_greedy_logprobs_limit(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
eos_token_id=eos_token_id,
)
for prompts, images, audios in inputs
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
vllm_outputs_per_case):
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
@ -145,16 +156,27 @@ def run_test(
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
dtype: str, max_model_len: int, max_tokens: int,
num_logprobs: int) -> None:
def test_models(
hf_runner,
vllm_runner,
image_assets,
model,
size_factors,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
None,
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
inputs_per_image = [
(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
None,
)
for image, prompt in zip(images, HF_IMAGE_PROMPTS)
]
run_test(
hf_runner,
@ -189,16 +211,26 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
@pytest.mark.parametrize("max_model_len", [25600])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
size_factors, dtype: str, max_model_len: int,
max_tokens: int, num_logprobs: int) -> None:
def test_multi_images_models(
hf_runner,
vllm_runner,
image_assets,
model,
size_factors,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case = [
(
[HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
[[rescale_image_size(image, factor) for image in images]
for factor in size_factors],
[
[rescale_image_size(image, factor) for image in images]
for factor in size_factors
],
None,
),
]
@ -222,10 +254,15 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
max_model_len: int, max_tokens: int,
num_logprobs: int) -> None:
def test_vision_speech_models(
hf_runner,
vllm_runner,
model,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
# use the example speech question so that the model outputs are reasonable
audio = librosa.load(speech_question, sr=16000)
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
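Worth noting for the line above: `librosa.load` returns a `(waveform, sample_rate)` tuple and resamples to the requested rate, so `audio` is passed downstream as that pair.

# waveform, sample_rate = librosa.load(speech_question, sr=16000)
# assert sample_rate == 16000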

View File

@ -17,31 +17,39 @@ from vllm.lora.request import LoRARequest
from vllm.multimodal.image import convert_image_mode, rescale_image_size
from vllm.platforms import current_platform
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
PromptImageInput, VllmRunner)
from ....conftest import (
IMAGE_ASSETS,
HfRunner,
PromptAudioInput,
PromptImageInput,
VllmRunner,
)
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom":
"<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
})
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom": "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
}
)
HF_MULTIIMAGE_IMAGE_PROMPT = (
"<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
)
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
speech_question = os.path.join(model_path, "examples",
"what_is_shown_in_this_image.wav")
speech_question = os.path.join(
model_path, "examples", "what_is_shown_in_this_image.wav"
)
models = [model_path]
def vllm_to_hf_output(vllm_output: tuple[list[int], str,
Optional[SampleLogprobs]],
model: str):
def vllm_to_hf_output(
vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], model: str
):
"""Sanitize vllm output to be comparable with hf output."""
_, output_str, out_logprobs = vllm_output
@ -71,8 +79,7 @@ if current_platform.is_rocm():
def run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
inputs: Sequence[tuple[list[str], PromptImageInput,
Optional[PromptAudioInput]]],
inputs: Sequence[tuple[list[str], PromptImageInput, Optional[PromptAudioInput]]],
model: str,
*,
max_model_len: int,
@ -98,27 +105,29 @@ def run_test(
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with vllm_runner(
model,
runner="generate",
max_model_len=max_model_len,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=320,
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
enforce_eager=True,
model,
runner="generate",
max_model_len=max_model_len,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=320,
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
enforce_eager=True,
) as vllm_model:
lora_request = LoRARequest("vision", 1, vision_lora_path)
vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
lora_request=lora_request)
vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
lora_request=lora_request,
)
for prompts, images, audios in inputs
]
@ -127,42 +136,36 @@ def run_test(
pytest.skip("HF impl is not compatible with current transformers")
hf_model_kwargs = {"_attn_implementation": "sdpa"}
with hf_runner(model, dtype=dtype,
model_kwargs=hf_model_kwargs) as hf_model:
with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model:
hf_processor = hf_model.processor
eos_token_id = hf_processor.tokenizer.eos_token_id
def patch_hf_processor(*args,
text="",
images=None,
audio=None,
sampling_rate=None,
**kwargs):
def patch_hf_processor(
*args, text="", images=None, audio=None, sampling_rate=None, **kwargs
):
audios = None
if audio is not None and sampling_rate is not None:
audios = [(audio, sampling_rate)]
return hf_processor(*args,
text=text,
images=images,
audios=audios,
**kwargs)
return hf_processor(
*args, text=text, images=images, audios=audios, **kwargs
)
hf_model.processor = patch_hf_processor
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
eos_token_id=eos_token_id,
num_logits_to_keep=0)
hf_model.generate_greedy_logprobs_limit(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
eos_token_id=eos_token_id,
num_logits_to_keep=0,
)
for prompts, images, audios in inputs
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
vllm_outputs_per_case):
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
@ -189,16 +192,27 @@ def run_test(
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
dtype: str, max_model_len: int, max_tokens: int,
num_logprobs: int) -> None:
def test_models(
hf_runner,
vllm_runner,
image_assets,
model,
size_factors,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
None,
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
inputs_per_image = [
(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
None,
)
for image, prompt in zip(images, HF_IMAGE_PROMPTS)
]
run_test(
hf_runner,
@ -233,16 +247,26 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
@pytest.mark.parametrize("max_model_len", [25600])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
size_factors, dtype: str, max_model_len: int,
max_tokens: int, num_logprobs: int) -> None:
def test_multi_images_models(
hf_runner,
vllm_runner,
image_assets,
model,
size_factors,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case = [
(
[HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
[[rescale_image_size(image, factor) for image in images]
for factor in size_factors],
[
[rescale_image_size(image, factor) for image in images]
for factor in size_factors
],
None,
),
]
@ -266,10 +290,15 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
max_model_len: int, max_tokens: int,
num_logprobs: int) -> None:
def test_vision_speech_models(
hf_runner,
vllm_runner,
model,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
# use the example speech question so that the model outputs are reasonable
audio = librosa.load(speech_question, sr=None)
image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")

View File

@ -37,33 +37,33 @@ PROMPT = "Describe each image in one short sentence."
def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]:
return [{
"role":
"user",
"content": [{
"type": "text",
"text": PROMPT,
}] + [{
"type": "image_url",
"image_url": {
"url": url
}
} for url in urls],
}]
return [
{
"role": "user",
"content": [
{
"type": "text",
"text": PROMPT,
}
]
+ [{"type": "image_url", "image_url": {"url": url}} for url in urls],
}
]
def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]:
return [{
"role":
"user",
"content": [{
"type": "text",
"content": PROMPT,
}, *({
"type": "image",
"image": download_image(url)
} for url in urls)],
}]
return [
{
"role": "user",
"content": [
{
"type": "text",
"content": PROMPT,
},
*({"type": "image", "image": download_image(url)} for url in urls),
],
}
]
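A quick usage check of `_create_msg_format` above (URL illustrative):

msgs = _create_msg_format(["https://example.com/img.png"])
assert msgs[0]["role"] == "user"
assert msgs[0]["content"][0] == {"type": "text", "text": PROMPT}
assert msgs[0]["content"][1] == {
    "type": "image_url",
    "image_url": {"url": "https://example.com/img.png"},
}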
def _create_engine_inputs(urls: list[str]) -> TokensPrompt:
@ -125,11 +125,17 @@ def _dump_outputs_w_logprobs(
outputs: OutputsLogprobs,
filename: "StrPath",
) -> None:
json_data = [(tokens, text, [{
k: asdict(v)
for k, v in token_logprobs.items()
} for token_logprobs in (logprobs or [])])
for tokens, text, logprobs in outputs]
json_data = [
(
tokens,
text,
[
{k: asdict(v) for k, v in token_logprobs.items()}
for token_logprobs in (logprobs or [])
],
)
for tokens, text, logprobs in outputs
]
with open(filename, "w") as f:
json.dump(json_data, f)
@ -139,28 +145,35 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
with open(filename, "rb") as f:
json_data = json.load(f)
return [(tokens, text, [{
int(k): Logprob(**v)
for k, v in token_logprobs.items()
} for token_logprobs in logprobs]) for tokens, text, logprobs in json_data]
return [
(
tokens,
text,
[
{int(k): Logprob(**v) for k, v in token_logprobs.items()}
for token_logprobs in logprobs
],
)
for tokens, text, logprobs in json_data
]
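One subtlety in the loader above: JSON object keys are always strings, hence the `int(k)` when rebuilding the per-token maps. Schematically (field name assumes `Logprob` is a dataclass with a `logprob` attribute):

# dumped to JSON:  {"42": {"logprob": -0.5, ...}}
# loaded back as:  {42: Logprob(logprob=-0.5, ...)}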
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
local_asset_server) -> None:
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(
FIXTURE_LOGPROBS_CHAT[model])
def test_chat(
vllm_runner, max_model_len: int, model: str, dtype: str, local_asset_server
) -> None:
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
max_model_len=max_model_len,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
max_model_len=max_model_len,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
) as vllm_model:
outputs = []
@ -180,7 +193,9 @@ def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
for i in range(len(logprobs)):
assert logprobs[i][-1] is None
logprobs[i] = logprobs[i][:-1]
check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
outputs_1_lst=logprobs,
name_0="h100_ref",
name_1="output")
check_logprobs_close(
outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
outputs_1_lst=logprobs,
name_0="h100_ref",
name_1="output",
)

View File

@ -17,14 +17,15 @@ def qwen2_5_vl_chat_template(*query):
return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501
VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
"baby_reading":
qwen2_5_vl_chat_template(
VIDEO_PLACEHOLDER,
"Describe this video with a short sentence ",
"(no more than 20 words)",
),
})
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
{
"baby_reading": qwen2_5_vl_chat_template(
VIDEO_PLACEHOLDER,
"Describe this video with a short sentence ",
"(no more than 20 words)",
),
}
)
@pytest.mark.core_model
@ -33,10 +34,15 @@ VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
video_pruning_rate: float,
num_frames: int, dtype: str,
max_tokens: int) -> None:
def test_qwen2_5_vl_evs_functionality(
vllm_runner,
video_assets,
model,
video_pruning_rate: float,
num_frames: int,
dtype: str,
max_tokens: int,
) -> None:
"""Test EVS (Efficient Video Sampling) functionality with different
pruning rates.
"""
@ -51,19 +57,18 @@ def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
videos = [sampled_vids[0]]
# Initialize model with EVS configuration
with vllm_runner(model,
runner="generate",
max_model_len=4000,
max_num_seqs=1,
dtype=dtype,
limit_mm_per_prompt={"video": 1},
tensor_parallel_size=1,
video_pruning_rate=video_pruning_rate) as vllm_model:
with vllm_runner(
model,
runner="generate",
max_model_len=4000,
max_num_seqs=1,
dtype=dtype,
limit_mm_per_prompt={"video": 1},
tensor_parallel_size=1,
video_pruning_rate=video_pruning_rate,
) as vllm_model:
# Generate output - this should not crash
outputs = vllm_model.generate_greedy(prompts,
max_tokens,
videos=videos)
outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
# Basic validation that we got a response
assert len(outputs) == 1
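Assumed semantics of `video_pruning_rate` (not confirmed here): the fraction of video tokens EVS drops, so a hypothetical rate of 0.75 keeps only about a quarter of the video tokens, which is consistent with the test validating that generation merely succeeds rather than matching exact outputs.

retained_fraction = 1.0 - 0.75  # hypothetical pruning rate
assert retained_fraction == 0.25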
@ -83,10 +88,15 @@ def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
def test_qwen2_5_vl_evs_batched_videos(vllm_runner, video_assets, model,
video_pruning_rate: float,
num_frames: int, dtype: str,
max_tokens: int) -> None:
def test_qwen2_5_vl_evs_batched_videos(
vllm_runner,
video_assets,
model,
video_pruning_rate: float,
num_frames: int,
dtype: str,
max_tokens: int,
) -> None:
"""Test EVS functionality with batched videos.
This test validates that:
@ -102,23 +112,21 @@ def test_qwen2_5_vl_evs_batched_videos(vllm_runner, video_assets, model,
# Test batched videos
prompts = [VIDEO_PROMPTS[0], VIDEO_PROMPTS[0]]
videos = [sampled_vids[0],
sampled_vids[0]] # Use same video twice for testing
videos = [sampled_vids[0], sampled_vids[0]] # Use same video twice for testing
# Initialize model with EVS configuration
with vllm_runner(model,
runner="generate",
max_model_len=4000,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"video": 2},
tensor_parallel_size=1,
video_pruning_rate=video_pruning_rate) as vllm_model:
with vllm_runner(
model,
runner="generate",
max_model_len=4000,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"video": 2},
tensor_parallel_size=1,
video_pruning_rate=video_pruning_rate,
) as vllm_model:
# Generate output - this should not crash
outputs = vllm_model.generate_greedy(prompts,
max_tokens,
videos=videos)
outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
# Basic validation that we got responses for both videos
assert len(outputs) == 2

View File

@ -11,8 +11,13 @@ from PIL import Image
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
PromptVideoInput, VllmRunner)
from ....conftest import (
IMAGE_ASSETS,
VIDEO_ASSETS,
PromptImageInput,
PromptVideoInput,
VllmRunner,
)
from ...utils import check_logprobs_close
@ -34,28 +39,29 @@ def qwen2_vl_chat_template(*query):
return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501
IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
"What is the biggest text's content in this image?",
),
"cherry_blossom":
qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
"What is the season shown in this image? ",
"Reply with a short sentence (no more than 20 words)",
),
})
IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
"What is the biggest text's content in this image?",
),
"cherry_blossom": qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
"What is the season shown in this image? ",
"Reply with a short sentence (no more than 20 words)",
),
}
)
VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
"baby_reading":
qwen2_vl_chat_template(
VIDEO_PLACEHOLDER,
"Describe this video with a short sentence ",
"(no more than 20 words)",
),
})
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
{
"baby_reading": qwen2_vl_chat_template(
VIDEO_PLACEHOLDER,
"Describe this video with a short sentence ",
"(no more than 20 words)",
),
}
)
MULTIIMAGE_PROMPT = qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
@ -77,17 +83,19 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
def batch_make_image_embeddings(
image_batches: list[Union[Image.Image, list[Image.Image]]], processor,
llm: VllmRunner) -> list[Qwen2VLPromptImageEmbeddingInput]:
image_batches: list[Union[Image.Image, list[Image.Image]]],
processor,
llm: VllmRunner,
) -> list[Qwen2VLPromptImageEmbeddingInput]:
"""batched image embeddings for Qwen2-VL
This will infer all images' embeddings in a single batch,
and split the result according to input batches.
image_batches:
- Single-image batches: `list[Image.Image]`
- Multiple-image batches: `list[list[Image.Image]]`
returns: `list[Qwen2VLPromptImageEmbeddingInput]`
"""
@ -108,9 +116,9 @@ def batch_make_image_embeddings(
# image to pixel values
image_processor = processor.image_processor
preprocess_result = image_processor \
.preprocess(images=images, return_tensors="pt") \
.data
preprocess_result = image_processor.preprocess(
images=images, return_tensors="pt"
).data
pixel_values = preprocess_result["pixel_values"]
image_grid_thw = preprocess_result["image_grid_thw"]
@ -119,12 +127,13 @@ def batch_make_image_embeddings(
with torch.no_grad():
visual = model.visual
pixel_values_on_device = pixel_values.to(visual.device,
dtype=visual.dtype)
image_grid_thw_on_device = image_grid_thw.to(visual.device,
dtype=torch.int64)
return visual(pixel_values_on_device,
grid_thw=image_grid_thw_on_device).cpu()
pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
image_grid_thw_on_device = image_grid_thw.to(
visual.device, dtype=torch.int64
)
return visual(
pixel_values_on_device, grid_thw=image_grid_thw_on_device
).cpu()
image_embeds = torch.concat(llm.apply_model(get_image_embeds))
@ -137,16 +146,21 @@ def batch_make_image_embeddings(
merge_size = image_processor.merge_size
cur_batch_embed_len = sum(
grid_thw.prod(-1) // merge_size // merge_size
for grid_thw in image_grid_thw[image_counter:image_counter +
cur_batch_image_count])
for grid_thw in image_grid_thw[
image_counter : image_counter + cur_batch_image_count
]
)
result.append({
"image_embeds":
image_embeds[embed_counter:embed_counter + cur_batch_embed_len],
"image_grid_thw":
image_grid_thw[image_counter:image_counter +
cur_batch_image_count],
})
result.append(
{
"image_embeds": image_embeds[
embed_counter : embed_counter + cur_batch_embed_len
],
"image_grid_thw": image_grid_thw[
image_counter : image_counter + cur_batch_image_count
],
}
)
embed_counter += cur_batch_embed_len
image_counter += cur_batch_image_count
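The slice bookkeeping above works because each image contributes exactly prod(grid_thw) // merge_size**2 rows to image_embeds; the video variant later in this file uses the same arithmetic. A small numeric sketch (grid values are illustrative, not from a real asset):

import torch

image_grid_thw = torch.tensor([[1, 28, 28], [1, 56, 28]])  # (t, h, w) per image
merge_size = 2  # Qwen2-VL merges 2x2 patches into one embedding token

lengths = [int(g.prod(-1)) // merge_size // merge_size for g in image_grid_thw]
assert lengths == [196, 392]  # embedding rows consumed per image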
@ -160,13 +174,13 @@ def batch_make_image_embeddings(
def batch_make_video_embeddings(
video_batches: PromptVideoInput, processor,
llm: VllmRunner) -> list[Qwen2VLPromptVideoEmbeddingInput]:
video_batches: PromptVideoInput, processor, llm: VllmRunner
) -> list[Qwen2VLPromptVideoEmbeddingInput]:
"""batched video embeddings for Qwen2-VL
An NDArray represents all frames of a single video.
This will infer all videos' embeddings in a single batch,
and split the result according to input batches.
video_batches:
@ -191,9 +205,9 @@ def batch_make_video_embeddings(
# video to pixel values
image_processor = processor.image_processor
preprocess_result = image_processor \
.preprocess(images=None, videos=videos, return_tensors="pt") \
.data
preprocess_result = image_processor.preprocess(
images=None, videos=videos, return_tensors="pt"
).data
pixel_values = preprocess_result["pixel_values_videos"]
video_grid_thw = preprocess_result["video_grid_thw"]
@ -202,12 +216,13 @@ def batch_make_video_embeddings(
with torch.no_grad():
visual = model.visual
pixel_values_on_device = pixel_values.to(visual.device,
dtype=visual.dtype)
video_grid_thw_on_device = video_grid_thw.to(visual.device,
dtype=torch.int64)
return visual(pixel_values_on_device,
grid_thw=video_grid_thw_on_device).cpu()
pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
video_grid_thw_on_device = video_grid_thw.to(
visual.device, dtype=torch.int64
)
return visual(
pixel_values_on_device, grid_thw=video_grid_thw_on_device
).cpu()
video_embeds = torch.concat(llm.apply_model(get_image_embeds))
@ -220,16 +235,21 @@ def batch_make_video_embeddings(
merge_size = image_processor.merge_size
cur_batch_embed_len = sum(
grid_thw.prod(-1) // merge_size // merge_size
for grid_thw in video_grid_thw[video_counter:video_counter +
cur_batch_video_count])
for grid_thw in video_grid_thw[
video_counter : video_counter + cur_batch_video_count
]
)
result.append({
"video_embeds":
video_embeds[embed_counter:embed_counter + cur_batch_embed_len],
"video_grid_thw":
video_grid_thw[video_counter:video_counter +
cur_batch_video_count],
})
result.append(
{
"video_embeds": video_embeds[
embed_counter : embed_counter + cur_batch_embed_len
],
"video_grid_thw": video_grid_thw[
video_counter : video_counter + cur_batch_video_count
],
}
)
embed_counter += cur_batch_embed_len
video_counter += cur_batch_video_count
@ -263,25 +283,24 @@ def run_embedding_input_test(
# max_model_len should be greater than image_feature_size
with vllm_runner(
model,
runner="generate",
max_model_len=4000,
max_num_seqs=3,
dtype=dtype,
limit_mm_per_prompt={
"image": mm_limit,
"video": mm_limit
},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
default_torch_num_threads=1,
model,
runner="generate",
max_model_len=4000,
max_num_seqs=3,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit, "video": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
default_torch_num_threads=1,
) as vllm_model:
outputs_per_case_for_original_input = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images or None,
videos=videos or None)
vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images or None,
videos=videos or None,
)
for prompts, images, videos in inputs
]
@ -290,17 +309,19 @@ def run_embedding_input_test(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=batch_make_image_embeddings(
images, processor, vllm_model) if images else None,
videos=batch_make_video_embeddings(
videos, processor, vllm_model) if videos else None)
images=batch_make_image_embeddings(images, processor, vllm_model)
if images
else None,
videos=batch_make_video_embeddings(videos, processor, vllm_model)
if videos
else None,
)
for prompts, images, videos in inputs
]
for outputs_for_original_input, \
outputs_for_embeddings_input \
in zip(outputs_per_case_for_original_input,
outputs_per_case_for_embeddings_input):
for outputs_for_original_input, outputs_for_embeddings_input in zip(
outputs_per_case_for_original_input, outputs_per_case_for_embeddings_input
):
check_logprobs_close(
outputs_0_lst=outputs_for_original_input,
outputs_1_lst=outputs_for_embeddings_input,
@ -325,17 +346,26 @@ def run_embedding_input_test(
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
size_factors, dtype, max_tokens,
num_logprobs, monkeypatch) -> None:
def test_qwen2_vl_image_embeddings_input(
vllm_runner,
image_assets,
model,
size_factors,
dtype,
max_tokens,
num_logprobs,
monkeypatch,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case: list[tuple[
list[str], PromptImageInput, PromptVideoInput]] = [(
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
[],
) for image, prompt in zip(images, IMAGE_PROMPTS)]
)
for image, prompt in zip(images, IMAGE_PROMPTS)
]
run_embedding_input_test(
vllm_runner,
@ -366,21 +396,27 @@ def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
model, size_factors,
dtype: str, max_tokens: int,
num_logprobs: int) -> None:
def test_qwen2_vl_multiple_image_embeddings_input(
vllm_runner,
image_assets,
model,
size_factors,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case: list[tuple[list[str], PromptImageInput,
PromptVideoInput]] = [(
[MULTIIMAGE_PROMPT for _ in size_factors],
[[
rescale_image_size(image, factor)
for image in images
] for factor in size_factors],
[],
)]
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
(
[MULTIIMAGE_PROMPT for _ in size_factors],
[
[rescale_image_size(image, factor) for image in images]
for factor in size_factors
],
[],
)
]
run_embedding_input_test(
vllm_runner,
@ -410,22 +446,29 @@ def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model,
size_factors, dtype: str,
max_tokens: int,
num_logprobs: int) -> None:
def test_qwen2_vl_video_embeddings_input(
vllm_runner,
video_assets,
model,
size_factors,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
num_frames = 4
sampled_vids = [
sample_frames_from_video(asset.np_ndarrays, num_frames)
for asset in video_assets
]
inputs_per_case: list[tuple[
list[str], PromptImageInput, PromptVideoInput]] = [(
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
(
[prompt for _ in size_factors],
[],
[rescale_video_size(video, factor) for factor in size_factors],
) for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)]
)
for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)
]
run_embedding_input_test(
vllm_runner,

View File

@ -15,12 +15,12 @@ from ...registry import HF_EXAMPLE_MODELS
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
AUDIO_PROMPTS = AUDIO_ASSETS.prompts({
"mary_had_lamb":
"Transcribe this into English.",
"winning_call":
"What is happening in this audio clip?",
})
AUDIO_PROMPTS = AUDIO_ASSETS.prompts(
{
"mary_had_lamb": "Transcribe this into English.",
"winning_call": "What is happening in this audio clip?",
}
)
MULTI_AUDIO_PROMPT = "Describe each of the audios above."
@ -33,7 +33,7 @@ CHUNKED_PREFILL_KWARGS = {
"enable_chunked_prefill": True,
"max_num_seqs": 2,
# Use a very small limit to exercise chunked prefill.
"max_num_batched_tokens": 16
"max_num_batched_tokens": 16,
}
@ -43,27 +43,33 @@ def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
for key, value in params_kwargs.items():
if isinstance(value, bool):
if value:
args.append(f"--{key.replace('_','-')}")
args.append(f"--{key.replace('_', '-')}")
else:
args.append(f"--{key.replace('_','-')}={value}")
args.append(f"--{key.replace('_', '-')}={value}")
return args
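For concreteness, this is what the helper yields for the CHUNKED_PREFILL_KWARGS defined above (the helper body is repeated so the snippet runs on its own):

def params_kwargs_to_cli_args(params_kwargs: dict) -> list[str]:
    # Mirror of the helper above: true bools become bare flags, the rest key=value.
    args = []
    for key, value in params_kwargs.items():
        if isinstance(value, bool):
            if value:
                args.append(f"--{key.replace('_', '-')}")
        else:
            args.append(f"--{key.replace('_', '-')}={value}")
    return args

kwargs = {"enable_chunked_prefill": True, "max_num_seqs": 2, "max_num_batched_tokens": 16}
assert params_kwargs_to_cli_args(kwargs) == [
    "--enable-chunked-prefill",
    "--max-num-seqs=2",
    "--max-num-batched-tokens=16",
]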
@pytest.fixture(params=[
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
])
@pytest.fixture(
params=[
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
]
)
def server(request, audio_assets: AudioTestAssets):
args = [
"--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager",
"--dtype",
"bfloat16",
"--max-model-len",
"4096",
"--enforce-eager",
"--limit-mm-per-prompt",
json.dumps({"audio": len(audio_assets)}), "--trust-remote-code"
json.dumps({"audio": len(audio_assets)}),
"--trust-remote-code",
] + params_kwargs_to_cli_args(request.param)
with RemoteOpenAIServer(MODEL_NAME,
args,
env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
"30"}) as remote_server:
with RemoteOpenAIServer(
MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
) as remote_server:
yield remote_server
@ -77,12 +83,11 @@ def _get_prompt(audio_count, question, placeholder):
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
placeholder = f"{placeholder}\n" * audio_count
return tokenizer.apply_chat_template([{
'role': 'user',
'content': f"{placeholder}{question}"
}],
tokenize=False,
add_generation_prompt=True)
return tokenizer.apply_chat_template(
[{"role": "user", "content": f"{placeholder}{question}"}],
tokenize=False,
add_generation_prompt=True,
)
def run_multi_audio_test(
@ -99,19 +104,21 @@ def run_multi_audio_test(
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
with vllm_runner(model,
dtype=dtype,
enforce_eager=True,
limit_mm_per_prompt={
"audio":
max((len(audio) for _, audio in prompts_and_audios))
},
**kwargs) as vllm_model:
with vllm_runner(
model,
dtype=dtype,
enforce_eager=True,
limit_mm_per_prompt={
"audio": max((len(audio) for _, audio in prompts_and_audios))
},
**kwargs,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
[prompt for prompt, _ in prompts_and_audios],
max_tokens,
num_logprobs=num_logprobs,
audios=[audios for _, audios in prompts_and_audios])
audios=[audios for _, audios in prompts_and_audios],
)
# The HuggingFace model doesn't support multiple audios yet, so
# just assert that some tokens were generated.
@ -122,21 +129,25 @@ def run_multi_audio_test(
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("vllm_kwargs", [
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
])
def test_models_with_multiple_audios(vllm_runner,
audio_assets: AudioTestAssets, dtype: str,
max_tokens: int, num_logprobs: int,
vllm_kwargs: dict) -> None:
vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT,
VLLM_PLACEHOLDER)
@pytest.mark.parametrize(
"vllm_kwargs",
[
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
],
)
def test_models_with_multiple_audios(
vllm_runner,
audio_assets: AudioTestAssets,
dtype: str,
max_tokens: int,
num_logprobs: int,
vllm_kwargs: dict,
) -> None:
vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT, VLLM_PLACEHOLDER)
run_multi_audio_test(
vllm_runner,
[(vllm_prompt, [audio.audio_and_sample_rate
for audio in audio_assets])],
[(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
MODEL_NAME,
dtype=dtype,
max_tokens=max_tokens,
@ -149,28 +160,25 @@ def test_models_with_multiple_audios(vllm_runner,
async def test_online_serving(client, audio_assets: AudioTestAssets):
"""Exercises online serving with/without chunked prefill enabled."""
messages = [{
"role":
"user",
"content": [
*[{
"type": "audio_url",
"audio_url": {
"url": audio.url
}
} for audio in audio_assets],
{
"type":
"text",
"text":
f"What's happening in these {len(audio_assets)} audio clips?"
},
],
}]
messages = [
{
"role": "user",
"content": [
*[
{"type": "audio_url", "audio_url": {"url": audio.url}}
for audio in audio_assets
],
{
"type": "text",
"text": f"What's happening in these {len(audio_assets)} audio clips?",
},
],
}
]
chat_completion = await client.chat.completions.create(model=MODEL_NAME,
messages=messages,
max_tokens=10)
chat_completion = await client.chat.completions.create(
model=MODEL_NAME, messages=messages, max_tokens=10
)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]

View File

@ -6,8 +6,12 @@ import json
import pytest
import pytest_asyncio
from mistral_common.audio import Audio
from mistral_common.protocol.instruct.messages import (AudioChunk, RawAudio,
TextChunk, UserMessage)
from mistral_common.protocol.instruct.messages import (
AudioChunk,
RawAudio,
TextChunk,
UserMessage,
)
from vllm.transformers_utils.tokenizer import MistralTokenizer
@ -17,8 +21,12 @@ from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test
MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"
MISTRAL_FORMAT_ARGS = [
"--tokenizer_mode", "mistral", "--config_format", "mistral",
"--load_format", "mistral"
"--tokenizer_mode",
"mistral",
"--config_format",
"mistral",
"--load_format",
"mistral",
]
@ -30,10 +38,9 @@ def server(request, audio_assets: AudioTestAssets):
json.dumps({"audio": len(audio_assets)}),
] + MISTRAL_FORMAT_ARGS
with RemoteOpenAIServer(MODEL_NAME,
args,
env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
"30"}) as remote_server:
with RemoteOpenAIServer(
MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
) as remote_server:
yield remote_server
@ -64,15 +71,17 @@ def _get_prompt(audio_assets, question):
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_with_multiple_audios(vllm_runner,
audio_assets: AudioTestAssets, dtype: str,
max_tokens: int,
num_logprobs: int) -> None:
def test_models_with_multiple_audios(
vllm_runner,
audio_assets: AudioTestAssets,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
run_multi_audio_test(
vllm_runner,
[(vllm_prompt, [audio.audio_and_sample_rate
for audio in audio_assets])],
[(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
MODEL_NAME,
dtype=dtype,
max_tokens=max_tokens,
@ -92,23 +101,22 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
return audio_dict
audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
messages = [{
"role":
"user",
"content": [
*audio_chunks,
{
"type":
"text",
"text":
f"What's happening in these {len(audio_assets)} audio clips?"
},
],
}]
messages = [
{
"role": "user",
"content": [
*audio_chunks,
{
"type": "text",
"text": f"What's happening in these {len(audio_assets)} audio clips?",
},
],
}
]
chat_completion = await client.chat.completions.create(model=MODEL_NAME,
messages=messages,
max_tokens=10)
chat_completion = await client.chat.completions.create(
model=MODEL_NAME, messages=messages, max_tokens=10
)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]

View File

@ -12,8 +12,7 @@ from ....utils import create_new_process_for_each_test, multi_gpu_test
PROMPTS = [
{
"prompt":
"<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
"prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
"multi_modal_data": {
"audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
},
@ -25,9 +24,8 @@ PROMPTS = [
"audio": AudioAsset("winning_call").audio_and_sample_rate,
},
},
"decoder_prompt":
"<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
}
"decoder_prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
},
]
EXPECTED = {
@ -41,7 +39,7 @@ EXPECTED = {
" is June and the third base. They're going to wave him in. The throw"
" to the plate will be late. The Mariners are going to play for the"
" American League Championship. I don't believe it. It just continues"
" by all five."
" by all five.",
],
"openai/whisper-small": [
" The first words I spoke in the original pornograph. A little piece"
@ -51,7 +49,7 @@ EXPECTED = {
" comes joy. Here is Junior to third base. They're gonna wave him"
" in. The throw to the plate will be late. The Mariners are going to"
" play for the American League Championship. I don't believe it. It"
" just continues. My, oh my."
" just continues. My, oh my.",
],
"openai/whisper-medium": [
" The first words I spoke in the original phonograph, a little piece"
@ -62,7 +60,7 @@ EXPECTED = {
" Jorgen at third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh"
" my."
" my.",
],
"openai/whisper-large-v3": [
" The first words I spoke in the original phonograph, a little piece"
@ -73,7 +71,7 @@ EXPECTED = {
" Junior to third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh,"
" my."
" my.",
],
"openai/whisper-large-v3-turbo": [
" The first words I spoke in the original phonograph, a little piece"
@ -84,8 +82,8 @@ EXPECTED = {
" Junior to third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh,"
" my."
]
" my.",
],
}
@ -100,11 +98,11 @@ def run_test(
expected_list = EXPECTED[model] * 10
with vllm_runner(
model,
dtype="half",
max_model_len=448,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
model,
dtype="half",
max_model_len=448,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
llm = vllm_model.llm

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Helpers for building inputs that can be leveraged for different test types.
"""
"""Helpers for building inputs that can be leveraged for different test types."""
from collections.abc import Iterable
from pathlib import PosixPath
from typing import Callable, Optional, Union
@ -10,20 +10,30 @@ import torch
from vllm.multimodal.audio import AudioResampler
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (rescale_video_size, resize_video,
sample_frames_from_video)
from vllm.multimodal.video import (
rescale_video_size,
resize_video,
sample_frames_from_video,
)
from .....conftest import AudioTestAssets, ImageTestAssets, VideoTestAssets
from .types import (SINGLE_AUDIO_BASE_PROMPT, SINGLE_IMAGE_BASE_PROMPTS,
TEST_AUDIO_PLACEHOLDER, TEST_IMG_PLACEHOLDER,
TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT,
ImageSizeWrapper, PromptWithMultiModalInput, SizeType,
VLMTestInfo)
from .types import (
SINGLE_AUDIO_BASE_PROMPT,
SINGLE_IMAGE_BASE_PROMPTS,
TEST_AUDIO_PLACEHOLDER,
TEST_IMG_PLACEHOLDER,
TEST_VIDEO_PLACEHOLDER,
VIDEO_BASE_PROMPT,
ImageSizeWrapper,
PromptWithMultiModalInput,
SizeType,
VLMTestInfo,
)
def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int],
str],
test_placeholder: str) -> str:
def replace_test_placeholder(
prompt: str, mm_idx_to_prompt: Callable[[int], str], test_placeholder: str
) -> str:
"""Given a prompt, replaces each test placeholder with the
model-specific tag.
"""
@ -35,11 +45,13 @@ def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int],
return img_prompt
def get_model_prompts(base_prompts: Iterable[str],
img_idx_to_prompt: Optional[Callable[[int], str]],
video_idx_to_prompt: Optional[Callable[[int], str]],
audio_idx_to_prompt: Optional[Callable[[int], str]],
prompt_formatter: Callable[[str], str]) -> list[str]:
def get_model_prompts(
base_prompts: Iterable[str],
img_idx_to_prompt: Optional[Callable[[int], str]],
video_idx_to_prompt: Optional[Callable[[int], str]],
audio_idx_to_prompt: Optional[Callable[[int], str]],
prompt_formatter: Callable[[str], str],
) -> list[str]:
"""Given a model-agnostic base prompt and test configuration for a model(s)
to be tested, update the media placeholders and apply the prompt formatting
to get the test prompt string for this model.
@ -56,19 +68,19 @@ def get_model_prompts(base_prompts: Iterable[str],
# Replace the multimodal placeholders in the base prompt with
# the correct ones for the model that we are testing
if img_idx_to_prompt:
base_prompt = replace_test_placeholder(base_prompt,
img_idx_to_prompt,
TEST_IMG_PLACEHOLDER)
base_prompt = replace_test_placeholder(
base_prompt, img_idx_to_prompt, TEST_IMG_PLACEHOLDER
)
if video_idx_to_prompt:
base_prompt = replace_test_placeholder(base_prompt,
video_idx_to_prompt,
TEST_VIDEO_PLACEHOLDER)
base_prompt = replace_test_placeholder(
base_prompt, video_idx_to_prompt, TEST_VIDEO_PLACEHOLDER
)
if audio_idx_to_prompt:
base_prompt = replace_test_placeholder(base_prompt,
audio_idx_to_prompt,
TEST_AUDIO_PLACEHOLDER)
base_prompt = replace_test_placeholder(
base_prompt, audio_idx_to_prompt, TEST_AUDIO_PLACEHOLDER
)
# Apply the prompt formatter to wrap the base prompt with
# the correct media placeholders to get the model test prompt
@ -84,14 +96,15 @@ def build_single_image_inputs_from_test_info(
tmp_path: Optional[PosixPath] = None,
) -> list[PromptWithMultiModalInput]:
if test_info.prompt_formatter is None:
raise ValueError(
"Prompt formatter must be set to build single image inputs")
raise ValueError("Prompt formatter must be set to build single image inputs")
model_prompts = get_model_prompts(test_info.single_image_prompts,
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter)
model_prompts = get_model_prompts(
test_info.single_image_prompts,
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
# For models that require a local path / URL encoded in the image; export
# assets and encode into tmp_path for this test. This should be avoided
@ -110,8 +123,8 @@ def build_single_image_inputs_from_test_info(
def build_single_image_inputs(
images, model_prompts,
size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
images, model_prompts, size_wrapper: ImageSizeWrapper
) -> list[PromptWithMultiModalInput]:
# For every image / prompt pair, get a pair containing two lists of
# length size_factors, where the first contains duplicates of the model
# prompt [str], and the second contains copies of the image after being
@ -125,7 +138,8 @@ def build_single_image_inputs(
apply_image_size_scaling(image, size, size_wrapper.type)
for size in size_wrapper.data
],
) for image, prompt in zip(images, model_prompts)
)
for image, prompt in zip(images, model_prompts)
]
@ -136,14 +150,15 @@ def build_multi_image_inputs_from_test_info(
tmp_path: Optional[PosixPath] = None,
) -> list[PromptWithMultiModalInput]:
if test_info.prompt_formatter is None:
raise ValueError(
"Prompt formatter must be set to build multi image inputs")
raise ValueError("Prompt formatter must be set to build multi image inputs")
model_prompts = get_model_prompts([test_info.multi_image_prompt],
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter)
model_prompts = get_model_prompts(
[test_info.multi_image_prompt],
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
if test_info.prompt_path_encoder is not None:
if tmp_path is None:
@ -164,16 +179,20 @@ def build_multi_image_inputs_from_test_info(
def build_multi_image_inputs(
image_lists, model_prompts,
size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
image_lists, model_prompts, size_wrapper: ImageSizeWrapper
) -> list[PromptWithMultiModalInput]:
return [
PromptWithMultiModalInput(
prompts=[prompt for _ in size_wrapper.data],
image_data=[[
apply_image_size_scaling(image, size, size_wrapper.type)
for image in images
] for size in size_wrapper.data],
) for images, prompt in zip(image_lists, model_prompts)
image_data=[
[
apply_image_size_scaling(image, size, size_wrapper.type)
for image in images
]
for size in size_wrapper.data
],
)
for images, prompt in zip(image_lists, model_prompts)
]
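Shape-wise, the comprehension above yields one input per (images, prompt) pair, with the prompt duplicated once per size and every image scaled at every size. A shape-only sketch with the scaling swapped for a tuple (toy names, not the real builder):

def toy_multi_image_inputs(image_lists, model_prompts, sizes):
    # Mirrors only the nesting of build_multi_image_inputs, not the scaling.
    return [
        {
            "prompts": [prompt for _ in sizes],
            "image_data": [[(image, size) for image in images] for size in sizes],
        }
        for images, prompt in zip(image_lists, model_prompts)
    ]

out = toy_multi_image_inputs([["img_a", "img_b"]], ["prompt"], [0.5, 1.0])
assert len(out[0]["prompts"]) == 2        # one prompt copy per size
assert len(out[0]["image_data"][0]) == 2  # every image present at each size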
@ -185,10 +204,10 @@ def build_embedding_inputs_from_test_info(
# These conditions will always be true if invoked through filtering,
# but we still check them in case this is ever called directly
if test_info.prompt_formatter is None:
raise ValueError(
"Prompt formatter must be set to build image embedding inputs")
if size_wrapper.type != SizeType.SIZE_FACTOR or not \
all(factor == 1.0 for factor in size_wrapper.data):
raise ValueError("Prompt formatter must be set to build image embedding inputs")
if size_wrapper.type != SizeType.SIZE_FACTOR or not all(
factor == 1.0 for factor in size_wrapper.data
):
raise ValueError("Embedding tests require constant (1.0) size factors")
if test_info.convert_assets_to_embeddings is None:
raise ValueError("No conversion func for getting embeddings found")
@ -209,8 +228,7 @@ def build_embedding_inputs_from_test_info(
assert len(images) == len(model_prompts)
inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
vllm_embeddings = build_single_image_inputs(embeds, model_prompts,
size_wrapper)
vllm_embeddings = build_single_image_inputs(embeds, model_prompts, size_wrapper)
return inputs, vllm_embeddings
@ -235,21 +253,22 @@ def build_video_inputs_from_test_info(
for asset in video_assets
]
video_scaler = (resize_video if size_wrapper.type == SizeType.FIXED_SIZE
else rescale_video_size)
video_scaler = (
resize_video if size_wrapper.type == SizeType.FIXED_SIZE else rescale_video_size
)
return [
PromptWithMultiModalInput(
prompts=[prompt for _ in size_wrapper.data],
video_data=[
video_scaler(video, size) for size in size_wrapper.data
],
) for video, prompt in zip(sampled_vids, model_prompts)
video_data=[video_scaler(video, size) for size in size_wrapper.data],
)
for video, prompt in zip(sampled_vids, model_prompts)
]
def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
size_type: SizeType):
def apply_image_size_scaling(
image, size: Union[float, tuple[int, int]], size_type: SizeType
):
"""Applies a size scaler to one image; this can be an image size factor,
which scales the image while maintaining the aspect ratio"""
# Special case for embeddings; if it's a tensor, it's only valid if we
@ -285,13 +304,16 @@ def build_audio_inputs_from_test_info(
method="librosa",
)
audios = [asset.audio_and_sample_rate for asset in audio_assets]
resampled_audios = [(
resampler.resample(
audio,
orig_sr=sr,
),
int(resampler.target_sr),
) for audio, sr in audios]
resampled_audios = [
(
resampler.resample(
audio,
orig_sr=sr,
),
int(resampler.target_sr),
)
for audio, sr in audios
]
return [
PromptWithMultiModalInput(

View File

@ -4,19 +4,28 @@
modality, getting all combinations (similar to pytest's parametrization),
handling multimodal placeholder substitution, and so on.
"""
import itertools
from collections import OrderedDict
from collections.abc import Iterable
import pytest
from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
from .types import (
EMBEDDING_SIZE_FACTORS,
ExpandableVLMTestArgs,
ImageSizeWrapper,
SizeType,
VLMTestInfo,
VLMTestType,
)
def get_filtered_test_settings(
test_settings: dict[str, VLMTestInfo], test_type: VLMTestType,
new_proc_per_test: bool) -> dict[str, VLMTestInfo]:
test_settings: dict[str, VLMTestInfo],
test_type: VLMTestType,
new_proc_per_test: bool,
) -> dict[str, VLMTestInfo]:
"""Given the dict of potential test settings to run, return a subdict
of tests that have the current test type enabled and a matching value for
fork_per_test.
@ -25,7 +34,8 @@ def get_filtered_test_settings(
def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType):
return test_info.test_type == test_type or (
isinstance(test_info.test_type, Iterable)
and test_type in test_info.test_type)
and test_type in test_info.test_type
)
matching_tests = {}
for test_name, test_info in test_settings.items():
@ -36,62 +46,69 @@ def get_filtered_test_settings(
assert test_info.convert_assets_to_embeddings is not None
# Custom test inputs need to explicitly define the mm limit/inputs
if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS):
assert (test_info.custom_test_opts is not None
and isinstance(test_info.custom_test_opts, Iterable))
assert test_info.custom_test_opts is not None and isinstance(
test_info.custom_test_opts, Iterable
)
# For all types besides custom inputs, we need a prompt formatter
else:
assert test_info.prompt_formatter is not None
# Everything looks okay; keep if this is correct proc handling
if (test_info.distributed_executor_backend
is not None) == new_proc_per_test:
if (
test_info.distributed_executor_backend is not None
) == new_proc_per_test:
matching_tests[test_name] = test_info
return matching_tests
def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
test_type: VLMTestType,
create_new_process_for_each_test: bool):
def get_parametrized_options(
test_settings: dict[str, VLMTestInfo],
test_type: VLMTestType,
create_new_process_for_each_test: bool,
):
"""Converts all of our VLMTestInfo into an expanded list of parameters.
This is similar to nesting pytest parametrize calls, but done directly
through an itertools product so that each test can set things like
size factors etc., while still running in isolated test cases.
"""
matching_tests = get_filtered_test_settings(
test_settings, test_type, create_new_process_for_each_test)
test_settings, test_type, create_new_process_for_each_test
)
# Ensure that something is wrapped as an iterable if it's not already
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,)
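ensure_wrapped normalizes scalar settings into singleton tuples so the parameter expansion below can treat every field uniformly; a quick check of its behavior:

ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,)

assert ensure_wrapped("half") == ("half",)                     # scalar -> singleton
assert ensure_wrapped(["half", "float"]) == ["half", "float"]  # iterable unchanged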
def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
# This is essentially the same as nesting a bunch of mark.parametrize
# decorators, but we do it programmatically to allow overrides on
# a per-model basis, while still being able to execute each of these
# as individual test cases in pytest.
iter_kwargs = OrderedDict([
("model", ensure_wrapped(test_info.models)),
("max_tokens", ensure_wrapped(test_info.max_tokens)),
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
("dtype", ensure_wrapped(test_info.dtype)),
("distributed_executor_backend",
ensure_wrapped(test_info.distributed_executor_backend)),
])
iter_kwargs = OrderedDict(
[
("model", ensure_wrapped(test_info.models)),
("max_tokens", ensure_wrapped(test_info.max_tokens)),
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
("dtype", ensure_wrapped(test_info.dtype)),
(
"distributed_executor_backend",
ensure_wrapped(test_info.distributed_executor_backend),
),
]
)
# num_frames is video only
if test_type == VLMTestType.VIDEO:
iter_kwargs["num_video_frames"] = ensure_wrapped(
test_info.num_video_frames)
iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
# No sizes passed for custom inputs, since inputs are directly provided
if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO):
wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
if wrapped_sizes is None:
raise ValueError(
f"Sizes must be set for test type {test_type}")
raise ValueError(f"Sizes must be set for test type {test_type}")
iter_kwargs["size_wrapper"] = wrapped_sizes
#Otherwise expand the custom test options instead
# Otherwise expand the custom test options instead
elif test_type == VLMTestType.CUSTOM_INPUTS:
if test_info.custom_test_opts is None:
raise ValueError("Test has type CUSTOM_INPUTS, but none given")
@ -121,8 +138,8 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
def get_wrapped_test_sizes(
test_info: VLMTestInfo,
test_type: VLMTestType) -> tuple[ImageSizeWrapper, ...]:
test_info: VLMTestInfo, test_type: VLMTestType
) -> tuple[ImageSizeWrapper, ...]:
"""Given a test info which may have size factors or fixed sizes, wrap them
and combine them into an iterable, each of which will be used in parameter
expansion.
@ -133,18 +150,18 @@ def get_wrapped_test_sizes(
"""
# If it is an embedding test, we always use the EMBEDDING_SIZE_FACTORS
if test_type == VLMTestType.EMBEDDING:
return tuple([
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
for factor in EMBEDDING_SIZE_FACTORS
])
return tuple(
[
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
for factor in EMBEDDING_SIZE_FACTORS
]
)
# Audio and Custom inputs have preprocessed inputs
elif test_type in (VLMTestType.AUDIO, VLMTestType.CUSTOM_INPUTS):
return tuple()
size_factors = test_info.image_size_factors \
if test_info.image_size_factors else []
fixed_sizes = test_info.image_sizes \
if test_info.image_sizes else []
size_factors = test_info.image_size_factors if test_info.image_size_factors else []
fixed_sizes = test_info.image_sizes if test_info.image_sizes else []
wrapped_factors = [
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
@ -152,8 +169,7 @@ def get_wrapped_test_sizes(
]
wrapped_sizes = [
ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size)
for size in fixed_sizes
ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size) for size in fixed_sizes
]
return tuple(wrapped_factors + wrapped_sizes)
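As a quick illustration of the wrapping above, with minimal stand-ins for the ImageSizeWrapper and SizeType defined in .types (the factor and size values are hypothetical):

from dataclasses import dataclass
from enum import Enum
from typing import Union

class SizeType(Enum):  # minimal stand-in for .types.SizeType
    SIZE_FACTOR = "size_factor"
    FIXED_SIZE = "fixed_size"

@dataclass
class ImageSizeWrapper:  # minimal stand-in for .types.ImageSizeWrapper
    type: SizeType
    data: Union[float, tuple[int, int]]

size_factors = [0.25, 1.0]  # hypothetical per-test scale factors
fixed_sizes = [(488, 183)]  # hypothetical fixed (width, height)

wrapped = tuple(
    [ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=f) for f in size_factors]
    + [ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=s) for s in fixed_sizes]
)
assert len(wrapped) == 3  # each wrapper expands into one parametrized case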

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Core test implementation to be shared across modalities."""
from typing import Any, Callable, Optional
import torch
@ -70,22 +71,23 @@ def run_test(
if model_info.hf_overrides:
vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
if model_info.skip_tokenizer_init:
vllm_runner_kwargs_[
"skip_tokenizer_init"] = model_info.skip_tokenizer_init
vllm_runner_kwargs_["skip_tokenizer_init"] = model_info.skip_tokenizer_init
if vllm_runner_kwargs:
vllm_runner_kwargs_.update(vllm_runner_kwargs)
with vllm_runner(model,
max_model_len=max_model_len,
max_num_seqs=max_num_seqs,
dtype=dtype,
limit_mm_per_prompt=limit_mm_per_prompt,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=enforce_eager,
runner=runner,
**vllm_runner_kwargs_) as vllm_model:
with vllm_runner(
model,
max_model_len=max_model_len,
max_num_seqs=max_num_seqs,
dtype=dtype,
limit_mm_per_prompt=limit_mm_per_prompt,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=enforce_eager,
runner=runner,
**vllm_runner_kwargs_,
) as vllm_model:
tokenizer = vllm_model.llm.get_tokenizer()
vllm_kwargs: dict[str, Any] = {}
@ -95,21 +97,19 @@ def run_test(
vllm_kwargs["stop"] = stop_str
for prompts, image_data, video_data, audio_data in vllm_inputs:
mm_data = dict(images=image_data,
videos=video_data,
audios=audio_data)
mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
vllm_kwargs_with_mm_data = vllm_kwargs | mm_data
vllm_output = vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
**vllm_kwargs_with_mm_data)
**vllm_kwargs_with_mm_data,
)
vllm_outputs_per_mm.append(vllm_output)
hf_model = hf_runner(model,
dtype=dtype,
auto_cls=auto_cls,
model_kwargs=hf_model_kwargs)
hf_model = hf_runner(
model, dtype=dtype, auto_cls=auto_cls, model_kwargs=hf_model_kwargs
)
# Some models need to patch things like the model processor, e.g., internvl
if patch_hf_runner is not None:
@ -129,16 +129,15 @@ def run_test(
hf_kwargs["stop_strings"] = stop_str
for prompts, image_data, video_data, audio_data in inputs:
mm_data = dict(images=image_data,
videos=video_data,
audios=audio_data)
mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
hf_kwargs_with_mm_data = hf_kwargs | mm_data
hf_output = hf_model.generate_greedy_logprobs_limit(
prompts,
max_tokens,
num_logprobs=num_logprobs,
tokenizer=tokenizer,
**hf_kwargs_with_mm_data)
**hf_kwargs_with_mm_data,
)
hf_outputs_per_mm.append(hf_output)
# Apply output processing / sanitation to the vLLM and HF runner results
@ -150,8 +149,7 @@ def run_test(
second_runner_processor=vllm_output_post_proc,
)
for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm,
vllm_outputs_per_mm):
for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm, vllm_outputs_per_mm):
# This is usually check_logprobs_close, but it's passed through to
# allow things like check_outputs_equal where needed
comparator(
@ -171,15 +169,19 @@ def process_runner_outputs(
):
"""Applies the runner processor(s) to the runner outputs, if any."""
if first_runner_processor is not None:
first_runner_outputs = process_outputs(first_runner_processor, model,
first_runner_outputs)
first_runner_outputs = process_outputs(
first_runner_processor, model, first_runner_outputs
)
if second_runner_processor is not None:
second_runner_outputs = process_outputs(second_runner_processor, model,
second_runner_outputs)
second_runner_outputs = process_outputs(
second_runner_processor, model, second_runner_outputs
)
return first_runner_outputs, second_runner_outputs
def process_outputs(output_processor, model, outputs_per_image):
"""Applies a model specific post-processor function to a runner's output"""
return [[output_processor(res, model) for res in outputs]
for outputs in outputs_per_image]
return [
[output_processor(res, model) for res in outputs]
for outputs in outputs_per_image
]
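The nested comprehension keeps the per-image grouping intact while applying the processor to every result; a toy run (the upper-casing processor is hypothetical):

def process_outputs(output_processor, model, outputs_per_image):
    # Same shape-preserving map as the function above.
    return [
        [output_processor(res, model) for res in outputs]
        for outputs in outputs_per_image
    ]

to_upper = lambda res, model: res.upper()  # toy processor
assert process_outputs(to_upper, "m", [["a", "b"], ["c"]]) == [["A", "B"], ["C"]]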

View File

@ -1,12 +1,16 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Custom input builders for edge-cases in different models."""
from typing import Callable
from vllm.assets.image import ImageAsset
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (rescale_video_size, resize_video,
sample_frames_from_video)
from vllm.multimodal.video import (
rescale_video_size,
resize_video,
sample_frames_from_video,
)
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
from .builders import build_multi_image_inputs, build_single_image_inputs
@ -15,7 +19,7 @@ from .types import ImageSizeWrapper, PromptWithMultiModalInput, SizeType
def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
"""Builds inputs for multi-image (varied sizes/aspect ratio) testing.
Args:
formatter: model-specific prompt formatter.
"""
@ -41,7 +45,7 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
stop_sign,
rescale_image_size(stop_sign, 0.25),
cherry_blossom.resize((183, 488)),
cherry_blossom.resize((488, 183))
cherry_blossom.resize((488, 183)),
],
cherry_blossom,
]
@ -54,10 +58,11 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
]
def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
num_frames: int = 16):
def multi_video_multi_aspect_ratio_inputs(
formatter: Callable[[str], str], num_frames: int = 16
):
"""Builds inputs for multi-video (varied sizes/aspect ratio) testing.
Args:
formatter: model-specific prompt formatter.
"""
@ -81,7 +86,7 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
video,
rescale_video_size(video, 0.25),
resize_video(video, (183, 488)),
resize_video(video, (488, 183))
resize_video(video, (488, 183)),
],
video,
]
@ -96,7 +101,9 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
def different_patch_input_cases_internvl():
images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
formatter = lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
formatter = (
lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n"
) # noqa: E501
single_img_prompts = [
"<image>\nWhat's the content in the center of the image?",
"<image>\nWhat is the season?",
@ -115,14 +122,14 @@ def different_patch_input_cases_internvl():
def windows_attention_image_qwen2_5_vl():
# image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501
image = ImageAsset("hato").pil_image
question = "Describe the image."
img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
prompt = (f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n")
prompt = (
f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n<|im_start|>assistant\n"
)
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
return build_single_image_inputs([image], [prompt], wrapped_sf)
@ -136,8 +143,9 @@ def video_with_metadata_glm4_1v():
formatted_prompt = f"<|user|>\n{video_prompt}{question}<|assistant|>\n"
scales = [0.1, 0.2, 0.25]
video_input = [[(rescale_video_size(video_array, scale), metadata)]
for scale in scales]
video_input = [
[(rescale_video_size(video_array, scale), metadata)] for scale in scales
]
prompts = [formatted_prompt] * len(video_input)
return [

View File

@ -4,6 +4,7 @@
for manipulating the input / output of HF & vLLM test runners, which are
typically specific to a small subset of models.
"""
import types
from pathlib import PosixPath
from typing import Optional, Union
@ -15,8 +16,13 @@ import pytest
import regex as re
import torch
from PIL.Image import Image
from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
GenerationConfig, GenerationMixin)
from transformers import (
AutoConfig,
AutoTokenizer,
BatchFeature,
GenerationConfig,
GenerationMixin,
)
from transformers.video_utils import VideoMetadata
from vllm.logprobs import SampleLogprobs
@ -27,8 +33,7 @@ from .types import RunnerOutput
####### vLLM output processors functions
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
"""Sanitize vllm output [blip2 models] to be comparable with hf output."""
_, output_str, out_logprobs = vllm_output
@ -42,8 +47,7 @@ def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
return hf_output_ids, hf_output_str, out_logprobs
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
"""Sanitize vllm output [fuyu models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@ -53,8 +57,8 @@ def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
def qwen_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
vllm_output: RunnerOutput, model: str
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
"""Sanitize vllm output [qwen models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@ -64,8 +68,8 @@ def qwen_vllm_to_hf_output(
def qwen2_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
vllm_output: RunnerOutput, model: str
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
"""Sanitize vllm output [qwen2 models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@ -75,8 +79,8 @@ def qwen2_vllm_to_hf_output(
def kimiv_vl_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
vllm_output: RunnerOutput, model: str
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
"""Sanitize vllm output [kimi_vl models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@ -85,23 +89,25 @@ def kimiv_vl_vllm_to_hf_output(
return output_ids, hf_output_str, out_logprobs
def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def llava_image_vllm_to_hf_output(
vllm_output: RunnerOutput, model: str
) -> RunnerOutput:
config = AutoConfig.from_pretrained(model)
mm_token_id = config.image_token_index
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
def llava_video_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
vllm_output: RunnerOutput, model: str
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
config = AutoConfig.from_pretrained(model)
mm_token_id = config.video_token_index
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
mm_token_id: int) -> RunnerOutput:
def _llava_vllm_to_hf_output(
vllm_output: RunnerOutput, model: str, mm_token_id: int
) -> RunnerOutput:
"""Sanitize vllm output [Llava models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@ -109,7 +115,8 @@ def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
eos_token_id = tokenizer.eos_token_id
hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
token_id
for idx, token_id in enumerate(output_ids)
if token_id != mm_token_id or output_ids[idx - 1] != mm_token_id
]
@ -128,8 +135,9 @@ def llava_onevision_hf_model_kwargs(model: str) -> dict:
return config.to_dict()
def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def llava_onevision_vllm_to_hf_output(
vllm_output: RunnerOutput, model: str
) -> RunnerOutput:
"""Sanitize vllm output [llava-onevision] to compare with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@ -140,7 +148,8 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
eos_token_id = tokenizer.eos_token_id
hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
token_id
for idx, token_id in enumerate(output_ids)
if token_id != video_token_id or output_ids[idx - 1] != video_token_id
]
@ -151,8 +160,7 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
return hf_output_ids, hf_output_str, out_logprobs
def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def mantis_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
"""Sanitize vllm output [mantis] to compare with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@ -161,8 +169,7 @@ def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
return output_ids, hf_output_str, out_logprobs
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
"""Sanitize vllm output [phi3v] to be comparable with hf output."""
_, output_str, out_logprobs = vllm_output
@ -180,8 +187,7 @@ def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
return hf_output_ids, hf_output_str, out_logprobs
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
"""Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@ -192,7 +198,8 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
eos_token_id = tokenizer.eos_token_id
hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
token_id
for idx, token_id in enumerate(output_ids)
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
]
@ -205,46 +212,40 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
####### Post-processors for HF outputs
def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
if output_str.endswith("<end▁of▁sentence>"):
output_str = output_str.split("<end▁of▁sentence>")[0]
return output_ids, output_str, out_logprobs
def idefics3_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def idefics3_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
if output_str.endswith("<end_of_utterance>"):
output_str = output_str.split("<end_of_utterance>")[0]
return output_ids, output_str, out_logprobs
def smolvlm_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def smolvlm_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
# Based on Idefics3
return idefics3_trunc_hf_output(hf_output, model)
def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def minicpmv_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
if output_str.endswith("<|eot_id|>"):
output_str = output_str.split("<|eot_id|>")[0]
return output_ids, output_str, out_logprobs
def minimax_vl_01_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def minimax_vl_01_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
if output_str.endswith("<end_of_sentence>"):
output_str = output_str.split("<end_of_sentence>")[0]
return output_ids, output_str, out_logprobs
def ultravox_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def ultravox_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
tokenizer = AutoTokenizer.from_pretrained(model)
@ -262,8 +263,8 @@ def get_llava_embeddings(image_assets: ImageTestAssets):
####### Prompt path encoders for models that need models on disk
def qwen_prompt_path_encoder(
tmp_path: PosixPath, prompt: str,
assets: Union[list[ImageAsset], ImageTestAssets]) -> str:
tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset], ImageTestAssets]
) -> str:
"""Given a temporary dir path, export one or more image assets into the
tempdir & replace the image references in the prompt with their local paths so that
the HF version of Qwen-VL can resolve the path and load the image in its
@ -313,8 +314,9 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return BatchFeature(data=inputs, tensor_type="pt")
hf_model.processor = processor
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language.model.embed_tokens
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.language.model.embed_tokens
)
return hf_model
@ -357,11 +359,10 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
assert len(contents) == len(images)
return hf_processor.apply_chat_template(
[{
"role": "user",
"image": image,
"content": content
} for image, content in zip(images, contents)],
[
{"role": "user", "image": image, "content": content}
for image, content in zip(images, contents)
],
add_generation_prompt=True,
tokenize=True,
return_dict=True,
@ -369,8 +370,9 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
)
hf_model.processor = processor
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.transformer.output_layer
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.transformer.output_layer
)
return hf_model
@ -387,10 +389,9 @@ def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
else:
video_metadata = None
return hf_processor(*args,
videos=videos,
video_metadata=video_metadata,
**kwargs)
return hf_processor(
*args, videos=videos, video_metadata=video_metadata, **kwargs
)
hf_model.processor = processor
return hf_model
@ -406,8 +407,9 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.num_image_token = hf_runner.model.num_image_token
self.tokenizer = hf_runner.tokenizer
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
trust_remote_code=True)
self.config = AutoConfig.from_pretrained(
hf_runner.model_name, trust_remote_code=True
)
self.vision_config = self.config.vision_config
self.use_thumbnail = self.config.use_thumbnail
self.use_msac = self.config.use_msac
@ -415,11 +417,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.max_num = self.config.max_dynamic_patch
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Union[Image, list[Image]],
**kwargs):
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
# yapf: disable
from vllm.model_executor.models.h2ovl import (
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl)
IMG_CONTEXT,
IMG_END,
IMG_START,
image_to_pixel_values_h2ovl,
)
# yapf: enable
images = [images] if isinstance(images, Image) else images
@ -431,29 +436,26 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
max_num=self.max_num,
use_thumbnail=self.use_thumbnail,
use_msac=self.use_msac,
) for image in images
]
num_patches_list = [
pixel_value.shape[0] for pixel_value in pixel_values
)
for image in images
]
num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
pixel_values = torch.cat(pixel_values, dim=0)
for num_patches in num_patches_list:
context_tokens = IMG_CONTEXT * self.num_image_token \
* num_patches
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
image_tokens = IMG_START + context_tokens + IMG_END
text = text.replace('<image>', image_tokens, 1)
text = text.replace("<image>", image_tokens, 1)
prompt = self.tokenizer(text, return_tensors="pt")
prompt.update({"pixel_values": pixel_values})
return prompt
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
"<IMG_CONTEXT>")
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
hf_model.model.img_context_token_id = img_context_token_id
hf_model.processor = H2OVLProcessor(hf_model)
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language_model.get_output_embeddings()
hf_model.model.generate = types.MethodType(_internvl_generate,
hf_model.model)
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.language_model.get_output_embeddings()
)
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
return hf_model
@ -467,19 +469,23 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.num_image_token = hf_runner.model.num_image_token
self.tokenizer = hf_runner.tokenizer
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
trust_remote_code=True)
self.config = AutoConfig.from_pretrained(
hf_runner.model_name, trust_remote_code=True
)
self.vision_config = self.config.vision_config
self.use_thumbnail = self.config.use_thumbnail
self.min_num = self.config.min_dynamic_patch
self.max_num = self.config.max_dynamic_patch
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Union[Image, list[Image]],
**kwargs):
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
from vllm.model_executor.models.skyworkr1v import (
IMG_CONTEXT, IMG_END, IMG_START,
image_to_pixel_values_skyworkr1v)
IMG_CONTEXT,
IMG_END,
IMG_START,
image_to_pixel_values_skyworkr1v,
)
images = [images] if isinstance(images, Image) else images
pixel_values = [
image_to_pixel_values_skyworkr1v(
@ -488,29 +494,26 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
min_num=self.min_num,
max_num=self.max_num,
use_thumbnail=self.use_thumbnail,
) for image in images
]
num_patches_list = [
pixel_value.shape[0] for pixel_value in pixel_values
)
for image in images
]
num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
pixel_values = torch.cat(pixel_values, dim=0)
for num_patches in num_patches_list:
context_tokens = IMG_CONTEXT * self.num_image_token \
* num_patches
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
image_tokens = IMG_START + context_tokens + IMG_END
text = text.replace('<image>', image_tokens, 1)
text = text.replace("<image>", image_tokens, 1)
prompt = self.tokenizer(text, return_tensors="pt")
prompt.update({"pixel_values": pixel_values})
return prompt
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
"<IMG_CONTEXT>")
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
hf_model.model.img_context_token_id = img_context_token_id
hf_model.processor = SkyworkR1VProcessor(hf_model)
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language_model.get_output_embeddings()
hf_model.model.generate = types.MethodType(_internvl_generate,
hf_model.model)
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.language_model.get_output_embeddings()
)
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
return hf_model
@ -524,8 +527,9 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.num_image_token = hf_runner.model.num_image_token
self.tokenizer = hf_runner.tokenizer
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
trust_remote_code=True)
self.config = AutoConfig.from_pretrained(
hf_runner.model_name, trust_remote_code=True
)
self.vision_config = self.config.vision_config
self.use_thumbnail = self.config.use_thumbnail
self.min_num = self.config.min_dynamic_patch
@ -540,8 +544,13 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
**kwargs,
):
from vllm.model_executor.models.internvl import (
IMG_CONTEXT, IMG_END, IMG_START,
image_to_pixel_values_internvl, video_to_pixel_values_internvl)
IMG_CONTEXT,
IMG_END,
IMG_START,
image_to_pixel_values_internvl,
video_to_pixel_values_internvl,
)
images = [images] if isinstance(images, Image) else images
videos = [videos] if isinstance(videos, np.ndarray) else videos
if images is not None:
@ -552,7 +561,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
min_num=self.min_num,
max_num=self.max_num,
use_thumbnail=self.use_thumbnail,
) for image in images
)
for image in images
]
num_patches_images = [
pixel_value.shape[0] for pixel_value in pixel_values_images
@ -568,7 +578,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
min_num=1,
max_num=1,
use_thumbnail=False,
) for video in videos
)
for video in videos
]
num_patches_videos = [
pixel_value.shape[0] for pixel_value in pixel_values_videos
@ -580,38 +591,37 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
while ("<image>" in text) or ("<video>" in text):
image_index = text.find("<image>")
video_index = text.find("<video>")
if image_index == -1 or (video_index > -1
and video_index < image_index):
if image_index == -1 or (
video_index > -1 and video_index < image_index
):
num_patches = num_patches_videos.pop(0)
pixel_values.append(pixel_values_videos.pop(0))
context_tokens = IMG_START + \
IMG_CONTEXT * self.num_image_token + IMG_END
video_tokens = ''.join([
f'Frame{i+1}: {context_tokens}'
for i in range(num_patches)
])
text = text.replace('<video>', video_tokens, 1)
context_tokens = (
IMG_START + IMG_CONTEXT * self.num_image_token + IMG_END
)
video_tokens = "".join(
[f"Frame{i + 1}: {context_tokens}" for i in range(num_patches)]
)
text = text.replace("<video>", video_tokens, 1)
else:
num_patches = num_patches_images.pop(0)
pixel_values.append(pixel_values_images.pop(0))
context_tokens = IMG_CONTEXT * self.num_image_token \
* num_patches
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
image_tokens = IMG_START + context_tokens + IMG_END
text = text.replace('<image>', image_tokens, 1)
text = text.replace("<image>", image_tokens, 1)
pixel_values = torch.cat(pixel_values, dim=0)
prompt = self.tokenizer(text, return_tensors="pt")
prompt.update({"pixel_values": pixel_values})
return prompt
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
"<IMG_CONTEXT>")
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
hf_model.model.img_context_token_id = img_context_token_id
hf_model.processor = InternVLProcessor(hf_model)
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language_model.get_output_embeddings()
hf_model.model.generate = types.MethodType(_internvl_generate,
hf_model.model)
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.language_model.get_output_embeddings()
)
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
return hf_model
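The `<image>` expansion these patched processors perform is plain string arithmetic; a minimal standalone sketch, with made-up token strings and counts (the real values come from the model config):

IMG_START, IMG_CONTEXT, IMG_END = "<img>", "<IMG_CONTEXT>", "</img>"
num_image_token, num_patches = 4, 2  # illustrative: tokens per patch, patches

context_tokens = IMG_CONTEXT * num_image_token * num_patches
image_tokens = IMG_START + context_tokens + IMG_END
text = "Describe <image> briefly.".replace("<image>", image_tokens, 1)
assert text.count(IMG_CONTEXT) == num_image_token * num_patches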
@ -631,7 +641,7 @@ def _internvl_generate(
input_embeds = input_embeds.reshape(B * N, C)
input_ids = input_ids.reshape(B * N)
selected = (input_ids == self.img_context_token_id)
selected = input_ids == self.img_context_token_id
assert selected.sum() != 0
input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
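The masked scatter in `_internvl_generate` can be reproduced in isolation; a sketch with dummy shapes (all sizes and ids here are invented):

import torch

B, N, C = 1, 6, 8                      # batch, sequence length, hidden size
img_context_token_id = 7               # hypothetical placeholder id
input_embeds = torch.zeros(B * N, C)
input_ids = torch.tensor([0, 7, 7, 0, 0, 0])
vit_embeds = torch.ones(2, C)          # one vision embedding per placeholder

selected = input_ids == img_context_token_id
input_embeds[selected] = vit_embeds.reshape(-1, C)
assert input_embeds.sum().item() == 2 * C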
@ -778,8 +788,9 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for Ovis2."""
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.llm.get_output_embeddings()
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.llm.get_output_embeddings()
)
def processor(*args, text="", images=None, **kwargs):
text_tokenizer = hf_model.model.get_text_tokenizer()
@ -787,8 +798,7 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
prompt_start_and_end = {
"qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
"llama":
("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
"llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
"gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
}
for start, end in prompt_start_and_end.values():
@ -797,7 +807,8 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
break
prompt, input_ids, pixel_values = hf_model.model.preprocess_inputs(
text_or_conversations=text, images=images)
text_or_conversations=text, images=images
)
attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)
inputs = {
@ -813,8 +824,9 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for Ovis2."""
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.llm.get_output_embeddings()
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.llm.get_output_embeddings()
)
def processor(*args, text="", images=None, videos=None, **kwargs):
if images is None:
@ -825,13 +837,11 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
videos = []
else:
videos = [videos] if isinstance(videos, np.ndarray) else videos
videos = [[PIL.Image.fromarray(frame) for frame in vid]
for vid in videos]
videos = [[PIL.Image.fromarray(frame) for frame in vid] for vid in videos]
prompt_start_and_end = {
"qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
"llama":
("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
"llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
"gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
}
for start, end in prompt_start_and_end.values():
@ -842,21 +852,20 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
images_message = [{"type": "image", "image": img} for img in images]
videos_message = [{"type": "video", "video": vid} for vid in videos]
messages = [{
"role":
"user",
"content": [
*images_message,
*videos_message,
{
"type": "text",
"text": text
},
],
}]
messages = [
{
"role": "user",
"content": [
*images_message,
*videos_message,
{"type": "text", "text": text},
],
}
]
input_ids, pixel_values, grid_thws = hf_model.model.preprocess_inputs(
messages=messages, enable_thinking=True)
messages=messages, enable_thinking=True
)
inputs = {
"inputs": input_ids,
"pixel_values": pixel_values,

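Both Ovis patches strip the chat-template role markers before re-encoding; a self-contained sketch of that loop, using the qwen2 marker pair from the table above:

prompt_start_and_end = {
    "qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
}
text = "<|im_start|>user\nDescribe the image.<|im_end|>\n"
for start, end in prompt_start_and_end.values():
    if text.startswith(start) and text.endswith(end):
        text = text[len(start):-len(end)]
        break
assert text == "Describe the image."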
View File

@ -3,23 +3,34 @@
"""Entrypoints for wrapping the core run_test implementation for specific test
types / modalities.
"""
from pathlib import PosixPath
from .....conftest import (AudioTestAssets, HfRunner, ImageTestAssets,
VideoTestAssets, VllmRunner)
from .....conftest import (
AudioTestAssets,
HfRunner,
ImageTestAssets,
VideoTestAssets,
VllmRunner,
)
from . import builders, core
from .types import ExpandableVLMTestArgs, VLMTestInfo
####### Entrypoints for running different test types
def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets):
def run_single_image_test(
*,
tmp_path: PosixPath,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
assert test_case.size_wrapper is not None
inputs = builders.build_single_image_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper, tmp_path)
model_test_info, image_assets, test_case.size_wrapper, tmp_path
)
core.run_test(
hf_runner=hf_runner,
@ -31,17 +42,23 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": 1},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets):
def run_multi_image_test(
*,
tmp_path: PosixPath,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
assert test_case.size_wrapper is not None
inputs = builders.build_multi_image_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper, tmp_path)
model_test_info, image_assets, test_case.size_wrapper, tmp_path
)
core.run_test(
hf_runner=hf_runner,
@ -53,17 +70,22 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": len(image_assets)},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_embedding_test(*, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets):
def run_embedding_test(
*,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
assert test_case.size_wrapper is not None
inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper)
model_test_info, image_assets, test_case.size_wrapper
)
core.run_test(
hf_runner=hf_runner,
@ -76,7 +98,8 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
limit_mm_per_prompt={"image": 1},
vllm_embeddings=vllm_embeddings,
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_video_test(
@ -90,8 +113,11 @@ def run_video_test(
assert test_case.size_wrapper is not None
assert test_case.num_video_frames is not None
inputs = builders.build_video_inputs_from_test_info(
model_test_info, video_assets, test_case.size_wrapper,
test_case.num_video_frames)
model_test_info,
video_assets,
test_case.size_wrapper,
test_case.num_video_frames,
)
core.run_test(
hf_runner=hf_runner,
@ -103,7 +129,8 @@ def run_video_test(
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"video": len(video_assets)},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_audio_test(
@ -114,8 +141,7 @@ def run_audio_test(
vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets,
):
inputs = builders.build_audio_inputs_from_test_info(
model_test_info, audio_assets)
inputs = builders.build_audio_inputs_from_test_info(model_test_info, audio_assets)
core.run_test(
hf_runner=hf_runner,
@ -127,13 +153,17 @@ def run_audio_test(
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"audio": 1},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner]):
def run_custom_inputs_test(
*,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
):
# Custom test cases can provide inputs directly, but they need to
# explicitly provide a CustomTestConfig, which wraps the inputs and
# the limit_mm_per_prompt
@ -155,4 +185,5 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt=limit_mm_per_prompt,
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Types for writing multimodal model tests."""
from collections.abc import Iterable
from enum import Enum
from pathlib import PosixPath
@ -15,9 +16,16 @@ from vllm.config import RunnerOption
from vllm.logprobs import SampleLogprobs
from vllm.transformers_utils.tokenizer import AnyTokenizer
from .....conftest import (AUDIO_ASSETS, IMAGE_ASSETS, HfRunner, ImageAsset,
ImageTestAssets, PromptAudioInput, PromptImageInput,
PromptVideoInput)
from .....conftest import (
AUDIO_ASSETS,
IMAGE_ASSETS,
HfRunner,
ImageAsset,
ImageTestAssets,
PromptAudioInput,
PromptImageInput,
PromptVideoInput,
)
from ....utils import check_logprobs_close
# meta image tag; will be replaced by the appropriate tag for the model
@ -47,6 +55,7 @@ RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
class PromptWithMultiModalInput(NamedTuple):
"""Holds the multimodal input for a single test case."""
prompts: list[str]
image_data: Optional[PromptImageInput] = None
video_data: Optional[PromptVideoInput] = None
@ -100,8 +109,9 @@ class VLMTestInfo(NamedTuple):
# Function for converting ImageAssets to image embeddings;
# We need to define this explicitly for embedding tests
convert_assets_to_embeddings: Optional[Callable[[ImageTestAssets],
list[torch.Tensor]]] = None
convert_assets_to_embeddings: Optional[
Callable[[ImageTestAssets], list[torch.Tensor]]
] = None
# Exposed options for vLLM runner; we change these in several tests,
# but the defaults are derived from VllmRunner & the engine defaults
@ -156,8 +166,8 @@ class VLMTestInfo(NamedTuple):
# for Qwen-VL, which requires encoding the image path / url into the prompt
# for HF runner
prompt_path_encoder: Optional[
Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]],
str]] = None # noqa: E501
Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]], str]
] = None # noqa: E501
# Allows configuring a test to run with custom inputs
custom_test_opts: Optional[list[CustomTestOptions]] = None
@ -190,6 +200,7 @@ class VLMTestInfo(NamedTuple):
class ExpandableVLMTestArgs(NamedTuple):
"""The expanded kwargs which correspond to a single test case."""
model: str
max_tokens: int
num_logprobs: int

View File

@ -12,10 +12,12 @@ HF_TEXT_PROMPTS = [
"a photo of a cherry blossom",
]
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign": "",
"cherry_blossom": "",
})
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "",
"cherry_blossom": "",
}
)
MODELS = ["openai/clip-vit-base-patch32"]
@ -33,11 +35,9 @@ def _run_test(
# vLLM needs a fresh process without CUDA initialization.
# If we run HF first, CUDA will already be initialized, which breaks the
# fork-based multiprocessing backend (the default method).
with vllm_runner(model,
runner="pooling",
dtype=dtype,
enforce_eager=True,
max_model_len=77) as vllm_model:
with vllm_runner(
model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=77
) as vllm_model:
vllm_outputs = vllm_model.embed(input_texts, images=input_images)
with hf_runner(model, dtype=dtype, auto_cls=CLIPModel) as hf_model:
@ -48,10 +48,12 @@ def _run_test(
if "pixel_values" in inputs:
inputs.pop("input_ids")
pooled_output = hf_model.model.get_image_features(
**hf_model.wrap_device(inputs)).squeeze(0)
**hf_model.wrap_device(inputs)
).squeeze(0)
else:
pooled_output = hf_model.model.get_text_features(
**hf_model.wrap_device(inputs)).squeeze(0)
**hf_model.wrap_device(inputs)
).squeeze(0)
all_outputs.append(pooled_output.tolist())
@ -98,8 +100,7 @@ def test_models_image(
dtype: str,
) -> None:
input_texts_images = [
(text, asset.pil_image)
for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
(text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]
@ -125,11 +126,9 @@ def test_models_text_image_no_crash(
texts = [HF_TEXT_PROMPTS[0]]
images = [image_assets[0].pil_image]
with vllm_runner(model,
runner="pooling",
dtype=dtype,
enforce_eager=True,
max_model_len=77) as vllm_model:
with vllm_runner(
model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=77
) as vllm_model:
with pytest.raises(ValueError, match="not both"):
vllm_model.embed(texts, images=images)

View File

@ -17,18 +17,21 @@ HF_TEXT_PROMPTS = [
# T -> X
(
"Query: Find me an everyday image that matches the given caption: The label of the object is stop sign", # noqa: E501,
Image.new("RGB", (56, 56))),
Image.new("RGB", (56, 56)),
),
# T -> X
("Query: Retrieve an image of this caption: cherry blossom",
Image.new("RGB", (56, 56))),
(
"Query: Retrieve an image of this caption: cherry blossom",
Image.new("RGB", (56, 56)),
),
]
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"What is shown in this image?",
"cherry_blossom":
"What is shown in this image?"
})
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "What is shown in this image?",
"cherry_blossom": "What is shown in this image?",
}
)
MODELS = ["MrLight/dse-qwen2-2b-mrl-v1"]
@ -36,34 +39,30 @@ MODELS = ["MrLight/dse-qwen2-2b-mrl-v1"]
def get_messages(image: Image.Image, text: str, embed_text: bool):
# NOTE: remember to wrap the messages in an outer list, as required.
if embed_text:
messages = [{
"role":
"user",
"content": [
{
"type": "image",
"image": Image.new("RGB", (56, 56)),
"resized_height": 1,
"resized_width": 1
}, # a dummy image is needed here to simplify processing.
{
"type": "text",
"text": text
},
]
}]
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": Image.new("RGB", (56, 56)),
"resized_height": 1,
"resized_width": 1,
}, # a dummy image is needed here to simplify processing.
{"type": "text", "text": text},
],
}
]
else:
messages = [{
"role":
"user",
"content": [{
"type": "image",
"image": image
}, {
"type": "text",
"text": text
}]
}]
messages = [
{
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": text},
],
}
]
return messages
@ -71,8 +70,10 @@ def apply_chat_template_and_add_eos(
messages: list[dict],
apply_chat_template_fn: Callable,
):
prompt = apply_chat_template_fn(
messages, tokenize=False, add_generation_prompt=True) + "<|endoftext|>"
prompt = (
apply_chat_template_fn(messages, tokenize=False, add_generation_prompt=True)
+ "<|endoftext|>"
)
return prompt
@ -86,16 +87,14 @@ def _run_test(
*,
dtype: str,
) -> None:
'''SET PYTHONPATH'''
"""SET PYTHONPATH"""
# NOTE: take care of the order: run vLLM first, then HF.
# vLLM needs a fresh process without CUDA initialization.
# If we run HF first, CUDA will already be initialized, which breaks the
# fork-based multiprocessing backend (the default method).
with vllm_runner(model,
runner="pooling",
dtype=dtype,
enforce_eager=True,
max_model_len=8192) as vllm_model:
with vllm_runner(
model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=8192
) as vllm_model:
tokenizer = vllm_model.llm.get_tokenizer()
texts = [
# this is necessary because vllm_model.embed will not apply any
@ -105,25 +104,25 @@ def _run_test(
apply_chat_template_and_add_eos(
get_messages(image, text, False),
apply_chat_template_fn=tokenizer.apply_chat_template,
) for text, image in zip(input_texts, input_images)
)
for text, image in zip(input_texts, input_images)
# vLLM will later replace the pad token with the actual image
# (which may be a placeholder image).
]
vllm_outputs = vllm_model.embed(texts, images=input_images)
hf_outputs = []
with hf_runner(model,
dtype=dtype,
auto_cls=Qwen2VLForConditionalGeneration) as hf_model:
with hf_runner(
model, dtype=dtype, auto_cls=Qwen2VLForConditionalGeneration
) as hf_model:
prompts = []
for text, image, embed_text in zip(input_texts, input_images,
embed_texts):
for text, image, embed_text in zip(input_texts, input_images, embed_texts):
# dse requires non-standard input processing
# because it needs an image_pad token
messages = get_messages(image, text, embed_text)
prompt = apply_chat_template_and_add_eos(
messages, hf_model.processor.apply_chat_template)
messages, hf_model.processor.apply_chat_template
)
prompts.append(prompt)
@ -145,9 +144,9 @@ def _run_test(
return_dict=True,
output_hidden_states=True,
)
pooled_output = F.normalize(outputs.hidden_states[-1][0, -1],
p=2,
dim=-1)
pooled_output = F.normalize(
outputs.hidden_states[-1][0, -1], p=2, dim=-1
)
all_outputs.append(pooled_output.tolist())
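The pooling on the HF side is last-token selection followed by L2 normalisation; a minimal sketch with a random tensor standing in for real hidden states:

import torch
import torch.nn.functional as F

hidden_states = torch.randn(1, 10, 16)  # (batch, seq_len, hidden), dummy values
pooled = F.normalize(hidden_states[0, -1], p=2, dim=-1)
assert torch.allclose(pooled.norm(), torch.tensor(1.0), atol=1e-5)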
@ -170,8 +169,9 @@ def test_models_text(
model: str,
dtype: str,
) -> None:
input_texts_images = [(text, image_placeholder)
for text, image_placeholder in HF_TEXT_PROMPTS]
input_texts_images = [
(text, image_placeholder) for text, image_placeholder in HF_TEXT_PROMPTS
]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]
embed_texts = [True] * len(input_texts)
@ -198,8 +198,7 @@ def test_models_image(
dtype: str,
) -> None:
input_texts_images = [
(text, asset.pil_image)
for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
(text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]

View File

@ -29,7 +29,7 @@ def run_intern_vit_test(
img_processor = CLIPImageProcessor.from_pretrained(model)
images = [asset.pil_image for asset in image_assets]
pixel_values = [
img_processor(images, return_tensors='pt').pixel_values.to(torch_dtype)
img_processor(images, return_tensors="pt").pixel_values.to(torch_dtype)
for images in images
]
@ -37,15 +37,16 @@ def run_intern_vit_test(
if not getattr(config, "norm_type", None):
config.norm_type = "rms_norm"
hf_model = AutoModel.from_pretrained(model,
torch_dtype=torch_dtype,
trust_remote_code=True).to("cuda")
hf_model = AutoModel.from_pretrained(
model, torch_dtype=torch_dtype, trust_remote_code=True
).to("cuda")
hf_outputs_per_image = [
hf_model(pixel_value.to("cuda")).last_hidden_state
for pixel_value in pixel_values
]
from vllm.model_executor.models.intern_vit import InternVisionModel
vllm_model = InternVisionModel(config)
vllm_model.load_weights(hf_model.state_dict().items())
@ -54,22 +55,23 @@ def run_intern_vit_test(
vllm_model = vllm_model.to("cuda", torch_dtype)
vllm_outputs_per_image = [
vllm_model(pixel_values=pixel_value.to("cuda"))
for pixel_value in pixel_values
vllm_model(pixel_values=pixel_value.to("cuda")) for pixel_value in pixel_values
]
del vllm_model
cleanup_dist_env_and_memory()
cos_similar = nn.CosineSimilarity(dim=-1)
for vllm_output, hf_output in zip(vllm_outputs_per_image,
hf_outputs_per_image):
for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image):
assert cos_similar(vllm_output, hf_output).mean() > 0.99
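As a sanity check on the metric itself: `nn.CosineSimilarity(dim=-1)` yields 1.0 for identical inputs, which is what the 0.99 threshold above is probing. A tiny sketch:

import torch
import torch.nn as nn

cos = nn.CosineSimilarity(dim=-1)
x = torch.randn(2, 4, 8)
assert torch.allclose(cos(x, x), torch.ones(2, 4), atol=1e-6)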
@pytest.mark.parametrize("model_id", [
"OpenGVLab/InternViT-300M-448px",
"OpenGVLab/InternViT-6B-448px-V1-5",
])
@pytest.mark.parametrize(
"model_id",
[
"OpenGVLab/InternViT-300M-448px",
"OpenGVLab/InternViT-6B-448px-V1-5",
],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_models(dist_init, image_assets, model_id, dtype: str) -> None:
run_intern_vit_test(

View File

@ -29,7 +29,6 @@ def vllm_reranker(
query_type: str = "text",
doc_type: str = "text",
):
def create_image_param(url: str) -> ChatCompletionContentPartImageParam:
return {"type": "image_url", "image_url": {"url": f"{url}"}}
@ -38,23 +37,25 @@ def vllm_reranker(
query = query_strs
elif query_type == "image":
query = ScoreMultiModalParam(
content=[create_image_param(url) for url in query_strs])
content=[create_image_param(url) for url in query_strs]
)
documents: Union[list[str], ScoreMultiModalParam]
if doc_type == "text":
documents = document_strs
elif doc_type == "image":
documents = ScoreMultiModalParam(
content=[create_image_param(url) for url in document_strs])
content=[create_image_param(url) for url in document_strs]
)
with vllm_runner(
model_name,
runner="pooling",
dtype=dtype,
max_num_seqs=2,
max_model_len=2048,
mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt=limit_mm_per_prompt,
model_name,
runner="pooling",
dtype=dtype,
max_num_seqs=2,
max_model_len=2048,
mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt=limit_mm_per_prompt,
) as vllm_model:
outputs = vllm_model.llm.score(query, documents)
@ -78,16 +79,15 @@ def hf_reranker(
data_pairs = [[query_strs[0], d] for d in document_strs]
with hf_runner(
model_name,
dtype=dtype,
trust_remote_code=True,
auto_cls=AutoModel,
model_kwargs={"key_mapping": checkpoint_to_hf_mapper},
model_name,
dtype=dtype,
trust_remote_code=True,
auto_cls=AutoModel,
model_kwargs={"key_mapping": checkpoint_to_hf_mapper},
) as hf_model:
return hf_model.model.compute_score(data_pairs,
max_length=2048,
query_type=query_type,
doc_type=doc_type)
return hf_model.model.compute_score(
data_pairs, max_length=2048, query_type=query_type, doc_type=doc_type
)
# Visual Documents Reranking
@ -100,10 +100,12 @@ def test_model_text_image(hf_runner, vllm_runner, model_name, dtype):
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
]
hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
"text", "image")
vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
documents, "text", "image")
hf_outputs = hf_reranker(
hf_runner, model_name, dtype, query, documents, "text", "image"
)
vllm_outputs = vllm_reranker(
vllm_runner, model_name, dtype, query, documents, "text", "image"
)
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
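`pytest.approx(..., rel=0.02)` accepts up to a 2% relative difference from the expected value; illustrated on dummy scores:

import pytest

assert 1.00 == pytest.approx(1.01, rel=0.02)        # 1% off: accepted
assert not (1.00 == pytest.approx(1.05, rel=0.02))  # 5% off: rejected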
@ -127,10 +129,12 @@ def test_model_text_text(hf_runner, vllm_runner, model_name, dtype):
lower computational requirements.""", # noqa: E501
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
]
hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
"text", "text")
vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
documents, "text", "text")
hf_outputs = hf_reranker(
hf_runner, model_name, dtype, query, documents, "text", "text"
)
vllm_outputs = vllm_reranker(
vllm_runner, model_name, dtype, query, documents, "text", "text"
)
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
@ -157,10 +161,12 @@ def test_model_image_text(hf_runner, vllm_runner, model_name, dtype):
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
]
hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
"image", "text")
vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
documents, "image", "text")
hf_outputs = hf_reranker(
hf_runner, model_name, dtype, query, documents, "image", "text"
)
vllm_outputs = vllm_reranker(
vllm_runner, model_name, dtype, query, documents, "image", "text"
)
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
@ -178,10 +184,12 @@ def test_model_image_image(hf_runner, vllm_runner, model_name, dtype):
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
]
hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
"image", "image")
vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
documents, "image", "image")
hf_outputs = hf_reranker(
hf_runner, model_name, dtype, query, documents, "image", "image"
)
vllm_outputs = vllm_reranker(
vllm_runner, model_name, dtype, query, documents, "image", "image"
)
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)

View File

@ -24,9 +24,10 @@ from ...utils import check_embeddings_close
# built with LAPACK support.
pytestmark = pytest.mark.skipif(
not current_platform.is_cuda(),
reason="Llava Next model uses op that is only supported in CUDA")
reason="Llava Next model uses op that is only supported in CUDA",
)
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501
llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501
HF_TEXT_PROMPTS = [
# T -> X
@ -34,18 +35,21 @@ HF_TEXT_PROMPTS = [
"The label of the object is stop sign\nSummary above sentence in one word: " # noqa: E501
),
# T -> X
llama3_template.format(
"cherry blossom\nSummary above sentence in one word: "),
llama3_template.format("cherry blossom\nSummary above sentence in one word: "),
]
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
# I -> X
"stop_sign":
llama3_template.format("<image>\nSummary above image in one word: "),
# I -> X
"cherry_blossom":
llama3_template.format("<image>\nSummary above image in one word: "),
})
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
# I -> X
"stop_sign": llama3_template.format(
"<image>\nSummary above image in one word: "
),
# I -> X
"cherry_blossom": llama3_template.format(
"<image>\nSummary above image in one word: "
),
}
)
MODELS = ["royokong/e5-v"]
@ -63,23 +67,22 @@ def _run_test(
# vLLM needs a fresh process without CUDA initialization.
# If we run HF first, CUDA will already be initialized, which breaks the
# fork-based multiprocessing backend (the default method).
with vllm_runner(model,
runner="pooling",
dtype=dtype,
max_model_len=4096,
enforce_eager=True) as vllm_model:
with vllm_runner(
model, runner="pooling", dtype=dtype, max_model_len=4096, enforce_eager=True
) as vllm_model:
vllm_outputs = vllm_model.embed(input_texts, images=input_images)
with hf_runner(model, dtype=dtype,
auto_cls=AutoModelForImageTextToText) as hf_model:
with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForImageTextToText
) as hf_model:
# Patch the issue where generation_config.json is missing
hf_model.processor.patch_size = \
hf_model.model.config.vision_config.patch_size
hf_model.processor.patch_size = hf_model.model.config.vision_config.patch_size
# Patch the issue where image_token_id
# exceeds the maximum allowed vocab size
hf_model.model.resize_token_embeddings(
hf_model.model.language_model.vocab_size + 1)
hf_model.model.language_model.vocab_size + 1
)
all_inputs = hf_model.get_inputs(input_texts, images=input_images)
@ -91,8 +94,7 @@ def _run_test(
return_dict=True,
output_hidden_states=True,
)
pooled_output = F.normalize(outputs.hidden_states[-1][0, -1, :],
dim=-1)
pooled_output = F.normalize(outputs.hidden_states[-1][0, -1, :], dim=-1)
all_outputs.append(pooled_output.tolist())
@ -142,8 +144,7 @@ def test_models_image(
dtype: str,
) -> None:
input_texts_images = [
(text, asset.pil_image)
for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
(text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]

View File

@ -19,14 +19,14 @@ HF_TEXT_PROMPTS = [
"Retrieve an image of this caption: cherry blossom",
]
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
# T + I -> X
"stop_sign":
"<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign", # noqa: E501
# I -> X
"cherry_blossom":
"<|image_1|> Represent the given image for classification", # noqa: E501
})
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
# T + I -> X
"stop_sign": "<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign", # noqa: E501
# I -> X
"cherry_blossom": "<|image_1|> Represent the given image for classification", # noqa: E501
}
)
MODELS = ["TIGER-Lab/VLM2Vec-Full"]
@ -44,14 +44,14 @@ def _run_test(
# vLLM needs a fresh process without CUDA initialization.
# If we run HF first, CUDA will already be initialized, which breaks the
# fork-based multiprocessing backend (the default method).
with vllm_runner(model, runner="pooling", dtype=dtype,
enforce_eager=True) as vllm_model:
with vllm_runner(
model, runner="pooling", dtype=dtype, enforce_eager=True
) as vllm_model:
vllm_outputs = vllm_model.embed(input_texts, images=input_images)
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
hf_model_kwargs = {"_attn_implementation": "eager"}
with hf_runner(model, dtype=dtype,
model_kwargs=hf_model_kwargs) as hf_model:
with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model:
all_inputs = hf_model.get_inputs(input_texts, images=input_images)
all_outputs = []
@ -114,18 +114,21 @@ def test_models_image(
dtype: str,
) -> None:
input_texts_images = [
(text, asset.pil_image)
for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
(text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
]
# add cases for special_tokens
input_texts_images.append((
"\n<s><|user|>\n <|image_1|>\n\t <s>"
"Represent the given image for classification<|end|>"
"\n<|assistant|>\n",
Image.open(
get_vllm_public_assets(filename="cherry_blossom.jpg",
s3_prefix=VLM_IMAGES_DIR)),
))
input_texts_images.append(
(
"\n<s><|user|>\n <|image_1|>\n\t <s>"
"Represent the given image for classification<|end|>"
"\n<|assistant|>\n",
Image.open(
get_vllm_public_assets(
filename="cherry_blossom.jpg", s3_prefix=VLM_IMAGES_DIR
)
),
)
)
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]

View File

@ -19,25 +19,25 @@ def _run_test(
vllm_runner: type[VllmRunner],
model: str,
) -> None:
prompt = [
{
# This model deals with no text input
"prompt_token_ids": [1],
"multi_modal_data": generate_test_mm_data(),
} for _ in range(10)
}
for _ in range(10)
]
with vllm_runner(
model,
runner="pooling",
dtype="half",
enforce_eager=True,
skip_tokenizer_init=True,
# Limit the maximum number of sequences to avoid the
# test going OOM during the warmup run
max_num_seqs=32,
default_torch_num_threads=1,
model,
runner="pooling",
dtype="half",
enforce_eager=True,
skip_tokenizer_init=True,
# Limit the maximum number of sequences to avoid the
# test going OOM during the warmup run
max_num_seqs=32,
default_torch_num_threads=1,
) as vllm_model:
vllm_model.encode(prompt)
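The requests above carry no real text, just a single dummy token id beside the multimodal payload; a sketch of that request shape, with a hypothetical stand-in for `generate_test_mm_data`:

def make_prompts(mm_data: dict, n: int = 10) -> list[dict]:
    # One dummy token id per request; the signal is in multi_modal_data.
    return [{"prompt_token_ids": [1], "multi_modal_data": mm_data} for _ in range(n)]

prompts = make_prompts({"image": "tensor-placeholder"})
assert len(prompts) == 10 and prompts[0]["prompt_token_ids"] == [1]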

View File

@ -34,9 +34,9 @@ def run_radio_test(
# Using `self.get_nearest_supported_resolution`, for assets 432x642 the
# nearest supported resolution is 432x640.
pixel_values = [
img_processor(
image,
return_tensors='pt').pixel_values.to(torch_dtype)[:, :, :, :640]
img_processor(image, return_tensors="pt").pixel_values.to(torch_dtype)[
:, :, :, :640
]
for image in images
]
@ -51,32 +51,33 @@ def run_radio_test(
hf_model.eval()
hf_outputs_per_image = [
hf_model(pixel_value.to("cuda")).features
for pixel_value in pixel_values
hf_model(pixel_value.to("cuda")).features for pixel_value in pixel_values
]
radio_config = RadioConfig(model_name=config.args["model"],
reg_tokens=config.args["register_multiple"])
radio_config = RadioConfig(
model_name=config.args["model"], reg_tokens=config.args["register_multiple"]
)
vllm_model = RadioModel(radio_config)
vllm_model.load_weights(hf_model.state_dict())
vllm_model = vllm_model.to("cuda", torch_dtype)
vllm_outputs_per_image = [
vllm_model(pixel_values=pixel_value.to("cuda"))
for pixel_value in pixel_values
vllm_model(pixel_values=pixel_value.to("cuda")) for pixel_value in pixel_values
]
del vllm_model, hf_model
cleanup_dist_env_and_memory()
cos_similar = nn.CosineSimilarity(dim=-1)
for vllm_output, hf_output in zip(vllm_outputs_per_image,
hf_outputs_per_image):
for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image):
assert cos_similar(vllm_output, hf_output).mean() > 0.99
@pytest.mark.parametrize("model_id", [
"nvidia/C-RADIOv2-H",
])
@pytest.mark.parametrize(
"model_id",
[
"nvidia/C-RADIOv2-H",
],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_radio(dist_init, image_assets, model_id, dtype: str) -> None:
run_radio_test(

View File

@ -6,22 +6,27 @@ from typing import Optional, Union
import numpy as np
import pytest
from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
UserMessage)
from mistral_common.protocol.instruct.messages import ImageChunk, TextChunk, UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from PIL import Image
from vllm.config import ModelConfig
from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions,
ImageDummyOptions, VideoDummyOptions)
from vllm.config.multimodal import (
AudioDummyOptions,
BaseDummyOptions,
ImageDummyOptions,
VideoDummyOptions,
)
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
from vllm.multimodal.inputs import MultiModalInputs
from vllm.multimodal.processing import (BaseMultiModalProcessor,
InputProcessingContext)
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
cached_tokenizer_from_config,
encode_tokens)
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.transformers_utils.tokenizer import (
AnyTokenizer,
MistralTokenizer,
cached_tokenizer_from_config,
encode_tokens,
)
from ....multimodal.utils import random_audio, random_image, random_video
from ...registry import HF_EXAMPLE_MODELS
@ -36,14 +41,17 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
# GLM4.1V doesn't support multiple videos
video = mm_data["video"]
num_frames = len(video)
mm_data["video"] = (video, {
"total_num_frames": num_frames,
"fps": num_frames,
"duration": 1,
"frames_indices": [i for i in range(num_frames)],
"video_backend": "opencv",
"do_sample_frames": True,
})
mm_data["video"] = (
video,
{
"total_num_frames": num_frames,
"fps": num_frames,
"duration": 1,
"frames_indices": [i for i in range(num_frames)],
"video_backend": "opencv",
"do_sample_frames": True,
},
)
return mm_data
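The attached metadata is internally consistent by construction: with `fps` equal to the frame count and `duration` equal to 1, the product recovers `total_num_frames`. A standalone sketch:

import numpy as np

video = np.zeros((4, 32, 32, 3), dtype=np.uint8)  # four dummy RGB frames
num_frames = len(video)
metadata = {
    "total_num_frames": num_frames,
    "fps": num_frames,        # a one-second clip by construction
    "duration": 1,
    "frames_indices": list(range(num_frames)),
    "video_backend": "opencv",
    "do_sample_frames": True,
}
assert metadata["fps"] * metadata["duration"] == metadata["total_num_frames"]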
@ -102,7 +110,8 @@ def _test_processing_correctness(
mm_processor_cache_gb=2048,
skip_tokenizer_init=model_info.skip_tokenizer_init,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype)
dtype=model_info.dtype,
)
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
@ -145,27 +154,22 @@ def _test_processing_correctness(
input_to_hit = {
"image": Image.new("RGB", size=(128, 128)),
"video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
"audio": (np.zeros((512, )), 16000),
"audio": (np.zeros((512,)), 16000),
}
input_factory = {
"image":
partial(random_image, rng, min_wh=128, max_wh=256),
"video":
partial(random_video,
rng,
min_frames=2,
max_frames=16,
min_wh=128,
max_wh=256),
"audio":
partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
"image": partial(random_image, rng, min_wh=128, max_wh=256),
"video": partial(
random_video, rng, min_frames=2, max_frames=16, min_wh=128, max_wh=256
),
"audio": partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
}
for batch_idx in range(num_batches):
mm_data = {
k:
[(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
for _ in range(rng.randint(limit + 1))]
k: [
(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
for _ in range(rng.randint(limit + 1))
]
for k, limit in limit_mm_per_prompt_ints.items()
}
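The hit/miss sampling above reuses the fixed `input_to_hit` item with probability `hit_rate` and draws a fresh random input otherwise; a quick check that the scheme tracks its target rate (illustrative value):

import numpy as np

rng = np.random.RandomState(0)
hit_rate = 0.3                        # illustrative; the test parametrises this
draws = [rng.rand() < hit_rate for _ in range(10_000)]
assert abs(sum(draws) / len(draws) - hit_rate) < 0.02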
@ -174,12 +178,16 @@ def _test_processing_correctness(
# Mistral chat outputs tokens directly, rather than text prompts
if isinstance(tokenizer, MistralTokenizer):
images = mm_data.get("image", [])
request = ChatCompletionRequest(messages=[
UserMessage(content=[
TextChunk(text=""),
*(ImageChunk(image=image) for image in images),
]),
])
request = ChatCompletionRequest(
messages=[
UserMessage(
content=[
TextChunk(text=""),
*(ImageChunk(image=image) for image in images),
]
),
]
)
res = tokenizer.mistral.encode_chat_completion(request)
prompt = res.tokens
else:
@ -303,16 +311,14 @@ def _test_processing_correctness_one(
baseline_text_result,
baseline_tokenized_result,
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {text_prompt=}, "
f"{token_prompt=}, {mm_data=})",
msg=f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})",
)
_assert_inputs_equal(
cached_text_result,
cached_tokenized_result,
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {text_prompt=}, "
f"{token_prompt=}, {mm_data=})",
msg=f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})",
)

View File

@ -24,7 +24,8 @@ from ...utils import build_model_context
# post-sampled frames (expected behavior)
(-1, 1, 5),
(-1, 2, 10),
])
],
)
def test_processor_override(
model_id: str,
expected_toks_per_frame: int,
@ -55,10 +56,8 @@ def test_processor_override(
# Ensure we have the right number of placeholders per num_crops size
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token)
video_tok_count = processed_inputs["prompt_token_ids"].count(
video_token_id)
grid_t, _, _ = processed_inputs["mm_kwargs"].get_data(
)["video_grid_thw"][0]
video_tok_count = processed_inputs["prompt_token_ids"].count(video_token_id)
grid_t, _, _ = processed_inputs["mm_kwargs"].get_data()["video_grid_thw"][0]
assert grid_t == expected_grid_t
assert video_tok_count == expected_toks_per_frame * grid_t
@ -71,7 +70,7 @@ def test_video_loader_consistency(
fps: int,
):
"""
Ensure the dynamic video loader (pre-sampled by the loader) and the normal video
loader (post-sampled by the processor) produce the same video processing outputs.
"""
ctx = build_model_context(
@ -91,7 +90,8 @@ def test_video_loader_consistency(
static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
video_bytes, fps=fps)
video_bytes, fps=fps
)
# pre-sampled loader shouldn't read all frames
assert len(dynamic_video) < len(static_video)
@ -99,12 +99,11 @@ def test_video_loader_consistency(
static_mm_data = {"video": [(static_video, static_metadata)]}
dynamic_mm_data = {"video": [(dynamic_video, dynamic_metadata)]}
static_outputs = processor.apply(prompt, static_mm_data,
hf_processor_mm_kwargs)
dynamic_outputs = processor.apply(prompt, dynamic_mm_data,
hf_processor_mm_kwargs)
static_outputs = processor.apply(prompt, static_mm_data, hf_processor_mm_kwargs)
dynamic_outputs = processor.apply(prompt, dynamic_mm_data, hf_processor_mm_kwargs)
assert static_outputs["prompt_token_ids"] == dynamic_outputs[
"prompt_token_ids"]
assert static_outputs["mm_kwargs"].get_data(
) == dynamic_outputs["mm_kwargs"].get_data()
assert static_outputs["prompt_token_ids"] == dynamic_outputs["prompt_token_ids"]
assert (
static_outputs["mm_kwargs"].get_data()
== dynamic_outputs["mm_kwargs"].get_data()
)

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for H2OVL's multimodal preprocessing kwargs."""
from collections.abc import Mapping
from typing import Optional
@ -23,8 +24,10 @@ def _get_expected_num_patches(
min_num: int,
max_num: int,
):
from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
get_h2ovl_target_ratios)
from vllm.model_executor.models.h2ovl import (
calculate_h2ovl_targets,
get_h2ovl_target_ratios,
)
width, height = image.size
@ -101,24 +104,27 @@ def _run_check(
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images)
for image in images
)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"].get_data(
)["pixel_values_flat"].shape
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
assert img_tok_count == 256 * total_expected_num_patches
assert pixel_shape[0] == total_expected_num_patches
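Where the 256 in the assertion comes from, assuming InternVL-style defaults (448-pixel tiles, 14-pixel ViT patches, 0.5 pixel-unshuffle downsampling per axis; treat these numbers as assumptions, not values read from this file):

image_size, patch_size, downsample = 448, 14, 0.5
tokens_per_tile = int((image_size // patch_size) ** 2 * downsample**2)
assert tokens_per_tile == 256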
@pytest.mark.parametrize("model_id", [
"h2oai/h2ovl-mississippi-800m",
"h2oai/h2ovl-mississippi-2b",
])
@pytest.mark.parametrize(
"model_id",
[
"h2oai/h2ovl-mississippi-800m",
"h2oai/h2ovl-mississippi-2b",
],
)
@pytest.mark.parametrize(
"size_factors",
[
@ -165,10 +171,7 @@ def test_processor_override(
_run_check(
processor,
[
rescale_image_size(image_assets[0].pil_image, f)
for f in size_factors
],
[rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
min_num,
max_num,
hf_processor_mm_kwargs,

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for Idefics3's multimodal preprocessing kwargs."""
import pytest
from transformers import Idefics3Config
@ -17,7 +18,8 @@ from ...utils import build_model_context
[
({"size": {"longest_edge": 364}}, 169),
({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)),
])
],
)
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
@ -42,8 +44,11 @@ def test_processor_override(
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
placeholders = "<image>" if num_imgs == 1 else "\n".join(
f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
placeholders = (
"<image>"
if num_imgs == 1
else "\n".join(f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
)
prompt = f"<|begin_of_text|>User:{placeholders}\n<end_of_utterance>\nAssistant:" # noqa: E501
# Build mm_data
@ -57,8 +62,7 @@ def test_processor_override(
# Ensure the placeholders format are correct
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
"input_ids"][0]
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]
# Ensure we have the right number of placeholders per num_crops size
image_token_id = ctx.get_hf_config().image_token_id

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for InternVL's multimodal preprocessing kwargs."""
from collections.abc import Mapping
from typing import Optional
@ -24,7 +25,9 @@ def _get_expected_num_patches(
max_num: int,
):
from vllm.model_executor.models.internvl import (
calculate_internvl_targets, get_internvl_target_ratios)
calculate_internvl_targets,
get_internvl_target_ratios,
)
width, height = image.size
@ -61,15 +64,15 @@ def _run_check(
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images)
for image in images
)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"].get_data(
)["pixel_values_flat"].shape
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
assert img_tok_count == 256 * total_expected_num_patches
assert pixel_shape[0] == total_expected_num_patches
@ -122,10 +125,7 @@ def test_processor_override(
_run_check(
processor,
[
rescale_image_size(image_assets[0].pil_image, f)
for f in size_factors
],
[rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
min_num,
max_num,
hf_processor_mm_kwargs,

View File

@ -11,8 +11,7 @@ from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id",
["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
@pytest.mark.parametrize("model_id", ["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
@pytest.mark.parametrize("mm_processor_kwargs", [{}])
@pytest.mark.parametrize("num_imgs", [1, 5])
@pytest.mark.parametrize("mm_processor_cache_gb", [0, 4])
@ -38,13 +37,14 @@ def test_processor_override(
hf_processor = processor.info.get_hf_processor()
vocab = tokenizer.get_vocab()
prompt = "<|begin_of_text|><|header_start|>user<|header_end|>" \
+ "<|image|>" * num_imgs \
prompt = (
"<|begin_of_text|><|header_start|>user<|header_end|>"
+ "<|image|>" * num_imgs
+ "<|eot|><|header_start|>assistant<|header_end|>"
)
mm_data = {
"image": [
image_assets[(i % len(image_assets))].pil_image
for i in range(num_imgs)
image_assets[(i % len(image_assets))].pil_image for i in range(num_imgs)
]
}
if tokenized_prompt:
@ -64,22 +64,23 @@ def test_processor_override(
if tiles_x * tiles_y > 1:
num_x_separators += (tiles_x - 1) * tiles_y
num_y_separators += tiles_y
assert prompt_token_ids.count(vocab[hf_processor.tile_token]) \
== num_x_separators
assert prompt_token_ids.count(vocab[hf_processor.tile_global_token]) \
== num_y_separators
assert prompt_token_ids.count(vocab[hf_processor.tile_token]) == num_x_separators
assert (
prompt_token_ids.count(vocab[hf_processor.tile_global_token])
== num_y_separators
)
# image token offsets
img_locs = processed_inputs["mm_placeholders"].get("image", [])
assert len(img_locs) == num_imgs
assert [img_loc.offset for img_loc in img_locs] == \
[i for i, v in enumerate(prompt_token_ids) \
if v == config.boi_token_index]
assert [img_loc.offset for img_loc in img_locs] == [
i for i, v in enumerate(prompt_token_ids) if v == config.boi_token_index
]
# patch sizes and masks
num_patches_per_chunk = processor.info.get_patch_per_chunk(
config.vision_config)
assert prompt_token_ids.count(config.image_token_index) \
== sum(mm_data["patches_per_image"]) * num_patches_per_chunk
assert len(mm_data["pixel_values"]) \
== sum(mm_data["patches_per_image"])
num_patches_per_chunk = processor.info.get_patch_per_chunk(config.vision_config)
assert (
prompt_token_ids.count(config.image_token_index)
== sum(mm_data["patches_per_image"]) * num_patches_per_chunk
)
assert len(mm_data["pixel_values"]) == sum(mm_data["patches_per_image"])
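The separator counts asserted above follow directly from the tile grid; for a single hypothetical 3x2-tile image:

tiles_x, tiles_y = 3, 2
num_x_separators = (tiles_x - 1) * tiles_y  # tile_token between columns, per row
num_y_separators = tiles_y                  # tile_global_token once per tile row
assert (num_x_separators, num_y_separators) == (4, 2)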

View File

@ -22,8 +22,9 @@ def _validate_image_max_tokens_one(
image_size: ImageSize,
) -> None:
info = processor.info
feature_size = info.get_num_image_tokens(image_width=image_size.width,
image_height=image_size.height)
feature_size = info.get_num_image_tokens(
image_width=image_size.width, image_height=image_size.height
)
try:
assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
@ -31,8 +32,9 @@ def _validate_image_max_tokens_one(
failed_size_excs.append((image_size, exc))
@pytest.mark.skip("This test takes around 5 minutes to run. "
"Comment this out to run it manually.")
@pytest.mark.skip(
"This test takes around 5 minutes to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
def test_processor_max_tokens(model_id):
ctx = build_model_context(
@ -66,9 +68,9 @@ def test_processor_max_tokens(model_id):
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
if failed_size_excs:
msg = "Found failing image sizes:" \
+ "\n========\n".join(f"[{size}]\n{exc}"
for size, exc in failed_size_excs)
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
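The aggregated failure message joins every offending size with a divider; on dummy data:

failed_size_excs = [((333, 296), "too many tokens"), ((488, 183), "mismatch")]
msg = "Found failing image sizes:" + "\n========\n".join(
    f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
assert msg.count("========") == len(failed_size_excs) - 1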
@ -94,8 +96,10 @@ def _validate_image_prompt_replacements_one(
# NOTE: There is a BOS token
assert first_placeholder.offset == 1
assert first_placeholder.length == (
len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
assert (
first_placeholder.length
== (len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
)
except Exception as exc:
failed_size_excs.append((image_size, exc))
@ -122,9 +126,9 @@ def _test_image_prompt_replacements(
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
if failed_size_excs:
msg = "Found failing image sizes:" \
+ "\n========\n".join(f"[{size}]\n{exc}"
for size, exc in failed_size_excs)
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
@ -138,11 +142,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
(488, 183), (2560, 1669)]
image_ratios = [
(171, 152),
(184, 161),
(198, 176),
(333, 296),
(369, 328),
(488, 183),
(2560, 1669),
]
image_sizes = [
size for w, h in image_ratios
for size in [ImageSize(w, h), ImageSize(h, w)]
size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
]
_test_image_prompt_replacements(
@ -152,8 +162,9 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
@pytest.mark.skip("This test takes around 2 hours to run. "
"Comment this out to run it manually.")
@pytest.mark.skip(
"This test takes around 2 hours to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):


@ -22,8 +22,9 @@ def _validate_image_max_tokens_one(
image_size: ImageSize,
) -> None:
info = processor.info
feature_size = info.get_num_image_tokens(image_width=image_size.width,
image_height=image_size.height)
feature_size = info.get_num_image_tokens(
image_width=image_size.width, image_height=image_size.height
)
try:
assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
@ -31,10 +32,10 @@ def _validate_image_max_tokens_one(
failed_size_excs.append((image_size, exc))
@pytest.mark.skip("This test takes around 5 minutes to run. "
"Comment this out to run it manually.")
@pytest.mark.parametrize("model_id",
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.skip(
"This test takes around 5 minutes to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_processor_max_tokens(model_id):
ctx = build_model_context(
model_id,
@ -67,9 +68,9 @@ def test_processor_max_tokens(model_id):
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
if failed_size_excs:
msg = "Found failing image sizes:" \
+ "\n========\n".join(f"[{size}]\n{exc}"
for size, exc in failed_size_excs)
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
@ -94,8 +95,10 @@ def _validate_image_prompt_replacements_one(
first_placeholder = image_placeholders[0]
assert first_placeholder.offset == 0
assert first_placeholder.length == len(
processed_inputs["prompt_token_ids"]) // num_imgs
assert (
first_placeholder.length
== len(processed_inputs["prompt_token_ids"]) // num_imgs
)
except Exception as exc:
failed_size_excs.append((image_size, exc))
@ -121,14 +124,13 @@ def _test_image_prompt_replacements(
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
if failed_size_excs:
msg = "Found failing image sizes:" \
+ "\n========\n".join(f"[{size}]\n{exc}"
for size, exc in failed_size_excs)
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
@pytest.mark.parametrize("model_id",
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
ctx = build_model_context(
@ -138,11 +140,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
(488, 183), (2560, 1669)]
image_ratios = [
(171, 152),
(184, 161),
(198, 176),
(333, 296),
(369, 328),
(488, 183),
(2560, 1669),
]
image_sizes = [
size for w, h in image_ratios
for size in [ImageSize(w, h), ImageSize(h, w)]
size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
]
_test_image_prompt_replacements(
@ -152,10 +160,10 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
@pytest.mark.skip("This test takes around 2 hours to run. "
"Comment this out to run it manually.")
@pytest.mark.parametrize("model_id",
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.skip(
"This test takes around 2 hours to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):
ctx = build_model_context(


@ -61,17 +61,17 @@ def _test_image_prompt_replacements(
num_imgs: int,
image_sizes: list[ImageSize],
) -> None:
failed_size_excs = list[tuple[ImageSize, Exception]]()
for size in image_sizes:
_validate_image_prompt_replacements_one(processor, num_imgs,
failed_size_excs, size)
_validate_image_prompt_replacements_one(
processor, num_imgs, failed_size_excs, size
)
if failed_size_excs:
msg = "Found failing image sizes:" \
+ "\n========\n".join(f"[{size}]\n{exc}"
for size, exc in failed_size_excs)
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
@ -85,11 +85,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
(488, 183), (2560, 1669)]
image_ratios = [
(171, 152),
(184, 161),
(198, 176),
(333, 296),
(369, 328),
(488, 183),
(2560, 1669),
]
image_sizes = [
size for w, h in image_ratios
for size in [ImageSize(w, h), ImageSize(h, w)]
size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
]
_test_image_prompt_replacements(


@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for mllama's multimodal preprocessing and profiling."""
import pytest
from torch import prod
from transformers import Llama4Config
@ -47,14 +48,17 @@ def test_profiling(model_id: str, max_model_len: int):
image_size = hf_config.vision_config.image_size
patch_size = hf_config.vision_config.patch_size
downsample_ratio = int(
round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2)))
tokens_per_patch = ((image_size // patch_size)**2) // downsample_ratio
round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2))
)
tokens_per_patch = ((image_size // patch_size) ** 2) // downsample_ratio
chunks_per_image = prod(mm_data["patches_per_image"])
total_num_patches = chunks_per_image * tokens_per_patch
num_tiles = mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][
1] # x-y separator tokens
total_tokens = total_num_patches.item() + num_tiles.item(
) + 3 # image start, image, image end
num_tiles = (
mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][1]
) # x-y separator tokens
total_tokens = (
total_num_patches.item() + num_tiles.item() + 3
) # image start, image, image end
profiled_tokens = profiler.get_mm_max_contiguous_tokens(
max_model_len,
@ -63,5 +67,6 @@ def test_profiling(model_id: str, max_model_len: int):
assert total_tokens == profiled_tokens["image"]
assert total_tokens == sum(
placeholder.length for placeholder in
decoder_dummy_data.multi_modal_placeholders["image"])
placeholder.length
for placeholder in decoder_dummy_data.multi_modal_placeholders["image"]
)


@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for Nemotron-Nano-VL's multimodal preprocessing kwargs."""
from collections.abc import Mapping
from typing import Optional
@ -24,7 +25,9 @@ def _get_expected_num_patches(
max_num: int,
):
from vllm.model_executor.models.nemotron_vl import (
calculate_nemotron_vl_targets, get_nemotron_vl_target_ratios)
calculate_nemotron_vl_targets,
get_nemotron_vl_target_ratios,
)
width, height = image.size
@ -63,22 +66,21 @@ def _run_check(
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images)
for image in images
)
print(total_expected_num_patches)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<image>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"].get_data(
)["pixel_values_flat"].shape
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape)
assert img_tok_count == 256 * total_expected_num_patches
assert pixel_shape[0] == total_expected_num_patches
@pytest.mark.parametrize("model_id",
["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
@pytest.mark.parametrize("model_id", ["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
@pytest.mark.parametrize(
"size_factors",
[
@ -125,10 +127,7 @@ def test_processor_override(
_run_check(
processor,
[
rescale_image_size(image_assets[0].pil_image, f)
for f in size_factors
],
[rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
min_num,
max_num,
hf_processor_mm_kwargs,


@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for phi3v's multimodal preprocessing kwargs."""
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
@ -18,7 +19,8 @@ from ...utils import build_model_context
({"num_crops": 16}, 1921),
# the default num_crops of phi-3.5-vision is 4
({}, 757),
])
],
)
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
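
A recurring detail in these hunks: the `# yapf: disable` / `# yapf: enable` guards around parametrize tables survive the conversion but are inert once yapf is gone. Ruff's formatter honors `# fmt: off` / `# fmt: on` instead; a sketch of the equivalent guard (the values here are illustrative, not from the PR):

# fmt: off
EXPECTED = [
    ({"num_crops": 16}, 1921),
    ({},                757),   # aligned on purpose; ruff leaves this block alone
]
# fmt: on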


@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for phi4mm's multimodal preprocessing kwargs."""
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
@ -18,7 +19,8 @@ from ...utils import build_model_context
({"dynamic_hd": 16}, 4433),
# the default num_crops of phi-4-multimodal is 36
({}, 9585),
])
],
)
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
@ -46,8 +48,7 @@ def test_processor_override(
img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
image_size = ctx.get_hf_config(
).embd_layer["image_embd_layer"]["crop_size"]
image_size = ctx.get_hf_config().embd_layer["image_embd_layer"]["crop_size"]
dummy_image_size = (image_size * 7, image_size * 7)
dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
mm_data = {"image": [dummy_image] * num_imgs}
@ -56,5 +57,6 @@ def test_processor_override(
# Ensure we have the right number of placeholders per num_crops size
img_tok_count = processed_inputs["prompt_token_ids"].count(
_IMAGE_PLACEHOLDER_TOKEN_ID)
_IMAGE_PLACEHOLDER_TOKEN_ID
)
assert img_tok_count == expected_toks_per_img * num_imgs


@ -12,10 +12,12 @@ from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
# yapf: disable
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"), [
("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"),
[
({}, 1426, (5704, 1176)),
({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)),
])
],
)
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
@ -48,8 +50,7 @@ def test_processor_override(
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"].get_data(
)["pixel_values"].shape
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values"].shape
assert img_tok_count == expected_toks_per_img * num_imgs
assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs


@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for smolvlm's multimodal preprocessing kwargs."""
import pytest
from transformers import SmolVLMConfig
@ -17,7 +18,8 @@ from ...utils import build_model_context
[
({"max_image_size": {"longest_edge": 384}}, 1377),
({"max_image_size": {"longest_edge": 768}}, 405),
])
],
)
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
@ -42,8 +44,11 @@ def test_processor_override(
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
placeholders = "<image>" if num_imgs == 1 else "\n".join(
f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
placeholders = (
"<image>"
if num_imgs == 1
else "\n".join(f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
)
prompt = f"<|im_start|>User:{placeholders}\n<end_of_utterance>\nAssistant:" # noqa: E501
# Build mm_data
@ -57,8 +62,7 @@ def test_processor_override(
# Ensure the placeholder format is correct
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
"input_ids"][0]
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]
# Ensure we have the right number of placeholders per num_crops size
image_token_id = ctx.get_hf_config().image_token_id


@ -9,23 +9,29 @@ from typing import Any, Union
import numpy as np
import pytest
import torch.nn as nn
from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
UserMessage)
from mistral_common.protocol.instruct.messages import ImageChunk, TextChunk, UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from PIL import Image
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions,
ImageDummyOptions, VideoDummyOptions)
from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment,
initialize_model_parallel)
from vllm.config.multimodal import (
AudioDummyOptions,
BaseDummyOptions,
ImageDummyOptions,
VideoDummyOptions,
)
from vllm.distributed import (
cleanup_dist_env_and_memory,
init_distributed_environment,
initialize_model_parallel,
)
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
from vllm.model_executor.models.interfaces import (SupportsMultiModal,
supports_multimodal)
from vllm.model_executor.models.interfaces import (
SupportsMultiModal,
supports_multimodal,
)
from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
from vllm.multimodal.processing import (BaseMultiModalProcessor,
InputProcessingContext)
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.multimodal.utils import group_mm_kwargs_by_modality
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from vllm.utils import is_list_of
@ -48,13 +54,15 @@ REPO_ID_TO_SKIP = {
}
ImageInput = list[Image.Image]
VideoInput = Union[list[Image.Image], list[np.ndarray],
list[tuple[np.ndarray, dict[str, Any]]]]
VideoInput = Union[
list[Image.Image], list[np.ndarray], list[tuple[np.ndarray, dict[str, Any]]]
]
AudioInput = list[tuple[np.ndarray, int]]
def _resize_data(_data: Union[Image.Image, np.ndarray],
size_factor: float) -> Union[Image.Image, np.ndarray]:
def _resize_data(
_data: Union[Image.Image, np.ndarray], size_factor: float
) -> Union[Image.Image, np.ndarray]:
assert size_factor <= 1, "Size factor must be less than 1"
# Image input
if isinstance(_data, Image.Image):
@ -74,20 +82,18 @@ def _resize_data(_data: Union[Image.Image, np.ndarray],
return _data[..., :T, :H, :W, :C]
# Audio input
elif isinstance(_data, np.ndarray) and _data.ndim == 1:
return _data[:int(len(_data) * size_factor)]
return _data[: int(len(_data) * size_factor)]
raise AssertionError("This line should be unreachable.")
def resize_mm_data(
data: Union[ImageInput, VideoInput, AudioInput],
size_factors: tuple[float,
...]) -> Union[ImageInput, VideoInput, AudioInput]:
size_factors = size_factors[:len(data)]
data: Union[ImageInput, VideoInput, AudioInput], size_factors: tuple[float, ...]
) -> Union[ImageInput, VideoInput, AudioInput]:
size_factors = size_factors[: len(data)]
if is_list_of(data, (Image.Image, np.ndarray, list)):
return [_resize_data(d, s) for d, s in zip(data, size_factors)]
elif is_list_of(data, tuple):
return [(_resize_data(d, s), meta)
for (d, meta), s in zip(data, size_factors)]
return [(_resize_data(d, s), meta) for (d, meta), s in zip(data, size_factors)]
raise ValueError("Unsupported multimodal data type.")
@ -116,12 +122,16 @@ def create_batched_mm_kwargs(
# Mistral chat outputs tokens directly, rather than text prompts
if model_config.tokenizer_mode == "mistral":
images = resized_mm_data.get("image", [])
request = ChatCompletionRequest(messages=[
UserMessage(content=[
TextChunk(text=""),
*(ImageChunk(image=image) for image in images),
]),
])
request = ChatCompletionRequest(
messages=[
UserMessage(
content=[
TextChunk(text=""),
*(ImageChunk(image=image) for image in images),
]
),
]
)
tokenizer = processing_info.get_tokenizer()
res = tokenizer.mistral.encode_chat_completion(request)
prompt = res.tokens
@ -133,10 +143,7 @@ def create_batched_mm_kwargs(
hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
tokenization_kwargs=processor_inputs.tokenization_kwargs,
)["mm_kwargs"].require_data()
items = [
item for modality in supported_mm_limits
for item in mm_kwargs[modality]
]
items = [item for modality in supported_mm_limits for item in mm_kwargs[modality]]
return group_mm_kwargs_by_modality(
items,
merge_by_field_config=model_cls.merge_by_field_config,
@ -167,15 +174,17 @@ def initialize_dummy_model(
cleanup_dist_env_and_memory()
def get_model_id_to_test(
model_arch_list: Iterable[str]) -> list[tuple[str, str]]:
def get_model_id_to_test(model_arch_list: Iterable[str]) -> list[tuple[str, str]]:
filtered_results = []
for model_arch in model_arch_list:
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
if model_info.extras and model_arch in ARCH_NEEDS_EXTRAS:
available_repos = list(
map(lambda model_id: (model_arch, model_id),
[model_info.default, *model_info.extras.values()]))
map(
lambda model_id: (model_arch, model_id),
[model_info.default, *model_info.extras.values()],
)
)
filtered_results.extend(available_repos)
else:
filtered_results.append((model_arch, model_info.default))
@ -183,8 +192,8 @@ def get_model_id_to_test(
@pytest.mark.parametrize(
"model_arch, model_id",
get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys()))
"model_arch, model_id", get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys())
)
def test_model_tensor_schema(model_arch: str, model_id: str):
if model_arch in ARCH_TO_SKIP:
pytest.skip(f"Skipping {model_arch} due to {ARCH_TO_SKIP[model_arch]}")
@ -193,12 +202,13 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip",
check_max_version=False)
model_info.check_transformers_version(on_fail="skip", check_max_version=False)
hf_overrides_fn = partial(dummy_hf_overrides,
model_arch=model_arch,
exist_overrides=model_info.hf_overrides)
hf_overrides_fn = partial(
dummy_hf_overrides,
model_arch=model_arch,
exist_overrides=model_info.hf_overrides,
)
model_config = ModelConfig(
model_id,
@ -256,8 +266,11 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
with initialize_dummy_model(model_cls, model_config) as model:
for modality, _, mm_kwargs in create_batched_mm_kwargs(
model_cls, model_config, processor):
model_cls, model_config, processor
):
for method_name in inputs_parse_methods:
print(f"Testing `{method_name}` with modality={modality} "
f"and mm_kwargs{list(mm_kwargs.keys())}")
print(
f"Testing `{method_name}` with modality={modality} "
f"and mm_kwargs{list(mm_kwargs.keys())}"
)
getattr(model, method_name)(modality=modality, **mm_kwargs)


@ -19,7 +19,7 @@ def create_repo_dummy_weights(repo: str) -> Iterable[tuple[str, torch.Tensor]]:
"""Create weights from safetensors checkpoint metadata"""
metadata = try_get_safetensors_metadata(repo)
weight_names = list(metadata.weight_map.keys())
with torch.device('meta'):
with torch.device("meta"):
return ((name, torch.empty(0)) for name in weight_names)
@ -61,7 +61,8 @@ def test_hf_model_weights_mapper(model_arch: str):
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.skip_tokenizer_init,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype)
dtype=model_info.dtype,
)
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
original_weights = create_repo_dummy_weights(model_id)
@ -83,6 +84,7 @@ def test_hf_model_weights_mapper(model_arch: str):
weights_missing = ref_weight_names - weight_names
weights_unmapped = weight_names - ref_weight_names
assert (not weights_missing and not weights_unmapped), (
assert not weights_missing and not weights_unmapped, (
f"Following weights are not mapped correctly: {weights_unmapped}, "
f"Missing expected weights: {weights_missing}.")
f"Missing expected weights: {weights_missing}."
)
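
create_repo_dummy_weights above enumerates checkpoint weight names via zero-byte tensors on the meta device. A standalone sketch of that pattern, assuming only that torch is installed; the weight names are illustrative.

import torch

with torch.device("meta"):
    # Tensors created here carry shape/dtype metadata but allocate no storage,
    # so checkpoint-sized name maps can be built without touching memory.
    dummy_weights = {name: torch.empty(0) for name in ("a.weight", "b.bias")}

assert all(t.device.type == "meta" for t in dummy_weights.values())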


@ -11,12 +11,12 @@ from vllm.multimodal.image import rescale_image_size
from ...conftest import IMAGE_ASSETS, ImageTestAssets, VllmRunner
from ..utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"<|im_start|>User\n<image>\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
"cherry_blossom":
"<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
})
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "<|im_start|>User\n<image>\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
"cherry_blossom": "<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
}
)
def run_awq_test(
@ -34,10 +34,13 @@ def run_awq_test(
):
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
inputs_per_image = [
(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
)
for image, prompt in zip(images, HF_IMAGE_PROMPTS)
]
# NOTE: take care of the order. Run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
@ -46,42 +49,41 @@ def run_awq_test(
# max_model_len should be greater than image_feature_size
with vllm_runner(
source_model,
max_model_len=4096,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
default_torch_num_threads=1,
source_model,
max_model_len=4096,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
default_torch_num_threads=1,
) as vllm_model:
source_outputs_per_image = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
vllm_model.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs=num_logprobs, images=images
)
for prompts, images in inputs_per_image
]
with vllm_runner(
quant_model,
quantization="awq",
max_model_len=4096,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
default_torch_num_threads=1,
quant_model,
quantization="awq",
max_model_len=4096,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
default_torch_num_threads=1,
) as vllm_model:
quant_outputs_per_image = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
vllm_model.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs=num_logprobs, images=images
)
for prompts, images in inputs_per_image
]
for source_outputs, quant_outputs in zip(source_outputs_per_image,
quant_outputs_per_image):
for source_outputs, quant_outputs in zip(
source_outputs_per_image, quant_outputs_per_image
):
# TODO: Check whether using original CLIPVisionModel can improve
# consistency against HF
check_logprobs_close(
@ -113,9 +115,16 @@ def run_awq_test(
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@torch.inference_mode()
def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
size_factors, dtype, max_tokens, num_logprobs) -> None:
def test_awq_models(
vllm_runner,
image_assets,
source_model,
quant_model,
size_factors,
dtype,
max_tokens,
num_logprobs,
) -> None:
run_awq_test(
vllm_runner,
image_assets,


@ -7,9 +7,10 @@ As a result, in this test, we just confirm that the top selected tokens of the
bitblas/GPTQ models are in the top 3 selections of each other.
Note: bitblas internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for bitblas. As a result, we re-run the
test up to 3 times to see if we pass.
"""
from dataclasses import dataclass
import pytest
@ -24,8 +25,10 @@ class ModelPair:
model_pairs = [
ModelPair(model_bitblas="hxbgsyxh/opt-125m-4bit-128g-bitblas",
model_gptq="hxbgsyxh/opt-125m-4bit-128g"),
ModelPair(
model_bitblas="hxbgsyxh/opt-125m-4bit-128g-bitblas",
model_gptq="hxbgsyxh/opt-125m-4bit-128g",
),
]
@ -43,16 +46,19 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(model_pair.model_bitblas,
dtype=dtype,
quantization="bitblas") as bitblas_model:
with vllm_runner(
model_pair.model_bitblas, dtype=dtype, quantization="bitblas"
) as bitblas_model:
bitblas_outputs = bitblas_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model_pair.model_gptq, dtype=dtype,
quantization="gptq") as gptq_model:
with vllm_runner(
model_pair.model_gptq, dtype=dtype, quantization="gptq"
) as gptq_model:
gptq_outputs = gptq_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=gptq_outputs,
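
Per the docstring at the top of this file, bitblas and GPTQ outputs are compared by mutual top-k membership rather than exact equality. A toy illustration of that acceptance criterion; the helper below is hypothetical, not vLLM's actual check_logprobs_close.

def mutually_in_topk(greedy_a, topk_b, greedy_b, topk_a) -> bool:
    # Each model's greedy token must appear among the other's top-k candidates.
    a_in_b = all(tok in cands for tok, cands in zip(greedy_a, topk_b))
    b_in_a = all(tok in cands for tok, cands in zip(greedy_b, topk_a))
    return a_in_b and b_in_a


assert mutually_in_topk([5, 2], [{5, 1, 9}, {2, 4, 8}], [5, 4], [{5, 7}, {4, 2, 6}])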


@ -1,9 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
'''Tests whether bitsandbytes computation is enabled correctly.
"""Tests whether bitsandbytes computation is enabled correctly.
Run `pytest tests/quantization/test_bitsandbytes.py`.
'''
"""
import pytest
from transformers import BitsAndBytesConfig
@ -15,8 +15,10 @@ from ..utils import check_embeddings_close, check_logprobs_close
models_4bit_to_test = [
("facebook/opt-125m", "quantize opt model inflight"),
("mistralai/Mistral-7B-Instruct-v0.3",
"quantize inflight model with both HF and Mistral format weights")
(
"mistralai/Mistral-7B-Instruct-v0.3",
"quantize inflight model with both HF and Mistral format weights",
),
]
models_4bit_to_embedding_test = [
@ -28,72 +30,84 @@ models_4bit_to_moe_test = [
]
models_pre_qaunt_4bit_to_test = [
('PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed',
'read pre-quantized 4-bit FP4 model'),
('poedator/opt-125m-bnb-4bit', 'read pre-quantized 4-bit NF4 opt model'),
(
"PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed",
"read pre-quantized 4-bit FP4 model",
),
("poedator/opt-125m-bnb-4bit", "read pre-quantized 4-bit NF4 opt model"),
]
models_pre_quant_8bit_to_test = [
('meta-llama/Llama-Guard-3-8B-INT8',
'read pre-quantized llama 8-bit model'),
("meta-llama/Llama-Guard-3-8B-INT8", "read pre-quantized llama 8-bit model"),
("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
]
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
load_in_4bit=True))
validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
model_name, False, hf_model_kwargs)
def test_load_4bit_bnb_model(
hf_runner, vllm_runner, example_prompts, model_name, description
) -> None:
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
validate_generated_texts(
hf_runner, vllm_runner, example_prompts[:1], model_name, False, hf_model_kwargs
)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
models_pre_qaunt_4bit_to_test)
def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
model_name, True)
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_pre_qaunt_4bit_to_test)
def test_load_pre_quant_4bit_bnb_model(
hf_runner, vllm_runner, example_prompts, model_name, description
) -> None:
validate_generated_texts(
hf_runner, vllm_runner, example_prompts[:1], model_name, True
)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
models_pre_quant_8bit_to_test)
def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
model_name, True)
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_pre_quant_8bit_to_test)
def test_load_8bit_bnb_model(
hf_runner, vllm_runner, example_prompts, model_name, description
) -> None:
validate_generated_texts(
hf_runner, vllm_runner, example_prompts[:1], model_name, True
)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@multi_gpu_test(num_gpus=2)
def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
load_in_4bit=True))
validate_generated_texts(hf_runner,
vllm_runner,
example_prompts[:1],
model_name,
False,
hf_model_kwargs,
vllm_tp_size=2)
def test_load_tp_4bit_bnb_model(
hf_runner, vllm_runner, example_prompts, model_name, description
) -> None:
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
validate_generated_texts(
hf_runner,
vllm_runner,
example_prompts[:1],
model_name,
False,
hf_model_kwargs,
vllm_tp_size=2,
)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@multi_gpu_test(num_gpus=2)
def test_load_pp_4bit_bnb_model(model_name, description) -> None:
@ -115,30 +129,37 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None:
compare_two_settings(model_name, common_args, pp_args)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_4bit_to_moe_test)
def test_4bit_bnb_moe_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
def test_4bit_bnb_moe_model(
hf_runner, vllm_runner, example_prompts, model_name, description
) -> None:
hf_model_kwargs = dict(
quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
)
)
with vllm_runner(
model_name,
quantization="bitsandbytes",
enforce_eager=False,
default_torch_num_threads=1,
) as llm:
vllm_outputs = llm.generate_greedy_logprobs(
example_prompts, max_tokens=32, num_logprobs=5
)
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
))
with vllm_runner(model_name,
quantization='bitsandbytes',
enforce_eager=False,
default_torch_num_threads=1) as llm:
vllm_outputs = llm.generate_greedy_logprobs(example_prompts,
max_tokens=32,
num_logprobs=5)
with hf_runner(model_name,
model_kwargs=hf_model_kwargs,
default_torch_num_threads=1) as llm:
with hf_runner(
model_name, model_kwargs=hf_model_kwargs, default_torch_num_threads=1
) as llm:
transformers_outputs = llm.generate_greedy_logprobs_limit(
example_prompts, max_tokens=32, num_logprobs=5)
example_prompts, max_tokens=32, num_logprobs=5
)
check_logprobs_close(
outputs_0_lst=transformers_outputs,
outputs_1_lst=vllm_outputs,
@ -147,10 +168,11 @@ def test_4bit_bnb_moe_model(hf_runner, vllm_runner, example_prompts,
)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
models_4bit_to_embedding_test)
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_4bit_to_embedding_test)
@pytest.mark.parametrize("dtype", ["half"])
def test_4bit_bnb_embedding_model(
model_name,
@ -160,7 +182,6 @@ def test_4bit_bnb_embedding_model(
example_prompts,
dtype: str,
) -> None:
# The example_prompts has ending "\n", for example:
# "Write a short story about a robot that dreams for the first time.\n"
# sentence_transformers will strip the input texts, see:
@ -170,22 +191,23 @@ def test_4bit_bnb_embedding_model(
example_prompts = [str(s).strip() for s in example_prompts]
# Inflight 4bit quantization
with vllm_runner(model_name,
runner="pooling",
dtype=dtype,
gpu_memory_utilization=0.5,
quantization="bitsandbytes",
default_torch_num_threads=1) as vllm_model:
with vllm_runner(
model_name,
runner="pooling",
dtype=dtype,
gpu_memory_utilization=0.5,
quantization="bitsandbytes",
default_torch_num_threads=1,
) as vllm_model:
vllm_outputs = vllm_model.embed(example_prompts)
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
load_in_4bit=True))
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
with hf_runner(
model_name,
dtype=dtype,
model_kwargs=hf_model_kwargs,
is_sentence_transformer=True,
default_torch_num_threads=1,
model_name,
dtype=dtype,
model_kwargs=hf_model_kwargs,
is_sentence_transformer=True,
default_torch_num_threads=1,
) as hf_model:
hf_outputs = hf_model.encode(example_prompts)
@ -210,23 +232,25 @@ def log_generated_texts(prompts, outputs, runner_name):
return logged_texts
def validate_generated_texts(hf_runner,
vllm_runner,
prompts,
model_name,
pre_quant=False,
hf_model_kwargs=None,
vllm_tp_size=1,
max_tokens=8):
def validate_generated_texts(
hf_runner,
vllm_runner,
prompts,
model_name,
pre_quant=False,
hf_model_kwargs=None,
vllm_tp_size=1,
max_tokens=8,
):
# NOTE: run vLLM first, as it requires a clean process
# when using distributed inference
with vllm_runner(model_name,
quantization=None if pre_quant else 'bitsandbytes',
tensor_parallel_size=vllm_tp_size,
enforce_eager=False,
default_torch_num_threads=1) as llm:
with vllm_runner(
model_name,
quantization=None if pre_quant else "bitsandbytes",
tensor_parallel_size=vllm_tp_size,
enforce_eager=False,
default_torch_num_threads=1,
) as llm:
vllm_outputs = llm.generate_greedy(prompts, max_tokens)
vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")
@ -234,9 +258,9 @@ def validate_generated_texts(hf_runner,
hf_model_kwargs = {}
# Run with HF runner
with hf_runner(model_name,
model_kwargs=hf_model_kwargs,
default_torch_num_threads=1) as llm:
with hf_runner(
model_name, model_kwargs=hf_model_kwargs, default_torch_num_threads=1
) as llm:
hf_outputs = llm.generate_greedy(prompts, max_tokens)
hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
@ -245,8 +269,10 @@ def validate_generated_texts(hf_runner,
hf_str = hf_log["generated_text"]
vllm_str = vllm_log["generated_text"]
prompt = hf_log["prompt"]
assert hf_str == vllm_str, (f"Model: {model_name}"
f"Mismatch between HF and vLLM outputs:\n"
f"Prompt: {prompt}\n"
f"HF Output: '{hf_str}'\n"
f"vLLM Output: '{vllm_str}'")
assert hf_str == vllm_str, (
f"Model: {model_name}"
f"Mismatch between HF and vLLM outputs:\n"
f"Prompt: {prompt}\n"
f"HF Output: '{hf_str}'\n"
f"vLLM Output: '{vllm_str}'"
)
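
These tests mirror the same quantization on the HF side via BitsAndBytesConfig. A minimal sketch of that configuration outside the test harness, assuming the transformers and bitsandbytes packages are installed; the model id is illustrative.

from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",        # NF4 4-bit quantization
    bnb_4bit_use_double_quant=True,   # also quantize the quantization constants
)
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m", quantization_config=quant_config
)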


@ -5,6 +5,7 @@
"""Tests fp8 models against ground truth generation
Note: these tests will only pass on L4 GPU.
"""
import pytest
from tests.quantization.utils import is_quant_method_supported
@ -14,21 +15,33 @@ from vllm.utils import STR_BACKEND_ENV_VAR
from ..utils import check_logprobs_close
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.")
@pytest.mark.skipif(
not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.",
)
@pytest.mark.parametrize(
"kv_cache_dtype,base_model,test_model",
[
# Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
"nm-testing/Llama-3.2-1B-Instruct-FP8-KV"),
(
"fp8_e4m3",
"meta-llama/Llama-3.2-1B-Instruct",
"nm-testing/Llama-3.2-1B-Instruct-FP8-KV",
),
# Test BF16 checkpoint w. fp8_e5m2 kv-cache.
("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct"),
(
"fp8_e5m2",
"meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct",
),
# Test BF16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct")
])
(
"fp8_e4m3",
"meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct",
),
],
)
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("enforce_eager", [True])
@ -54,38 +67,39 @@ def test_models(
"""
if kv_cache_dtype == "fp8_e5m2" and current_platform.is_rocm():
pytest.skip(
f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")
pytest.skip(f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")
if not current_platform.is_kv_cache_dtype_supported(kv_cache_dtype, None):
pytest.skip(f"{kv_cache_dtype} is not supported on this platform.")
with monkeypatch.context() as m:
m.setenv("TOKENIZERS_PARALLELISM", 'true')
m.setenv("TOKENIZERS_PARALLELISM", "true")
m.setenv(STR_BACKEND_ENV_VAR, backend)
MAX_MODEL_LEN = 1024
NUM_LOG_PROBS = 8
with vllm_runner(
base_model,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype="auto",
base_model,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype="auto",
) as vllm_model:
baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
example_prompts, max_tokens, NUM_LOG_PROBS
)
with vllm_runner(
test_model,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
test_model,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
) as vllm_model:
test_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
example_prompts, max_tokens, NUM_LOG_PROBS
)
check_logprobs_close(
outputs_0_lst=baseline_outputs,
@ -96,15 +110,18 @@ def test_models(
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(),
reason="test for the CPU backend.")
@pytest.mark.skipif(not current_platform.is_cpu(), reason="test for the CPU backend.")
@pytest.mark.parametrize(
"kv_cache_dtype,base_model,test_model",
[
# Test BF16 checkpoint w. fp8_e5m2 kv-cache.
("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct"),
])
(
"fp8_e5m2",
"meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct",
),
],
)
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
def test_cpu_models(
@ -121,28 +138,30 @@ def test_cpu_models(
numerically sensitive kernels.
"""
with monkeypatch.context() as m:
m.setenv("TOKENIZERS_PARALLELISM", 'true')
m.setenv("TOKENIZERS_PARALLELISM", "true")
MAX_MODEL_LEN = 1024
NUM_LOG_PROBS = 8
with vllm_runner(
base_model,
max_model_len=MAX_MODEL_LEN,
dtype="bfloat16",
kv_cache_dtype="auto",
base_model,
max_model_len=MAX_MODEL_LEN,
dtype="bfloat16",
kv_cache_dtype="auto",
) as vllm_model:
baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
example_prompts, max_tokens, NUM_LOG_PROBS
)
with vllm_runner(
test_model,
max_model_len=MAX_MODEL_LEN,
dtype="bfloat16",
kv_cache_dtype=kv_cache_dtype,
test_model,
max_model_len=MAX_MODEL_LEN,
dtype="bfloat16",
kv_cache_dtype=kv_cache_dtype,
) as vllm_model:
test_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
example_prompts, max_tokens, NUM_LOG_PROBS
)
check_logprobs_close(
outputs_0_lst=baseline_outputs,
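
Outside the test harness, the fp8 KV-cache setup exercised above reduces to a single engine argument. A hypothetical minimal reproduction, assuming vllm is installed and the device supports the chosen dtype:

from vllm import LLM, SamplingParams

llm = LLM(
    "meta-llama/Llama-3.2-1B-Instruct",
    max_model_len=1024,
    kv_cache_dtype="fp8_e5m2",  # "auto" keeps the model dtype for the KV cache
)
outputs = llm.generate(
    ["The capital of France is"], SamplingParams(temperature=0, max_tokens=4)
)
print(outputs[0].outputs[0].text)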


@ -100,35 +100,37 @@ def check_model_outputs(
):
tokenizer = AutoTokenizer.from_pretrained(model.original_model)
if tokenizer.chat_template is not None:
messages = [[{
'role': 'user',
'content': prompt
}] for prompt in prompts]
prompts = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
messages = [[{"role": "user", "content": prompt}] for prompt in prompts]
prompts = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
# Run gguf model.
with vllm_runner(model_name=model.gguf_model,
enforce_eager=True,
tokenizer_name=model.original_model,
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tp_size) as gguf_model:
with vllm_runner(
model_name=model.gguf_model,
enforce_eager=True,
tokenizer_name=model.original_model,
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tp_size,
) as gguf_model:
gguf_outputs = gguf_model.generate_greedy_logprobs(
prompts[:-1], max_tokens, num_logprobs)
prompts[:-1], max_tokens, num_logprobs
)
# Run unquantized model.
# Should run with tp=1, otherwise the test will get stuck at
# nccl initialization.
with vllm_runner(
model_name=model.original_model,
enforce_eager=True, # faster tests
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1) as original_model:
model_name=model.original_model,
enforce_eager=True, # faster tests
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1,
) as original_model:
original_outputs = original_model.generate_greedy_logprobs(
prompts[:-1], max_tokens, num_logprobs)
prompts[:-1], max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=original_outputs,
@ -138,12 +140,14 @@ def check_model_outputs(
)
@pytest.mark.skipif(not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.")
@pytest.mark.parametrize("model", [
pytest.param(test_config, marks=test_config.marks)
for test_config in MODELS
])
@pytest.mark.skipif(
not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.",
)
@pytest.mark.parametrize(
"model",
[pytest.param(test_config, marks=test_config.marks) for test_config in MODELS],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@ -157,12 +161,15 @@ def test_models(
num_logprobs: int,
tp_size: int,
) -> None:
check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
num_logprobs, tp_size)
check_model_outputs(
vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size
)
@pytest.mark.skipif(not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.")
@pytest.mark.skipif(
not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.",
)
@pytest.mark.parametrize("model", [LLAMA_CONFIG])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [8])
@ -178,5 +185,6 @@ def test_distributed(
num_logprobs: int,
tp_size: int,
) -> None:
check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
num_logprobs, tp_size)
check_model_outputs(
vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size
)


@ -7,9 +7,10 @@ As a result, in this test, we just confirm that the top selected tokens of the
bitblas/GPTQ models are in the top 3 selections of each other.
Note: bitblas internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for bitblas. As a result, we re-run the
test up to 3 times to see if we pass.
"""
from dataclasses import dataclass
import pytest
@ -41,16 +42,19 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(model_pair.model_gptq,
dtype=dtype,
quantization="bitblas") as bitblas_model:
with vllm_runner(
model_pair.model_gptq, dtype=dtype, quantization="bitblas"
) as bitblas_model:
bitblas_outputs = bitblas_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model_pair.model_gptq, dtype=dtype,
quantization="gptq") as gptq_model:
with vllm_runner(
model_pair.model_gptq, dtype=dtype, quantization="gptq"
) as gptq_model:
gptq_outputs = gptq_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=gptq_outputs,


@ -9,6 +9,7 @@ Note: Marlin internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for Marlin. As a result, we re-run the test
up to 3 times to see if we pass.
"""
import os
import pytest
@ -26,20 +27,20 @@ MAX_MODEL_LEN = 1024
MODELS = [
# act_order==True, group_size=128
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
# 8-bit, act_order==True, group_size=channelwise
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
# 4-bit, act_order==True, group_size=128
("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")
("TechxGenus/gemma-1.1-2b-it-GPTQ", "main"),
]
@pytest.mark.flaky(reruns=3)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin")
or current_platform.is_rocm()
or not current_platform.is_cuda(),
reason="gptq_marlin is not supported on this GPU type.")
@pytest.mark.skipif(
not is_quant_method_supported("gptq_marlin")
or current_platform.is_rocm()
or not current_platform.is_cuda(),
reason="gptq_marlin is not supported on this GPU type.",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@ -55,29 +56,34 @@ def test_models(
model_name, revision = model
# Run marlin.
with vllm_runner(model_name=model_name,
revision=revision,
dtype=dtype,
quantization="marlin",
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1) as gptq_marlin_model:
with vllm_runner(
model_name=model_name,
revision=revision,
dtype=dtype,
quantization="marlin",
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1,
) as gptq_marlin_model:
gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
example_prompts[:-1], max_tokens, num_logprobs)
example_prompts[:-1], max_tokens, num_logprobs
)
_ROPE_DICT.clear() # clear rope cache to avoid rope dtype error
# Run gptq.
# The naive gptq kernel doesn't support bf16 yet.
# Here we always compare fp16/bf16 gpt marlin kernel
# to fp16 gptq kernel.
with vllm_runner(model_name=model_name,
revision=revision,
dtype="half",
quantization="gptq",
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1) as gptq_model:
with vllm_runner(
model_name=model_name,
revision=revision,
dtype="half",
quantization="gptq",
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1,
) as gptq_model:
gptq_outputs = gptq_model.generate_greedy_logprobs(
example_prompts[:-1], max_tokens, num_logprobs)
example_prompts[:-1], max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=gptq_outputs,


@ -6,6 +6,7 @@ Note: GPTQ and Marlin_24 do not have bitwise correctness.
As a result, in this test, we just confirm that the top selected tokens of the
Marlin/GPTQ models are in the top 3 selections of each other.
"""
from dataclasses import dataclass
import pytest
@ -24,15 +25,18 @@ class ModelPair:
model_pairs = [
# 4-bit, group_size == 128
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
ModelPair(
model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128",
),
# # 4-bit, group_size == channelwise
# ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
# model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
# 8-bit, group_size == 128
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
ModelPair(
model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128",
),
# # 8-bit, group_size == channelwise
# ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
# model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
@ -40,10 +44,12 @@ model_pairs = [
@pytest.mark.flaky(reruns=2)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24")
or current_platform.is_rocm()
or not current_platform.is_cuda(),
reason="Marlin24 is not supported on this GPU type.")
@pytest.mark.skipif(
not is_quant_method_supported("gptq_marlin_24")
or current_platform.is_rocm()
or not current_platform.is_cuda(),
reason="Marlin24 is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_pair", model_pairs)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [8])
@ -56,16 +62,19 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(model_pair.model_marlin,
dtype=dtype,
quantization="gptq_marlin_24") as marlin_24_model:
with vllm_runner(
model_pair.model_marlin, dtype=dtype, quantization="gptq_marlin_24"
) as marlin_24_model:
marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model_pair.model_gptq, dtype=dtype,
quantization="gptq") as gptq_model:
with vllm_runner(
model_pair.model_gptq, dtype=dtype, quantization="gptq"
) as gptq_model:
gptq_outputs = gptq_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=gptq_outputs,


@ -5,6 +5,7 @@
"""Tests Model Optimizer fp8 models against ground truth generation
Note: these tests will only pass on H100
"""
import os
import pytest
@ -22,13 +23,13 @@ MODELS = ["nvidia/Llama-3.1-8B-Instruct-FP8"]
EXPECTED_STRS_MAP = {
"nvidia/Llama-3.1-8B-Instruct-FP8": [
"You're referring to VLLM, a high-performance Large Language Model (LLM) inference and",
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
'The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and',
"Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ",
"The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and",
'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
'**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir',
'The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to',
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
'Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる'
"**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir",
"The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to",
"The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of",
"Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる",
]
}
@ -39,10 +40,12 @@ EXPECTED_STRS_MAP = {
# the hardware being run on.
# Disabled to prevent it from breaking the build
@pytest.mark.skip(
reason=
"Prevent unstable test based on golden strings from breaking the build.")
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.")
reason="Prevent unstable test based on golden strings from breaking the build."
)
@pytest.mark.skipif(
not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
llm = LLM(
@ -55,12 +58,11 @@ def test_models(example_prompts, model_name) -> None:
tokenizer = AutoTokenizer.from_pretrained(model_name)
formatted_prompts = [
tokenizer.apply_chat_template([{
"role": "user",
"content": prompt
}],
tokenize=False,
add_generation_prompt=True)
tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
tokenize=False,
add_generation_prompt=True,
)
for prompt in example_prompts
]
params = SamplingParams(max_tokens=20, temperature=0)
@ -78,4 +80,5 @@ def test_models(example_prompts, model_name) -> None:
generated_str = generations[i]
expected_str = expected_strs[i]
assert expected_str == generated_str, (
f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}"
)
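
End to end, the golden-string flow above reduces to: render the chat template, decode greedily (temperature=0), compare against pinned text. A compact, hypothetical sketch assuming vllm and transformers; the model id and prompt are illustrative, not from the test.

from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is vLLM?"}],
    tokenize=False,
    add_generation_prompt=True,
)
llm = LLM(model_id)
out = llm.generate([prompt], SamplingParams(max_tokens=20, temperature=0))
print(out[0].outputs[0].text)  # the real test pins this against a golden string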


@ -1,8 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# flake8: noqa
"""Tests Quark mxfp4 models against ground truth generation
"""
"""Tests Quark mxfp4 models against ground truth generation"""
import pytest
from vllm import LLM, SamplingParams
@ -11,13 +11,13 @@ MODELS = ["amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8"]
EXPECTED_STRS_MAP = {
"amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8": [
'\n### Key Features\n\n* **High-throughput Inference**: vLL',
'\nArtificial intelligence (AI) has evolved significantly since its inception in the 1',
'Artificial intelligence (AI) and human intelligence (HI) are two distinct concepts that have been',
'A neural network is a machine learning model inspired by the structure of the human brain. It consists of',
'\nTitle: The Dreaming Robot\n\nAs the sun set on the bustling metropol',
'\nThe COVID-19 pandemic has had a profound impact on global economic structures and business',
'The Mona Lisa painting, created by Leonardo da Vinci in the early 16th',
"\n### Key Features\n\n* **High-throughput Inference**: vLL",
"\nArtificial intelligence (AI) has evolved significantly since its inception in the 1",
"Artificial intelligence (AI) and human intelligence (HI) are two distinct concepts that have been",
"A neural network is a machine learning model inspired by the structure of the human brain. It consists of",
"\nTitle: The Dreaming Robot\n\nAs the sun set on the bustling metropol",
"\nThe COVID-19 pandemic has had a profound impact on global economic structures and business",
"The Mona Lisa painting, created by Leonardo da Vinci in the early 16th",
" everybody knows this proverbial saying, but did you know that it's not entirely accurate?",
]
}
@ -38,4 +38,5 @@ def test_models(example_prompts, model_name) -> None:
output_str = output.outputs[0].text
expected_str = EXPECTED_STRS_MAP[model_name][i]
assert expected_str == output_str, (
f"Expected: {expected_str!r}\nvLLM: {output_str!r}")
f"Expected: {expected_str!r}\nvLLM: {output_str!r}"
)

View File

@ -4,6 +4,7 @@
"""Tests Model Optimizer nvfp4 models against ground truth generation
Note: these tests will only pass on B200
"""
import os
from typing import List
@ -21,14 +22,14 @@ MODELS = ["nvidia/Llama-3.3-70B-Instruct-FP4"]
EXPECTED_STRS_MAP = {
"nvidia/Llama-3.3-70B-Instruct-FP4": [
'vLLM (Vectorized Large Language Model) is indeed a high-throughput and memory-efficient inference',
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
'Artificial intelligence (AI) and human intelligence (HI) are two distinct forms of intelligence that process',
'A neural network is a type of machine learning model inspired by the structure and function of the human brain',
'In the heart of a cutting-edge robotics lab, a team of engineers had been working tirelessly to push',
'The COVID-19 pandemic has had a profound impact on global economic structures and future business models, leading',
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
'Here are the translations:\n\n* Japanese: (Sasuga no tori ga miwa o ts'
"vLLM (Vectorized Large Language Model) is indeed a high-throughput and memory-efficient inference",
"Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ",
"Artificial intelligence (AI) and human intelligence (HI) are two distinct forms of intelligence that process",
"A neural network is a type of machine learning model inspired by the structure and function of the human brain",
"In the heart of a cutting-edge robotics lab, a team of engineers had been working tirelessly to push",
"The COVID-19 pandemic has had a profound impact on global economic structures and future business models, leading",
"The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of",
"Here are the translations:\n\n* Japanese: (Sasuga no tori ga miwa o ts",
]
}
@ -39,11 +40,13 @@ EXPECTED_STRS_MAP = {
# the hardware being run on.
# Disabled to prevent it from breaking the build
@pytest.mark.skip(
reason=
"Prevent unstable test based on golden strings from breaking the build "
" and test input model being too large and hanging the system.")
@pytest.mark.skipif(not is_quant_method_supported("modelopt_fp4"),
reason="modelopt_fp4 is not supported on this GPU type.")
reason="Prevent unstable test based on golden strings from breaking the build "
" and test input model being too large and hanging the system."
)
@pytest.mark.skipif(
not is_quant_method_supported("modelopt_fp4"),
reason="modelopt_fp4 is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
llm = LLM(
@ -56,12 +59,11 @@ def test_models(example_prompts, model_name) -> None:
tokenizer = AutoTokenizer.from_pretrained(model_name)
formatted_prompts = [
tokenizer.apply_chat_template([{
"role": "user",
"content": prompt
}],
tokenize=False,
add_generation_prompt=True)
tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
tokenize=False,
add_generation_prompt=True,
)
for prompt in example_prompts
]
params = SamplingParams(max_tokens=20, temperature=0)
@ -79,4 +81,5 @@ def test_models(example_prompts, model_name) -> None:
generated_str = generations[i]
expected_str = expected_strs[i]
assert expected_str == generated_str, (
f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}"
)

View File

@ -114,8 +114,10 @@ class _HfExamplesInfo:
If the installed transformers version does not meet the requirements,
perform the given action.
"""
if (self.min_transformers_version is None
and self.max_transformers_version is None):
if (
self.min_transformers_version is None
and self.max_transformers_version is None
):
return None
current_version = TRANSFORMERS_VERSION
@ -125,11 +127,17 @@ class _HfExamplesInfo:
msg = f"`transformers=={current_version}` installed, but `transformers"
# Only check the base version for the min/max version, otherwise preview
# models cannot be run because `x.yy.0.dev0`<`x.yy.0`
if (check_min_version and min_version
and Version(cur_base_version) < Version(min_version)):
if (
check_min_version
and min_version
and Version(cur_base_version) < Version(min_version)
):
msg += f">={min_version}` is required to run this model."
elif (check_max_version and max_version
and Version(cur_base_version) > Version(max_version)):
elif (
check_max_version
and max_version
and Version(cur_base_version) > Version(max_version)
):
msg += f"<={max_version}` is required to run this model."
else:
return None
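# A small sketch of the base-version comparison used above: stripping the
# pre-release suffix lets a preview build satisfy a minimum-version check that
# a full comparison would fail (version numbers here are illustrative).
from packaging.version import Version

dev = Version("4.57.0.dev0")
assert dev < Version("4.57.0")        # full compare: preview build looks "too old"
assert dev.base_version == "4.57.0"   # base_version drops the ".dev0" suffix
assert Version(dev.base_version) >= Version("4.57.0")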

View File

@ -8,13 +8,19 @@ import pytest
from vllm import LLM
from vllm.utils import GiB_bytes
from vllm.v1.core.kv_cache_utils import (generate_scheduler_kv_cache_config,
get_kv_cache_configs)
from vllm.v1.core.kv_cache_utils import (
generate_scheduler_kv_cache_config,
get_kv_cache_configs,
)
from vllm.v1.engine.core import EngineCore as V1EngineCore
from ..utils import create_new_process_for_each_test
from .registry import (_TRANSFORMERS_BACKEND_MODELS, AUTO_EXAMPLE_MODELS,
HF_EXAMPLE_MODELS, HfExampleModels)
from .registry import (
_TRANSFORMERS_BACKEND_MODELS,
AUTO_EXAMPLE_MODELS,
HF_EXAMPLE_MODELS,
HfExampleModels,
)
from .utils import dummy_hf_overrides
# This minimal list of model architectures is smaller than the total list of
@ -24,23 +30,32 @@ from .utils import dummy_hf_overrides
# generation, sequence classification, causal LM, ranking, chat, reward model,
# multimodal, geospatial, voice, embedding, MTP)
MINIMAL_MODEL_ARCH_LIST = [
"LlavaForConditionalGeneration", "Llama4ForConditionalGeneration",
"BertForSequenceClassification", "Gemma3nForCausalLM", "JinaVLForRanking",
"InternVLChatModel", "InternLM2ForRewardModel",
"TransformersForMultimodalLM", "PrithviGeoSpatialMAE", "UltravoxModel",
"DeepSeekMTPModel", "XLMRobertaModel"
"LlavaForConditionalGeneration",
"Llama4ForConditionalGeneration",
"BertForSequenceClassification",
"Gemma3nForCausalLM",
"JinaVLForRanking",
"InternVLChatModel",
"InternLM2ForRewardModel",
"TransformersForMultimodalLM",
"PrithviGeoSpatialMAE",
"UltravoxModel",
"DeepSeekMTPModel",
"XLMRobertaModel",
]
# This list is the complement of the minimal list above. The intention is that
# this list of models is only tested in a "special case" i.e. most PRs should
# not test these models
OTHER_MODEL_ARCH_LIST = (set(HF_EXAMPLE_MODELS.get_supported_archs()) -
set(MINIMAL_MODEL_ARCH_LIST))
OTHER_MODEL_ARCH_LIST = set(HF_EXAMPLE_MODELS.get_supported_archs()) - set(
MINIMAL_MODEL_ARCH_LIST
)
@create_new_process_for_each_test()
def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
EXAMPLE_MODELS: HfExampleModels):
def can_initialize(
model_arch: str, monkeypatch: pytest.MonkeyPatch, EXAMPLE_MODELS: HfExampleModels
):
"""The reason for using create_new_process_for_each_test is to avoid
the WARNING:
"We must use the 'spawn' multiprocessing start method. Overriding
@ -53,12 +68,12 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
hf_overrides_fn = partial(dummy_hf_overrides,
model_arch=model_arch,
exist_overrides=model_info.hf_overrides,
use_original_num_layers=getattr(
model_info, 'use_original_num_layers',
False))
hf_overrides_fn = partial(
dummy_hf_overrides,
model_arch=model_arch,
exist_overrides=model_info.hf_overrides,
use_original_num_layers=getattr(model_info, "use_original_num_layers", False),
)
# Avoid calling model.forward()
def _initialize_kv_caches_v1(self, vllm_config):
@ -68,14 +83,15 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
kv_cache_specs,
[10 * GiB_bytes],
)
scheduler_kv_cache_config = generate_scheduler_kv_cache_config(
kv_cache_configs)
scheduler_kv_cache_config = generate_scheduler_kv_cache_config(kv_cache_configs)
# gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
return 1, 0, scheduler_kv_cache_config
with (patch.object(V1EngineCore, "_initialize_kv_caches",
_initialize_kv_caches_v1), monkeypatch.context() as m):
with (
patch.object(V1EngineCore, "_initialize_kv_caches", _initialize_kv_caches_v1),
monkeypatch.context() as m,
):
if model_info.v0_only:
# NOTE(woosuk): skip the test for V0-only models
return
@ -97,21 +113,24 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
speculative_config={
"model": model_info.speculative_model,
"num_speculative_tokens": 1,
} if model_info.speculative_model else None,
}
if model_info.speculative_model
else None,
trust_remote_code=model_info.trust_remote_code,
max_model_len=model_info.max_model_len,
# these tests seem to produce leftover memory
gpu_memory_utilization=0.80,
load_format="dummy",
model_impl="transformers"
if model_arch in _TRANSFORMERS_BACKEND_MODELS else "vllm",
if model_arch in _TRANSFORMERS_BACKEND_MODELS
else "vllm",
hf_overrides=hf_overrides_fn,
max_num_seqs=model_info.max_num_seqs)
max_num_seqs=model_info.max_num_seqs,
)
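# A minimal sketch of the stubbing pattern used above: patch.object swaps in a
# cheap _initialize_kv_caches for the duration of the context so no real KV
# cache memory is allocated (the class and return values here are illustrative).
from unittest.mock import patch

class FakeEngine:
    def _initialize_kv_caches(self):
        raise RuntimeError("would allocate GPU memory")

def _fake_init(self):
    return 1, 0, None  # gpu_blocks, cpu_blocks, scheduler kv-cache config stand-in

with patch.object(FakeEngine, "_initialize_kv_caches", _fake_init):
    assert FakeEngine()._initialize_kv_caches() == (1, 0, None)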
@pytest.mark.parametrize("model_arch", MINIMAL_MODEL_ARCH_LIST)
def test_can_initialize_small_subset(model_arch: str,
monkeypatch: pytest.MonkeyPatch):
def test_can_initialize_small_subset(model_arch: str, monkeypatch: pytest.MonkeyPatch):
"""Test initializing small subset of supported models"""
if model_arch == "Lfm2ForCausalLM":
pytest.skip("Skipping until test supports V1-only models")
@ -119,10 +138,9 @@ def test_can_initialize_small_subset(model_arch: str,
@pytest.mark.parametrize("model_arch", OTHER_MODEL_ARCH_LIST)
def test_can_initialize_large_subset(model_arch: str,
monkeypatch: pytest.MonkeyPatch):
def test_can_initialize_large_subset(model_arch: str, monkeypatch: pytest.MonkeyPatch):
"""Test initializing large subset of supported models
This test covers the complement of the tests covered in the "small subset"
test.
"""
@ -131,8 +149,6 @@ def test_can_initialize_large_subset(model_arch: str,
can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)
@pytest.mark.parametrize("model_arch",
AUTO_EXAMPLE_MODELS.get_supported_archs())
def test_implicit_converted_models(model_arch: str,
monkeypatch: pytest.MonkeyPatch):
@pytest.mark.parametrize("model_arch", AUTO_EXAMPLE_MODELS.get_supported_archs())
def test_implicit_converted_models(model_arch: str, monkeypatch: pytest.MonkeyPatch):
can_initialize(model_arch, monkeypatch, AUTO_EXAMPLE_MODELS)

View File

@ -50,9 +50,9 @@ def test_oot_registration_embedding(
with monkeypatch.context() as m:
m.setenv("VLLM_PLUGINS", "register_dummy_model")
prompts = ["Hello, my name is", "The text does not matter"]
llm = LLM(model=dummy_gemma2_embedding_path,
load_format="dummy",
max_model_len=2048)
llm = LLM(
model=dummy_gemma2_embedding_path, load_format="dummy", max_model_len=2048
)
outputs = llm.embed(prompts)
for output in outputs:
@ -69,27 +69,28 @@ def test_oot_registration_multimodal(
):
with monkeypatch.context() as m:
m.setenv("VLLM_PLUGINS", "register_dummy_model")
prompts = [{
"prompt": "What's in the image?<image>",
"multi_modal_data": {
"image": image
prompts = [
{
"prompt": "What's in the image?<image>",
"multi_modal_data": {"image": image},
},
}, {
"prompt": "Describe the image<image>",
"multi_modal_data": {
"image": image
{
"prompt": "Describe the image<image>",
"multi_modal_data": {"image": image},
},
}]
]
sampling_params = SamplingParams(temperature=0)
llm = LLM(model=dummy_llava_path,
load_format="dummy",
max_num_seqs=1,
trust_remote_code=True,
gpu_memory_utilization=0.98,
max_model_len=4096,
enforce_eager=True,
limit_mm_per_prompt={"image": 1})
llm = LLM(
model=dummy_llava_path,
load_format="dummy",
max_num_seqs=1,
trust_remote_code=True,
gpu_memory_utilization=0.98,
max_model_len=4096,
enforce_eager=True,
limit_mm_per_prompt={"image": 1},
)
first_token = llm.get_tokenizer().decode(0)
outputs = llm.generate(prompts, sampling_params)

View File

@ -6,16 +6,22 @@ import warnings
import pytest
import torch.cuda
from vllm.model_executor.models import (is_pooling_model,
is_text_generation_model,
supports_multimodal)
from vllm.model_executor.models.adapters import (as_embedding_model,
as_reward_model,
as_seq_cls_model)
from vllm.model_executor.models.registry import (_MULTIMODAL_MODELS,
_SPECULATIVE_DECODING_MODELS,
_TEXT_GENERATION_MODELS,
ModelRegistry)
from vllm.model_executor.models import (
is_pooling_model,
is_text_generation_model,
supports_multimodal,
)
from vllm.model_executor.models.adapters import (
as_embedding_model,
as_reward_model,
as_seq_cls_model,
)
from vllm.model_executor.models.registry import (
_MULTIMODAL_MODELS,
_SPECULATIVE_DECODING_MODELS,
_TEXT_GENERATION_MODELS,
ModelRegistry,
)
from vllm.platforms import current_platform
from ..utils import create_new_process_for_each_test
@ -34,8 +40,7 @@ def test_registry_imports(model_arch):
if model_arch in _SPECULATIVE_DECODING_MODELS:
return # Ignore these models which do not have a unified format
if (model_arch in _TEXT_GENERATION_MODELS
or model_arch in _MULTIMODAL_MODELS):
if model_arch in _TEXT_GENERATION_MODELS or model_arch in _MULTIMODAL_MODELS:
assert is_text_generation_model(model_cls)
# All vLLM models should be convertible to a pooling model
@ -48,13 +53,16 @@ def test_registry_imports(model_arch):
@create_new_process_for_each_test()
@pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [
("LlamaForCausalLM", False, False, False),
("LlavaForConditionalGeneration", True, True, False),
("BertForSequenceClassification", False, False, True),
("RobertaForSequenceClassification", False, False, True),
("XLMRobertaForSequenceClassification", False, False, True),
])
@pytest.mark.parametrize(
"model_arch,is_mm,init_cuda,is_ce",
[
("LlamaForCausalLM", False, False, False),
("LlavaForConditionalGeneration", True, True, False),
("BertForSequenceClassification", False, False, True),
("RobertaForSequenceClassification", False, False, True),
("XLMRobertaForSequenceClassification", False, False, True),
],
)
def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
model_info = ModelRegistry._try_inspect_model_cls(model_arch)
assert model_info is not None
@ -70,7 +78,8 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
warnings.warn(
"This model no longer initializes CUDA on import. "
"Please test using a different one.",
stacklevel=2)
stacklevel=2,
)
@create_new_process_for_each_test()
@ -82,7 +91,8 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
# ("MLPSpeculatorPreTrainedModel", False, False),
("DeepseekV2ForCausalLM", True, False),
("Qwen2VLForConditionalGeneration", True, True),
])
],
)
def test_registry_is_pp(model_arch, is_pp, init_cuda):
model_info = ModelRegistry._try_inspect_model_cls(model_arch)
assert model_info is not None
@ -97,13 +107,16 @@ def test_registry_is_pp(model_arch, is_pp, init_cuda):
warnings.warn(
"This model no longer initializes CUDA on import. "
"Please test using a different one.",
stacklevel=2)
stacklevel=2,
)
def test_hf_registry_coverage():
untested_archs = (ModelRegistry.get_supported_archs() -
HF_EXAMPLE_MODELS.get_supported_archs())
untested_archs = (
ModelRegistry.get_supported_archs() - HF_EXAMPLE_MODELS.get_supported_archs()
)
assert not untested_archs, (
"Please add the following architectures to "
f"`tests/models/registry.py`: {untested_archs}")
f"`tests/models/registry.py`: {untested_archs}"
)

View File

@ -11,32 +11,33 @@ from tests.conftest import VllmRunner
"model",
[
"ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
"mgazz/Prithvi_v2_eo_300_tl_unet_agb"
"mgazz/Prithvi_v2_eo_300_tl_unet_agb",
],
)
def test_inference(
vllm_runner: type[VllmRunner],
model: str,
) -> None:
pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)
prompt = dict(prompt_token_ids=[1],
multi_modal_data=dict(pixel_values=pixel_values,
location_coords=location_coords))
prompt = dict(
prompt_token_ids=[1],
multi_modal_data=dict(
pixel_values=pixel_values, location_coords=location_coords
),
)
with vllm_runner(
model,
runner="pooling",
dtype="half",
enforce_eager=True,
skip_tokenizer_init=True,
# Limit the maximum number of sequences to avoid the
# test going OOM during the warmup run
max_num_seqs=32,
default_torch_num_threads=1,
model,
runner="pooling",
dtype="half",
enforce_eager=True,
skip_tokenizer_init=True,
# Limit the maximum number of sequences to avoid the
# test going OOM during the warmup run
max_num_seqs=32,
default_torch_num_threads=1,
) as vllm_model:
vllm_output = vllm_model.llm.encode(prompt)
assert torch.equal(
torch.isnan(vllm_output[0].outputs.data).any(),
torch.tensor(False))
torch.isnan(vllm_output[0].outputs.data).any(), torch.tensor(False)
)

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test the functionality of the Transformers backend."""
from typing import Any, Optional, Union
import pytest
@ -60,14 +61,16 @@ def check_implementation(
@pytest.mark.skipif(
current_platform.is_rocm(),
reason="Llama-3.2-1B-Instruct, Ilama-3.2-1B produce memory access fault.")
reason="Llama-3.2-1B-Instruct, Ilama-3.2-1B produce memory access fault.",
)
@pytest.mark.parametrize(
"model,model_impl",
[
("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
("hmellor/Ilama-3.2-1B", "auto"), # CUSTOM CODE
("allenai/OLMoE-1B-7B-0924", "transformers"), # MoE
]) # trust_remote_code=True by default
],
) # trust_remote_code=True by default
def test_models(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
@ -77,29 +80,32 @@ def test_models(
) -> None:
import transformers
from packaging.version import Version
installed = Version(transformers.__version__)
required = Version("4.57.0.dev0")
if model == "allenai/OLMoE-1B-7B-0924" and installed < required:
pytest.skip("MoE models with the Transformers backend require "
f"transformers>={required}, but got {installed}")
pytest.skip(
"MoE models with the Transformers backend require "
f"transformers>={required}, but got {installed}"
)
check_implementation(hf_runner,
vllm_runner,
example_prompts,
model,
model_impl=model_impl)
check_implementation(
hf_runner, vllm_runner, example_prompts, model, model_impl=model_impl
)
def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None:
prompts, _, _ = prep_prompts(4, (800, 801))
kwargs_ref = {"max_model_len": 8192, "enforce_eager": True}
kwargs_test = {"model_impl": "transformers", **kwargs_ref}
check_implementation(vllm_runner,
vllm_runner,
prompts,
model="hmellor/tiny-random-Gemma2ForCausalLM",
kwargs_ref=kwargs_ref,
kwargs_test=kwargs_test)
check_implementation(
vllm_runner,
vllm_runner,
prompts,
model="hmellor/tiny-random-Gemma2ForCausalLM",
kwargs_ref=kwargs_ref,
kwargs_test=kwargs_test,
)
@multi_gpu_test(num_gpus=2)
@ -109,23 +115,28 @@ def test_distributed(
example_prompts,
):
kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
check_implementation(hf_runner,
vllm_runner,
example_prompts,
"meta-llama/Llama-3.2-1B-Instruct",
kwargs_test=kwargs)
@pytest.mark.parametrize("model, quantization_kwargs", [
("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {}),
("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {}),
(
check_implementation(
hf_runner,
vllm_runner,
example_prompts,
"meta-llama/Llama-3.2-1B-Instruct",
{
"quantization": "bitsandbytes",
},
),
])
kwargs_test=kwargs,
)
@pytest.mark.parametrize(
"model, quantization_kwargs",
[
("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {}),
("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {}),
(
"meta-llama/Llama-3.2-1B-Instruct",
{
"quantization": "bitsandbytes",
},
),
],
)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_quantization(
@ -136,27 +147,34 @@ def test_quantization(
max_tokens: int,
num_logprobs: int,
) -> None:
if (current_platform.is_rocm()
and quantization_kwargs.get("quantization", "") == "bitsandbytes"):
pytest.skip(
"bitsandbytes quantization is currently not supported in rocm.")
if (
current_platform.is_rocm()
and quantization_kwargs.get("quantization", "") == "bitsandbytes"
):
pytest.skip("bitsandbytes quantization is currently not supported in rocm.")
with vllm_runner(
model, model_impl="auto", enforce_eager=True,
**quantization_kwargs) as vllm_model: # type: ignore[arg-type]
model,
model_impl="auto",
enforce_eager=True,
**quantization_kwargs, # type: ignore[arg-type]
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs
)
with vllm_runner(
model,
model_impl="transformers",
enforce_eager=True,
**quantization_kwargs) as vllm_model: # type: ignore[arg-type]
model,
model_impl="transformers",
enforce_eager=True,
**quantization_kwargs, # type: ignore[arg-type]
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.using_transformers_backend()
transformers_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs
)
check_logprobs_close(
outputs_0_lst=transformers_outputs,
@ -172,22 +190,24 @@ def test_quantization(
# Layers live in `layers`
"Qwen/Qwen3-Embedding-0.6B",
# Layers live in `model.layers`
"meta-llama/Llama-3.2-1B-Instruct"
"meta-llama/Llama-3.2-1B-Instruct",
],
)
def test_embed_loading(vllm_runner, model):
with vllm_runner(model,
max_model_len=1024,
enforce_eager=True,
runner="pooling",
model_impl="transformers") as model_test:
with vllm_runner(
model,
max_model_len=1024,
enforce_eager=True,
runner="pooling",
model_impl="transformers",
) as model_test:
model_config = model_test.llm.llm_engine.model_config
assert model_config.using_transformers_backend()
@pytest.mark.parametrize(
"arch",
["TransformersEmbeddingModel", "TransformersForSequenceClassification"])
"arch", ["TransformersEmbeddingModel", "TransformersForSequenceClassification"]
)
def test_pooling(hf_runner, vllm_runner, example_prompts, arch):
model = get_model(arch)
@ -202,6 +222,7 @@ def test_pooling(hf_runner, vllm_runner, example_prompts, arch):
hf_kwargs["is_sentence_transformer"] = True
elif arch == "TransformersForSequenceClassification":
from transformers import AutoModelForSequenceClassification
hf_kwargs["auto_cls"] = AutoModelForSequenceClassification
# The example_prompts end with "\n", for example:
@ -212,8 +233,10 @@ def test_pooling(hf_runner, vllm_runner, example_prompts, arch):
# So we need to strip the input texts to avoid the test failing.
example_prompts = [str(s).strip() for s in example_prompts]
with (vllm_runner(model, **vllm_kwargs) as
vllm_model, hf_runner(model, **hf_kwargs) as hf_model):
with (
vllm_runner(model, **vllm_kwargs) as vllm_model,
hf_runner(model, **hf_kwargs) as hf_model,
):
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.using_transformers_backend()

View File

@ -10,7 +10,6 @@ pytestmark = pytest.mark.cpu_test
class ModuleWithBatchNorm(torch.nn.Module):
def __init__(self):
super().__init__()
self.bn = torch.nn.BatchNorm1d(2)
@ -20,7 +19,6 @@ class ModuleWithBatchNorm(torch.nn.Module):
class ModuleWithNestedBatchNorm(torch.nn.Module):
def __init__(self):
super().__init__()
self.nested_mod = ModuleWithBatchNorm()
@ -67,9 +65,11 @@ def test_module_with_child_containing_batchnorm_can_autoload():
new_mod = ModuleWithNestedBatchNorm()
assert not torch.all(
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean
)
assert not torch.all(
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var
)
assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0
loader = AutoWeightsLoader(new_mod)
@ -77,9 +77,9 @@ def test_module_with_child_containing_batchnorm_can_autoload():
# Ensure the stats are updated
assert torch.all(
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
assert torch.all(
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean
)
assert torch.all(new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
@ -101,9 +101,11 @@ def test_module_skip_prefix():
new_mod = ModuleWithNestedBatchNorm()
assert not torch.all(
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean
)
assert not torch.all(
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var
)
assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0
loader = AutoWeightsLoader(new_mod, skip_prefixes=["prefix."])
@ -111,9 +113,9 @@ def test_module_skip_prefix():
# Ensure the stats are updated
assert torch.all(
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
assert torch.all(
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean
)
assert torch.all(new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
@ -137,9 +139,11 @@ def test_module_skip_substr():
new_mod = ModuleWithNestedBatchNorm()
assert not torch.all(
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean
)
assert not torch.all(
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var
)
assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0
loader = AutoWeightsLoader(new_mod, skip_substrs=["substr."])
@ -147,7 +151,7 @@ def test_module_skip_substr():
# Ensure the stats are updated
assert torch.all(
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
assert torch.all(
new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean
)
assert torch.all(new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
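# A small sketch of the BatchNorm bookkeeping these tests rely on: a module in
# training mode updates its running statistics and batch counter on every
# forward pass, which is exactly what AutoWeightsLoader must restore on load.
import torch

bn = torch.nn.BatchNorm1d(2)  # starts with zeroed stats, counter at 0
assert bn.num_batches_tracked.item() == 0
bn(torch.randn(4, 2))  # one training-mode forward
assert bn.num_batches_tracked.item() == 1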

View File

@ -8,11 +8,16 @@ import torch.multiprocessing as mp
from tests.utils import multi_gpu_test
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.distributed.parallel_state import (init_distributed_environment,
initialize_model_parallel)
from vllm.distributed.parallel_state import (
init_distributed_environment,
initialize_model_parallel,
)
from vllm.model_executor.models.vision import (
get_load_balance_assignment, resolve_visual_encoder_outputs,
run_dp_sharded_mrope_vision_model, run_dp_sharded_vision_model)
get_load_balance_assignment,
resolve_visual_encoder_outputs,
run_dp_sharded_mrope_vision_model,
run_dp_sharded_vision_model,
)
from vllm.platforms import current_platform
from vllm.utils import get_open_port, update_environment_variables
@ -20,8 +25,7 @@ pytestmark = pytest.mark.cpu_test
@pytest.mark.parametrize(
("select_layers", "num_layers_loaded", "max_possible_layers",
"expected_features"),
("select_layers", "num_layers_loaded", "max_possible_layers", "expected_features"),
[
# All layers loaded
([1, 10], 10, 10, [1, 10]),
@ -29,16 +33,15 @@ pytestmark = pytest.mark.cpu_test
# Some layers not loaded
([1, 10], 10, 20, [1, 10]),
([-20, -11], 10, 20, [1, 10]),
])
def test_resolve_visual_encoder_outputs(select_layers, num_layers_loaded,
max_possible_layers,
expected_features):
],
)
def test_resolve_visual_encoder_outputs(
select_layers, num_layers_loaded, max_possible_layers, expected_features
):
"""
Test that offsets are correctly handled for vision feature layers.
"""
encoder_outputs = [
torch.tensor([idx]) for idx in range(num_layers_loaded + 1)
]
encoder_outputs = [torch.tensor([idx]) for idx in range(num_layers_loaded + 1)]
output_tensor = resolve_visual_encoder_outputs(
encoder_outputs=encoder_outputs,
post_layer_norm=None,
@ -85,10 +88,11 @@ def test_run_dp_sharded_vision_model(batch_size: int):
)
def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
batch_size: int, master_port: int):
def run_dp_sharded_vision_model_vs_direct(
local_rank: int, world_size: int, batch_size: int, master_port: int
):
"""
Test that run_dp_sharded_vision_model produces the same results as
Test that run_dp_sharded_vision_model produces the same results as
calling the model directly.
"""
@ -99,13 +103,15 @@ def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
current_platform.set_device(device)
torch.set_default_device(device)
update_environment_variables({
'RANK': str(local_rank),
'LOCAL_RANK': str(local_rank),
'WORLD_SIZE': str(world_size),
'MASTER_ADDR': 'localhost',
'MASTER_PORT': str(master_port),
})
update_environment_variables(
{
"RANK": str(local_rank),
"LOCAL_RANK": str(local_rank),
"WORLD_SIZE": str(world_size),
"MASTER_ADDR": "localhost",
"MASTER_PORT": str(master_port),
}
)
# initialize distributed
init_distributed_environment()
@ -141,28 +147,45 @@ def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
[
# Empty input
([], 2, [], [0, 0], [0, 0], "empty input"),
# Fewer samples than GPUs
([100, 200], 4, [1, 0], [1, 1, 0, 0], [200, 100, 0, 0
], "fewer samples than GPUs"),
(
[100, 200],
4,
[1, 0],
[1, 1, 0, 0],
[200, 100, 0, 0],
"fewer samples than GPUs",
),
# Single GPU
([100, 200, 300], 1, [2, 1, 0], [3], [600], "single GPU"),
# Balanced assignment
([100, 100, 100, 100
], 2, [0, 2, 1, 3], [2, 2], [200, 200], "balanced assignment"),
(
[100, 100, 100, 100],
2,
[0, 2, 1, 3],
[2, 2],
[200, 200],
"balanced assignment",
),
# Unbalanced sizes - this one is trickier since the algorithm is greedy
([1000, 100, 200, 50], 2, [0, 2, 1, 3
], [1, 3], [1000, 350], "unbalanced sizes"),
(
[1000, 100, 200, 50],
2,
[0, 2, 1, 3],
[1, 3],
[1000, 350],
"unbalanced sizes",
),
],
)
def test_get_load_balance_assignment_cases(sizes, num_gpus,
expected_shuffle_indices,
expected_gpu_sample_counts,
expected_grouped_sizes_per_gpu,
test_description):
def test_get_load_balance_assignment_cases(
sizes,
num_gpus,
expected_shuffle_indices,
expected_gpu_sample_counts,
expected_grouped_sizes_per_gpu,
test_description,
):
"""Test get_load_balance_assignment with various input cases."""
result = get_load_balance_assignment(sizes, num_gpus=num_gpus)
(shuffle_indices, gpu_sample_counts, grouped_sizes_per_gpu) = result
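# A minimal sketch of the greedy strategy the "unbalanced sizes" case above
# exercises (assumed behavior: visit samples largest-first, always placing the
# next one on the currently least-loaded GPU; this is not the vLLM source).
import heapq

def greedy_assign(sizes: list[int], num_gpus: int) -> list[int]:
    heap = [(0, gpu) for gpu in range(num_gpus)]  # (load, gpu_id)
    heapq.heapify(heap)
    assignment = [0] * len(sizes)
    for idx in sorted(range(len(sizes)), key=lambda i: -sizes[i]):
        load, gpu = heapq.heappop(heap)
        assignment[idx] = gpu
        heapq.heappush(heap, (load + sizes[idx], gpu))
    return assignment

# Reproduces the expected split above: GPU 0 takes [1000], GPU 1 takes 350 total.
assert greedy_assign([1000, 100, 200, 50], 2) == [0, 1, 1, 1]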
@ -188,8 +211,7 @@ class SimpleMRopeVisionModel(torch.nn.Module):
self.out_hidden_size = out_hidden_size
self.linear = torch.nn.Linear(768, out_hidden_size)
def forward(self, pixel_values: torch.Tensor,
grid_thw_list: list[list[int]]):
def forward(self, pixel_values: torch.Tensor, grid_thw_list: list[list[int]]):
"""Simple forward pass that simulates spatial merging."""
# Apply linear transformation
embeddings = self.linear(pixel_values)
@ -212,8 +234,9 @@ class SimpleMRopeVisionModel(torch.nn.Module):
merged_patches = num_patches // merge_factor
if merged_patches > 0:
# Reshape and average to simulate merging
reshaped = image_patches[:merged_patches * merge_factor].view(
merged_patches, merge_factor, -1)
reshaped = image_patches[: merged_patches * merge_factor].view(
merged_patches, merge_factor, -1
)
merged = reshaped.mean(dim=1)
merged_embeddings.append(merged)
@ -222,9 +245,11 @@ class SimpleMRopeVisionModel(torch.nn.Module):
if merged_embeddings:
return torch.cat(merged_embeddings, dim=0)
else:
return torch.empty((0, self.out_hidden_size),
device=pixel_values.device,
dtype=pixel_values.dtype)
return torch.empty(
(0, self.out_hidden_size),
device=pixel_values.device,
dtype=pixel_values.dtype,
)
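# A tiny numeric sketch of the reshape-and-average merge above: 8 patches with
# a merge factor of 4 collapse to 2 merged embeddings (values are illustrative).
import torch

patches = torch.arange(8, dtype=torch.float32).view(8, 1)
merged = patches.view(2, 4, -1).mean(dim=1)  # mean over each group of 4 patches
assert torch.equal(merged.squeeze(1), torch.tensor([1.5, 5.5]))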
@multi_gpu_test(num_gpus=2)
@ -250,12 +275,11 @@ def test_run_dp_sharded_mrope_vision_model(batch_size: int):
)
def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
world_size: int,
batch_size: int,
master_port: int):
def run_dp_sharded_mrope_vision_model_vs_direct(
local_rank: int, world_size: int, batch_size: int, master_port: int
):
"""
Test that run_dp_sharded_mrope_vision_model produces the same results as
Test that run_dp_sharded_mrope_vision_model produces the same results as
calling the model directly.
"""
# Set random seed for reproducibility
@ -264,13 +288,15 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
current_platform.set_device(device)
torch.set_default_device(device)
update_environment_variables({
'RANK': str(local_rank),
'LOCAL_RANK': str(local_rank),
'WORLD_SIZE': str(world_size),
'MASTER_ADDR': 'localhost',
'MASTER_PORT': str(master_port),
})
update_environment_variables(
{
"RANK": str(local_rank),
"LOCAL_RANK": str(local_rank),
"WORLD_SIZE": str(world_size),
"MASTER_ADDR": "localhost",
"MASTER_PORT": str(master_port),
}
)
# initialize distributed
init_distributed_environment()
@ -303,10 +329,9 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
# Run the model through the sharded function
with torch.inference_mode():
sharded_output = run_dp_sharded_mrope_vision_model(vision_model,
pixel_values,
grid_thw_list,
rope_type="rope_3d")
sharded_output = run_dp_sharded_mrope_vision_model(
vision_model, pixel_values, grid_thw_list, rope_type="rope_3d"
)
sharded_output = torch.cat(sharded_output, dim=0)
# Check that the world size is set up correctly
@ -317,10 +342,7 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
# Check that the outputs have the same shape
assert direct_output.shape == sharded_output.shape
# Check that the outputs are close (they should be identical)
assert torch.allclose(direct_output,
sharded_output,
rtol=1e-5,
atol=1e-5)
assert torch.allclose(direct_output, sharded_output, rtol=1e-5, atol=1e-5)
@multi_gpu_test(num_gpus=2)
@ -334,20 +356,23 @@ def test_run_dp_sharded_mrope_vision_model_empty_input():
def run_dp_sharded_mrope_vision_model_empty_input_worker(
local_rank: int, world_size: int, master_port: int):
local_rank: int, world_size: int, master_port: int
):
"""Test run_dp_sharded_mrope_vision_model with empty input."""
# Set up distributed environment
device = f"{current_platform.device_name}:{local_rank}"
current_platform.set_device(device)
torch.set_default_device(device)
update_environment_variables({
'RANK': str(local_rank),
'LOCAL_RANK': str(local_rank),
'WORLD_SIZE': str(world_size),
'MASTER_ADDR': 'localhost',
'MASTER_PORT': str(master_port),
})
update_environment_variables(
{
"RANK": str(local_rank),
"LOCAL_RANK": str(local_rank),
"WORLD_SIZE": str(world_size),
"MASTER_ADDR": "localhost",
"MASTER_PORT": str(master_port),
}
)
init_distributed_environment()
initialize_model_parallel(tensor_model_parallel_size=world_size)
@ -360,10 +385,9 @@ def run_dp_sharded_mrope_vision_model_empty_input_worker(
# Should handle empty input gracefully
with torch.inference_mode():
output = run_dp_sharded_mrope_vision_model(vision_model,
pixel_values,
grid_thw_list,
rope_type="rope_3d")
output = run_dp_sharded_mrope_vision_model(
vision_model, pixel_values, grid_thw_list, rope_type="rope_3d"
)
assert len(output) == 0
@ -379,7 +403,8 @@ def test_run_dp_sharded_mrope_vision_model_uneven_load():
def run_dp_sharded_mrope_vision_model_uneven_load_worker(
local_rank: int, world_size: int, master_port: int):
local_rank: int, world_size: int, master_port: int
):
"""Test run_dp_sharded_mrope_vision_model with uneven load distribution."""
# Set up distributed environment
current_platform.seed_everything(123)
@ -387,13 +412,15 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(
current_platform.set_device(device)
torch.set_default_device(device)
update_environment_variables({
'RANK': str(local_rank),
'LOCAL_RANK': str(local_rank),
'WORLD_SIZE': str(world_size),
'MASTER_ADDR': 'localhost',
'MASTER_PORT': str(master_port),
})
update_environment_variables(
{
"RANK": str(local_rank),
"LOCAL_RANK": str(local_rank),
"WORLD_SIZE": str(world_size),
"MASTER_ADDR": "localhost",
"MASTER_PORT": str(master_port),
}
)
init_distributed_environment()
initialize_model_parallel(tensor_model_parallel_size=world_size)
@ -401,7 +428,7 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(
# Create images with very different sizes
grid_thw_list = [
[1, 2, 2], # Small: 4 patches
[1, 8, 8], # Large: 64 patches
[1, 8, 8], # Large: 64 patches
[1, 3, 3], # Medium: 9 patches
]
@ -416,15 +443,15 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(
# Should handle uneven distribution without errors
with torch.inference_mode():
output_tuple = run_dp_sharded_mrope_vision_model(vision_model,
pixel_values,
grid_thw_list,
rope_type="rope_3d")
output_tuple = run_dp_sharded_mrope_vision_model(
vision_model, pixel_values, grid_thw_list, rope_type="rope_3d"
)
# Verify output shape is reasonable
merge_factor = vision_model.spatial_merge_size**2
expected_output_patches = list(
math.prod(grid_thw) // merge_factor for grid_thw in grid_thw_list)
math.prod(grid_thw) // merge_factor for grid_thw in grid_thw_list
)
for i, output in enumerate(output_tuple):
assert output.shape[0] == expected_output_patches[i]
@ -445,8 +472,9 @@ def test_simple_mrope_vision_model_spatial_merge(spatial_merge_size: int):
pixel_values_list.append(image_pixels)
pixel_values = torch.cat(pixel_values_list, dim=0)
vision_model = SimpleMRopeVisionModel(
spatial_merge_size=spatial_merge_size).to(device)
vision_model = SimpleMRopeVisionModel(spatial_merge_size=spatial_merge_size).to(
device
)
with torch.inference_mode():
output = vision_model(pixel_values, grid_thw_list)

View File

@ -33,16 +33,18 @@ def check_outputs_equal(
"""
assert len(outputs_0_lst) == len(outputs_1_lst)
for prompt_idx, (outputs_0,
outputs_1) in enumerate(zip(outputs_0_lst,
outputs_1_lst)):
for prompt_idx, (outputs_0, outputs_1) in enumerate(
zip(outputs_0_lst, outputs_1_lst)
):
output_ids_0, output_str_0 = outputs_0
output_ids_1, output_str_1 = outputs_1
# The text and token outputs should exactly match
fail_msg = (f"Test{prompt_idx}:"
f"\n{name_0}:\t{output_str_0!r}"
f"\n{name_1}:\t{output_str_1!r}")
fail_msg = (
f"Test{prompt_idx}:"
f"\n{name_0}:\t{output_str_0!r}"
f"\n{name_1}:\t{output_str_1!r}"
)
assert output_str_0 == output_str_1, fail_msg
assert output_ids_0 == output_ids_1, fail_msg
@ -54,9 +56,9 @@ def check_outputs_equal(
# * List of top sample logprobs for each sampled token
#
# Assumes prompt logprobs were not requested.
TokensTextLogprobs = tuple[list[int], str, Optional[Union[list[dict[int,
float]],
SampleLogprobs]]]
TokensTextLogprobs = tuple[
list[int], str, Optional[Union[list[dict[int, float]], SampleLogprobs]]
]
# Allow for tokens to be represented as str's rather than IDs;
# tuple of
@ -65,9 +67,9 @@ TokensTextLogprobs = tuple[list[int], str, Optional[Union[list[dict[int,
# * Optional list of top sample logprobs for each sampled token
#
# Assumes prompt logprobs were not requested.
TextTextLogprobs = tuple[list[str], str, Optional[Union[list[dict[str, float]],
list[dict[str,
Logprob]]]]]
TextTextLogprobs = tuple[
list[str], str, Optional[Union[list[dict[str, float]], list[dict[str, Logprob]]]]
]
# Representation of generated sequence as a tuple of
# * Token ID list
@ -77,18 +79,21 @@ TextTextLogprobs = tuple[list[str], str, Optional[Union[list[dict[str, float]],
#
# Allows prompt logprobs to be requested.
TokensTextLogprobsPromptLogprobs = tuple[
list[int], str, Optional[Union[list[dict[int, float]], SampleLogprobs]],
Optional[Union[list[Optional[dict[int, float]]], PromptLogprobs]]]
list[int],
str,
Optional[Union[list[dict[int, float]], SampleLogprobs]],
Optional[Union[list[Optional[dict[int, float]]], PromptLogprobs]],
]
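# A concrete value matching the TokensTextLogprobs shape above (numbers made up):
example = (
    [101, 7592],                  # sampled token IDs
    "hello",                      # decoded text
    [{101: -0.1}, {7592: -0.5}],  # top logprobs per sampled token (or None)
)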
def check_logprobs_close(
*,
outputs_0_lst: Sequence[Union[TokensTextLogprobs,
TokensTextLogprobsPromptLogprobs,
TextTextLogprobs]],
outputs_1_lst: Sequence[Union[TokensTextLogprobs,
TokensTextLogprobsPromptLogprobs,
TextTextLogprobs]],
outputs_0_lst: Sequence[
Union[TokensTextLogprobs, TokensTextLogprobsPromptLogprobs, TextTextLogprobs]
],
outputs_1_lst: Sequence[
Union[TokensTextLogprobs, TokensTextLogprobsPromptLogprobs, TextTextLogprobs]
],
name_0: str,
name_1: str,
num_outputs_0_skip_tokens: int = 0,
@ -128,9 +133,9 @@ def check_logprobs_close(
assert len(outputs_0_lst) == len(outputs_1_lst)
# Loop through responses to each prompt.
for prompt_idx, (outputs_0,
outputs_1) in enumerate(zip(outputs_0_lst,
outputs_1_lst)):
for prompt_idx, (outputs_0, outputs_1) in enumerate(
zip(outputs_0_lst, outputs_1_lst)
):
assert len(outputs_0) == len(outputs_1)
if len(outputs_0) == 3:
assert len(outputs_1) == 3
@ -155,17 +160,18 @@ def check_logprobs_close(
) = outputs_1
# Test prompt logprobs closeness
if (prompt_logprobs_0 is not None
and prompt_logprobs_1 is not None):
if prompt_logprobs_0 is not None and prompt_logprobs_1 is not None:
# Both sequences' prompt logprobs lists are not `None``
# (although individual list elements may be `None`);
# for each token's logprobs:
for idx, (logprobs_elem_0, logprobs_elem_1) in enumerate(
zip(prompt_logprobs_0, prompt_logprobs_1)):
zip(prompt_logprobs_0, prompt_logprobs_1)
):
fail_msg = (
f"Prompt logprobs test:"
f"\n{name_0}:\tPrompt index {idx}\t{logprobs_elem_0}"
f"\n{name_1}:\tPrompt index {idx}\t{logprobs_elem_1}")
f"\n{name_1}:\tPrompt index {idx}\t{logprobs_elem_1}"
)
if logprobs_elem_0 is None:
# If the seq 0 token's logprobs are `None`,
@ -176,20 +182,24 @@ def check_logprobs_close(
# the seq 1 token's logprobs must not be `None`
assert logprobs_elem_1 is not None, fail_msg
# Logprobs check: top-k token choices must be the same
assert (set(logprobs_elem_0.keys()) == set(
logprobs_elem_1.keys())), fail_msg
assert set(logprobs_elem_0.keys()) == set(
logprobs_elem_1.keys()
), fail_msg
else:
# Both sequence logprobs lists must be `None`
fail_msg = (f"Prompt logprobs test:"
f"\n{name_0}:\tlogprobs\t{prompt_logprobs_0}"
f"\n{name_1}:\tlogprobs\t{prompt_logprobs_1}")
fail_msg = (
f"Prompt logprobs test:"
f"\n{name_0}:\tlogprobs\t{prompt_logprobs_0}"
f"\n{name_1}:\tlogprobs\t{prompt_logprobs_1}"
)
assert (prompt_logprobs_0 is None
and prompt_logprobs_1 is None), fail_msg
assert prompt_logprobs_0 is None and prompt_logprobs_1 is None, fail_msg
else:
raise ValueError(f"Outputs tuple must have 3 or 4 elements but "
f"{len(outputs_0)} elements were provided: "
f"{outputs_0}")
raise ValueError(
f"Outputs tuple must have 3 or 4 elements but "
f"{len(outputs_0)} elements were provided: "
f"{outputs_0}"
)
if logprobs_0 is None:
logprobs_0 = [None] * len(output_ids_0)
@ -206,9 +216,9 @@ def check_logprobs_close(
logprobs_0 = logprobs_0[num_outputs_0_skip_tokens:]
# Loop through generated tokens.
for idx, (output_id_0,
output_id_1) in enumerate(zip(output_ids_0, output_ids_1)):
for idx, (output_id_0, output_id_1) in enumerate(
zip(output_ids_0, output_ids_1)
):
is_tok_mismatch = output_id_0 != output_id_1
# If generated tokens don't match
@ -223,7 +233,8 @@ def check_logprobs_close(
f"Test{prompt_idx}:"
f"\nMatched tokens:\t{output_ids_0[:idx]}"
f"\n{name_0}:\t{output_str_0!r}\t{logprobs_elem_0}"
f"\n{name_1}:\t{output_str_1!r}\t{logprobs_elem_1}")
f"\n{name_1}:\t{output_str_1!r}\t{logprobs_elem_1}"
)
assert logprobs_elem_0 is not None, fail_msg
assert logprobs_elem_1 is not None, fail_msg
@ -244,9 +255,11 @@ def check_logprobs_close(
if output_str_0 != output_str_1 and warn_on_mismatch:
# The token outputs exactly match,
# so the text outputs should exactly match as well
fail_msg = (f"Test{prompt_idx}:"
f"\n{name_0}:\t{output_str_0!r}"
f"\n{name_1}:\t{output_str_1!r}")
fail_msg = (
f"Test{prompt_idx}:"
f"\n{name_0}:\t{output_str_0!r}"
f"\n{name_1}:\t{output_str_1!r}"
)
with warnings.catch_warnings():
# This ensures that repeated warnings are shown
@ -317,18 +330,22 @@ def check_embeddings_close(
assert len(embeddings_0_lst) == len(embeddings_1_lst)
for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
zip(embeddings_0_lst, embeddings_1_lst)):
zip(embeddings_0_lst, embeddings_1_lst)
):
assert len(embeddings_0) == len(embeddings_1), (
f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")
f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}"
)
sim = F.cosine_similarity(torch.tensor(embeddings_0),
torch.tensor(embeddings_1),
dim=0)
sim = F.cosine_similarity(
torch.tensor(embeddings_0), torch.tensor(embeddings_1), dim=0
)
fail_msg = (f"Test{prompt_idx}:"
f"\nCosine similarity: \t{sim:.4f}"
f"\n{name_0}:\t{embeddings_0[:16]!r}"
f"\n{name_1}:\t{embeddings_1[:16]!r}")
fail_msg = (
f"Test{prompt_idx}:"
f"\nCosine similarity: \t{sim:.4f}"
f"\n{name_0}:\t{embeddings_0[:16]!r}"
f"\n{name_1}:\t{embeddings_1[:16]!r}"
)
assert sim >= 1 - tol, fail_msg
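# A minimal sketch of the closeness criterion above: two embeddings count as
# matching when their cosine similarity is within `tol` of 1 (vectors made up).
import torch
import torch.nn.functional as F

a = torch.tensor([1.0, 0.0, 0.0])
b = torch.tensor([0.999, 0.01, 0.0])
sim = F.cosine_similarity(a, b, dim=0)
assert sim >= 1 - 1e-2  # nearly parallel vectors pass the check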
@ -413,20 +430,19 @@ def dummy_hf_overrides(
# Ensure at least 2 experts per group
# Since `grouped_topk` assumes top-2
n_group = getattr(text_config, 'n_group', None)
n_group = getattr(text_config, "n_group", None)
num_experts = n_group * 2 if n_group is not None else 2
# we use three layers for Gemma-3n to check
# both normal layer and kv_shared_layer
if use_original_num_layers:
# Use the original number of layers from the config
num_layers = getattr(text_config, 'num_layers', 1)
num_hidden_layers = getattr(text_config, 'num_hidden_layers', 1)
num_layers = getattr(text_config, "num_layers", 1)
num_hidden_layers = getattr(text_config, "num_hidden_layers", 1)
else:
# Use minimal layers for testing
num_layers = 1
num_hidden_layers = (3 if model_arch
== "Gemma3nForConditionalGeneration" else 1)
num_hidden_layers = 3 if model_arch == "Gemma3nForConditionalGeneration" else 1
update_dict = {
"num_layers": num_layers,
@ -440,53 +456,63 @@ def dummy_hf_overrides(
# Only set MoE related config when the model has MoE layers.
# Otherwise all models detected as MoE by _get_transformers_backend_cls.
if ModelConfig.get_num_experts(DummyConfig) > 0:
update_dict.update({
"num_experts": num_experts,
"num_experts_per_tok": 2,
"num_local_experts": num_experts,
# Otherwise there will not be any expert layers
"first_k_dense_replace": 0,
# To avoid OOM on DeepSeek-V3
"n_routed_experts": num_experts,
})
update_dict.update(
{
"num_experts": num_experts,
"num_experts_per_tok": 2,
"num_local_experts": num_experts,
# Otherwise there will not be any expert layers
"first_k_dense_replace": 0,
# To avoid OOM on DeepSeek-V3
"n_routed_experts": num_experts,
}
)
# Update num_hidden_layers for non-Longcat architectures
if model_arch != "LongcatFlashForCausalLM" \
and model_arch != "LongCatFlashMTPModel":
if model_arch != "LongcatFlashForCausalLM" and model_arch != "LongCatFlashMTPModel":
update_dict["num_hidden_layers"] = num_hidden_layers
text_config.update(update_dict)
if hasattr(hf_config, "vision_config"):
hf_config.vision_config.update({
"num_layers": 1,
"num_hidden_layers": 1,
})
hf_config.vision_config.update(
{
"num_layers": 1,
"num_hidden_layers": 1,
}
)
# e.g.: ibm-granite/granite-speech-3.3-2b
if hasattr(hf_config, "encoder_config"):
hf_config.encoder_config.update({
"num_layers": 1,
"num_hidden_layers": 1,
})
hf_config.encoder_config.update(
{
"num_layers": 1,
"num_hidden_layers": 1,
}
)
# e.g.: Qwen/Qwen2-Audio-7B-Instruct
if hasattr(hf_config, "audio_config"):
hf_config.audio_config.update({
"num_layers": 1,
"num_hidden_layers": 1,
"encoder_layers": 1,
})
hf_config.audio_config.update(
{
"num_layers": 1,
"num_hidden_layers": 1,
"encoder_layers": 1,
}
)
return hf_config
def check_transformers_version(model: str,
min_transformers_version: Optional[str] = None,
max_transformers_version: Optional[str] = None):
def check_transformers_version(
model: str,
min_transformers_version: Optional[str] = None,
max_transformers_version: Optional[str] = None,
):
from .registry import _HfExamplesInfo
return _HfExamplesInfo(model,
min_transformers_version=min_transformers_version,
max_transformers_version=max_transformers_version
).check_transformers_version(on_fail="skip")
return _HfExamplesInfo(
model,
min_transformers_version=min_transformers_version,
max_transformers_version=max_transformers_version,
).check_transformers_version(on_fail="skip")