updated

Signed-off-by: Robert Shaw <robshaw@redhat.com>
2025-07-15 13:50:42 +00:00
269 changed files with 7620 additions and 10749 deletions
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@ -6,17 +6,19 @@ set -exuo pipefail

 # Try building the docker image
 cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
-FROM gaudi-base-image:latest
+FROM 1.22-413-pt2.7.1:latest

 COPY ./ /workspace/vllm

 WORKDIR /workspace/vllm

+RUN pip install -v -r requirements/hpu.txt
+RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
+
 ENV no_proxy=localhost,127.0.0.1
 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

-RUN VLLM_TARGET_DEVICE=empty pip install .
-RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
+RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install

 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -70,7 +70,7 @@ export VLLM_XLA_CACHE_PATH=
 echo "Using VLLM V1"

 echo "--- Hardware Information ---"
-# tpu-info
+tpu-info
 echo "--- Starting Tests ---"
 set +e
 overall_script_exit_code=0
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -645,7 +645,7 @@ steps:
  optional: true
  commands:
    - pip install --upgrade git+https://github.com/huggingface/transformers
-    - pytest -v -s tests/models/test_initialization.py
+    - pytest -v -s models/test_initialization.py
    - pytest -v -s tests/models/multimodal/processing/
    - pytest -v -s tests/models/multimodal/test_mapping.py
    - python3 examples/offline_inference/basic/chat.py
--- a/.gemini/config.yaml
+++ b/.gemini/config.yaml
@ -1,6 +0,0 @@
-# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
-have_fun: false  # Just review the code
-code_review:
-  comment_severity_threshold: HIGH  # Reduce quantity of comments
-  pull_request_opened:
-    summary: false  # Don't summarize the PR in a separate comment
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -16,7 +16,7 @@
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm
 /vllm/entrypoints @aarnphm
-/vllm/compilation @zou3519 @youkaichao @ProExpertProg
+/vllm/compilation @zou3519 @youkaichao
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson

 # Any change to the VllmConfig changes can have a large user-facing impact,
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -21,7 +21,7 @@ repos:
  - id: ruff-format
    files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
-  rev: v1.34.0
+  rev: v1.32.0
  hooks:
  - id: typos
 - repo: https://github.com/PyCQA/isort
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -45,7 +45,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
 set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")

 #
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@ -30,11 +30,17 @@ from datasets import load_dataset
 from PIL import Image
 from transformers import PreTrainedTokenizerBase

-from vllm.lora.request import LoRARequest
-from vllm.lora.utils import get_adapter_absolute_path
-from vllm.multimodal import MultiModalDataDict
-from vllm.multimodal.image import convert_image_mode
-from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
+try:
+    from vllm.lora.request import LoRARequest
+    from vllm.lora.utils import get_adapter_absolute_path
+    from vllm.multimodal import MultiModalDataDict
+    from vllm.multimodal.image import convert_image_mode
+    from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
+except:
+    MultiModalDataDict = None
+    AnyTokenizer = None
+    LoRARequest = None
+    print("Install vLLM to use LoRA or Multimodal benchmarking.")

 logger = logging.getLogger(__name__)

--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@ -80,11 +80,6 @@ def bench_run(
        a, score, topk, renormalize=False
    )

-    ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
-    ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
-    c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
-    c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
-
    def run_triton_moe(
        a: torch.Tensor,
        w1: torch.Tensor,
@ -116,10 +111,6 @@ def bench_run(
        w2: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
-        ab_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor,
-        c_strides1: torch.Tensor,
-        c_strides2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        per_act_token: bool,
@ -134,10 +125,6 @@ def bench_run(
                topk_ids,
                w1_scale,
                w2_scale,
-                ab_strides1,
-                ab_strides2,
-                c_strides1,
-                c_strides2,
                per_act_token,
                a1_scale=None,
            )
@ -149,10 +136,6 @@ def bench_run(
        w2_q: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
-        ab_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor,
-        c_strides1: torch.Tensor,
-        c_strides2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
    ):
@ -167,10 +150,6 @@ def bench_run(
                topk_ids,
                w1_scale,
                w2_scale,
-                ab_strides1,
-                ab_strides2,
-                c_strides1,
-                c_strides2,
                per_act_token,
                a1_scale=None,
            )
@ -215,10 +194,6 @@ def bench_run(
            w2_q,
            w1_scale,
            w2_scale,
-            ab_strides1,
-            ab_strides2,
-            c_strides1,
-            c_strides2,
            topk_weights,
            topk_ids,
        )
@ -256,10 +231,6 @@ def bench_run(
        "w1_scale": w1_scale,
        "w2_scale": w2_scale,
        "per_act_token": per_act_token,
-        "ab_strides1": ab_strides1,
-        "ab_strides2": ab_strides2,
-        "c_strides1": c_strides1,
-        "c_strides2": c_strides2,
        # cuda graph params
        "cutlass_graph": cutlass_graph,
        "triton_graph": triton_graph,
@ -318,10 +289,6 @@ def bench_run(
        w2_q,
        w1_scale,
        w2_scale,
-        ab_strides1,
-        ab_strides2,
-        c_strides1,
-        c_strides2,
        topk_weights,
        topk_ids,
        per_act_token,
@ -330,7 +297,7 @@ def bench_run(

    results.append(
        benchmark.Timer(
-            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)",  # noqa: E501
+            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -586,11 +586,6 @@ def main(args: argparse.Namespace):
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
-        E = config.num_experts
-        topk = config.moe_topk[0]
-        intermediate_size = config.moe_intermediate_size[0]
-        shard_intermediate_size = 2 * intermediate_size // args.tp_size
    else:
        # Support for llama4
        config = config.get_text_config()
--- a/csrc/attention/attention_kernels.cuh
+++ b/csrc/attention/attention_kernels.cuh
@ -24,7 +24,6 @@

 #include "attention_dtypes.h"
 #include "attention_utils.cuh"
-#include "cuda_compat.h"

 #ifdef USE_ROCM
  #include <hip/hip_bf16.h>
@ -34,6 +33,12 @@ typedef __hip_bfloat16 __nv_bfloat16;
  #include "../quantization/fp8/nvidia/quant_utils.cuh"
 #endif

+#ifndef USE_ROCM
+  #define WARP_SIZE 32
+#else
+  #define WARP_SIZE warpSize
+#endif
+
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
@ -665,6 +670,7 @@ __global__ void paged_attention_v2_reduce_kernel(

 }  // namespace vllm

+#undef WARP_SIZE
 #undef MAX
 #undef MIN
 #undef DIVIDE_ROUND_UP
--- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
+++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
@ -18,7 +18,6 @@ limitations under the License.
 * Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929
 * by Alcanderian JieXin Liang
 */
-#include "core/registration.h"

 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
@ -271,13 +270,4 @@ int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_ba
 }

 #endif
-
-TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
-  m.impl("sm100_cutlass_mla_decode", &sm100_cutlass_mla_decode);
-}
-
-TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CatchAll, m) {
-  m.impl("sm100_cutlass_mla_get_workspace_size", &sm100_cutlass_mla_get_workspace_size);
-}
-
 // clang-format on
--- a/csrc/attention/paged_attention_v1.cu
+++ b/csrc/attention/paged_attention_v1.cu
@ -18,7 +18,12 @@
 */

 #include "attention_kernels.cuh"
-#include "cuda_compat.h"
+
+#ifndef USE_ROCM
+  #define WARP_SIZE 32
+#else
+  #define WARP_SIZE warpSize
+#endif

 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -182,6 +187,7 @@ void paged_attention_v1(
                             CALL_V1_LAUNCHER_BLOCK_SIZE)
 }

+#undef WARP_SIZE
 #undef MAX
 #undef MIN
 #undef DIVIDE_ROUND_UP
--- a/csrc/attention/paged_attention_v2.cu
+++ b/csrc/attention/paged_attention_v2.cu
@ -18,7 +18,12 @@
 */

 #include "attention_kernels.cuh"
-#include "cuda_compat.h"
+
+#ifndef USE_ROCM
+  #define WARP_SIZE 32
+#else
+  #define WARP_SIZE warpSize
+#endif

 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -192,6 +197,7 @@ void paged_attention_v2(
                             CALL_V2_LAUNCHER_BLOCK_SIZE)
 }

+#undef WARP_SIZE
 #undef MAX
 #undef MIN
 #undef DIVIDE_ROUND_UP
--- a/csrc/cpu/sgl-kernels/common.h
+++ b/csrc/cpu/sgl-kernels/common.h
@ -58,7 +58,7 @@ namespace {

 #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
 #define CHECK_LAST_DIM_CONTIGUOUS(x) \
-  TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimension")
+  TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x " must be contiguous at last dimension")

 #define CHECK_INPUT(x) \
  CHECK_CPU(x);        \
--- a/csrc/cpu/sgl-kernels/gemm.h
+++ b/csrc/cpu/sgl-kernels/gemm.h
@ -126,7 +126,7 @@ void fused_experts_int4_w4a16_kernel_impl(
    int64_t topk,
    int64_t num_tokens_post_pad);

-// shared expert implementation for int8 w8a8
+// shared expert implementation for int8 w8a8
 template <typename scalar_t>
 void shared_expert_int8_kernel_impl(
    scalar_t* __restrict__ output,
--- a/csrc/cpu/sgl-kernels/gemm_int8.cpp
+++ b/csrc/cpu/sgl-kernels/gemm_int8.cpp
@ -41,7 +41,7 @@ struct tinygemm_kernel_nn<at::BFloat16, has_bias, BLOCK_M, BLOCK_N> {
    __m512  vd0;
    __m512  vd1[COLS];

-    // oops! 4x4 spills but luckily we use 4x2
+    // oops! 4x4 spills but luckily we use 4x2
    __m512 vbias[COLS];

    // [NOTE]: s8s8 igemm compensation in avx512-vnni
--- a/csrc/cpu/sgl-kernels/vec.h
+++ b/csrc/cpu/sgl-kernels/vec.h
@ -37,7 +37,7 @@ inline Vectorized<at::BFloat16> convert_from_float_ext<at::BFloat16>(const Vecto
 #define CVT_FP16_TO_FP32(a) \
    _mm512_cvtps_ph(a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))

-// this doesn't handle NaN.
+// this doesn't handle NaN.
 inline __m512bh cvt_e4m3_bf16_intrinsic_no_nan(__m256i fp8_vec) {
  const __m512i x = _mm512_cvtepu8_epi16(fp8_vec);

--- a/csrc/cuda_compat.h
+++ b/csrc/cuda_compat.h
@ -4,10 +4,10 @@
  #include <hip/hip_runtime.h>
 #endif

-#if defined(USE_ROCM) && defined(__GFX9__)
-  #define WARP_SIZE 64
-#else
+#ifndef USE_ROCM
  #define WARP_SIZE 32
+#else
+  #define WARP_SIZE warpSize
 #endif

 #ifndef USE_ROCM
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@ -160,30 +160,6 @@ __global__ void shuffleInputRowsKernel(const T* input,
  }
 }

-template <typename T>
-__global__ void shuffleInputRowsKernelSlow(const T* input,
-                                           const int32_t* dst2src_map,
-                                           T* output, int64_t num_src_rows,
-                                           int64_t num_dst_rows,
-                                           int64_t num_cols) {
-  int64_t dest_row_idx = blockIdx.x;
-  int64_t const source_row_idx = dst2src_map[dest_row_idx];
-
-  if (blockIdx.x < num_dst_rows) {
-    // Duplicate and permute rows
-    auto const* source_row_ptr = input + source_row_idx * num_cols;
-    auto* dest_row_ptr = output + dest_row_idx * num_cols;
-
-    int64_t const start_offset = threadIdx.x;
-    int64_t const stride = blockDim.x;
-
-    for (int elem_index = start_offset; elem_index < num_cols;
-         elem_index += stride) {
-      dest_row_ptr[elem_index] = source_row_ptr[elem_index];
-    }
-  }
-}
-
 void shuffle_rows(const torch::Tensor& input_tensor,
                  const torch::Tensor& dst2src_map,
                  torch::Tensor& output_tensor) {
@ -197,24 +173,17 @@ void shuffle_rows(const torch::Tensor& input_tensor,
  int64_t const num_src_rows = input_tensor.size(0);
  int64_t const num_cols = input_tensor.size(1);

-  if (num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)) {
-    // use slow kernel if num_cols can't be aligned to 128 bits
-    MOE_DISPATCH(input_tensor.scalar_type(), [&] {
-      shuffleInputRowsKernelSlow<scalar_t><<<blocks, threads, 0, stream>>>(
-          reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
-          dst2src_map.data_ptr<int32_t>(),
-          reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
-          num_dest_rows, num_cols);
-    });
-  } else {
-    MOE_DISPATCH(input_tensor.scalar_type(), [&] {
-      shuffleInputRowsKernel<scalar_t><<<blocks, threads, 0, stream>>>(
-          reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
-          dst2src_map.data_ptr<int32_t>(),
-          reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
-          num_dest_rows, num_cols);
-    });
-  }
+  TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)),
+              "num_cols must be divisible by 128 / "
+              "sizeof(input_tensor.scalar_type()) / 8");
+
+  MOE_DISPATCH(input_tensor.scalar_type(), [&] {
+    shuffleInputRowsKernel<scalar_t><<<blocks, threads, 0, stream>>>(
+        reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
+        dst2src_map.data_ptr<int32_t>(),
+        reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
+        num_dest_rows, num_cols);
+  });
 }

 #else
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -167,6 +167,19 @@ void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
                        torch::Tensor const& seq_lens,
                        torch::Tensor const& page_table, double scale);

+void sm100_cutlass_mla_decode(
+    torch::Tensor const& out, torch::Tensor const& q_nope,
+    torch::Tensor const& q_pe, torch::Tensor const& kv_c_and_k_pe_cache,
+    torch::Tensor const& seq_lens, torch::Tensor const& page_table,
+    torch::Tensor const& workspace, double sm_scale,
+    int64_t num_kv_splits =
+        1 /* Set to 1 to avoid cuda_graph issue by default. */);
+
+int64_t sm100_cutlass_mla_get_workspace_size(
+    int64_t max_seq_len, int64_t num_batches, int64_t sm_count = 0,
+    int64_t num_kv_splits =
+        1 /* Set to 1 to avoid cuda_graph issue by default. */);
+
 torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor);

 #ifndef USE_ROCM
--- a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu
@ -29,36 +29,19 @@ struct sm90_fp8_config_default {

 template <typename InType, typename OutType,
          template <typename, typename, typename> typename Epilogue>
-struct sm90_fp8_config_M4 {
-  // M in [1, 4]
+struct sm90_fp8_config_M16 {
+  // M in [1, 16]
  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
  using KernelSchedule =
      cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum;
  using EpilogueSchedule =
      cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
-  using TileShape = cute::Shape<cute::_128, cute::_16, cute::_128>;
-  using ClusterShape = cute::Shape<cute::_1, cute::_1, cute::_1>;
+  using TileShape = cute::Shape<cute::_64, cute::_64, cute::_128>;
+  using ClusterShape = cute::Shape<cute::_1, cute::_4, cute::_1>;

  using Cutlass3xGemm =
      cutlass_3x_group_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule, true>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm90_fp8_config_M64 {
-  // M in (4, 64]
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule =
-      cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum;
-  using EpilogueSchedule =
-      cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
-  using TileShape = cute::Shape<cute::_128, cute::_16, cute::_256>;
-  using ClusterShape = cute::Shape<cute::_2, cute::_1, cute::_1>;
-
-  using Cutlass3xGemm =
-      cutlass_3x_group_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule, true>;
+                            KernelSchedule, EpilogueSchedule>;
 };

 template <typename InType, typename OutType,
@ -119,9 +102,7 @@ void run_cutlass_moe_mm_sm90(
      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
  using Cutlass3xGemmK8192 = typename sm90_fp8_config_K8192<
      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
-  using Cutlass3xGemmM4 = typename sm90_fp8_config_M4<
-      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
-  using Cutlass3xGemmM64 = typename sm90_fp8_config_M64<
+  using Cutlass3xGemmM16 = typename sm90_fp8_config_M16<
      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
  using Cutlass3xGemmDefault = typename sm90_fp8_config_default<
      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
@ -130,18 +111,7 @@ void run_cutlass_moe_mm_sm90(
  uint32_t const n = out_tensors.size(1);
  uint32_t const k = a_tensors.size(1);

-  // Use swap_ab for M <= 64 by default to reduce padding
-  if (m <= 4) {
-    cutlass_group_gemm_caller<Cutlass3xGemmM4>(
-        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
-  } else if (m <= 64) {
-    cutlass_group_gemm_caller<Cutlass3xGemmM64>(
-        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
-  } else if (n >= 8192) {
+  if (n >= 8192) {
    cutlass_group_gemm_caller<Cutlass3xGemmN8192>(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
@ -151,6 +121,11 @@ void run_cutlass_moe_mm_sm90(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
        per_out_ch);
+  } else if (m <= 16) {
+    cutlass_group_gemm_caller<Cutlass3xGemmM16>(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
  } else {
    cutlass_group_gemm_caller<Cutlass3xGemmDefault>(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
--- a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh
+++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh
@ -22,23 +22,14 @@ using ArchTag = cutlass::arch::Sm90;
 using OperatorClass = cutlass::arch::OpClassTensorOp;

 using LayoutA = cutlass::layout::RowMajor;
-using LayoutA_Transpose =
-    typename cutlass::layout::LayoutTranspose<LayoutA>::type;
 using LayoutB = cutlass::layout::ColumnMajor;
-using LayoutB_Transpose =
-    typename cutlass::layout::LayoutTranspose<LayoutB>::type;
-using LayoutD = cutlass::layout::RowMajor;
-using LayoutD_Transpose =
-    typename cutlass::layout::LayoutTranspose<LayoutD>::type;
-using LayoutC = LayoutD;
-using LayoutC_Transpose = LayoutD_Transpose;
+using LayoutC = cutlass::layout::RowMajor;

 template <typename ElementAB_, typename ElementC_,
          template <typename, typename, typename> typename Epilogue_,
          typename TileShape, typename ClusterShape, typename KernelSchedule,
-          typename EpilogueSchedule, bool swap_ab_ = false>
+          typename EpilogueSchedule>
 struct cutlass_3x_group_gemm {
-  static constexpr bool swap_ab = swap_ab_;
  using ElementAB = ElementAB_;
  using ElementC = void;
  using ElementD = ElementC_;
@ -46,6 +37,9 @@ struct cutlass_3x_group_gemm {

  using Epilogue = Epilogue_<ElementAccumulator, ElementD, TileShape>;

+  using StrideC =
+      cute::remove_pointer_t<cute::Stride<int64_t, cute::Int<1>, cute::Int<0>>>;
+
  static constexpr int AlignmentAB =
      128 / cutlass::sizeof_bits<ElementAB>::value;
  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementD>::value;
@ -56,26 +50,19 @@ struct cutlass_3x_group_gemm {
      typename cutlass::epilogue::collective::CollectiveBuilder<
          ArchTag, OperatorClass, TileShape, ClusterShape,
          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
-          ElementAccumulator, ElementC,
-          conditional_t<swap_ab, LayoutC_Transpose*, LayoutC*>, AlignmentC,
-          ElementD, conditional_t<swap_ab, LayoutD_Transpose*, LayoutD*>,
-          AlignmentC, EpilogueSchedule, EVTCompute>::CollectiveOp;
+          ElementAccumulator, ElementC, LayoutC*, AlignmentC, ElementD,
+          LayoutC*, AlignmentC, EpilogueSchedule, EVTCompute>::CollectiveOp;

  static constexpr size_t CEStorageSize =
      sizeof(typename CollectiveEpilogue::SharedStorage);
  using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout<
      static_cast<int>(CEStorageSize)>;

-  using CollectiveMainloop = conditional_t<
-      swap_ab,
-      typename cutlass::gemm::collective::CollectiveBuilder<
-          ArchTag, OperatorClass, ElementAB, LayoutB_Transpose*, AlignmentAB,
-          ElementAB, LayoutA_Transpose*, AlignmentAB, ElementAccumulator,
-          TileShape, ClusterShape, Stages, KernelSchedule>::CollectiveOp,
+  using CollectiveMainloop =
      typename cutlass::gemm::collective::CollectiveBuilder<
          ArchTag, OperatorClass, ElementAB, LayoutA*, AlignmentAB, ElementAB,
          LayoutB*, AlignmentAB, ElementAccumulator, TileShape, ClusterShape,
-          Stages, KernelSchedule>::CollectiveOp>;
+          Stages, KernelSchedule>::CollectiveOp;

  using KernelType = enable_sm90_only<cutlass::gemm::kernel::GemmUniversal<
      ProblemShape, CollectiveMainloop, CollectiveEpilogue>>;
@ -91,12 +78,12 @@ void cutlass_group_gemm_caller(
    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
    bool per_act_token, bool per_out_ch) {
-  static constexpr bool swap_ab = Gemm::swap_ab;
-
  using ElementAB = typename Gemm::ElementAB;
  using ElementD = typename Gemm::ElementD;

  int num_experts = static_cast<int>(expert_offsets.size(0));
+  int k_size = a_tensors.size(1);
+  int n_size = out_tensors.size(1);

  auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());

@ -123,35 +110,19 @@ void cutlass_group_gemm_caller(
          problem_sizes.data_ptr());
  ProblemShape prob_shape{num_experts, problem_sizes_as_shapes, nullptr};

-  typename GemmKernel::MainloopArguments mainloop_args;
-  if constexpr (swap_ab) {
-    mainloop_args = typename GemmKernel::MainloopArguments{
-        static_cast<const ElementAB**>(b_ptrs.data_ptr()),
-        static_cast<StrideB*>(b_strides.data_ptr()),
-        static_cast<const ElementAB**>(a_ptrs.data_ptr()),
-        static_cast<StrideA*>(a_strides.data_ptr())};
-  } else {
-    mainloop_args = typename GemmKernel::MainloopArguments{
-        static_cast<const ElementAB**>(a_ptrs.data_ptr()),
-        static_cast<StrideA*>(a_strides.data_ptr()),
-        static_cast<const ElementAB**>(b_ptrs.data_ptr()),
-        static_cast<StrideB*>(b_strides.data_ptr())};
-  }
+  typename GemmKernel::MainloopArguments mainloop_args{
+      static_cast<const ElementAB**>(a_ptrs.data_ptr()),
+      static_cast<StrideA*>(a_strides.data_ptr()),
+      static_cast<const ElementAB**>(b_ptrs.data_ptr()),
+      static_cast<StrideB*>(b_strides.data_ptr())};

  // Currently, we are only able to do broadcast on either all or none a_scales
  // and on either all or none b_scales
  typename GemmKernel::EpilogueArguments epilogue_args{
      Gemm::Epilogue::prepare_args(
-          swap_ab ? static_cast<const ElementAccumulator**>(
-                        b_scales_ptrs.data_ptr())
-                  : static_cast<const ElementAccumulator**>(
-                        a_scales_ptrs.data_ptr()),
-          swap_ab ? static_cast<const ElementAccumulator**>(
-                        a_scales_ptrs.data_ptr())
-                  : static_cast<const ElementAccumulator**>(
-                        b_scales_ptrs.data_ptr()),
-          swap_ab ? per_out_ch : per_act_token,
-          swap_ab ? per_act_token : per_out_ch),
+          static_cast<const ElementAccumulator**>(a_scales_ptrs.data_ptr()),
+          static_cast<const ElementAccumulator**>(b_scales_ptrs.data_ptr()),
+          per_act_token, per_out_ch),
      nullptr, static_cast<StrideC*>(c_strides.data_ptr()),
      static_cast<ElementD**>(out_ptrs.data_ptr()),
      static_cast<StrideC*>(c_strides.data_ptr())};
--- a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
@ -6,10 +6,7 @@
 #include <iostream>

 constexpr uint64_t THREADS_PER_EXPERT = 512;
-// threshold must match the dispatch logic in run_cutlass_moe_mm_sm90()
-constexpr int SWAP_AB_THRESHOLD = 64;

-template <bool SWAP_AB>
 __global__ void compute_problem_sizes(const int32_t* __restrict__ topk_ids,
                                      int32_t* problem_sizes1,
                                      int32_t* problem_sizes2,
@ -27,53 +24,40 @@ __global__ void compute_problem_sizes(const int32_t* __restrict__ topk_ids,

  if (threadIdx.x == 0) {
    int final_occurrences = atomic_buffer[expert_id];
-    if constexpr (!SWAP_AB) {
-      problem_sizes1[expert_id * 3] = final_occurrences;
-      problem_sizes1[expert_id * 3 + 1] = 2 * n;
-      problem_sizes1[expert_id * 3 + 2] = k;
-      problem_sizes2[expert_id * 3] = final_occurrences;
-      problem_sizes2[expert_id * 3 + 1] = k;
-      problem_sizes2[expert_id * 3 + 2] = n;
-    } else {
-      problem_sizes1[expert_id * 3] = 2 * n;
-      problem_sizes1[expert_id * 3 + 1] = final_occurrences;
-      problem_sizes1[expert_id * 3 + 2] = k;
-      problem_sizes2[expert_id * 3] = k;
-      problem_sizes2[expert_id * 3 + 1] = final_occurrences;
-      problem_sizes2[expert_id * 3 + 2] = n;
-    }
+    problem_sizes1[expert_id * 3] = final_occurrences;
+    problem_sizes1[expert_id * 3 + 1] = 2 * n;
+    problem_sizes1[expert_id * 3 + 2] = k;
+    problem_sizes2[expert_id * 3] = final_occurrences;
+    problem_sizes2[expert_id * 3 + 1] = k;
+    problem_sizes2[expert_id * 3 + 2] = n;
  }
 }

 __global__ void compute_expert_offsets(
    const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets,
-    int32_t* atomic_buffer, const int num_experts, const int topk_length) {
+    int32_t* atomic_buffer, const int num_experts) {
  int32_t tot_offset = 0;
  expert_offsets[0] = 0;
  for (int i = 0; i < num_experts; ++i) {
    atomic_buffer[i] = tot_offset;
-    tot_offset += topk_length > SWAP_AB_THRESHOLD ? problem_sizes1[i * 3]
-                                                  : problem_sizes1[i * 3 + 1];
+    tot_offset += problem_sizes1[i * 3];
    expert_offsets[i + 1] = tot_offset;
  }
 }

 __global__ void compute_expert_blockscale_offsets(
    const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets,
-    int32_t* blockscale_offsets, int32_t* atomic_buffer, const int num_experts,
-    const int topk_length) {
+    int32_t* blockscale_offsets, int32_t* atomic_buffer,
+    const int num_experts) {
  int32_t tot_offset = 0;
  int32_t tot_offset_round = 0;
  expert_offsets[0] = 0;
  blockscale_offsets[0] = 0;
  for (int i = 0; i < num_experts; ++i) {
-    int32_t cur_offset = topk_length > SWAP_AB_THRESHOLD
-                             ? problem_sizes1[i * 3]
-                             : problem_sizes1[i * 3 + 1];
    atomic_buffer[i] = tot_offset;
-    tot_offset += cur_offset;
+    tot_offset += problem_sizes1[i * 3];
    expert_offsets[i + 1] = tot_offset;
-    tot_offset_round += (cur_offset + (128 - 1)) / 128 * 128;
+    tot_offset_round += (problem_sizes1[i * 3] + (128 - 1)) / 128 * 128;
    blockscale_offsets[i + 1] = tot_offset_round;
  }
 }
@ -118,36 +102,22 @@ void get_cutlass_moe_mm_data_caller(
  torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32);

  int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
-
-  if (topk_ids.numel() > SWAP_AB_THRESHOLD) {
-    compute_problem_sizes<false><<<num_experts, num_threads, 0, stream>>>(
-        static_cast<const int32_t*>(topk_ids.data_ptr()),
-        static_cast<int32_t*>(problem_sizes1.data_ptr()),
-        static_cast<int32_t*>(problem_sizes2.data_ptr()),
-        static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n,
-        k);
-  } else {
-    compute_problem_sizes<true><<<num_experts, num_threads, 0, stream>>>(
-        static_cast<const int32_t*>(topk_ids.data_ptr()),
-        static_cast<int32_t*>(problem_sizes1.data_ptr()),
-        static_cast<int32_t*>(problem_sizes2.data_ptr()),
-        static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n,
-        k);
-  }
-
+  compute_problem_sizes<<<num_experts, num_threads, 0, stream>>>(
+      static_cast<const int32_t*>(topk_ids.data_ptr()),
+      static_cast<int32_t*>(problem_sizes1.data_ptr()),
+      static_cast<int32_t*>(problem_sizes2.data_ptr()),
+      static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n, k);
  if (blockscale_offsets.has_value()) {
    compute_expert_blockscale_offsets<<<1, 1, 0, stream>>>(
        static_cast<const int32_t*>(problem_sizes1.data_ptr()),
        static_cast<int32_t*>(expert_offsets.data_ptr()),
        static_cast<int32_t*>(blockscale_offsets.value().data_ptr()),
-        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts,
-        topk_ids.numel());
+        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
  } else {
    compute_expert_offsets<<<1, 1, 0, stream>>>(
        static_cast<const int32_t*>(problem_sizes1.data_ptr()),
        static_cast<int32_t*>(expert_offsets.data_ptr()),
-        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts,
-        topk_ids.numel());
+        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
  }
  compute_arg_sorts<<<num_experts, num_threads, 0, stream>>>(
      static_cast<const int32_t*>(topk_ids.data_ptr()),
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -20,17 +20,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // vLLM custom ops
  //

-  // The default behavior in PyTorch 2.6 was changed to "requires_contiguous",
-  // so we need
+  // The default behavior in PyTorch 2.6 is "requires_contiguous", so we need
  // to override this for many GEMMs with the following tag. Otherwise,
  // torch.compile will force all input tensors to be contiguous(), which
  // will break many custom ops that require column-major weight matrices.
-  // This was a bug and PyTorch 2.7 has since fixed this.
-#if TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 6
-  #define stride_tag at::Tag::needs_fixed_stride_order
-#else
-  #define stride_tag
-#endif
+  // TODO: remove this for PyTorch 2.8, when the default is planned to switch
+  // to match exact eager-mode strides.
+  at::Tag stride_tag = at::Tag::needs_fixed_stride_order;

  ops.def("weak_ref_tensor(Tensor input) -> Tensor");
  ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);
@ -525,14 +521,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "                         Tensor page_table, Tensor workspace, float "
      "scale,"
      "                         int num_kv_splits) -> ()");
-  // conditionally compiled so impl in source file
+  ops.impl("sm100_cutlass_mla_decode", torch::kCUDA, &sm100_cutlass_mla_decode);

  // SM100 CUTLASS MLA workspace
  ops.def(
      "sm100_cutlass_mla_get_workspace_size(int max_seq_len, int num_batches,"
      "                                     int sm_count, int num_kv_splits) "
      "-> int");
-  // conditionally compiled so impl in source file
+  ops.impl("sm100_cutlass_mla_get_workspace_size",
+           &sm100_cutlass_mla_get_workspace_size);

  // Compute NVFP4 block quantized tensor.
  ops.def(
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -63,7 +63,7 @@ ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly
 ARG PIP_KEYRING_PROVIDER=disabled
 ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}

-# Flag enables built-in KV-connector dependency libs into docker images
+# Flag to install the built-in KV-connector dependency libs into the docker image
 ARG INSTALL_KV_CONNECTORS=false

 #################### BASE BUILD IMAGE ####################
@ -207,19 +207,6 @@ ARG SCCACHE_ENDPOINT
 ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
 ARG SCCACHE_REGION_NAME=us-west-2
 ARG SCCACHE_S3_NO_CREDENTIALS=0
-
-# Flag to control whether to use pre-built vLLM wheels
-ARG VLLM_USE_PRECOMPILED
-# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed
-ENV VLLM_USE_PRECOMPILED=""
-RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \
-        export VLLM_USE_PRECOMPILED=1 && \
-        echo "Using precompiled wheels"; \
-    else \
-        unset VLLM_USE_PRECOMPILED && \
-        echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \
-    fi
-
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
@ -388,33 +375,48 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # -rw-rw-r-- 1 mgoin mgoin 205M Jun  9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
 # $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl

-# Install FlashInfer from source
+# Allow specifying a version, Git revision or local .whl file
+ARG FLASHINFER_CUDA128_INDEX_URL="https://download.pytorch.org/whl/cu128/flashinfer"
+ARG FLASHINFER_CUDA128_WHEEL="flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl"
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 ARG FLASHINFER_GIT_REF="v0.2.8rc1"
+# Flag to control whether to use pre-built FlashInfer wheels (set to false to force build from source)
+# TODO: Currently disabled because the pre-built wheels are not available for FLASHINFER_GIT_REF
+ARG USE_FLASHINFER_PREBUILT_WHEEL=false
 RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
  . /etc/environment
-    git clone --depth 1 --recursive --shallow-submodules \
-        --branch ${FLASHINFER_GIT_REF} \
-        ${FLASHINFER_GIT_REPO} flashinfer
-    # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
-    # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
-    if [[ "${CUDA_VERSION}" == 11.* ]]; then
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
-    elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
-    else
-        # CUDA 12.8+ supports 10.0a and 12.0
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
-    fi
-    echo "🏗️  Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
-    # Needed to build AOT kernels
-    pushd flashinfer
-        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-            python3 -m flashinfer.aot
-        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-            uv pip install --system --no-build-isolation .
-    popd
-    rm -rf flashinfer
+  if [ "$TARGETPLATFORM" != "linux/arm64" ]; then
+      # FlashInfer already has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
+      if [[ "$CUDA_VERSION" == 12.8* ]] && [[ "$USE_FLASHINFER_PREBUILT_WHEEL" == "true" ]]; then
+          uv pip install --system ${FLASHINFER_CUDA128_INDEX_URL}/${FLASHINFER_CUDA128_WHEEL}
+      else
+          # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
+          # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
+          if [[ "${CUDA_VERSION}" == 11.* ]]; then
+              FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
+          elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
+              FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
+          else
+              # CUDA 12.8+ supports 10.0a and 12.0
+              FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
+          fi
+          echo "🏗️  Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
+
+          git clone --depth 1 --recursive --shallow-submodules \
+            --branch ${FLASHINFER_GIT_REF} \
+            ${FLASHINFER_GIT_REPO} flashinfer
+
+          # Needed to build AOT kernels
+          pushd flashinfer
+            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
+              python3 -m flashinfer.aot
+            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
+              uv pip install --system --no-build-isolation .
+          popd
+
+          rm -rf flashinfer
+      fi \
+  fi
 BASH
 COPY examples examples
 COPY benchmarks benchmarks
@ -506,11 +508,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
        uv pip install --system -r requirements/kv_connectors.txt; \
    fi; \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        BITSANDBYTES_VERSION="0.42.0"; \
+        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
    else \
-        BITSANDBYTES_VERSION="0.46.1"; \
-    fi; \
-    uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]
+        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.46.1' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
+    fi

 ENV VLLM_USAGE_SOURCE production-docker-image

--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@ -95,7 +95,7 @@ WORKDIR /workspace/vllm
 RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
    cp requirements/test.in requirements/cpu-test.in && \
    sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
-    sed -i 's/^torch==.*/torch==2.6.0/g' requirements/cpu-test.in && \
+    sed -i 's/torch==.*/torch==2.6.0/g' requirements/cpu-test.in && \
    sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
    sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
    uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
--- a/docker/Dockerfile.hpu
+++ b/docker/Dockerfile.hpu
@ -0,0 +1,21 @@
+FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+
+COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+
+RUN pip install -v -r requirements/hpu.txt
+
+ENV no_proxy=localhost,127.0.0.1
+ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
+
+RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
+
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/docker/Dockerfile.tpu
+++ b/docker/Dockerfile.tpu
@ -1,5 +1,5 @@
-ARG NIGHTLY_DATE="20250714"
-ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE"
+ARG NIGHTLY_DATE="20250124"
+ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"

 FROM $BASE_IMAGE
 WORKDIR /workspace/vllm
--- a/docs/api/README.md
+++ b/docs/api/README.md
@ -8,6 +8,7 @@ API documentation for vLLM's configuration classes.

 - [vllm.config.ModelConfig][]
 - [vllm.config.CacheConfig][]
+- [vllm.config.TokenizerPoolConfig][]
 - [vllm.config.LoadConfig][]
 - [vllm.config.ParallelConfig][]
 - [vllm.config.SchedulerConfig][]
--- a/docs/assets/deployment/open_webui.png
+++ b/docs/assets/deployment/open_webui.png
--- a/docs/cli/README.md
+++ b/docs/cli/README.md
@ -1,7 +1,3 @@
---
-toc_depth: 4
---
-
 # vLLM CLI Guide

 The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with:
@ -46,10 +42,6 @@ Start the vLLM OpenAI Compatible API server.
    vllm serve --help=page
    ```

-### Options
-
--8<-- "docs/argparse/serve.md"
-
 ## chat

 Generate chat completions via the running API server.
--- a/docs/configuration/serve_args.md
+++ b/docs/configuration/serve_args.md
@ -5,7 +5,7 @@ The `vllm serve` command is used to launch the OpenAI-compatible server.
 ## CLI Arguments

 The `vllm serve` command is used to launch the OpenAI-compatible server.
-To see the available options, take a look at the [CLI Reference](../cli/README.md#options)!
+To see the available CLI arguments, run `vllm serve --help`!

 ## Configuration file

--- a/docs/deployment/frameworks/anyscale.md
+++ b/docs/deployment/frameworks/anyscale.md
@ -3,15 +3,6 @@
 [](){ #deployment-anyscale }

 [Anyscale](https://www.anyscale.com) is a managed, multi-cloud platform developed by the creators of Ray.
-
-Anyscale automates the entire lifecycle of Ray clusters in your AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
-without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like <gh-file:examples/online_serving/run_cluster.sh>.
-
+It hosts Ray clusters inside your own AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
+without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, or managing observability stacks.
 When serving large language models with vLLM, Anyscale can rapidly provision [production-ready HTTPS endpoints](https://docs.anyscale.com/examples/deploy-ray-serve-llms) or [fault-tolerant batch inference jobs](https://docs.anyscale.com/examples/ray-data-llm).
-
-## Production-ready vLLM on Anyscale quickstarts
-
- [Offline batch inference](https://console.anyscale.com/template-preview/llm_batch_inference?utm_source=vllm_docs)
- [Deploy vLLM services](https://console.anyscale.com/template-preview/llm_serving?utm_source=vllm_docs)
- [Curate a dataset](https://console.anyscale.com/template-preview/audio-dataset-curation-llm-judge?utm_source=vllm_docs)
- [Finetune an LLM](https://console.anyscale.com/template-preview/entity-recognition-with-llms?utm_source=vllm_docs)
--- a/docs/deployment/frameworks/open-webui.md
+++ b/docs/deployment/frameworks/open-webui.md
@ -1,42 +1,26 @@
 # Open WebUI

-[Open WebUI](https://github.com/open-webui/open-webui) is an extensible, feature-rich,
-and user-friendly self-hosted AI platform designed to operate entirely offline.
-It supports various LLM runners like Ollama and OpenAI-compatible APIs,
-with built-in RAG capabilities, making it a powerful AI deployment solution.
+1. Install [Docker](https://docs.docker.com/engine/install/).

-To get started with Open WebUI using vLLM, follow these steps:
+2. Start the vLLM server with a supported chat completion model, e.g.:

-1. Install the [Docker](https://docs.docker.com/engine/install/).
+```bash
+vllm serve qwen/Qwen1.5-0.5B-Chat
+```

-2. Start the vLLM server with a supported chat completion model:
+3. Start the [Open WebUI](https://github.com/open-webui/open-webui) Docker container (replace `<vllm serve host>` and `<vllm serve port>` with the host and port of your vLLM server):

-    ```console
-    vllm serve Qwen/Qwen3-0.6B-Chat
-    ```
+```bash
+docker run -d -p 3000:8080 \
+--name open-webui \
+-v open-webui:/app/backend/data \
+-e OPENAI_API_BASE_URL=http://<vllm serve host>:<vllm serve port>/v1 \
+--restart always \
+ghcr.io/open-webui/open-webui:main
+```

-    !!! note
-        When starting the vLLM server, be sure to specify the host and port using the `--host` and `--port` flags.
-        For example:
+4. Open it in the browser: <http://open-webui-host:3000/>

-        ```console
-        python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000
-        ```
+At the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`.

-3. Start the Open WebUI Docker container:
-
-    ```console
-    docker run -d \
-        --name open-webui \
-        -p 3000:8080 \
-        -v open-webui:/app/backend/data \
-        -e OPENAI_API_BASE_URL=http://0.0.0.0:8000/v1 \
-        --restart always \
-        ghcr.io/open-webui/open-webui:main
-    ```
-
-4. Open it in the browser: <http://open-webui-host:3000/>
-
-    At the top of the page, you should see the model `Qwen/Qwen3-0.6B-Chat`.
-
-    ![Web portal of model Qwen/Qwen3-0.6B-Chat](../../assets/deployment/open_webui.png)
+![Open WebUI web portal](../../assets/deployment/open_webui.png)
--- a/docs/design/v1/p2p_nccl_connector.md
+++ b/docs/design/v1/p2p_nccl_connector.md
@ -31,7 +31,7 @@ Each P/D instance periodically sends a heartbeat packet to the Proxy/Router (cur

 ## KV Cache Transfer Methods

-There are three methods for KVCache transfer: PUT, GET, and PUT_ASYNC. These methods can be specified using the `--kv-transfer-config` and `kv_connector_extra_config` parameters, specifically through the `send_type` field. Both PUT and PUT_ASYNC involve the P instance actively sending KVCache to the D instance. The difference is that PUT is a synchronous transfer method that blocks the main process, while PUT_ASYNC is an asynchronous transfer method. PUT_ASYNC uses a dedicated thread for sending KVCache, which means it does not block the main process. In contrast, the GET method involves the P instance saving the KVCache to the memory buffer after computing the prefill. The D instance then actively retrieves the computed KVCache from the P instance once it has allocated space for the KVCache.
+There are three methods for KVcache transfer: PUT, GET, and PUT_ASYNC. These methods can be specified using the `--kv-transfer-config` and `kv_connector_extra_config` parameters, specifically through the `send_type` field. Both PUT and PUT_ASYNC involve the P instance actively sending KVcache to the D instance. The difference is that PUT is a synchronous transfer method that blocks the main process, while PUT_ASYNC is an asynchronous transfer method. PUT_ASYNC uses a dedicated thread for sending KVcache, which means it does not block the main process. In contrast, the GET method involves the P instance saving the KVcache to the memory buffer after computing the prefill. The D instance then actively retrieves the computed KVcache from the P instance once it has allocated space for the KVcache.

 Experimental results have shown that the performance of these methods, from highest to lowest, is as follows: PUT_ASYNC → GET → PUT.

@ -39,13 +39,13 @@ Experimental results have shown that the performance of these methods, from high

 As long as the address of the counterpart is known, point-to-point KV cache transfer (using NCCL) can be performed, without being constrained by rank and world size. This supports dynamic scaling (expansion and contraction) of instances with PD disaggregation, meaning that adding or removing P/D instances does not require a full system restart.

-Each P/D instance only needs to create a single `P2pNcclEngine` instance. This instance maintains a ZMQ Server, which runs a dedicated thread to listen on the `zmq_addr` address and receive control flow requests from other instances. These requests include requests to establish an NCCL connection and requests to send KVCache metadata (such as tensor shapes and data types). However, it does not actually transmit the KVCache data itself.
+Each P/D instance only needs to create a single `P2pNcclEngine` instance. This instance maintains a ZMQ Server, which runs a dedicated thread to listen on the `zmq_addr` address and receive control flow requests from other instances. These requests include requests to establish an NCCL connection and requests to send KVcache metadata (such as tensor shapes and data types). However, it does not actually transmit the KVcache data itself.

-When a P instance and a D instance transmit KVCache for the first time, they need to establish a ZMQ connection and an NCCL group. For subsequent KVCache transmissions, this ZMQ connection and NCCL group are reused. The NCCL group consists of only two ranks, meaning the world size is equal to 2. This design is intended to support dynamic scaling, which means that adding or removing P/D instances does not require a full system restart. As long as the address of the counterpart is known, point-to-point KVCache transmission can be performed, without being restricted by rank or world size.
+When a P instance and a D instance transmit KVcache for the first time, they need to establish a ZMQ connection and an NCCL group. For subsequent KVcache transmissions, this ZMQ connection and NCCL group are reused. The NCCL group consists of only two ranks, meaning the world size is equal to 2. This design is intended to support dynamic scaling, which means that adding or removing P/D instances does not require a full system restart. As long as the address of the counterpart is known, point-to-point KVcache transmission can be performed, without being restricted by rank or world size.

 ## NCCL Group Topology

-Currently, only symmetric TP (Tensor Parallelism) methods are supported for KVCache transmission. Asymmetric TP and PP (Pipeline Parallelism) methods will be supported in the future. Figure 2 illustrates the 1P2D setup, where each instance has a TP (Tensor Parallelism) degree of 2. There are a total of 7 NCCL groups: three vLLM instances each have one NCCL group with TP=2. Additionally, the 0th GPU card of the P instance establishes an NCCL group with the 0th GPU card of each D instance. Similarly, the 1st GPU card of the P instance establishes an NCCL group with the 1st GPU card of each D instance.
+Currently, only symmetric TP (Tensor Parallelism) methods are supported for KVcache transmission. Asymmetric TP and PP (Pipeline Parallelism) methods will be supported in the future. Figure 2 illustrates the 1P2D setup, where each instance has a TP (Tensor Parallelism) degree of 2. There are a total of 7 NCCL groups: three vLLM instances each have one NCCL group with TP=2. Additionally, the 0th GPU card of the P instance establishes an NCCL group with the 0th GPU card of each D instance. Similarly, the 1st GPU card of the P instance establishes an NCCL group with the 1st GPU card of each D instance.

 ![image2](https://github.com/user-attachments/assets/837e61d6-365e-4cbf-8640-6dd7ab295b36)

@ -53,17 +53,33 @@ Each NCCL group occupies a certain amount of GPU memory buffer for communication

 ## GPU Memory Buffer and Tensor Memory Pool

-The trade-off in the size of the memory buffer is as follows: For P instances, the memory buffer is not required in PUT and PUT_ASYNC modes, but it is necessary in GET mode. For D instances, a memory buffer is needed in all three modes. The memory buffer for D instances should not be too large. Similarly, for P instances in GET mode, the memory buffer should also not be too large. The memory buffer of D instances is used to temporarily store KVCache sent by P instances. If it is too large, it will reduce the KVCache space available for normal inference by D instances, thereby decreasing the inference batch size and ultimately leading to a reduction in output throughput. The size of the memory buffer is configured by the parameter `kv_buffer_size`, measured in bytes, and is typically set to 5%～10% of the memory size.
+The trade-off in the size of the memory buffer is as follows: For P instances, the memory buffer is not required in PUT and PUT_ASYNC modes, but it is necessary in GET mode. For D instances, a memory buffer is needed in all three modes. The memory buffer for D instances should not be too large. Similarly, for P instances in GET mode, the memory buffer should also not be too large. The memory buffer of D instances is used to temporarily store KVcache sent by P instances. If it is too large, it will reduce the KVcache space available for normal inference by D instances, thereby decreasing the inference batch size and ultimately leading to a reduction in output throughput. The size of the memory buffer is configured by the parameter `kv_buffer_size`, measured in bytes, and is typically set to 5%～10% of the memory size.

-If the `--max-num-seqs` parameter for P instances is set to a large value, due to the large batch size, P instances will generate a large amount of KVCache simultaneously. This may exceed the capacity of the memory buffer of D instances, resulting in KVCache loss. Once KVCache is lost, D instances need to recompute Prefill, which is equivalent to performing Prefill twice. Consequently, the time-to-first-token (TTFT) will significantly increase, leading to degraded performance.
+If the `--max-num-seqs` parameter for P instances is set to a large value, due to the large batch size, P instances will generate a large amount of KVcache simultaneously. This may exceed the capacity of the memory buffer of D instances, resulting in KVcache loss. Once KVcache is lost, D instances need to recompute Prefill, which is equivalent to performing Prefill twice. Consequently, the time-to-first-token (TTFT) will significantly increase, leading to degraded performance.

-To address the above issues, I have designed and developed a local Tensor memory pool for storing KVCache, inspired by the buddy system used in Linux memory modules. Since the memory is sufficiently large, typically in the TB range on servers, there is no need to consider prefix caching or using block-based designs to reuse memory, thereby saving space. When the memory buffer is insufficient, KVCache can be directly stored in the Tensor memory pool, and D instances can subsequently retrieve KVCache from it. The read and write speed is that of PCIe, with PCIe 4.0 having a speed of approximately 21 GB/s, which is usually faster than the Prefill speed. Otherwise, solutions like Mooncake and lmcache would not be necessary. The Tensor memory pool acts as a flood diversion area, typically unused except during sudden traffic surges. In the worst-case scenario, my solution performs no worse than the normal situation with a Cache store.
+To address the above issues, I have designed and developed a local Tensor memory pool for storing KVcache, inspired by the buddy system used in Linux memory modules. Since the memory is sufficiently large, typically in the TB range on servers, there is no need to consider prefix caching or using block-based designs to reuse memory, thereby saving space. When the memory buffer is insufficient, KVcache can be directly stored in the Tensor memory pool, and D instances can subsequently retrieve KVcache from it. The read and write speed is that of PCIe, with PCIe 4.0 having a speed of approximately 21 GB/s, which is usually faster than the Prefill speed. Otherwise, solutions like Mooncake and lmcache would not be necessary. The Tensor memory pool acts as a flood diversion area, typically unused except during sudden traffic surges. In the worst-case scenario, my solution performs no worse than the normal situation with a Cache store.

 # Install vLLM

-```shell
-pip install "vllm>=0.9.2"
-```
+??? console "Commands"
+
+    ```shell
+    # Enter the home directory or your working directory.
+    cd /home
+
+    # Download the precompiled wheel. The commit ID in the URL is updated periodically; you can copy the command as-is.
+    wget https://vllm-wheels.s3.us-west-2.amazonaws.com/9112b443a042d8d815880b8780633882ad32b183/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+
+    # Download the code repository.
+    git clone -b xpyd-v1 https://github.com/Abatom/vllm.git
+    cd vllm
+
+    # Set the installation package path.
+    export VLLM_PRECOMPILED_WHEEL_LOCATION=/home/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+
+    # Install vLLM in editable mode.
+    pip install -e . -v
+    ```

 # Run xPyD

@ -74,7 +90,7 @@ pip install "vllm>=0.9.2"
 - You may need to modify the `kv_buffer_size` and `port` in the following commands (if there is a conflict).
 - `PUT_ASYNC` offers the best performance and should be prioritized.
 - The `--port` must be consistent with the `http_port` in the `--kv-transfer-config`.
- The `disagg_proxy_p2p_nccl_xpyd.py` script will use port 10001 (for receiving client requests) and port 30001 (for receiving service discovery from P and D instances).
+- The `disagg_prefill_proxy_xpyd.py` script will use port 10001 (for receiving client requests) and port 30001 (for receiving service discovery from P and D instances).
 - The node running the proxy must have `quart` installed.
 - Supports multiple nodes; you just need to modify the `proxy_ip` and `proxy_port` in `--kv-transfer-config`.
 - In the following examples, it is assumed that **the proxy's IP is 10.0.1.1**.
@ -84,8 +100,8 @@ pip install "vllm>=0.9.2"
 ### Proxy (e.g. 10.0.1.1)

 ```shell
-cd {your vllm directory}/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/
-python3 disagg_proxy_p2p_nccl_xpyd.py &
+cd {your vllm directory}/examples/online_serving/disagg_xpyd/
+python3 disagg_prefill_proxy_xpyd.py &
 ```

 ### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)
@ -95,7 +111,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
        --host 0.0.0.0 \
-        --port 20001 \
+        --port 20005 \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --served-model-name base_model \
@ -107,7 +123,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
        --gpu-memory-utilization 0.9 \
        --disable-log-request \
        --kv-transfer-config \
-        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20001"}}' > /var/vllm.log 2>&1 &
+        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```

 ### Decode1 (e.g. 10.0.1.3 or 10.0.1.1)
@ -117,7 +133,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
        --host 0.0.0.0 \
-        --port 20002 \
+        --port 20009 \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --served-model-name base_model \
@ -129,7 +145,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
        --gpu-memory-utilization 0.7 \
        --disable-log-request \
        --kv-transfer-config \
-        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20002"}}' > /var/vllm.log 2>&1 &
+        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```

 ### Decode2 (e.g. 10.0.1.4 or 10.0.1.1)
@ -151,7 +167,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
        --gpu-memory-utilization 0.7 \
        --disable-log-request \
        --kv-transfer-config \
-        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003"}}' > /var/vllm.log 2>&1 &
+        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```

 ### Decode3 (e.g. 10.0.1.5 or 10.0.1.1)
@ -161,7 +177,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
        --host 0.0.0.0 \
-        --port 20004 \
+        --port 20008 \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --served-model-name base_model \
@ -173,7 +189,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
        --gpu-memory-utilization 0.7 \
        --disable-log-request \
        --kv-transfer-config \
-        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20004"}}' > /var/vllm.log 2>&1 &
+        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```

 ## Run 3P1D
@ -181,8 +197,8 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ### Proxy (e.g. 10.0.1.1)

 ```shell
-cd {your vllm directory}/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/
-python3 disagg_proxy_p2p_nccl_xpyd.py &
+cd {your vllm directory}/examples/online_serving/disagg_xpyd/
+python3 disagg_prefill_proxy_xpyd.py &
 ```

 ### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)
@ -192,7 +208,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
        --host 0.0.0.0 \
-        --port 20001 \
+        --port 20005 \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --served-model-name base_model \
@ -204,7 +220,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
        --gpu-memory-utilization 0.9 \
        --disable-log-request \
        --kv-transfer-config \
-        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20001"}}' > /var/vllm.log 2>&1 &
+        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```

 ### Prefill2 (e.g. 10.0.1.3 or 10.0.1.1)
@ -214,7 +230,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
        --host 0.0.0.0 \
-        --port 20002 \
+        --port 20009 \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --served-model-name base_model \
@ -226,7 +242,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
        --gpu-memory-utilization 0.9 \
        --disable-log-request \
        --kv-transfer-config \
-        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20002"}}' > /var/vllm.log 2>&1 &
+        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```

 ### Prefill3 (e.g. 10.0.1.4 or 10.0.1.1)
@ -248,7 +264,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
        --gpu-memory-utilization 0.9 \
        --disable-log-request \
        --kv-transfer-config \
-        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003"}}' > /var/vllm.log 2>&1 &
+        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```

 ### Decode1 (e.g. 10.0.1.5 or 10.0.1.1)
@ -258,7 +274,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
        --host 0.0.0.0 \
-        --port 20004 \
+        --port 20008 \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --served-model-name base_model \
@ -270,7 +286,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
        --gpu-memory-utilization 0.7 \
        --disable-log-request \
        --kv-transfer-config \
-        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20004"}}' > /var/vllm.log 2>&1 &
+        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```

 # Single request
@ -318,6 +334,24 @@ pgrep python | xargs kill -9 && pkill -f python

 # Test data

-## **Scenario**: 1K input & 200 output tokens, E2E P99 latency ~2s
+## **Scenario 1**: 1K input & 1K output tokens, E2E P99 latency ~20s
+- **1P5D (6×A800) vs vLLM (1×A800)**:
+  - Throughput ↑7.2% (1085 → 6979/6)
+  - ITL (P99) ↓81.3% (120ms → 22.9ms)
+  - TTFT (P99) ↑26.8% (175ms → 222ms)
+  - TPOT: No change

-![testdata](https://github.com/user-attachments/assets/cef0953b-4567-4bf9-b940-405b92a28eb1)
+- **1P6D (7×A800) vs vLLM (1×A800)**:
+  - Throughput ↑9.6% (1085 → 8329/7)
+  - ITL (P99) ↓81.0% (120ms → 22.7ms)
+  - TTFT (P99) ↑210% (175ms → 543ms)
+  - TPOT: No change
+
+## **Scenario 2**: 1K input & 200 output tokens, E2E P99 latency ~4s
+- **1P1D (2×A800) vs vLLM (1×A800)**:
+  - Throughput ↑37.4% (537 → 1476/2)
+  - ITL (P99) ↓81.8% (127ms → 23.1ms)
+  - TTFT (P99) ↑41.8% (160ms → 227ms)
+  - TPOT: No change
+
+![testdata](https://github.com/user-attachments/assets/f791bfc7-9f3d-4e5c-9171-a42f9f4da627)
--- a/docs/features/quantization/README.md
+++ b/docs/features/quantization/README.md
@ -10,7 +10,6 @@ Contents:
 - [BitBLAS](bitblas.md)
 - [GGUF](gguf.md)
 - [GPTQModel](gptqmodel.md)
- [INC](inc.md)
 - [INT4 W4A16](int4.md)
 - [INT8 W8A8](int8.md)
 - [FP8 W8A8](fp8.md)
--- a/docs/features/quantization/inc.md
+++ b/docs/features/quantization/inc.md
@ -1,56 +0,0 @@
---
-title: FP8 INC
---
-[](){ #inc }
-
-vLLM supports FP8 (8-bit floating point) weight and activation quantization using Intel® Neural Compressor (INC) on Intel® Gaudi® 2 and Intel® Gaudi® 3 AI accelerators.
-Currently, quantization is validated only in Llama models.
-
-Intel Gaudi supports quantization of various modules and functions, including, but not limited to `Linear`, `KVCache`, `Matmul` and `Softmax`. For more information, please refer to:
-[Supported Modules\\Supported Functions\\Custom Patched Modules](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html#supported-modules).
-
-!!! note
-    Measurement files are required to run quantized models with vLLM on Gaudi accelerators. The FP8 model calibration procedure is described in the [vllm-hpu-extention](https://github.com/HabanaAI/vllm-hpu-extension/tree/main/calibration/README.md) package.
-
-!!! note
-    `QUANT_CONFIG` is an environment variable that points to the measurement or quantization [JSON config file](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html#supported-json-config-file-options).
-    The measurement configuration file is used during the calibration procedure to collect measurements for a given model. The quantization configuration is used during inference.
-
-## Run Online Inference Using FP8
-
-Once you've completed the model calibration process and collected the measurements, you can run FP8 inference with vLLM using the following command:
-
-```bash
-export QUANT_CONFIG=/path/to/quant/config/inc/meta-llama-3.1-405b-instruct/maxabs_measure_g3.json
-vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --tensor_paralel_size 8
-```
-
-!!! tip
-    If you are just prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which can take a long time. However, we do not recommend disabling this feature in production environments as it causes a significant performance drop.
-
-!!! tip
-    When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this problem, you can use the below environment variables:
-    `VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes.
-    `VLLM_RPC_TIMEOUT` - to adjust the RPC protocol timeout used by the OpenAI-compatible API. This value is in microseconds, e.g., 600000 equals 10 minutes.
-
-## Run Offline Inference Using FP8
-
-To run offline inference (after completing the model calibration process):
-
-* Set the "QUANT_CONFIG" environment variable to point to a JSON configuration file with QUANTIZE mode.
-* Pass `quantization=inc` and `kv_cache_dtype=fp8_inc` as parameters to the `LLM` object.
-* Call shutdown method of the model_executor at the end of the run.
-
-```python
-from vllm import LLM
-llm = LLM("llama3.1/Meta-Llama-3.1-8B-Instruct", quantization="inc", kv_cache_dtype="fp8_inc")
-...
-# Call llm.generate on the required prompts and sampling params.
-...
-llm.llm_engine.model_executor.shutdown()
-```
-
-## Device for the Model's Weights Uploading
-
-The unquantized weights are first loaded onto the CPU, then quantized and transferred to the target device (HPU) for model execution.
-This reduces the device memory footprint of model weights, as only quantized weights are stored in the device memory.
--- a/docs/features/quantization/supported_hardware.md
+++ b/docs/features/quantization/supported_hardware.md
@ -2,19 +2,18 @@

 The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:

-| Implementation        | Volta   | Turing   | Ampere   | Ada   | Hopper   | AMD GPU   | Intel GPU   | Intel Gaudi | x86 CPU   | AWS Neuron   | Google TPU   |
-|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------|--------------|
-| AWQ                   | ❌      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌        | ✅︎          | ❌         | ✅︎        | ❌           | ❌           |
-| GPTQ                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎        | ❌        | ✅︎          | ❌         | ✅︎        | ❌           | ❌           |
-| Marlin (GPTQ/AWQ/FP8) | ❌      | ❌      | ✅︎       | ✅︎    | ✅︎       | ❌        | ❌          | ❌         | ❌        | ❌          | ❌           |
-| INT8 (W8A8)           | ❌      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌        | ❌          | ❌         | ✅︎        | ✅︎           | ✅︎            |
-| FP8 (W8A8)            | ❌      | ❌      | ❌       | ✅︎    | ✅︎      | ✅︎         | ❌          | ❌         | ❌        | ✅︎           | ❌           |
-| BitBLAS (GPTQ)        | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| AQLM                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| bitsandbytes          | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| DeepSpeedFP           | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| GGUF                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ✅︎         | ❌          | ❌         | ❌         | ❌          | ❌           |
-| INC (W8A8)            | ❌      | ❌      | ❌      | ❌    | ❌      | ❌        | ❌          | ✅︎         | ❌         | ❌           | ❌          |
+| Implementation        | Volta   | Turing   | Ampere   | Ada   | Hopper   | AMD GPU   | Intel GPU   | x86 CPU   | AWS Neuron   | Google TPU   |
+|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-----------|------------------|--------------|
+| AWQ                   | ❌       | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ✅︎        | ❌                | ❌            |
+| GPTQ                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ✅︎        | ❌                | ❌            |
+| Marlin (GPTQ/AWQ/FP8) | ❌       | ❌        | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
+| INT8 (W8A8)           | ❌       | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ✅︎        | ✅︎                | ✅︎           |
+| FP8 (W8A8)            | ❌       | ❌        | ❌        | ✅︎    | ✅︎       | ✅︎        | ❌           | ❌         | ✅︎                | ❌            |
+| BitBLAS (GPTQ)        | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
+| AQLM                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
+| bitsandbytes          | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
+| DeepSpeedFP           | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
+| GGUF                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ✅︎        | ❌           | ❌         | ❌                | ❌            |

 - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
 - ✅︎ indicates that the quantization method is supported on the specified hardware.
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@ -14,7 +14,6 @@ vLLM currently supports the following reasoning models:
 | [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ |
 | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
 | [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ |
-| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `guided_json`, `guided_regex` | ✅ |

 !!! note
    IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@ -103,7 +103,9 @@ When tool_choice='required' is set, the model is guaranteed to generate one or m

 vLLM supports the `tool_choice='none'` option in the chat completion API. When this option is set, the model will not generate any tool calls and will respond with regular text content only, even if tools are defined in the request.

-However, when `tool_choice='none'` is specified, vLLM includes tool definitions from the prompt.
+By default, when `tool_choice='none'` is specified, vLLM excludes tool definitions from the prompt to optimize context usage. To include tool definitions even with `tool_choice='none'`, use the `--expand-tools-even-if-tool-choice-none` option.
+
+Note: This behavior will change in v0.10.0, where tool definitions will be included by default even with `tool_choice='none'`.

 ## Automatic Function Calling

@ -288,16 +290,6 @@ Supported models:

 Flags: `--tool-call-parser kimi_k2`

-### Hunyuan Models (`hunyuan_a13b`)
-
-Supported models:
-
-* `tencent/Hunyuan-A13B-Instruct` (chat template already included huggingface model file.)
-
-Flags:
-* For non-reasoning: `--tool-call-parser hunyuan_a13b`
-* For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b --enable_reasoning`
-
 ### Models with Pythonic Tool Calls (`pythonic`)

 A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.
--- a/docs/getting_started/installation/google_tpu.md
+++ b/docs/getting_started/installation/google_tpu.md
@ -37,7 +37,7 @@ information, see [Storage options for Cloud TPU data](https://cloud.devsite.corp

 - Google Cloud TPU VM
 - TPU versions: v6e, v5e, v5p, v4
- Python: 3.11 or newer
+- Python: 3.10 or newer

 ### Provision Cloud TPUs

@ -117,7 +117,7 @@ source ~/.bashrc
 Create and activate a Conda environment for vLLM:

 ```bash
-conda create -n vllm python=3.12 -y
+conda create -n vllm python=3.10 -y
 conda activate vllm
 ```

--- a/docs/getting_started/installation/intel_gaudi.md
+++ b/docs/getting_started/installation/intel_gaudi.md
@ -28,7 +28,7 @@ To verify that the Intel Gaudi software was correctly installed, run:
 hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
 apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
 pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
-pip list | grep neural # verify that neural_compressor_pt is installed
+pip list | grep neural # verify that neural_compressor is installed
 ```

 Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade)
@ -120,13 +120,12 @@ docker run \
 - Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html)
  for accelerating low-batch latency and throughput
 - Attention with Linear Biases (ALiBi)
- INC quantization

 ### Unsupported features

 - Beam search
 - LoRA adapters
- AWQ quantization
+- Quantization
 - Prefill chunking (mixed-batch inferencing)

 ### Supported configurations
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@ -16,7 +16,6 @@ sys.modules["blake3"] = MagicMock()
 sys.modules["vllm._C"] = MagicMock()

 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs  # noqa: E402
-from vllm.entrypoints.openai.cli_args import make_arg_parser  # noqa: E402
 from vllm.utils import FlexibleArgumentParser  # noqa: E402

 logger = logging.getLogger("mkdocs")
@ -25,18 +24,15 @@ logger = logging.getLogger("mkdocs")
 class MarkdownFormatter(HelpFormatter):
    """Custom formatter that generates markdown for argument groups."""

-    def __init__(self, prog, starting_heading_level=3):
+    def __init__(self, prog):
        super().__init__(prog,
                         max_help_position=float('inf'),
                         width=float('inf'))
-        self._section_heading_prefix = "#" * starting_heading_level
-        self._argument_heading_prefix = "#" * (starting_heading_level + 1)
        self._markdown_output = []

    def start_section(self, heading):
        if heading not in {"positional arguments", "options"}:
-            heading_md = f"\n{self._section_heading_prefix} {heading}\n\n"
-            self._markdown_output.append(heading_md)
+            self._markdown_output.append(f"\n### {heading}\n\n")

    def end_section(self):
        pass
@ -50,13 +46,9 @@ class MarkdownFormatter(HelpFormatter):

    def add_arguments(self, actions):
        for action in actions:
-            if (len(action.option_strings) == 0
-                    or "--help" in action.option_strings):
-                continue

            option_strings = f'`{"`, `".join(action.option_strings)}`'
-            heading_md = f"{self._argument_heading_prefix} {option_strings}\n\n"
-            self._markdown_output.append(heading_md)
+            self._markdown_output.append(f"#### {option_strings}\n\n")

            if choices := action.choices:
                choices = f'`{"`, `".join(str(c) for c in choices)}`'
@ -89,14 +81,6 @@ def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
        return cls.add_cli_args(parser, **kwargs)


-def create_serve_parser() -> FlexibleArgumentParser:
-    """Create a parser for the serve command with markdown formatting."""
-    parser = FlexibleArgumentParser()
-    parser.formatter_class = lambda prog: MarkdownFormatter(
-        prog, starting_heading_level=4)
-    return make_arg_parser(parser)
-
-
 def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
    logger.info("Generating argparse documentation")
    logger.debug("Root directory: %s", ROOT_DIR.resolve())
@ -111,7 +95,6 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
        "engine_args": create_parser(EngineArgs),
        "async_engine_args": create_parser(AsyncEngineArgs,
                                           async_args_only=True),
-        "serve": create_serve_parser(),
    }

    # Generate documentation for each parser
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@ -331,7 +331,6 @@ Specified using `--task generate`.
 | `Ernie4_5_ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | | ✅︎ | ✅︎ |
 | `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. | | ✅︎ | ✅︎ |
 | `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Fairseq2LlamaForCausalLM` | Llama (fairseq2 format) | `mgleize/fairseq2-dummy-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ | ✅︎ |
 | `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | | ✅︎ | ✅︎ |
 | `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
@ -360,7 +359,6 @@ Specified using `--task generate`.
 | `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | |
 | `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ | ✅︎ |
-| `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
@ -382,9 +380,9 @@ Specified using `--task generate`.
 | `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | | |
 | `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | | ✅︎ | ✅︎ |
 | `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | | ✅︎ | ✅︎ |
 | `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | ✅︎ |
 | `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ | ✅︎ |
 | `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
@ -575,7 +573,7 @@ Specified using `--task generate`.
 | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ |
 | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ |
 | `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `THUDM/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `THUDM/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
 | `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
 | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
@ -583,7 +581,6 @@ Specified using `--task generate`.
 | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
 | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
 | `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
-| `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + I<sup>E+</sup> | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ | ✅︎ |
 | `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ | ✅︎ |
 | `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | ✅︎ |
 | `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ |
--- a/docs/serving/offline_inference.md
+++ b/docs/serving/offline_inference.md
@ -30,31 +30,8 @@ This API adds several batteries-included capabilities that simplify large-scale,
 - Automatic sharding, load balancing, and autoscaling distribute work across a Ray cluster with built-in fault tolerance.
 - Continuous batching keeps vLLM replicas saturated and maximizes GPU utilization.
 - Transparent support for tensor and pipeline parallelism enables efficient multi-GPU inference.
- Reading and writing to most popular file formats and cloud object storage.
- Scaling up the workload without code changes.

-??? code
-
-    ```python
-    import ray  # Requires ray>=2.44.1
-    from ray.data.llm import vLLMEngineProcessorConfig, build_llm_processor
-
-    config = vLLMEngineProcessorConfig(model_source="unsloth/Llama-3.2-1B-Instruct")
-    processor = build_llm_processor(
-        config,
-        preprocess=lambda row: {
-            "messages": [
-                {"role": "system", "content": "You are a bot that completes unfinished haikus."},
-                {"role": "user", "content": row["item"]},
-            ],
-            "sampling_params": {"temperature": 0.3, "max_tokens": 250},
-        },
-        postprocess=lambda row: {"answer": row["generated_text"]},
-    )
-
-    ds = ray.data.from_items(["An old silent pond..."])
-    ds = processor(ds)
-    ds.write_parquet("local:///tmp/data/")
-    ```
+The following example shows how to run batched inference with Ray Data and vLLM:
+<gh-file:examples/offline_inference/batch_llm_inference.py>

 For more information about the Ray Data LLM API, see the [Ray Data LLM documentation](https://docs.ray.io/en/latest/data/working-with-llms.html).
--- a/docs/usage/v1_guide.md
+++ b/docs/usage/v1_guide.md
@ -106,7 +106,7 @@ to enable simultaneous generation and embedding using the same engine instance i

 Models using selective state-space mechanisms instead of standard transformer attention are partially supported.
 Models that use Mamba-2 layers (e.g., `Mamba2ForCausalLM`) are supported, but models that use older Mamba-1 layers
-(e.g., `MambaForCausalLM`, `JambaForCausalLM`) are not yet supported. Please note that these models currently require
+(e.g., `MambaForCausalLM`, `JambaForCausalLM`) are not yet supported. Please note that these models currently require
 enforcing eager mode and disabling prefix caching in V1.

 Models that combine Mamba-2 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@ -10,7 +10,7 @@ on HuggingFace model repository.

 import os
 from dataclasses import asdict
-from typing import Any, NamedTuple, Optional
+from typing import NamedTuple, Optional

 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
@ -30,9 +30,7 @@ question_per_audio_count = {

 class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
-    prompt: Optional[str] = None
-    prompt_token_ids: Optional[dict[str, list[int]]] = None
-    multi_modal_data: Optional[dict[str, Any]] = None
+    prompt: str
    stop_token_ids: Optional[list[int]] = None
    lora_requests: Optional[list[LoRARequest]] = None

@ -42,60 +40,6 @@ class ModelRequestData(NamedTuple):
 # Unless specified, these settings have been tested to work on a single L4.


-# Voxtral
-def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
-    from mistral_common.audio import Audio
-    from mistral_common.protocol.instruct.messages import (
-        AudioChunk,
-        RawAudio,
-        TextChunk,
-        UserMessage,
-    )
-    from mistral_common.protocol.instruct.request import ChatCompletionRequest
-    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
-
-    model_name = "mistralai/Voxtral-Mini-3B-2507"
-    tokenizer = MistralTokenizer.from_hf_hub(model_name)
-
-    engine_args = EngineArgs(
-        model=model_name,
-        max_model_len=8192,
-        max_num_seqs=2,
-        limit_mm_per_prompt={"audio": audio_count},
-        config_format="mistral",
-        load_format="mistral",
-        tokenizer_mode="mistral",
-        enforce_eager=True,
-        enable_chunked_prefill=False,
-    )
-
-    text_chunk = TextChunk(text=question)
-    audios = [
-        Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
-        for i in range(audio_count)
-    ]
-    audio_chunks = [
-        AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
-    ]
-
-    messages = [UserMessage(content=[*audio_chunks, text_chunk])]
-
-    req = ChatCompletionRequest(messages=messages, model=model_name)
-
-    tokens = tokenizer.encode_chat_completion(req)
-    prompt_ids, audios = tokens.tokens, tokens.audios
-
-    audios_and_sr = [(au.audio_array, au.sampling_rate) for au in audios]
-
-    multi_modal_data = {"audio": audios_and_sr}
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompt_token_ids=prompt_ids,
-        multi_modal_data=multi_modal_data,
-    )
-
-
 # Granite Speech
 def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
    # NOTE - the setting in this example are somehat different than what is
@ -299,7 +243,6 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:


 model_example_map = {
-    "voxtral": run_voxtral,
    "granite_speech": run_granite_speech,
    "minicpmo": run_minicpmo,
    "phi4_mm": run_phi4mm,
@ -368,24 +311,16 @@ def main(args):
        temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
    )

-    mm_data = req_data.multi_modal_data
-    if not mm_data:
-        mm_data = {}
-        if audio_count > 0:
-            mm_data = {
-                "audio": [
-                    asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
-                ]
-            }
+    mm_data = {}
+    if audio_count > 0:
+        mm_data = {
+            "audio": [
+                asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
+            ]
+        }

    assert args.num_prompts > 0
-    inputs = {"multi_modal_data": mm_data}
-
-    if req_data.prompt:
-        inputs["prompt"] = req_data.prompt
-    else:
-        inputs["prompt_token_ids"] = req_data.prompt_token_ids
-
+    inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data}
    if args.num_prompts > 1:
        # Batch inference
        inputs = [inputs] * args.num_prompts
--- a/examples/offline_inference/skip_loading_weights_in_engine_init.py
+++ b/examples/offline_inference/skip_loading_weights_in_engine_init.py
@ -1,53 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from vllm import LLM, RequestOutput, SamplingParams
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-
-def print_prompts_and_outputs(outputs: list[RequestOutput]) -> None:
-    print("-" * 60)
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt:    {prompt!r}")
-        print(f"Output:    {generated_text!r}")
-        print("-" * 60)
-
-
-def main():
-    # Create an LLM without loading real weights
-    llm = LLM(
-        model="Qwen/Qwen3-0.6B",
-        load_format="dummy",
-        enforce_eager=True,
-        tensor_parallel_size=4,
-    )
-    outputs = llm.generate(prompts, sampling_params)
-    print("\nOutputs do not make sense:")
-    print_prompts_and_outputs(outputs)
-
-    # Update load format from `dummy` to `auto`
-    llm.collective_rpc(
-        "update_config", args=({"load_config": {"load_format": "auto"}},)
-    )
-    # Now reload real weights inplace
-    llm.collective_rpc("reload_weights")
-
-    # Check outputs make sense
-    outputs = llm.generate(prompts, sampling_params)
-    print("\nOutputs make sense after loading real weights:")
-    print_prompts_and_outputs(outputs)
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@ -84,7 +84,6 @@ def main():
        gpu_memory_utilization=0.8,
        speculative_config=speculative_config,
        disable_log_stats=False,
-        max_model_len=16384,
    )

    sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len)
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@ -429,44 +429,6 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    )


-# Nemontron_VL
-def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData:
-    model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
-
-    engine_args = EngineArgs(
-        model=model_name,
-        trust_remote_code=True,
-        max_model_len=8192,
-        limit_mm_per_prompt={modality: 1},
-    )
-
-    assert modality == "image"
-    placeholder = "<image>"
-
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-    messages = [
-        [{"role": "user", "content": f"{placeholder}\n{question}"}]
-        for question in questions
-    ]
-    prompts = tokenizer.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-
-    # Stop tokens for InternVL
-    # models variants may have different stop tokens
-    # please refer to the model card for the correct "stop words":
-    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
-    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
-    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
-    stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None]
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompts=prompts,
-        stop_token_ids=stop_token_ids,
-    )
-
-
 # Keye-VL
 def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "Kwai-Keye/Keye-VL-8B-Preview"
@ -1224,7 +1186,6 @@ model_example_map = {
    "h2ovl_chat": run_h2ovl,
    "idefics3": run_idefics3,
    "internvl_chat": run_internvl,
-    "nemotron_vl": run_nemotron_vl,
    "keye_vl": run_keye_vl,
    "kimi_vl": run_kimi_vl,
    "llava": run_llava,
--- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py
+++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py
@ -4,9 +4,7 @@
 import os
 import socket
 import threading
-import time
 import uuid
-from typing import Any

 import aiohttp
 import msgpack
@ -14,25 +12,12 @@ import zmq
 from quart import Quart, make_response, request

 count = 0
-prefill_instances: dict[str, Any] = {}  # http_address: (zmq_address, stamp)
-decode_instances: dict[str, Any] = {}  # http_address: (zmq_address, stamp)
+prefill_instances: dict[str, str] = {}  # http_address: zmq_address
+decode_instances: dict[str, str] = {}  # http_address: zmq_address

 prefill_cv = threading.Condition()
 decode_cv = threading.Condition()

-DEFAULT_PING_SECONDS = 5
-
-
-def _remove_oldest_instances(instances: dict[str, Any]) -> None:
-    oldest_key = next(iter(instances), None)
-    while oldest_key is not None:
-        value = instances[oldest_key]
-        if value[1] > time.time():
-            break
-        print(f"🔴Remove [HTTP:{oldest_key}, ZMQ:{value[0]}, stamp:{value[1]}]")
-        instances.pop(oldest_key, None)
-        oldest_key = next(iter(instances), None)
-

 def _listen_for_register(poller, router_socket):
    while True:
@ -46,23 +31,12 @@ def _listen_for_register(poller, router_socket):
                global prefill_instances
                global prefill_cv
                with prefill_cv:
-                    node = prefill_instances.pop(data["http_address"], None)
-                    prefill_instances[data["http_address"]] = (
-                        data["zmq_address"],
-                        time.time() + DEFAULT_PING_SECONDS,
-                    )
-                    _remove_oldest_instances(prefill_instances)
-
+                    prefill_instances[data["http_address"]] = data["zmq_address"]
            elif data["type"] == "D":
                global decode_instances
                global decode_cv
                with decode_cv:
-                    node = decode_instances.pop(data["http_address"], None)
-                    decode_instances[data["http_address"]] = (
-                        data["zmq_address"],
-                        time.time() + DEFAULT_PING_SECONDS,
-                    )
-                    _remove_oldest_instances(decode_instances)
+                    decode_instances[data["http_address"]] = data["zmq_address"]
            else:
                print(
                    "Unexpected, Received message from %s, data: %s",
@ -70,9 +44,6 @@ def _listen_for_register(poller, router_socket):
                    data,
                )

-            if node is None:
-                print(f"🔵Add [HTTP:{data['http_address']}, ZMQ:{data['zmq_address']}]")
-

 def start_service_discovery(hostname, port):
    if not hostname:
@ -134,14 +105,12 @@ async def handle_request():
        with prefill_cv:
            prefill_list = list(prefill_instances.items())
            prefill_addr, prefill_zmq_addr = prefill_list[count % len(prefill_list)]
-            prefill_zmq_addr = prefill_zmq_addr[0]

        global decode_instances
        global decode_cv
        with decode_cv:
            decode_list = list(decode_instances.items())
            decode_addr, decode_zmq_addr = decode_list[count % len(decode_list)]
-            decode_zmq_addr = decode_zmq_addr[0]

        print(
            f"handle_request count: {count}, [HTTP:{prefill_addr}, "
--- a/examples/tool_chat_template_hunyuan_a13b.jinja
+++ b/examples/tool_chat_template_hunyuan_a13b.jinja
@ -1,113 +0,0 @@
-{% set loop_messages = messages %}
-{% if tools %}
-    {% set weekday_map = {'Monday': '星期一', 'Tuesday': '星期二', 'Wednesday': '星期三', 'Thursday': '星期四', 'Friday': '星期五', 'Saturday': '星期六', 'Sunday': '星期日'} %}
-    {% set weekday_cn = weekday_map[strftime_now('%A')] %}
-    {% set datetime_str = strftime_now('%Y-%m-%d %H:%M:%S') %}
-    {% set datetime_str = datetime_str + ' ' + weekday_cn %}
-    {% for message in loop_messages %}
-        {% if 'content' in message %}
-            {% set content = message['content'] %}
-        {% else %}
-            {% set content = '' %}
-        {% endif %}
-        {% if loop.index0 == 0 %}
-            {% set content_tmp = '你是一位函数组合专家。你会得到一个问题和一组可能的函数。根据问题，你需要进行一个或多个函数/工具调用以实现目的。
-如果没有一个函数可以使用，请直接使用自然语言回复用户，以助手：开头。
-如果给定的问题缺少函数所需的参数，请使用自然语言进行提问，向用户询问必要信息，以助手：开头。
-如果调用结果已经足够回答用户问题，请对历史结果进行总结，使用自然语言回复用户，以助手：开头。
-你应该只在工具调用部分返回函数调用。如果你决定调用任何函数，你必须将其格式化为<tool_calls>[{"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}},...]</tool_calls>。你不应该在回复中包含任何其他文本。以下是你可以调用的函数列表，格式为JSON。
-' %}
-            {% set content_tmp = content_tmp + '
-' + tools | tojson + '
-' %}
-            {% if message['role'] == 'system' %}
-                {% set content_tmp = content_tmp + '
-额外要求：
-' + content + '
-
-如果你决定返回函数调用，请将其格式化为<tool_calls>[{"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}},...]</tool_calls>，不得包含其他文本。如果额外要求里有格式要求，请忽略，以此处为准。
-否则，请参考开头说的三种情况，以助手：开头进行回复。
-
-如果额外要求里有时间信息，就以额外要求里的时间为准，否则，参考当前时间：' + datetime_str %}
-                {% set content = '<|startoftext|>' + content_tmp + '<|extra_4|>' %}
-            {% elif message['role'] == 'user' %}
-                {% set content_tmp = content_tmp + '
-如果你决定返回函数调用，请将其格式化为<tool_calls>[{"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}},...]</tool_calls>，不得包含其他文本。
-否则，请参考开头说的三种情况，以助手：开头进行回复。
-
-当前时间：' + datetime_str %}
-                {% set content_tmp = '<|startoftext|>' + content_tmp + '<|extra_4|>'%}
-                {% set content = content_tmp + '用户：' + content + '<|extra_0|>' %}
-            {% endif %}
-        {% else %}
-            {% if message['role'] == 'user' %}
-                {% set content = '用户：' + content + '<|extra_0|>' %}
-            {% elif message['role'] == 'assistant' %}
-                {% if 'tool_calls' in message %}
-                    {% set tool_calls = message['tool_calls'] %}
-                    {% set ns = namespace(tool_calls="[") %}
-                    {% for tool_call in tool_calls %}
-                        {% set function = tool_call['function'] %}
-                        {% set name = function['name'] %}
-                        {% set ns.tool_calls = ns.tool_calls + '{"name": "' + name + '", '%}
-                        {% set arguments = function['arguments'] %}
-                        {% if arguments is not string %}
-                            {% set arguments = arguments | tojson %}
-                        {% endif %}
-                        {% set ns.tool_calls = ns.tool_calls + '"arguments": ' + arguments + '}' %}
-                        {% if not loop.last %}
-                            {% set ns.tool_calls = ns.tool_calls + ', '%}
-                        {% endif %}
-                    {% endfor %}
-                    {% set ns.tool_calls = ns.tool_calls + ']' %}
-                    {% set content = content + '<tool_calls>' + ns.tool_calls + '</tool_calls>' %}
-                {% else %}
-                    {% set content = '助手：' + content %}
-                {% endif %}
-                {% set content = content + '<|eos|>' %}
-            {% elif message['role'] == 'tool' %}
-                {% if content is not string %}
-                    {set content = content | tojson }
-                {% endif %}
-                {% set content = '<tool_response>' + content + '</tool_response>' %}
-                {% set content = content + '<|extra_0|>' %}
-            {% endif %}
-        {% endif %}
-    {{- content -}}
-    {% endfor %}
-{% else %}
-    {% set context = {'has_head': true} %}
-    {% for message in loop_messages %}
-        {% if 'content' in message %}
-            {% set content = message['content'] %}
-        {% else %}
-            {% set content = '' %}
-        {% endif %}
-        {% if loop.index0 == 0 %}
-            {% if content == '' %}
-                {% set _ = context.update({'has_head': false}) %}
-            {% elif message['role'] == 'system' %}
-                {% set content = '<|startoftext|>' + content + '<|extra_4|>' %}
-            {% endif %}
-        {% endif %}
-        {% if message['role'] == 'user' %}
-            {% if loop.index0 == 1 and not context.has_head %}
-                {% set content = '<|startoftext|>' + content %}
-            {% endif %}
-            {% if loop.index0 == 1 and context.has_head %}
-                {% set content = content + '<|extra_0|>' %}
-            {% else %}
-                {% set content = '<|startoftext|>' + content + '<|extra_0|>' %}
-            {% endif %}
-        {% elif message['role'] == 'assistant' %}
-            {% set content = content + '<|eos|>' %}
-        {% elif message['role'] == 'tool' %}
-            {% set content = content + '<|extra_0|>' %}
-        {% endif %}
-        {{- content -}}
-    {% endfor %}
-{% endif %}
-{%- if enable_thinking is defined and enable_thinking is false %}
-    {{- '<think>\n\n</think>\n' }}
-{%- endif %}
-
--- a/pyproject.toml
+++ b/pyproject.toml
@ -6,7 +6,7 @@ requires = [
    "packaging>=24.2",
    "setuptools>=77.0.3,<80.0.0",
    "setuptools-scm>=8.0",
-    "torch == 2.7.1",
+    "torch == 2.7.0",
    "wheel",
    "jinja2",
 ]
@ -174,186 +174,3 @@ respect-ignore-files = true

 [tool.ty.environment]
 python = "./.venv"
-
-[tool.typos.files]
-# these files may be written in non english words
-extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
-    "benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
-    "vllm/third_party/*"]
-ignore-hidden = true
-ignore-files = true
-ignore-dot = true
-ignore-vcs = true
-ignore-global = true
-ignore-parent = true
-
-[tool.typos.default]
-binary = false
-check-filename = false
-check-file = true
-unicode = true
-ignore-hex = true
-identifier-leading-digits = false
-locale = "en"
-extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw",
-    ".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*",
-     ".*[Tt]h[rR].*"]
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[tool.typos.default.extend-identifiers]
-bbc5b7ede = "bbc5b7ede"
-womens_doubles = "womens_doubles"
-v_2nd = "v_2nd"
-# splitted_input = "splitted_input"
-NOOPs = "NOOPs"
-typ = "typ"
-nin_shortcut = "nin_shortcut"
-UperNetDecoder = "UperNetDecoder"
-subtile = "subtile"
-cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin"
-SFOuput = "SFOuput"
-# huggingface transformers repo uses these words
-depthwise_seperable_out_channel = "depthwise_seperable_out_channel"
-DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d"
-depthwise_seperable_CNN = "depthwise_seperable_CNN"
-
-[tool.typos.default.extend-words]
-iy = "iy"
-tendencias = "tendencias"
-# intel cpu features
-tme = "tme"
-dout = "dout"
-Pn = "Pn"
-arange = "arange"
-
-[tool.typos.type.py]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[tool.typos.type.py.extend-identifiers]
-arange = "arange"
-NDArray = "NDArray"
-EOFError = "EOFError"
-fo = "fo"
-ba = "ba"
-
-[tool.typos.type.py.extend-words]
-
-[tool.typos.type.cpp]
-extend-glob = ["*.cu"]
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[tool.typos.type.cpp.extend-identifiers]
-countr_one = "countr_one"
-k_ot = "k_ot"
-ot = "ot"
-
-[tool.typos.type.cpp.extend-words]
-
-[tool.typos.type.rust]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[tool.typos.type.rust.extend-identifiers]
-flate2 = "flate2"
-
-[tool.typos.type.rust.extend-words]
-ser = "ser"
-
-[tool.typos.type.lock]
-extend-glob = []
-check-file = false
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[tool.typos.type.lock.extend-identifiers]
-
-[tool.typos.type.lock.extend-words]
-
-[tool.typos.type.jl]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[tool.typos.type.jl.extend-identifiers]
-
-[tool.typos.type.jl.extend-words]
-modul = "modul"
-egals = "egals"
-usig = "usig"
-egal = "egal"
-
-[tool.typos.type.go]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[tool.typos.type.go.extend-identifiers]
-flate = "flate"
-
-[tool.typos.type.go.extend-words]
-
-[tool.typos.type.css]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[tool.typos.type.css.extend-identifiers]
-nd = "nd"
-
-[tool.typos.type.css.extend-words]
-
-[tool.typos.type.man]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[tool.typos.type.man.extend-identifiers]
-Nd = "Nd"
-
-[tool.typos.type.man.extend-words]
-
-[tool.typos.type.cert]
-extend-glob = []
-check-file = false
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[tool.typos.type.cert.extend-identifiers]
-
-[tool.typos.type.cert.extend-words]
-
-[tool.typos.type.sh]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[tool.typos.type.sh.extend-identifiers]
-ot = "ot"
-
-[tool.typos.type.sh.extend-words]
-
-[tool.typos.type.vimscript]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[tool.typos.type.vimscript.extend-identifiers]
-windo = "windo"
-
-[tool.typos.type.vimscript.extend-words]
--- a/requirements/build.txt
+++ b/requirements/build.txt
@ -4,7 +4,7 @@ ninja
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8
-torch==2.7.1
+torch==2.7.0
 wheel
 jinja2>=3.1.6
 regex
--- a/requirements/common.txt
+++ b/requirements/common.txt
@ -25,7 +25,7 @@ outlines_core == 0.2.10
 # required for outlines backend disk cache
 diskcache == 5.6.3
 lark == 1.2.2
-xgrammar == 0.1.21; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
+xgrammar == 0.1.19; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs
@ -33,7 +33,7 @@ pyzmq >= 25.0.0
 msgspec
 gguf >= 0.13.0
 importlib_metadata; python_version < '3.10'
-mistral_common[opencv] >= 1.8.0
+mistral_common[opencv] >= 1.6.2
 opencv-python-headless >= 4.11.0    # required for video IO
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@ -6,9 +6,9 @@ numba == 0.61.2; python_version > '3.9'

 # Dependencies for NVIDIA GPUs
 ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1.
-torch==2.7.1
-torchaudio==2.7.1
+torch==2.7.0
+torchaudio==2.7.0
 # These must be updated alongside torch
-torchvision==0.22.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-# https://github.com/facebookresearch/xformers/releases/tag/v0.0.31
-xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch >= 2.7
+torchvision==0.22.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+# https://github.com/facebookresearch/xformers/releases/tag/v0.0.30
+xformers==0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch >= 2.7
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@ -17,7 +17,6 @@ cloudpickle
 fastapi
 msgspec
 openai
-partial-json-parser
 pillow
 psutil
 pybase64
--- a/requirements/hpu.txt
+++ b/requirements/hpu.txt
@ -0,0 +1,12 @@
+# Common dependencies
+-r common.txt
+
+# Dependencies for HPU code
+ray
+triton==3.1.0
+pandas
+numpy==1.26.4
+tabulate
+setuptools>=77.0.3,<80.0.0
+setuptools-scm>=8
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@f1f6624
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@ -23,7 +23,7 @@ jiwer # required for audio tests
 timm # required for internvl test
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
-mistral_common[opencv] >= 1.8.0 # required for voxtral test
+mistral_common[opencv] >= 1.6.2 # required for pixtral test
 num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
--- a/requirements/test.in
+++ b/requirements/test.in
@ -22,15 +22,14 @@ sentence-transformers # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests
 timm # required for internvl test
-torch==2.7.1
-torchaudio==2.7.1
-torchvision==0.22.1
+torch==2.7.0
+torchaudio==2.7.0
+torchvision==0.22.0
 transformers_stream_generator # required for qwen-vl test
 mamba_ssm # required for plamo2 test
 matplotlib # required for qwen-vl test
-mistral_common[opencv] >= 1.8.0 # required for voxtral test
+mistral_common[opencv] >= 1.7.0 # required for pixtral test
 num2words # required for smolvlm test
-open_clip_torch==2.32.0 # Required for nemotron_vl test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.8 # required for model evaluation test
--- a/requirements/test.txt
+++ b/requirements/test.txt
@ -174,8 +174,6 @@ fsspec==2024.9.0
    #   fastparquet
    #   huggingface-hub
    #   torch
-ftfy==6.3.1
-    # via open-clip-torch
 genai-perf==0.0.8
    # via -r requirements/test.in
 genson==1.3.0
@ -210,7 +208,6 @@ huggingface-hub==0.33.0
    #   accelerate
    #   datasets
    #   evaluate
-    #   open-clip-torch
    #   peft
    #   sentence-transformers
    #   timm
@ -308,7 +305,7 @@ mbstrdecoder==1.1.3
    #   typepy
 mdurl==0.1.2
    # via markdown-it-py
-mistral-common==1.8.0
+mistral-common==1.7.0
    # via -r requirements/test.in
 more-itertools==10.5.0
    # via lm-eval
@ -417,8 +414,6 @@ nvidia-nvjitlink-cu12==12.8.61
    #   torch
 nvidia-nvtx-cu12==12.8.55
    # via torch
-open-clip-torch==2.32.0
-    # via -r requirements/test.in
 opencensus==0.11.4
    # via ray
 opencensus-context==0.1.3
@ -523,8 +518,6 @@ pyasn1-modules==0.4.2
    # via google-auth
 pybind11==2.13.6
    # via lm-eval
-pycountry==24.6.1
-    # via pydantic-extra-types
 pycparser==2.22
    # via cffi
 pycryptodomex==3.22.0
@ -535,12 +528,9 @@ pydantic==2.11.5
    #   datamodel-code-generator
    #   mistral-common
    #   mteb
-    #   pydantic-extra-types
    #   ray
 pydantic-core==2.33.2
    # via pydantic
-pydantic-extra-types==2.10.5
-    # via mistral-common
 pygments==2.18.0
    # via rich
 pyparsing==3.2.0
@ -620,7 +610,6 @@ referencing==0.35.1
 regex==2024.9.11
    # via
    #   nltk
-    #   open-clip-torch
    #   sacrebleu
    #   tiktoken
    #   transformers
@ -671,7 +660,6 @@ sacrebleu==2.4.3
 safetensors==0.4.5
    # via
    #   accelerate
-    #   open-clip-torch
    #   peft
    #   timm
    #   transformers
@ -760,9 +748,7 @@ tiktoken==0.7.0
    #   lm-eval
    #   mistral-common
 timm==1.0.11
-    # via
-    #   -r requirements/test.in
-    #   open-clip-torch
+    # via -r requirements/test.in
 tokenizers==0.21.1
    # via
    #   -r requirements/test.in
@ -771,7 +757,7 @@ tomli==2.2.1
    # via schemathesis
 tomli-w==1.2.0
    # via schemathesis
-torch==2.7.1+cu128
+torch==2.7.0+cu128
    # via
    #   -r requirements/test.in
    #   accelerate
@ -781,7 +767,6 @@ torch==2.7.1+cu128
    #   lm-eval
    #   mamba-ssm
    #   mteb
-    #   open-clip-torch
    #   peft
    #   runai-model-streamer
    #   sentence-transformers
@ -791,15 +776,14 @@ torch==2.7.1+cu128
    #   torchvision
    #   vector-quantize-pytorch
    #   vocos
-torchaudio==2.7.1+cu128
+torchaudio==2.7.0+cu128
    # via
    #   -r requirements/test.in
    #   encodec
    #   vocos
-torchvision==0.22.1+cu128
+torchvision==0.22.0+cu128
    # via
    #   -r requirements/test.in
-    #   open-clip-torch
    #   timm
 tqdm==4.66.6
    # via
@ -809,7 +793,6 @@ tqdm==4.66.6
    #   lm-eval
    #   mteb
    #   nltk
-    #   open-clip-torch
    #   peft
    #   pqdm
    #   sentence-transformers
@ -828,7 +811,7 @@ transformers==4.53.2
    #   transformers-stream-generator
 transformers-stream-generator==0.0.5
    # via -r requirements/test.in
-triton==3.3.1
+triton==3.3.0
    # via torch
 tritonclient==2.51.0
    # via
@ -852,7 +835,6 @@ typing-extensions==4.12.2
    #   pqdm
    #   pydantic
    #   pydantic-core
-    #   pydantic-extra-types
    #   torch
    #   typer
    #   typing-inspection
@ -875,8 +857,6 @@ virtualenv==20.31.2
    # via ray
 vocos==0.1.0
    # via -r requirements/test.in
-wcwidth==0.2.13
-    # via ftfy
 webcolors==24.11.1
    # via jsonschema
 werkzeug==3.1.3
--- a/requirements/tpu.txt
+++ b/requirements/tpu.txt
@ -18,8 +18,9 @@ setuptools==78.1.0
 --find-links https://storage.googleapis.com/libtpu-releases/index.html
 --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
 --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-torch==2.9.0.dev20250716
-torchvision==0.24.0.dev20250716
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp312-cp312-linux_x86_64.whl ; python_version == "3.12"
+torch==2.9.0.dev20250711
+torchvision==0.24.0.dev20250711
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250711-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250711-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250711-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"

--- a/setup.py
+++ b/setup.py
@ -410,6 +410,29 @@ class repackage_wheel(build_ext):
                package_data[package_name].append(file_name)


+def _is_hpu() -> bool:  # True when the build should target HPU
+    # env var equal to module-level VLLM_TARGET_DEVICE => explicit; skip autodetection
+    if os.getenv("VLLM_TARGET_DEVICE", None) == VLLM_TARGET_DEVICE:
+        return VLLM_TARGET_DEVICE == "hpu"
+
+    # Otherwise probe the host: a successful hl-smi run means an HPU is present;
+    # if hl-smi is missing or fails, fall back to checking for the habanalabs driver.
+    is_hpu_available = False
+    try:
+        out = subprocess.run(["hl-smi"], capture_output=True, check=True)
+        is_hpu_available = out.returncode == 0  # check=True already raised otherwise
+    except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
+        if sys.platform.startswith("linux"):  # lsmod fallback is Linux-only
+            try:
+                output = subprocess.check_output(
+                    'lsmod | grep habanalabs | wc -l', shell=True)
+                is_hpu_available = int(output) > 0  # count of matching lsmod lines
+            except (ValueError, FileNotFoundError, PermissionError,
+                    subprocess.CalledProcessError):
+                pass  # best effort: any probe failure leaves the result False
+    return is_hpu_available
+
+
 def _no_device() -> bool:
    return VLLM_TARGET_DEVICE == "empty"

@ -417,7 +440,7 @@ def _no_device() -> bool:
 def _is_cuda() -> bool:
    has_cuda = torch.version.cuda is not None
    return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
-            and not (_is_neuron() or _is_tpu()))
+            and not (_is_neuron() or _is_tpu() or _is_hpu()))


 def _is_hip() -> bool:
@ -550,6 +573,12 @@ def get_vllm_version() -> str:
        if neuron_version != MAIN_CUDA_VERSION:
            neuron_version_str = neuron_version.replace(".", "")[:3]
            version += f"{sep}neuron{neuron_version_str}"
+    elif _is_hpu():
+        # Get the Intel Gaudi Software Suite version
+        gaudi_sw_version = str(get_gaudi_sw_version())
+        if gaudi_sw_version != MAIN_CUDA_VERSION:
+            gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
+            version += f"{sep}gaudi{gaudi_sw_version}"
    elif _is_tpu():
        version += f"{sep}tpu"
    elif _is_cpu():
@ -596,6 +625,8 @@ def get_requirements() -> list[str]:
        requirements = _read_requirements("rocm.txt")
    elif _is_neuron():
        requirements = _read_requirements("neuron.txt")
+    elif _is_hpu():
+        requirements = _read_requirements("hpu.txt")
    elif _is_tpu():
        requirements = _read_requirements("tpu.txt")
    elif _is_cpu():
@ -604,7 +635,8 @@ def get_requirements() -> list[str]:
        requirements = _read_requirements("xpu.txt")
    else:
        raise ValueError(
-            "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")
+            "Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
+            "or CPU.")
    return requirements


@ -660,8 +692,7 @@ setup(
        "tensorizer": ["tensorizer==2.10.1"],
        "fastsafetensors": ["fastsafetensors >= 0.1.10"],
        "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
-        "audio": ["librosa", "soundfile",
-                  "mistral_common[audio]"],  # Required for audio processing
+        "audio": ["librosa", "soundfile"],  # Required for audio processing
        "video": []  # Kept for backwards compatibility
    },
    cmdclass=cmdclass,
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@ -29,7 +29,7 @@ def _query_server_long(prompt: str) -> dict:


@pytest.fixture
-def api_server(distributed_executor_backend: str):
+def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
    script_path = Path(__file__).parent.joinpath(
        "api_server_async_engine.py").absolute()
    commands = [
@ -40,6 +40,8 @@ def api_server(distributed_executor_backend: str):
        "facebook/opt-125m",
        "--host",
        "127.0.0.1",
+        "--tokenizer-pool-size",
+        str(tokenizer_pool_size),
        "--distributed-executor-backend",
        distributed_executor_backend,
    ]
@ -52,8 +54,10 @@ def api_server(distributed_executor_backend: str):
    uvicorn_process.terminate()


+@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
-def test_api_server(api_server, distributed_executor_backend: str):
+def test_api_server(api_server, tokenizer_pool_size: int,
+                    distributed_executor_backend: str):
    """
    Run the API server and test it.

--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -804,7 +804,7 @@ class VllmRunner:

    def get_inputs(
        self,
-        prompts: Union[list[str], list[torch.Tensor], list[int]],
+        prompts: Union[list[str], list[torch.Tensor]],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
@ -826,16 +826,11 @@ class VllmRunner:
            if audios is not None and (audio := audios[i]) is not None:
                multi_modal_data["audio"] = audio

-            text_prompt_kwargs: dict[str, Any] = {
+            text_prompt_kwargs = {
+                ("prompt" if isinstance(prompt, str) else "prompt_embeds"):
+                prompt,
                "multi_modal_data": multi_modal_data or None
            }
-            if isinstance(prompt, str):
-                text_prompt_kwargs["prompt"] = prompt
-            elif isinstance(prompt, list):
-                text_prompt_kwargs["prompt_token_ids"] = prompt
-            else:
-                text_prompt_kwargs["prompt_embeds"] = prompt
-
            inputs.append(TextPrompt(**text_prompt_kwargs))

        return inputs
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@ -14,9 +14,8 @@ from typing import Literal, NamedTuple, Optional

 import pytest

-from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, TaskOption
+from vllm.config import TaskOption
 from vllm.logger import init_logger
-from vllm.transformers_utils.config import get_config

 from ..models.registry import HF_EXAMPLE_MODELS
 from ..utils import compare_two_settings, create_new_process_for_each_test
@ -159,7 +158,7 @@ TEXT_GENERATION_MODELS = {
    "databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
    "Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
    "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
-    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(tp_base=2),
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(),
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
    "tiiuae/falcon-7b": PPTestSettings.fast(),
    "google/gemma-1.1-2b-it": PPTestSettings.fast(),
@ -211,11 +210,9 @@ TEXT_GENERATION_MODELS = {

 EMBEDDING_MODELS = {  # type: ignore[var-annotated]
    # [Text-only]
-    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(task="embed"),
-    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(task="embed"),
-    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
-        load_format="dummy", task="embed"
-    ),
+    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
+    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
+    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(load_format="dummy"),
 }

 MULTIMODAL_MODELS = {
@ -251,7 +248,6 @@ TEST_MODELS = [
    "meta-llama/Llama-3.2-1B-Instruct",
    "ArthurZ/Ilama-3.2-1B",
    "ibm/PowerLM-3b",
-    "deepseek-ai/DeepSeek-V2-Lite-Chat",
    # [LANGUAGE EMBEDDING]
    "intfloat/e5-mistral-7b-instruct",
    "BAAI/bge-multilingual-gemma2",
@ -291,11 +287,6 @@ def _compare_tp(
    trust_remote_code = model_info.trust_remote_code
    tokenizer_mode = model_info.tokenizer_mode
    hf_overrides = model_info.hf_overrides
-    hf_config = get_config(model_id, trust_remote_code)
-
-    dtype = "float16"
-    if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
-        dtype = "bfloat16"

    if load_format == "dummy":
        # Avoid OOM
@ -325,7 +316,7 @@ def _compare_tp(
    common_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
-        dtype,
+        "float16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
@ -347,7 +338,6 @@ def _compare_tp(
        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])

    specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
-    testing_ray_compiled_graph = False
    if distributed_backend == "ray" and (vllm_major_version == "1"
                                         or specific_case):
        # For V1, test Ray Compiled Graph for all the tests
@ -361,7 +351,6 @@ def _compare_tp(
        # Temporary. Currently when zeromq + SPMD is used, it does not properly
        # terminate because of a Ray Compiled Graph issue.
        common_args.append("--disable-frontend-multiprocessing")
-        testing_ray_compiled_graph = True
    elif distributed_backend == "mp":
        # Both V0/V1 of multiprocessing executor support PP
        pp_env = {
@ -405,6 +394,7 @@ def _compare_tp(
                             tp_env,
                             method=method)
    except Exception:
+        testing_ray_compiled_graph = pp_env is not None
        if testing_ray_compiled_graph and vllm_major_version == "0":
            # Ray Compiled Graph tests are flaky for V0,
            # so we don't want to fail the test
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import json
-from argparse import ArgumentError
+from argparse import ArgumentError, ArgumentTypeError
 from contextlib import nullcontext
 from dataclasses import dataclass, field
 from typing import Annotated, Literal, Optional
@ -12,8 +12,8 @@ import pytest
 from vllm.config import CompilationConfig, config
 from vllm.engine.arg_utils import (EngineArgs, contains_type, get_kwargs,
                                   get_type, get_type_hints, is_not_builtin,
-                                   is_type, literal_to_kwargs, optional_type,
-                                   parse_type)
+                                   is_type, literal_to_kwargs, nullable_kvs,
+                                   optional_type, parse_type)
 from vllm.utils import FlexibleArgumentParser


@ -25,10 +25,18 @@ from vllm.utils import FlexibleArgumentParser
        "foo": 1,
        "bar": 2
    }),
+    (json.loads, "foo=1,bar=2", {
+        "foo": 1,
+        "bar": 2
+    }),
 ])
 def test_parse_type(type, value, expected):
    parse_type_func = parse_type(type)
-    assert parse_type_func(value) == expected
+    context = nullcontext()
+    if value == "foo=1,bar=2":
+        context = pytest.warns(DeprecationWarning)
+    with context:
+        assert parse_type_func(value) == expected


 def test_optional_type():
@ -195,6 +203,34 @@ def test_get_kwargs():
    assert kwargs["from_cli_config2"]["type"]('{"field": 2}').field == 4


+@pytest.mark.parametrize(("arg", "expected"), [
+    (None, dict()),
+    ("image=16", {
+        "image": 16
+    }),
+    ("image=16,video=2", {
+        "image": 16,
+        "video": 2
+    }),
+    ("Image=16, Video=2", {
+        "image": 16,
+        "video": 2
+    }),
+])
+def test_limit_mm_per_prompt_parser(arg, expected):
+    """This functionality is deprecated and will be removed in the future.
+    This argument should be passed as a JSON string instead.
+    
+    TODO: Remove with nullable_kvs."""
+    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
+    if arg is None:
+        args = parser.parse_args([])
+    else:
+        args = parser.parse_args(["--limit-mm-per-prompt", arg])
+
+    assert args.limit_mm_per_prompt == expected
+
+
@pytest.mark.parametrize(
    ("arg", "expected"),
    [
@ -290,6 +326,18 @@ def test_prefix_cache_default():
    assert not engine_args.enable_prefix_caching


+@pytest.mark.parametrize(
+    ("arg"),
+    [
+        "image",  # Missing =
+        "image=4,image=5",  # Conflicting values
+        "image=video=4"  # Too many = in tokenized arg
+    ])
+def test_bad_nullable_kvs(arg):
+    with pytest.raises(ArgumentTypeError):
+        nullable_kvs(arg)
+
+
 # yapf: disable
@pytest.mark.parametrize(("arg", "expected", "option"), [
    (None, None, "mm-processor-kwargs"),
--- a/tests/entrypoints/openai/test_classification.py
+++ b/tests/entrypoints/openai/test_classification.py
@ -176,8 +176,4 @@ async def test_invocations(server: RemoteOpenAIServer):
    invocation_output = invocation_response.json()

    assert classification_output.keys() == invocation_output.keys()
-    for classification_data, invocation_data in zip(
-            classification_output["data"], invocation_output["data"]):
-        assert classification_data.keys() == invocation_data.keys()
-        assert classification_data["probs"] == pytest.approx(
-            invocation_data["probs"], rel=0.01)
+    assert classification_output["data"] == invocation_output["data"]
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@ -14,7 +14,6 @@ from vllm.transformers_utils.tokenizer import get_tokenizer

 from ...models.language.pooling.embed_utils import (
    run_embedding_correctness_test)
-from ...models.utils import check_embeddings_close
 from ...utils import RemoteOpenAIServer

 MODEL_NAME = "intfloat/multilingual-e5-small"
@ -322,13 +321,7 @@ async def test_invocations(server: RemoteOpenAIServer,
    invocation_output = invocation_response.json()

    assert completion_output.keys() == invocation_output.keys()
-    for completion_data, invocation_data in zip(completion_output["data"],
-                                                invocation_output["data"]):
-        assert completion_data.keys() == invocation_data.keys()
-        check_embeddings_close(embeddings_0_lst=[completion_data["embedding"]],
-                               embeddings_1_lst=[invocation_data["embedding"]],
-                               name_0="completion",
-                               name_1="invocation")
+    assert completion_output["data"] == invocation_output["data"]


@pytest.mark.asyncio
@ -362,10 +355,4 @@ async def test_invocations_conversation(server: RemoteOpenAIServer):
    invocation_output = invocation_response.json()

    assert chat_output.keys() == invocation_output.keys()
-    for chat_data, invocation_data in zip(chat_output["data"],
-                                          invocation_output["data"]):
-        assert chat_data.keys() == invocation_data.keys()
-        check_embeddings_close(embeddings_0_lst=[chat_data["embedding"]],
-                               embeddings_1_lst=[invocation_data["embedding"]],
-                               name_0="chat",
-                               name_1="invocation")
+    assert chat_output["data"] == invocation_output["data"]
--- a/tests/entrypoints/openai/test_openai_schema.py
+++ b/tests/entrypoints/openai/test_openai_schema.py
@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import json
 from typing import Final

 import pytest
@ -30,7 +29,7 @@ def server():
        "--enforce-eager",
        "--trust-remote-code",
        "--limit-mm-per-prompt",
-        json.dumps({"image": MAXIMUM_IMAGES}),
+        f"image={MAXIMUM_IMAGES}",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
--- a/tests/entrypoints/openai/test_pooling.py
+++ b/tests/entrypoints/openai/test_pooling.py
@ -281,13 +281,7 @@ async def test_invocations(server: RemoteOpenAIServer):
    invocation_output = invocation_response.json()

    assert completion_output.keys() == invocation_output.keys()
-    for completion_data, invocation_data in zip(completion_output["data"],
-                                                invocation_output["data"]):
-        assert completion_data.keys() == invocation_data.keys()
-        check_embeddings_close(embeddings_0_lst=completion_data["data"],
-                               embeddings_1_lst=invocation_data["data"],
-                               name_0="completion",
-                               name_1="invocation")
+    assert completion_output["data"] == invocation_output["data"]


@pytest.mark.asyncio
@ -320,10 +314,4 @@ async def test_invocations_conversation(server: RemoteOpenAIServer):
    invocation_output = invocation_response.json()

    assert chat_output.keys() == invocation_output.keys()
-    for chat_data, invocation_data in zip(chat_output["data"],
-                                          invocation_output["data"]):
-        assert chat_data.keys() == invocation_data.keys()
-        check_embeddings_close(embeddings_0_lst=chat_data["data"],
-                               embeddings_1_lst=invocation_data["data"],
-                               name_0="chat",
-                               name_1="invocation")
+    assert chat_output["data"] == invocation_output["data"]
--- a/tests/entrypoints/openai/test_rerank.py
+++ b/tests/entrypoints/openai/test_rerank.py
@ -120,8 +120,4 @@ def test_invocations(server: RemoteOpenAIServer):
    invocation_output = invocation_response.json()

    assert rerank_output.keys() == invocation_output.keys()
-    for rerank_result, invocations_result in zip(rerank_output["results"],
-                                                 invocation_output["results"]):
-        assert rerank_result.keys() == invocations_result.keys()
-        assert rerank_result["relevance_score"] == pytest.approx(
-            invocations_result["relevance_score"], rel=0.01)
+    assert rerank_output["results"] == invocation_output["results"]
--- a/tests/entrypoints/openai/test_score.py
+++ b/tests/entrypoints/openai/test_score.py
@ -215,8 +215,4 @@ class TestModel:
        invocation_output = invocation_response.json()

        assert score_output.keys() == invocation_output.keys()
-        for score_data, invocation_data in zip(score_output["data"],
-                                               invocation_output["data"]):
-            assert score_data.keys() == invocation_data.keys()
-            assert score_data["score"] == pytest.approx(
-                invocation_data["score"], rel=0.01)
+        assert score_output["data"] == invocation_output["data"]
--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
@ -32,7 +32,6 @@ def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
        "--max-lora-rank",
        "64",
-        "--enable-tokenizer-info-endpoint",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@ -284,106 +283,3 @@ async def test_detokenize(
    response.raise_for_status()

    assert response.json() == {"prompt": prompt}
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name,tokenizer_name",
-    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
-    indirect=["tokenizer_name"],
-)
-async def test_tokenizer_info_basic(
-    server: RemoteOpenAIServer,
-    model_name: str,
-    tokenizer_name: str,
-):
-    """Test basic tokenizer info endpoint functionality."""
-    response = requests.get(server.url_for("tokenizer_info"))
-    response.raise_for_status()
-    result = response.json()
-    assert "tokenizer_class" in result
-    assert isinstance(result["tokenizer_class"], str)
-    assert result["tokenizer_class"]
-
-
-@pytest.mark.asyncio
-async def test_tokenizer_info_schema(server: RemoteOpenAIServer):
-    """Test that the response matches expected schema types."""
-    response = requests.get(server.url_for("tokenizer_info"))
-    response.raise_for_status()
-    result = response.json()
-    field_types = {
-        "add_bos_token": bool,
-        "add_prefix_space": bool,
-        "clean_up_tokenization_spaces": bool,
-        "split_special_tokens": bool,
-        "bos_token": str,
-        "eos_token": str,
-        "pad_token": str,
-        "unk_token": str,
-        "chat_template": str,
-        "errors": str,
-        "model_max_length": int,
-        "additional_special_tokens": list,
-        "added_tokens_decoder": dict,
-    }
-    for field, expected_type in field_types.items():
-        if field in result and result[field] is not None:
-            assert isinstance(
-                result[field],
-                expected_type), (f"{field} should be {expected_type.__name__}")
-
-
-@pytest.mark.asyncio
-async def test_tokenizer_info_added_tokens_structure(
-    server: RemoteOpenAIServer, ):
-    """Test added_tokens_decoder structure if present."""
-    response = requests.get(server.url_for("tokenizer_info"))
-    response.raise_for_status()
-    result = response.json()
-    added_tokens = result.get("added_tokens_decoder")
-    if added_tokens:
-        for token_id, token_info in added_tokens.items():
-            assert isinstance(token_id, str), "Token IDs should be strings"
-            assert isinstance(token_info, dict), "Token info should be a dict"
-            assert "content" in token_info, "Token info should have content"
-            assert "special" in token_info, (
-                "Token info should have special flag")
-            assert isinstance(token_info["special"],
-                              bool), ("Special flag should be boolean")
-
-
-@pytest.mark.asyncio
-async def test_tokenizer_info_consistency_with_tokenize(
-    server: RemoteOpenAIServer, ):
-    """Test that tokenizer info is consistent with tokenization endpoint."""
-    info_response = requests.get(server.url_for("tokenizer_info"))
-    info_response.raise_for_status()
-    info = info_response.json()
-    tokenize_response = requests.post(
-        server.url_for("tokenize"),
-        json={
-            "model": MODEL_NAME,
-            "prompt": "Hello world!"
-        },
-    )
-    tokenize_response.raise_for_status()
-    tokenize_result = tokenize_response.json()
-    info_max_len = info.get("model_max_length")
-    tokenize_max_len = tokenize_result.get("max_model_len")
-    if info_max_len and tokenize_max_len:
-        assert info_max_len >= tokenize_max_len, (
-            "Info max length should be >= tokenize max length")
-
-
-@pytest.mark.asyncio
-async def test_tokenizer_info_chat_template(server: RemoteOpenAIServer):
-    """Test chat template is properly included."""
-    response = requests.get(server.url_for("tokenizer_info"))
-    response.raise_for_status()
-    result = response.json()
-    chat_template = result.get("chat_template")
-    if chat_template:
-        assert isinstance(chat_template,
-                          str), ("Chat template should be a string")
-        assert chat_template.strip(), "Chat template should not be empty"
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@ -17,11 +17,6 @@ from vllm.assets.audio import AudioAsset

 from ...utils import RemoteOpenAIServer

-MISTRAL_FORMAT_ARGS = [
-    "--tokenizer_mode", "mistral", "--config_format", "mistral",
-    "--load_format", "mistral"
-]
-

@pytest.fixture
 def mary_had_lamb():
@ -38,15 +33,9 @@ def winning_call():


@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    ["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"])
-async def test_basic_audio(mary_had_lamb, model_name):
+async def test_basic_audio(mary_had_lamb):
+    model_name = "openai/whisper-large-v3-turbo"
    server_args = ["--enforce-eager"]
-
-    if model_name.startswith("mistralai"):
-        server_args += MISTRAL_FORMAT_ARGS
-
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
@ -76,13 +65,10 @@ async def test_bad_requests(mary_had_lamb):


@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3-turbo"])
-async def test_long_audio_request(mary_had_lamb, model_name):
+async def test_long_audio_request(mary_had_lamb):
+    model_name = "openai/whisper-large-v3-turbo"
    server_args = ["--enforce-eager"]

-    if model_name.startswith("openai"):
-        return
-
    mary_had_lamb.seek(0)
    audio, sr = librosa.load(mary_had_lamb)
    # Add small silence after each audio for repeatability in the split process
@ -101,8 +87,7 @@ async def test_long_audio_request(mary_had_lamb, model_name):
            response_format="text",
            temperature=0.0)
        out = json.loads(transcription)['text']
-        counts = out.count("Mary had a little lamb")
-        assert counts == 10, counts
+        assert out.count("Mary had a little lamb") == 10


@pytest.mark.asyncio
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@ -36,11 +36,11 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
    ],
    [
        "The image shows a Venn diagram with three over",
-        "The image shows a Venn diagram with three intersect",
+        "This image shows a Venn diagram with three over",
    ],
    [
        "This image displays a gradient of colors ranging from",
-        "The image displays a gradient of colors ranging from",
+        "This image displays a gradient of colors transitioning from",
    ],
 ]

--- a/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py
@ -1,153 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# ruff: noqa: E501
-
-import json
-from unittest.mock import MagicMock
-
-import pytest
-
-from tests.entrypoints.openai.tool_parsers.utils import (
-    run_tool_extraction, run_tool_extraction_streaming)
-from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
-from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
-
-
-def make_tool_call(name, arguments):
-    return ToolCall(type="function",
-                    function=FunctionCall(name=name,
-                                          arguments=json.dumps(arguments)))
-
-
-# TODO: add reason prefix and suffix.
-
-
-@pytest.mark.parametrize(
-    "model_output,expected_tool_calls,expected_content",
-    [
-        # No tool call
-        ("How can I help you today?", [], "How can I help you today?"),
-        # Single tool call, no content
-        (
-            "<tool_calls>[{\"name\": \"get_weather\", \"arguments\": {\"city\": \"San Francisco\", \"metric\": \"celsius\"}}]</tool_calls>",  #noqa: E501
-            [
-                make_tool_call("get_weather", {
-                    "city": "San Francisco",
-                    "metric": "celsius"
-                })
-            ],
-            None),
-        # Multiple tool calls
-        (
-            "<tool_calls>[{\"name\": \"get_weather\", \"arguments\": {\"city\": \"San Francisco\", \"metric\": \"celsius\"}}, {\"name\": \"register_user\", \"arguments\": {\"name\": \"John Doe\", \"age\": 37, \"address\": {\"city\": \"San Francisco\", \"state\": \"CA\"}, \"role\": null, \"passed_test\": true, \"aliases\": [\"John\", \"Johnny\"]}}]</tool_calls>",  #noqa: E501
-            [
-                make_tool_call("get_weather", {
-                    "city": "San Francisco",
-                    "metric": "celsius"
-                }),
-                make_tool_call(
-                    "register_user", {
-                        "name": "John Doe",
-                        "age": 37,
-                        "address": {
-                            "city": "San Francisco",
-                            "state": "CA"
-                        },
-                        "role": None,
-                        "passed_test": True,
-                        "aliases": ["John", "Johnny"]
-                    })
-            ],
-            None),
-        # Content before tool call
-        (
-            "I will call the tool now. <tool_calls>[{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Boston\"}}]</tool_calls>",  #noqa: E501
-            [make_tool_call("get_weather", {"city": "Boston"})],
-            "I will call the tool now. "),
-        # Content after tool call (should be stripped)
-        (
-            "<tool_calls>[{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Seattle\"}}]</tool_calls>\nThank you!",  #noqa: E501
-            [make_tool_call("get_weather", {"city": "Seattle"})],
-            None),
-        (
-            "<tool_calls>[{\"name\": \"complex_tool\", \"arguments\": {\"level1\": {\"level2\": {\"level3\": {\"value\": 123}}}}}]</tool_calls>",
-            [
-                make_tool_call(
-                    "complex_tool",
-                    {"level1": {
-                        "level2": {
-                            "level3": {
-                                "value": 123
-                            }
-                        }
-                    }})
-            ],
-            None,
-        ),
-    ])
-def test_hunyuan_a13b_tool_parser_extract(model_output, expected_tool_calls,
-                                          expected_content):
-    mock_tokenizer = MagicMock()
-    tool_parser: ToolParser = ToolParserManager.get_tool_parser(
-        "hunyuan_a13b")(mock_tokenizer)
-    content, tool_calls = run_tool_extraction(tool_parser,
-                                              model_output,
-                                              streaming=False)
-
-    # align the random id.
-    for idx in range(len(tool_calls)):
-        tool_calls[idx].id = expected_tool_calls[idx].id
-    assert tool_calls == expected_tool_calls
-    assert content == expected_content
-
-
-# Streaming test: simulate incremental output
-@pytest.mark.parametrize("model_deltas,expected_tool_calls", [
-    ([
-        "<tool_calls>[{\"name\": \"get_weather\", ",
-        "\"arguments\": {\"city\": \"San Francisco\", ",
-        "\"metric\": \"celsius\"}}]", "</tool_calls>"
-    ], [
-        make_tool_call("get_weather", {
-            "city": "San Francisco",
-            "metric": "celsius"
-        })
-    ]),
-    ([
-        "<tool_calls>[{\"name\":", " \"get_weather\",", " \"arguments\":",
-        " {\"city\": \"Boston\"}", "}]", "</tool_calls>"
-    ], [make_tool_call("get_weather", {"city": "Boston"})]),
-    ([
-        "", "<tool_calls>[{\"name\":", " \"get_weather\",", " \"arguments\":",
-        " {\"city\": \"Boston\"}", "}]", "</tool_calls>", "\n</answer>"
-    ], [make_tool_call("get_weather", {"city": "Boston"})]),
-    pytest.param([
-        "<tool_calls>[{\"name\": \"complex_tool\",", " \"arguments\": ",
-        " {\"level1\": {\"level2\": ", "{\"level3\": {\"value\": 123}}}}}",
-        "]</tool_calls>"
-    ], [
-        make_tool_call("complex_tool",
-                       {"level1": {
-                           "level2": {
-                               "level3": {
-                                   "value": 123
-                               }
-                           }
-                       }})
-    ],
-                 marks=pytest.mark.xfail(
-                     reason="stream parsing not support nested json yet.")),
-])
-def test_hunyuan_a13b_tool_parser_streaming(model_deltas, expected_tool_calls):
-    mock_tokenizer = MagicMock()
-
-    tool_parser: ToolParser = ToolParserManager.get_tool_parser(
-        "hunyuan_a13b")(mock_tokenizer)
-    reconstructor = run_tool_extraction_streaming(
-        tool_parser, model_deltas, assert_one_tool_per_delta=False)
-
-    # align the random id.
-    for idx in range(len(reconstructor.tool_calls)):
-        reconstructor.tool_calls[idx].id = expected_tool_calls[idx].id
-
-    assert reconstructor.tool_calls == expected_tool_calls
--- a/tests/kernels/moe/modular_kernel_tools/cli_args.py
+++ b/tests/kernels/moe/modular_kernel_tools/cli_args.py
@ -85,6 +85,7 @@ def make_config_arg_parser(description: str):
                        help="num topk")
    parser.add_argument(
        "--fused-moe-chunk-size",
+        nargs="+",
        type=int,
        help="Fused moe chunk size used for the non-batched fused experts impl."
    )
--- a/tests/kernels/moe/modular_kernel_tools/common.py
+++ b/tests/kernels/moe/modular_kernel_tools/common.py
@ -416,7 +416,7 @@ class RankTensors:
        # We dequant and use that as hidden_states so the tests are stable.
        # quantizing and dequantizing yield slightly different results
        # depending on the hardware. Here we, quantize and dequantize
-        # first - so further quantize and dequantize will yield the same
+        # first - so further quantize and dequantize will yield the same
        # values.
        if config.is_per_tensor_act_quant:
            a_q, a_scales = ops.scaled_fp8_quant(
--- a/tests/kernels/moe/test_cutlass_moe.py
+++ b/tests/kernels/moe/test_cutlass_moe.py
@ -25,7 +25,6 @@ MNK_FACTORS = [
    (2, 1024, 1536),
    (2, 3072, 1024),
    (2, 3072, 1536),
-    (7, 3072, 1536),
    (64, 1024, 1024),
    (64, 1024, 1536),
    (64, 3072, 1024),
@ -207,10 +206,6 @@ def run_8_bit(moe_tensors: MOETensors8Bit,
        'topk_ids': topk_ids,
        'w1_scale': moe_tensors.w1_scale,
        'w2_scale': moe_tensors.w2_scale,
-        'ab_strides1': moe_tensors.ab_strides1,
-        'ab_strides2': moe_tensors.ab_strides2,
-        'c_strides1': moe_tensors.c_strides1,
-        'c_strides2': moe_tensors.c_strides2,
        'per_act_token': per_act_token,
        'a1_scale': None  #moe_tensors.a_scale
    }
@ -444,11 +439,6 @@ def test_run_cutlass_moe_fp8(
        expert_map[start:end] = list(range(num_local_experts))
        expert_map = torch.tensor(expert_map, dtype=torch.int32, device="cuda")

-        ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
-        ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64)
-        c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64)
-        c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
-
        activation = lambda o, i: torch.ops._C.silu_and_mul(o, i)
        a1q, a1q_scale = moe_kernel_quantize_input(mt.a, mt.a_scale,
                                                   torch.float8_e4m3fn,
@ -457,9 +447,8 @@ def test_run_cutlass_moe_fp8(
        func = lambda output: run_cutlass_moe_fp8(
            output, a1q, mt.w1_q, mt.w2_q, topk_ids, activation,
            global_num_experts, expert_map, mt.w1_scale, mt.w2_scale,
-            a1q_scale, None, ab_strides1, ab_strides2, c_strides1, c_strides2,
-            workspace13, workspace2, None, mt.a.dtype, per_act_token,
-            per_out_channel, False)
+            a1q_scale, None, workspace13, workspace2, None, mt.a.dtype,
+            per_act_token, per_out_channel, False)

        workspace13.random_()
        output_random_workspace = torch.empty(output_shape,
--- a/tests/kernels/moe/test_deepgemm.py
+++ b/tests/kernels/moe/test_deepgemm.py
@ -95,7 +95,7 @@ def run_single_case(m, n, k, topk, num_experts, block_size):
    topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1)
    topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1)

-    # triton reference
+    # triton reference
    out_triton = fused_experts(
        hidden_states=tokens_bf16,
        w1=w1,
--- a/tests/kernels/moe/test_pplx_cutlass_moe.py
+++ b/tests/kernels/moe/test_pplx_cutlass_moe.py
@ -75,7 +75,6 @@ def pplx_cutlass_moe(
    assert torch.cuda.current_device() == pgi.local_rank

    num_tokens, hidden_dim = a.shape
-    intermediate_dim = w2.shape[2]
    num_experts = w1.shape[0]
    block_size = hidden_dim  # TODO support more cases
    device = pgi.device
@ -124,31 +123,10 @@ def pplx_cutlass_moe(
        num_local_experts=num_local_experts,
        num_dispatchers=num_dispatchers)

-    ab_strides1 = torch.full((num_local_experts, ),
-                             hidden_dim,
-                             device="cuda",
-                             dtype=torch.int64)
-    ab_strides2 = torch.full((num_local_experts, ),
-                             intermediate_dim,
-                             device="cuda",
-                             dtype=torch.int64)
-    c_strides1 = torch.full((num_local_experts, ),
-                            2 * intermediate_dim,
-                            device="cuda",
-                            dtype=torch.int64)
-    c_strides2 = torch.full((num_local_experts, ),
-                            hidden_dim,
-                            device="cuda",
-                            dtype=torch.int64)
-
    experts = CutlassExpertsFp8(num_local_experts,
                                out_dtype,
                                per_act_token,
                                per_out_ch,
-                                ab_strides1,
-                                ab_strides2,
-                                c_strides1,
-                                c_strides2,
                                num_dispatchers=num_dispatchers,
                                use_batched_format=True)

--- a/tests/models/language/pooling/test_gritlm.py
+++ b/tests/models/language/pooling/test_gritlm.py
@ -2,7 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations

-import numpy as np
+import importlib.util
+from array import array
+
 import openai
 import pytest
 from scipy.spatial.distance import cosine
@ -12,6 +14,10 @@ from vllm.config import ModelConfig

 from ....utils import RemoteOpenAIServer

+# GritLM embedding implementation is only supported by the XFormers backend.
+pytestmark = pytest.mark.skipif(not importlib.util.find_spec("xformers"),
+                                reason="GritLM requires XFormers")
+
 MODEL_NAME = "parasail-ai/GritLM-7B-vllm"
 MAX_MODEL_LEN = 4000

@ -20,11 +26,11 @@ def _arr(arr):
    """
    Convert a list of integers to an array of integers.
    """
-    return np.array(arr)
+    return array("i", arr)


 def test_find_array():
-    from vllm.model_executor.models.gritlm import GritLMMeanPool
+    from vllm.model_executor.models.gritlm import GritLMPooler

    model_config = ModelConfig(
        MODEL_NAME,
@ -35,19 +41,17 @@ def test_find_array():
        dtype="bfloat16",
        seed=0,
    )
-    pooling = GritLMMeanPool(model_config=model_config)
+    pooler = GritLMPooler(model_config=model_config)

    arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

-    assert pooling._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
-    assert pooling._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
-    assert pooling._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
-    assert pooling._find_array(arr, _arr([3, 4, 5]), end_idx=3) == -1
-    assert pooling._find_array(arr, _arr([3, 4, 5]), end_idx=4) == 3
-    assert pooling._find_array(arr, _arr([3, 5]), start_idx=0) == -1
+    assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
+    assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
+    assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
+    assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1

    with pytest.raises(ValueError):
-        pooling._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
+        pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)


 def run_llm_encode(
--- a/tests/models/multimodal/generation/test_voxtral.py
+++ b/tests/models/multimodal/generation/test_voxtral.py
@ -1,115 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import json
-
-import pytest
-import pytest_asyncio
-from mistral_common.audio import Audio
-from mistral_common.protocol.instruct.messages import (AudioChunk, RawAudio,
-                                                       TextChunk, UserMessage)
-
-from vllm.transformers_utils.tokenizer import MistralTokenizer
-
-from ....conftest import AudioTestAssets
-from ....utils import RemoteOpenAIServer
-from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test
-
-MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"
-MISTRAL_FORMAT_ARGS = [
-    "--tokenizer_mode", "mistral", "--config_format", "mistral",
-    "--load_format", "mistral"
-]
-
-
-@pytest.fixture()
-def server(request, audio_assets: AudioTestAssets):
-    args = [
-        "--enforce-eager",
-        "--limit-mm-per-prompt",
-        json.dumps({"audio": len(audio_assets)}),
-    ] + MISTRAL_FORMAT_ARGS
-
-    with RemoteOpenAIServer(MODEL_NAME,
-                            args,
-                            env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
-                                      "30"}) as remote_server:
-        yield remote_server
-
-
-@pytest_asyncio.fixture
-async def client(server):
-    async with server.get_async_client() as async_client:
-        yield async_client
-
-
-def _get_prompt(audio_assets, question):
-    tokenizer = MistralTokenizer.from_pretrained(MODEL_NAME)
-
-    audios = [
-        Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
-        for i in range(len(audio_assets))
-    ]
-    audio_chunks = [
-        AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
-    ]
-
-    text_chunk = TextChunk(text=question)
-    messages = [UserMessage(content=[*audio_chunks, text_chunk]).to_openai()]
-
-    return tokenizer.apply_chat_template(messages=messages)
-
-
-@pytest.mark.core_model
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models_with_multiple_audios(vllm_runner,
-                                     audio_assets: AudioTestAssets, dtype: str,
-                                     max_tokens: int,
-                                     num_logprobs: int) -> None:
-    vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
-    run_multi_audio_test(
-        vllm_runner,
-        [(vllm_prompt, [audio.audio_and_sample_rate
-                        for audio in audio_assets])],
-        MODEL_NAME,
-        dtype=dtype,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        tokenizer_mode="mistral",
-    )
-
-
-@pytest.mark.asyncio
-async def test_online_serving(client, audio_assets: AudioTestAssets):
-    """Exercises online serving with/without chunked prefill enabled."""
-
-    def asset_to_chunk(asset):
-        audio = Audio.from_file(str(asset.get_local_path()), strict=False)
-        audio.format = "wav"
-        audio_dict = AudioChunk.from_audio(audio).to_openai()
-        return audio_dict
-
-    audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            *audio_chunks,
-            {
-                "type":
-                "text",
-                "text":
-                f"What's happening in these {len(audio_assets)} audio clips?"
-            },
-        ],
-    }]
-
-    chat_completion = await client.chat.completions.create(model=MODEL_NAME,
-                                                           messages=messages,
-                                                           max_tokens=10)
-
-    assert len(chat_completion.choices) == 1
-    choice = chat_completion.choices[0]
-    assert choice.finish_reason == "length"
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@ -291,7 +291,6 @@ def _test_processing_correctness_one(
    "allenai/Molmo-7B-D-0924",
    "allenai/Molmo-7B-O-0924",
    "nvidia/NVLM-D-72B",
-    "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1",
    "AIDC-AI/Ovis1.6-Gemma2-9B",
    "AIDC-AI/Ovis1.6-Llama3.2-3B",
    "AIDC-AI/Ovis2-1B",
--- a/tests/models/multimodal/processing/test_nemotron_vl.py
+++ b/tests/models/multimodal/processing/test_nemotron_vl.py
@ -1,134 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests for Nemotron-Nano-VL's multimodal preprocessing kwargs."""
-from collections.abc import Mapping
-from typing import Optional
-
-import pytest
-from PIL import Image
-from transformers import PretrainedConfig
-
-from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import rescale_image_size
-from vllm.multimodal.processing import BaseMultiModalProcessor
-
-from ....conftest import ImageTestAssets
-from ...utils import build_model_context
-
-
-def _get_expected_num_patches(
-    config: PretrainedConfig,
-    image: Image.Image,
-    num_imgs: int,
-    min_num: int,
-    max_num: int,
-):
-    from vllm.model_executor.models.internvl import (
-        calculate_internvl_targets, get_internvl_target_ratios)
-
-    width, height = image.size
-
-    blocks, _, _ = calculate_internvl_targets(
-        orig_width=width,
-        orig_height=height,
-        target_ratios=get_internvl_target_ratios(
-            min_num,
-            max_num,
-        ),
-        image_size=config.force_image_size,
-        use_thumbnail=False,
-    )
-    expected_num_patches = blocks
-
-    if config.use_thumbnail and expected_num_patches > 1:
-        expected_num_patches += 1
-
-    return expected_num_patches
-
-
-def _run_check(
-    processor: BaseMultiModalProcessor,
-    images: list[Image.Image],
-    min_num: int,
-    max_num: int,
-    mm_processor_kwargs: Mapping[str, object],
-):
-    tokenizer = processor.info.get_tokenizer()
-    config = processor.info.get_hf_config()
-    image_processor = processor.info.get_image_processor()
-
-    config.use_thumbnail = image_processor.use_thumbnail
-    prompt = "<image>" * len(images)
-    mm_data = {"image": images}
-
-    total_expected_num_patches = sum(
-        _get_expected_num_patches(config, image, len(images), min_num, max_num)
-        for image in images)
-    print(total_expected_num_patches)
-    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
-
-    # Ensure we have the right number of placeholders per num_crops size
-    image_token_id = tokenizer.convert_tokens_to_ids("<image>")
-    img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
-    print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape)
-    assert img_tok_count == 256 * total_expected_num_patches
-    assert pixel_shape[0] == total_expected_num_patches
-
-
-@pytest.mark.parametrize("model_id",
-                         ["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
-@pytest.mark.parametrize(
-    "size_factors",
-    [
-        # Single-scale
-        [1.0],
-        # Single-scale, batched
-        [1.0, 1.0, 1.0],
-        # Multi-scale
-        [0.25, 0.5, 1.0],
-        [4.0, 2.0, 1.0],
-    ],
-)
-@pytest.mark.parametrize(
-    ("min_dynamic_patch", "max_dynamic_patch"),
-    [(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
-)
-@pytest.mark.parametrize("dynamic_image_size", [True, False])
-@pytest.mark.parametrize("kwargs_on_init", [True, False])
-def test_processor_override(
-    model_id: str,
-    image_assets: ImageTestAssets,
-    size_factors: list[int],
-    min_dynamic_patch: int,
-    max_dynamic_patch: int,
-    dynamic_image_size: Optional[bool],
-    kwargs_on_init: bool,
-):
-    mm_processor_kwargs = {
-        "min_dynamic_patch": min_dynamic_patch,
-        "max_dynamic_patch": max_dynamic_patch,
-        "dynamic_image_size": dynamic_image_size,
-    }
-
-    ctx = build_model_context(
-        model_id,
-        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
-        limit_mm_per_prompt={"image": len(size_factors)},
-    )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
-    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
-
-    min_num = min_dynamic_patch if dynamic_image_size else 1
-    max_num = max_dynamic_patch if dynamic_image_size else 1
-
-    _run_check(
-        processor,
-        [
-            rescale_image_size(image_assets[0].pil_image, f)
-            for f in size_factors
-        ],
-        min_num,
-        max_num,
-        hf_processor_mm_kwargs,
-    )
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@ -265,6 +265,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
    "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"),
    "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
+    "Qwen3ForSequenceClassification": _HfExamplesInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls"),  # noqa: E501
    "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
    "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"),  # noqa: E501
    "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
@ -291,6 +292,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
    # [Text-only]
    "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5", v0_only=True),
    "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2", v0_only=True),  # noqa: E501
+    "GPT2ForSequenceClassification": _HfExamplesInfo("nie3e/sentiment-polish-gpt2-small"),  # noqa: E501
    "GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"),
    "GteModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
                                               trust_remote_code=True),
@ -309,6 +311,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
    "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"),
    "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"),
    "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B"),
+    "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"),  # noqa: E501
    "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2", v0_only=True),  # noqa: E501
    "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1", v0_only=True),  # noqa: E501
    "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small", v0_only=True),  # noqa: E501
@ -321,29 +324,20 @@ _EMBEDDING_EXAMPLE_MODELS = {
                                            is_available_online=False),  # noqa: E501
 }

-_SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
-    # [Decoder-only]
-    "GPT2ForSequenceClassification": _HfExamplesInfo("nie3e/sentiment-polish-gpt2-small"),  # noqa: E501
-
-    # [Cross-encoder]
+_CROSS_ENCODER_EXAMPLE_MODELS = {
+    # [Text-only]
    "BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2", v0_only=True),  # noqa: E501
+    "GemmaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-gemma", # noqa: E501
+                                                      v0_only=True,
+                                                      hf_overrides={"architectures": ["GemmaForSequenceClassification"], # noqa: E501
+                                                                    "classifier_from_token": ["Yes"], # noqa: E501
+                                                                    "method": "no_post_processing"}), # noqa: E501
+    "LlamaForSequenceClassification": _HfExamplesInfo("Skywork/Skywork-Reward-V2-Llama-3.2-1B"), # noqa: E501
    "ModernBertForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-reranker-modernbert-base", v0_only=True), # noqa: E501
    "RobertaForSequenceClassification": _HfExamplesInfo("cross-encoder/quora-roberta-base", v0_only=True),  # noqa: E501
    "XLMRobertaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-m3", v0_only=True),  # noqa: E501
 }

-_AUTOMATIC_CONVERTED_MODELS = {
-    # Use as_seq_cls_model for automatic conversion
-    "GemmaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-gemma",  # noqa: E501
-                                                      v0_only=True,
-                                                      hf_overrides={"architectures": ["GemmaForSequenceClassification"], # noqa: E501
-                                                                    "classifier_from_token": ["Yes"],  # noqa: E501
-                                                                    "method": "no_post_processing"}),  # noqa: E501
-    "LlamaForSequenceClassification": _HfExamplesInfo("Skywork/Skywork-Reward-V2-Llama-3.2-1B"),  # noqa: E501
-    "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"),  # noqa: E501
-    "Qwen3ForSequenceClassification": _HfExamplesInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls"),  # noqa: E501
-}
-
 _MULTIMODAL_EXAMPLE_MODELS = {
    # [Decoder-only]
    "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
@ -407,8 +401,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                        trust_remote_code=True),
    "NVLM_D": _HfExamplesInfo("nvidia/NVLM-D-72B",
                              trust_remote_code=True),
-    "Llama_Nemotron_Nano_VL" : _HfExamplesInfo("nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1", # noqa: E501
-                                                     trust_remote_code=True),
    "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224",  # noqa: E501
                                                         extras={"v2": "google/paligemma2-3b-ft-docci-448"}),  # noqa: E501
    "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct",
@ -448,14 +440,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                         tokenizer="Isotr0py/Florence-2-tokenizer",  # noqa: E501
                                                         trust_remote_code=True),  # noqa: E501
    "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501
-    "VoxtralForConditionalGeneration": _HfExamplesInfo("mistralai/Voxtral-Mini-3B-2507", tokenizer_mode="mistral"),  # noqa: E501
    "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),  # noqa: E501

    # [Cross-encoder]
    "JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"),   # noqa: E501
 }

-
 _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
    "EAGLEModel": _HfExamplesInfo("JackFram/llama-68m",
                                  speculative_model="abhigoyal/vllm-eagle-llama-68m-random"),  # noqa: E501
@ -474,11 +464,6 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
                                            trust_remote_code=True,
                                            speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
                                            tokenizer="meta-llama/Llama-3.1-8B-Instruct"),
-    "EagleLlama4ForCausalLM": _HfExamplesInfo(
-        "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
-        trust_remote_code=True,
-        speculative_model="morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
-        tokenizer="meta-llama/Llama-4-Scout-17B-16E-Instruct"),  # noqa: E501
    "EagleMiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-1B-sft-bf16",
                                            trust_remote_code=True,
                                            is_available_online=False,
@ -496,7 +481,7 @@ _TRANSFORMERS_MODELS = {
 _EXAMPLE_MODELS = {
    **_TEXT_GENERATION_EXAMPLE_MODELS,
    **_EMBEDDING_EXAMPLE_MODELS,
-    **_SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS,
+    **_CROSS_ENCODER_EXAMPLE_MODELS,
    **_MULTIMODAL_EXAMPLE_MODELS,
    **_SPECULATIVE_DECODING_EXAMPLE_MODELS,
    **_TRANSFORMERS_MODELS,
@ -528,5 +513,4 @@ class HfExampleModels:
        raise ValueError(f"No example model defined for {model_id}")


-HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
-AUTO_EXAMPLE_MODELS = HfExampleModels(_AUTOMATIC_CONVERTED_MODELS)
+HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@ -13,21 +13,20 @@ from vllm.v1.core.kv_cache_utils import get_kv_cache_config
 from vllm.v1.engine.core import EngineCore as V1EngineCore

 from ..utils import create_new_process_for_each_test
-from .registry import AUTO_EXAMPLE_MODELS, HF_EXAMPLE_MODELS, HfExampleModels
+from .registry import HF_EXAMPLE_MODELS


+@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
@create_new_process_for_each_test()
-def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
-                   EXAMPLE_MODELS: HfExampleModels):
-    """The reason for using create_new_process_for_each_test is to avoid
-    the WARNING:
-        "We must use the 'spawn' multiprocessing start method. Overriding
+def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
+    """The reason for using create_new_process_for_each_test is to avoid 
+    the WARNING: 
+        "We must use the 'spawn' multiprocessing start method. Overriding 
        VLLM_WORKER_MULTIPROC_METHOD to 'spawn'."
-    The spawn process causes the _initialize_kv_caches_v1 function below to
+    The spawn process causes the _initialize_kv_caches_v1 function below to 
    become ineffective.
    """
-
-    model_info = EXAMPLE_MODELS.get_hf_info(model_arch)
+    model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")

@ -37,11 +36,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
                      "KimiVLForConditionalGeneration"):
        pytest.skip("Avoid OOM")

-    if model_arch in ("Llama4ForCausalLM", "EagleLlama4ForCausalLM"):
-        from vllm.model_executor.models.llama4 import Llama4ForCausalLM
-        from vllm.model_executor.models.registry import ModelRegistry
-        ModelRegistry.register_model("Llama4ForCausalLM", Llama4ForCausalLM)
-
    # Avoid OOM and reduce initialization time by only using 1 layer
    def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
        hf_config.update(model_info.hf_overrides)
@ -49,7 +43,7 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
        text_config = hf_config.get_text_config()

        # Ensure at least 2 expert per group
-        # Since `grouped_topk` assumes top-2
+        # Since `grouped_topk` assumes top-2
        n_group = getattr(text_config, 'n_group', None)
        num_experts = n_group * 2 if n_group is not None else 2

@ -128,15 +122,3 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
            load_format="dummy",
            hf_overrides=hf_overrides,
        )
-
-
-@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
-def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
-    can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)
-
-
-@pytest.mark.parametrize("model_arch",
-                         AUTO_EXAMPLE_MODELS.get_supported_archs())
-def test_implicit_converted_models(model_arch: str,
-                                   monkeypatch: pytest.MonkeyPatch):
-    can_initialize(model_arch, monkeypatch, AUTO_EXAMPLE_MODELS)
--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@ -138,38 +138,3 @@ def test_quantization(
        name_0="transformers",
        name_1="vllm",
    )
-
-
-@pytest.mark.parametrize(
-    "model",
-    ["jason9693/Qwen2.5-1.5B-apeach"],
-)
-@pytest.mark.parametrize("dtype", ["half"])
-def test_classify(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    monkeypatch,
-) -> None:
-    import torch
-    from transformers import AutoModelForSequenceClassification
-
-    with vllm_runner(model,
-                     max_model_len=512,
-                     dtype=dtype,
-                     model_impl="transformers") as vllm_model:
-        vllm_outputs = vllm_model.classify(example_prompts)
-
-    with hf_runner(model,
-                   dtype=dtype,
-                   auto_cls=AutoModelForSequenceClassification) as hf_model:
-        hf_outputs = hf_model.classify(example_prompts)
-
-    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
-        hf_output = torch.tensor(hf_output)
-        vllm_output = torch.tensor(vllm_output)
-
-        assert torch.allclose(hf_output, vllm_output,
-                              1e-3 if dtype == "float" else 1e-2)
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
@ -11,13 +11,11 @@ from vllm.config import VllmConfig
 from vllm.model_executor.layers.pooler import Pooler, PoolingType
 from vllm.model_executor.models.gemma2 import Gemma2Model
 from vllm.model_executor.models.utils import WeightsMapper, maybe_prefix
-from vllm.sequence import IntermediateTensors
+from vllm.model_executor.pooling_metadata import PoolingMetadata
+from vllm.sequence import IntermediateTensors, PoolerOutput


 class MyGemma2Embedding(nn.Module):
-
-    is_pooling_model = True
-
    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@ -26,7 +24,7 @@ class MyGemma2Embedding(nn.Module):
        self.model = Gemma2Model(vllm_config=vllm_config,
                                 prefix=maybe_prefix(prefix, "model"))

-        self.pooler = Pooler.from_config_with_defaults(
+        self._pooler = Pooler.from_config_with_defaults(
            vllm_config.model_config.pooler_config,
            pooling_type=PoolingType.LAST,
            normalize=True,
@ -56,6 +54,13 @@ class MyGemma2Embedding(nn.Module):
        # Return all-zero embeddings
        return torch.zeros_like(hidden_states)

+    def pooler(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> Optional[PoolerOutput]:
+        return self._pooler(hidden_states, pooling_metadata)
+
    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):

        weights = self.hf_to_vllm_mapper.apply(weights)
--- a/tests/reasoning/test_hunyuan_reasoning_parser.py
+++ b/tests/reasoning/test_hunyuan_reasoning_parser.py
@ -30,12 +30,6 @@ COMPLETE_REASONING = {
    "reasoning_content": "This is a reasoning section",
    "content": None,
 }
-
-COMPLETE_REASONING_WITH_SYMBOL = {
-    "output": f"{START_REASONING}This is a reasoning section!{START_RESPONSE}",
-    "reasoning_content": "This is a reasoning section!",
-    "content": None,
-}
 NO_REASONING = {
    "output": "This is content",
    "reasoning_content": None,
@ -76,11 +70,6 @@ TEST_CASES = [
        COMPLETE_REASONING,
        id="complete_reasoning",
    ),
-    pytest.param(
-        False,
-        COMPLETE_REASONING_WITH_SYMBOL,
-        id="complete_reasoning_with_symbol",
-    ),
    pytest.param(
        False,
        NO_REASONING,
--- a/tests/tokenization/test_do_lower_case.py
+++ b/tests/tokenization/test_do_lower_case.py
@ -1,18 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-
-from vllm.transformers_utils.tokenizer import get_tokenizer
-
-TOKENIZER_NAMES = ["BAAI/bge-base-en"]
-
-
-@pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES)
-@pytest.mark.parametrize("n_tokens", [510])
-def test_special_tokens(tokenizer_name: str, n_tokens: int):
-    tokenizer = get_tokenizer(tokenizer_name, revision="main")
-
-    prompts = '[UNK]' * n_tokens
-    prompt_token_ids = tokenizer.encode(prompts)
-    assert len(prompt_token_ids) == n_tokens + 2
--- a/tests/v1/attention/test_attention_backends.py
+++ b/tests/v1/attention/test_attention_backends.py
@ -1,466 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests for v1 attention backends without GPUModelRunner dependency."""
-
-import pytest
-import torch
-
-from tests.v1.attention.utils import (BatchSpec, _Backend,
-                                      create_common_attn_metadata,
-                                      create_standard_kv_cache_spec,
-                                      create_vllm_config,
-                                      get_attention_backend)
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
-from vllm.v1.attention.backends.utils import CommonAttentionMetadata
-from vllm.v1.kv_cache_interface import FullAttentionSpec
-
-BACKENDS_TO_TEST = [
-    _Backend.FLASH_ATTN_VLLM_V1, _Backend.FLASHINFER_VLLM_V1,
-    _Backend.FLEX_ATTENTION, _Backend.TRITON_ATTN_VLLM_V1
-]
-
-# Remove flashinfer from the list if it's not available
-try:
-    import flashinfer  # noqa: F401
-except ImportError:
-    BACKENDS_TO_TEST.remove(_Backend.FLASHINFER_VLLM_V1)
-
-
-def _convert_dtype_to_torch(dtype):
-    """Convert ModelDType to torch.dtype."""
-    if isinstance(dtype, str):
-        if dtype == "auto":
-            return torch.float16  # Default dtype for testing
-        elif dtype in STR_DTYPE_TO_TORCH_DTYPE:
-            return STR_DTYPE_TO_TORCH_DTYPE[dtype]
-        else:
-            raise ValueError(f"Unknown dtype: {dtype}")
-    elif isinstance(dtype, torch.dtype):
-        return dtype
-    else:
-        raise ValueError(f"Unknown dtype: {dtype}")
-
-
-# Define common batch configurations
-BATCH_SPECS = {
-    "small_decode":
-    BatchSpec(seq_lens=[32, 40], query_lens=[1, 1]),
-    "small_prefill":
-    BatchSpec(seq_lens=[32, 40], query_lens=[8, 8]),
-    "mixed_small":
-    BatchSpec(seq_lens=[32, 40, 48, 56], query_lens=[1, 1, 5, 5]),
-    "medium_decode":
-    BatchSpec(seq_lens=[128, 256, 512, 1024, 128, 256, 512, 1024],
-              query_lens=[1, 1, 1, 1, 1, 1, 1, 1]),
-    "medium_prefill":
-    BatchSpec(seq_lens=[256, 512, 1024, 2048], query_lens=[16, 16, 16, 16]),
-    "mixed_medium":
-    BatchSpec(seq_lens=[512, 1024, 2048, 512, 1024, 2048],
-              query_lens=[1, 1, 1, 7, 7, 7]),
-    "large_decode":
-    BatchSpec(seq_lens=[2048] * 32, query_lens=[1] * 32),
-    "large_prefill":
-    BatchSpec(seq_lens=[4096] * 8, query_lens=[32] * 8),
-    "single_decode":
-    BatchSpec(seq_lens=[1024], query_lens=[1]),
-    "single_prefill":
-    BatchSpec(seq_lens=[1024], query_lens=[64]),
-}
-
-
-def create_dummy_kv_cache(kv_cache_spec: FullAttentionSpec,
-                          device: torch.device,
-                          num_blocks: int = 100) -> torch.Tensor:
-    """Create a dummy KV cache tensor for testing."""
-    kv_cache = torch.randn(
-        2,  # K and V
-        num_blocks,
-        kv_cache_spec.block_size,
-        kv_cache_spec.num_kv_heads,
-        kv_cache_spec.head_size,
-        dtype=_convert_dtype_to_torch(kv_cache_spec.dtype),
-        device=device,
-    )
-    return kv_cache
-
-
-def create_and_prepopulate_kv_cache(
-        k_contexts: list[torch.Tensor],
-        v_contexts: list[torch.Tensor],
-        block_size: int,
-        num_kv_heads: int,
-        head_size: int,
-        dtype: torch.dtype,
-        device: torch.device,
-        num_blocks: int,
-        common_attn_metadata: CommonAttentionMetadata,
-        randomize_blocks: bool = True) -> torch.Tensor:
-    """Create and prepopulate a KV cache with context data.
-    
-    Args:
-        k_contexts: List of key context tensors for each sequence
-        v_contexts: List of value context tensors for each sequence
-        seq_lens: List of sequence lengths
-        block_size: Size of each block
-        num_kv_heads: Number of KV heads
-        head_size: Size of each head
-        dtype: Data type for the cache
-        device: Device to create the cache on
-        num_blocks: Total number of blocks in the cache
-        block_table: Block table tensor to populate
-        randomize_blocks: Whether to randomly permute blocks 
-                          or use sequential order
-        
-    Returns:
-        Tuple of (kv_cache, updated_block_table)
-    """
-    batch_size = len(k_contexts)
-    seq_lens = common_attn_metadata.seq_lens_cpu
-    query_lens = common_attn_metadata.query_start_loc_cpu[
-        1:] - common_attn_metadata.query_start_loc_cpu[:-1]
-    context_lens = common_attn_metadata.num_computed_tokens_cpu
-    block_table = common_attn_metadata.block_table_tensor
-    slot_mapping = common_attn_metadata.slot_mapping
-
-    # Create KV cache
-    kv_cache = torch.empty(2,
-                           num_blocks,
-                           block_size,
-                           num_kv_heads,
-                           head_size,
-                           dtype=dtype,
-                           device=device)
-    kv_cache_flat = kv_cache.view(2, -1, num_kv_heads, head_size)
-
-    # Populate the cache with the context tokens
-    # Start from block_id=1 since block_id=0 is considered the null block
-    start_block_idx = 1
-    for i in range(batch_size):
-        k_context, v_context = k_contexts[i], v_contexts[i]
-        start = start_block_idx * block_size
-        end = start + k_context.shape[0]
-        kv_cache_flat[0, start:end, ...] = k_context
-        kv_cache_flat[1, start:end, ...] = v_context
-
-        # Stay block aligned and allocate enough blocks for the new tokens
-        start_block_idx += cdiv(int(seq_lens[i]), block_size)
-
-    blocks_end = start_block_idx
-
-    # Permute the context blocks (excluding block 0 which is null)
-    if randomize_blocks:
-        perm = torch.randperm(
-            blocks_end - 1) + 1  # Random permutation starting from block 1
-    else:
-        perm = torch.arange(
-            1, blocks_end)  # Sequential order starting from block 1
-
-    inv_perm = torch.zeros(blocks_end, dtype=torch.long, device=device)
-    inv_perm[1:] = torch.argsort(
-        perm) + 1  # Add 1 to account for starting from block 1
-    kv_cache[:, 1:blocks_end, ...] = kv_cache[:, perm, ...]
-
-    # Construct the right block table
-    # Start from block_id=1 since block_id=0 is considered the null block
-    start_block_idx = 1
-    for i in range(batch_size):
-        num_blocks_for_seq = cdiv(int(seq_lens[i]), block_size)
-        start = start_block_idx
-        end = start + num_blocks_for_seq
-        block_table[i, :num_blocks_for_seq] = inv_perm[start:end]
-        start_block_idx += num_blocks_for_seq
-
-        # Create a realistic slot mapping that corresponds to the block table
-    for i in range(batch_size):
-        token_offsets = torch.arange(int(query_lens[i])) + int(context_lens[i])
-        block_indices = token_offsets // block_size
-        token_inter_block_offsets = token_offsets % block_size
-        start = common_attn_metadata.query_start_loc_cpu[i]
-        end = common_attn_metadata.query_start_loc_cpu[i + 1]
-        slot_mapping[start:end] = block_table[
-            i,
-            block_indices] * block_size + token_inter_block_offsets.to(device)
-
-    return kv_cache
-
-
-class MockAttentionLayer:
-    """A mock attention layer for testing."""
-
-    def __init__(self, device: torch.device):
-        self._q_scale = torch.tensor(1.0, device=device)
-        self._k_scale = torch.tensor(1.0, device=device)
-        self._v_scale = torch.tensor(1.0, device=device)
-        # Add float versions for flashinfer
-        self._k_scale_float = 1.0
-        self._v_scale_float = 1.0
-
-
-def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec,
-                          vllm_config, device: torch.device,
-                          common_attn_metadata: CommonAttentionMetadata,
-                          query: torch.Tensor, key: torch.Tensor,
-                          value: torch.Tensor,
-                          kv_cache: torch.Tensor) -> torch.Tensor:
-    """Run attention computation using the specified backend's AttentionImpl."""
-
-    builder_cls, impl_cls = get_attention_backend(backend)
-
-    # Mock flashinfer's get_per_layer_parameters if needed
-    if backend == _Backend.FLASHINFER_VLLM_V1:
-        import unittest.mock
-
-        from vllm.v1.attention.backends.flashinfer import PerLayerParameters
-
-        def mock_get_per_layer_parameters(vllm_config):
-            # Return mock parameters for a single layer
-            head_size = vllm_config.model_config.get_head_size()
-            return {
-                "mock_layer":
-                PerLayerParameters(
-                    window_left=-1,  # No sliding window
-                    logits_soft_cap=0.0,  # No soft cap
-                    sm_scale=1.0 / (head_size**0.5)  # Standard scale
-                )
-            }
-
-        with unittest.mock.patch(
-                'vllm.v1.attention.backends.flashinfer.get_per_layer_parameters',
-                mock_get_per_layer_parameters):
-            builder = builder_cls(kv_cache_spec, vllm_config, device)
-            attn_metadata = builder.build(
-                common_prefix_len=0,
-                common_attn_metadata=common_attn_metadata,
-            )
-    else:
-        # Build metadata
-        builder = builder_cls(kv_cache_spec, vllm_config, device)
-        attn_metadata = builder.build(
-            common_prefix_len=0,
-            common_attn_metadata=common_attn_metadata,
-        )
-
-    # Instantiate implementation
-    num_heads = vllm_config.model_config.get_num_attention_heads(
-        vllm_config.parallel_config)
-    num_kv_heads = vllm_config.model_config.get_num_kv_heads(
-        vllm_config.parallel_config)
-    head_size = vllm_config.model_config.get_head_size()
-    scale = 1.0 / (head_size**0.5)
-    impl = impl_cls(
-        num_heads=num_heads,
-        head_size=head_size,
-        scale=scale,
-        num_kv_heads=num_kv_heads,
-        alibi_slopes=None,
-        sliding_window=None,
-        kv_cache_dtype="auto",
-    )
-
-    # Create mock layer and output buffer
-    mock_layer = MockAttentionLayer(device)
-    output = torch.empty_like(query)
-
-    # Run forward pass
-    # NOTE: The query, key, and value are already shaped correctly
-    # in the calling test function.
-    output = impl.forward(mock_layer,
-                          query,
-                          key,
-                          value,
-                          kv_cache,
-                          attn_metadata,
-                          output=output)
-
-    return output
-
-
-@pytest.mark.parametrize("batch_spec_name", [
-    "small_decode", "small_prefill", "mixed_small", "medium_decode",
-    "medium_prefill", "mixed_medium"
-])
-@pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"])
-def test_backend_correctness(batch_spec_name: str, model: str):
-    """
-    Test that all backends produce similar outputs to a reference implementation
-    using torch.nn.functional.scaled_dot_product_attention.
-
-    This test works by:
-    1. Generating a batch of sequences with specified context and query lengths.
-    2. Computing a ground-truth attention output using torch.sdpa on
-       contiguous Q, K, and V tensors.
-    3. Simulating vLLM's paged KV cache: It takes the context portion of the
-       K/V tensors and manually places them into a paged buffer according to
-       the test's (randomly generated) block table.
-    4. Running each vLLM attention backend with the new queries and the
-       simulated paged KV cache.
-    5. Comparing the vLLM backend's output to the ground-truth SDPA output.
-    """
-    batch_spec = BATCH_SPECS[batch_spec_name]
-    vllm_config = create_vllm_config(model_name=model)
-    device = torch.device("cuda:0")
-
-    kv_cache_spec = create_standard_kv_cache_spec(vllm_config)
-
-    # 1. Setup
-    batch_size = batch_spec.batch_size
-    seq_lens = batch_spec.seq_lens
-    query_lens = batch_spec.query_lens
-    num_q_heads = vllm_config.model_config.get_num_attention_heads(
-        vllm_config.parallel_config)
-    num_kv_heads = vllm_config.model_config.get_num_kv_heads(
-        vllm_config.parallel_config)
-    head_size = vllm_config.model_config.get_head_size()
-    dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype)
-    block_size = vllm_config.cache_config.block_size
-    scale = 1.0 / (head_size**0.5)
-
-    # 2. Generate data and compute SDPA reference output
-    all_q_vllm, all_k_vllm, all_v_vllm = [], [], []
-    all_sdpa_outputs = []
-    k_contexts, v_contexts = [], []
-
-    for i in range(batch_size):
-        s_len = seq_lens[i]
-        q_len = query_lens[i]
-        context_len = s_len - q_len
-
-        # Generate Q, K, V for the whole sequence to be used in SDPA
-        q = torch.randn(q_len,
-                        num_q_heads,
-                        head_size,
-                        dtype=dtype,
-                        device=device)
-        k_full = torch.randn(s_len,
-                             num_kv_heads,
-                             head_size,
-                             dtype=dtype,
-                             device=device)
-        v_full = torch.randn(s_len,
-                             num_kv_heads,
-                             head_size,
-                             dtype=dtype,
-                             device=device)
-
-        # SDPA expects (N, H, L, D), so unsqueeze batch and permute
-        q_sdpa_in = q.unsqueeze(0).transpose(1, 2)
-        k_sdpa_in = k_full.unsqueeze(0).transpose(1, 2)
-        v_sdpa_in = v_full.unsqueeze(0).transpose(1, 2)
-
-        if num_q_heads != num_kv_heads:
-            assert num_q_heads % num_kv_heads == 0, (
-                f"num_q_heads ({num_q_heads}) must be divisible by "
-                f"num_kv_heads ({num_kv_heads})")
-            repeats = num_q_heads // num_kv_heads
-            k_sdpa_in = k_sdpa_in.repeat_interleave(repeats, dim=1)
-            v_sdpa_in = v_sdpa_in.repeat_interleave(repeats, dim=1)
-
-        # Create causal mask: query token i attends to positions 0 to
-        #  (context_len + i)
-        kv_len = s_len
-        offset = context_len
-        attn_mask = torch.full((q_len, kv_len),
-                               float('-inf'),
-                               device=device,
-                               dtype=dtype)
-        for i in range(q_len):
-            attn_mask[i, :offset + i + 1] = 0.0
-
-        sdpa_out_i = torch.nn.functional.scaled_dot_product_attention(
-            q_sdpa_in,
-            k_sdpa_in,
-            v_sdpa_in,
-            attn_mask=attn_mask,
-            scale=scale,
-            enable_gqa=True)
-        # Convert back to (L, H, D)
-        all_sdpa_outputs.append(sdpa_out_i.transpose(1, 2).squeeze(0))
-
-        # Inputs for vLLM backends are just the new tokens
-        all_q_vllm.append(q)
-        all_k_vllm.append(k_full[context_len:])
-        all_v_vllm.append(v_full[context_len:])
-
-        # Contextual K/V data used to populate the paged cache
-        k_contexts.append(k_full[:context_len])
-        v_contexts.append(v_full[:context_len])
-
-    query_vllm = torch.cat(all_q_vllm, dim=0)
-    key_vllm = torch.cat(all_k_vllm, dim=0)
-    value_vllm = torch.cat(all_v_vllm, dim=0)
-    sdpa_output = torch.cat(all_sdpa_outputs, dim=0)
-
-    common_attn_metadata = create_common_attn_metadata(
-        batch_spec, vllm_config.cache_config.block_size, device)
-
-    # 3. Simulate Paged KV Cache and a realistic slot_mapping
-    kv_cache = create_and_prepopulate_kv_cache(
-        k_contexts=k_contexts,
-        v_contexts=v_contexts,
-        block_size=block_size,
-        num_kv_heads=num_kv_heads,
-        head_size=head_size,
-        dtype=dtype,
-        device=device,
-        num_blocks=vllm_config.cache_config.num_gpu_blocks or 1000,
-        common_attn_metadata=common_attn_metadata,
-        randomize_blocks=True)
-
-    # 4. Run vLLM backends and compare
-    # Note: flex_attention has known Triton kernel compatibility issues
-    # with test infrastructures
-    for backend_name in BACKENDS_TO_TEST:
-        # FlashAttentionm + FlexAttention:
-        #   [2, num_blocks, block_size, num_kv_heads, head_size]
-        # FlashInfer:
-        #   [num_blocks, 2, block_size, num_kv_heads, head_size]
-        # Select the appropriate KV cache format for each backend
-        kv_cache_for_backend = kv_cache
-        if backend_name == _Backend.FLASHINFER_VLLM_V1:
-            kv_cache_for_backend = kv_cache.transpose(0, 1)
-
-        backend_output = run_attention_backend(backend_name, kv_cache_spec,
-                                               vllm_config, device,
-                                               common_attn_metadata,
-                                               query_vllm, key_vllm,
-                                               value_vllm,
-                                               kv_cache_for_backend)
-
-        # Check shape and dtype consistency
-        assert backend_output.shape == sdpa_output.shape, (
-            f"[{backend_name}] shape {backend_output.shape} != "
-            f"SDPA shape {sdpa_output.shape}")
-        assert backend_output.dtype == sdpa_output.dtype, (
-            f"[{backend_name}] dtype {backend_output.dtype} != "
-            f"SDPA dtype {sdpa_output.dtype}")
-
-        assert torch.isfinite(backend_output).all(), (
-            f"[{backend_name}] produced non-finite values")
-
-        # Check numerical similarity
-        rtol = 1e-2
-        atol = 5e-3
-
-        if backend_name == _Backend.FLEX_ATTENTION:
-            atol = 5e-1  # TODO: figure out why flex_attention has such large
-            # numerical differences for medium_decode, medium_prefill,
-            # mixed_medium
-
-        max_diff = torch.max(torch.abs(backend_output - sdpa_output)).item()
-        max_rel_diff = torch.max(
-            torch.abs(backend_output - sdpa_output) /
-            torch.abs(sdpa_output)).item()
-        all_close = torch.allclose(backend_output,
-                                   sdpa_output,
-                                   rtol=rtol,
-                                   atol=atol)
-
-        if not all_close:
-            print(f"[{backend_name}] output differs from SDPA baseline. "
-                  f"Max diff: {max_diff:.6f} (rel: {max_rel_diff:.6f})")
-            print(f"[{backend_name}] output: {backend_output}")
-            print(f"[{backend_name}] SDPA baseline: {sdpa_output}")
-
-        assert all_close, (
-            f"[{backend_name}] output differs from SDPA baseline. "
-            f"Max diff: {max_diff:.6f} (rel: {max_rel_diff:.6f})")
--- a/tests/v1/attention/utils.py
+++ b/tests/v1/attention/utils.py
@ -1,229 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Utility functions for attention-related v1 tests."""
-
-from dataclasses import dataclass
-from typing import Union
-
-import pytest
-import torch
-
-from vllm.config import (CacheConfig, CompilationConfig, DeviceConfig,
-                         LoadConfig, ModelConfig, ModelDType, ParallelConfig,
-                         SchedulerConfig, VllmConfig)
-from vllm.platforms import _Backend
-from vllm.utils import resolve_obj_by_qualname
-from vllm.v1.attention.backends.utils import CommonAttentionMetadata
-from vllm.v1.kv_cache_interface import FullAttentionSpec
-
-
-@dataclass
-class BatchSpec:
-    """Specification for a batch configuration (workload shape only)."""
-    seq_lens: list[int]
-    query_lens: list[int]
-
-    name: str = "unnamed"
-
-    @property
-    def batch_size(self):
-        return len(self.seq_lens)
-
-    def __post_init__(self):
-        assert len(self.seq_lens) == len(self.query_lens)
-
-    def compute_num_tokens(self):
-        return sum(self.query_lens)
-
-
-def create_common_attn_metadata(
-        batch_spec: BatchSpec,
-        block_size: int,
-        device: torch.device,
-        max_block_idx: int = 1000) -> CommonAttentionMetadata:
-    """Create CommonAttentionMetadata from a BatchSpec and ModelParams."""
-    # Create query start locations
-    query_start_loc = torch.zeros(batch_spec.batch_size + 1,
-                                  dtype=torch.int32,
-                                  device=device)
-    query_start_loc[1:] = torch.tensor(batch_spec.query_lens,
-                                       dtype=torch.int32,
-                                       device=device).cumsum(0)
-    query_start_loc_cpu = query_start_loc.cpu()
-    num_tokens = batch_spec.compute_num_tokens()
-
-    # Create sequence lengths
-    seq_lens = torch.tensor(batch_spec.seq_lens,
-                            dtype=torch.int32,
-                            device=device)
-    seq_lens_cpu = seq_lens.cpu()
-
-    # Create computed tokens (context length for each sequence)
-    context_lens = [
-        batch_spec.seq_lens[i] - batch_spec.query_lens[i]
-        for i in range(batch_spec.batch_size)
-    ]
-    num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)
-
-    # Create block table (random for testing)
-    max_blocks = max(batch_spec.seq_lens) // block_size + 1
-    block_table_tensor = torch.randint(0,
-                                       max_block_idx,
-                                       (batch_spec.batch_size, max_blocks),
-                                       dtype=torch.int32,
-                                       device=device)
-
-    # Create slot mapping
-    slot_mapping = torch.randint(0,
-                                 max_block_idx, (num_tokens, ),
-                                 dtype=torch.int64,
-                                 device=device)
-
-    # Calculate max query length
-    max_query_len = max(batch_spec.query_lens)
-
-    return CommonAttentionMetadata(
-        query_start_loc=query_start_loc,
-        query_start_loc_cpu=query_start_loc_cpu,
-        seq_lens=seq_lens,
-        seq_lens_cpu=seq_lens_cpu,
-        num_computed_tokens_cpu=num_computed_tokens_cpu,
-        num_reqs=batch_spec.batch_size,
-        num_actual_tokens=num_tokens,
-        max_query_len=max_query_len,
-        block_table_tensor=block_table_tensor,
-        slot_mapping=slot_mapping,
-    )
-
-
-def get_attention_backend(backend_name: _Backend):
-    """Set up attention backend classes for testing.
-    
-    Args:
-        backend_name: Name of the backend ("flash_attn", "flashinfer", etc.)
-        vllm_config: VllmConfig instance
-        
-    Returns:
-        Tuple of (backend_builder_class, backend_impl_class)
-    """
-    backend_map = {
-        _Backend.FLASH_ATTN_VLLM_V1:
-        "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend",
-        _Backend.FLASHINFER_VLLM_V1:
-        "vllm.v1.attention.backends.flashinfer.FlashInferBackend",
-        _Backend.FLEX_ATTENTION:
-        "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend",
-        _Backend.TRITON_ATTN_VLLM_V1:
-        "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend",
-    }
-
-    if backend_name not in backend_map:
-        raise ValueError(f"Unknown backend: {backend_name}")
-
-    backend_class_name = backend_map[backend_name]
-
-    try:
-        backend_class = resolve_obj_by_qualname(backend_class_name)
-        return backend_class.get_builder_cls(), backend_class.get_impl_cls()
-    except ImportError as e:
-        pytest.skip(f"{backend_name} not available: {e}")
-
-
-def create_standard_kv_cache_spec(
-        vllm_config: VllmConfig) -> FullAttentionSpec:
-    """Create a FullAttentionSpec from ModelParams only."""
-    return FullAttentionSpec(
-        block_size=vllm_config.cache_config.block_size,
-        num_kv_heads=vllm_config.model_config.get_num_kv_heads(
-            vllm_config.parallel_config),
-        head_size=vllm_config.model_config.get_head_size(),
-        dtype=vllm_config.model_config.dtype,
-        use_mla=vllm_config.model_config.use_mla,
-        sliding_window=vllm_config.model_config.get_sliding_window(),
-    )
-
-
-def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B",
-                       tensor_parallel_size: int = 1,
-                       max_model_len: int = 1024,
-                       dtype: Union[ModelDType, torch.dtype] = "auto",
-                       block_size: int = 16,
-                       max_num_seqs: int = 256,
-                       max_num_batched_tokens: int = 8192,
-                       add_mock_model_methods: bool = True) -> VllmConfig:
-    """Create a VllmConfig for testing with reasonable defaults."""
-
-    model_config = ModelConfig(
-        model=model_name,
-        tokenizer=model_name,
-        trust_remote_code=False,
-        dtype=dtype,
-        seed=0,
-        max_model_len=max_model_len,
-    )
-
-    cache_config = CacheConfig(
-        block_size=block_size,
-        cache_dtype="auto",
-        swap_space=0,
-    )
-    # Set cache blocks for testing
-    #   (these may be set during initialization normally)
-    cache_config.num_gpu_blocks = 1000
-    cache_config.num_cpu_blocks = 0
-
-    parallel_config = ParallelConfig(
-        tensor_parallel_size=tensor_parallel_size, )
-
-    scheduler_config = SchedulerConfig(
-        max_num_seqs=max_num_seqs,
-        max_num_batched_tokens=max_num_batched_tokens,
-    )
-
-    device_config = DeviceConfig()
-    load_config = LoadConfig()
-    compilation_config = CompilationConfig()
-
-    if add_mock_model_methods:
-        # Add mock methods to satisfy backends that need them
-        # This is a workaround because tests don't build full, real models,
-        # but some backends expect to query the model for layer-specific
-        # parameters
-        import types
-        model_config.get_num_layers = types.MethodType(lambda self: 1,
-                                                       model_config)
-        model_config.get_sliding_window_for_layer = types.MethodType(
-            lambda self, i: None, model_config)
-        model_config.get_logits_soft_cap_for_layer = types.MethodType(
-            lambda self, i: 0.0, model_config)
-        model_config.get_sm_scale_for_layer = types.MethodType(
-            lambda self, i: 1.0 / model_config.get_head_size()**0.5,
-            model_config)
-
-    return VllmConfig(
-        model_config=model_config,
-        cache_config=cache_config,
-        parallel_config=parallel_config,
-        scheduler_config=scheduler_config,
-        device_config=device_config,
-        load_config=load_config,
-        compilation_config=compilation_config,
-    )
-
-
-def create_dummy_kv_cache(block_size: int,
-                          num_kv_heads: int,
-                          head_size: int,
-                          dtype: torch.dtype,
-                          device: torch.device,
-                          num_blocks: int = 100) -> torch.Tensor:
-    """Create a dummy KV cache tensor for testing."""
-    kv_cache = torch.randn(
-        num_blocks,
-        2,  # K and V
-        block_size,
-        num_kv_heads,
-        head_size,
-        dtype=dtype,
-        device=device)
-    return kv_cache
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@ -6,10 +6,8 @@ import random
 from typing import Any

 import pytest
-import torch

 from vllm import LLM, SamplingParams
-from vllm.distributed import cleanup_dist_env_and_memory


@pytest.fixture
@ -55,6 +53,14 @@ def model_name():
    return "meta-llama/Llama-3.1-8B-Instruct"


+def eagle_model_name():
+    return "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
+
+
+def eagle3_model_name():
+    return "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
+
+
 def test_ngram_correctness(
    monkeypatch: pytest.MonkeyPatch,
    test_prompts: list[list[dict[str, Any]]],
@ -71,8 +77,6 @@ def test_ngram_correctness(
        ref_llm = LLM(model=model_name, max_model_len=1024)
        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
        del ref_llm
-        torch.cuda.empty_cache()
-        cleanup_dist_env_and_memory()

        spec_llm = LLM(
            model=model_name,
@ -99,50 +103,34 @@ def test_ngram_correctness(
        # Upon failure, inspect the outputs to check for inaccuracy.
        assert matches > int(0.7 * len(ref_outputs))
        del spec_llm
-        torch.cuda.empty_cache()
-        cleanup_dist_env_and_memory()


-@pytest.mark.parametrize("model_setup", [
-    ("eagle", "meta-llama/Llama-3.1-8B-Instruct",
-     "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1),
-    ("eagle3", "meta-llama/Llama-3.1-8B-Instruct",
-     "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1),
-    pytest.param(
-        ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-         "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
-        marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
-],
-                         ids=["llama3_eagle", "llama3_eagle3", "llama4_eagle"])
+@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
 def test_eagle_correctness(
    monkeypatch: pytest.MonkeyPatch,
    test_prompts: list[list[dict[str, Any]]],
    sampling_config: SamplingParams,
-    model_setup: tuple[str, str, str, int],
+    model_name: str,
+    use_eagle3: bool,
 ):
    '''
    Compare the outputs of a original LLM and a speculative LLM
    should be the same when using eagle speculative decoding.
-    model_setup: (method, model_name, eagle_model_name, tp_size)
    '''
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
-        method, model_name, spec_model_name, tp_size = model_setup

-        ref_llm = LLM(model=model_name,
-                      max_model_len=2048,
-                      tensor_parallel_size=tp_size)
+        ref_llm = LLM(model=model_name, max_model_len=2048)
        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
        del ref_llm
-        torch.cuda.empty_cache()
-        cleanup_dist_env_and_memory()

+        spec_model_name = eagle3_model_name(
+        ) if use_eagle3 else eagle_model_name()
        spec_llm = LLM(
            model=model_name,
            trust_remote_code=True,
-            tensor_parallel_size=tp_size,
            speculative_config={
-                "method": method,
+                "method": "eagle3" if use_eagle3 else "eagle",
                "model": spec_model_name,
                "num_speculative_tokens": 3,
                "max_model_len": 2048,
@ -164,5 +152,3 @@ def test_eagle_correctness(
        # Upon failure, inspect the outputs to check for inaccuracy.
        assert matches > int(0.66 * len(ref_outputs))
        del spec_llm
-        torch.cuda.empty_cache()
-        cleanup_dist_env_and_memory()
--- a/tests/v1/entrypoints/openai/responses/test_image.py
+++ b/tests/v1/entrypoints/openai/responses/test_image.py
@ -1,166 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import json
-
-import openai
-import pytest
-import pytest_asyncio
-
-from tests.utils import RemoteOpenAIServer
-from vllm.multimodal.utils import encode_image_base64, fetch_image
-
-# Use a small vision model for testing
-MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
-MAXIMUM_IMAGES = 2
-# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
-TEST_IMAGE_URLS = [
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
-    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
-]
-
-
-@pytest.fixture(scope="module")
-def default_image_server_args():
-    return [
-        "--enforce-eager",
-        "--max-model-len",
-        "6000",
-        "--max-num-seqs",
-        "128",
-        "--limit-mm-per-prompt",
-        json.dumps({"image": MAXIMUM_IMAGES}),
-    ]
-
-
-@pytest.fixture(scope="module")
-def image_server(default_image_server_args):
-    with RemoteOpenAIServer(MODEL_NAME,
-                            default_image_server_args) as remote_server:
-        yield remote_server
-
-
-@pytest_asyncio.fixture
-async def client(image_server):
-    async with image_server.get_async_client() as async_client:
-        yield async_client
-
-
-@pytest.fixture(scope="session")
-def base64_encoded_image() -> dict[str, str]:
-    return {
-        image_url: encode_image_base64(fetch_image(image_url))
-        for image_url in TEST_IMAGE_URLS
-    }
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
-async def test_single_chat_session_image(client: openai.AsyncOpenAI,
-                                         model_name: str, image_url: str):
-    content_text = "What's in this image?"
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "input_image",
-                "image_url": image_url,
-                "detail": "auto",
-            },
-            {
-                "type": "input_text",
-                "text": content_text
-            },
-        ],
-    }]
-
-    # test image url
-    response = await client.responses.create(
-        model=model_name,
-        input=messages,
-    )
-    assert len(response.output_text) > 0
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
-async def test_single_chat_session_image_base64encoded(
-    client: openai.AsyncOpenAI,
-    model_name: str,
-    image_url: str,
-    base64_encoded_image: dict[str, str],
-):
-    content_text = "What's in this image?"
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "input_image",
-                "image_url":
-                f"data:image/jpeg;base64,{base64_encoded_image[image_url]}",
-                "detail": "auto",
-            },
-            {
-                "type": "input_text",
-                "text": content_text
-            },
-        ],
-    }]
-    # test image base64
-    response = await client.responses.create(
-        model=model_name,
-        input=messages,
-    )
-    assert len(response.output_text) > 0
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize(
-    "image_urls",
-    [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
-async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
-                                 image_urls: list[str]):
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            *({
-                "type": "input_image",
-                "image_url": image_url,
-                "detail": "auto",
-            } for image_url in image_urls),
-            {
-                "type": "input_text",
-                "text": "What's in this image?"
-            },
-        ],
-    }]
-
-    if len(image_urls) > MAXIMUM_IMAGES:
-        with pytest.raises(openai.BadRequestError):  # test multi-image input
-            await client.responses.create(
-                model=model_name,
-                input=messages,
-            )
-        # the server should still work afterwards
-        response = await client.responses.create(
-            model=model_name,
-            input=[{
-                "role": "user",
-                "content": "What's the weather like in Paris today?",
-            }],
-        )
-        assert len(response.output_text) > 0
-    else:
-        response = await client.responses.create(
-            model=model_name,
-            input=messages,
-        )
-        assert len(response.output_text) > 0
--- a/tests/v1/entrypoints/openai/test_completion.py
+++ b/tests/v1/entrypoints/openai/test_completion.py
@ -7,7 +7,6 @@ import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
 import regex as re
-import requests
 from openai import BadRequestError

 from tests.utils import RemoteOpenAIServer
@ -27,8 +26,7 @@ def default_server_args():
        "2048",
        "--max-num-seqs",
        "128",
-        "--enforce-eager",
-        "--enable-prompt-tokens-details",
+        "--enforce-eager"
    ]


@ -681,17 +679,3 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
            prompt=prompt,
            extra_body={"guided_grammar": invalid_simplified_sql_grammar},
        )
-
-
-@pytest.mark.asyncio
-async def test_completion_with_empty_prompt_embeds(
-        client: openai.AsyncOpenAI) -> None:
-    """Test completion with empty prompt embeds."""
-    payload: dict[str, list] = {"prompt_embeds": []}
-    headers: dict[str, str] = {"Content-Type": "application/json"}
-    # base_url = http://localhost:8000/v1/completions
-    response = requests.post(f"{client.base_url}completions",
-                             headers=headers,
-                             json=payload)
-    assert response.status_code == 200, (
-        f"Expected status code 200, got {response.status_code}. ")
--- a/tests/v1/executor/test_multiproc_executor.py
+++ b/tests/v1/executor/test_multiproc_executor.py
@ -1,127 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import threading
-from collections import defaultdict
-from concurrent.futures import Future
-from typing import Optional
-
-from vllm.v1.executor.multiproc_executor import MultiprocExecutor
-from vllm.v1.outputs import ModelRunnerOutput
-
-
-class DummyMultiprocExecutor(MultiprocExecutor):
-
-    def __init__(self, output_rank, world_size):
-        # Manually initialize minimal required fields
-        self.output_rank = output_rank
-        self.world_size = world_size
-        self._send_remaining_count = defaultdict[str,
-                                                 int](lambda: self.world_size)
-        self._recv_remaining_count = defaultdict[str,
-                                                 int](lambda: self.world_size)
-        self.io_thread_pool = None
-        self.shutdown_event = threading.Event()
-
-
-class DummyModelRunnerOutput(ModelRunnerOutput):
-
-    def __init__(self,
-                 finished_sending: Optional[set[str]] = None,
-                 finished_recving: Optional[set[str]] = None):
-        self.finished_sending = finished_sending
-        self.finished_recving = finished_recving
-
-
-def test_aggregate_workers_output():
-    executor = DummyMultiprocExecutor(output_rank=0, world_size=2)
-
-    output1 = DummyModelRunnerOutput(finished_sending={'req1'},
-                                     finished_recving={'req2'})
-    output2 = DummyModelRunnerOutput(finished_sending=None,
-                                     finished_recving=None)
-
-    aggregated = executor._aggregate_workers_output([output1, output2])
-
-    assert aggregated is output1
-    assert aggregated.finished_sending is None
-    assert aggregated.finished_recving is None
-
-    output1 = DummyModelRunnerOutput(finished_sending=None,
-                                     finished_recving=None)
-    output2 = DummyModelRunnerOutput(finished_sending={'req1'},
-                                     finished_recving=None)
-
-    aggregated = executor._aggregate_workers_output([output1, output2])
-
-    assert aggregated is output1
-    assert aggregated.finished_sending == {'req1'}
-    assert aggregated.finished_recving is None
-
-    output1 = DummyModelRunnerOutput(finished_sending=None,
-                                     finished_recving=None)
-    output2 = DummyModelRunnerOutput(finished_sending={'req1'},
-                                     finished_recving={'req2'})
-
-    aggregated = executor._aggregate_workers_output([output1, output2])
-
-    assert aggregated is output1
-    assert aggregated.finished_sending is None
-    assert aggregated.finished_recving == {'req2'}
-
-
-def test_async_aggregate_workers_output():
-    executor = DummyMultiprocExecutor(output_rank=0, world_size=2)
-
-    future1: Future[DummyModelRunnerOutput] = Future()
-    future2: Future[DummyModelRunnerOutput] = Future()
-    result_future = executor._async_aggregate_workers_output(
-        [future1, future2])
-
-    output1 = DummyModelRunnerOutput(finished_sending={'req1'},
-                                     finished_recving={'req2'})
-    output2 = DummyModelRunnerOutput(finished_sending=None,
-                                     finished_recving=None)
-    future1.set_result(output1)
-    future2.set_result(output2)
-
-    assert result_future.done()
-    aggregated = result_future.result()
-    assert aggregated is output1
-    assert aggregated.finished_sending is None
-    assert aggregated.finished_recving is None
-
-    future1 = Future()
-    future2 = Future()
-    result_future = executor._async_aggregate_workers_output(
-        [future1, future2])
-
-    output1 = DummyModelRunnerOutput(finished_sending=None,
-                                     finished_recving=None)
-    output2 = DummyModelRunnerOutput(finished_sending={'req1'},
-                                     finished_recving=None)
-    future1.set_result(output1)
-    future2.set_result(output2)
-
-    assert result_future.done()
-    aggregated = result_future.result()
-    assert aggregated is output1
-    assert aggregated.finished_sending == {'req1'}
-    assert aggregated.finished_recving is None
-
-    future1 = Future()
-    future2 = Future()
-    result_future = executor._async_aggregate_workers_output(
-        [future1, future2])
-
-    output1 = DummyModelRunnerOutput(finished_sending=None,
-                                     finished_recving=None)
-    output2 = DummyModelRunnerOutput(finished_sending={'req1'},
-                                     finished_recving={'req2'})
-    future1.set_result(output1)
-    future2.set_result(output2)
-
-    assert result_future.done()
-    aggregated = result_future.result()
-    assert aggregated is output1
-    assert aggregated.finished_sending is None
-    assert aggregated.finished_recving == {'req2'}
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@ -6,10 +6,6 @@ from unittest import mock
 import pytest
 import torch

-from tests.v1.attention.utils import (BatchSpec, _Backend,
-                                      create_common_attn_metadata,
-                                      create_standard_kv_cache_spec,
-                                      get_attention_backend)
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
                         ParallelConfig, SchedulerConfig, SpeculativeConfig,
                         VllmConfig)
@ -68,19 +64,13 @@ def test_prepare_inputs():
    """
    device = torch.device(current_platform.device_type)

-    # q1 = 4, q2 = 7, q3 = 5
+    # a = 4, b = 7, c = 5
    # n1 = 1, n2 = 3, n3 = 2

-    batch_spec = BatchSpec(
-        seq_lens=[4, 7, 5],
-        query_lens=[4, 7, 5],
-    )
-
-    common_attn_metadata = create_common_attn_metadata(
-        batch_spec,
-        block_size=16,
-        device=device,
-    )
+    # Cumulative lengths: [0, 4, 11, 16]
+    cu_target_query_lens = torch.tensor([0, 4, 11, 16],
+                                        dtype=torch.int32,
+                                        device=device)

    # Rejected tokens per request: [1, 3, 2]
    num_rejected_tokens = torch.tensor([1, 3, 2],
@ -114,13 +104,15 @@ def test_prepare_inputs():
        ],
        dtype=torch.int32,
        device=device)
-    proposer = _create_proposer("eagle", 1)

-    updated_metadata, token_indices = proposer.prepare_inputs(
-        common_attn_metadata, num_rejected_tokens.cpu())
+    # a + b + c - n1 - n2 - n3
+    num_tokens = cu_target_query_lens[-1].item() - num_rejected_tokens.sum(
+    ).item()

-    assert torch.equal(updated_metadata.query_start_loc,
-                       expected_cu_num_tokens)
+    cu_num_tokens, token_indices = EagleProposer.prepare_inputs(
+        cu_target_query_lens, num_rejected_tokens, num_tokens)
+
+    assert torch.equal(cu_num_tokens, expected_cu_num_tokens)
    assert token_indices.shape[0] == expected_cu_num_tokens[-1].item()
    assert torch.equal(token_indices, expected_token_indices)

@ -217,7 +209,6 @@ def test_propose(num_speculative_tokens):
    seq_len_2 = 3
    total_tokens = seq_len_1 + seq_len_2
    vocab_size = 100
-    seq_lens = [seq_len_1, seq_len_2]

    # Create proposer first so we can use its actual hidden_size
    proposer = _create_proposer("eagle", num_speculative_tokens)
@ -279,16 +270,9 @@ def test_propose(num_speculative_tokens):
    proposer.attn_layer_names = ["layer.0"]

    # Create input tensors
-    batch_spec = BatchSpec(
-        seq_lens=seq_lens,
-        query_lens=seq_lens,
-    )
-
-    common_attn_metadata = create_common_attn_metadata(
-        batch_spec,
-        block_size=16,
-        device=device,
-    )
+    cu_num_tokens = torch.tensor([0, seq_len_1, total_tokens],
+                                 dtype=torch.int32,
+                                 device=device)

    target_token_ids = torch.randint(0,
                                     vocab_size, (total_tokens, ),
@ -300,29 +284,25 @@ def test_propose(num_speculative_tokens):
    target_hidden_states = torch.randn(total_tokens,
                                       hidden_size,
                                       device=device)
+    target_slot_mapping = torch.randint(0,
+                                        100, (total_tokens, ),
+                                        device=device)
    next_token_ids = torch.randint(0,
                                   vocab_size, (batch_size, ),
                                   dtype=torch.int32,
                                   device=device)
+    block_table = torch.randint(0, 10, (batch_size, 10), device=device)
+
    sampling_metadata = mock.MagicMock()

-    attn_metadata_builder_cls, _ = get_attention_backend(
-        _Backend.FLASH_ATTN_VLLM_V1)
-    attn_metadata_builder = attn_metadata_builder_cls(
-        kv_cache_spec=create_standard_kv_cache_spec(proposer.vllm_config),
-        vllm_config=proposer.vllm_config,
-        device=device,
-    )
-
-    # Mock runner for attention metadata building
-    proposer.runner = mock.MagicMock()
-    proposer.runner.attn_metadata_builders = [attn_metadata_builder]
-
+    # Call the method under test
    result = proposer.propose(target_token_ids=target_token_ids,
                              target_positions=target_positions,
                              target_hidden_states=target_hidden_states,
+                              target_slot_mapping=target_slot_mapping,
                              next_token_ids=next_token_ids,
-                              common_attn_metadata=common_attn_metadata,
+                              cu_num_tokens=cu_num_tokens,
+                              block_table=block_table,
                              sampling_metadata=sampling_metadata)

    assert result.shape == (batch_size, num_speculative_tokens)
--- a/Show More
+++ b/Show More