Update Dockerfile

Signed-off-by: Kevin H. Luu <kevin@anyscale.com>
[CI] Enable all hf transformers baselines in test_hybrid (#23936 )
2025-09-02 13:22:11 -07:00 · 2025-09-02 20:15:06 +00:00 · 2025-09-02 18:53:34 +00:00 · 2025-09-02 18:34:28 +00:00 · 2025-09-02 18:10:10 +00:00 · 2025-09-02 10:56:31 -07:00
271 changed files with 8272 additions and 3403 deletions
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -25,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2

 function cpu_tests() {
  set -e
@ -89,17 +89,33 @@ function cpu_tests() {
    pytest -x -s -v \
    tests/lora/test_qwen2vl.py"

-  # online serving
+  # online serving: tp+pp
  docker exec cpu-test-"$NUMA_NODE" bash -c '
    set -e
    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
+    server_pid=$!
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
    vllm bench serve \
      --backend vllm \
      --dataset-name random \
      --model meta-llama/Llama-3.2-3B-Instruct \
      --num-prompts 20 \
-      --endpoint /v1/completions'
+      --endpoint /v1/completions
+    kill -s SIGTERM $server_pid &'
+
+  # online serving: tp+dp
+  docker exec cpu-test-"$NUMA_NODE" bash -c '
+    set -e
+    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
+    server_pid=$!
+    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+    vllm bench serve \
+      --backend vllm \
+      --dataset-name random \
+      --model meta-llama/Llama-3.2-3B-Instruct \
+      --num-prompts 20 \
+      --endpoint /v1/completions
+    kill -s SIGTERM $server_pid &'
 }

 # All of CPU tests are expected to be finished less than 40 mins.
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -462,8 +462,8 @@ steps:
  - tests/quantization
  commands:
  # temporary install here since we need nightly, will move to requirements/test.in
-  # after torchao 0.12 release
-  - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+  # after torchao 0.12 release, and pin a working version of torchao nightly here
+  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

 - label: LM Eval Small Models # 53min
@ -566,8 +566,7 @@ steps:
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
-    - pytest -v -s models/multimodal/processing/test_tensor_schema.py
+    - pytest -v -s models/multimodal/processing

 - label: Multi-Modal Models Test (Standard)
  mirror_hardwares: [amdexperimental]
@ -770,6 +769,11 @@ steps:
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
  # end platform plugin tests
+  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+  - pip install -e ./plugins/prithvi_io_processor_plugin
+  - pytest -v -s plugins_tests/test_io_processor_plugins.py
+  - pip uninstall prithvi_io_processor_plugin -y 
+  # end io_processor plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@ -49,6 +49,10 @@ jobs:
                    term: "VLLM_ROCM_",
                    searchIn: "both"
                  },
+                  {
+                    term: "aiter",
+                    searchIn: "title"
+                  },
                  {
                    term: "rocm",
                    searchIn: "title"
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -45,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")

 #
 # Try to find python package with an executable that exactly matches
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -110,7 +110,12 @@ become available.

 🚧: to be supported

-**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
+**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`.
+For local `dataset-path`, please set `hf-name` to its Hugging Face ID like
+
+```bash
+--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
+```

 ## 🚀 Example - Online Benchmark

--- a/benchmarks/kernels/bench_block_fp8_gemm.py
+++ b/benchmarks/kernels/bench_block_fp8_gemm.py
@ -16,6 +16,7 @@ assert current_platform.is_cuda(), (
 # DeepSeek-V3 weight shapes
 DEEPSEEK_V3_SHAPES = [
    (512 + 64, 7168),
+    (2112, 7168),
    ((128 + 64) * 128, 7168),
    (128 * (128 + 128), 512),
    (7168, 16384),
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@ -141,6 +141,7 @@ def get_weight_shapes(tp_size):
    # cannot TP
    total = [
        (512 + 64, 7168),
+        (2112, 7168),
        ((128 + 64) * 128, 7168),
        (128 * (128 + 128), 512),
        (7168, 16384),
--- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
+++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
@ -27,11 +27,12 @@

 template<int kNThreads_, int kNItems_, int kNRows_, bool kIsEvenLen_,
         bool kIsVariableB_, bool kIsVariableC_,
-         bool kHasZ_, bool kVarlen_, typename input_t_, typename weight_t_>
+         bool kHasZ_, bool kVarlen_, typename input_t_, typename weight_t_, typename state_t_>
 struct Selective_Scan_fwd_kernel_traits {
    static_assert(kNItems_ % 4 == 0);
    using input_t = input_t_;
    using weight_t = weight_t_;
+    using state_t = state_t_;
    static constexpr int kNThreads = kNThreads_;
    // Setting MinBlocksPerMP to be 3 (instead of 2) for 128 threads improves occupancy.
    static constexpr int kMinBlocks = kNThreads < 128 ? 5 : 3;
@ -132,7 +133,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
    input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride;
    weight_t *C = reinterpret_cast<weight_t *>(params.C_ptr) + dim_id * kNRows * params.C_d_stride;
    input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride;
-    input_t *ssm_states = reinterpret_cast<input_t *>(params.ssm_states_ptr) + 
+    typename Ktraits::state_t *ssm_states = reinterpret_cast<typename Ktraits::state_t *>(params.ssm_states_ptr) + 
    cache_index * params.ssm_states_batch_stride + 
    dim_id * kNRows * params.ssm_states_dim_stride;
    
@ -261,7 +262,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                if (threadIdx.x == 0) {
                    smem_running_prefix[state_idx] = prefix_op.running_prefix;
                    if (chunk == n_chunks - 1) {
-                        ssm_states[state_idx * params.ssm_states_dstate_stride] = input_t(prefix_op.running_prefix.y);
+                        ssm_states[state_idx * params.ssm_states_dstate_stride] = typename Ktraits::state_t(prefix_op.running_prefix.y);
                    }
                }
                #pragma unroll
@ -310,7 +311,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
    }
 }

-template<int kNThreads, int kNItems, typename input_t, typename weight_t>
+template<int kNThreads, int kNItems, typename input_t, typename weight_t, typename state_t>
 void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
    // Only kNRows == 1 is tested for now, which ofc doesn't differ from previously when we had each block
    // processing 1 row.
@ -321,7 +322,7 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
    BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] {
        BOOL_SWITCH(params.z_ptr != nullptr , kHasZ, [&] {
            BOOL_SWITCH(params.query_start_loc_ptr != nullptr , kVarlen, [&] {
-                using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ,  kVarlen, input_t, weight_t>;
+                using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ,  kVarlen, input_t, weight_t, state_t>;
                constexpr int kSmemSize = Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(typename Ktraits::scan_t);
                dim3 grid(params.batch, params.dim / kNRows);
                auto kernel = &selective_scan_fwd_kernel<Ktraits>;
@ -341,59 +342,78 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
    });
 }

-template<typename input_t, typename weight_t>
+template<typename input_t, typename weight_t, typename state_t>
 void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream) {

    #ifndef USE_ROCM
        if (params.seqlen <= 128) {           
-            selective_scan_fwd_launch<32, 4, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<32, 4, input_t, weight_t, state_t>(params, stream);
        } else if (params.seqlen <= 256) {
-            selective_scan_fwd_launch<32, 8, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<32, 8, input_t, weight_t, state_t>(params, stream);
        } else if (params.seqlen <= 512) {
-            selective_scan_fwd_launch<32, 16, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<32, 16, input_t, weight_t, state_t>(params, stream);
        } else if (params.seqlen <= 1024) {
-            selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
        } else {
-            selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream);
        }
    #else
        if (params.seqlen <= 256) {
-            selective_scan_fwd_launch<64, 4, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<64, 4, input_t, weight_t, state_t>(params, stream);
        } else if (params.seqlen <= 512) {
-            selective_scan_fwd_launch<64, 8, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<64, 8, input_t, weight_t, state_t>(params, stream);
        } else if (params.seqlen <= 1024) {
-            selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
        } else {
-            selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream);
        }
    #endif
 }

-template void selective_scan_fwd_cuda<at::BFloat16, float>(SSMParamsBase &params, cudaStream_t stream);
-template void selective_scan_fwd_cuda<at::Half, float>(SSMParamsBase &params, cudaStream_t stream);
-template void selective_scan_fwd_cuda<float, float>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<at::BFloat16, float, at::BFloat16>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<at::BFloat16, float, float>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<at::Half, float, at::Half>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<at::Half, float, float>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<float, float, float>(SSMParamsBase &params, cudaStream_t stream);

 #define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")

-#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, NAME, ...)              \
+#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, STYPE, NAME, ...)       \
    if (ITYPE == at::ScalarType::Half) {                                            \
        using input_t = at::Half;                                                   \
        using weight_t = float;                                                     \
-        __VA_ARGS__();                                                              \
+        if (STYPE == at::ScalarType::Half) {                                        \
+            using state_t = at::Half;                                               \
+            __VA_ARGS__();                                                          \
+        } else if (STYPE == at::ScalarType::Float) {                                \
+            using state_t = float;                                                  \
+            __VA_ARGS__();                                                          \
+        } else {                                                                    \
+            AT_ERROR(#NAME, " not implemented for state type '", toString(STYPE), "'"); \
+        }                                                                           \
    } else if (ITYPE == at::ScalarType::BFloat16) {                                 \
        using input_t = at::BFloat16;                                               \
        using weight_t = float;                                                     \
-        __VA_ARGS__();                                                              \
+        if (STYPE == at::ScalarType::BFloat16) {                                    \
+            using state_t = at::BFloat16;                                           \
+            __VA_ARGS__();                                                          \
+        } else if (STYPE == at::ScalarType::Float) {                                \
+            using state_t = float;                                                  \
+            __VA_ARGS__();                                                          \
+        } else {                                                                    \
+            AT_ERROR(#NAME, " not implemented for state type '", toString(STYPE), "'"); \
+        }                                                                           \
    } else if (ITYPE == at::ScalarType::Float)  {                                   \
        using input_t = float;                                                      \
        using weight_t = float;                                                     \
+        using state_t = float;                                                      \
        __VA_ARGS__();                                                              \
    } else {                                                                        \
        AT_ERROR(#NAME, " not implemented for input type '", toString(ITYPE), "'"); \
    }


-template<typename input_t, typename weight_t>
+template<typename input_t, typename weight_t, typename state_t>
 void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream);

 void set_ssm_params_fwd(SSMParamsBase &params,
@ -648,7 +668,9 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,

    // Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout
    at::Tensor out = delta;
-    TORCH_CHECK(ssm_states.scalar_type() == input_type);
+    // ssm_states can now be either the same as input_type or float32
+    auto state_type = ssm_states.scalar_type();
+    TORCH_CHECK(state_type == input_type || state_type == at::ScalarType::Float);
    TORCH_CHECK(ssm_states.is_cuda());
    TORCH_CHECK(ssm_states.stride(-1) == 1);

@ -670,7 +692,7 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
    
    const at::cuda::OptionalCUDAGuard device_guard(device_of(u));
    auto stream = at::cuda::getCurrentCUDAStream().stream();
-    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] {
-        selective_scan_fwd_cuda<input_t, weight_t>(params, stream);
+    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), ssm_states.scalar_type(), "selective_scan_fwd", [&] {
+        selective_scan_fwd_cuda<input_t, weight_t, state_t>(params, stream);
    });
 }
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -130,8 +130,8 @@ void silu_and_mul(torch::Tensor& out, torch::Tensor& input);
 void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input,
                        torch::Tensor& scale);

-#ifndef USE_ROCM
-
+#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
+    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
 void silu_and_mul_nvfp4_quant(torch::Tensor& out,
                              torch::Tensor& output_block_scale,
                              torch::Tensor& input,
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -115,7 +115,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()");
  ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant);

-#ifndef USE_ROCM
+#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
+    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
  ops.def(
      "silu_and_mul_nvfp4_quant(Tensor! result, Tensor! result_block_scale, "
      "Tensor input, Tensor input_global_scale) -> ()");
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -82,6 +82,7 @@ ARG GET_PIP_URL
 # Install Python and other dependencies
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+    && apt-get clean -y \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl sudo \
    && if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
@ -432,11 +433,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 # Install DeepGEMM from source
-ARG DEEPGEMM_GIT_REF="7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c"
+ARG DEEPGEMM_GIT_REF
 COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
 RUN --mount=type=cache,target=/root/.cache/uv \
-    VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "${DEEPGEMM_GIT_REF}" \
-    && rm /tmp/install_deepgemm.sh
+    VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} 

 # Install EP kernels(pplx-kernels and DeepEP), NixL
 COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh
--- a/docs/configuration/optimization.md
+++ b/docs/configuration/optimization.md
@ -174,8 +174,10 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u

 Known supported models:

+- GLM-4.5V GLM-4.1V (<gh-pr:23168>)
+- Kimi-VL (<gh-pr:23817>)
 - Llama4 (<gh-pr:18368>)
- MiniCPM-V-4 (<gh-pr:23327>)
+- MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
 - Qwen2.5-VL (<gh-pr:22742>)
 - Step3 (<gh-pr:22697>)

--- a/docs/contributing/model/basic.md
+++ b/docs/contributing/model/basic.md
@ -121,3 +121,31 @@ To support a model with interleaving sliding windows, we need to take care of th
 - In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171).

 With these two steps, interleave sliding windows should work with the model.
+
+### How to support models that use Mamba?
+
+We consider 3 different scenarios:
+
+1. Models that use Mamba layers (either Mamba-1 or Mamba-2) but do not use attention layers.
+2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers.
+3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers.
+
+For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](gh-file:vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](gh-file:vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
+The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
+For the mamba layers themselves, please use the [`MambaMixer`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
+Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
+V0-only classes and code will be removed in the very near future.
+The model should also be added to the `MODELS_CONFIG_MAP` dictionary in <gh-file:vllm/model_executor/models/config.py> to ensure that the runtime defaults are optimized.
+
+For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](gh-file:vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](gh-file:vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
+These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol).
+
+For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](gh-file:vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](gh-file:vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
+Please follow the same guidelines as case (2) for implementing these models.
+We use "mamba-like" to refer to layers that posses a state that is updated in-place, rather than being appended-to (like KV cache for attention).
+For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
+It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
+Please see [`LinearAttentionMetadata`](gh-file:vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](gh-file:v1/attention/backends/short_conv_attn.py) for examples of this.
+Finally, if one wants to support torch compile and CUDA graphs, it necessary to wrap the call to the mamba-like layer inside a custom op and register it.
+Please see the calls to `direct_register_custom_op` in <gh-file:vllm/model_executor/models/minimax_text_01.py> or <gh-file:vllm/model_executor/layers/mamba/short_conv.py> for examples of this.
+The new custom op should then be added to the list `_attention_ops` in <gh-file:vllm/config/compilation.py> to ensure that piecewise CUDA graphs works as intended.
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@ -73,6 +73,8 @@ apt install nsight-systems-cli

 ### Example commands and usage

+When profiling with `nsys`, it is advisable to set the environment variable `VLLM_WORKER_MULTIPROC_METHOD=spawn`. The default is to use the `fork` method instead of `spawn`. More information on the topic can be found in the [Nsight Systems release notes](https://docs.nvidia.com/nsight-systems/ReleaseNotes/index.html#general-issues).
+
 #### Offline Inference

 For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` before any existing script you would run for offline inference.
--- a/docs/deployment/frameworks/lws.md
+++ b/docs/deployment/frameworks/lws.md
@ -22,7 +22,7 @@ Deploy the following yaml file `lws.yaml`
    metadata:
      name: vllm
    spec:
-      replicas: 2
+      replicas: 1
      leaderWorkerTemplate:
        size: 2
        restartPolicy: RecreateGroupOnPodRestart
@ -41,7 +41,7 @@ Deploy the following yaml file `lws.yaml`
                  - sh
                  - -c
                  - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); 
-                    python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
+                    vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2"
                resources:
                  limits:
                    nvidia.com/gpu: "8"
@ -126,8 +126,6 @@ Should get an output similar to this:
 NAME       READY   STATUS    RESTARTS   AGE
 vllm-0     1/1     Running   0          2s
 vllm-0-1   1/1     Running   0          2s
-vllm-1     1/1     Running   0          2s
-vllm-1-1   1/1     Running   0          2s
 ```

 Verify that the distributed tensor-parallel inference works:
--- a/docs/design/io_processor_plugins.md
+++ b/docs/design/io_processor_plugins.md
@ -0,0 +1,78 @@
+# IO Processor Plugins
+
+IO Processor plugins are a feature that allows pre and post processing of the model input and output for pooling models. The idea is that users are allowed to pass a custom input to vLLM that is converted into one or more model prompts and fed to the model `encode` method. One potential use-case of such plugins is that of using vLLM for generating multi-modal data. Say users feed an image to vLLM and get an image in output.
+
+When performing an inference with IO Processor plugins, the prompt type is defined by the plugin and the same is valid for the final request output. vLLM does not perform any validation of input/output data, and it is up to the plugin to ensure the correct data is being fed to the model and returned to the user. As of now these plugins support only pooling models and can be triggerd via the `encode` method in `LLM` and `AsyncLLM`, or in online serving mode via the `/pooling` endpoint.
+
+## Writing an IO Processor Plugin
+
+IO Processor plugins implement the `IOProcessor` interface (<gh-file:vllm/plugins/io_processors/interface.py>):
+
+```python
+IOProcessorInput = TypeVar('IOProcessorInput')
+IOProcessorOutput = TypeVar('IOProcessorOutput')
+
+class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
+
+    def __init__(self, vllm_config: VllmConfig):
+        self.vllm_config = vllm_config
+
+    @abstractmethod
+    def pre_process(
+        self,
+        prompt: IOProcessorInput,
+        request_id: Optional[str] = None,
+        **kwargs,
+    ) -> Union[PromptType, Sequence[PromptType]]:
+        raise NotImplementedError
+
+    async def pre_process_async(
+        self,
+        prompt: IOProcessorInput,
+        request_id: Optional[str] = None,
+        **kwargs,
+    ) -> Union[PromptType, Sequence[PromptType]]:
+        return self.pre_process(prompt, request_id, **kwargs)
+
+    @abstractmethod
+    def post_process(self,
+                     model_output: Sequence[PoolingRequestOutput],
+                     request_id: Optional[str] = None,
+                     **kwargs) -> IOProcessorOutput:
+        raise NotImplementedError
+
+    async def post_process_async(
+        self,
+        model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
+        request_id: Optional[str] = None,
+        **kwargs,
+    ) -> IOProcessorOutput:
+        collected_output = [item async for i, item in model_output]
+        return self.post_process(collected_output, request_id, **kwargs)
+
+    @abstractmethod
+    def parse_request(self, request: Any) -> IOProcessorInput:
+        raise NotImplementedError
+
+    @abstractmethod
+    def output_to_response(
+            self, plugin_output: IOProcessorOutput) -> IOProcessorResponse:
+        raise NotImplementedError
+```
+
+The `parse_request` method is used for validating the user prompt and converting it into the input expected by the `pre_process`/`pre_process_async` methods.
+The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference.
+The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output.
+
+The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/io_processor_pooling` serving endpoint is available here <gh-file:vllm/entrypoints/openai/serving_pooling_with_io_plugin.py>.
+
+An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please, also refer to our online (<gh-file:examples/online_serving/prithvi_geospatial_mae.py>) and offline (<gh-file:examples/offline_inference/prithvi_geospatial_mae_io_processor.py>) inference examples.
+
+## Using an IO Processor plugin
+
+IO Processor plugins are loaded at engine startup and there are two methods for specifying the name of the plugin to be loaded:
+
+1. Via vLLM's `EngineArgs`: setting the `io_processor_plugin` argument in the `EngineArgs` used to initialize the `AsyncLLM`. The same can be achieved by passing the `io_processor_plugin` argument to `LLM` in offline mode, or by passing the `--io-processor-plugin` argument in serving mode.
+2. Via the model HF configuration: adding an `io_processor_plugin` field to the model config (config.json).
+
+The order also determines method priority. i.e., setting the plugin name via `EngineArgs` will override any plugin name specified in the model HF config (config.json).
--- a/docs/design/plugin_system.md
+++ b/docs/design/plugin_system.md
@ -49,6 +49,8 @@ Every plugin has three parts:

 - **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported.

+- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for poling models. The plugin function returns the IOProcessor's class fully qualified name.
+
 ## Guidelines for Writing Plugins

 - **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes.
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@ -13,6 +13,41 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:
 - `prompt`: The prompt should follow the format that is documented on HuggingFace.
 - `multi_modal_data`: This is a dictionary that follows the schema defined in [vllm.multimodal.inputs.MultiModalDataDict][].

+### Stable UUIDs for Caching (multi_modal_uuids)
+
+When using multi-modal inputs, vLLM normally hashes each media item by content to enable caching across requests. You can optionally pass `multi_modal_uuids` to provide your own stable IDs for each item so caching can reuse work across requests without rehashing the raw content.
+
+??? code
+
+    ```python
+    from vllm import LLM
+    from PIL import Image
+
+    # Qwen2.5-VL example with two images
+    llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct")
+
+    prompt = "USER: <image><image>\nDescribe the differences.\nASSISTANT:"
+    img_a = Image.open("/path/to/a.jpg")
+    img_b = Image.open("/path/to/b.jpg")
+
+    outputs = llm.generate({
+        "prompt": prompt,
+        "multi_modal_data": {"image": [img_a, img_b]},
+        # Provide stable IDs for caching.
+        # Requirements (matched by this example):
+        #  - Include every modality present in multi_modal_data.
+        #  - For lists, provide the same number of entries.
+        #  - Use None to fall back to content hashing for that item.
+        "multi_modal_uuids": {"image": ["sku-1234-a", None]},
+    })
+
+    for o in outputs:
+        print(o.outputs[0].text)
+    ```
+
+!!! warning
+    If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored.
+
 ### Image Inputs

 You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@ -96,6 +96,7 @@ Currently, there are no pre-built CPU wheels.
 - `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. Default value is `0`.
 - `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads, can be set as CPU id lists or `auto` (by default). For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node respectively.
 - `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. Default value is `None`. If the value is not set and use `auto` thread binding, no CPU will be reserved for `world_size == 1`, 1 CPU per rank will be reserved for `world_size > 1`.
+- `CPU_VISIBLE_MEMORY_NODES`: specify visible NUMA memory nodes for vLLM CPU workers, similar to ```CUDA_VISIBLE_DEVICES```. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. The variable provides more control for the auto thread-binding feature, such as masking nodes and changing nodes binding sequence.
 - `VLLM_CPU_MOE_PREPACK` (x86 only): whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).
 - `VLLM_CPU_SGL_KERNEL` (x86 only, Experimental): whether to use small-batch optimized kernels for linear layer and MoE layer, especially for low-latency requirements like online serving. The kernels require AMX instruction set, BFloat16 weight type and weight shapes divisible by 32. Default is `0` (False).

@ -179,7 +180,7 @@ Inference batch size is an important parameter for the performance. Larger batch
    - Offline Inference: `256 * world_size`
    - Online Serving: `128 * world_size`

-vLLM CPU supports tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use TP and PP together if there are enough CPU sockets and memory nodes.
+vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use DP, TP and PP together if there are enough CPU sockets and memory nodes.

 ### Which quantization configs does vLLM CPU support?

@ -193,3 +194,35 @@ vLLM CPU supports tensor parallel (TP) and pipeline parallel (PP) to leverage mu
 - Both of them require `amx` CPU flag.
    - `VLLM_CPU_MOE_PREPACK` can provides better performance for MoE models
    - `VLLM_CPU_SGL_KERNEL` can provides better performance for MoE models and small-batch scenarios.
+
+### Why do I see `get_mempolicy: Operation not permitted` when running in Docker?
+
+In some container environments (like Docker), NUMA-related syscalls used by vLLM (e.g., `get_mempolicy`, `migrate_pages`) are blocked/denied in the runtime's default seccomp/capabilities settings. This may lead to warnings like `get_mempolicy: Operation not permitted`. Functionality is not affected, but NUMA memory binding/migration optimizations may not take effect and performance can be suboptimal.
+
+To enable these optimizations inside Docker with the least privilege, you can follow below tips:
+
+```bash
+docker run ... --cap-add SYS_NICE --security-opt seccomp=unconfined  ...
+
+# 1) `--cap-add SYS_NICE` is to address `get_mempolicy` EPERM issue.
+
+# 2) `--security-opt seccomp=unconfined` is to enable `migrate_pages` for `numa_migrate_pages()`.
+# Actually, `seccomp=unconfined` bypasses the seccomp for container,
+# if it's unacceptable, you can customize your own seccomp profile,
+# based on docker/runtime default.json and add `migrate_pages` to `SCMP_ACT_ALLOW` list.
+
+# reference : https://docs.docker.com/engine/security/seccomp/
+```
+
+Alternatively, running with `--privileged=true` also works but is broader and not generally recommended.
+
+In K8S, the following configuration can be added to workload yaml to achieve the same effect as above:
+
+```yaml
+securityContext:
+  seccompProfile:
+    type: Unconfined
+  capabilities:
+    add:
+    - SYS_NICE
+```
--- a/docs/getting_started/installation/cpu/arm.inc.md
+++ b/docs/getting_started/installation/cpu/arm.inc.md
@ -48,6 +48,10 @@ docker run --rm \
            --dtype=bfloat16 \
            other vLLM OpenAI server arguments
 ```
+
+!!! tip
+    An alternative of `--privileged=true` is `--cap-add SYS_NICE --security-opt seccomp=unconfined`.
+
 # --8<-- [end:build-image-from-source]
 # --8<-- [start:extra-information]
 # --8<-- [end:extra-information]
--- a/docs/getting_started/installation/cpu/build.inc.md
+++ b/docs/getting_started/installation/cpu/build.inc.md
@ -16,8 +16,8 @@ cd vllm_source
 Third, install required dependencies:

 ```bash
-uv pip install -r requirements/cpu-build.txt --torch-backend auto
-uv pip install -r requirements/cpu.txt --torch-backend auto
+uv pip install -r requirements/cpu-build.txt --torch-backend cpu
+uv pip install -r requirements/cpu.txt --torch-backend cpu
 ```

 ??? console "pip"
--- a/docs/getting_started/installation/cpu/s390x.inc.md
+++ b/docs/getting_started/installation/cpu/s390x.inc.md
@ -89,6 +89,9 @@ docker run --rm \
    other vLLM OpenAI server arguments
 ```

+!!! tip
+    An alternative of `--privileged true` is `--cap-add SYS_NICE --security-opt seccomp=unconfined`.
+
 # --8<-- [end:build-image-from-source]
 # --8<-- [start:extra-information]
 # --8<-- [end:extra-information]
--- a/docs/getting_started/installation/cpu/x86.inc.md
+++ b/docs/getting_started/installation/cpu/x86.inc.md
@ -43,7 +43,8 @@ docker build -f docker/Dockerfile.cpu \

 # Launching OpenAI server
 docker run --rm \
-            --privileged=true \
+            --security-opt seccomp=unconfined \
+            --cap-add SYS_NICE \
            --shm-size=4g \
            -p 8000:8000 \
            -e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@ -335,9 +335,9 @@ th {
 | `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ |
 | `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | | ✅︎ | ✅︎ |
-| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | | ✅︎ | ✅︎ |
-| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3`, etc. | | ✅︎ | ✅︎ |
+| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ |
 | `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ |
@ -634,7 +634,8 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
+| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ | ✅︎ |
+| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ | ✅︎ |
 | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | ✅︎ |
 | `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
 | `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + I<sup>E+</sup> | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ | ✅︎ |
--- a/docs/usage/v1_guide.md
+++ b/docs/usage/v1_guide.md
@ -107,16 +107,14 @@ to enable simultaneous generation and embedding using the same engine instance i
 #### Mamba Models

 Models using selective state-space mechanisms instead of standard transformer attention are supported.
-Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported.
-Please note that prefix caching is not yet supported for these models.
+Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`,`FalconMambaForCausalLM`) are supported.

-Models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
+Hybrid models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
 `Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`).
-Please note that prefix caching is not yet supported for these models.

-Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`).
-Please note that prefix caching is not yet supported for these models.
-It is also necessary to enforce eager mode for these models in V1.
+Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`, `Lfm2ForCausalLM`).
+
+Please note that prefix caching is not yet supported for any of the above models.

 #### Encoder-Decoder Models

--- a/examples/offline_inference/multilora_inference.py
+++ b/examples/offline_inference/multilora_inference.py
@ -23,7 +23,7 @@ def create_test_prompts(
    2 requests for base model, 4 requests for the LoRA. We define 2
    different LoRA adapters (using the same model for demo purposes).
    Since we also set `max_loras=1`, the expectation is that the requests
-    with the second LoRA adapter will be ran after all requests with the
+    with the second LoRA adapter will be run after all requests with the
    first adapter have finished.
    """
    return [
--- a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py
+++ b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py
@ -0,0 +1,60 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import base64
+import os
+
+import torch
+
+from vllm import LLM
+from vllm.pooling_params import PoolingParams
+
+# This example shows how to perform an offline inference that generates
+# multimodal data. In this specific case this example will take a geotiff
+# image as input, process it using the multimodal data processor, and
+# perform inference.
+# Reuirement - install plugin at:
+#   https://github.com/christian-pinto/prithvi_io_processor_plugin
+
+
+def main():
+    torch.set_default_dtype(torch.float16)
+    image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif"  # noqa: E501
+
+    img_prompt = dict(
+        data=image_url,
+        data_format="url",
+        image_format="tiff",
+        out_data_format="b64_json",
+    )
+
+    llm = LLM(
+        model="christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
+        skip_tokenizer_init=True,
+        trust_remote_code=True,
+        enforce_eager=True,
+        # Limit the maximum number of parallel requests
+        # to avoid the model going OOM.
+        # The maximum number depends on the available GPU memory
+        max_num_seqs=32,
+        io_processor_plugin="prithvi_to_tiff_india",
+    )
+
+    pooling_params = PoolingParams(task="encode", softmax=False)
+    pooler_output = llm.encode(
+        img_prompt,
+        pooling_params=pooling_params,
+    )
+    output = pooler_output[0].outputs
+
+    print(output)
+    decoded_data = base64.b64decode(output.data)
+
+    file_path = os.path.join(os.getcwd(), "offline_prediction.tiff")
+    with open(file_path, "wb") as f:
+        f.write(decoded_data)
+
+    print(f"Output file path: {file_path}")
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@ -683,6 +683,37 @@ def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
    )


+# Keye-VL-1.5
+def run_keye_vl1_5(questions: list[str], modality: str) -> ModelRequestData:
+    model_name = "Kwai-Keye/Keye-VL-1.5-8B"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        trust_remote_code=True,
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    if modality == "image":
+        placeholder = "<|image_pad|>"
+    elif modality == "video":
+        placeholder = "<|video_pad|>"
+
+    prompts = [
+        (
+            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"{question}<|im_end|>\n"
+            "<|im_start|>assistant\n"
+        )
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # Kimi-VL
 def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@ -1648,6 +1679,7 @@ model_example_map = {
    "interns1": run_interns1,
    "internvl_chat": run_internvl,
    "keye_vl": run_keye_vl,
+    "keye_vl1_5": run_keye_vl1_5,
    "kimi_vl": run_kimi_vl,
    "llama4": run_llama4,
    "llava": run_llava,
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@ -542,6 +542,43 @@ def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    )


+def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "Kwai-Keye/Keye-VL-1_5-8B"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        },
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    image_data = [fetch_image(url) for url in image_urls]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=image_data,
+    )
+
+
 def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "moonshotai/Kimi-VL-A3B-Instruct"

@ -1209,6 +1246,7 @@ model_example_map = {
    "interns1": load_interns1,
    "internvl_chat": load_internvl,
    "keye_vl": load_keye_vl,
+    "keye_vl1_5": load_keye_vl1_5,
    "kimi_vl": load_kimi_vl,
    "llama4": load_llama4,
    "llava": load_llava,
--- a/examples/online_serving/kv_events_subscriber.py
+++ b/examples/online_serving/kv_events_subscriber.py
@ -27,10 +27,12 @@ class BlockStored(KVCacheEvent):
    token_ids: list[int]
    block_size: int
    lora_id: Optional[int]
+    medium: Optional[str]


 class BlockRemoved(KVCacheEvent):
    block_hashes: list[int]
+    medium: Optional[str]


 class AllBlocksCleared(KVCacheEvent):
--- a/examples/online_serving/multi-node-serving.sh
+++ b/examples/online_serving/multi-node-serving.sh
@ -11,7 +11,7 @@
 # Example usage:
 # On the head node machine, start the Ray head node process and run a vLLM server.
 #   ./multi-node-serving.sh leader --ray_port=6379 --ray_cluster_size=<SIZE> [<extra ray args>]  && \
-#   python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2
+#   vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2
 # 
 # On each worker node, start the Ray worker node process.
 #   ./multi-node-serving.sh worker --ray_address=<HEAD_NODE_IP> --ray_port=6379 [<extra ray args>]
--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@ -266,10 +266,52 @@ def run_audio(model: str) -> None:
    print("Chat completion output from base64 encoded audio:", result)


+def run_multi_audio(model: str) -> None:
+    from vllm.assets.audio import AudioAsset
+
+    # Two different audios to showcase batched inference.
+    audio_url = AudioAsset("winning_call").url
+    audio_base64 = encode_base64_content_from_url(audio_url)
+    audio_url2 = AudioAsset("azacinto_foscolo").url
+    audio_base64_2 = encode_base64_content_from_url(audio_url2)
+
+    # OpenAI-compatible schema (`input_audio`)
+    chat_completion_from_base64 = client.chat.completions.create(
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Are these two audios the same?"},
+                    {
+                        "type": "input_audio",
+                        "input_audio": {
+                            "data": audio_base64,
+                            "format": "wav",
+                        },
+                    },
+                    {
+                        "type": "input_audio",
+                        "input_audio": {
+                            "data": audio_base64_2,
+                            "format": "wav",
+                        },
+                    },
+                ],
+            }
+        ],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_base64.choices[0].message.content
+    print("Chat completion output from input audio:", result)
+
+
 example_function_map = {
    "text-only": run_text_only,
    "single-image": run_single_image,
    "multi-image": run_multi_image,
+    "multi-audio": run_multi_audio,
    "video": run_video,
    "audio": run_audio,
 }
--- a/examples/online_serving/prithvi_geospatial_mae.py
+++ b/examples/online_serving/prithvi_geospatial_mae.py
@ -0,0 +1,55 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import base64
+import os
+
+import requests
+
+# This example shows how to perform an online inference that generates
+# multimodal data. In this specific case this example will take a geotiff
+# image as input, process it using the multimodal data processor, and
+# perform inference.
+# Reuirements :
+# - install plugin at:
+#   https://github.com/christian-pinto/prithvi_io_processor_plugin
+# - start vllm in serving mode with the below args
+#   --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
+#   --task embed --trust-remote-code
+#   --skip-tokenizer-init --enforce-eager
+#   --io-processor-plugin prithvi_to_tiff_india
+
+
+def main():
+    image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif"  # noqa: E501
+    server_endpoint = "http://localhost:8000/pooling"
+
+    request_payload_url = {
+        "data": {
+            "data": image_url,
+            "data_format": "url",
+            "image_format": "tiff",
+            "out_data_format": "b64_json",
+        },
+        "priority": 0,
+        "model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
+        "softmax": False,
+    }
+
+    ret = requests.post(server_endpoint, json=request_payload_url)
+
+    print(f"response.status_code: {ret.status_code}")
+    print(f"response.reason:{ret.reason}")
+
+    response = ret.json()
+
+    decoded_image = base64.b64decode(response["data"]["data"])
+
+    out_path = os.path.join(os.getcwd(), "online_prediction.tiff")
+
+    with open(out_path, "wb") as f:
+        f.write(decoded_image)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/prometheus_grafana/grafana.json
+++ b/examples/online_serving/prometheus_grafana/grafana.json
@ -402,7 +402,7 @@
          },
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
          "fullMetaSearch": false,
          "includeNullMetadata": false,
          "instant": false,
@ -418,7 +418,7 @@
          },
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
          "fullMetaSearch": false,
          "hide": false,
          "includeNullMetadata": false,
@ -435,7 +435,7 @@
          },
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
          "fullMetaSearch": false,
          "hide": false,
          "includeNullMetadata": false,
@ -452,7 +452,7 @@
          },
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
          "fullMetaSearch": false,
          "hide": false,
          "includeNullMetadata": false,
@ -468,7 +468,7 @@
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
-          "expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
+          "expr": "rate(vllm:inter_token_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:inter_token_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
          "hide": false,
          "instant": false,
          "legendFormat": "Mean",
@ -476,7 +476,7 @@
          "refId": "E"
        }
      ],
-      "title": "Time Per Output Token Latency",
+      "title": "Inter Token Latency",
      "type": "timeseries"
    },
    {
--- a/pyproject.toml
+++ b/pyproject.toml
@ -6,7 +6,7 @@ requires = [
    "packaging>=24.2",
    "setuptools>=77.0.3,<80.0.0",
    "setuptools-scm>=8.0",
-    "torch == 2.7.1",
+    "torch == 2.8.0",
    "wheel",
    "jinja2",
 ]
--- a/requirements/build.txt
+++ b/requirements/build.txt
@ -4,7 +4,8 @@ ninja
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8
-torch==2.7.1
+torch==2.8.0
 wheel
 jinja2>=3.1.6
 regex
+build
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@ -9,17 +9,16 @@ packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
-torch==2.7.0; platform_system == "Darwin"
-torch==2.7.0; platform_machine == "ppc64le"
-torch==2.6.0; platform_machine == "aarch64" # for arm64 CPUs, torch 2.7.0 has a issue: https://github.com/vllm-project/vllm/issues/17960
+torch==2.8.0; platform_system == "Darwin"
+torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"

 # required for the image processor of minicpm-o-2_6, this must be updated alongside torch
 torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
-torchaudio==2.7.0; platform_machine == "ppc64le"
+torchaudio==2.8.0; platform_machine == "ppc64le"

 # required for the image processor of phi3v, this must be updated alongside torch
 torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
-torchvision==0.22.0; platform_machine == "ppc64le"
+torchvision==0.23.0; platform_machine == "ppc64le"
 datasets # for benchmark scripts

 # Intel Extension for PyTorch, only for x86_64 CPUs
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@ -6,9 +6,9 @@ numba == 0.61.2; python_version > '3.9'

 # Dependencies for NVIDIA GPUs
 ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
-torch==2.7.1
-torchaudio==2.7.1
+torch==2.8.0
+torchaudio==2.8.0
 # These must be updated alongside torch
-torchvision==0.22.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-# https://github.com/facebookresearch/xformers/releases/tag/v0.0.31
-xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch >= 2.7
+torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+# https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1
+xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch >= 2.8
--- a/requirements/rocm-build.txt
+++ b/requirements/rocm-build.txt
@ -1,10 +1,10 @@
 # Common dependencies
 -r common.txt

--extra-index-url https://download.pytorch.org/whl/rocm6.2.4
-torch==2.7.0
-torchvision==0.22.0
-torchaudio==2.7.0
+--extra-index-url https://download.pytorch.org/whl/rocm6.3
+torch==2.8.0
+torchvision==0.23.0
+torchaudio==2.8.0

 triton==3.3.0
 cmake>=3.26.1,<4
--- a/requirements/test.in
+++ b/requirements/test.in
@ -22,9 +22,9 @@ sentence-transformers # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests
 timm >=1.0.17 # required for internvl and gemma3n-mm test
-torch==2.7.1
-torchaudio==2.7.1
-torchvision==0.22.1
+torch==2.8.0
+torchaudio==2.8.0
+torchvision==0.23.0
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
 mistral_common[image,audio] >= 1.8.2 # required for voxtral test
--- a/requirements/test.txt
+++ b/requirements/test.txt
@ -541,42 +541,42 @@ numpy==1.26.4
    #   tritonclient
    #   vocos
    #   xarray
-nvidia-cublas-cu12==12.8.3.14
+nvidia-cublas-cu12==12.8.4.1
    # via
    #   nvidia-cudnn-cu12
    #   nvidia-cusolver-cu12
    #   torch
-nvidia-cuda-cupti-cu12==12.8.57
+nvidia-cuda-cupti-cu12==12.8.90
    # via torch
-nvidia-cuda-nvrtc-cu12==12.8.61
+nvidia-cuda-nvrtc-cu12==12.8.93
    # via torch
-nvidia-cuda-runtime-cu12==12.8.57
+nvidia-cuda-runtime-cu12==12.8.90
    # via torch
-nvidia-cudnn-cu12==9.7.1.26
+nvidia-cudnn-cu12==9.10.2.21
    # via torch
-nvidia-cufft-cu12==11.3.3.41
+nvidia-cufft-cu12==11.3.3.83
    # via torch
-nvidia-cufile-cu12==1.13.0.11
+nvidia-cufile-cu12==1.13.1.3
    # via torch
-nvidia-curand-cu12==10.3.9.55
+nvidia-curand-cu12==10.3.9.90
    # via torch
-nvidia-cusolver-cu12==11.7.2.55
+nvidia-cusolver-cu12==11.7.3.90
    # via torch
-nvidia-cusparse-cu12==12.5.7.53
+nvidia-cusparse-cu12==12.5.8.93
    # via
    #   nvidia-cusolver-cu12
    #   torch
-nvidia-cusparselt-cu12==0.6.3
+nvidia-cusparselt-cu12==0.7.1
    # via torch
-nvidia-nccl-cu12==2.26.2
+nvidia-nccl-cu12==2.27.3
    # via torch
-nvidia-nvjitlink-cu12==12.8.61
+nvidia-nvjitlink-cu12==12.8.93
    # via
    #   nvidia-cufft-cu12
    #   nvidia-cusolver-cu12
    #   nvidia-cusparse-cu12
    #   torch
-nvidia-nvtx-cu12==12.8.55
+nvidia-nvtx-cu12==12.8.90
    # via torch
 omegaconf==2.3.0
    # via
@ -1069,7 +1069,7 @@ tomli==2.2.1
    # via schemathesis
 tomli-w==1.2.0
    # via schemathesis
-torch==2.7.1+cu128
+torch==2.8.0+cu128
    # via
    #   -r requirements/test.in
    #   accelerate
@ -1098,7 +1098,7 @@ torch==2.7.1+cu128
    #   torchvision
    #   vector-quantize-pytorch
    #   vocos
-torchaudio==2.7.1+cu128
+torchaudio==2.8.0+cu128
    # via
    #   -r requirements/test.in
    #   encodec
@ -1111,7 +1111,7 @@ torchmetrics==1.7.4
    #   pytorch-lightning
    #   terratorch
    #   torchgeo
-torchvision==0.22.1+cu128
+torchvision==0.23.0+cu128
    # via
    #   -r requirements/test.in
    #   lightly
@ -1152,7 +1152,7 @@ transformers==4.55.2
    #   transformers-stream-generator
 transformers-stream-generator==0.0.5
    # via -r requirements/test.in
-triton==3.3.1
+triton==3.4.0
    # via torch
 tritonclient==2.51.0
    # via
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@ -98,7 +98,7 @@ def test_api_server(api_server, distributed_executor_backend: str):
        pool.join()

        # check cancellation stats
-        # give it some times to update the stats
+        # give it some time to update the stats
        time.sleep(1)

        num_aborted_requests = requests.get(
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -1120,6 +1120,9 @@ class VllmRunner:

        return self.llm.llm_engine.collective_rpc(_apply_model)

+    def get_llm(self) -> LLM:
+        return self.llm
+
    def __enter__(self):
        return self

--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@ -439,10 +439,10 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
@pytest.mark.parametrize("seed", [1])
 def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator,
                                                  test_llm_generator):
-    """Verify block manager v2 with auto prefix caching could works normal
+    """Verify block manager v2 with auto prefix caching could work normally
    even when eviction started.
    With APC enabled, all blocks are held by native block at the beginning.
-    Then blocks are managed by evictor instead. If cache hit at the evitor's
+    Then blocks are managed by evictor instead. If cache hit at the evictor's
    block, then it could be reused, or we need to recompute its kv cache.
    """
    output_len = 10
--- a/tests/distributed/test_sequence_parallel.py
+++ b/tests/distributed/test_sequence_parallel.py
@ -292,7 +292,7 @@ SP_TEST_MODELS = [
    # TODO support other models
    # [LANGUAGE GENERATION]
    "meta-llama/Llama-3.2-1B-Instruct",
-    "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
+    "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
 ]


--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@ -167,7 +167,7 @@ def test_get_kwargs():
    # dict should have json tip in help
    json_tip = "Should either be a valid JSON string or JSON keys"
    assert json_tip in kwargs["json_tip"]["help"]
-    # nested config should should construct the nested config
+    # nested config should construct the nested config
    assert kwargs["nested_config"]["type"]('{"field": 2}') == NestedConfig(2)


@ -337,10 +337,6 @@ def test_human_readable_model_len():
    args = parser.parse_args(["--max-model-len", "10.212345k"])
    assert args.max_model_len == 10212

-    # Auto via -1
-    args = parser.parse_args(["--max-model-len", "-1"])
-    assert args.max_model_len == -1
-
    # Invalid (do not allow decimals with binary multipliers)
    for invalid in ["1a", "pwd", "10.24", "1.23M"]:
        with pytest.raises(ArgumentError):
--- a/tests/entrypoints/conftest.py
+++ b/tests/entrypoints/conftest.py
@ -201,3 +201,32 @@ table: "table_1" | "table_2"
 condition: column "=" number
 number: "1" | "2"
 """)
+
+
+@pytest.fixture(scope="session")
+def zephyr_lora_files():
+    """Download zephyr LoRA files once per test session."""
+    from huggingface_hub import snapshot_download
+    return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")
+
+
+@pytest.fixture(scope="session")
+def zephyr_lora_added_tokens_files(zephyr_lora_files):
+    """Create zephyr LoRA files with added tokens once per test session."""
+    import shutil
+    from tempfile import TemporaryDirectory
+
+    from transformers import AutoTokenizer
+
+    tmp_dir = TemporaryDirectory()
+    tmp_model_dir = f"{tmp_dir.name}/zephyr"
+    shutil.copytree(zephyr_lora_files, tmp_model_dir)
+    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
+    # Copy tokenizer to adapter and add some unique tokens
+    # 32000, 32001, 32002
+    added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
+                                 special_tokens=True)
+    assert added == 3
+    tokenizer.save_pretrained(tmp_model_dir)
+    yield tmp_model_dir
+    tmp_dir.cleanup()
--- a/tests/entrypoints/llm/test_classify.py
+++ b/tests/entrypoints/llm/test_classify.py
@ -62,3 +62,9 @@ def test_encode_api(llm: LLM):
    err_msg = "pooling_task must be one of.+"
    with pytest.raises(ValueError, match=err_msg):
        llm.encode(prompts, use_tqdm=False)
+
+
+def test_score_api(llm: LLM):
+    err_msg = "Score API is only enabled for num_labels == 1."
+    with pytest.raises(ValueError, match=err_msg):
+        llm.score("ping", "pong", use_tqdm=False)
--- a/tests/entrypoints/openai/conftest.py
+++ b/tests/entrypoints/openai/conftest.py
@ -0,0 +1,27 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+from vllm.assets.audio import AudioAsset
+
+
+@pytest.fixture
+def mary_had_lamb():
+    path = AudioAsset('mary_had_lamb').get_local_path()
+    with open(str(path), "rb") as f:
+        yield f
+
+
+@pytest.fixture
+def winning_call():
+    path = AudioAsset('winning_call').get_local_path()
+    with open(str(path), "rb") as f:
+        yield f
+
+
+@pytest.fixture
+def foscolo():
+    # Test translation it->en
+    path = AudioAsset('azacinto_foscolo').get_local_path()
+    with open(str(path), "rb") as f:
+        yield f
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@ -15,8 +15,6 @@ import torch
 from openai import BadRequestError, OpenAI

 from ...utils import RemoteOpenAIServer
-from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
-from .test_completion import zephyr_lora_files  # noqa: F401

 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
--- a/tests/entrypoints/openai/test_classification.py
+++ b/tests/entrypoints/openai/test_classification.py
@ -226,3 +226,33 @@ def test_pooling(server: RemoteOpenAIServer, model_name: str):
        },
    )
    assert response.json()["error"]["type"] == "BadRequestError"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+def test_score(server: RemoteOpenAIServer, model_name: str):
+    # score api is only enabled for num_labels == 1.
+    response = requests.post(
+        server.url_for("score"),
+        json={
+            "model": model_name,
+            "text_1": "ping",
+            "text_2": "pong",
+        },
+    )
+    assert response.json()["error"]["type"] == "BadRequestError"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+def test_rerank(server: RemoteOpenAIServer, model_name: str):
+    # rerank api is only enabled for num_labels == 1.
+    response = requests.post(
+        server.url_for("rerank"),
+        json={
+            "model": model_name,
+            "query": "ping",
+            "documents": ["pong"],
+        },
+    )
+    assert response.json()["error"]["type"] == "BadRequestError"
--- a/tests/entrypoints/openai/test_cli_args.py
+++ b/tests/entrypoints/openai/test_cli_args.py
@ -27,6 +27,28 @@ def serve_parser():
    return make_arg_parser(parser)


+### Test config parsing
+def test_config_arg_parsing(serve_parser, cli_config_file):
+    args = serve_parser.parse_args([])
+    assert args.port == 8000
+    args = serve_parser.parse_args(['--config', cli_config_file])
+    assert args.port == 12312
+    args = serve_parser.parse_args([
+        '--config',
+        cli_config_file,
+        '--port',
+        '9000',
+    ])
+    assert args.port == 9000
+    args = serve_parser.parse_args([
+        '--port',
+        '9000',
+        '--config',
+        cli_config_file,
+    ])
+    assert args.port == 9000
+
+
 ### Tests for LoRA module parsing
 def test_valid_key_value_format(serve_parser):
    # Test old format: name=path
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@ -3,8 +3,6 @@
 # imports for guided decoding tests
 import json
 import os
-import shutil
-from tempfile import TemporaryDirectory
 from typing import Optional

 import jsonschema
@ -14,9 +12,7 @@ import pytest_asyncio
 import regex as re
 import requests
 # downloading lora to test lora requests
-from huggingface_hub import snapshot_download
 from openai import BadRequestError
-from transformers import AutoTokenizer

 from vllm.transformers_utils.tokenizer import get_tokenizer

@ -26,32 +22,10 @@ from ...utils import RemoteOpenAIServer
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 # technically these adapters use a different base model,
 # but we're not testing generation quality here
-LORA_NAME = "typeof/zephyr-7b-beta-lora"

 GUIDED_DECODING_BACKENDS = ["outlines", "xgrammar", "guidance"]


-@pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
-
-
-@pytest.fixture(scope="module")
-def zephyr_lora_added_tokens_files(zephyr_lora_files):
-    tmp_dir = TemporaryDirectory()
-    tmp_model_dir = f"{tmp_dir.name}/zephyr"
-    shutil.copytree(zephyr_lora_files, tmp_model_dir)
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    # Copy tokenizer to adapter and add some unique tokens
-    # 32000, 32001, 32002
-    added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
-                                 special_tokens=True)
-    assert added == 3
-    tokenizer.save_pretrained(tmp_model_dir)
-    yield tmp_model_dir
-    tmp_dir.cleanup()
-
-
@pytest.fixture(scope="module")
 def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files):
    return [
--- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
+++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
@ -3,48 +3,23 @@

 import base64
 import io
-import shutil
-from tempfile import TemporaryDirectory

 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
 import torch
 # downloading lora to test lora requests
-from huggingface_hub import snapshot_download
 from openai import BadRequestError
-from transformers import AutoConfig, AutoTokenizer
+from transformers import AutoConfig

 from ...utils import RemoteOpenAIServer

 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-LORA_NAME = "typeof/zephyr-7b-beta-lora"

 CONFIG = AutoConfig.from_pretrained(MODEL_NAME)


-@pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
-
-
-@pytest.fixture(scope="module")
-def zephyr_lora_added_tokens_files(zephyr_lora_files):
-    tmp_dir = TemporaryDirectory()
-    tmp_model_dir = f"{tmp_dir.name}/zephyr"
-    shutil.copytree(zephyr_lora_files, tmp_model_dir)
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    # Copy tokenizer to adapter and add some unique tokens
-    # 32000, 32001, 32002
-    added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
-                                 special_tokens=True)
-    assert added == 3
-    tokenizer.save_pretrained(tmp_model_dir)
-    yield tmp_model_dir
-    tmp_dir.cleanup()
-
-
@pytest.fixture(scope="module")
 def default_server_args(
    zephyr_lora_files,
--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@ -9,8 +9,6 @@ from contextlib import suppress
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
-# downloading lora to test lora requests
-from huggingface_hub import snapshot_download

 from ...utils import RemoteOpenAIServer

@ -18,7 +16,6 @@ from ...utils import RemoteOpenAIServer
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 # technically this needs Mistral-7B-v0.1 as base, but we're not testing
 # generation quality here
-LORA_NAME = "typeof/zephyr-7b-beta-lora"

 BADREQUEST_CASES = [
    (
@ -48,11 +45,6 @@ BADREQUEST_CASES = [
 ]


-@pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
-
-
@pytest.fixture(scope="module")
 def monkeypatch_module():
    from _pytest.monkeypatch import MonkeyPatch
--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
@ -47,6 +47,7 @@ class MockModelConfig:
    allowed_local_media_path: str = ""
    encoder_config = None
    generation_config: str = "auto"
+    skip_tokenizer_init: bool = False

    def get_diff_sampling_param(self):
        return self.diff_sampling_param or {}
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@ -250,12 +250,15 @@ EXPECTED_METRICS_V1 = [
    "vllm:request_params_max_tokens_sum",
    "vllm:request_params_max_tokens_bucket",
    "vllm:request_params_max_tokens_count",
-    "vllm:time_to_first_token_seconds_sum",
-    "vllm:time_to_first_token_seconds_bucket",
-    "vllm:time_to_first_token_seconds_count",
    "vllm:time_per_output_token_seconds_sum",
    "vllm:time_per_output_token_seconds_bucket",
    "vllm:time_per_output_token_seconds_count",
+    "vllm:time_to_first_token_seconds_sum",
+    "vllm:time_to_first_token_seconds_bucket",
+    "vllm:time_to_first_token_seconds_count",
+    "vllm:inter_token_latency_seconds_sum",
+    "vllm:inter_token_latency_seconds_bucket",
+    "vllm:inter_token_latency_seconds_count",
    "vllm:e2e_request_latency_seconds_sum",
    "vllm:e2e_request_latency_seconds_bucket",
    "vllm:e2e_request_latency_seconds_count",
@ -273,7 +276,11 @@ EXPECTED_METRICS_V1 = [
    "vllm:request_decode_time_seconds_count",
 ]

-HIDDEN_DEPRECATED_METRICS: list[str] = []
+HIDDEN_DEPRECATED_METRICS: list[str] = [
+    "vllm:time_per_output_token_seconds_sum",
+    "vllm:time_per_output_token_seconds_bucket",
+    "vllm:time_per_output_token_seconds_count",
+]


@pytest.mark.asyncio
@ -289,9 +296,10 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
    assert response.status_code == HTTPStatus.OK

    for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
-        if (not server.show_hidden_metrics
-                and metric not in HIDDEN_DEPRECATED_METRICS):
-            assert metric in response.text
+        if (metric in HIDDEN_DEPRECATED_METRICS
+                and not server.show_hidden_metrics):
+            continue
+        assert metric in response.text


@pytest.mark.asyncio
--- a/tests/entrypoints/openai/test_models.py
+++ b/tests/entrypoints/openai/test_models.py
@ -4,8 +4,6 @@
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
-# downloading lora to test lora requests
-from huggingface_hub import snapshot_download

 from ...utils import RemoteOpenAIServer

@ -13,12 +11,6 @@ from ...utils import RemoteOpenAIServer
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 # technically this needs Mistral-7B-v0.1 as base, but we're not testing
 # generation quality here
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
-
-
-@pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)


@pytest.fixture(scope="module")
--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@ -11,8 +11,6 @@ from vllm.transformers_utils.tokenizer import get_tokenizer

 from ...utils import RemoteOpenAIServer
 from .test_completion import default_server_args  # noqa: F401
-from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
-from .test_completion import zephyr_lora_files  # noqa: F401
 from .test_completion import MODEL_NAME


--- a/tests/entrypoints/openai/test_token_in_token_out.py
+++ b/tests/entrypoints/openai/test_token_in_token_out.py
@ -0,0 +1,73 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+import tempfile
+
+import pytest
+
+from vllm.model_executor.model_loader.weight_utils import (
+    download_weights_from_hf)
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "Qwen/Qwen3-0.6B"
+MODEL_PATH = os.path.join(tempfile.gettempdir(), "qwen3_06b")
+
+
+@pytest.fixture(scope="module")
+def server():
+    global MODEL_PATH
+    MODEL_PATH = download_weights_from_hf(
+        MODEL_NAME,
+        allow_patterns=["*"],
+        cache_dir=MODEL_PATH,
+        ignore_patterns=["tokenizer*", "vocab*", "*.safetensors"])
+    args = [
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "128",
+        "--enforce-eager",
+        "--skip-tokenizer-init",
+        "--load-format",
+        "dummy",
+    ]
+    with RemoteOpenAIServer(MODEL_PATH, args) as remote_server:
+        yield remote_server
+
+
+@pytest.mark.asyncio
+async def test_token_in_token_out_and_logprobs(server):
+    """
+    Test token-in-token-out and token_ids align with prompt_logprobs
+    & logprobs when return_tokens_as_token_ids is enabled.
+    """
+    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
+    text = "Hello, world! How are you today?"
+    token_ids = tokenizer.encode(text)
+    async with server.get_async_client() as client:
+        # Test with both return_token_ids and return_tokens_as_token_ids enabled
+        completion = await client.completions.create(
+            model=MODEL_PATH,
+            prompt=token_ids,
+            max_tokens=20,
+            temperature=0,
+            echo=True,
+            extra_body={
+                "return_token_ids": True,
+            },
+        )
+
+        # Verify all fields are present
+        assert (completion.choices[0].token_ids is not None
+                and 0 < len(completion.choices[0].token_ids) <= 20)
+        assert completion.choices[0].prompt_token_ids is not None
+
+        # Decode prompt tokens
+        if completion.choices[0].prompt_token_ids:
+            prompt_text = tokenizer.decode(
+                completion.choices[0].prompt_token_ids)
+            # The decoded prompt should match or close to original prompt
+            assert prompt_text == text
--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
@ -8,8 +8,6 @@ import requests
 from vllm.transformers_utils.tokenizer import get_tokenizer

 from ...utils import RemoteOpenAIServer
-from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
-from .test_completion import zephyr_lora_files  # noqa: F401

 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@ -12,8 +12,6 @@ import pytest
 import pytest_asyncio
 import soundfile as sf

-from vllm.assets.audio import AudioAsset
-
 from ...utils import RemoteOpenAIServer

 MODEL_NAME = "openai/whisper-large-v3-turbo"
@ -24,20 +22,6 @@ MISTRAL_FORMAT_ARGS = [
 ]


-@pytest.fixture
-def mary_had_lamb():
-    path = AudioAsset('mary_had_lamb').get_local_path()
-    with open(str(path), "rb") as f:
-        yield f
-
-
-@pytest.fixture
-def winning_call():
-    path = AudioAsset('winning_call').get_local_path()
-    with open(str(path), "rb") as f:
-        yield f
-
-
@pytest.fixture(scope="module")
 def server():
    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
@ -76,6 +60,25 @@ async def test_basic_audio(mary_had_lamb, model_name):
        assert out_usage["seconds"] == 16, out_usage["seconds"]


+@pytest.mark.asyncio
+async def test_basic_audio_gemma(foscolo):
+    # Gemma accuracy on some of the audio samples we use is particularly bad,
+    # hence we use a different one here. WER is evaluated separately.
+    model_name = "google/gemma-3n-E2B-it"
+    server_args = ["--enforce-eager"]
+
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        transcription = await client.audio.transcriptions.create(
+            model=model_name,
+            file=foscolo,
+            language="it",
+            response_format="text",
+            temperature=0.0)
+        out = json.loads(transcription)['text']
+        assert "da cui vergine nacque Venere" in out
+
+
@pytest.mark.asyncio
 async def test_non_asr_model(winning_call):
    # text to text model
--- a/tests/entrypoints/openai/test_translation_validation.py
+++ b/tests/entrypoints/openai/test_translation_validation.py
@ -12,32 +12,24 @@ import pytest
 import pytest_asyncio
 import soundfile as sf

-from vllm.assets.audio import AudioAsset
-
 from ...utils import RemoteOpenAIServer

-MODEL_NAME = "openai/whisper-small"
 SERVER_ARGS = ["--enforce-eager"]


-@pytest.fixture
-def foscolo():
-    # Test translation it->en
-    path = AudioAsset('azacinto_foscolo').get_local_path()
-    with open(str(path), "rb") as f:
-        yield f
-
-
-@pytest.fixture(scope="module")
-def server():
-    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
-        yield remote_server
+@pytest.fixture(scope="module",
+                params=["openai/whisper-small", "google/gemma-3n-E2B-it"])
+def server(request):
+    # Parametrize over model name
+    with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
+        yield remote_server, request.param


@pytest_asyncio.fixture
-async def client(server):
+async def client_and_model(server):
+    server, model_name = server
    async with server.get_async_client() as async_client:
-        yield async_client
+        yield async_client, model_name


@pytest.mark.asyncio
@ -56,27 +48,29 @@ async def test_non_asr_model(foscolo):

 # NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
@pytest.mark.asyncio
-async def test_basic_audio(foscolo, client):
+async def test_basic_audio(foscolo, client_and_model):
+    client, model_name = client_and_model
    translation = await client.audio.translations.create(
-        model=MODEL_NAME,
+        model=model_name,
        file=foscolo,
        response_format="text",
-        # TODO remove once language detection is implemented
-        extra_body=dict(language="it"),
+        # TODO remove `language="it"` once language detection is implemented
+        extra_body=dict(language="it", to_language="en"),
        temperature=0.0)
    out = json.loads(translation)['text'].strip().lower()
    assert "greek sea" in out


@pytest.mark.asyncio
-async def test_audio_prompt(foscolo, client):
+async def test_audio_prompt(foscolo, client_and_model):
+    client, model_name = client_and_model
    # Condition whisper on starting text
    prompt = "Nor have I ever"
    transcription = await client.audio.translations.create(
-        model=MODEL_NAME,
+        model=model_name,
        file=foscolo,
        prompt=prompt,
-        extra_body=dict(language="it"),
+        extra_body=dict(language="it", to_language="en"),
        response_format="text",
        temperature=0.0)
    out = json.loads(transcription)['text']
@ -85,22 +79,27 @@ async def test_audio_prompt(foscolo, client):


@pytest.mark.asyncio
-async def test_streaming_response(foscolo, client, server):
+async def test_streaming_response(foscolo, client_and_model, server):
+    client, model_name = client_and_model
    translation = ""
    res_no_stream = await client.audio.translations.create(
-        model=MODEL_NAME,
+        model=model_name,
        file=foscolo,
        response_format="json",
-        extra_body=dict(language="it"),
+        extra_body=dict(language="it", to_language="en", seed=42),
        temperature=0.0)
+
    # Stream via HTTPX since OpenAI translation client doesn't expose streaming
+    server, model_name = server
    url = server.url_for("v1/audio/translations")
    headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
    data = {
-        "model": MODEL_NAME,
+        "model": model_name,
        "language": "it",
+        "to_language": "en",
        "stream": True,
        "temperature": 0.0,
+        "seed": 42,
    }
    foscolo.seek(0)
    async with httpx.AsyncClient() as http_client:
@ -121,16 +120,24 @@ async def test_streaming_response(foscolo, client, server):
                text = chunk["choices"][0].get("delta", {}).get("content")
                translation += text or ""

-    assert translation == res_no_stream.text
+    res_stream = translation.split()
+    # NOTE There's a small non-deterministic issue here, likely in the attn
+    # computation, which will cause a few tokens to be different, while still
+    # being very close semantically.
+    assert sum([
+        x == y for x, y in zip(res_stream, res_no_stream.text.split())
+    ]) >= len(res_stream) * 0.9


@pytest.mark.asyncio
-async def test_stream_options(foscolo, client, server):
+async def test_stream_options(foscolo, server):
+    server, model_name = server
    url = server.url_for("v1/audio/translations")
    headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
    data = {
-        "model": MODEL_NAME,
+        "model": model_name,
        "language": "it",
+        "to_language": "en",
        "stream": True,
        "stream_include_usage": True,
        "stream_continuous_usage_stats": True,
@ -164,7 +171,10 @@ async def test_stream_options(foscolo, client, server):


@pytest.mark.asyncio
-async def test_long_audio_request(foscolo, client):
+async def test_long_audio_request(foscolo, client_and_model):
+    client, model_name = client_and_model
+    if model_name == "google/gemma-3n-E2B-it":
+        pytest.skip("Gemma3n does not support long audio requests")
    foscolo.seek(0)
    audio, sr = librosa.load(foscolo)
    repeated_audio = np.tile(audio, 2)
@ -173,9 +183,9 @@ async def test_long_audio_request(foscolo, client):
    sf.write(buffer, repeated_audio, sr, format='WAV')
    buffer.seek(0)
    translation = await client.audio.translations.create(
-        model=MODEL_NAME,
+        model=model_name,
        file=buffer,
-        extra_body=dict(language="it"),
+        extra_body=dict(language="it", to_language="en"),
        response_format="text",
        temperature=0.0)
    out = json.loads(translation)['text'].strip().lower()
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
--- a/tests/kernels/moe/test_deepep_deepgemm_moe.py
+++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py
@ -282,7 +282,7 @@ def triton_impl(a: torch.Tensor, topk_ids: torch.Tensor,
        a1_scale=a1_scale,
        block_shape=block_shape,
        # Make sure this is set to False so we
-        # dont end up comparing the same implementation.
+        # don't end up comparing the same implementation.
        allow_deep_gemm=False)


--- a/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
+++ b/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
@ -8,7 +8,8 @@ from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types

-if not current_platform.has_device_capability(100):
+if not (current_platform.has_device_capability(100)
+        and hasattr(torch.ops._C, "silu_and_mul_nvfp4_quant")):
    pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.",
                allow_module_level=True)

--- a/tests/lora/test_add_lora.py
+++ b/tests/lora/test_add_lora.py
@ -59,10 +59,10 @@ async def requests_processing_time(llm,
@pytest.mark.asyncio
 async def test_add_lora(chatglm3_lora_files):
    """ 
-    The add_lora function is used to pre-load some LoRA adapters into the
+    The add_lora function is used to preload some LoRA adapters into the
    engine in anticipation of future requests using these adapters. To test
    this functionality, we use the async engine to process some requests - We
-    do it twice, once with add_lora() pre-loading and once without.
+    do it twice, once with add_lora() preloading and once without.

    We measure the request processing time in both cases and expect the time 
    to be lesser in the case with add_lora() calls.
--- a/tests/lora/test_chatglm3_tp.py
+++ b/tests/lora/test_chatglm3_tp.py
@ -87,6 +87,9 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
 def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
+    # https://github.com/NVIDIA/nccl/issues/1790, set a lower value for
+    # gpu_memory_utilization here because NCCL >= 2.26.3 seems to use
+    # more GPU memory causing vLLM to OOM
    llm = vllm.LLM(MODEL_PATH,
                   max_model_len=1024,
                   enable_lora=True,
@ -95,7 +98,8 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
                   tensor_parallel_size=4,
                   trust_remote_code=True,
                   fully_sharded_loras=True,
-                   enable_chunked_prefill=True)
+                   enable_chunked_prefill=True,
+                   gpu_memory_utilization=0.85)
    output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
    for i in range(len(EXPECTED_LORA_OUTPUT)):
        assert output1[i] == EXPECTED_LORA_OUTPUT[i]
--- a/tests/lora/test_lora_allowed_token_ids.py
+++ b/tests/lora/test_lora_allowed_token_ids.py
@ -18,7 +18,7 @@ def test_allowed_token_ids_with_lora_vocab(llama_2_7b_base_huggingface_id,
    adapters that define additional tokens.
    """

-    # Setup a base model compatible with the sql_lora_files adapter and
+    # Set up a base model compatible with the sql_lora_files adapter and
    # a known number of tokens in the base model.
    model_config = ModelConfig(
        model=llama_2_7b_base_huggingface_id,
@ -84,7 +84,7 @@ def test_allowed_token_ids_with_lora_adapter_no_vocab(
    adapters that do not define additional tokens.
    """

-    # Setup a base model compatible with the qwen25vl_lora_files adapter and
+    # Set up a base model compatible with the qwen25vl_lora_files adapter and
    # a known number of tokens in the base model.
    model_config = ModelConfig(
        model=qwen25vl_base_huggingface_id,
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@ -13,7 +13,7 @@ from ...registry import HF_EXAMPLE_MODELS
 from ...utils import check_logprobs_close

 # These have unsupported head_dim for FA. We do not
-# not have a clean way to fall back, so we fail with
+# have a clean way to fall back, so we fail with
 # a clear msg when it happens.
 # https://github.com/vllm-project/vllm/issues/14524
 REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
@ -92,7 +92,8 @@ AITER_MODEL_LIST = [
        pytest.param(
            "allenai/OLMoE-1B-7B-0924-Instruct",
            marks=[pytest.mark.cpu_model],
-        )
+        ),
+        pytest.param("swiss-ai/Apertus-8B"),  # apertus
    ])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@ -34,17 +34,6 @@ HYBRID_MODELS = [
    "LiquidAI/LFM2-1.2B",
 ]

-HF_UNSUPPORTED_MODELS = [
-    # The HF transformers implementation of
-    # Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test
-    # doesn't compare vLLM output with HF output.
-    # See https://github.com/huggingface/transformers/pull/35943
-    "yujiepan/mamba2-codestral-v0.1-tiny-random",
-    # transformers 4.55 is still producing garbage for this model
-    # TODO(tdoublep): follow-up on transformers side
-    "ibm-granite/granite-4.0-tiny-preview"
-]
-
 V1_SUPPORTED_MODELS = [
    "state-spaces/mamba-130m-hf",
    "ai21labs/Jamba-tiny-dev",
@ -65,6 +54,11 @@ V0_UNSUPPORTED_MODELS = [
    "LiquidAI/LFM2-1.2B",
 ]

+FP32_STATE_MODELS = [
+    "state-spaces/mamba-130m-hf",
+    "Zyphra/Zamba2-1.2B-instruct",
+]
+
 # Avoid OOM
 MAX_NUM_SEQS = 4

@ -85,20 +79,13 @@ def test_models(
    try:
        model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
        model_info.check_available_online(on_fail="skip")
-        hf_version_check = model_info.check_transformers_version(
-            on_fail="return")
+        model_info.check_transformers_version(on_fail="skip")
    except ValueError:
-        hf_version_check = None
-
-    if hf_version_check is not None:
-        print(f"Skipping transformers comparison because: {hf_version_check}")
+        pass

    with hf_runner(model) as hf_model:
-        if model not in HF_UNSUPPORTED_MODELS and hf_version_check is None:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, num_logprobs)
-        else:
-            hf_outputs = None
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "0")
@ -116,7 +103,7 @@ def test_models(
    else:
        vllm_v1_outputs = None

-    if hf_outputs is not None and vllm_v0_outputs is not None:
+    if vllm_v0_outputs is not None:
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_v0_outputs,
@ -125,12 +112,10 @@ def test_models(
        )

    if model in V1_SUPPORTED_MODELS:
-        ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
-        assert ref_outputs is not None
        check_logprobs_close(
-            outputs_0_lst=ref_outputs,
+            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_v1_outputs,
-            name_0="hf" if hf_outputs is not None else "vllm-v0",
+            name_0="hf",
            name_1="vllm-v1",
        )

@ -397,11 +382,8 @@ def test_full_cuda_graph(
        pass

    with hf_runner(model) as hf_model:
-        if model not in HF_UNSUPPORTED_MODELS:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, num_logprobs)
-        else:
-            hf_outputs = None
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "0")
@ -416,7 +398,7 @@ def test_full_cuda_graph(
        vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

-    if hf_outputs is not None and vllm_v0_outputs is not None:
+    if vllm_v0_outputs is not None:
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_v0_outputs,
@ -424,17 +406,15 @@ def test_full_cuda_graph(
            name_1="vllm-v0",
        )

-    ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
-    assert ref_outputs is not None
    check_logprobs_close(
-        outputs_0_lst=ref_outputs,
+        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_v1_outputs,
-        name_0="hf" if hf_outputs is not None else "vllm-v0",
+        name_0="hf",
        name_1="vllm-v1",
    )


-@pytest.mark.parametrize("model", ["Zyphra/Zamba2-1.2B-instruct"])
+@pytest.mark.parametrize("model", FP32_STATE_MODELS)
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
 def test_fp32_state(
@ -455,11 +435,8 @@ def test_fp32_state(
        pass

    with hf_runner(model) as hf_model:
-        if model not in HF_UNSUPPORTED_MODELS:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, num_logprobs)
-        else:
-            hf_outputs = None
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "0")
@ -475,18 +452,16 @@ def test_fp32_state(
        vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

-    if hf_outputs is not None:
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_v0_outputs,
-            name_0="hf",
-            name_1="vllm-v0",
-        )
-
-    ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
    check_logprobs_close(
-        outputs_0_lst=ref_outputs,
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_v0_outputs,
+        name_0="hf",
+        name_1="vllm-v0",
+    )
+
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_v1_outputs,
-        name_0="hf" if hf_outputs is not None else "vllm-v0",
+        name_0="hf",
        name_1="vllm-v1",
    )
--- a/tests/models/language/generation/test_mistral.py
+++ b/tests/models/language/generation/test_mistral.py
@ -20,7 +20,7 @@ MISTRAL_FORMAT_MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.3",
    # uses the v3-Tekken tokenizer
    "mistralai/Ministral-8B-Instruct-2410",
-    # Mistral-Nemo is to big for CI, but passes locally
+    # Mistral-Nemo is too big for CI, but passes locally
    # "mistralai/Mistral-Nemo-Instruct-2407"
 ]

@ -273,7 +273,7 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:


 def test_mistral_function_call_nested_json():
-    """Ensure that the function-name regex captures the entire outer-most
+    """Ensure that the function-name regex captures the entire outermost
    JSON block, including nested braces."""

    # Create a minimal stub tokenizer that provides the few attributes the
--- a/tests/models/multimodal/generation/test_qwen2_vl.py
+++ b/tests/models/multimodal/generation/test_qwen2_vl.py
@ -154,7 +154,7 @@ def batch_make_image_embeddings(
        embed_counter += cur_batch_embed_len
        image_counter += cur_batch_image_count

-    # ensure we don't lost any images or embeddings
+    # ensure we don't lose any images or embeddings
    assert embed_counter == image_embeds.size(0)
    assert image_counter == image_grid_thw.size(0)
    assert len(image_batches) == len(result)
@ -238,7 +238,7 @@ def batch_make_video_embeddings(
        embed_counter += cur_batch_embed_len
        video_counter += cur_batch_video_count

-    # ensure we don't lost any videos or embeddings
+    # ensure we don't lose any videos or embeddings
    assert embed_counter == video_embeds.size(0)
    assert video_counter == video_grid_thw.size(0)
    assert len(video_batches) == len(result)
--- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
+++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
@ -1,12 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Custom input builders for edge-cases in different models."""
-from io import BytesIO
 from typing import Callable

-import requests
-from PIL import Image
-
+from vllm.assets.image import ImageAsset
 from vllm.multimodal.image import rescale_image_size
 from vllm.multimodal.video import (rescale_video_size, resize_video,
                                   sample_frames_from_video)
@ -118,9 +115,9 @@ def different_patch_input_cases_internvl():


 def windows_attention_image_qwen2_5_vl():
-    # image from regression issue: https://github.com/vllm-project/vllm/issues/15122
-    image_url = "https://aomediacodec.github.io/av1-avif/testFiles/Link-U/hato.jpg"
-    image = Image.open(BytesIO(requests.get(image_url).content))
+
+    # image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501
+    image = ImageAsset("hato").pil_image

    question = "Describe the image."
    img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@ -293,6 +293,7 @@ def _test_processing_correctness_one(
    "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
    "OpenGVLab/InternVL3_5-30B-A3B",
    "Kwai-Keye/Keye-VL-8B-Preview",
+    "Kwai-Keye/Keye-VL-1_5-8B",
    "moonshotai/Kimi-VL-A3B-Instruct",
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "llava-hf/llava-1.5-7b-hf",
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@ -1,30 +1,31 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import tempfile
 from collections.abc import Iterable
+from contextlib import contextmanager
 from functools import partial
 from typing import Any, Union
-from unittest.mock import patch

 import numpy as np
 import pytest
+import torch.nn as nn
 from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
                                                       UserMessage)
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from PIL import Image

-from vllm.config import ModelConfig
-from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
+from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
+from vllm.distributed import (cleanup_dist_env_and_memory,
+                              init_distributed_environment,
+                              initialize_model_parallel)
 from vllm.inputs import InputProcessingContext
-from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
-                             MultiModalKwargs)
+from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
 from vllm.multimodal.processing import BaseMultiModalProcessor
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
-from vllm.utils import GiB_bytes, is_list_of, set_default_torch_num_threads
-from vllm.v1.core.kv_cache_utils import get_kv_cache_config
-from vllm.v1.engine.core import EngineCore as V1EngineCore
+from vllm.utils import is_list_of

-from ....conftest import VllmRunner
 from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS
 from ...utils import dummy_hf_overrides

@ -137,6 +138,27 @@ def create_batched_mm_kwargs(
    return group_mm_kwargs_by_modality(items)


+@contextmanager
+def initialize_dummy_model(model_cls: nn.Module, model_config: ModelConfig):
+    temp_file = tempfile.mkstemp()[1]
+    init_distributed_environment(
+        world_size=1,
+        rank=0,
+        distributed_init_method=f"file://{temp_file}",
+        local_rank=0,
+        backend="nccl",
+    )
+    initialize_model_parallel(tensor_model_parallel_size=1)
+    vllm_config = VllmConfig(model_config=model_config)
+    with set_current_vllm_config(vllm_config=vllm_config):
+        with set_default_torch_dtype(model_config.dtype):
+            model = model_cls(vllm_config=vllm_config)
+        yield model
+
+    del model
+    cleanup_dist_env_and_memory()
+
+
 def get_model_id_to_test(
        model_arch_list: Iterable[str]) -> list[tuple[str, str]]:
    filtered_results = []
@ -155,8 +177,7 @@ def get_model_id_to_test(
@pytest.mark.parametrize(
    "model_arch, model_id",
    get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys()))
-def test_model_tensor_schema(model_arch: str, model_id: str,
-                             vllm_runner: type[VllmRunner], monkeypatch):
+def test_model_tensor_schema(model_arch: str, model_id: str):
    if model_arch in ARCH_TO_SKIP:
        pytest.skip(f"Skipping {model_arch} due to {ARCH_TO_SKIP[model_arch]}")
    if model_id in REPO_ID_TO_SKIP:
@ -177,14 +198,20 @@ def test_model_tensor_schema(model_arch: str, model_id: str,
        tokenizer_mode=model_info.tokenizer_mode,
        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
-        hf_overrides=model_info.hf_overrides,
+        hf_overrides=hf_overrides_fn,
    )
    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
    factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]

-    if not any(
-            hasattr(model_cls, f"_parse_and_validate_{m}_input")
-            for m in ["image", "video", "audio"]):
+    inputs_parse_methods = []
+    for attr_name in dir(model_cls):
+        attr = getattr(model_cls, attr_name)
+        if hasattr(attr, "__annotations__"):
+            return_type = attr.__annotations__.get("return", None)
+            if return_type is not None and "Input" in str(return_type):
+                inputs_parse_methods.append(attr_name)
+
+    if not any(inputs_parse_methods):
        pytest.skip(f"{model_arch} does not support tensor schema validation.")

    ctx = InputProcessingContext(
@ -197,68 +224,13 @@ def test_model_tensor_schema(model_arch: str, model_id: str,
        modality: 3 if limit is None else limit
        for modality, limit in supported_mm_limits.items()
    }
+    model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt
+    processor = factories.build_processor(ctx, cache=None)

-    # Avoid calling model.forward()
-    def _initialize_kv_caches_v0(self) -> None:
-        self.cache_config.num_gpu_blocks = 0
-        self.cache_config.num_cpu_blocks = 0
-
-    def _initialize_kv_caches_v1(self, vllm_config):
-        kv_cache_specs = self.model_executor.get_kv_cache_specs()
-        scheduler_kv_cache_config = get_kv_cache_config(
-            vllm_config,
-            kv_cache_specs[0],
-            10 * GiB_bytes,
-        )
-
-        # gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
-        return 1, 0, scheduler_kv_cache_config
-
-    with (patch.object(V0LLMEngine, "_initialize_kv_caches",
-                       _initialize_kv_caches_v0),
-          patch.object(V1EngineCore, "_initialize_kv_caches",
-                       _initialize_kv_caches_v1), monkeypatch.context() as m):
-        m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
-        if model_info.v0_only:
-            m.setenv("VLLM_USE_V1", "0")
-
-        # TODO(Isotr0py): Can we avoid initializing engine?
-        with (
-                set_default_torch_num_threads(1),
-                vllm_runner(
-                    model_id,
-                    tokenizer_name=model_info.tokenizer,
-                    tokenizer_mode=model_info.tokenizer_mode,
-                    revision=model_info.revision,
-                    trust_remote_code=model_info.trust_remote_code,
-                    max_model_len=model_info.max_model_len,
-                    load_format="dummy",
-                    hf_overrides=hf_overrides_fn,
-                    limit_mm_per_prompt=limit_mm_per_prompt,
-                    enforce_eager=True,
-                ) as vllm_model,
-        ):
-            model_config = vllm_model.llm.llm_engine.model_config
-            llm_engine = vllm_model.llm.llm_engine
-
-            if hasattr(llm_engine, "processor"):
-                # v1 processor
-                mm_registry = llm_engine.processor.mm_registry
-            else:
-                # v0 input_preprocessor
-                mm_registry = llm_engine.input_preprocessor.mm_registry
-
-            processor = mm_registry.create_processor(model_config)
-
-            def validate_model_input(model, modality: str,
-                                     mm_kwargs: MultiModalKwargs):
-                method_name = f"_parse_and_validate_{modality}_input"
-                if hasattr(model, method_name):
-                    getattr(model, method_name)(**mm_kwargs)
-
-            for modality, _, mm_kwargs in create_batched_mm_kwargs(
-                    model_config, processor):
-                valid_func = partial(validate_model_input,
-                                     modality=modality,
-                                     mm_kwargs=mm_kwargs)
-                vllm_model.apply_model(valid_func)
+    with initialize_dummy_model(model_cls, model_config) as model:
+        for modality, _, mm_kwargs in create_batched_mm_kwargs(
+                model_config, processor):
+            for method_name in inputs_parse_methods:
+                print(f"Testing `{method_name}` with modality={modality} "
+                      f"and mm_kwargs{list(mm_kwargs.keys())}")
+                getattr(model, method_name)(modality=modality, **mm_kwargs)
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@ -137,6 +137,9 @@ class _HfExamplesInfo:
 # yapf: disable
 _TEXT_GENERATION_EXAMPLE_MODELS = {
    # [Decoder-only]
+    "ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B",
+                                          min_transformers_version="4.56.0",
+                                          trust_remote_code=True),
    "AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B",
                                   trust_remote_code=True),
    "AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B",
@ -151,7 +154,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "BailingMoeForCausalLM": _HfExamplesInfo("inclusionAI/Ling-lite-1.5",
                                         trust_remote_code=True),
    "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B-v1",
-                                        min_transformers_version="4.56.0",
+                                        min_transformers_version="4.55.3",
                                        extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}),  # noqa: E501
    "BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m",
                                        {"1b": "bigscience/bloomz-1b1"}),
@ -205,7 +208,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "GptOssForCausalLM": _HfExamplesInfo("lmsys/gpt-oss-20b-bf16"),
    "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"),
    "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"),
-    "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview"),  # noqa: E501
+    "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview", # noqa: E501
+                                                   min_transformers_version="4.55.3"),
    "GraniteMoeSharedForCausalLM": _HfExamplesInfo("ibm-research/moe-7b-1b-active-shared-experts"),  # noqa: E501
    "Grok1ModelForCausalLM": _HfExamplesInfo("hpcai-tech/grok-1",
                                             trust_remote_code=True),
@ -225,7 +229,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                            trust_remote_code=True),
    "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
    "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
-                                        min_transformers_version="4.56.0",
+                                        min_transformers_version="4.55.3",
                                        extras={
                                            "tiny": "ai21labs/Jamba-tiny-dev",
                                            "random": "ai21labs/Jamba-tiny-random",  # noqa: E501
@ -241,7 +245,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "Llama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
                                         is_available_online=False),
    "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
-    "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"),
+    "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1",
+                                         min_transformers_version="4.55.3",
+                                         extras={
+                                            "random": "yujiepan/mamba2-codestral-v0.1-tiny-random", # noqa: E501
+                                         }),
    "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"),  # noqa: E501
    "MiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-2B-sft-bf16",
                                         trust_remote_code=True),
@ -435,6 +443,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "InternVLForConditionalGeneration": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"),    # noqa: E501
    "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
                                                    trust_remote_code=True),
+    "KeyeVL1_5ForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-1_5-8B", # noqa: E501
+                                                         trust_remote_code=True),
    "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct",  # noqa: E501
                                                      extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"},  # noqa: E501
                                                      trust_remote_code=True),
--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@ -24,6 +24,9 @@ from .registry import HF_EXAMPLE_MODELS

@pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs())
 def test_registry_imports(model_arch):
+    # Skip if transformers version is incompatible
+    model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
+    model_info.check_transformers_version(on_fail="skip")
    # Ensure all model classes can be imported successfully
    model_cls = ModelRegistry._try_load_model_cls(model_arch)
    assert model_cls is not None
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@ -636,8 +636,10 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,

    # Run the model through the sharded function
    with torch.inference_mode():
-        sharded_output = run_dp_sharded_mrope_vision_model(
-            vision_model, pixel_values, grid_thw_list)
+        sharded_output = run_dp_sharded_mrope_vision_model(vision_model,
+                                                           pixel_values,
+                                                           grid_thw_list,
+                                                           rope_type="rope_3d")
        sharded_output = torch.cat(sharded_output, dim=0)

    # Check that the world size is setup correctly
@ -691,8 +693,10 @@ def run_dp_sharded_mrope_vision_model_empty_input_worker(

    # Should handle empty input gracefully
    with torch.inference_mode():
-        output = run_dp_sharded_mrope_vision_model(vision_model, pixel_values,
-                                                   grid_thw_list)
+        output = run_dp_sharded_mrope_vision_model(vision_model,
+                                                   pixel_values,
+                                                   grid_thw_list,
+                                                   rope_type="rope_3d")

    assert len(output) == 0

@ -745,8 +749,10 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(

    # Should handle uneven distribution without errors
    with torch.inference_mode():
-        output_tuple = run_dp_sharded_mrope_vision_model(
-            vision_model, pixel_values, grid_thw_list)
+        output_tuple = run_dp_sharded_mrope_vision_model(vision_model,
+                                                         pixel_values,
+                                                         grid_thw_list,
+                                                         rope_type="rope_3d")

    # Verify output shape is reasonable
    merge_factor = vision_model.spatial_merge_size**2
--- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/init.py
+++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/init.py
@ -0,0 +1,8 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+def register_prithvi_india():
+    return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessorIndia"  # noqa: E501
+
+
+def register_prithvi_valencia():
+    return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessorValencia"  # noqa: E501
--- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
+++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
@ -0,0 +1,432 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import base64
+import datetime
+import os
+import tempfile
+import urllib.request
+from collections.abc import Sequence
+from typing import Any, Optional, Union
+
+import albumentations
+import numpy as np
+import rasterio
+import regex as re
+import torch
+from einops import rearrange
+from terratorch.datamodules import Sen1Floods11NonGeoDataModule
+
+from vllm.config import VllmConfig
+from vllm.entrypoints.openai.protocol import (IOProcessorRequest,
+                                              IOProcessorResponse)
+from vllm.inputs.data import PromptType
+from vllm.logger import init_logger
+from vllm.outputs import PoolingRequestOutput
+from vllm.plugins.io_processors.interface import (IOProcessor,
+                                                  IOProcessorInput,
+                                                  IOProcessorOutput)
+
+from .types import DataModuleConfig, ImagePrompt, ImageRequestOutput
+
+logger = init_logger(__name__)
+
+NO_DATA = -9999
+NO_DATA_FLOAT = 0.0001
+OFFSET = 0
+PERCENTILE = 99
+
+DEFAULT_INPUT_INDICES = [0, 1, 2, 3, 4, 5]
+
+datamodule_config: DataModuleConfig = {
+    "bands": ["BLUE", "GREEN", "RED", "NIR_NARROW", "SWIR_1", "SWIR_2"],
+    "batch_size":
+    16,
+    "constant_scale":
+    0.0001,
+    "data_root":
+    "/dccstor/geofm-finetuning/datasets/sen1floods11",
+    "drop_last":
+    True,
+    "no_data_replace":
+    0.0,
+    "no_label_replace":
+    -1,
+    "num_workers":
+    8,
+    "test_transform": [
+        albumentations.Resize(always_apply=False,
+                              height=448,
+                              interpolation=1,
+                              p=1,
+                              width=448),
+        albumentations.pytorch.ToTensorV2(transpose_mask=False,
+                                          always_apply=True,
+                                          p=1.0),
+    ],
+}
+
+
+def save_geotiff(image: torch.Tensor, meta: dict,
+                 out_format: str) -> str | bytes:
+    """Save multi-band image in Geotiff file.
+
+    Args:
+        image: np.ndarray with shape (bands, height, width)
+        output_path: path where to save the image
+        meta: dict with meta info.
+    """
+    if out_format == "path":
+        # create temp file
+        file_path = os.path.join(os.getcwd(), "prediction.tiff")
+        with rasterio.open(file_path, "w", **meta) as dest:
+            for i in range(image.shape[0]):
+                dest.write(image[i, :, :], i + 1)
+
+        return file_path
+    elif out_format == "b64_json":
+        with tempfile.NamedTemporaryFile() as tmpfile:
+            with rasterio.open(tmpfile.name, "w", **meta) as dest:
+                for i in range(image.shape[0]):
+                    dest.write(image[i, :, :], i + 1)
+
+            file_data = tmpfile.read()
+            return base64.b64encode(file_data)
+
+    else:
+        raise ValueError("Unknown output format")
+
+
+def _convert_np_uint8(float_image: torch.Tensor):
+    image = float_image.numpy() * 255.0
+    image = image.astype(dtype=np.uint8)
+
+    return image
+
+
+def read_geotiff(
+    file_path: Optional[str] = None,
+    path_type: Optional[str] = None,
+    file_data: Optional[bytes] = None,
+) -> tuple[torch.Tensor, dict, tuple[float, float] | None]:
+    """Read all bands from *file_path* and return image + meta info.
+
+    Args:
+        file_path: path to image file.
+
+    Returns:
+        np.ndarray with shape (bands, height, width)
+        meta info dict
+    """
+
+    if all([x is None for x in [file_path, path_type, file_data]]):
+        raise Exception("All input fields to read_geotiff are None")
+    write_to_file: Optional[bytes] = None
+    path: Optional[str] = None
+    if file_data is not None:
+        # with tempfile.NamedTemporaryFile() as tmpfile:
+        #     tmpfile.write(file_data)
+        #     path = tmpfile.name
+
+        write_to_file = file_data
+    elif file_path is not None and path_type == "url":
+        resp = urllib.request.urlopen(file_path)
+        # with tempfile.NamedTemporaryFile() as tmpfile:
+        #     tmpfile.write(resp.read())
+        #     path = tmpfile.name
+        write_to_file = resp.read()
+    elif file_path is not None and path_type == "path":
+        path = file_path
+    elif file_path is not None and path_type == "b64_json":
+        image_data = base64.b64decode(file_path)
+        # with tempfile.NamedTemporaryFile() as tmpfile:
+        #     tmpfile.write(image_data)
+        #     path = tmpfile.name
+        write_to_file = image_data
+    else:
+        raise Exception("Wrong combination of parameters to read_geotiff")
+
+    with tempfile.NamedTemporaryFile() as tmpfile:
+        path_to_use = None
+        if write_to_file:
+            tmpfile.write(write_to_file)
+            path_to_use = tmpfile.name
+        elif path:
+            path_to_use = path
+
+        with rasterio.open(path_to_use) as src:
+            img = src.read()
+            meta = src.meta
+            try:
+                coords = src.lnglat()
+            except Exception:
+                # Cannot read coords
+                coords = None
+
+    return img, meta, coords
+
+
+def load_image(
+    data: Union[list[str]],
+    path_type: str,
+    mean: Optional[list[float]] = None,
+    std: Optional[list[float]] = None,
+    indices: Optional[Union[list[int], None]] = None,
+):
+    """Build an input example by loading images in *file_paths*.
+
+    Args:
+        file_paths: list of file paths .
+        mean: list containing mean values for each band in the
+              images in *file_paths*.
+        std: list containing std values for each band in the
+             images in *file_paths*.
+
+    Returns:
+        np.array containing created example
+        list of meta info for each image in *file_paths*
+    """
+
+    imgs = []
+    metas = []
+    temporal_coords = []
+    location_coords = []
+
+    for file in data:
+        # if isinstance(file, bytes):
+        #     img, meta, coords = read_geotiff(file_data=file)
+        # else:
+        img, meta, coords = read_geotiff(file_path=file, path_type=path_type)
+        # Rescaling (don't normalize on nodata)
+        img = np.moveaxis(img, 0, -1)  # channels last for rescaling
+        if indices is not None:
+            img = img[..., indices]
+        if mean is not None and std is not None:
+            img = np.where(img == NO_DATA, NO_DATA_FLOAT, (img - mean) / std)
+
+        imgs.append(img)
+        metas.append(meta)
+        if coords is not None:
+            location_coords.append(coords)
+
+        try:
+            match = re.search(r"(\d{7,8}T\d{6})", file)
+            if match:
+                year = int(match.group(1)[:4])
+                julian_day = match.group(1).split("T")[0][4:]
+                if len(julian_day) == 3:
+                    julian_day = int(julian_day)
+                else:
+                    julian_day = (datetime.datetime.strptime(
+                        julian_day, "%m%d").timetuple().tm_yday)
+                temporal_coords.append([year, julian_day])
+        except Exception:
+            logger.exception("Could not extract timestamp for %s", file)
+
+    imgs = np.stack(imgs, axis=0)  # num_frames, H, W, C
+    imgs = np.moveaxis(imgs, -1, 0).astype("float32")  # C, num_frames, H, W
+    imgs = np.expand_dims(imgs, axis=0)  # add batch di
+
+    return imgs, temporal_coords, location_coords, metas
+
+
+class PrithviMultimodalDataProcessor(IOProcessor):
+
+    def __init__(self, vllm_config: VllmConfig):
+
+        super().__init__(vllm_config)
+
+        self.datamodule = Sen1Floods11NonGeoDataModule(
+            data_root=datamodule_config["data_root"],
+            batch_size=datamodule_config["batch_size"],
+            num_workers=datamodule_config["num_workers"],
+            bands=datamodule_config["bands"],
+            drop_last=datamodule_config["drop_last"],
+            test_transform=datamodule_config["test_transform"],
+        )
+        self.img_size = 512
+        self.h1 = 1
+        self.w1 = 1
+        self.original_h = 512
+        self.original_w = 512
+        self.batch_size = 1
+        self.meta_data = None
+        self.requests_cache: dict[str, dict[str, Any]] = {}
+        self.indices = DEFAULT_INPUT_INDICES
+
+    def parse_request(self, request: Any) -> IOProcessorInput:
+        if type(request) is dict:
+            image_prompt = ImagePrompt(**request)
+            return image_prompt
+        if isinstance(request, IOProcessorRequest):
+            if not hasattr(request, "data"):
+                raise ValueError(
+                    "missing 'data' field in OpenAIBaseModel Request")
+
+            request_data = request.data
+
+            if type(request_data) is dict:
+                return ImagePrompt(**request_data)
+            else:
+                raise ValueError("Unable to parse the request data")
+
+        raise ValueError("Unable to parse request")
+
+    def output_to_response(
+            self, plugin_output: IOProcessorOutput) -> IOProcessorResponse:
+        return IOProcessorResponse(
+            request_id=plugin_output.request_id,
+            data=plugin_output,
+        )
+
+    def pre_process(
+        self,
+        prompt: IOProcessorInput,
+        request_id: Optional[str] = None,
+        **kwargs,
+    ) -> Union[PromptType, Sequence[PromptType]]:
+
+        image_data = dict(prompt)
+
+        if request_id:
+            self.requests_cache[request_id] = {
+                "out_format": image_data["out_data_format"],
+            }
+
+        input_data, temporal_coords, location_coords, meta_data = load_image(
+            data=[image_data["data"]],
+            indices=self.indices,
+            path_type=image_data["data_format"],
+        )
+
+        self.meta_data = meta_data[0]
+
+        if input_data.mean() > 1:
+            input_data = input_data / 10000  # Convert to range 0-1
+
+        self.original_h, self.original_w = input_data.shape[-2:]
+        pad_h = (self.img_size -
+                 (self.original_h % self.img_size)) % self.img_size
+        pad_w = (self.img_size -
+                 (self.original_w % self.img_size)) % self.img_size
+        input_data = np.pad(
+            input_data,
+            ((0, 0), (0, 0), (0, 0), (0, pad_h), (0, pad_w)),
+            mode="reflect",
+        )
+
+        batch = torch.tensor(input_data)
+        windows = batch.unfold(3, self.img_size,
+                               self.img_size).unfold(4, self.img_size,
+                                                     self.img_size)
+        self.h1, self.w1 = windows.shape[3:5]
+        windows = rearrange(
+            windows,
+            "b c t h1 w1 h w -> (b h1 w1) c t h w",
+            h=self.img_size,
+            w=self.img_size,
+        )
+
+        # Split into batches if number of windows > batch_size
+        num_batches = (windows.shape[0] // self.batch_size
+                       if windows.shape[0] > self.batch_size else 1)
+        windows = torch.tensor_split(windows, num_batches, dim=0)
+
+        if temporal_coords:
+            temporal_coords = torch.tensor(temporal_coords).unsqueeze(0)
+        else:
+            temporal_coords = None
+        if location_coords:
+            location_coords = torch.tensor(location_coords[0]).unsqueeze(0)
+        else:
+            location_coords = None
+
+        prompts = []
+        for window in windows:
+            # Apply standardization
+            window = self.datamodule.test_transform(
+                image=window.squeeze().numpy().transpose(1, 2, 0))
+            window = self.datamodule.aug(window)["image"]
+            prompts.append({
+                "prompt_token_ids": [1],
+                "multi_modal_data": {
+                    "pixel_values": window.to(torch.float16)[0],
+                    "location_coords": location_coords.to(torch.float16),
+                },
+            })
+
+        return prompts
+
+    def post_process(
+        self,
+        model_output: Sequence[PoolingRequestOutput],
+        request_id: Optional[str] = None,
+        **kwargs,
+    ) -> IOProcessorOutput:
+
+        pred_imgs_list = []
+
+        if request_id and (request_id in self.requests_cache):
+            out_format = self.requests_cache[request_id]["out_format"]
+        else:
+            out_format = "b64_json"
+
+        for output in model_output:
+            y_hat = output.outputs.data.argmax(dim=1)
+            pred = torch.nn.functional.interpolate(
+                y_hat.unsqueeze(1).float(),
+                size=self.img_size,
+                mode="nearest",
+            )
+            pred_imgs_list.append(pred)
+
+        pred_imgs: torch.Tensor = torch.concat(pred_imgs_list, dim=0)
+
+        # Build images from patches
+        pred_imgs = rearrange(
+            pred_imgs,
+            "(b h1 w1) c h w -> b c (h1 h) (w1 w)",
+            h=self.img_size,
+            w=self.img_size,
+            b=1,
+            c=1,
+            h1=self.h1,
+            w1=self.w1,
+        )
+
+        # Cut padded area back to original size
+        pred_imgs = pred_imgs[..., :self.original_h, :self.original_w]
+
+        # Squeeze (batch size 1)
+        pred_imgs = pred_imgs[0]
+
+        if not self.meta_data:
+            raise ValueError("No metadata available for the current task")
+        self.meta_data.update(count=1, dtype="uint8", compress="lzw", nodata=0)
+        out_data = save_geotiff(_convert_np_uint8(pred_imgs), self.meta_data,
+                                out_format)
+
+        return ImageRequestOutput(type=out_format,
+                                  format="tiff",
+                                  data=out_data,
+                                  request_id=request_id)
+
+
+class PrithviMultimodalDataProcessorIndia(PrithviMultimodalDataProcessor):
+
+    def __init__(self, vllm_config: VllmConfig):
+
+        super().__init__(vllm_config)
+
+        self.indices = [1, 2, 3, 8, 11, 12]
+
+
+class PrithviMultimodalDataProcessorValencia(PrithviMultimodalDataProcessor):
+
+    def __init__(self, vllm_config: VllmConfig):
+
+        super().__init__(vllm_config)
+
+        self.indices = [0, 1, 2, 3, 4, 5]
--- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py
+++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py
@ -0,0 +1,59 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Any, Literal, Optional, TypedDict, Union
+
+import albumentations
+from pydantic import BaseModel
+
+
+class DataModuleConfig(TypedDict):
+    bands: list[str]
+    batch_size: int
+    constant_scale: float
+    data_root: str
+    drop_last: bool
+    no_data_replace: float
+    no_label_replace: int
+    num_workers: int
+    test_transform: list[
+        albumentations.core.transforms_interface.BasicTransform]
+
+
+class ImagePrompt(BaseModel):
+
+    data_format: Literal["b64_json", "bytes", "url"]
+    """
+    This is the data type for the input image
+    """
+
+    image_format: str
+    """
+    This is the image format (e.g., jpeg, png, etc.)
+    """
+
+    out_data_format: Literal["b64_json", "url"]
+
+    data: Any
+    """
+    Input image data
+    """
+
+
+MultiModalPromptType = Union[ImagePrompt]
+
+
+class ImageRequestOutput(BaseModel):
+    """
+    The output data of an image request to vLLM. 
+
+    Args:
+        type (str): The data content type [path, object]
+        format (str): The image format (e.g., jpeg, png, etc.)
+        data (Any): The resulting data.
+    """
+
+    type: Literal["path", "b64_json"]
+    format: str
+    data: str
+    request_id: Optional[str] = None
--- a/tests/plugins/prithvi_io_processor_plugin/setup.py
+++ b/tests/plugins/prithvi_io_processor_plugin/setup.py
@ -0,0 +1,16 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from setuptools import setup
+
+setup(
+    name="prithvi_io_processor_plugin",
+    version="0.1",
+    packages=["prithvi_io_processor"],
+    entry_points={
+        "vllm.io_processor_plugins": [
+            "prithvi_to_tiff_india = prithvi_io_processor:register_prithvi_india",  # noqa: E501
+            "prithvi_to_tiff_valencia = prithvi_io_processor:register_prithvi_valencia",  # noqa: E501
+        ]
+    },
+)
--- a/tests/plugins_tests/conftest.py
+++ b/tests/plugins_tests/conftest.py
@ -1,12 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-
-
-@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    """
-    Since this module is V0 only, set VLLM_USE_V1=0 for
-    all tests in the module.
-    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
--- a/tests/plugins_tests/test_io_processor_plugins.py
+++ b/tests/plugins_tests/test_io_processor_plugins.py
@ -0,0 +1,138 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import base64
+
+import pytest
+import requests
+
+from tests.utils import RemoteOpenAIServer
+from vllm.config import VllmConfig
+from vllm.entrypoints.llm import LLM
+from vllm.entrypoints.openai.protocol import IOProcessorResponse
+from vllm.plugins.io_processors import get_io_processor
+from vllm.pooling_params import PoolingParams
+
+MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"
+
+image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/valencia_example_2024-10-26.tiff"  # noqa: E501
+
+
+def test_loading_missing_plugin():
+    vllm_config = VllmConfig()
+    with pytest.raises(ValueError):
+        get_io_processor(vllm_config, "wrong_plugin")
+
+
+def test_loading_engine_with_wrong_plugin():
+
+    with pytest.raises(ValueError):
+        LLM(
+            model=MODEL_NAME,
+            skip_tokenizer_init=True,
+            trust_remote_code=True,
+            enforce_eager=True,
+            # Limit the maximum number of parallel requests
+            # to avoid the model going OOM in CI.
+            max_num_seqs=32,
+            io_processor_plugin="wrong_plugin",
+        )
+
+
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
+
+    img_prompt = dict(
+        data=image_url,
+        data_format="url",
+        image_format="tiff",
+        out_data_format="b64_json",
+    )
+
+    pooling_params = PoolingParams(task="encode", softmax=False)
+
+    with vllm_runner(
+            model_name,
+            runner="pooling",
+            skip_tokenizer_init=True,
+            trust_remote_code=True,
+            enforce_eager=True,
+            # Limit the maximum number of parallel requests
+            # to avoid the model going OOM in CI.
+            max_num_seqs=1,
+            io_processor_plugin="prithvi_to_tiff_valencia",
+    ) as llm_runner:
+        pooler_output = llm_runner.get_llm().encode(
+            img_prompt,
+            pooling_params=pooling_params,
+        )
+    output = pooler_output[0].outputs
+
+    # verify the output is formatted as expected for this plugin
+    assert all(
+        hasattr(output, attr)
+        for attr in ["type", "format", "data", "request_id"])
+
+    # We just check that the output is a valid base64 string.
+    # Raises an exception and fails the test if the string is corrupted.
+    base64.b64decode(output.data)
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--runner",
+        "pooling",
+        "--enforce-eager",
+        "--trust-remote-code",
+        "--skip-tokenizer-init",
+        # Limit the maximum number of parallel requests
+        # to avoid the model going OOM in CI.
+        "--max-num-seqs",
+        "32",
+        "--io-processor-plugin",
+        "prithvi_to_tiff_valencia"
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_prithvi_mae_plugin_online(
+    server: RemoteOpenAIServer,
+    model_name: str,
+):
+
+    request_payload_url = {
+        "data": {
+            "data": image_url,
+            "data_format": "url",
+            "image_format": "tiff",
+            "out_data_format": "b64_json",
+        },
+        "priority": 0,
+        "model": model_name,
+        "softmax": False
+    }
+
+    ret = requests.post(
+        server.url_for("pooling"),
+        json=request_payload_url,
+    )
+
+    response = ret.json()
+
+    # verify the request response is in the correct format
+    assert (parsed_response := IOProcessorResponse(**response))
+
+    # verify the output is formatted as expected for this plugin
+    plugin_data = parsed_response.data
+
+    assert all(
+        plugin_data.get(attr)
+        for attr in ["type", "format", "data", "request_id"])
+
+    # We just check that the output is a valid base64 string.
+    # Raises an exception and fails the test if the string is corrupted.
+    base64.b64decode(plugin_data["data"])
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@ -7,6 +7,15 @@ import torch
 from vllm.plugins import load_general_plugins


+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    Since this module is V0 only, set VLLM_USE_V1=0 for
+    all tests in the module.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
 def test_platform_plugins():
    # simulate workload by running an example
    import runpy
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@ -64,8 +64,6 @@ def _run_incremental_decode(tokenizer,
    request = EngineCoreRequest("",
                                prompt_token_ids,
                                None,
-                                None,
-                                None,
                                params,
                                None,
                                None,
--- a/tests/tool_use/test_xlam_tool_parser.py
+++ b/tests/tool_use/test_xlam_tool_parser.py
@ -2,12 +2,17 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import json
+from collections.abc import Generator
+from typing import Optional

 import pytest

-from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaMessage, FunctionCall,
+                                              ToolCall)
 from vllm.entrypoints.openai.tool_parsers import xLAMToolParser
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.transformers_utils.detokenizer import detokenize_incrementally
+from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer

 # Use a common model that is likely to be available
 MODEL = "Salesforce/Llama-xLAM-2-8B-fc-r"
@ -36,6 +41,56 @@ def assert_tool_calls(actual_tool_calls: list[ToolCall],
        assert actual_tool_call.function == expected_tool_call.function


+def stream_delta_message_generator(
+    xlam_tool_parser: xLAMToolParser,
+    xlam_tokenizer: AnyTokenizer,
+    model_output: str,
+    request: Optional[ChatCompletionRequest] = None,
+) -> Generator[DeltaMessage, None, None]:
+    all_token_ids = xlam_tokenizer.encode(model_output,
+                                          add_special_tokens=False)
+
+    previous_text = ""
+    previous_tokens = None
+    prefix_offset = 0
+    read_offset = 0
+    for i, delta_token in enumerate(all_token_ids):
+        delta_token_ids = [delta_token]
+        previous_token_ids = all_token_ids[:i]
+        current_token_ids = all_token_ids[:i + 1]
+
+        (new_tokens, delta_text, new_prefix_offset,
+         new_read_offset) = (detokenize_incrementally(
+             tokenizer=xlam_tokenizer,
+             all_input_ids=current_token_ids,
+             prev_tokens=previous_tokens,
+             prefix_offset=prefix_offset,
+             read_offset=read_offset,
+             skip_special_tokens=False,
+             spaces_between_special_tokens=True,
+         ))
+
+        current_text = previous_text + delta_text
+
+        delta_message = xlam_tool_parser.extract_tool_calls_streaming(
+            previous_text,
+            current_text,
+            delta_text,
+            previous_token_ids,
+            current_token_ids,
+            delta_token_ids,
+            request=request,
+        )
+        if delta_message:
+            yield delta_message
+
+        previous_text = current_text
+        previous_tokens = (previous_tokens +
+                           new_tokens if previous_tokens else new_tokens)
+        prefix_offset = new_prefix_offset
+        read_offset = new_read_offset
+
+
 def test_extract_tool_calls_no_tools(xlam_tool_parser):
    model_output = "This is a test"
    extracted_tool_calls = xlam_tool_parser.extract_tool_calls(
@ -51,6 +106,7 @@ def test_extract_tool_calls_no_tools(xlam_tool_parser):
        "single_tool_with_think_tag",
        "single_tool_with_json_code_block",
        "single_tool_with_tool_calls_tag",
+        "single_tool_with_tool_call_xml_tags",
    ],
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
@ -118,6 +174,20 @@ def test_extract_tool_calls_no_tools(xlam_tool_parser):
            ],
            "I'll check the weather for you.",
        ),
+        (
+            """I'll help you check the weather.<tool_call>[{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]</tool_call>""",  # noqa: E501
+            [
+                ToolCall(function=FunctionCall(
+                    name="get_current_weather",
+                    arguments=json.dumps({
+                        "city": "Dallas",
+                        "state": "TX",
+                        "unit": "fahrenheit",
+                    }),
+                ))
+            ],
+            "I'll help you check the weather.",
+        ),
    ],
 )
 def test_extract_tool_calls(xlam_tool_parser, model_output,
@ -245,3 +315,147 @@ def test_streaming_with_list_structure(xlam_tool_parser):
        assert hasattr(result, "tool_calls")
        assert len(result.tool_calls) == 1
        assert result.tool_calls[0].function.name == "get_current_weather"
+
+
+@pytest.mark.parametrize(
+    ids=[
+        "parallel_tool_calls",
+        "single_tool_with_think_tag",
+        "single_tool_with_json_code_block",
+        "single_tool_with_tool_calls_tag",
+        "single_tool_with_tool_call_xml_tags",
+    ],
+    argnames=["model_output", "expected_tool_calls", "expected_content"],
+    argvalues=[
+        (
+            """[{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}, {"name": "get_current_weather", "arguments": {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}}]""",  # noqa: E501
+            [
+                ToolCall(function=FunctionCall(
+                    name="get_current_weather",
+                    arguments=json.dumps({
+                        "city": "Dallas",
+                        "state": "TX",
+                        "unit": "fahrenheit",
+                    }),
+                )),
+                ToolCall(function=FunctionCall(
+                    name="get_current_weather",
+                    arguments=json.dumps({
+                        "city": "Orlando",
+                        "state": "FL",
+                        "unit": "fahrenheit",
+                    }),
+                )),
+            ],
+            "",
+        ),
+        (
+            """<think>I'll help you with that.</think>[{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]""",  # noqa: E501
+            [
+                ToolCall(function=FunctionCall(
+                    name="get_current_weather",
+                    arguments=json.dumps({
+                        "city": "Dallas",
+                        "state": "TX",
+                        "unit": "fahrenheit",
+                    }),
+                ))
+            ],
+            "<think>I'll help you with that.</think>",
+        ),
+        (
+            """```json\n[{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]\n```""",  # noqa: E501
+            [
+                ToolCall(function=FunctionCall(
+                    name="get_current_weather",
+                    arguments=json.dumps({
+                        "city": "Dallas",
+                        "state": "TX",
+                        "unit": "fahrenheit",
+                    }),
+                ))
+            ],
+            "",
+        ),
+        (
+            """[TOOL_CALLS][{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]""",  # noqa: E501
+            [
+                ToolCall(function=FunctionCall(
+                    name="get_current_weather",
+                    arguments=json.dumps({
+                        "city": "Dallas",
+                        "state": "TX",
+                        "unit": "fahrenheit",
+                    }),
+                ))
+            ],
+            "",
+        ),
+        (
+            """I can help with that.<tool_call>[{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]</tool_call>""",  # noqa: E501
+            [
+                ToolCall(function=FunctionCall(
+                    name="get_current_weather",
+                    arguments=json.dumps({
+                        "city": "Dallas",
+                        "state": "TX",
+                        "unit": "fahrenheit",
+                    }),
+                ))
+            ],
+            "I can help with that.",
+        ),
+    ],
+)
+def test_extract_tool_calls_streaming_incremental(
+    xlam_tool_parser,
+    xlam_tokenizer,
+    model_output,
+    expected_tool_calls,
+    expected_content,
+):
+    """Verify the XLAM Parser streaming behavior by verifying each chunk is as expected."""  # noqa: E501
+    request = ChatCompletionRequest(model=MODEL, messages=[], tools=[])
+
+    chunks = []
+    for delta_message in stream_delta_message_generator(
+            xlam_tool_parser, xlam_tokenizer, model_output, request):
+        chunks.append(delta_message)
+
+    # Should have multiple chunks
+    assert len(chunks) >= 3
+
+    # Should have a chunk with tool header (id, name, type) for the first tool call # noqa: E501
+    header_found = False
+    expected_first_tool = expected_tool_calls[0]
+    for chunk in chunks:
+        if chunk.tool_calls and chunk.tool_calls[0].id:
+            header_found = True
+            assert (chunk.tool_calls[0].function.name ==
+                    expected_first_tool.function.name)
+            assert chunk.tool_calls[0].type == "function"
+            # Arguments may be empty initially or None
+            if chunk.tool_calls[0].function.arguments is not None:
+                # If present, should be empty string initially
+                assert chunk.tool_calls[0].function.arguments == ""
+            break
+    assert header_found
+
+    # Should have chunks with incremental arguments
+    arg_chunks = []
+    for chunk in chunks:
+        if (chunk.tool_calls and chunk.tool_calls[0].function.arguments
+                and chunk.tool_calls[0].function.arguments != ""
+                and chunk.tool_calls[0].index ==
+                0  # Only collect arguments from the first tool call
+            ):
+            arg_chunks.append(chunk.tool_calls[0].function.arguments)
+
+    # Arguments should be streamed incrementally
+    assert len(arg_chunks) > 1
+
+    # Concatenated arguments should form valid JSON for the first tool call
+    full_args = "".join(arg_chunks)
+    parsed_args = json.loads(full_args)
+    expected_args = json.loads(expected_first_tool.function.arguments)
+    assert parsed_args == expected_args
--- a/tests/utils_/test_utils.py
+++ b/tests/utils_/test_utils.py
@ -379,9 +379,9 @@ def test_duplicate_dict_args(caplog_vllm, parser):
 def test_supports_kw(callable,kw_name,requires_kw_only,
                     allow_var_kwargs,is_supported):
    assert supports_kw(
-        callable=callable,
-        kw_name=kw_name,
-        requires_kw_only=requires_kw_only,
+            callable=callable,
+            kw_name=kw_name,
+            requires_kw_only=requires_kw_only,
        allow_var_kwargs=allow_var_kwargs
    ) == is_supported

@ -948,6 +948,36 @@ def test_join_host_port():
    assert join_host_port("::1", 5555) == "[::1]:5555"


+def test_json_count_leaves():
+    """Test json_count_leaves function from jsontree utility."""
+    from vllm.utils.jsontree import json_count_leaves
+
+    # Single leaf values
+    assert json_count_leaves(42) == 1
+    assert json_count_leaves("hello") == 1
+    assert json_count_leaves(None) == 1
+
+    # Empty containers
+    assert json_count_leaves([]) == 0
+    assert json_count_leaves({}) == 0
+    assert json_count_leaves(()) == 0
+
+    # Flat structures
+    assert json_count_leaves([1, 2, 3]) == 3
+    assert json_count_leaves({"a": 1, "b": 2}) == 2
+    assert json_count_leaves((1, 2, 3)) == 3
+
+    # Nested structures
+    nested_dict = {"a": 1, "b": {"c": 2, "d": 3}}
+    assert json_count_leaves(nested_dict) == 3
+
+    nested_list = [1, [2, 3], 4]
+    assert json_count_leaves(nested_list) == 4
+
+    mixed_nested = {"list": [1, 2], "dict": {"x": 3}, "value": 4}
+    assert json_count_leaves(mixed_nested) == 4
+
+
 def test_convert_ids_list_to_tokens():
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
    token_ids = tokenizer.encode("Hello, world!")
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@ -7,7 +7,8 @@ import pytest
 import torch

 from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
-from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange
+from vllm.multimodal.inputs import (MultiModalFeatureSpec,
+                                    MultiModalKwargsItem, PlaceholderRange)
 from vllm.sampling_params import SamplingParams
 from vllm.utils import GiB_bytes, sha256, sha256_cbor_64bit
 from vllm.v1.core.kv_cache_manager import KVCacheManager
@ -37,17 +38,20 @@ def make_request(
    mm_hashes: Optional[list[str]] = None,
    cache_salt: Optional[str] = None,
 ):
-    if mm_positions is None:
-        mm_kwargs = None
-    else:
-        mm_item = MultiModalKwargsItem.dummy("dummy_m")
-        mm_kwargs = [mm_item] * len(mm_positions)
+    mm_features = []
+    if mm_positions is not None:
+        for j, position in enumerate(mm_positions):
+            identifier = mm_hashes[j] if mm_hashes else f"hash_{j}"
+            mm_feature = MultiModalFeatureSpec(
+                data=MultiModalKwargsItem.dummy("dummy_m"),
+                mm_position=position,
+                identifier=identifier,
+                modality="image")
+            mm_features.append(mm_feature)

    return Request(request_id=request_id,
                   prompt_token_ids=prompt_token_ids,
-                   multi_modal_kwargs=mm_kwargs,
-                   multi_modal_hashes=mm_hashes,
-                   multi_modal_placeholders=mm_positions,
+                   mm_features=mm_features if mm_features else None,
                   sampling_params=SamplingParams(max_tokens=17),
                   pooling_params=None,
                   eos_token_id=100,
@ -243,7 +247,7 @@ def test_free_kv_cache_block_queue_append_n():

 def test_free_kv_cache_block_queue_popleft_n():
    blocks = [KVCacheBlock(block_id=i) for i in range(6)]
-    # Create a empty FreeKVCacheBlockQueue with these blocks
+    # Create an empty FreeKVCacheBlockQueue with these blocks
    queue = FreeKVCacheBlockQueue(
        [blocks[1], blocks[3], blocks[5], blocks[4], blocks[0], blocks[2]])
    assert queue.num_free_blocks == 6
@ -597,8 +601,14 @@ def test_unify_kv_cache_configs():
    ]

    unify_kv_cache_configs(need_sort_kv_cache_config)
-    assert need_sort_kv_cache_config[0].num_blocks == 10
-    assert need_sort_kv_cache_config[1].num_blocks == 10
+    sorted_kv_cache_groups = [
+        KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
+        KVCacheGroupSpec(["layer2"], new_kv_cache_spec(num_kv_heads=4)),
+    ]
+    assert (
+        need_sort_kv_cache_config[0].kv_cache_groups == sorted_kv_cache_groups)
+    assert (
+        need_sort_kv_cache_config[1].kv_cache_groups == sorted_kv_cache_groups)

    diff_kv_cache_config = [
        KVCacheConfig(
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@ -9,7 +9,8 @@ import pytest
 import torch

 from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved
-from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange
+from vllm.multimodal.inputs import (MultiModalFeatureSpec,
+                                    MultiModalKwargsItem, PlaceholderRange)
 from vllm.sampling_params import SamplingParams
 from vllm.utils import sha256, sha256_cbor_64bit
 from vllm.v1.core.block_pool import BlockPool
@ -32,17 +33,20 @@ def make_request(
    prompt_logprobs: Optional[int] = None,
    cache_salt: Optional[str] = None,
 ):
-    if mm_positions is None:
-        mm_kwargs = None
-    else:
-        mm_item = MultiModalKwargsItem.dummy("dummy_m")
-        mm_kwargs = [mm_item] * len(mm_positions)
+    mm_features = []
+    if mm_positions is not None:
+        for j, position in enumerate(mm_positions):
+            identifier = mm_hashes[j] if mm_hashes else f"hash_{j}"
+            mm_feature = MultiModalFeatureSpec(
+                data=MultiModalKwargsItem.dummy("dummy_m"),
+                mm_position=position,
+                identifier=identifier,
+                modality="image")
+            mm_features.append(mm_feature)

    return Request(request_id=request_id,
                   prompt_token_ids=prompt_token_ids,
-                   multi_modal_kwargs=mm_kwargs,
-                   multi_modal_hashes=mm_hashes,
-                   multi_modal_placeholders=mm_positions,
+                   mm_features=mm_features if mm_features else None,
                   sampling_params=SamplingParams(
                       max_tokens=17, prompt_logprobs=prompt_logprobs),
                   pooling_params=None,
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@ -8,7 +8,8 @@ import torch

 from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
                         SchedulerConfig, SpeculativeConfig, VllmConfig)
-from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange
+from vllm.multimodal.inputs import (MultiModalFeatureSpec,
+                                    MultiModalKwargsItem, PlaceholderRange)
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams
 from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput
 from vllm.v1.core.sched.scheduler import Scheduler
@ -1308,21 +1309,24 @@ def create_requests_with_priority(
                                     prompt_logprobs=prompt_logprobs)
    requests = []
    for i in range(num_requests):
+        mm_features = []
        if mm_positions is not None:
            mm_position = mm_positions[i]
-            mm_item = MultiModalKwargsItem.dummy("dummy_m")
-            mm_kwargs = [mm_item] * len(mm_position)
-        else:
-            mm_position = None
-            mm_kwargs = None
+            for j, position in enumerate(mm_position):
+                identifier = f"hash{i}_{j}"
+                mm_feature = MultiModalFeatureSpec(
+                    data=MultiModalKwargsItem.dummy("dummy_m"),
+                    mm_position=position,
+                    identifier=identifier,
+                    modality="image")
+                mm_features.append(mm_feature)
+
        request = Request(
            request_id=f"{i + starting_idx}",
            prompt_token_ids=[i + starting_idx] * num_tokens,
            sampling_params=sampling_params,
            pooling_params=None,
-            multi_modal_kwargs=mm_kwargs,
-            multi_modal_placeholders=mm_position,
-            multi_modal_hashes=None,
+            mm_features=mm_features if mm_features else None,
            eos_token_id=EOS_TOKEN_ID,
            arrival_time=arrival_times[i],
            priority=priorities[i],
@ -1801,9 +1805,7 @@ def test_schedule_skip_tokenizer_init_structured_output_request():
    request = Request(
        request_id="0",
        prompt_token_ids=[0, 1],
-        multi_modal_kwargs=None,
-        multi_modal_hashes=None,
-        multi_modal_placeholders=None,
+        mm_features=None,
        sampling_params=sampling_params,
        pooling_params=None,
        eos_token_id=EOS_TOKEN_ID,
--- a/tests/v1/core/utils.py
+++ b/tests/v1/core/utils.py
@ -6,7 +6,8 @@ import torch

 from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
                         SchedulerConfig, SpeculativeConfig, VllmConfig)
-from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange
+from vllm.multimodal.inputs import (MultiModalFeatureSpec,
+                                    MultiModalKwargsItem, PlaceholderRange)
 from vllm.sampling_params import SamplingParams
 from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
                                         init_none_hash)
@ -139,19 +140,20 @@ def create_requests(
                                     prompt_logprobs=prompt_logprobs)
    requests = []
    for i in range(num_requests):
+        mm_features = []
        if mm_positions is not None:
            mm_position = mm_positions[i]
-            mm_item = MultiModalKwargsItem.dummy("dummy_m")
-            mm_kwargs = [mm_item] * len(mm_position)
-            # Dummy hash for each mm item should be unique
-            # since encoder cache tracks entries by hash
-            mm_hashes = [
-                "hash" + str(i) + "_" + str(j) for j in range(len(mm_position))
-            ]
-        else:
-            mm_position = None
-            mm_kwargs = None
-            mm_hashes = None
+            for j, position in enumerate(mm_position):
+                # Dummy hash for each mm item should be unique
+                # since encoder cache tracks entries by hash
+                identifier = f"hash{i}_{j}"
+                mm_feature = MultiModalFeatureSpec(
+                    data=MultiModalKwargsItem.dummy("dummy_m"),
+                    mm_position=position,
+                    identifier=identifier,
+                    modality="image")
+                mm_features.append(mm_feature)
+
        prompt_token_ids = ([0] * num_tokens if same_prompt else [i] *
                            num_tokens)
        request = Request(
@ -159,9 +161,7 @@ def create_requests(
            prompt_token_ids=prompt_token_ids,
            sampling_params=sampling_params,
            pooling_params=None,
-            multi_modal_kwargs=mm_kwargs,
-            multi_modal_placeholders=mm_position,
-            multi_modal_hashes=mm_hashes,
+            mm_features=mm_features if mm_features else None,
            eos_token_id=EOS_TOKEN_ID,
            block_hasher=block_hasher,
        )
--- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py
+++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py
@ -64,6 +64,7 @@ def cleanup(llm: LLM, compilation_config: CompilationConfig):

@fork_new_process_for_each_test
@pytest.mark.parametrize("enforce_eager", [True])
+@pytest.mark.skip(reason="Disable until Gemma3n supports fast prefill")
 def test_kv_sharing_fast_prefill(
    monkeypatch: pytest.MonkeyPatch,
    enforce_eager: bool,
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@ -35,9 +35,7 @@ def make_request() -> EngineCoreRequest:
    return EngineCoreRequest(
        request_id=str(uuid.uuid4()),
        prompt_token_ids=PROMPT_TOKENS,
-        mm_kwargs=None,
-        mm_hashes=None,
-        mm_placeholders=None,
+        mm_features=None,
        sampling_params=SamplingParams(),
        pooling_params=None,
        eos_token_id=None,
@ -308,17 +306,17 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):

        # Schedule Batch 1: (10, req0)
        assert engine_core.step_with_batch_queue()[0] is None
-        assert engine_core.batch_queue.qsize() == 1
-        scheduler_output = engine_core.batch_queue.queue[-1][1]
+        assert len(engine_core.batch_queue) == 1
+        scheduler_output = engine_core.batch_queue[-1][1]
        assert scheduler_output.num_scheduled_tokens["0"] == 10
        # num_computed_tokens should have been updated immediately.
        assert engine_core.scheduler.requests[
            req0.request_id].num_computed_tokens == 10

        # Schedule Batch 2: (2, req0), (8, req1)
-        assert engine_core.step_with_batch_queue()[0] is None
-        assert engine_core.batch_queue.qsize() == 2
-        scheduler_output = engine_core.batch_queue.queue[-1][1]
+        assert engine_core.step_with_batch_queue()[0] == {}
+        assert len(engine_core.batch_queue) == 1
+        scheduler_output = engine_core.batch_queue[-1][1]
        assert scheduler_output.num_scheduled_tokens["0"] == 2
        assert scheduler_output.num_scheduled_tokens["1"] == 8
        # num_computed_tokens should have been updated immediately.
@ -327,42 +325,32 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):

        assert engine_core.scheduler.get_num_unfinished_requests() == 2

-        # Batch queue is full. Finish Batch 1.
-        engine_core.step_with_batch_queue()
-
-        # Schedule Batch 3: (4, req1). Note that req0 cannot be scheduled
+        # Finish Batch 1 and schedule Batch 3: (4, req1).
+        # Note that req0 cannot be scheduled
        # because it is in the decoding stage now.
        engine_core.step_with_batch_queue()
-        assert engine_core.batch_queue.qsize() == 2
-        scheduler_output = engine_core.batch_queue.queue[-1][1]
+        assert len(engine_core.batch_queue) == 1
+        scheduler_output = engine_core.batch_queue[-1][1]
        assert scheduler_output.num_scheduled_tokens["1"] == 4

-        # Batch queue is full. Finish Batch 2. Get first token of req0.
+        # Finish Batch 2. Get first token of req0.
+        # Schedule Batch 4: (1, req0).
        output = engine_core.step_with_batch_queue()[0].get(0)
        assert output is not None
        assert len(output.outputs) == 1
        assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13
-
-        # Schedule Batch 4: (1, req0).
-        engine_core.step_with_batch_queue()
-        assert engine_core.batch_queue.qsize() == 2
-        scheduler_output = engine_core.batch_queue.queue[-1][1]
+        scheduler_output = engine_core.batch_queue[-1][1]
        assert scheduler_output.num_scheduled_tokens["0"] == 1

-        # Batch queue is full. Finish Batch 3. Get first token of req1.
+        # Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1).
        output = engine_core.step_with_batch_queue()[0].get(0)
        assert output is not None
        assert len(output.outputs) == 1
        assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
-
-        # Schedule Batch 5: (1, req1).
-        engine_core.step_with_batch_queue()
-        assert engine_core.batch_queue.qsize() == 2
-        scheduler_output = engine_core.batch_queue.queue[-1][1]
+        scheduler_output = engine_core.batch_queue[-1][1]
        assert scheduler_output.num_scheduled_tokens["1"] == 1

        # Loop until req0 is finished.
-        step = 0
        req_id = 0
        expected_num_tokens = [
            engine_core.scheduler.requests["0"].num_tokens + 1,
@ -370,19 +358,14 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
        ]
        while engine_core.scheduler.get_num_unfinished_requests() == 2:
            output = engine_core.step_with_batch_queue()[0]
-            if step % 2 == 0:
-                # Even steps consumes an output.
-                assert output is not None
-                assert len(output[0].outputs) == 1
-                if req_id in engine_core.scheduler.requests:
-                    assert engine_core.scheduler.requests[
-                        req_id].num_tokens == expected_num_tokens[req_id]
-                expected_num_tokens[req_id] += 1
-                req_id = (req_id + 1) % 2
-            else:
-                # Odd steps schedules a new batch.
-                assert output is None
-            step += 1
+            # Every step consumes an output.
+            assert output is not None
+            assert len(output[0].outputs) == 1
+            if req_id in engine_core.scheduler.requests:
+                assert engine_core.scheduler.requests[
+                    req_id].num_tokens == expected_num_tokens[req_id]
+            expected_num_tokens[req_id] += 1
+            req_id = (req_id + 1) % 2


@multi_gpu_test(num_gpus=2)
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@ -52,9 +52,7 @@ def make_request(
    return EngineCoreRequest(
        request_id=str(uuid.uuid4()),
        prompt_token_ids=prompt_tokens_ids,
-        mm_kwargs=None,
-        mm_hashes=None,
-        mm_placeholders=None,
+        mm_features=None,
        sampling_params=params,
        pooling_params=None,
        eos_token_id=None,
--- a/tests/v1/engine/test_fast_incdec_prefix_err.py
+++ b/tests/v1/engine/test_fast_incdec_prefix_err.py
@ -26,16 +26,14 @@ def test_fast_inc_detok_invalid_utf8_err_case():
    prompt_token_ids = [107, 4606, 236787, 107]
    params = SamplingParams(skip_special_tokens=True)
    request = EngineCoreRequest(
-        "test",
-        prompt_token_ids,
-        None,
-        None,
-        None,
-        params,
-        None,
-        None,
-        0.0,
-        None,
+        request_id="test",
+        prompt_token_ids=prompt_token_ids,
+        mm_features=None,
+        sampling_params=params,
+        pooling_params=None,
+        eos_token_id=None,
+        arrival_time=0.0,
+        lora_request=None,
        cache_salt=None,
        data_parallel_rank=None,
    )
--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@ -52,11 +52,9 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind,
    requests = [
        EngineCoreRequest(request_id=f"request-{idx}",
                          prompt_token_ids=prompt_tokens,
-                          arrival_time=0,
-                          mm_kwargs=None,
-                          mm_hashes=None,
-                          mm_placeholders=None,
+                          mm_features=None,
                          eos_token_id=None,
+                          arrival_time=0,
                          lora_request=None,
                          cache_salt=None,
                          data_parallel_rank=None,
@ -401,11 +399,9 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind,
    requests = [
        EngineCoreRequest(request_id=request_id_list[idx],
                          prompt_token_ids=prompt_tokens,
-                          arrival_time=0,
-                          mm_kwargs=None,
-                          mm_hashes=None,
-                          mm_placeholders=None,
+                          mm_features=None,
                          eos_token_id=None,
+                          arrival_time=0,
                          lora_request=None,
                          cache_salt=None,
                          data_parallel_rank=None,
@ -566,11 +562,9 @@ def test_stop_token(include_stop_str_in_output: bool,
    request = EngineCoreRequest(
        request_id=request_id,
        prompt_token_ids=prompt_tokens,
-        arrival_time=0,
-        mm_kwargs=None,
-        mm_hashes=None,
-        mm_placeholders=None,
+        mm_features=None,
        eos_token_id=eos_token_id,
+        arrival_time=0,
        lora_request=None,
        cache_salt=None,
        data_parallel_rank=None,
@ -665,11 +659,9 @@ def test_stop_string(include_stop_str_in_output: bool,
        EngineCoreRequest(
            request_id=request_id_list[idx],
            prompt_token_ids=prompt_tokens,
-            arrival_time=0,
-            mm_kwargs=None,
-            mm_hashes=None,
-            mm_placeholders=None,
+            mm_features=None,
            eos_token_id=None,
+            arrival_time=0,
            lora_request=None,
            cache_salt=None,
            data_parallel_rank=None,
@ -781,11 +773,9 @@ def test_iteration_stats(dummy_test_vectors):
        EngineCoreRequest(
            request_id=f"request-{idx}",
            prompt_token_ids=prompt_tokens,
-            arrival_time=0,
-            mm_kwargs=None,
-            mm_hashes=None,
-            mm_placeholders=None,
+            mm_features=None,
            eos_token_id=None,
+            arrival_time=0,
            lora_request=None,
            cache_salt=None,
            data_parallel_rank=None,
--- a/tests/v1/engine/test_processor_multi_modal_uuids.py
+++ b/tests/v1/engine/test_processor_multi_modal_uuids.py
@ -0,0 +1,229 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset
+from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
+from vllm.platforms.interface import UnspecifiedPlatform
+from vllm.sampling_params import SamplingParams
+from vllm.v1.engine import processor as processor_mod
+from vllm.v1.engine.processor import Processor
+
+cherry_pil_image = ImageAsset("cherry_blossom").pil_image
+stop_pil_image = ImageAsset("stop_sign").pil_image
+baby_reading_np_ndarrays = VideoAsset("baby_reading").np_ndarrays
+
+
+# Mock processor for testing
+def _mk_processor(monkeypatch,
+                  *,
+                  mm_cache_gb: float = 4.0,
+                  enable_prefix_caching: bool = True) -> Processor:
+    """
+    Create a Processor instance with minimal configuration suitable for unit
+    tests without accessing external resources.
+    """
+    monkeypatch.setattr(ModelConfig,
+                        "try_get_generation_config",
+                        lambda self: {},
+                        raising=True)
+    monkeypatch.setattr(ModelConfig,
+                        "__post_init__",
+                        lambda self: None,
+                        raising=True)
+    monkeypatch.setattr(UnspecifiedPlatform,
+                        "is_async_output_supported",
+                        classmethod(lambda cls, enforce_eager: True),
+                        raising=True)
+    monkeypatch.setattr(
+        ModelConfig,
+        "verify_async_output_proc",
+        lambda self, parallel_config, speculative_config, device_config: None,
+        raising=True)
+    monkeypatch.setattr(ModelConfig,
+                        "verify_with_parallel_config",
+                        lambda self, parallel_config: None,
+                        raising=True)
+    monkeypatch.setattr(processor_mod,
+                        "processor_cache_from_config",
+                        lambda vllm_config, mm_registry: None,
+                        raising=True)
+
+    monkeypatch.setattr(VllmConfig,
+                        "__post_init__",
+                        lambda self: None,
+                        raising=True)
+
+    model_config = ModelConfig(
+        skip_tokenizer_init=True,
+        max_model_len=128,
+        mm_processor_cache_gb=mm_cache_gb,
+        generation_config="vllm",
+        tokenizer="dummy",
+    )
+
+    # Minimal multimodal_config to satisfy references in
+    # Processor.process_inputs.
+    class _MockMMConfig:
+
+        def __init__(self, gb: float):
+            self.mm_processor_cache_gb = gb
+
+    model_config.multimodal_config = _MockMMConfig(
+        mm_cache_gb)  # type: ignore[attr-defined]
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching),
+        device_config=DeviceConfig(device="cpu"),
+    )
+
+    # Pass tokenizer=None; InputPreprocessor handles None when
+    # skip_tokenizer_init is True.
+    return Processor(vllm_config, tokenizer=None)  # type: ignore[arg-type]
+
+
+def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
+    processor = _mk_processor(monkeypatch)
+
+    prompt = {
+        "prompt": "USER: <image>\nDescribe\nASSISTANT:",
+        "multi_modal_data": {
+            "image": [cherry_pil_image, stop_pil_image]
+        },
+        # Mismatch: 2 items but only 1 uuid provided
+        "multi_modal_uuids": {
+            "image": ["hash_cherry"]
+        },
+    }
+
+    with pytest.raises(ValueError, match="must have same length as data"):
+        processor.process_inputs(
+            request_id="req-1",
+            prompt=prompt,  # type: ignore[arg-type]
+            params=SamplingParams(),
+        )
+
+
+def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
+    processor = _mk_processor(monkeypatch)
+
+    prompt = {
+        "prompt": "USER: <image><video>\nDescribe\nASSISTANT:",
+        # Two modalities provided in data
+        "multi_modal_data": {
+            "image": [cherry_pil_image],
+            "video": [baby_reading_np_ndarrays]
+        },
+        # Only image uuids provided; video missing should raise
+        "multi_modal_uuids": {
+            "image": ["hash_cherry"]
+        },
+    }
+
+    with pytest.raises(ValueError,
+                       match="must be provided if multi_modal_data"):
+        processor.process_inputs(
+            request_id="req-2",
+            prompt=prompt,  # type: ignore[arg-type]
+            params=SamplingParams(),
+        )
+
+
+@pytest.mark.parametrize(
+    "mm_cache_gb, enable_prefix_caching",
+    [
+        (4.0, True),  # default behavior
+        (4.0, False),  # prefix caching disabled
+        (0.0, True),  # processor cache disabled
+    ],
+)
+def test_multi_modal_uuids_accepts_none_and_passes_through(
+        monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool):
+    processor = _mk_processor(monkeypatch,
+                              mm_cache_gb=mm_cache_gb,
+                              enable_prefix_caching=enable_prefix_caching)
+
+    # Capture the overrides passed to InputPreprocessor.preprocess
+    captured: dict[str, object] = {}
+
+    def fake_preprocess(prompt,
+                        *,
+                        tokenization_kwargs=None,
+                        lora_request=None,
+                        mm_hash_overrides=None):
+        captured["mm_hash_overrides"] = mm_hash_overrides
+        # Minimal processed inputs for decoder-only flow
+        return {"type": "token", "prompt_token_ids": [1]}
+
+    # Monkeypatch only the bound preprocess method on this instance
+    monkeypatch.setattr(processor.input_preprocessor,
+                        "preprocess",
+                        fake_preprocess,
+                        raising=True)
+
+    # Use a consistent two-image scenario across all configurations
+    mm_uuids = {"image": [None, "hash_stop"], "video": None}
+    prompt = {
+        "prompt": "USER: <image><image>\nTwo images\nASSISTANT:",
+        "multi_modal_data": {
+            "image": [cherry_pil_image, stop_pil_image],
+            "video": baby_reading_np_ndarrays,
+        },
+        "multi_modal_uuids": mm_uuids,
+    }
+
+    processor.process_inputs(
+        request_id="req-3",
+        prompt=prompt,  # type: ignore[arg-type]
+        params=SamplingParams(),
+    )
+
+    assert captured["mm_hash_overrides"] == mm_uuids
+
+
+def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
+    # When both processor cache is 0 and prefix caching disabled, the
+    # processor builds overrides from request id instead of using user UUIDs.
+    processor = _mk_processor(monkeypatch,
+                              mm_cache_gb=0.0,
+                              enable_prefix_caching=False)
+
+    captured: dict[str, object] = {}
+
+    def fake_preprocess(prompt,
+                        *,
+                        tokenization_kwargs=None,
+                        lora_request=None,
+                        mm_hash_overrides=None):
+        captured["mm_hash_overrides"] = mm_hash_overrides
+        return {"type": "token", "prompt_token_ids": [1]}
+
+    monkeypatch.setattr(processor.input_preprocessor,
+                        "preprocess",
+                        fake_preprocess,
+                        raising=True)
+
+    request_id = "req-42"
+    mm_uuids = {"image": ["hash_cherry", "hash_stop"], "video": "hash_video"}
+    prompt = {
+        "prompt": "USER: <image><image><video>\nDescribe\nASSISTANT:",
+        "multi_modal_data": {
+            "image": [cherry_pil_image, stop_pil_image],
+            "video": baby_reading_np_ndarrays,
+        },
+        "multi_modal_uuids": mm_uuids,
+    }
+
+    processor.process_inputs(
+        request_id=request_id,
+        prompt=prompt,  # type: ignore[arg-type]
+        params=SamplingParams(),
+    )
+
+    # Expect request-id-based overrides are passed through
+    assert captured["mm_hash_overrides"] == {
+        "image": [f"{request_id}-image-0", f"{request_id}-image-1"],
+        "video": [f"{request_id}-video-0"],
+    }
--- a/tests/v1/executor/test_executor.py
+++ b/tests/v1/executor/test_executor.py
@ -27,7 +27,7 @@ class CustomMultiprocExecutor(MultiprocExecutor):
                       kwargs: Optional[dict] = None,
                       non_block: bool = False,
                       unique_reply_rank: Optional[int] = None) -> list[Any]:
-        # Drop marker to show that this was ran
+        # Drop marker to show that this was run
        with open(".marker", "w"):
            ...
        return super().collective_rpc(method, timeout, args, kwargs)
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Kevin H. Luu	6e71a2a6c8	Update Dockerfile Signed-off-by: Kevin H. Luu <kevin@anyscale.com>	2025-09-02 13:22:11 -07:00
Thomas Parnell	d328f7894f	[CI] Enable all hf transformers baselines in test_hybrid (#23936 ) Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>	2025-09-02 20:15:06 +00:00
Wentao Ye	98aee612aa	[Log] Only Print Profiler Results on Rank 0 (#23370 ) Signed-off-by: yewentao256 <zhyanwentao@126.com>	2025-09-02 18:53:34 +00:00
nathan	598bd74cf8	Fix weights loading for Apertus (#24100 ) Signed-off-by: Nathan Ranchin <nranchin@student.ethz.ch>	2025-09-02 18:34:28 +00:00
Mark McLoughlin	2417798471	[Metrics] Deprecate TPOT in favor of ITL (#24110 ) Signed-off-by: Mark McLoughlin <markmc@redhat.com>	2025-09-02 18:10:10 +00:00
Kyuyeun Kim	9480ae24e3	[Bugfix] Fix packed_factor missing attribute error (#23902 ) Signed-off-by: Kyuyeun Kim <kyuyeunk@google.com>	2025-09-02 10:56:31 -07:00
Chenheli Hua	f399182e8c	Run ruff format on a few files. (#24075 ) Signed-off-by: Chenheli Hua <huachenheli@outlook.com>	2025-09-02 17:55:32 +00:00
Kyle Sayers	1c41310584	[Bugfix] Fix transform_config parsing in Compressed Tensors (#23945 ) Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>	2025-09-02 13:54:10 -04:00
Jiangyun Zhu	c83c4ff815	[Benchmark] Add support for local hf dataset path in benchmark (#23999 ) Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>	2025-09-02 17:49:16 +00:00
Peter Pan	0e1759cd54	[docs] add SYS_NICE cap & `security-opt` for docker/k8s (#24017 ) Signed-off-by: Peter Pan <Peter.Pan@daocloud.io> Signed-off-by: Peter Pan <peter.pan@daocloud.io> Co-authored-by: Li, Jiang <bigpyj64@gmail.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>	2025-09-02 17:27:20 +00:00
Michael Goin	e66ed3e675	[CI Failure] Skip failing nvfp4 silu test (#23959 ) Signed-off-by: mgoin <mgoin64@gmail.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>	2025-09-02 13:18:15 -04:00
wang.yuqi	e0653f6c0b	[Model] Classification models support logit_bias / sigmoid_normalize (#24031 ) Signed-off-by: wang.yuqi <noooop@126.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>	2025-09-02 16:48:57 +00:00
Kyungmin Lee	38ba061f6f	[BugFix] Fix EXAONE4 rotary embeddings (#23918 ) Signed-off-by: lkm2835 <lkm2835@gmail.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>	2025-09-02 14:40:55 +00:00
Nicolò Lucchesi	0a74e9d0f2	[Gemma3n] Fix audio batching (#24052 ) Signed-off-by: NickLucche <nlucches@redhat.com>	2025-09-02 22:23:35 +08:00
Christian Berge	8bd5844989	correct LWS deployment yaml (#23104 ) Signed-off-by: cberge908 <42270330+cberge908@users.noreply.github.com>	2025-09-02 12:04:59 +00:00
Aziz	ce30dca5c4	[CI]: reduce HTTP calls inside entrypoints openai tests (#23646 ) Signed-off-by: AzizCode92 <azizbenothman76@gmail.com> Signed-off-by: Aziz <azizbenothman76@gmail.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>	2025-09-02 10:49:32 +00:00
WeiQing Chen	2f0bab3f26	[Model] Support dp on ViT on GLM-4.5V (#23168 ) Signed-off-by: David Chen <530634352@qq.com>	2025-09-02 10:48:18 +00:00
Didier Durand	fad73be1a5	[Doc]: fix typos in Python comments (#24077 ) Signed-off-by: Didier Durand <durand.didier@gmail.com>	2025-09-02 02:38:55 -07:00
Benji Beck	56d04089ef	Migrate Interns1 inputs to TensorSchema (#23510 ) Signed-off-by: Benji Beck <benjibeck@meta.com>	2025-09-02 04:35:45 +00:00
Yan Ma	7be0cb8e9e	[XPU][Feature] fp8 online quantization support for XPU (#23148 ) Signed-off-by: Yan Ma <yan.ma@intel.com> Co-authored-by: Qiming Zhang <qiming1.zhang@intel.com>	2025-09-02 04:06:53 +00:00
Benji Beck	1fa1d6a9a0	Migrate OvisImagePatchInputs to TensorSchema (#22024 ) Signed-off-by: Benji Beck <benjibeck@meta.com>	2025-09-02 12:01:36 +08:00
Maximilien de Bayser	d59c986444	Remove runtime checks based on pooling params (#24051 ) Signed-off-by: Max de Bayser <mbayser@br.ibm.com>	2025-09-02 11:54:37 +08:00
damon	04d0c60770	[Bugfix] Fix the issue that Blip2ForConditionalGeneration' object has… (#24028 ) Signed-off-by: Dazhi Jiang <dazhi_jiang@163.com>	2025-09-02 11:54:20 +08:00
Asaf Joseph Gardin	2b41cbbf03	[V1][Mamba1] - FP32 SSM Kernel Support (#23506 ) Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com>	2025-09-01 20:53:00 -07:00
Didier Durand	0235103cbb	[Doc]: fix typos in Python comments (#24042 ) Signed-off-by: Didier Durand <durand.didier@gmail.com> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>	2025-09-01 19:07:45 -07:00
Lucia Fang	a344a5aa0a	[bugfix]fix MTP hidden states (#24056 ) Signed-off-by: Lu Fang <fanglu@fb.com>	2025-09-01 21:09:37 +00:00
Woosuk Kwon	5685370271	[Chore][V0 Deprecation] Move LogProb to a separate file (#24055 ) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>	2025-09-01 12:07:53 -07:00
WeiQing Chen	a0e0efd6bd	[Model] Support DP for ViT on Kimi-VL-A3B-Thinking-2506 (#23817 ) Signed-off-by: Junhong <liujunhong11@huawei.com> Signed-off-by: LJH-LBJ <98734602+LJH-LBJ@users.noreply.github.com> Co-authored-by: Junhong <liujunhong11@huawei.com> Co-authored-by: LJH-LBJ <98734602+LJH-LBJ@users.noreply.github.com> Co-authored-by: Isotr0py <2037008807@qq.com>	2025-09-01 16:56:56 +00:00
Christian Pinto	cf91a89dd2	[docs][misc] IOProcessor plugins fixes (#24046 ) Signed-off-by: Christian Pinto <christian.pinto@ibm.com>	2025-09-01 09:17:41 -07:00
Woosuk Kwon	39a22dcaac	[Misc] Minor code simplification for spec decode (#24053 ) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>	2025-09-01 08:54:01 -07:00
Julien Debache	41c80698b3	Document multi-proc method selection for profiling (#23802 ) Signed-off-by: jdebache <jdebache@nvidia.com>	2025-09-01 06:28:26 -07:00
Kwai-Keye	7c8271cd1e	[Model]: support KeyeVL-1_5-8B (#23838 ) Signed-off-by: wangruitao <wangruitao@kuaishou.com> Co-authored-by: wangruitao <wangruitao@kuaishou.com>	2025-09-01 03:50:27 -07:00
Kay Yan	3e330fcb21	[Doc]: Fix CPU install docs: force torch-backend=cpu to avoid GPU torchvision errors (#24033 ) Signed-off-by: Kay Yan <kay.yan@daocloud.io>	2025-09-01 03:34:52 -07:00
Nicolò Lucchesi	d46934b229	[Frontend] Gemma3n audio `transcriptions`/`translations` endpoint (#23735 ) Signed-off-by: NickLucche <nlucches@redhat.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>	2025-09-01 18:07:46 +08:00
Didier Durand	107284959a	[Doc]: fix typos in Python comments (#24026 ) Signed-off-by: Didier Durand <durand.didier@gmail.com>	2025-09-01 09:38:20 +00:00
Jee Jee Li	dc1a53186d	[Kernel] Update DeepGEMM to latest commit (#23915 ) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>	2025-09-01 02:38:04 -07:00
wang.yuqi	55602bb2e6	[Frontend] Update the warning log when using VLLM_ALLOW_LONG_MAX_MODEL_LEN (#20904 ) Signed-off-by: wang.yuqi <noooop@126.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>	2025-09-01 08:50:25 +00:00
Isotr0py	d7fbc6ddac	[Misc] Enable V1 FP16 inference on pre-Ampere GPUs (#24022 ) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>	2025-09-01 08:12:22 +00:00
Ning Xie	5438967fbc	[Misc] add hash_function doc string (#24014 ) Signed-off-by: Andy Xie <andy.xning@gmail.com>	2025-08-31 23:11:20 -07:00
Code Jesus	422e793fa6	[Bugfix] Add support for `<tool_call>` format in streaming mode for XLAM Tool Parser (#22769 ) Signed-off-by: Devon Peroutky <devon@kindo.ai>	2025-09-01 14:07:54 +08:00
Christian Pinto	1cb39dbcdd	[Misc] IO Processor plugins for pooling models (#22820 ) Signed-off-by: Christian Pinto <christian.pinto@ibm.com> Signed-off-by: Max de Bayser <mbayser@br.ibm.com> Co-authored-by: Max de Bayser <mbayser@br.ibm.com>	2025-08-31 23:07:12 -07:00
Benji Beck	437c3ce026	Migrate Phi4 inputs to TensorSchema (#23471 ) Signed-off-by: Benji Beck <benjibeck@meta.com>	2025-09-01 14:05:59 +08:00
Ning Xie	499b074bfd	[Misc] refactor code by import as for torch._inductor.config (#23677 ) Signed-off-by: Andy Xie <andy.xning@gmail.com>	2025-09-01 14:05:42 +08:00
Isotr0py	ff0e59d83a	[CI/Build] Improve Tensor Schema tests speed by avoid engine core initialization (#23357 ) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>	2025-08-31 22:52:20 -07:00
Woosuk Kwon	b55713683c	[Misc] Move fast prefill logic to separate method (#24013 ) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>	2025-09-01 05:40:38 +00:00
Jun-Howie	acc1a6e10a	Fix the bug related to loading GPTP INT3 weights. (#23328 ) Signed-off-by: JunHowie <JunHowie@aliyun.com> Co-authored-by: JunHowie <JunHowie@aliyun.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>	2025-09-01 05:39:57 +00:00
Woosuk Kwon	8c742a66d1	[Misc] Avoid redundant copy for encoder-only models (#24012 ) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>	2025-09-01 04:02:43 +00:00
JartX	183a70967a	[BUGFIX] GPTQ quantization compatibility for Qwen3 MOE models (AutoGPTQ and AutoRound-GPTQ) (#23994 ) Signed-off-by: JartX <sagformas@epdcenter.es> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>	2025-09-01 03:33:40 +00:00
Or Ozeri	14b4326b94	v1: Support KV events from connectors (#19737 ) Signed-off-by: Or Ozeri <oro@il.ibm.com>	2025-09-01 01:13:21 +00:00
Nick Hill	752d2e1c36	[Minor] Fix some random typos in comments (#24009 ) Signed-off-by: Nick Hill <nhill@redhat.com>	2025-08-31 16:42:17 -07:00
Xiaodong Wang	81eea3d348	vllm fix check on max vocab size (#22471 ) Signed-off-by: Roger Wang <hey@rogerw.io> Signed-off-by: Roger Wang <hey@rogerw.me> Co-authored-by: Roger Wang <hey@rogerw.io> Co-authored-by: Roger Wang <hey@rogerw.me>	2025-08-31 20:57:05 +08:00
Didier Durand	9701352e4b	[Doc]: fix typos in Python comments (#24001 ) Signed-off-by: Didier Durand <durand.didier@gmail.com>	2025-08-31 08:21:59 +00:00
Roger Wang	749be00a98	[Core][Multimodal] Allow passing `multi_modal_uuids` as multimodal identifiers. (#23394 ) Signed-off-by: Roger Wang <hey@rogerw.io>	2025-08-30 18:01:22 -07:00
Gabriel Marinho	5b8077b8ac	Fix wrong truncate_prompt_tokens type hint (#22761 ) Signed-off-by: Gabriel Marinho <gmarinho@ibm.com> Signed-off-by: Gabriel Marinho <104592062+gmarinho2@users.noreply.github.com> Signed-off-by: Max de Bayser <mbayser@br.ibm.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Max de Bayser <mbayser@br.ibm.com>	2025-08-30 20:39:38 +00:00
Andy Lo	038e9be4eb	[LoRA] Much faster startup when LoRA is enabled (#23777 ) Signed-off-by: Andy Lo <andy@mistral.ai> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>	2025-08-30 15:37:39 +00:00
Ning Xie	68a349114f	[Misc] enhance type hint for rearrange return value (#23519 ) Signed-off-by: Andy Xie <andy.xning@gmail.com>	2025-08-30 06:43:33 -07:00
Ning Xie	e80bca309e	[Refactor] refactor freezing_value/cuda_event initialize outside try finally (#23758 ) Signed-off-by: Andy Xie <andy.xning@gmail.com>	2025-08-30 06:42:25 -07:00
Ning Xie	fb4983e112	[Misc] add reorder_batch AttentionMetadataBuilder (#23798 ) Signed-off-by: Andy Xie <andy.xning@gmail.com>	2025-08-30 06:41:45 -07:00
sadegh.shokatian	379ea2823a	Add LoRA support for DeepSeek models (V2, V3, R1-0528) (#23971 ) Signed-off-by: sadeghja1070 <sadegh.ja1070@gmail.com> Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> Co-authored-by: Claude <noreply@anthropic.com> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>	2025-08-30 06:40:02 -07:00
Jiangyun Zhu	3a6acad431	[Model] Enable encoder DP for MiniCPM-V (#23948 ) Signed-off-by: zjy0516 <riverclouds.zhu@qq.com> Signed-off-by: Jiangyun Zhu <riverclouds.zhu@qq.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>	2025-08-30 06:31:26 -07:00
Ning Xie	5490d633ce	[UT] fix unify_kv_cache_configs when kv cache config needs sort (#23843 )	2025-08-30 11:22:14 +00:00
Jee Jee Li	628d00cd7b	[Bugfix] Fix test_lora_resolvers.py (#23984 ) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>	2025-08-30 11:16:11 +00:00
Thomas Parnell	4071c76cf3	[V1] [Hybrid] Move MiniMaxLinearAttention into layers/mamba (#23831 ) Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>	2025-08-30 00:16:15 -07:00
Cyrus Leung	f1bddbd852	[Core] Cleanup TPU model runner for MM (#23894 ) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>	2025-08-30 00:14:58 -07:00
Yong Hoon Shin	9748c5198b	[CI] Fix broken compile tests due to unsupported SiluMul+Nvfp4Quant fusion (#23973 ) Signed-off-by: Yong Hoon Shin <yhshin@meta.com> Co-authored-by: Roger Wang <hey@rogerw.io>	2025-08-30 00:14:43 -07:00
Roger Wang	ee52a32705	[CI] Move testing image from remote URL to S3 (#23980 ) Signed-off-by: Roger Wang <hey@rogerw.io>	2025-08-29 21:41:25 -07:00
Xin Yang	8fb85b7bb6	Add routed_scaling_factor to MoE grouped topk (#23123 ) Signed-off-by: Xin Yang <xyangx@amazon.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>	2025-08-29 21:36:48 -07:00
dubejf	5b31cb1781	[Bugfix] Fix --config arg expansion called from api_server.py (#23944 ) Signed-off-by: Jean-Francois Dube <dubejf+gh@gmail.com> Co-authored-by: Jean-Francois Dube <dubejf+gh@gmail.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>	2025-08-29 21:36:39 -07:00
Roger Wang	d660c98c1b	[CI] Fix unavailable image remote URL (#23966 ) Signed-off-by: Roger Wang <hey@rogerw.io>	2025-08-29 15:40:04 -07:00
Harry Mellor	5674a40366	[Misc] Make `download_weights_from_hf` more reliable (#23863 ) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>	2025-08-29 12:37:24 -07:00
Yong Hoon Shin	8c3e199998	Revert gemma3n fast prefill changes (#23897 ) Signed-off-by: Yong Hoon Shin <yhshin@meta.com>	2025-08-29 12:16:57 -07:00
Thomas Parnell	1c26b42296	[Docs] [V1] [Hybrid] Add new documentation re: contributing mamba-based models (#23824 ) Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>	2025-08-29 18:47:58 +00:00
Michael Goin	b7adf94c4a	Tuned H100/H200 triton fp8 block configs for fused_qkv_a_proj (#23939 ) Signed-off-by: mgoin <mgoin64@gmail.com>	2025-08-29 10:28:35 -07:00
22quinn	4d7fe40fc0	[RL][BugFix] Fix missing tokenizer error for token-in-token-out (#23904 ) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>	2025-08-30 01:09:55 +08:00
yzds	0dc9532065	[BUGFIX ] fix undefined silu_and_mul_nvfp4_quant (#23929 ) Signed-off-by: hongchao <hongchao@msh.team> Signed-off-by: Richard Zou <zou3519@gmail.com> Co-authored-by: hongchao <hongchao@msh.team> Co-authored-by: Richard Zou <zou3519@gmail.com> Co-authored-by: Richard Zou <zou3519@users.noreply.github.com>	2025-08-29 09:36:39 -07:00
vllmellm	72a69132dc	[CI] Add `aiter` to matching list of issue auto labeller for `rocm` tag (#23942 ) Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>	2025-08-29 15:29:21 +00:00
Nick Hill	d90d8eb674	[BugFix] Async scheduling and PP compatibility with DP (#23770 ) Signed-off-by: Nick Hill <nhill@redhat.com>	2025-08-29 08:17:27 -07:00
Lukas Geiger	0a2f4c0793	[Models] Use in-place adds in Idefics2Vision (#23932 ) Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>	2025-08-29 07:42:57 -07:00
EduardDurech	1cf3753b90	[MODEL] `Apertus` and `XIELU` (#23068 ) Signed-off-by: EduardDurech <39579228+EduardDurech@users.noreply.github.com> Co-authored-by: AllenHaoHuang <allenhuangdd@gmail.com>	2025-08-29 20:29:18 +08:00
Adit Chawdhary	4f7cde7272	Adds `json_count_leaves` utility function (#23899 ) Signed-off-by: aditchawdhary <aditxy@hotmail.com>	2025-08-29 05:28:13 -07:00
Huy Do	67c14906aa	Update PyTorch to 2.8.0 (#20358 ) Signed-off-by: Huy Do <huydhn@gmail.com> Co-authored-by: Michael Goin <mgoin64@gmail.com>	2025-08-29 18:57:35 +08:00
Flora Feng	69f46359dd	[Multimodal] Consolidate mm inputs into MultiModalFeatureSpec (#23779 ) Signed-off-by: sfeng33 <4florafeng@gmail.com>	2025-08-29 18:36:57 +08:00
wang.yuqi	d9e00dbd1f	[Performance] V1 Classify Models E2E Performance Optimization (#23541 ) Signed-off-by: wang.yuqi <noooop@126.com>	2025-08-29 03:12:32 -07:00
Li, Jiang	ad39106b16	[CPU] Enable data parallel for CPU backend (#23903 ) Signed-off-by: jiang1.li <jiang1.li@intel.com>	2025-08-29 02:19:58 -07:00