Compare commits

...

109 Commits

Author SHA1 Message Date
fefed35cee fix
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-01 18:58:00 -07:00
901afda905 wip
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-01 09:32:49 -07:00
39a22dcaac [Misc] Minor code simplification for spec decode (#24053)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-01 08:54:01 -07:00
41c80698b3 Document multi-proc method selection for profiling (#23802)
Signed-off-by: jdebache <jdebache@nvidia.com>
2025-09-01 06:28:26 -07:00
7c8271cd1e [Model]: support KeyeVL-1_5-8B (#23838)
Signed-off-by: wangruitao <wangruitao@kuaishou.com>
Co-authored-by: wangruitao <wangruitao@kuaishou.com>
2025-09-01 03:50:27 -07:00
3e330fcb21 [Doc]: Fix CPU install docs: force torch-backend=cpu to avoid GPU torchvision errors (#24033)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
2025-09-01 03:34:52 -07:00
d46934b229 [Frontend] Gemma3n audio transcriptions/translations endpoint (#23735)
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-09-01 18:07:46 +08:00
107284959a [Doc]: fix typos in Python comments (#24026)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
2025-09-01 09:38:20 +00:00
dc1a53186d [Kernel] Update DeepGEMM to latest commit (#23915)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-09-01 02:38:04 -07:00
55602bb2e6 [Frontend] Update the warning log when using VLLM_ALLOW_LONG_MAX_MODEL_LEN (#20904)
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-01 08:50:25 +00:00
d7fbc6ddac [Misc] Enable V1 FP16 inference on pre-Ampere GPUs (#24022)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-01 08:12:22 +00:00
5438967fbc [Misc] add hash_function doc string (#24014)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-31 23:11:20 -07:00
422e793fa6 [Bugfix] Add support for <tool_call> format in streaming mode for XLAM Tool Parser (#22769)
Signed-off-by: Devon Peroutky <devon@kindo.ai>
2025-09-01 14:07:54 +08:00
1cb39dbcdd [Misc] IO Processor plugins for pooling models (#22820)
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Max de Bayser <mbayser@br.ibm.com>
2025-08-31 23:07:12 -07:00
437c3ce026 Migrate Phi4 inputs to TensorSchema (#23471)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-09-01 14:05:59 +08:00
499b074bfd [Misc] refactor code by import as for torch._inductor.config (#23677)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-09-01 14:05:42 +08:00
ff0e59d83a [CI/Build] Improve Tensor Schema tests speed by avoid engine core initialization (#23357)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-31 22:52:20 -07:00
b55713683c [Misc] Move fast prefill logic to separate method (#24013)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-01 05:40:38 +00:00
acc1a6e10a Fix the bug related to loading GPTP INT3 weights. (#23328)
Signed-off-by: JunHowie <JunHowie@aliyun.com>
Co-authored-by: JunHowie <JunHowie@aliyun.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-01 05:39:57 +00:00
8c742a66d1 [Misc] Avoid redundant copy for encoder-only models (#24012)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-01 04:02:43 +00:00
183a70967a [BUGFIX] GPTQ quantization compatibility for Qwen3 MOE models (AutoGPTQ and AutoRound-GPTQ) (#23994)
Signed-off-by: JartX <sagformas@epdcenter.es>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-01 03:33:40 +00:00
14b4326b94 v1: Support KV events from connectors (#19737)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-09-01 01:13:21 +00:00
752d2e1c36 [Minor] Fix some random typos in comments (#24009)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-31 16:42:17 -07:00
81eea3d348 vllm fix check on max vocab size (#22471)
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.me>
2025-08-31 20:57:05 +08:00
9701352e4b [Doc]: fix typos in Python comments (#24001)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
2025-08-31 08:21:59 +00:00
749be00a98 [Core][Multimodal] Allow passing multi_modal_uuids as multimodal identifiers. (#23394)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-08-30 18:01:22 -07:00
5b8077b8ac Fix wrong truncate_prompt_tokens type hint (#22761)
Signed-off-by: Gabriel Marinho <gmarinho@ibm.com>
Signed-off-by: Gabriel Marinho <104592062+gmarinho2@users.noreply.github.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Max de Bayser <mbayser@br.ibm.com>
2025-08-30 20:39:38 +00:00
038e9be4eb [LoRA] Much faster startup when LoRA is enabled (#23777)
Signed-off-by: Andy Lo <andy@mistral.ai>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-30 15:37:39 +00:00
68a349114f [Misc] enhance type hint for rearrange return value (#23519)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-30 06:43:33 -07:00
e80bca309e [Refactor] refactor freezing_value/cuda_event initialize outside try finally (#23758)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-30 06:42:25 -07:00
fb4983e112 [Misc] add reorder_batch AttentionMetadataBuilder (#23798)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-30 06:41:45 -07:00
379ea2823a Add LoRA support for DeepSeek models (V2, V3, R1-0528) (#23971)
Signed-off-by: sadeghja1070 <sadegh.ja1070@gmail.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-30 06:40:02 -07:00
3a6acad431 [Model] Enable encoder DP for MiniCPM-V (#23948)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
Signed-off-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-08-30 06:31:26 -07:00
5490d633ce [UT] fix unify_kv_cache_configs when kv cache config needs sort (#23843) 2025-08-30 11:22:14 +00:00
628d00cd7b [Bugfix] Fix test_lora_resolvers.py (#23984)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-30 11:16:11 +00:00
4071c76cf3 [V1] [Hybrid] Move MiniMaxLinearAttention into layers/mamba (#23831)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-30 00:16:15 -07:00
f1bddbd852 [Core] Cleanup TPU model runner for MM (#23894)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-30 00:14:58 -07:00
9748c5198b [CI] Fix broken compile tests due to unsupported SiluMul+Nvfp4Quant fusion (#23973)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-08-30 00:14:43 -07:00
ee52a32705 [CI] Move testing image from remote URL to S3 (#23980)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-08-29 21:41:25 -07:00
8fb85b7bb6 Add routed_scaling_factor to MoE grouped topk (#23123)
Signed-off-by: Xin Yang <xyangx@amazon.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-29 21:36:48 -07:00
5b31cb1781 [Bugfix] Fix --config arg expansion called from api_server.py (#23944)
Signed-off-by: Jean-Francois Dube <dubejf+gh@gmail.com>
Co-authored-by: Jean-Francois Dube <dubejf+gh@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-29 21:36:39 -07:00
d660c98c1b [CI] Fix unavailable image remote URL (#23966)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-08-29 15:40:04 -07:00
5674a40366 [Misc] Make download_weights_from_hf more reliable (#23863)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-29 12:37:24 -07:00
8c3e199998 Revert gemma3n fast prefill changes (#23897)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-08-29 12:16:57 -07:00
1c26b42296 [Docs] [V1] [Hybrid] Add new documentation re: contributing mamba-based models (#23824)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-29 18:47:58 +00:00
b7adf94c4a Tuned H100/H200 triton fp8 block configs for fused_qkv_a_proj (#23939)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-29 10:28:35 -07:00
4d7fe40fc0 [RL][BugFix] Fix missing tokenizer error for token-in-token-out (#23904)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-08-30 01:09:55 +08:00
0dc9532065 [BUGFIX ] fix undefined silu_and_mul_nvfp4_quant (#23929)
Signed-off-by: hongchao <hongchao@msh.team>
Signed-off-by: Richard Zou <zou3519@gmail.com>
Co-authored-by: hongchao <hongchao@msh.team>
Co-authored-by: Richard Zou <zou3519@gmail.com>
Co-authored-by: Richard Zou <zou3519@users.noreply.github.com>
2025-08-29 09:36:39 -07:00
72a69132dc [CI] Add aiter to matching list of issue auto labeller for rocm tag (#23942)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-08-29 15:29:21 +00:00
d90d8eb674 [BugFix] Async scheduling and PP compatibility with DP (#23770)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-29 08:17:27 -07:00
0a2f4c0793 [Models] Use in-place adds in Idefics2Vision (#23932)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-08-29 07:42:57 -07:00
1cf3753b90 [MODEL] Apertus and XIELU (#23068)
Signed-off-by: EduardDurech <39579228+EduardDurech@users.noreply.github.com>
Co-authored-by: AllenHaoHuang <allenhuangdd@gmail.com>
2025-08-29 20:29:18 +08:00
4f7cde7272 Adds json_count_leaves utility function (#23899)
Signed-off-by: aditchawdhary <aditxy@hotmail.com>
2025-08-29 05:28:13 -07:00
67c14906aa Update PyTorch to 2.8.0 (#20358)
Signed-off-by: Huy Do <huydhn@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-08-29 18:57:35 +08:00
69f46359dd [Multimodal] Consolidate mm inputs into MultiModalFeatureSpec (#23779)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
2025-08-29 18:36:57 +08:00
d9e00dbd1f [Performance] V1 Classify Models E2E Performance Optimization (#23541)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-08-29 03:12:32 -07:00
ad39106b16 [CPU] Enable data parallel for CPU backend (#23903)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-08-29 02:19:58 -07:00
2554b27baa [V0 Deprecation] Remove pooling model support in V0 (#23434)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-29 00:04:02 -07:00
934bebf192 Better errors for Transformers backend missing features (#23759)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-29 07:01:40 +00:00
885ca6d31d [Misc] Fix warnings for mistral model (#23552)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
Signed-off-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2025-08-29 06:58:48 +00:00
2d0afcc9dc [mrope][Qwen2-VL] Fix edge case where getting index of image/video token can potentially throw in default vl mrope implementation. (#23895)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
2025-08-28 23:29:13 -07:00
b4f9e9631c [CI/Build] Clean up LoRA test (#23890)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-28 23:28:35 -07:00
05d839c19e Fix(async): Add support for truncate_prompt_tokens in AsyncLLM (#23800) 2025-08-28 22:55:06 -07:00
6597d7a456 [Platform] import activation_quant_fusion for CUDA only (#23882)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-28 22:54:16 -07:00
5264015d74 [BugFix][AMD][Deepseek] fix a dtype mismatch error for deepseek running on AMD (#23864)
Signed-off-by: Jinghui Zhang <jinghuizhang0804@gmail.com>
2025-08-28 22:54:12 -07:00
98ac0cb32d [Bugfix] Use ReplicatedLinear for SequenceClassification head (#23836)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-29 04:41:20 +00:00
c8b3b299c9 [tests] Improve speed and reliability of test_transcription_api_correctness (#23854)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-08-29 04:25:33 +00:00
006477e60b [ROCm][Fix] Fix rocm build caused by #23791 (#23847)
Signed-off-by: charlifu <charlifu@amd.com>
2025-08-28 19:52:27 -07:00
de533ab2a1 [Models] Improve iteration over layers (#19497)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-08-29 09:26:34 +08:00
235c9db8a7 [XPU] support data parallel for MoE models on XPU (#22887)
Signed-off-by: chzhang <chaojun.zhang@intel.com>
2025-08-29 09:23:04 +08:00
b668055a11 [V0 Deprecation] Remove V0 Samplers test (#23862) 2025-08-28 18:05:52 -07:00
d3d2aad5a2 [Log] Use Debug Once for DeepGEMM E8M0 When not Enabled (#23858) 2025-08-28 22:18:10 +00:00
cb293f6a79 [V1] Enable prefill optimization for Gemma3n (#22628)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-08-28 14:54:30 -07:00
7ffbf27239 [BugFix][FlashInfer] Fix potential race condition for paged_kv_indptr_cpu (#23737)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-28 14:22:46 -07:00
27e88cee74 chore: build release image by default (#23852)
Signed-off-by: Codex <codex@openai.com>
2025-08-28 13:17:15 -07:00
16a45b3a28 [NVIDIA] Support SiluMul + NVFP4 quant fusion (#23671)
Signed-off-by: jindih <jindih@nvidia.com>
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Co-authored-by: jindih <jindih@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Luka Govedic <lgovedic@redhat.com>
2025-08-28 19:36:50 +00:00
57d4ede520 [bugfix] [spec-decoding] fix data race in sample_recovered_tokens_kernel (vLLM v1) (#23829)
Signed-off-by: He-Jingkai <he-jingkai@outlook.com>
2025-08-28 19:05:20 +00:00
04d1dd7f4a [ROCm][Aiter] Add triton fp8 bmm kernel for mla (#23264)
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
Co-authored-by: ShaoChunLee <Shao-Chun.Lee@amd.com>
2025-08-28 18:18:08 +00:00
f32a5bc505 Migrate Llama4ImagePatchInputs to TensorSchema (#22021)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-08-28 17:29:37 +00:00
8805ad9fa9 Add scale_config.yml file for Meta autoscalers for GH Actions (#23840)
Signed-off-by: Jean Schmidt <contato@jschmidt.me>
2025-08-28 09:31:20 -07:00
0583578f42 [ci] breaks down V1 Test into 3 groups of approx 30 minutes runtime (#23757)
Signed-off-by: Jean Schmidt <contato@jschmidt.me>
2025-08-28 08:59:19 -07:00
db74d60490 [Bugfix] Add fake mode around passes (#23349)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-08-28 11:25:56 -04:00
95089607fa [Model][gpt-oss] Support DP+EP for GPT-OSS with FlashInfer trtllm-gen MoE (#23819)
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
2025-08-28 06:56:20 -07:00
1f096f9b95 [CI] Fix linting error on main (#23835)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-28 06:52:01 -07:00
66548f6603 [Bugfix] Fix benchmark_moe.py for blockwise fp8. (#23823)
Signed-off-by: crischeng <420985011@qq.com>
Co-authored-by: cris <grace@guisenbindeMacBook-Pro.local>
2025-08-28 21:44:09 +08:00
d3da2eea54 [Doc]: fix typos in Python scripts (#23828)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
2025-08-28 05:37:38 -07:00
bfab219648 [Model] [gpt-oss] fix gpt-oss pp support (#23815)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-08-28 05:36:55 -07:00
a3432f18fd [BugFix][Spec Decode] Use float64 for uniform_probs (#23803)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-28 12:26:45 +00:00
67cee40da0 [CI/Build][Bugfix] Fix Qwen VL tests on CPU (#23818)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-08-28 11:57:05 +00:00
d99c3a4f7b [Doc]: fix typos in .md files (including those of #23751) (#23825)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
2025-08-28 04:38:19 -07:00
3462c1c522 [FIXBUG] Add return_success parameter to moe_wna16_weight_loader function (#22797)
Signed-off-by: JartX <sagformas@epdcenter.es>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-08-28 09:03:22 +00:00
c5d004aaaf [Model] Add PP support and VLM backbone compatability for GPT-OSS (#23680)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-28 16:03:28 +08:00
11a7fafaa8 [New Model]: Support GteNewModelForSequenceClassification (#23524)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-08-28 15:36:42 +08:00
186aced5ff [Kernel] cuda kernels for upcoming decode context parallel feature (#23791)
Co-authored-by: hongchao <hongchao@msh.team>
2025-08-28 15:29:11 +08:00
daa1273b14 [Bugfix] when set offline model running error (#23711)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-08-28 07:27:45 +00:00
c07a73317d [CI] enable idefics3 and fuyu-8b test in multimodal test (#23790)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-08-28 14:51:24 +08:00
22feac8e95 [Transform] [Quantization] Add transforms to compressed tensors (#22486) 2025-08-28 02:43:48 -04:00
c8851a4723 Add deprecation warning for lora_extra_vocab_size (#23635)
Signed-off-by: Jinheng Li <ahengljh@gmail.com>
2025-08-27 22:34:29 -07:00
f48a9af892 [CI] make all multi-gpu weight loading tests run nightly (#23792)
Signed-off-by: Alex Yun <alexyun04@gmail.com>
2025-08-27 21:27:36 -07:00
a11adafdca Gracefully handle edge cases in harmony utils (#23155)
Signed-off-by: Jan Kessler <jakessle@uni-mainz.de>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-27 20:14:00 -07:00
a781e84ec2 [Perf] Tune configs for triton block fp8 gemm H100/H200 (#23748)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-28 11:12:53 +08:00
1b7b161a09 [Feature] models: pass layer prefix to replace_linear_class for per-layer quantization routing. Addresses #23239 (#23556)
Signed-off-by: Shrey Gupta <shreyg1303@gmail.com>
2025-08-27 20:12:44 -07:00
a69693e38f Migrate Qwen inputs to TensorSchema (#23473)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-08-28 10:43:26 +08:00
5da4f5d857 [Bugfix] Fix for V1 priority scheduling crashes at preemption (#23713)
Signed-off-by: Hanchenli <lihanc2002@gmail.com>
2025-08-28 00:44:52 +00:00
321938e9ac [Feature] Add VLLM_DISABLE_PAD_FOR_CUDAGRAPH to Avoid Hang Issue (#23595)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-27 21:52:24 +00:00
f9ca2b40a0 [Bugfix] Fix Marlin NVFP4 for modelopt (#23659)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-27 17:48:16 -04:00
082cc07ef8 DP/EP Support for gpt-oss with deepep-ht comm kernel on SM100 (#23608) 2025-08-27 17:33:21 -04:00
853c371fc3 [V1][Mamba] - Enable V1 by default for Mamba Models (#23650)
Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com>
2025-08-27 20:53:30 +00:00
8bf6266a17 [Multimodal] Generate mm_hash based on request metadata when caching is turned off (#23690)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-08-27 20:24:31 +00:00
395 changed files with 11544 additions and 5025 deletions

View File

@ -62,12 +62,8 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- block: "Build release image (x86)"
depends_on: ~
key: block-release-image-build
- label: "Build release image (x86)"
depends_on: block-release-image-build
depends_on: ~
id: build-release-image-x86
agents:
queue: cpu_queue_postmerge
@ -80,7 +76,7 @@ steps:
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build release image (arm64)"
depends_on: block-release-image-build
depends_on: ~
id: build-release-image-arm64
agents:
queue: arm64_cpu_queue_postmerge

View File

@ -164,7 +164,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_generate_multiple_loras.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi

View File

@ -25,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
function cpu_tests() {
set -e
@ -49,23 +49,23 @@ function cpu_tests() {
# Run kernel tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -v -s tests/kernels/test_onednn.py"
pytest -x -v -s tests/kernels/test_onednn.py"
# Run basic model test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
# Note: disable until supports V1
# pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
# pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
# Note: disable Bart until supports V1
pytest -v -s tests/models/language/generation -m cpu_model \
pytest -x -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
pytest -v -s tests/models/language/pooling -m cpu_model
pytest -v -s tests/models/multimodal/generation \
pytest -x -v -s tests/models/language/pooling -m cpu_model
pytest -x -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_mllama.py \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"
@ -73,33 +73,49 @@ function cpu_tests() {
# Run compressed-tensor test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
pytest -x -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
# Note: disable it until supports V1
# Run AWQ test
# docker exec cpu-test-"$NUMA_NODE" bash -c "
# set -e
# VLLM_USE_V1=0 pytest -s -v \
# VLLM_USE_V1=0 pytest -x -s -v \
# tests/quantization/test_ipex_quant.py"
# Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
pytest -x -s -v \
tests/lora/test_qwen2vl.py"
# online serving
# online serving: tp+pp
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions'
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
# online serving: tp+dp
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
}
# All of CPU tests are expected to be finished less than 40 mins.

View File

@ -109,10 +109,9 @@ steps:
- tests/entrypoints/offline_mode
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Test (API Server) # 40min
@ -234,7 +233,26 @@ steps:
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
- label: V1 Test
- label: V1 Test e2e + engine
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/v1
commands:
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
- pytest -v -s v1/engine
- label: V1 Test entrypoints
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/v1
commands:
- pytest -v -s v1/entrypoints
- label: V1 Test others
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@ -242,8 +260,6 @@ steps:
commands:
# split the test to avoid interference
- pytest -v -s v1/core
- pytest -v -s v1/engine
- pytest -v -s v1/entrypoints
- pytest -v -s v1/executor
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
@ -256,9 +272,6 @@ steps:
- pytest -v -s v1/test_utils.py
- pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_metrics_reader.py
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@ -312,7 +325,7 @@ steps:
source_file_dependencies:
- vllm/lora
- tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py
parallelism: 4
- label: PyTorch Compilation Unit Tests
@ -449,8 +462,8 @@ steps:
- tests/quantization
commands:
# temporary install here since we need nightly, will move to requirements/test.in
# after torchao 0.12 release
- pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
# after torchao 0.12 release, and pin a working version of torchao nightly here
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min
@ -553,8 +566,7 @@ steps:
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
- pytest -v -s models/multimodal/processing/test_tensor_schema.py
- pytest -v -s models/multimodal/processing
- label: Multi-Modal Models Test (Standard)
mirror_hardwares: [amdexperimental]
@ -654,6 +666,7 @@ steps:
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
@ -663,6 +676,7 @@ steps:
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
##### 1 GPU test #####
##### multi gpus test #####
@ -755,6 +769,11 @@ steps:
- pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y
# end platform plugin tests
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
- pip install -e ./plugins/prithvi_io_processor_plugin
- pytest -v -s plugins_tests/test_io_processor_plugins.py
- pip uninstall prithvi_io_processor_plugin -y
# end io_processor plugins test
# other tests continue here:
- pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model
@ -791,13 +810,14 @@ steps:
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_multi_loras_with_tp.py
- pytest -v -s -x lora/test_llm_with_multi_loras.py
- label: Weight Loading Multiple GPU Test # 33min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_gpus: 2
optional: true
source_file_dependencies:
- vllm/
- tests/weight_loading

.github/scale-config.yml (vendored, new file, 21 lines)
View File

@ -0,0 +1,21 @@
# scale-config.yml:
# Powers what instance types are available for GHA auto-scaled
# runners. Runners listed here will be available as self hosted
# runners, configuration is directly pulled from the main branch.
# runner_types:
# runner_label:
# instance_type: m4.large
# os: linux
# # min_available defaults to the global cfg in the ALI Terraform
# min_available: undefined
# # when max_available value is not defined, no max runners is enforced
# max_available: undefined
# disk_size: 50
# is_ephemeral: true
runner_types:
linux.2xlarge:
disk_size: 150
instance_type: c5.2xlarge
is_ephemeral: true
os: linux

View File

@ -49,6 +49,10 @@ jobs:
term: "VLLM_ROCM_",
searchIn: "both"
},
{
term: "aiter",
searchIn: "title"
},
{
term: "rocm",
searchIn: "title"

View File

@ -45,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")
set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
#
# Try to find python package with an executable that exactly matches
@ -541,6 +541,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@ -559,6 +560,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")

View File

@ -0,0 +1,114 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
w8a8_block_fp8_matmul,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton as vllm_triton
assert current_platform.is_cuda(), (
"Only support benchmarking w8a8 block fp8 kernel on CUDA device."
)
# DeepSeek-V3 weight shapes
DEEPSEEK_V3_SHAPES = [
(512 + 64, 7168),
(2112, 7168),
((128 + 64) * 128, 7168),
(128 * (128 + 128), 512),
(7168, 16384),
(7168, 18432),
(18432 * 2, 7168),
(24576, 1536),
(12288, 7168),
(4096, 7168),
(7168, 2048),
]
def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
"""Build runner function for w8a8 block fp8 matmul."""
factor_for_scale = 1e-2
fp8_info = torch.finfo(torch.float8_e4m3fn)
fp8_max, fp8_min = fp8_info.max, fp8_info.min
# Create random FP8 tensors
A_fp32 = (torch.rand(M, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
B_fp32 = (torch.rand(N, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
# Create scales
block_n, block_k = block_size[0], block_size[1]
n_tiles = (N + block_n - 1) // block_n
k_tiles = (K + block_k - 1) // block_k
As = torch.rand(M, k_tiles, dtype=torch.float32, device=device) * factor_for_scale
Bs = (
torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device)
* factor_for_scale
)
def run():
return w8a8_block_fp8_matmul(A, B, As, Bs, block_size, torch.bfloat16)
return run
@vllm_triton.testing.perf_report(
vllm_triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
x_log=False,
line_arg="provider",
line_vals=["torch-bf16", "w8a8-block-fp8"],
line_names=["torch-bf16", "w8a8-block-fp8"],
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs W8A8 Block FP8 GEMMs",
args={},
)
)
def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)):
M = batch_size
device = "cuda"
quantiles = [0.5, 0.2, 0.8]
if provider == "torch-bf16":
a = torch.randn((M, K), device=device, dtype=torch.bfloat16)
b = torch.randn((N, K), device=device, dtype=torch.bfloat16)
ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
)
else: # w8a8-block-fp8
run_w8a8 = build_w8a8_block_fp8_runner(M, N, K, block_size, device)
ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
lambda: run_w8a8(), quantiles=quantiles
)
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
if __name__ == "__main__":
block_size = (128, 128)
for N, K in DEEPSEEK_V3_SHAPES:
print(f"\nBenchmarking DeepSeek-V3, N={N} K={K}")
print(f"TFLOP/s comparison (block_size={block_size}):")
benchmark_tflops.run(
print_data=True,
# show_plots=False,
# save_path=f"bench_w8a8_block_fp8_tflops_n{N}_k{K}",
N=N,
K=K,
block_size=block_size,
)
print("\nBenchmark finished!")

View File

@ -419,8 +419,10 @@ class BenchmarkWorker:
)
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
# is the intermediate size after silu_and_mul.
block_n = block_quant_shape[0] if block_quant_shape else None
block_k = block_quant_shape[1] if block_quant_shape else None
op_config = get_moe_configs(
num_experts, shard_intermediate_size // 2, dtype_str
num_experts, shard_intermediate_size // 2, dtype_str, block_n, block_k
)
if op_config is None:
config = get_default_config(
@ -430,6 +432,7 @@ class BenchmarkWorker:
hidden_size,
topk,
dtype_str,
block_quant_shape,
)
else:
config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))]

View File

@ -141,6 +141,7 @@ def get_weight_shapes(tp_size):
# cannot TP
total = [
(512 + 64, 7168),
(2112, 7168),
((128 + 64) * 128, 7168),
(128 * (128 + 128), 512),
(7168, 16384),
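For context on the newly added shape: 2112 = 1536 + 512 + 64, which matches the fused_qkv_a_proj output width for DeepSeek-V3 under the usual public dimensions (q_lora_rank 1536, kv_lora_rank 512, qk_rope_head_dim 64) targeted by the tuning in #23939; this breakdown is an inference from those dimensions, not something stated in the diff.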

View File

@ -36,6 +36,13 @@ void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
const std::string& kv_cache_dtype,
torch::Tensor& scale);
void cp_fused_concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
torch::Tensor& cp_local_token_select_indices,
torch::Tensor& kv_cache,
torch::Tensor& slot_mapping,
const std::string& kv_cache_dtype,
torch::Tensor& scale);
// Just for unittest
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
const double scale, const std::string& kv_cache_dtype);
@ -47,4 +54,12 @@ void gather_and_maybe_dequant_cache(
torch::Tensor const& cu_seq_lens, // [BATCH+1]
int64_t batch_size, const std::string& kv_cache_dtype,
torch::Tensor const& scale,
std::optional<torch::Tensor> seq_starts = std::nullopt);
std::optional<torch::Tensor> seq_starts = std::nullopt);
// TODO(hc): cp_gather_cache needs to support scaled kv-cache in the future.
void cp_gather_cache(
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1]
int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);

View File

@ -1,6 +1,7 @@
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAException.h>
#include "cuda_utils.h"
#include "cuda_compat.h"
@ -395,6 +396,51 @@ __global__ void concat_and_cache_mla_kernel(
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
}
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void cp_fused_concat_and_cache_mla_kernel(
const scalar_t* __restrict__ kv_c, // [num_full_tokens, kv_lora_rank]
const scalar_t* __restrict__ k_pe, // [num_full_tokens, pe_dim]
const int64_t* __restrict__ cp_local_token_select_indices, // [num_tokens]
cache_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank
// + pe_dim)]
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int block_stride, //
const int entry_stride, //
const int kv_c_stride, //
const int k_pe_stride, //
const int kv_lora_rank, //
const int pe_dim, //
const int block_size, //
const float* scale //
) {
const int64_t token_idx = cp_local_token_select_indices[blockIdx.x];
const int64_t slot_idx = slot_mapping[blockIdx.x];
// NOTE: slot_idx can be -1 if the token is padded
if (slot_idx < 0) {
return;
}
const int64_t block_idx = slot_idx / block_size;
const int64_t block_offset = slot_idx % block_size;
auto copy = [&](const scalar_t* __restrict__ src, cache_t* __restrict__ dst,
int src_stride, int dst_stride, int size, int offset) {
for (int i = threadIdx.x; i < size; i += blockDim.x) {
const int64_t src_idx = token_idx * src_stride + i;
const int64_t dst_idx =
block_idx * block_stride + block_offset * entry_stride + i + offset;
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
dst[dst_idx] = src[src_idx];
} else {
dst[dst_idx] =
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(src[src_idx], *scale);
}
}
};
copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
}
} // namespace vllm
// KV_T is the data type of key and value tensors.
@ -508,6 +554,20 @@ void reshape_and_cache_flash(
kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
reinterpret_cast<const float*>(scale.data_ptr()));
// KV_T is the data type of key and value tensors.
// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_CP_FUSED_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \
vllm::cp_fused_concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<KV_T*>(kv_c.data_ptr()), \
reinterpret_cast<KV_T*>(k_pe.data_ptr()), \
cp_local_token_select_indices.data_ptr<int64_t>(), \
reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()), \
slot_mapping.data_ptr<int64_t>(), block_stride, entry_stride, \
kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
reinterpret_cast<const float*>(scale.data_ptr()));
void concat_and_cache_mla(
torch::Tensor& kv_c, // [num_tokens, kv_lora_rank]
torch::Tensor& k_pe, // [num_tokens, pe_dim]
@ -546,6 +606,50 @@ void concat_and_cache_mla(
CALL_CONCAT_AND_CACHE_MLA);
}
// Note(hc): cp_fused_concat_and_cache_mla fuses the following three kernel
// calls into one:
// k_c_normed.index_select(0, cp_local_token_select_indices) + \
// k_pe.squeeze(1).index_select(0, cp_local_token_select_indices) + \
// concat_and_cache_mla.
void cp_fused_concat_and_cache_mla(
torch::Tensor& kv_c, // [num_total_tokens, kv_lora_rank]
torch::Tensor& k_pe, // [num_total_tokens, pe_dim]
torch::Tensor& cp_local_token_select_indices, // [num_tokens]
torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank +
// pe_dim)]
torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
const std::string& kv_cache_dtype, torch::Tensor& scale) {
// NOTE(woosuk): In vLLM V1, key.size(0) can be different from
// slot_mapping.size(0) because of padding for CUDA graphs.
// In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
// both include padding.
// In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
// since key includes padding for CUDA graphs, while slot_mapping does not.
// In this case, slot_mapping.size(0) represents the actual number of tokens
// before padding.
// For compatibility with both cases, we use slot_mapping.size(0) as the
// number of tokens.
int num_tokens = slot_mapping.size(0);
int kv_lora_rank = kv_c.size(1);
int pe_dim = k_pe.size(1);
int block_size = kv_cache.size(1);
TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
int kv_c_stride = kv_c.stride(0);
int k_pe_stride = k_pe.stride(0);
int block_stride = kv_cache.stride(0);
int entry_stride = kv_cache.stride(1);
dim3 grid(num_tokens);
dim3 block(std::min(kv_lora_rank, 512));
const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
CALL_CP_FUSED_CONCAT_AND_CACHE_MLA);
}
namespace vllm {
template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
@ -779,3 +883,145 @@ void gather_and_maybe_dequant_cache(
DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype, CALL_GATHER_CACHE);
}
namespace vllm {
template <typename scalar_t>
// Note(hc): cp_gather_cache no longer requires seq_starts to be divisible by
// block_size.
__global__ void cp_gather_cache(
const scalar_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
// ENTRY_SIZE]
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRY_SIZE]
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
const int32_t block_size, const int32_t entry_size,
const int64_t block_table_stride, const int64_t cache_block_stride,
const int64_t cache_entry_stride, const int64_t dst_entry_stride,
const int32_t* __restrict__ seq_starts // Optional: starting offsets per
// batch
) {
const int64_t bid = blockIdx.x; // Batch ID
const int32_t num_splits = gridDim.y;
const int32_t split = blockIdx.y;
const int32_t seq_start = cu_seq_lens[bid];
const int32_t seq_end = cu_seq_lens[bid + 1];
const int32_t seq_len = seq_end - seq_start;
const int32_t tot_slots = seq_len;
const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
const int32_t split_start = split * split_slots;
const int32_t split_end = min((split + 1) * split_slots, tot_slots);
const bool is_active_split = (split_start < tot_slots);
if (!is_active_split) return;
// Adjust the pointer for the block_table for this batch.
// If seq_starts is provided, compute an offset based on it
const int32_t batch_offset = bid * block_table_stride;
int32_t offset = split_start;
if (seq_starts != nullptr) {
offset += seq_starts[bid];
}
int32_t offset_div = offset / block_size;
offset = offset % block_size;
const int32_t* batch_block_table = block_table + batch_offset;
// Adjust dst pointer based on the cumulative sequence lengths.
dst += seq_start * dst_entry_stride;
auto copy_entry = [&](const scalar_t* __restrict__ _src,
scalar_t* __restrict__ _dst) {
for (int i = threadIdx.x; i < entry_size; i += blockDim.x)
_dst[i] = _src[i];
};
for (int pid = split_start; pid < split_end; ++pid) {
auto block_id = batch_block_table[offset_div];
auto block_start_ptr = src_cache + block_id * cache_block_stride;
auto block_dst_ptr = dst + pid * dst_entry_stride;
copy_entry(block_start_ptr + offset * cache_entry_stride, block_dst_ptr);
offset += 1;
// bump to next block
if (offset == block_size) {
offset_div += 1;
offset = 0;
}
}
}
} // namespace vllm
// Macro to dispatch the kernel based on the data type.
#define CALL_CP_GATHER_CACHE(CPY_DTYPE) \
vllm::cp_gather_cache<CPY_DTYPE><<<grid, block, 0, stream>>>( \
reinterpret_cast<CPY_DTYPE*>(src_cache.data_ptr()), \
reinterpret_cast<CPY_DTYPE*>(dst.data_ptr()), \
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
block_size, entry_size, block_table_stride, cache_block_stride, \
cache_entry_stride, dst_entry_stride, seq_starts_ptr);
// Gather sequences from the cache into the destination tensor.
// - cu_seq_lens contains the cumulative sequence lengths for each batch
// - block_table contains the cache block indices for each sequence
// - Optionally, seq_starts (if provided) offsets the starting slot index by
// seq_starts[bid]
void cp_gather_cache(
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1]
int64_t batch_size,
std::optional<torch::Tensor> seq_starts = std::nullopt) {
at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int32_t block_size = src_cache.size(1);
int32_t entry_size = src_cache.flatten(2, -1).size(2);
TORCH_CHECK(block_table.dtype() == torch::kInt32,
"block_table must be int32");
TORCH_CHECK(cu_seq_lens.dtype() == torch::kInt32,
"cu_seq_lens must be int32");
if (seq_starts.has_value()) {
TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
"seq_starts must be int32");
}
TORCH_CHECK(src_cache.device() == dst.device(),
"src_cache and dst must be on the same device");
TORCH_CHECK(src_cache.device() == block_table.device(),
"src_cache and block_table must be on the same device");
TORCH_CHECK(src_cache.device() == cu_seq_lens.device(),
"src_cache and cu_seq_lens must be on the same device");
if (seq_starts.has_value()) {
TORCH_CHECK(src_cache.device() == seq_starts.value().device(),
"src_cache and seq_starts must be on the same device");
}
int64_t block_table_stride = block_table.stride(0);
int64_t cache_block_stride = src_cache.stride(0);
int64_t cache_entry_stride = src_cache.stride(1);
int64_t dst_entry_stride = dst.stride(0);
// Decide on the number of splits based on the batch size.
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
dim3 grid(batch_size, num_splits);
dim3 block(1024);
TORCH_CHECK(src_cache.dtype() == dst.dtype(),
"src_cache and dst must have the same dtype");
const int dtype_bits = src_cache.element_size() * 8;
const int32_t* seq_starts_ptr =
seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
if (dtype_bits == 32) {
CALL_CP_GATHER_CACHE(uint32_t);
} else if (dtype_bits == 16) {
CALL_CP_GATHER_CACHE(uint16_t);
} else if (dtype_bits == 8) {
CALL_CP_GATHER_CACHE(uint8_t);
} else {
TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits);
}
}
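To make the semantics of the new cp_gather_cache op easier to follow, here is a rough pure-PyTorch reference sketch inferred from the kernel above (the blockIdx.y split is a performance detail and is dropped, entries are assumed to be flattened into a single trailing dimension, and the helper name is hypothetical):

import torch

def ref_cp_gather_cache(src_cache, dst, block_table, cu_seq_lens, seq_starts=None):
    # src_cache: [num_blocks, block_size, entry_size]
    # dst:       [tot_tokens, entry_size]
    # block_table: [batch, max_blocks] int32, cu_seq_lens: [batch + 1] int32
    block_size = src_cache.size(1)
    for b in range(block_table.size(0)):
        start, end = int(cu_seq_lens[b]), int(cu_seq_lens[b + 1])
        base = int(seq_starts[b]) if seq_starts is not None else 0
        for i in range(end - start):
            slot = base + i  # need not start on a block boundary
            block_id = int(block_table[b, slot // block_size])
            dst[start + i] = src_cache[block_id, slot % block_size]
    return dst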

View File

@ -19,6 +19,13 @@
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
#define VLLM_DISPATCH_CASE_HALF_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
#define VLLM_DISPATCH_HALF_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_HALF_TYPES(__VA_ARGS__))
// ROCm devices might use either fn or fnuz, so set up dispatch table for both.
// A host-based check at runtime will create a preferred FP8 type for ROCm
// such that the correct kernel is dispatched.
@ -45,6 +52,15 @@
#define VLLM_DISPATCH_FP8_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FP8_TYPES(__VA_ARGS__))
#define AT_DISPATCH_BYTE_CASE(enum_type, ...) \
AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, byte_t, __VA_ARGS__)
#define VLLM_DISPATCH_CASE_BYTE_TYPES(...) \
AT_DISPATCH_BYTE_CASE(at::ScalarType::Byte, __VA_ARGS__)
#define VLLM_DISPATCH_BYTE_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_BYTE_TYPES(__VA_ARGS__))
#define VLLM_DISPATCH_QUANT_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_QUANT_TYPES(__VA_ARGS__))

View File

@ -130,6 +130,14 @@ void silu_and_mul(torch::Tensor& out, torch::Tensor& input);
void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input,
torch::Tensor& scale);
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
void silu_and_mul_nvfp4_quant(torch::Tensor& out,
torch::Tensor& output_block_scale,
torch::Tensor& input,
torch::Tensor& input_global_scale);
#endif
void mul_and_silu(torch::Tensor& out, torch::Tensor& input);
void gelu_and_mul(torch::Tensor& out, torch::Tensor& input);
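A simplified sketch of what the new silu_and_mul_nvfp4_quant entry point is expected to produce, inferred from the kernel added below (assumptions: the FP8-E4M3 rounding of the scales and the e2m1 packing are omitted, the math is done in fp32 rather than half2, and the helper name is hypothetical):

import torch
import torch.nn.functional as F

def ref_silu_mul_nvfp4(x, global_scale, group=16):
    # x: [num_tokens, 2 * d] half/bfloat16; NVFP4 uses one scale per 16 elements.
    d = x.shape[-1] // 2
    y = F.silu(x[..., :d].float()) * x[..., d:].float()     # SiLU-and-mul
    g = y.reshape(*y.shape[:-1], d // group, group)
    sf = g.abs().amax(dim=-1) * global_scale / 6.0          # 6.0 = max |e2m1|
    q = g * (global_scale / sf.clamp(min=1e-12)).unsqueeze(-1)
    return q, sf  # q would then be rounded and packed into e2m1 nibbles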

View File

@ -0,0 +1,368 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <torch/all.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda_fp8.h>
#include "dispatch_utils.h"
#include "cuda_utils.h"
namespace vllm {
// Get type2 from type or vice versa (applied to half and bfloat16)
template <typename T>
struct TypeConverter {
using Type = half2;
}; // keep for generality
template <>
struct TypeConverter<half2> {
using Type = c10::Half;
};
template <>
struct TypeConverter<c10::Half> {
using Type = half2;
};
template <>
struct TypeConverter<__nv_bfloat162> {
using Type = c10::BFloat16;
};
template <>
struct TypeConverter<c10::BFloat16> {
using Type = __nv_bfloat162;
};
#define ELTS_PER_THREAD 8
constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
constexpr int CVT_FP4_SF_VEC_SIZE = 16;
// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
uint32_t val;
asm volatile(
"{\n"
".reg .b8 byte0;\n"
".reg .b8 byte1;\n"
".reg .b8 byte2;\n"
".reg .b8 byte3;\n"
"cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n"
"cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n"
"cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n"
"cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n"
"mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
"}"
: "=r"(val)
: "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]),
"f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7]));
return val;
#else
return 0;
#endif
}
// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
uint32_t val;
asm volatile(
"{\n"
".reg .b8 byte0;\n"
".reg .b8 byte1;\n"
".reg .b8 byte2;\n"
".reg .b8 byte3;\n"
"cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n"
"cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n"
"cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n"
"cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n"
"mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
"}"
: "=r"(val)
: "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
"f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
return val;
#else
return 0;
#endif
}
// Fast reciprocal.
inline __device__ float reciprocal_approximate_ftz(float a) {
float b;
asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
return b;
}
template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx,
int numCols,
SFType* SFout) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
CVT_FP4_NUM_THREADS_PER_SF == 2);
// One pair of threads write one SF to global memory.
// TODO: stage through smem for packed STG.32
// is it better than STG.8 from 4 threads ?
if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
// SF vector index (16 elements share one SF in the K dimension).
int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
int32_t mIdx = rowIdx;
// SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
// --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
int32_t mTileIdx = mIdx / (32 * 4);
// SF vector size 16.
int factor = CVT_FP4_SF_VEC_SIZE * 4;
int32_t numKTiles = (numCols + factor - 1) / factor;
int64_t mTileStride = numKTiles * 32 * 4 * 4;
int32_t kTileIdx = (kIdx / 4);
int64_t kTileStride = 32 * 4 * 4;
// M tile layout [32, 4] is column-major.
int32_t outerMIdx = (mIdx % 32);
int64_t outerMStride = 4 * 4;
int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
int64_t innerMStride = 4;
int32_t innerKIdx = (kIdx % 4);
int64_t innerKStride = 1;
// Compute the global offset.
int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride +
outerMIdx * outerMStride + innerMIdx * innerMStride +
innerKIdx * innerKStride;
return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
}
#endif
return nullptr;
}
// Define a 16 bytes packed data type.
template <class Type>
struct PackedVec {
typename TypeConverter<Type>::Type elts[4];
};
template <>
struct PackedVec<__nv_fp8_e4m3> {
__nv_fp8x2_e4m3 elts[8];
};
template <class Type>
__inline__ __device__ PackedVec<Type> compute_silu(PackedVec<Type>& vec,
PackedVec<Type>& vec2) {
PackedVec<Type> result;
#pragma unroll
for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; ++i) {
if constexpr (std::is_same_v<Type, c10::Half>) {
half2 val(0.5f, 0.5f);
half2 t0 = __hmul2(vec.elts[i], val);
half2 t1 = __hfma2(h2tanh(t0), val, val);
half2 t2 = __hmul2(vec.elts[i], t1);
result.elts[i] = __hmul2(t2, vec2.elts[i]);
} else {
__nv_bfloat162 val(0.5f, 0.5f);
__nv_bfloat162 t0 = __hmul2(vec.elts[i], val);
__nv_bfloat162 t1 = __hfma2(h2tanh(t0), val, val);
__nv_bfloat162 t2 = __hmul2(vec.elts[i], t1);
result.elts[i] = __hmul2(t2, vec2.elts[i]);
}
}
return result;
}
// Quantizes the provided PackedVec into the uint32_t output
template <class Type, bool UE8M0_SF = false>
__device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec<Type>& vec,
PackedVec<Type>& vec2,
float SFScaleVal,
uint8_t* SFout) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
PackedVec<Type> out_silu = compute_silu(vec, vec2);
// Get absolute maximum values among the local 8 values.
auto localMax = __habs2(out_silu.elts[0]);
// Local maximum value.
#pragma unroll
for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
localMax = __hmax2(localMax, __habs2(out_silu.elts[i]));
}
// Get the absolute maximum among all 16 values (two threads).
localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
// Get the final absolute maximum values.
float vecMax = float(__hmax(localMax.x, localMax.y));
// Get the SF (max value of the vector / max value of e2m1).
// maximum value of e2m1 = 6.0.
// TODO: use half as compute data type.
float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
// 8 bits representation of the SF.
uint8_t fp8SFVal;
// Write the SF to global memory (STG.8).
if constexpr (UE8M0_SF) {
// Extract the 8 exponent bits from float32.
// float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits.
uint32_t tmp = reinterpret_cast<uint32_t&>(SFValue) >> 23;
fp8SFVal = tmp & 0xff;
// Convert back to fp32.
reinterpret_cast<uint32_t&>(SFValue) = tmp << 23;
} else {
// Here SFValue is always positive, so E4M3 is the same as UE4M3.
__nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp;
// Convert back to fp32.
SFValue = float(tmp);
}
// Get the output scale.
// Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
// reciprocal(SFScaleVal)
float outputScale =
SFValue != 0 ? reciprocal_approximate_ftz(
SFValue * reciprocal_approximate_ftz(SFScaleVal))
: 0.0f;
if (SFout) {
// Write the SF to global memory (STG.8).
*SFout = fp8SFVal;
}
// Convert the input to float.
float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
#pragma unroll
for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
if constexpr (std::is_same_v<Type, c10::Half>) {
fp2Vals[i] = __half22float2(out_silu.elts[i]);
} else {
fp2Vals[i] = __bfloat1622float2(out_silu.elts[i]);
}
fp2Vals[i].x *= outputScale;
fp2Vals[i].y *= outputScale;
}
// Convert to e2m1 values.
uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
// Write the e2m1 values to global memory.
return e2m1Vec;
#else
return 0;
#endif
}
// Use UE4M3 by default.
template <class Type, bool UE8M0_SF = false>
__global__ void
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
__launch_bounds__(1024, 4) silu_and_cvt_fp16_to_fp4(
#else
silu_and_cvt_fp16_to_fp4(
#endif
int32_t numRows, int32_t numCols, Type const* in, float const* SFScale,
uint32_t* out, uint32_t* SFout) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
using PackedVec = PackedVec<Type>;
static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
(CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
"Vec size is not matched.");
// Get the global scaling factor, which will be applied to the SF.
// Note SFScale is the same as next GEMM's alpha, which is
// (448.f / (Alpha_A / 6.f)).
float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
// Input tensor row/col loops.
for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD;
colIdx += blockDim.x) {
int64_t inOffset =
rowIdx * (numCols * 2 / CVT_FP4_ELTS_PER_THREAD) + colIdx;
int64_t inOffset2 = rowIdx * (numCols * 2 / CVT_FP4_ELTS_PER_THREAD) +
numCols / CVT_FP4_ELTS_PER_THREAD + colIdx;
PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
PackedVec in_vec2 = reinterpret_cast<PackedVec const*>(in)[inOffset2];
// Get the output tensor offset.
// Same as inOffset because 8 elements are packed into one uint32_t.
int64_t outOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
auto& out_pos = out[outOffset];
auto sf_out =
cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
CVT_FP4_NUM_THREADS_PER_SF>(
rowIdx, colIdx, numCols, SFout);
out_pos = silu_and_cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(
in_vec, in_vec2, SFScaleVal, sf_out);
}
}
#endif
}
} // namespace vllm
void silu_and_mul_nvfp4_quant(torch::Tensor& output, // [..., d]
torch::Tensor& output_sf,
torch::Tensor& input, // [..., 2 * d]
torch::Tensor& input_sf) {
TORCH_CHECK(input.dtype() == torch::kFloat16 ||
input.dtype() == torch::kBFloat16);
int32_t m = input.size(0);
int32_t n = input.size(1) / 2;
TORCH_CHECK(n % 16 == 0, "The N dimension must be a multiple of 16.");
int multiProcessorCount =
get_device_attribute(cudaDevAttrMultiProcessorCount, -1);
auto input_sf_ptr = static_cast<float const*>(input_sf.data_ptr());
auto sf_out = static_cast<int32_t*>(output_sf.data_ptr());
auto output_ptr = static_cast<int64_t*>(output.data_ptr());
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
int const numBlocksPerSM = 2048 / block.x;
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
VLLM_DISPATCH_HALF_TYPES(
input.scalar_type(), "act_and_mul_quant_kernel", [&] {
auto input_ptr = reinterpret_cast<scalar_t const*>(input.data_ptr());
VLLM_DISPATCH_BYTE_TYPES(
output.scalar_type(), "fused_act_and_mul_quant_kernel_nvfp4_type",
[&] {
vllm::silu_and_cvt_fp16_to_fp4<scalar_t>
<<<grid, block, 0, stream>>>(
m, n, input_ptr, input_sf_ptr,
reinterpret_cast<uint32_t*>(output_ptr),
reinterpret_cast<uint32_t*>(sf_out));
});
});
}
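For reference, the following is a minimal pure-PyTorch sketch of the arithmetic this fused kernel implements: SiLU(x[:, :d]) * x[:, d:], followed by per-16-element NVFP4 group scaling. It is an illustration only: the final e2m1 rounding/packing and the swizzled scale-factor layout produced by `cvt_quant_to_fp4_get_sf_out_offset` are deliberately omitted, and `silu_and_mul_nvfp4_reference` is a hypothetical helper, not part of vLLM.

```python
# Minimal PyTorch sketch of what the fused kernel computes (illustration only).
import torch

FP4_GROUP_SIZE = 16   # 16 elements share one scale factor (CVT_FP4_SF_VEC_SIZE)
FP4_MAX = 6.0         # largest magnitude representable in e2m1


def silu_and_mul_nvfp4_reference(x: torch.Tensor, global_scale: torch.Tensor):
    """x: [m, 2*d] fp16/bf16; global_scale: scalar fp32 tensor (same role as SFScale)."""
    d = x.shape[-1] // 2
    # SiLU on the first half, elementwise product with the second half.
    y = torch.nn.functional.silu(x[..., :d].float()) * x[..., d:].float()

    # Per-group absolute maximum along the last dimension.
    groups = y.view(*y.shape[:-1], d // FP4_GROUP_SIZE, FP4_GROUP_SIZE)
    vec_max = groups.abs().amax(dim=-1)

    # Scale factor per group, rounded through fp8 e4m3 as in the kernel.
    sf = (global_scale * vec_max / FP4_MAX).to(torch.float8_e4m3fn)

    # Values the kernel would round to e2m1 and pack two per byte.
    out_scale = torch.where(
        sf.float() != 0, global_scale / sf.float(), torch.zeros_like(vec_max))
    scaled = groups * out_scale.unsqueeze(-1)
    return scaled.view_as(y), sf
```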

View File

@ -115,6 +115,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()");
ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant);
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
ops.def(
"silu_and_mul_nvfp4_quant(Tensor! result, Tensor! result_block_scale, "
"Tensor input, Tensor input_global_scale) -> ()");
ops.impl("silu_and_mul_nvfp4_quant", torch::kCUDA, &silu_and_mul_nvfp4_quant);
#endif
ops.def("mul_and_silu(Tensor! out, Tensor input) -> ()");
ops.impl("mul_and_silu", torch::kCUDA, &mul_and_silu);
@ -686,6 +694,16 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
" Tensor scale) -> ()");
cache_ops.impl("concat_and_cache_mla", torch::kCUDA, &concat_and_cache_mla);
cache_ops.def(
"cp_fused_concat_and_cache_mla(Tensor kv_c, Tensor k_pe,"
" Tensor cp_local_token_select_indices,"
" Tensor! kv_cache,"
" Tensor slot_mapping,"
" str kv_cache_dtype,"
" Tensor scale) -> ()");
cache_ops.impl("cp_fused_concat_and_cache_mla", torch::kCUDA,
&cp_fused_concat_and_cache_mla);
// Convert the key and value cache to fp8 data type.
cache_ops.def(
"convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
@ -702,6 +720,11 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
" Tensor scale, Tensor? seq_starts) -> ()");
cache_ops.impl("gather_and_maybe_dequant_cache", torch::kCUDA,
&gather_and_maybe_dequant_cache);
cache_ops.def(
"cp_gather_cache(Tensor src_cache, Tensor! dst, Tensor block_table, "
"Tensor cu_seq_lens, int batch_size, Tensor? seq_starts) -> ()");
cache_ops.impl("cp_gather_cache", torch::kCUDA, &cp_gather_cache);
}
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {

View File

@ -432,11 +432,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# Install DeepGEMM from source
ARG DEEPGEMM_GIT_REF="7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c"
ARG DEEPGEMM_GIT_REF
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
RUN --mount=type=cache,target=/root/.cache/uv \
VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "${DEEPGEMM_GIT_REF}" \
&& rm /tmp/install_deepgemm.sh
VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}
# Install EP kernels(pplx-kernels and DeepEP), NixL
COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh

View File

@ -175,7 +175,7 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u
Known supported models:
- Llama4 (<gh-pr:18368>)
- MiniCPM-V-4 (<gh-pr:23327>)
- MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
- Qwen2.5-VL (<gh-pr:22742>)
- Step3 (<gh-pr:22697>)

View File

@ -90,7 +90,7 @@ address the long build time at its source, the current workaround is to set `VLL
to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`)
when manually triggering a build on Buildkite. This branch accomplishes two things:
1. Increase the timeout limit to 10 hours so that the build doesn't timeout.
1. Increase the timeout limit to 10 hours so that the build doesn't time out.
2. Allow the compiled artifacts to be written to the vLLM sccache S3 bucket
to warm it up so that future builds are faster.

View File

@ -121,3 +121,31 @@ To support a model with interleaving sliding windows, we need to take care of th
- In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171).
With these two steps, interleaved sliding windows should work with the model.
### How to support models that use Mamba?
We consider 3 different scenarios:
1. Models that use Mamba layers (either Mamba-1 or Mamba-2) but do not use attention layers.
2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers.
3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers.
For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](gh-file:vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](gh-file:vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
For the mamba layers themselves, please use the [`MambaMixer`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
V0-only classes and code will be removed in the very near future.
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in <gh-file:vllm/model_executor/models/config.py> to ensure that the runtime defaults are optimized.
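In code, case (1) boils down to a skeleton like the following (a rough sketch only: the class name, import locations and method signatures are illustrative, and `mamba.py`/`mamba2.py` remain the authoritative references):

```python
# Illustrative skeleton for case (1); signatures are approximate, not the exact vLLM interfaces.
from torch import nn

from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2  # used in the decoder blocks (not shown)
from vllm.model_executor.models.interfaces import IsAttentionFree  # assumed location


class MyMambaForCausalLM(nn.Module, IsAttentionFree):
    """Case (1): Mamba-only model, no attention layers."""

    @classmethod
    def get_mamba_state_dtype_from_config(cls, vllm_config):
        ...  # dtype(s) of the Mamba state, derived from the config

    @classmethod
    def get_mamba_state_shape_from_config(cls, vllm_config):
        ...  # per-layer state shape(s), derived from the config

    # Build the decoder layers with MambaMixer2 (or MambaMixer for Mamba-1) and
    # implement forward()/load_weights() as in the referenced models. Also add
    # the model to MODELS_CONFIG_MAP, as noted above.
```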
For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](gh-file:vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](gh-file:vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol).
For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](gh-file:vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](gh-file:vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
Please follow the same guidelines as case (2) for implementing these models.
We use "mamba-like" to refer to layers that posses a state that is updated in-place, rather than being appended-to (like KV cache for attention).
For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
Please see [`LinearAttentionMetadata`](gh-file:vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](gh-file:v1/attention/backends/short_conv_attn.py) for examples of this.
Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it.
Please see the calls to `direct_register_custom_op` in <gh-file:vllm/model_executor/models/minimax_text_01.py> or <gh-file:vllm/model_executor/layers/mamba/short_conv.py> for examples of this.
The new custom op should then be added to the list `_attention_ops` in <gh-file:vllm/config/compilation.py> to ensure that piecewise CUDA graphs work as intended.
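For case (3), the pieces above fit together roughly as follows (again a sketch: the class name, the `MambaBase` import location, and the method signatures are illustrative rather than the exact vLLM interfaces; `ShortConv` and `MiniMaxText01LinearAttention` are the ground truth):

```python
# Rough, illustrative skeleton of a custom mamba-like layer.
from torch import nn

from vllm.model_executor.layers.mamba.abstract import MambaBase  # assumed location


class MyShortConvLikeLayer(MambaBase, nn.Module):
    """A hypothetical mamba-like layer whose state is updated in place."""

    @property
    def mamba_type(self) -> str:
        return "my_short_conv"  # identifies the layer family

    def get_state_dtype(self):
        ...  # dtype(s) of the in-place state, computed at runtime

    def get_state_shape(self):
        ...  # shape(s) of the in-place state, computed at runtime

    def get_attn_backend(self):
        ...  # backend whose metadata class is shared across these layers

    # The forward computation should be wrapped in a custom op registered via
    # direct_register_custom_op (see the linked examples), and the op name
    # added to _attention_ops in vllm/config/compilation.py so that piecewise
    # CUDA graphs handle it correctly.
```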

View File

@ -855,7 +855,7 @@ Examples:
### Custom HF processor
Some models don't define a HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to [_call_hf_processor][vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor].
Some models don't define an HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to [_call_hf_processor][vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor].
Examples:

View File

@ -73,6 +73,8 @@ apt install nsight-systems-cli
### Example commands and usage
When profiling with `nsys`, it is advisable to set the environment variable `VLLM_WORKER_MULTIPROC_METHOD=spawn`. The default is to use the `fork` method instead of `spawn`. More information on the topic can be found in the [Nsight Systems release notes](https://docs.nvidia.com/nsight-systems/ReleaseNotes/index.html#general-issues).
#### Offline Inference
For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` before any existing script you would run for offline inference.

View File

@ -6,6 +6,6 @@ Supports speech-synthesis, multi-modal, and extensible (function call) plugin sy
One-click FREE deployment of your private OpenAI ChatGPT/Claude/Gemini/Groq/Ollama chat application.
It supports vLLM as a AI model provider to efficiently serve large language models.
It supports vLLM as an AI model provider to efficiently serve large language models.
For details, see the tutorial [Using vLLM in LobeChat](https://lobehub.com/docs/usage/providers/vllm).

View File

@ -380,7 +380,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
### Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated"
If the startup or readiness probe failureThreshold is too low for the time needed to startup the server, Kubernetes scheduler will kill the container. A couple of indications that this has happened:
If the startup or readiness probe failureThreshold is too low for the time needed to start up the server, Kubernetes scheduler will kill the container. A couple of indications that this has happened:
1. container log contains "KeyboardInterrupt: terminated"
2. `kubectl get events` shows message `Container $NAME failed startup probe, will be restarted`

View File

@ -138,7 +138,7 @@ Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & C
#### Step 1: Add an All2All manager
The purpose of the All2All Manager is to setup the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](gh-file:vllm/distributed/device_communicators/all2all.py).
The purpose of the All2All Manager is to set up the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](gh-file:vllm/distributed/device_communicators/all2all.py).
#### Step 2: Add a FusedMoEPrepareAndFinalize Type

View File

@ -0,0 +1,78 @@
# IO Processor Plugins
IO Processor plugins are a feature that allows pre- and post-processing of the model input and output for pooling models. The idea is that users can pass a custom input to vLLM that is converted into one or more model prompts and fed to the model's `encode` method. One potential use case for such plugins is using vLLM to generate multi-modal data: for example, a user feeds an image to vLLM and gets an image back as output.
When performing inference with IO Processor plugins, the prompt type is defined by the plugin, and the same applies to the final request output. vLLM does not perform any validation of input/output data, and it is up to the plugin to ensure the correct data is fed to the model and returned to the user. As of now these plugins support only pooling models and can be triggered via the `encode` method in `LLM` and `AsyncLLM`, or in online serving mode via the `/pooling` endpoint.
## Writing an IO Processor Plugin
IO Processor plugins implement the `IOProcessor` interface (<gh-file:vllm/plugins/io_processors/interface.py>):
```python
IOProcessorInput = TypeVar('IOProcessorInput')
IOProcessorOutput = TypeVar('IOProcessorOutput')
class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
def __init__(self, vllm_config: VllmConfig):
self.vllm_config = vllm_config
@abstractmethod
def pre_process(
self,
prompt: IOProcessorInput,
request_id: Optional[str] = None,
**kwargs,
) -> Union[PromptType, Sequence[PromptType]]:
raise NotImplementedError
async def pre_process_async(
self,
prompt: IOProcessorInput,
request_id: Optional[str] = None,
**kwargs,
) -> Union[PromptType, Sequence[PromptType]]:
return self.pre_process(prompt, request_id, **kwargs)
@abstractmethod
def post_process(self,
model_output: Sequence[PoolingRequestOutput],
request_id: Optional[str] = None,
**kwargs) -> IOProcessorOutput:
raise NotImplementedError
async def post_process_async(
self,
model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
request_id: Optional[str] = None,
**kwargs,
) -> IOProcessorOutput:
collected_output = [item async for i, item in model_output]
return self.post_process(collected_output, request_id, **kwargs)
@abstractmethod
def parse_request(self, request: Any) -> IOProcessorInput:
raise NotImplementedError
@abstractmethod
def output_to_response(
self, plugin_output: IOProcessorOutput) -> IOProcessorResponse:
raise NotImplementedError
```
The `parse_request` method is used for validating the user prompt and converting it into the input expected by the `pre_process`/`pre_process_async` methods.
The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference.
The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output.
The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/io_processor_pooling` serving endpoint is [here](../../vllm/entrypoints/openai/serving_pooling_with_io_plugin.py).
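To make the flow concrete, here is a deliberately trivial, hypothetical plugin sketch (the class name, the string-in/string-out types and the method bodies are invented for illustration; the real plugin referenced below is the better template):

```python
# Hypothetical toy plugin: treats the custom input and output as plain strings.
from typing import Any, Optional

from vllm.plugins.io_processors.interface import IOProcessor


class EchoPlugin(IOProcessor[str, str]):
    def parse_request(self, request: Any) -> str:
        # Validate/convert the raw user request into the plugin input type.
        return str(request)

    def pre_process(self, prompt: str, request_id: Optional[str] = None, **kwargs):
        # Turn the plugin input into one (or more) vLLM prompts.
        return {"prompt": prompt}

    def post_process(self, model_output, request_id: Optional[str] = None, **kwargs) -> str:
        # Collapse the PoolingRequestOutput objects into the plugin output type.
        return str(model_output[0].outputs)

    def output_to_response(self, plugin_output: str):
        # Online serving only: wrap the plugin output into an IOProcessorResponse.
        ...
```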
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please also refer to our [online](../../examples/online_serving/prithvi_geospatial_mae.py) and [offline](../../examples/offline_inference/prithvi_geospatial_mae_io_processor.py) inference examples.
## Using an IO Processor plugin
IO Processor plugins are loaded at engine startup and there are two methods for specifying the name of the plugin to be loaded:
1. Via vLLM's `EngineArgs`: setting the `io_processor_plugin` argument in the `EngineArgs` used to initialize the `AsyncLLM`. The same can be achieved by passing the `io_processor_plugin` argument to `LLM` in offline mode, or by passing the `--io-processor-plugin` argument in serving mode.
2. Via the model HF configuration: adding an `io_processor_plugin` field to the model config (config.json).
This order also determines the priority: setting the plugin name via `EngineArgs` overrides any plugin name specified in the model HF config (config.json).
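For example, loading a plugin offline is just a matter of passing its name to `LLM` (a minimal sketch; the model and plugin names are the ones used by the Prithvi example elsewhere in this changeset):

```python
from vllm import LLM

llm = LLM(
    model="christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
    trust_remote_code=True,
    skip_tokenizer_init=True,
    enforce_eager=True,
    io_processor_plugin="prithvi_to_tiff_india",  # overrides config.json, per the note above
)
# llm.encode(...) now routes the plugin-defined input through the
# pre_process/post_process hooks before and after pooling.
```

In serving mode, the equivalent is the `--io-processor-plugin` command-line argument.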

View File

@ -99,11 +99,11 @@ http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201
### Multi-process Mode
In v0, metrics are collected in the engine core process and we use multi-process mode to make them available in the API server process. See <gh-pr:7279>.
In v0, metrics are collected in the engine core process and we use multiprocess mode to make them available in the API server process. See <gh-pr:7279>.
### Built in Python/Process Metrics
The following metrics are supported by default by `prometheus_client`, but they are not exposed when multi-process mode is used:
The following metrics are supported by default by `prometheus_client`, but they are not exposed when multiprocess mode is used:
- `python_gc_objects_collected_total`
- `python_gc_objects_uncollectable_total`

View File

@ -49,6 +49,8 @@ Every plugin has three parts:
- **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported.
- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for pooling models. The plugin function returns the fully qualified name of the IOProcessor class.
## Guidelines for Writing Plugins
- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes.

View File

@ -52,7 +52,7 @@ Check out <gh-file:examples/offline_inference/multilora_inference.py> for an exa
## Serving LoRA Adapters
LoRA adapted models can also be served with the Open-AI compatible vLLM server. To do so, we use
`--lora-modules {name}={path} {name}={path}` to specify each LoRA module when we kickoff the server:
`--lora-modules {name}={path} {name}={path}` to specify each LoRA module when we kick off the server:
```bash
vllm serve meta-llama/Llama-2-7b-hf \

View File

@ -13,6 +13,41 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:
- `prompt`: The prompt should follow the format that is documented on HuggingFace.
- `multi_modal_data`: This is a dictionary that follows the schema defined in [vllm.multimodal.inputs.MultiModalDataDict][].
### Stable UUIDs for Caching (multi_modal_uuids)
When using multi-modal inputs, vLLM normally hashes each media item by content to enable caching across requests. You can optionally pass `multi_modal_uuids` to provide your own stable IDs for each item so caching can reuse work across requests without rehashing the raw content.
??? code
```python
from vllm import LLM
from PIL import Image
# Qwen2.5-VL example with two images
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct")
prompt = "USER: <image><image>\nDescribe the differences.\nASSISTANT:"
img_a = Image.open("/path/to/a.jpg")
img_b = Image.open("/path/to/b.jpg")
outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": {"image": [img_a, img_b]},
# Provide stable IDs for caching.
# Requirements (matched by this example):
# - Include every modality present in multi_modal_data.
# - For lists, provide the same number of entries.
# - Use None to fall back to content hashing for that item.
"multi_modal_uuids": {"image": ["sku-1234-a", None]},
})
for o in outputs:
print(o.outputs[0].text)
```
!!! warning
If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored.
### Image Inputs
You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
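As a minimal sketch (mirroring the snippet above; the model, prompt format and file path are placeholders):

```python
from vllm import LLM
from PIL import Image

llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct")

outputs = llm.generate({
    "prompt": "USER: <image>\nWhat is in this image?\nASSISTANT:",
    # A single PIL image is passed directly under the 'image' key.
    "multi_modal_data": {"image": Image.open("/path/to/a.jpg")},
})
print(outputs[0].outputs[0].text)
```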

View File

@ -143,7 +143,7 @@ OpenAI Python client library does not officially support `reasoning_content` att
print(content, end="", flush=True)
```
Remember to check whether the `reasoning_content` exists in the response before accessing it. You could checkout the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
Remember to check whether the `reasoning_content` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
## Tool Calling

View File

@ -205,7 +205,7 @@ This section covers the OpenAI beta wrapper over the `client.chat.completions.cr
At the time of writing (`openai==1.54.4`), this is a "beta" feature in the OpenAI client library. Code reference can be found [here](https://github.com/openai/openai-python/blob/52357cff50bee57ef442e94d78a0de38b4173fc2/src/openai/resources/beta/chat/completions.py#L100-L104).
For the following examples, vLLM was setup using `vllm serve meta-llama/Llama-3.1-8B-Instruct`
For the following examples, vLLM was set up using `vllm serve meta-llama/Llama-3.1-8B-Instruct`
Here is a simple example demonstrating how to get structured output using Pydantic models:

View File

@ -140,8 +140,8 @@ Alternatively, users can directly call the NxDI library to trace and compile you
- `NEURON_COMPILED_ARTIFACTS`: set this environment variable to point to your pre-compiled model artifacts directory to avoid
compilation time upon server initialization. If this variable is not set, the Neuron module will perform compilation and save the
artifacts under `neuron-compiled-artifacts/{unique_hash}/` sub-directory in the model path. If this environment variable is set,
but the directory does not exist, or the contents are invalid, Neuron will also fallback to a new compilation and store the artifacts
artifacts under `neuron-compiled-artifacts/{unique_hash}/` subdirectory in the model path. If this environment variable is set,
but the directory does not exist, or the contents are invalid, Neuron will also fall back to a new compilation and store the artifacts
under this specified path.
- `NEURON_CONTEXT_LENGTH_BUCKETS`: Bucket sizes for context encoding. (Only applicable to `transformers-neuronx` backend).
- `NEURON_TOKEN_GEN_BUCKETS`: Bucket sizes for token generation. (Only applicable to `transformers-neuronx` backend).

View File

@ -96,6 +96,7 @@ Currently, there are no pre-built CPU wheels.
- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. Default value is `0`.
- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads, can be set as CPU id lists or `auto` (by default). For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node respectively.
- `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. Default value is `None`. If the value is not set and use `auto` thread binding, no CPU will be reserved for `world_size == 1`, 1 CPU per rank will be reserved for `world_size > 1`.
- `CPU_VISIBLE_MEMORY_NODES`: specify visible NUMA memory nodes for vLLM CPU workers, similar to `CUDA_VISIBLE_DEVICES`. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. It provides more control over the auto thread-binding feature, such as masking nodes and changing the node binding sequence.
- `VLLM_CPU_MOE_PREPACK` (x86 only): whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).
- `VLLM_CPU_SGL_KERNEL` (x86 only, Experimental): whether to use small-batch optimized kernels for linear layer and MoE layer, especially for low-latency requirements like online serving. The kernels require AMX instruction set, BFloat16 weight type and weight shapes divisible by 32. Default is `0` (False).
@ -179,7 +180,7 @@ Inference batch size is an important parameter for the performance. Larger batch
- Offline Inference: `256 * world_size`
- Online Serving: `128 * world_size`
vLLM CPU supports tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use TP and PP together if there are enough CPU sockets and memory nodes.
vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details on tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommended to use DP, TP and PP together if there are enough CPU sockets and memory nodes.
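As a rough sketch of what a 2-way TP deployment on a dual-socket host can look like (the model name and core ranges are placeholders; see the environment variables described above):

```python
import os

# One OpenMP core range per TP rank (placeholder ranges), plus KV cache space.
os.environ["VLLM_CPU_OMP_THREADS_BIND"] = "0-31|32-63"
os.environ["VLLM_CPU_KVCACHE_SPACE"] = "40"  # 40 GiB for the KV cache

from vllm import LLM

llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", tensor_parallel_size=2)
print(llm.generate("Hello, my name is")[0].outputs[0].text)
```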
### Which quantization configs does vLLM CPU support?

View File

@ -1,6 +1,6 @@
# --8<-- [start:installation]
vLLM has experimental support for macOS with Apple silicon. For now, users must build from source to natively run on macOS.
vLLM has experimental support for macOS with Apple Silicon. For now, users must build from source to natively run on macOS.
Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.

View File

@ -16,8 +16,8 @@ cd vllm_source
Third, install required dependencies:
```bash
uv pip install -r requirements/cpu-build.txt --torch-backend auto
uv pip install -r requirements/cpu.txt --torch-backend auto
uv pip install -r requirements/cpu-build.txt --torch-backend cpu
uv pip install -r requirements/cpu.txt --torch-backend cpu
```
??? console "pip"

View File

@ -43,7 +43,7 @@ docker build -f docker/Dockerfile.cpu \
# Launching OpenAI server
docker run --rm \
--privileged=true \
--security-opt seccomp=unconfined \
--shm-size=4g \
-p 8000:8000 \
-e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \

View File

@ -48,7 +48,7 @@ uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VE
#### Install the latest code
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`.
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`.
```bash
uv pip install -U vllm \

View File

@ -149,7 +149,7 @@ Build a docker image from <gh-file:docker/Dockerfile.rocm_base> which setup ROCm
**This step is optional as this rocm_base image is usually prebuilt and stored at [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev) under tag `rocm/vllm-dev:base` to speed up user experience.**
If you choose to build this rocm_base image yourself, the steps are as follows.
It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to set up buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
```json
{
@ -170,7 +170,7 @@ DOCKER_BUILDKIT=1 docker build \
#### Build an image with vLLM
First, build a docker image from <gh-file:docker/Dockerfile.rocm> and launch a docker container from the image.
It is important that the user kicks off the docker build using buildkit. Either the user put `DOCKER_BUILDKIT=1` as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
It is important that the user kicks off the docker build using buildkit. Either the user put `DOCKER_BUILDKIT=1` as environment variable when calling docker build command, or the user needs to set up buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
```bash
{

View File

@ -258,4 +258,4 @@ Expected output:
{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}}
```
A openai client example can be found here: <gh-file:examples/online_serving/openai_embedding_matryoshka_fy.py>
An OpenAI client example can be found here: <gh-file:examples/online_serving/openai_embedding_matryoshka_fy.py>

View File

@ -40,7 +40,7 @@ If it is `TransformersForCausalLM` or `TransformersForMultimodalLM` then it mean
#### Custom models
If a model is neither supported natively by vLLM or Transformers, it can still be used in vLLM!
If a model is neither supported natively by vLLM nor Transformers, it can still be used in vLLM!
For a model to be compatible with the Transformers backend for vLLM it must:
@ -335,9 +335,9 @@ th {
| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ |
| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | | ✅︎ | ✅︎ |
| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | | ✅︎ | ✅︎ |
| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3`, etc. | | ✅︎ | ✅︎ |
| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ |
| `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ |
@ -358,7 +358,7 @@ th {
| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ |
| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ | ✅︎ |
| `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b`, `openai/gpt-oss-20b` | | | ✅︎ |
| `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b`, `openai/gpt-oss-20b` | | ✅︎ | ✅︎ |
| `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | ✅︎ |
@ -497,6 +497,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | ✅︎ |
| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | ✅︎ |
| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | | | ✅︎ |
| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | ✅︎ |
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | ✅︎ |
@ -513,6 +514,9 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
vllm serve BAAI/bge-reranker-v2-gemma --hf_overrides '{"architectures": ["GemmaForSequenceClassification"],"classifier_from_token": ["Yes"],"method": "no_post_processing"}'
```
!!! note
The second-generation GTE model (mGTE-TRM) is named `NewForSequenceClassification`. Because the name `NewForSequenceClassification` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewForSequenceClassification"]}'` to specify the use of the `GteNewForSequenceClassification` architecture.
!!! note
Load the official original `mxbai-rerank-v2` by using the following command.
@ -630,7 +634,8 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ | ✅︎ |
| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ | ✅︎ |
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | ✅︎ |
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
| `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + I<sup>E+</sup> | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ | ✅︎ |

View File

@ -51,7 +51,7 @@ tail ~/.config/vllm/usage_stats.json
## Opting out
You can opt-out of usage stats collection by setting the `VLLM_NO_USAGE_STATS` or `DO_NOT_TRACK` environment variable, or by creating a `~/.config/vllm/do_not_track` file:
You can opt out of usage stats collection by setting the `VLLM_NO_USAGE_STATS` or `DO_NOT_TRACK` environment variable, or by creating a `~/.config/vllm/do_not_track` file:
```bash
# Any of the following methods can disable usage stats collection

View File

@ -107,16 +107,14 @@ to enable simultaneous generation and embedding using the same engine instance i
#### Mamba Models
Models using selective state-space mechanisms instead of standard transformer attention are supported.
Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported.
Please note that prefix caching is not yet supported for these models.
Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`, `FalconMambaForCausalLM`) are supported.
Models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
Hybrid models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`).
Please note that prefix caching is not yet supported for these models.
Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`).
Please note that prefix caching is not yet supported for these models.
It is also necessary to enforce eager mode for these models in V1.
Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`, `Lfm2ForCausalLM`).
Please note that prefix caching is not yet supported for any of the above models.
#### Encoder-Decoder Models

View File

@ -23,7 +23,7 @@ def create_test_prompts(
2 requests for base model, 4 requests for the LoRA. We define 2
different LoRA adapters (using the same model for demo purposes).
Since we also set `max_loras=1`, the expectation is that the requests
with the second LoRA adapter will be ran after all requests with the
with the second LoRA adapter will be run after all requests with the
first adapter have finished.
"""
return [

View File

@ -0,0 +1,60 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import os
import torch
from vllm import LLM
from vllm.pooling_params import PoolingParams
# This example shows how to perform an offline inference that generates
# multimodal data. In this specific case this example will take a geotiff
# image as input, process it using the multimodal data processor, and
# perform inference.
# Requirement - install plugin at:
# https://github.com/christian-pinto/prithvi_io_processor_plugin
def main():
torch.set_default_dtype(torch.float16)
image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif" # noqa: E501
img_prompt = dict(
data=image_url,
data_format="url",
image_format="tiff",
out_data_format="b64_json",
)
llm = LLM(
model="christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
skip_tokenizer_init=True,
trust_remote_code=True,
enforce_eager=True,
# Limit the maximum number of parallel requests
# to avoid the model going OOM.
# The maximum number depends on the available GPU memory
max_num_seqs=32,
io_processor_plugin="prithvi_to_tiff_india",
)
pooling_params = PoolingParams(task="encode", softmax=False)
pooler_output = llm.encode(
img_prompt,
pooling_params=pooling_params,
)
output = pooler_output[0].outputs
print(output)
decoded_data = base64.b64decode(output.data)
file_path = os.path.join(os.getcwd(), "offline_prediction.tiff")
with open(file_path, "wb") as f:
f.write(decoded_data)
print(f"Output file path: {file_path}")
if __name__ == "__main__":
main()

View File

@ -138,7 +138,7 @@ def main():
sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len)
if not args.custom_mm_prompts:
outputs = llm.generate(
TokensPrompt(prompt_token_ids=prompt_ids),
[TokensPrompt(prompt_token_ids=x) for x in prompt_ids],
sampling_params=sampling_params,
)
else:

View File

@ -683,6 +683,37 @@ def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
)
# Keye-VL-1.5
def run_keye_vl1_5(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Kwai-Keye/Keye-VL-1.5-8B"
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
trust_remote_code=True,
limit_mm_per_prompt={modality: 1},
)
if modality == "image":
placeholder = "<|image_pad|>"
elif modality == "video":
placeholder = "<|video_pad|>"
prompts = [
(
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
for question in questions
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Kimi-VL
def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@ -1648,6 +1679,7 @@ model_example_map = {
"interns1": run_interns1,
"internvl_chat": run_internvl,
"keye_vl": run_keye_vl,
"keye_vl1_5": run_keye_vl1_5,
"kimi_vl": run_kimi_vl,
"llama4": run_llama4,
"llava": run_llava,

View File

@ -542,6 +542,43 @@ def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "Kwai-Keye/Keye-VL-1_5-8B"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=8192,
max_num_seqs=5,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [
{
"role": "user",
"content": [
*placeholders,
{"type": "text", "text": question},
],
},
]
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
prompt = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_data = [fetch_image(url) for url in image_urls]
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=image_data,
)
def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "moonshotai/Kimi-VL-A3B-Instruct"
@ -1209,6 +1246,7 @@ model_example_map = {
"interns1": load_interns1,
"internvl_chat": load_internvl,
"keye_vl": load_keye_vl,
"keye_vl1_5": load_keye_vl1_5,
"kimi_vl": load_kimi_vl,
"llama4": load_llama4,
"llava": load_llava,

View File

@ -27,10 +27,12 @@ class BlockStored(KVCacheEvent):
token_ids: list[int]
block_size: int
lora_id: Optional[int]
medium: Optional[str]
class BlockRemoved(KVCacheEvent):
block_hashes: list[int]
medium: Optional[str]
class AllBlocksCleared(KVCacheEvent):

View File

@ -0,0 +1,54 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import os
import requests
# This example shows how to perform an online inference that generates
# multimodal data. In this specific case this example will take a geotiff
# image as input, process it using the multimodal data processor, and
# perform inference.
# Requirements:
# - install plugin at:
# https://github.com/christian-pinto/prithvi_io_processor_plugin
# - start vllm in serving mode with the below args
# --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
# --task embed --trust-remote-code
# --skip-tokenizer-init --enforce-eager
# --io-processor-plugin prithvi_to_tiff_india
def main():
image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif" # noqa: E501
server_endpoint = "http://localhost:8000/pooling"
request_payload_url = {
"data": {
"data": image_url,
"data_format": "url",
"image_format": "tiff",
"out_data_format": "b64_json",
},
"priority": 0,
"model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
}
ret = requests.post(server_endpoint, json=request_payload_url)
print(f"response.status_code: {ret.status_code}")
print(f"response.reason:{ret.reason}")
response = ret.json()
decoded_image = base64.b64decode(response["data"]["data"])
out_path = os.path.join(os.getcwd(), "online_prediction.tiff")
with open(out_path, "wb") as f:
f.write(decoded_image)
if __name__ == "__main__":
main()

View File

@ -6,7 +6,7 @@ requires = [
"packaging>=24.2",
"setuptools>=77.0.3,<80.0.0",
"setuptools-scm>=8.0",
"torch == 2.7.1",
"torch == 2.8.0",
"wheel",
"jinja2",
]

View File

@ -4,7 +4,8 @@ ninja
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
torch==2.7.1
torch==2.8.0
wheel
jinja2>=3.1.6
regex
build

View File

@ -9,17 +9,16 @@ packaging>=24.2
setuptools>=77.0.3,<80.0.0
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
torch==2.7.0; platform_system == "Darwin"
torch==2.7.0; platform_machine == "ppc64le"
torch==2.6.0; platform_machine == "aarch64" # for arm64 CPUs, torch 2.7.0 has a issue: https://github.com/vllm-project/vllm/issues/17960
torch==2.8.0; platform_system == "Darwin"
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
torchaudio==2.7.0; platform_machine == "ppc64le"
torchaudio==2.8.0; platform_machine == "ppc64le"
# required for the image processor of phi3v, this must be updated alongside torch
torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
torchvision==0.22.0; platform_machine == "ppc64le"
torchvision==0.23.0; platform_machine == "ppc64le"
datasets # for benchmark scripts
# Intel Extension for PyTorch, only for x86_64 CPUs

View File

@ -6,9 +6,9 @@ numba == 0.61.2; python_version > '3.9'
# Dependencies for NVIDIA GPUs
ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
torch==2.7.1
torchaudio==2.7.1
torch==2.8.0
torchaudio==2.8.0
# These must be updated alongside torch
torchvision==0.22.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
# https://github.com/facebookresearch/xformers/releases/tag/v0.0.31
xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7
torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
# https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1
xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8

View File

@ -1,10 +1,10 @@
# Common dependencies
-r common.txt
--extra-index-url https://download.pytorch.org/whl/rocm6.2.4
torch==2.7.0
torchvision==0.22.0
torchaudio==2.7.0
--extra-index-url https://download.pytorch.org/whl/rocm6.3
torch==2.8.0
torchvision==0.23.0
torchaudio==2.8.0
triton==3.3.0
cmake>=3.26.1,<4

View File

@ -22,9 +22,9 @@ sentence-transformers # required for embedding tests
soundfile # required for audio tests
jiwer # required for audio tests
timm >=1.0.17 # required for internvl and gemma3n-mm test
torch==2.7.1
torchaudio==2.7.1
torchvision==0.22.1
torch==2.8.0
torchaudio==2.8.0
torchvision==0.23.0
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.8.2 # required for voxtral test

View File

@ -541,42 +541,42 @@ numpy==1.26.4
# tritonclient
# vocos
# xarray
nvidia-cublas-cu12==12.8.3.14
nvidia-cublas-cu12==12.8.4.1
# via
# nvidia-cudnn-cu12
# nvidia-cusolver-cu12
# torch
nvidia-cuda-cupti-cu12==12.8.57
nvidia-cuda-cupti-cu12==12.8.90
# via torch
nvidia-cuda-nvrtc-cu12==12.8.61
nvidia-cuda-nvrtc-cu12==12.8.93
# via torch
nvidia-cuda-runtime-cu12==12.8.57
nvidia-cuda-runtime-cu12==12.8.90
# via torch
nvidia-cudnn-cu12==9.7.1.26
nvidia-cudnn-cu12==9.10.2.21
# via torch
nvidia-cufft-cu12==11.3.3.41
nvidia-cufft-cu12==11.3.3.83
# via torch
nvidia-cufile-cu12==1.13.0.11
nvidia-cufile-cu12==1.13.1.3
# via torch
nvidia-curand-cu12==10.3.9.55
nvidia-curand-cu12==10.3.9.90
# via torch
nvidia-cusolver-cu12==11.7.2.55
nvidia-cusolver-cu12==11.7.3.90
# via torch
nvidia-cusparse-cu12==12.5.7.53
nvidia-cusparse-cu12==12.5.8.93
# via
# nvidia-cusolver-cu12
# torch
nvidia-cusparselt-cu12==0.6.3
nvidia-cusparselt-cu12==0.7.1
# via torch
nvidia-nccl-cu12==2.26.2
nvidia-nccl-cu12==2.27.3
# via torch
nvidia-nvjitlink-cu12==12.8.61
nvidia-nvjitlink-cu12==12.8.93
# via
# nvidia-cufft-cu12
# nvidia-cusolver-cu12
# nvidia-cusparse-cu12
# torch
nvidia-nvtx-cu12==12.8.55
nvidia-nvtx-cu12==12.8.90
# via torch
omegaconf==2.3.0
# via
@ -1069,7 +1069,7 @@ tomli==2.2.1
# via schemathesis
tomli-w==1.2.0
# via schemathesis
torch==2.7.1+cu128
torch==2.8.0+cu128
# via
# -r requirements/test.in
# accelerate
@ -1098,7 +1098,7 @@ torch==2.7.1+cu128
# torchvision
# vector-quantize-pytorch
# vocos
torchaudio==2.7.1+cu128
torchaudio==2.8.0+cu128
# via
# -r requirements/test.in
# encodec
@ -1111,7 +1111,7 @@ torchmetrics==1.7.4
# pytorch-lightning
# terratorch
# torchgeo
torchvision==0.22.1+cu128
torchvision==0.23.0+cu128
# via
# -r requirements/test.in
# lightly
@ -1152,7 +1152,7 @@ transformers==4.55.2
# transformers-stream-generator
transformers-stream-generator==0.0.5
# via -r requirements/test.in
triton==3.3.1
triton==3.4.0
# via torch
tritonclient==2.51.0
# via

View File

@ -4,32 +4,41 @@ import pytest
import torch
import vllm.envs as envs
from vllm.compilation.activation_quant_fusion import ActivationQuantFusionPass
from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
# yapf conflicts with isort for this block
# yapf: disable
from vllm.compilation.activation_quant_fusion import (
FUSED_OPS, SILU_MUL_OP, ActivationQuantFusionPass)
# yapf: enable
from vllm.compilation.fusion import QUANT_OPS
from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.config import CompilationConfig, PassConfig, VllmConfig
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape)
GroupShape, kFp8StaticTensorSym, kNvfp4Quant)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
Fp8LinearOp)
from vllm.platforms import current_platform
from .backend import TestBackend
FP8_DTYPE = current_platform.fp8_dtype()
FP4_DTYPE = torch.uint8
class TestModel(torch.nn.Module):
def __init__(self, hidden_size: int, force_fp8_e4m3fnuz: bool, *args,
**kwargs):
super().__init__(*args, **kwargs)
def is_nvfp4_supported():
return current_platform.has_device_capability(100)
class TestSiluMulFp8QuantModel(torch.nn.Module):
def __init__(self, hidden_size: int, force_fp8_e4m3fnuz: bool, **kwargs):
super().__init__()
self.silu_and_mul = SiluAndMul()
self.wscale = torch.rand(1, dtype=torch.float32)
self.scale = torch.rand(1, dtype=torch.float32)
self.w = (torch.rand(
hidden_size,
hidden_size).to(dtype=current_platform.fp8_dtype()).t())
self.w = torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
self.fp8_linear = Fp8LinearOp(
force_fp8_e4m3fnuz=force_fp8_e4m3fnuz,
@ -45,14 +54,56 @@ class TestModel(torch.nn.Module):
input_scale=self.wscale)
return x2
def ops_in_model_before(self):
return [SILU_MUL_OP, QUANT_OPS[kFp8StaticTensorSym]]
@pytest.mark.parametrize("num_tokens", [256])
@pytest.mark.parametrize("hidden_size", [64])
def ops_in_model_after(self):
return [FUSED_OPS[kFp8StaticTensorSym]]
class TestSiluMulNvfp4QuantModel(torch.nn.Module):
def __init__(self, hidden_size: int, **kwargs):
super().__init__()
self.silu_and_mul = SiluAndMul()
self.w = torch.randint(256, (hidden_size, hidden_size // 2),
dtype=FP4_DTYPE)
self.wscale = torch.randn(hidden_size,
hidden_size // 16).to(dtype=FP8_DTYPE)
self.wscale2 = torch.rand(1, dtype=torch.float32)
self.scale = torch.rand(1, dtype=torch.float32)
def forward(self, x):
y = self.silu_and_mul(x)
y_quant, y_block_scale = scaled_fp4_quant(y, 1 / self.scale)
out = cutlass_scaled_fp4_mm(a=y_quant,
b=self.w,
block_scale_a=y_block_scale,
block_scale_b=self.wscale,
alpha=self.scale * self.wscale2,
out_dtype=y.dtype)
return out
def ops_in_model_before(self):
return [SILU_MUL_OP, QUANT_OPS[kNvfp4Quant]]
def ops_in_model_after(self):
return [FUSED_OPS[kNvfp4Quant]]
@pytest.mark.parametrize("num_tokens", [64])
@pytest.mark.parametrize("hidden_size", [128])
@pytest.mark.parametrize(
"model_class", [TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel]
if is_nvfp4_supported() else [TestSiluMulFp8QuantModel])
@pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"],
reason="Only test on CUDA and ROCm")
def test_fusion_silu_and_mul_quant(num_tokens, hidden_size,
def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
force_fp8_e4m3fnuz):
if model_class == TestSiluMulNvfp4QuantModel and force_fp8_e4m3fnuz:
pytest.skip("Duplicate tests for NVFP4")
torch.set_default_device("cuda")
torch.set_default_dtype(torch.float16)
@ -63,7 +114,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size,
fusion_pass = ActivationQuantFusionPass(config)
backend = TestBackend(NoOpEliminationPass(config), fusion_pass)
model = TestModel(hidden_size, force_fp8_e4m3fnuz)
model = model_class(hidden_size=hidden_size,
force_fp8_e4m3fnuz=force_fp8_e4m3fnuz)
# First dimension dynamic
x = torch.rand(num_tokens, hidden_size * 2)
@ -80,17 +132,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size,
atol=1e-3,
rtol=1e-3)
# Check substitution worked
pre_nodes = backend.graph_pre_pass.nodes
post_nodes = backend.graph_post_pass.nodes
# In pre-nodes, quant op should be present and fused kernels should not
backend.check_before_ops(model.ops_in_model_before())
silu_and_mul_quant = torch.ops._C.silu_and_mul_quant.default
fp8_quant = torch.ops._C.static_scaled_fp8_quant.default
# In pre-nodes, fp8 quant should be present and fused kernels should not
assert find_auto_fn_maybe(pre_nodes, silu_and_mul_quant) is None
find_auto_fn(pre_nodes, fp8_quant)
# In post-nodes, fused kernels should be present and fp8 quant should not
find_auto_fn(post_nodes, silu_and_mul_quant)
assert find_auto_fn_maybe(post_nodes, fp8_quant) is None
# In post-nodes, fused kernels should be present and quant op should not
backend.check_after_ops(model.ops_in_model_after())
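The before/after assertions above delegate to vLLM's TestBackend helper. For readers outside the test suite, here is a hedged, standalone sketch (plain torch.fx with made-up names, count_calls and Toy, nothing from this diff) of checking whether a given op appears in a captured graph:

import operator

import torch
from torch import fx


def count_calls(gm: fx.GraphModule, target) -> int:
    # Count call_function nodes whose target is exactly `target`.
    return sum(1 for n in gm.graph.nodes
               if n.op == "call_function" and n.target is target)


class Toy(torch.nn.Module):

    def forward(self, x):
        return torch.relu(x) + torch.relu(x)


gm = fx.symbolic_trace(Toy())
assert count_calls(gm, torch.relu) == 2  # standalone ops present before a pass
assert count_calls(gm, operator.add) == 1

A fusion pass would then be expected to drive the counts of the standalone ops to zero and the count of the fused op above zero, which is roughly what check_before_ops/check_after_ops encapsulate.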

View File

@ -1,10 +1,11 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import math
import os
import tempfile
from enum import Enum
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast
import numpy as np
import pytest
@ -33,6 +34,7 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
from vllm.sequence import Logprob
from vllm.transformers_utils.utils import maybe_model_redirect
logger = init_logger(__name__)
@ -454,11 +456,10 @@ class HfRunner:
# output is final logits
all_inputs = self.get_inputs(prompts)
outputs = []
problem_type = getattr(self.config, "problem_type", "")
for inputs in all_inputs:
output = self.model(**self.wrap_device(inputs))
problem_type = getattr(self.config, "problem_type", "")
if problem_type == "regression":
logits = output.logits[0].tolist()
elif problem_type == "multi_label_classification":
@ -602,7 +603,7 @@ class HfRunner:
def _hidden_states_to_logprobs(
self,
hidden_states: tuple[tuple[torch.Tensor, ...], ...],
num_logprobs: int,
num_logprobs: Optional[int],
) -> tuple[list[dict[int, float]], int]:
seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
output_len = len(hidden_states)
@ -630,7 +631,7 @@ class HfRunner:
self,
prompts: list[str],
max_tokens: int,
num_logprobs: int,
num_logprobs: Optional[int],
images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None,
@ -677,7 +678,7 @@ class HfRunner:
self,
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
max_tokens: int,
num_logprobs: int,
num_logprobs: Optional[int],
images: Optional[PromptImageInput] = None,
**kwargs: Any,
) -> list[TokensTextLogprobs]:
@ -966,7 +967,7 @@ class VllmRunner:
self,
prompts: list[str],
max_tokens: int,
num_logprobs: int,
num_logprobs: Optional[int],
num_prompt_logprobs: Optional[int] = None,
images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None,
@ -991,11 +992,40 @@ class VllmRunner:
videos=videos,
**kwargs)
def generate_prompt_perplexity(self, prompts: list[str]) -> list[float]:
"""
Return the perplexity score associated with generating the prompts
:param prompts: list of prompts to score
:return: perplexity score of each prompt
"""
outputs = self.generate_greedy_logprobs(prompts,
max_tokens=1,
num_logprobs=None,
num_prompt_logprobs=0)
perplexities = []
for output in outputs:
output = cast(TokensTextLogprobsPromptLogprobs, output)
token_datas = cast(list[Optional[dict[int, Logprob]]], output[3])
assert token_datas[0] is None
token_log_probs = []
for token_data in token_datas[1:]:
assert token_data is not None
assert len(token_data) == 1
token_log_prob = list(token_data.values())[0].logprob
token_log_probs.append(token_log_prob)
perplexity = math.exp(-sum(token_log_probs) / len(token_log_probs))
perplexities.append(perplexity)
return perplexities
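For reference, the loop above computes the standard per-token perplexity (restated from the code, nothing new): with prompt-token log-probabilities $\log p(x_i \mid x_{<i})$ for $i = 2, \dots, N$ (the first prompt token has no logprob, hence the None check),

$$\mathrm{PPL} = \exp\Big(-\frac{1}{N-1}\sum_{i=2}^{N}\log p(x_i \mid x_{<i})\Big).$$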
def generate_encoder_decoder_greedy_logprobs(
self,
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
max_tokens: int,
num_logprobs: int,
num_logprobs: Optional[int],
num_prompt_logprobs: Optional[int] = None,
skip_special_tokens: bool = True,
) -> Union[list[TokensTextLogprobs],
@ -1090,6 +1120,9 @@ class VllmRunner:
return self.llm.llm_engine.collective_rpc(_apply_model)
def get_llm(self) -> LLM:
return self.llm
def __enter__(self):
return self
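With __enter__ returning the runner (the matching __exit__ falls outside this hunk), a runner can now be scoped explicitly in tests. A minimal usage sketch; the model name and limits below are placeholders, not taken from the diff:

with VllmRunner("facebook/opt-125m", max_num_seqs=8,
                enforce_eager=True) as vllm_model:
    outputs = vllm_model.generate_greedy_logprobs(["Hello, my name is"],
                                                  max_tokens=8,
                                                  num_logprobs=5)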

View File

@ -118,6 +118,8 @@ class PPTestSettings:
multi_node_only: bool = False,
load_format: Optional[str] = None,
):
vllm_major_versions = ["1"] if runner == "pooling" else ["0"]
return PPTestSettings(
parallel_setups=[
ParallelSetup(tp_size=tp_base,
@ -126,7 +128,7 @@ class PPTestSettings:
chunked_prefill=False),
],
distributed_backends=["mp"],
vllm_major_versions=["0"],
vllm_major_versions=vllm_major_versions,
runner=runner,
test_options=PPTestOptions(multi_node_only=multi_node_only,
load_format=load_format),
@ -213,7 +215,9 @@ TEXT_GENERATION_MODELS = {
EMBEDDING_MODELS = { # type: ignore[var-annotated]
# [Text-only]
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(runner="pooling"),
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"),
# TODO: re-enable when https://github.com/vllm-project/vllm/issues/23883
# is fixed
#"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"),
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
load_format="dummy", runner="pooling"
),

View File

@ -292,7 +292,7 @@ SP_TEST_MODELS = [
# TODO support other models
# [LANGUAGE GENERATION]
"meta-llama/Llama-3.2-1B-Instruct",
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
]

View File

@ -16,14 +16,6 @@ MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
prompts = ["The chef prepared a delicious meal."]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
@ -70,3 +62,9 @@ def test_encode_api(llm: LLM):
err_msg = "pooling_task must be one of.+"
with pytest.raises(ValueError, match=err_msg):
llm.encode(prompts, use_tqdm=False)
def test_score_api(llm: LLM):
err_msg = "Score API is only enabled for num_labels == 1."
with pytest.raises(ValueError, match=err_msg):
llm.score("ping", "pong", use_tqdm=False)

View File

@ -27,14 +27,6 @@ TOKEN_IDS = [
]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to

View File

@ -1,80 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.lora.request import LoRARequest
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
PROMPTS = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
LORA_NAME = "typeof/zephyr-7b-beta-lora"
@pytest.fixture(scope="module")
def monkeypatch_module():
from _pytest.monkeypatch import MonkeyPatch
mpatch = MonkeyPatch()
yield mpatch
mpatch.undo()
@pytest.fixture(scope="module", params=[False, True])
def llm(request, monkeypatch_module):
use_v1 = request.param
monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(model=MODEL_NAME,
tensor_parallel_size=1,
max_model_len=8192,
enable_lora=True,
max_loras=4,
max_lora_rank=64,
max_num_seqs=128,
enforce_eager=True)
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
@pytest.fixture(scope="module")
def zephyr_lora_files():
return snapshot_download(repo_id=LORA_NAME)
@pytest.mark.skip_global_cleanup
def test_multiple_lora_requests(llm: LLM, zephyr_lora_files):
lora_request = [
LoRARequest(LORA_NAME + str(idx), idx + 1, zephyr_lora_files)
for idx in range(len(PROMPTS))
]
# Multiple SamplingParams should be matched with each prompt
outputs = llm.generate(PROMPTS, lora_request=lora_request)
assert len(PROMPTS) == len(outputs)
# Exception raised, if the size of params does not match the size of prompts
with pytest.raises(ValueError):
outputs = llm.generate(PROMPTS, lora_request=lora_request[:1])
# Single LoRARequest should be applied to every prompt
single_lora_request = lora_request[0]
outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
assert len(PROMPTS) == len(outputs)

View File

@ -16,14 +16,6 @@ MODEL_NAME = "internlm/internlm2-1_8b-reward"
prompts = ["The chef prepared a delicious meal."]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to

View File

@ -14,14 +14,6 @@ from ...models.utils import softmax
MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to

View File

@ -32,15 +32,16 @@ MODEL_CONFIGS = [
"tensor_parallel_size": 1,
"tokenizer_mode": "mistral",
},
{
"model": "sentence-transformers/all-MiniLM-L12-v2",
"enforce_eager": True,
"gpu_memory_utilization": 0.20,
"max_model_len": 64,
"max_num_batched_tokens": 64,
"max_num_seqs": 64,
"tensor_parallel_size": 1,
},
# TODO: re-enable once these tests are run with V1
# {
# "model": "sentence-transformers/all-MiniLM-L12-v2",
# "enforce_eager": True,
# "gpu_memory_utilization": 0.20,
# "max_model_len": 64,
# "max_num_batched_tokens": 64,
# "max_num_seqs": 64,
# "tensor_parallel_size": 1,
# },
]

View File

@ -0,0 +1,27 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.assets.audio import AudioAsset
@pytest.fixture
def mary_had_lamb():
path = AudioAsset('mary_had_lamb').get_local_path()
with open(str(path), "rb") as f:
yield f
@pytest.fixture
def winning_call():
path = AudioAsset('winning_call').get_local_path()
with open(str(path), "rb") as f:
yield f
@pytest.fixture
def foscolo():
# Test translation it->en
path = AudioAsset('azacinto_foscolo').get_local_path()
with open(str(path), "rb") as f:
yield f

View File

@ -49,8 +49,7 @@ async def transcribe_audio(client, tokenizer, y, sr):
return latency, num_output_tokens, transcription.text
async def bound_transcribe(model_name, sem, client, audio, reference):
tokenizer = AutoTokenizer.from_pretrained(model_name)
async def bound_transcribe(sem, client, tokenizer, audio, reference):
# Use semaphore to limit concurrent requests.
async with sem:
result = await transcribe_audio(client, tokenizer, *audio)
@ -63,15 +62,19 @@ async def bound_transcribe(model_name, sem, client, audio, reference):
async def process_dataset(model, client, data, concurrent_request):
sem = asyncio.Semaphore(concurrent_request)
# Load tokenizer once outside the loop
tokenizer = AutoTokenizer.from_pretrained(model)
# Warmup call, as the first server-side `librosa.load` is quite slow.
audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"]
_ = await bound_transcribe(model, sem, client, (audio, sr), "")
_ = await bound_transcribe(sem, client, tokenizer, (audio, sr), "")
tasks: list[asyncio.Task] = []
for sample in data:
audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
task = asyncio.create_task(
bound_transcribe(model, sem, client, (audio, sr), sample["text"]))
bound_transcribe(sem, client, tokenizer, (audio, sr),
sample["text"]))
tasks.append(task)
return await asyncio.gather(*tasks)

View File

@ -226,3 +226,33 @@ def test_pooling(server: RemoteOpenAIServer, model_name: str):
},
)
assert response.json()["error"]["type"] == "BadRequestError"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_score(server: RemoteOpenAIServer, model_name: str):
# score api is only enabled for num_labels == 1.
response = requests.post(
server.url_for("score"),
json={
"model": model_name,
"text_1": "ping",
"text_2": "pong",
},
)
assert response.json()["error"]["type"] == "BadRequestError"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_rerank(server: RemoteOpenAIServer, model_name: str):
# rerank api is only enabled for num_labels == 1.
response = requests.post(
server.url_for("rerank"),
json={
"model": model_name,
"query": "ping",
"documents": ["pong"],
},
)
assert response.json()["error"]["type"] == "BadRequestError"

View File

@ -27,6 +27,28 @@ def serve_parser():
return make_arg_parser(parser)
### Test config parsing
def test_config_arg_parsing(serve_parser, cli_config_file):
args = serve_parser.parse_args([])
assert args.port == 8000
args = serve_parser.parse_args(['--config', cli_config_file])
assert args.port == 12312
args = serve_parser.parse_args([
'--config',
cli_config_file,
'--port',
'9000',
])
assert args.port == 9000
args = serve_parser.parse_args([
'--port',
'9000',
'--config',
cli_config_file,
])
assert args.port == 9000
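The assertions above check that explicit CLI flags take precedence over values loaded from --config, regardless of argument order. The exact contents of the cli_config_file fixture are not shown in this diff; the assertions only imply a YAML file along these lines (hypothetical):

# hypothetical config passed as `vllm serve --config <file>`
port: 12312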
### Tests for LoRA module parsing
def test_valid_key_value_format(serve_parser):
# Test old format: name=path

View File

@ -24,14 +24,6 @@ DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' +
DTYPE = "bfloat16"
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.fixture(scope="module")
def server():
args = [

View File

@ -47,6 +47,7 @@ class MockModelConfig:
allowed_local_media_path: str = ""
encoder_config = None
generation_config: str = "auto"
skip_tokenizer_init: bool = False
def get_diff_sampling_param(self):
return self.diff_sampling_param or {}

View File

@ -14,14 +14,6 @@ MODEL_NAME = "BAAI/bge-reranker-base"
DTYPE = "bfloat16"
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.fixture(scope="module")
def server():
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]

View File

@ -12,15 +12,6 @@ from vllm.entrypoints.openai.protocol import ScoreResponse
from ...utils import RemoteOpenAIServer
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
MODELS = [
{
"name": "BAAI/bge-reranker-v2-m3",

View File

@ -0,0 +1,73 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import tempfile
import pytest
from vllm.model_executor.model_loader.weight_utils import (
download_weights_from_hf)
from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
MODEL_PATH = os.path.join(tempfile.gettempdir(), "qwen3_06b")
@pytest.fixture(scope="module")
def server():
global MODEL_PATH
MODEL_PATH = download_weights_from_hf(
MODEL_NAME,
allow_patterns=["*"],
cache_dir=MODEL_PATH,
ignore_patterns=["tokenizer*", "vocab*", "*.safetensors"])
args = [
"--max-model-len",
"2048",
"--max-num-seqs",
"128",
"--enforce-eager",
"--skip-tokenizer-init",
"--load-format",
"dummy",
]
with RemoteOpenAIServer(MODEL_PATH, args) as remote_server:
yield remote_server
@pytest.mark.asyncio
async def test_token_in_token_out_and_logprobs(server):
"""
Test token-in, token-out: the returned token_ids should align with
prompt_logprobs & logprobs when return_token_ids is enabled.
"""
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
text = "Hello, world! How are you today?"
token_ids = tokenizer.encode(text)
async with server.get_async_client() as client:
# Request completions with return_token_ids enabled via extra_body
completion = await client.completions.create(
model=MODEL_PATH,
prompt=token_ids,
max_tokens=20,
temperature=0,
echo=True,
extra_body={
"return_token_ids": True,
},
)
# Verify all fields are present
assert (completion.choices[0].token_ids is not None
and 0 < len(completion.choices[0].token_ids) <= 20)
assert completion.choices[0].prompt_token_ids is not None
# Decode prompt tokens
if completion.choices[0].prompt_token_ids:
prompt_text = tokenizer.decode(
completion.choices[0].prompt_token_ids)
# The decoded prompt should match, or be close to, the original prompt
assert prompt_text == text

View File

@ -12,8 +12,6 @@ import pytest
import pytest_asyncio
import soundfile as sf
from vllm.assets.audio import AudioAsset
from ...utils import RemoteOpenAIServer
MODEL_NAME = "openai/whisper-large-v3-turbo"
@ -24,20 +22,6 @@ MISTRAL_FORMAT_ARGS = [
]
@pytest.fixture
def mary_had_lamb():
path = AudioAsset('mary_had_lamb').get_local_path()
with open(str(path), "rb") as f:
yield f
@pytest.fixture
def winning_call():
path = AudioAsset('winning_call').get_local_path()
with open(str(path), "rb") as f:
yield f
@pytest.fixture(scope="module")
def server():
with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
@ -76,6 +60,25 @@ async def test_basic_audio(mary_had_lamb, model_name):
assert out_usage["seconds"] == 16, out_usage["seconds"]
@pytest.mark.asyncio
async def test_basic_audio_gemma(foscolo):
# Gemma accuracy on some of the audio samples we use is particularly bad,
# hence we use a different one here. WER is evaluated separately.
model_name = "google/gemma-3n-E2B-it"
server_args = ["--enforce-eager"]
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
transcription = await client.audio.transcriptions.create(
model=model_name,
file=foscolo,
language="it",
response_format="text",
temperature=0.0)
out = json.loads(transcription)['text']
assert "da cui vergine nacque Venere" in out
@pytest.mark.asyncio
async def test_non_asr_model(winning_call):
# text to text model

View File

@ -12,32 +12,24 @@ import pytest
import pytest_asyncio
import soundfile as sf
from vllm.assets.audio import AudioAsset
from ...utils import RemoteOpenAIServer
MODEL_NAME = "openai/whisper-small"
SERVER_ARGS = ["--enforce-eager"]
@pytest.fixture
def foscolo():
# Test translation it->en
path = AudioAsset('azacinto_foscolo').get_local_path()
with open(str(path), "rb") as f:
yield f
@pytest.fixture(scope="module")
def server():
with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
yield remote_server
@pytest.fixture(scope="module",
params=["openai/whisper-small", "google/gemma-3n-E2B-it"])
def server(request):
# Parametrize over model name
with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
yield remote_server, request.param
@pytest_asyncio.fixture
async def client(server):
async def client_and_model(server):
server, model_name = server
async with server.get_async_client() as async_client:
yield async_client
yield async_client, model_name
@pytest.mark.asyncio
@ -56,27 +48,29 @@ async def test_non_asr_model(foscolo):
# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
@pytest.mark.asyncio
async def test_basic_audio(foscolo, client):
async def test_basic_audio(foscolo, client_and_model):
client, model_name = client_and_model
translation = await client.audio.translations.create(
model=MODEL_NAME,
model=model_name,
file=foscolo,
response_format="text",
# TODO remove once language detection is implemented
extra_body=dict(language="it"),
# TODO remove `language="it"` once language detection is implemented
extra_body=dict(language="it", to_language="en"),
temperature=0.0)
out = json.loads(translation)['text'].strip().lower()
assert "greek sea" in out
@pytest.mark.asyncio
async def test_audio_prompt(foscolo, client):
async def test_audio_prompt(foscolo, client_and_model):
client, model_name = client_and_model
# Condition whisper on starting text
prompt = "Nor have I ever"
transcription = await client.audio.translations.create(
model=MODEL_NAME,
model=model_name,
file=foscolo,
prompt=prompt,
extra_body=dict(language="it"),
extra_body=dict(language="it", to_language="en"),
response_format="text",
temperature=0.0)
out = json.loads(transcription)['text']
@ -85,22 +79,27 @@ async def test_audio_prompt(foscolo, client):
@pytest.mark.asyncio
async def test_streaming_response(foscolo, client, server):
async def test_streaming_response(foscolo, client_and_model, server):
client, model_name = client_and_model
translation = ""
res_no_stream = await client.audio.translations.create(
model=MODEL_NAME,
model=model_name,
file=foscolo,
response_format="json",
extra_body=dict(language="it"),
extra_body=dict(language="it", to_language="en", seed=42),
temperature=0.0)
# Stream via HTTPX since OpenAI translation client doesn't expose streaming
server, model_name = server
url = server.url_for("v1/audio/translations")
headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
data = {
"model": MODEL_NAME,
"model": model_name,
"language": "it",
"to_language": "en",
"stream": True,
"temperature": 0.0,
"seed": 42,
}
foscolo.seek(0)
async with httpx.AsyncClient() as http_client:
@ -121,16 +120,24 @@ async def test_streaming_response(foscolo, client, server):
text = chunk["choices"][0].get("delta", {}).get("content")
translation += text or ""
assert translation == res_no_stream.text
res_stream = translation.split()
# NOTE: There's a small amount of non-determinism here, likely in the attn
# computation, which can cause a few tokens to differ while remaining
# semantically very close.
assert sum([
x == y for x, y in zip(res_stream, res_no_stream.text.split())
]) >= len(res_stream) * 0.9
@pytest.mark.asyncio
async def test_stream_options(foscolo, client, server):
async def test_stream_options(foscolo, server):
server, model_name = server
url = server.url_for("v1/audio/translations")
headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
data = {
"model": MODEL_NAME,
"model": model_name,
"language": "it",
"to_language": "en",
"stream": True,
"stream_include_usage": True,
"stream_continuous_usage_stats": True,
@ -164,7 +171,10 @@ async def test_stream_options(foscolo, client, server):
@pytest.mark.asyncio
async def test_long_audio_request(foscolo, client):
async def test_long_audio_request(foscolo, client_and_model):
client, model_name = client_and_model
if model_name == "google/gemma-3n-E2B-it":
pytest.skip("Gemma3n does not support long audio requests")
foscolo.seek(0)
audio, sr = librosa.load(foscolo)
repeated_audio = np.tile(audio, 2)
@ -173,9 +183,9 @@ async def test_long_audio_request(foscolo, client):
sf.write(buffer, repeated_audio, sr, format='WAV')
buffer.seek(0)
translation = await client.audio.translations.create(
model=MODEL_NAME,
model=model_name,
file=buffer,
extra_body=dict(language="it"),
extra_body=dict(language="it", to_language="en"),
response_format="text",
temperature=0.0)
out = json.loads(translation)['text'].strip().lower()

View File

@ -790,6 +790,78 @@ def test_gather_and_maybe_dequant_cache_mla(kv_lora_rank, qk_rope_head_dim,
torch.testing.assert_close(dst, expected)
@pytest.mark.parametrize("kv_lora_rank", [512])
@pytest.mark.parametrize("qk_rope_head_dim", [64])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("max_seq_len", [512])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("dtype", [torch.float32])
@pytest.mark.parametrize("kv_cache_dtype",
["auto"]) # You can also test "fp8" if needed.
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_cp_gather_cache_mla(kv_lora_rank, qk_rope_head_dim, block_size,
num_blocks, max_seq_len, batch_size, dtype,
kv_cache_dtype, device):
entry_size = kv_lora_rank + qk_rope_head_dim
src_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype,
kv_cache_dtype, device)
_fill_mla_cache(src_cache, kv_cache_dtype=kv_cache_dtype)
seq_len_tensor = torch.randint(0,
max_seq_len + 1, (batch_size, ),
device=device)
total_tokens = seq_len_tensor.sum()
cu_seq_lens = torch.empty((batch_size + 1),
dtype=torch.int32,
device=device)
cu_seq_lens[0] = 0
cu_seq_lens[1:] = seq_len_tensor.cumsum(dim=0).to(dtype=torch.int32)
print("seq_len_tensor", seq_len_tensor)
tot_blocks_tensor = (seq_len_tensor + block_size - 1) // block_size
block_table = torch.empty((batch_size, num_blocks),
dtype=torch.int32,
device=device)
for b in range(batch_size):
perm = torch.randperm(num_blocks, device=device)
block_table[b, :] = perm
dst = torch.zeros((total_tokens, entry_size),
dtype=src_cache.dtype,
device=device)
expected_batches = []
for b in range(batch_size):
s = seq_len_tensor[b]
if s == 0:
continue
tot = tot_blocks_tensor[b]
blocks = block_table[b, :tot].tolist()
gathered_rows = []
for i in range(tot - 1):
gathered_rows.append(src_cache[blocks[i]])
remaining = s - (tot - 1) * block_size
gathered_rows.append(src_cache[blocks[-1], :remaining, :])
batch_expected = torch.cat(gathered_rows, dim=0)
expected_batches.append(batch_expected)
expected = torch.cat(expected_batches, dim=0)
opcheck(
torch.ops._C_cache_ops.cp_gather_cache,
(src_cache, dst, block_table, cu_seq_lens, batch_size, None),
test_utils=DEFAULT_OPCHECK_TEST_UTILS,
)
ops.cp_gather_cache(src_cache, dst, block_table, cu_seq_lens, batch_size)
torch.testing.assert_close(dst, expected)
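Restating the reference loop above as a formula (the gather semantics the kernel is checked against, nothing new): with block size $B$, per-sequence lengths $s_b$, and $\mathrm{cu}[b] = \sum_{j<b} s_j$,

$$\mathrm{dst}[\mathrm{cu}[b] + t] = \mathrm{src}\big[\mathrm{block\_table}[b, \lfloor t/B \rfloor],\; t \bmod B\big], \qquad 0 \le t < s_b.$$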
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS_MLA)

View File

@ -0,0 +1,126 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from tests.kernels.utils import opcheck
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
if not current_platform.has_device_capability(100):
pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.",
allow_module_level=True)
DTYPES = [torch.float16, torch.bfloat16]
SHAPES = [(128, 64), (128, 128), (256, 64), (256, 128)]
SEEDS = [42]
CUDA_DEVICES = ['cuda:0']
FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
BLOCK_SIZE = 16
def ref_impl(silu_and_mul: SiluAndMul, x: torch.Tensor,
global_scale: torch.Tensor,
ref_output_scale: torch.Tensor) -> torch.Tensor:
silu_and_mul_out = silu_and_mul.forward_native(x)
assert not current_platform.is_rocm()
assert silu_and_mul_out.ndim >= 1, (
f'input.ndim needs to be >= 1, but got {silu_and_mul_out.ndim}.')
other_dims = 1 if silu_and_mul_out.ndim == 1 else -1
silu_and_mul_out = silu_and_mul_out.reshape(other_dims,
silu_and_mul_out.shape[-1])
m, n = silu_and_mul_out.shape
device = silu_and_mul_out.device
# Two fp4 values will be packed into an uint8.
out = torch.empty((m, n // 2), device=device, dtype=torch.uint8)
output_scale = ref_output_scale
torch.ops._C.scaled_fp4_quant(out, silu_and_mul_out, output_scale,
global_scale)
return out, output_scale
def ops_impl(x: torch.Tensor, global_scale: torch.Tensor,
ref_output_scale: torch.Tensor) -> torch.Tensor:
out_shape = (x.shape[0], x.shape[1] // 4)
output_scale = ref_output_scale
out = torch.empty(out_shape, dtype=torch.uint8, device=x.device)
torch.ops._C.silu_and_mul_nvfp4_quant(out, output_scale, x, global_scale)
return out, output_scale
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("shape", SHAPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_quantize_to_fp4(
dtype: torch.dtype,
shape: tuple[int, int],
seed: int,
device: str,
) -> None:
current_platform.seed_everything(seed)
torch.set_default_device(device)
m, n = shape
x = torch.randn((m, n), dtype=dtype)
tensor_amax = torch.abs(x).max().to(torch.float32)
global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax
block_size = 16
assert n % block_size == 0, (
f'last dim has to be multiple of 16, but got {n}.')
assert x.dtype in (torch.float16, torch.bfloat16), (
f'input.dtype needs to be fp16 or bf16 but got {x.dtype}.')
round_up = lambda x, y: (x + y - 1) // y * y
rounded_m = round_up(x.shape[0], 128)
scale_n = x.shape[1] // (2 * block_size)
rounded_n = round_up(scale_n, 4)
output_scale = torch.empty((rounded_m, rounded_n // 4),
device=x.device,
dtype=torch.int32)
layer = SiluAndMul()
ref_out, ref_out_scale = ref_impl(layer, x, global_scale, output_scale)
fusion_out, fusion_out_scale = ops_impl(x, global_scale, output_scale)
assert ref_out.dtype == torch.uint8
assert fusion_out.dtype == torch.uint8
assert ref_out.shape == fusion_out.shape
assert ref_out_scale.dtype == torch.int32
assert fusion_out_scale.dtype == torch.int32
assert ref_out_scale.shape == fusion_out_scale.shape
# Allow up to 2% of mismatched values since BF16 has accuracy issues.
mis_threshold = 0.02
atol = 0.4
rtol = 0.4
ref_logits = ref_out[-1]
fusion_logits = fusion_out[-1]
mis_count = torch.sum(
torch.abs(fusion_logits - ref_logits) > (atol +
rtol * torch.abs(ref_logits)))
mis_ratio = mis_count / fusion_logits.numel()
assert mis_ratio < mis_threshold, \
f"Mismatch ratio {mis_ratio} exceeds threshold {mis_threshold}"
torch.testing.assert_close(ref_out_scale, fusion_out_scale)
opcheck(torch.ops._C.silu_and_mul_nvfp4_quant,
(fusion_out, fusion_out_scale, x, global_scale))
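The global_scale above follows the usual two-level NVFP4 scaling setup, restated from the constants in the test (FP4 E2M1 max $= 6$, FP8 E4M3 max $= 448$):

$$\mathrm{global\_scale} = \frac{448 \times 6}{\max_i |x_i|},$$

which roughly keeps the per-block FP8 scale factors and the packed FP4 values within their representable ranges.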

View File

@ -87,6 +87,9 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
# See https://github.com/NVIDIA/nccl/issues/1790: set a lower value for
# gpu_memory_utilization here because NCCL >= 2.26.3 seems to use
# more GPU memory, causing vLLM to OOM
llm = vllm.LLM(MODEL_PATH,
max_model_len=1024,
enable_lora=True,
@ -95,7 +98,8 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
tensor_parallel_size=4,
trust_remote_code=True,
fully_sharded_loras=True,
enable_chunked_prefill=True)
enable_chunked_prefill=True,
gpu_memory_utilization=0.85)
output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
for i in range(len(EXPECTED_LORA_OUTPUT)):
assert output1[i] == EXPECTED_LORA_OUTPUT[i]

View File

@ -1,8 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for multi-LoRA serving. This script covers:
1. multi-LoRA service with tp >= 2
2. multiple LoRA requests in a single generate() call
"""
import pytest
from tests.utils import multi_gpu_test
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest
@ -156,3 +160,34 @@ def test_multi_loras_with_tp_sync():
output_text = call_llm_get_outputs(prompt, "Alice")
check_outputs(output_text, expected_output)
def test_multiple_lora_requests():
llm = LLM(
model=MODEL_PATH,
enable_lora=True,
max_loras=4,
max_lora_rank=LORA_RANK,
max_model_len=512,
gpu_memory_utilization=0.5,
enforce_eager=True,
)
PROMPTS = ["Hello, my name is"] * 2
LORA_NAME = "Alice"
lora_request = [
LoRARequest(LORA_NAME + str(idx), idx + 1,
LORA_NAME_PATH_MAP[LORA_NAME])
for idx in range(len(PROMPTS))
]
# A list of LoRARequests should be matched one-to-one with the prompts
outputs = llm.generate(PROMPTS, lora_request=lora_request)
assert len(PROMPTS) == len(outputs)
# An exception is raised if the number of LoRA requests does not match the number of prompts
with pytest.raises(ValueError):
outputs = llm.generate(PROMPTS, lora_request=lora_request[:1])
# Single LoRARequest should be applied to every prompt
single_lora_request = lora_request[0]
outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
assert len(PROMPTS) == len(outputs)

View File

@ -92,7 +92,8 @@ AITER_MODEL_LIST = [
pytest.param(
"allenai/OLMoE-1B-7B-0924-Instruct",
marks=[pytest.mark.cpu_model],
)
),
pytest.param("swiss-ai/Apertus-8B"), # apertus
])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])

View File

@ -100,21 +100,19 @@ def test_models(
else:
hf_outputs = None
if model not in V0_UNSUPPORTED_MODELS:
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
else:
vllm_v0_outputs = None
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
if model not in V0_UNSUPPORTED_MODELS:
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
else:
vllm_v0_outputs = None
if model in V1_SUPPORTED_MODELS:
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
with vllm_runner(model,
max_num_seqs=MAX_NUM_SEQS,
enable_prefix_caching=False) as vllm_model:
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
else:
vllm_v1_outputs = None
@ -137,7 +135,7 @@ def test_models(
)
@pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_batching(
@ -147,10 +145,6 @@ def test_batching(
max_tokens: int,
num_logprobs: int,
) -> None:
if model in V0_UNSUPPORTED_MODELS:
pytest.skip(
f"Unsupported V0 Engine. Skipping `test_batching` on {model}.")
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@ -188,29 +182,32 @@ def test_chunked_prefill(
max_tokens: int,
num_logprobs: int,
chunked_prefill_token_size: int,
monkeypatch,
) -> None:
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
with vllm_runner(model,
enable_chunked_prefill=True,
max_num_batched_tokens=max_num_batched_tokens,
max_num_seqs=max_num_seqs) as vllm_model:
chunked = vllm_model.generate_greedy_logprobs(example_prompts,
max_tokens, num_logprobs)
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
with vllm_runner(model,
enable_chunked_prefill=True,
max_num_batched_tokens=max_num_batched_tokens,
max_num_seqs=max_num_seqs) as vllm_model:
chunked = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
with vllm_runner(model,
enable_chunked_prefill=False,
max_num_seqs=max_num_seqs) as vllm_model:
non_chunked = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
with vllm_runner(model,
enable_chunked_prefill=False,
max_num_seqs=max_num_seqs) as vllm_model:
non_chunked = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
check_logprobs_close(
outputs_0_lst=chunked,
outputs_1_lst=non_chunked,
name_0="chunked",
name_1="non_chunked",
)
check_logprobs_close(
outputs_0_lst=chunked,
outputs_1_lst=non_chunked,
name_0="chunked",
name_1="non_chunked",
)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@ -281,25 +278,29 @@ def test_models_preemption_recompute(
example_prompts,
model: str,
max_tokens: int,
monkeypatch,
) -> None:
"""
Tests that outputs are identical with and w/o preemptions (recompute).
"""
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
scheduler = vllm_model.llm.llm_engine.scheduler[0]
scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
preempt_vllm_outputs = vllm_model.generate_greedy(
example_prompts, max_tokens)
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
scheduler = vllm_model.llm.llm_engine.scheduler[0]
scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
preempt_vllm_outputs = vllm_model.generate_greedy(
example_prompts, max_tokens)
scheduler.ENABLE_ARTIFICIAL_PREEMPT = False
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
scheduler.ENABLE_ARTIFICIAL_PREEMPT = False
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
check_outputs_equal(
outputs_0_lst=preempt_vllm_outputs,
outputs_1_lst=vllm_outputs,
name_0="vllm_preepmtions",
name_1="vllm",
)
check_outputs_equal(
outputs_0_lst=preempt_vllm_outputs,
outputs_1_lst=vllm_outputs,
name_0="vllm_preepmtions",
name_1="vllm",
)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@ -402,24 +403,18 @@ def test_full_cuda_graph(
else:
hf_outputs = None
if model not in V0_UNSUPPORTED_MODELS:
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
else:
vllm_v0_outputs = None
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
if model in HYBRID_MODELS:
# required due to reorder_batch behaviour
m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
with vllm_runner(model,
max_num_seqs=MAX_NUM_SEQS,
compilation_config={'full_cuda_graph': True},
enable_prefix_caching=False) as vllm_model:
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
m.setenv("VLLM_USE_V1", "0")
if model not in V0_UNSUPPORTED_MODELS:
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
else:
vllm_v0_outputs = None
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
if hf_outputs is not None and vllm_v0_outputs is not None:
check_logprobs_close(
@ -466,24 +461,20 @@ def test_fp32_state(
else:
hf_outputs = None
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
with vllm_runner(model,
max_num_seqs=MAX_NUM_SEQS,
mamba_ssm_cache_dtype="float32") as vllm_model:
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
with vllm_runner(model,
max_num_seqs=MAX_NUM_SEQS,
mamba_ssm_cache_dtype="float32") as vllm_model:
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
if model in HYBRID_MODELS:
# required due to reorder_batch behaviour
m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
with vllm_runner(model,
max_num_seqs=MAX_NUM_SEQS,
mamba_ssm_cache_dtype="float32",
enable_prefix_caching=False) as vllm_model:
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
if hf_outputs is not None:
check_logprobs_close(
outputs_0_lst=hf_outputs,

View File

@ -51,6 +51,9 @@ def correctness_test_embed_models(hf_runner,
vllm_extra_kwargs = vllm_extra_kwargs or {}
vllm_extra_kwargs["dtype"] = model_info.dtype
if model_info.hf_overrides is not None:
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,

View File

@ -172,6 +172,9 @@ def mteb_test_embed_models(hf_runner,
vllm_extra_kwargs = vllm_extra_kwargs or {}
vllm_extra_kwargs["dtype"] = model_info.dtype
if model_info.hf_overrides is not None:
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,
@ -284,6 +287,9 @@ def mteb_test_rerank_models(hf_runner,
vllm_extra_kwargs = vllm_extra_kwargs or {}
vllm_extra_kwargs["dtype"] = model_info.dtype
if model_info.hf_overrides is not None:
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,

View File

@ -13,7 +13,14 @@ from .mteb_utils import VllmMtebEncoder, mteb_test_rerank_models
RERANK_MODELS = [
LASTPoolingRerankModelInfo("BAAI/bge-reranker-v2-gemma",
architecture="GemmaForSequenceClassification"),
architecture="GemmaForSequenceClassification",
hf_overrides={
"architectures":
["GemmaForSequenceClassification"],
"classifier_from_token": ["Yes"],
"method":
"no_post_processing",
}),
]
PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." # noqa: E501
@ -119,22 +126,9 @@ class GemmaMtebEncoder(VllmMtebEncoder):
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo,
monkeypatch) -> None:
monkeypatch.setenv("VLLM_USE_V1", "0")
assert model_info.architecture == "GemmaForSequenceClassification"
vllm_extra_kwargs: dict[str, Any] = {
"hf_overrides": {
"architectures": ["GemmaForSequenceClassification"],
"classifier_from_token": ["Yes"],
"method": "no_post_processing",
}
}
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(GemmaRerankerHfRunner,
vllm_runner,
model_info,
vllm_extra_kwargs,
vllm_mteb_encoder=GemmaMtebEncoder)

View File

@ -10,14 +10,6 @@ from vllm.platforms import current_platform
from ...utils import check_embeddings_close, check_transformers_version
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.mark.parametrize(
"model",
[
@ -32,21 +24,15 @@ def v1(run_with_both_engines):
"intfloat/e5-mistral-7b-instruct",
# CPU v1 doesn't support sliding window
marks=[pytest.mark.core_model]),
# the qwen models interfere with each other (see PR
# https://github.com/vllm-project/vllm/pull/18720).
# To avoid this problem, for now we skip v0 since it will be
# deprecated anyway.
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
marks=[pytest.mark.cpu_model]),
# [Encoder-only]
pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
pytest.param("intfloat/multilingual-e5-small"),
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
marks=[pytest.mark.skip_v1]),
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
# [Cross-Encoder]
pytest.param("sentence-transformers/stsb-roberta-base-v2",
marks=[pytest.mark.skip_v1]),
pytest.param("sentence-transformers/stsb-roberta-base-v2"),
],
)
def test_models(

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
import pytest
@ -33,12 +32,15 @@ MODELS = [
########### NewModel
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=True),
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=True),
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=True),
########### Qwen2ForCausalLM
LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
@ -60,11 +62,16 @@ MODELS = [
]
RERANK_MODELS = [
# classifier_pooling: mean
CLSPoolingRerankModelInfo(
# classifier_pooling: mean
"Alibaba-NLP/gte-reranker-modernbert-base",
architecture="ModernBertForSequenceClassification",
enable_test=True),
CLSPoolingRerankModelInfo(
"Alibaba-NLP/gte-multilingual-reranker-base",
architecture="GteNewForSequenceClassification",
hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
enable_test=True),
]
@ -75,12 +82,7 @@ def test_embed_models_mteb(hf_runner, vllm_runner,
check_transformers_version(model_info.name,
max_transformers_version="4.53.2")
vllm_extra_kwargs: dict[str, Any] = {}
if model_info.architecture == "GteNewModel":
vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
mteb_test_embed_models(hf_runner, vllm_runner, model_info,
vllm_extra_kwargs)
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)
@ -91,12 +93,8 @@ def test_embed_models_correctness(hf_runner, vllm_runner,
check_transformers_version(model_info.name,
max_transformers_version="4.53.2")
vllm_extra_kwargs: dict[str, Any] = {}
if model_info.architecture == "GteNewModel":
vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
example_prompts, vllm_extra_kwargs)
example_prompts)
@pytest.mark.parametrize("model_info", RERANK_MODELS)

View File

@ -10,12 +10,20 @@ from tests.conftest import HfRunner
from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo
from .mteb_utils import mteb_test_rerank_models
mxbai_rerank_hf_overrides = {
"architectures": ["Qwen2ForSequenceClassification"],
"classifier_from_token": ["0", "1"],
"method": "from_2_way_softmax",
}
RERANK_MODELS = [
LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,
enable_test=True),
LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,
enable_test=False)
]
@ -71,13 +79,4 @@ class MxbaiRerankerHfRunner(HfRunner):
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
vllm_extra_kwargs: dict[str, Any] = {}
if model_info.architecture == "Qwen2ForSequenceClassification":
vllm_extra_kwargs["hf_overrides"] = {
"architectures": ["Qwen2ForSequenceClassification"],
"classifier_from_token": ["0", "1"],
"method": "from_2_way_softmax",
}
mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info,
vllm_extra_kwargs)
mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info)

View File

@ -11,12 +11,20 @@ from tests.utils import multi_gpu_test
from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo
from .mteb_utils import mteb_test_rerank_models
qwen3_reranker_hf_overrides = {
"architectures": ["Qwen3ForSequenceClassification"],
"classifier_from_token": ["no", "yes"],
"is_original_qwen3_reranker": True,
}
RERANK_MODELS = [
LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-0.6B",
architecture="Qwen3ForSequenceClassification",
hf_overrides=qwen3_reranker_hf_overrides,
enable_test=True),
LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-4B",
architecture="Qwen3ForSequenceClassification",
hf_overrides=qwen3_reranker_hf_overrides,
enable_test=False)
]
@ -74,18 +82,7 @@ class Qwen3RerankerHfRunner(HfRunner):
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
assert model_info.architecture == "Qwen3ForSequenceClassification"
vllm_extra_kwargs: dict[str, Any] = {
"hf_overrides": {
"architectures": ["Qwen3ForSequenceClassification"],
"classifier_from_token": ["no", "yes"],
"is_original_qwen3_reranker": True,
}
}
mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info,
vllm_extra_kwargs)
mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
@ -96,16 +93,8 @@ def test_rerank_models_mteb_tp(vllm_runner,
assert model_info.architecture == "Qwen3ForSequenceClassification"
vllm_extra_kwargs: dict[str, Any] = {
"hf_overrides": {
"architectures": ["Qwen3ForSequenceClassification"],
"classifier_from_token": ["no", "yes"],
"is_original_qwen3_reranker": True,
},
"tensor_parallel_size": 2,
}
mteb_test_rerank_models(Qwen3RerankerHfRunner,
vllm_runner,
model_info,
vllm_extra_kwargs,
atol=1.2e-2)
mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info,
vllm_extra_kwargs)

View File

@ -13,14 +13,6 @@ from ....conftest import HfRunner
from ...utils import check_transformers_version
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.fixture
def math_step_prompts():
# ruff: noqa: E501

View File

@ -23,15 +23,6 @@ TEXTS_2 = [
"The capital of Germany is Berlin.",
]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
DTYPE = "half"

View File

@ -189,23 +189,21 @@ VLM_TEST_SETTINGS = {
},
marks=[pytest.mark.core_model],
),
# FIXME(Isotr0py): Enable this test after
# https://github.com/huggingface/transformers/pull/39470 released
# "idefics3-transformers": VLMTestInfo(
# models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
# test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
# prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
# img_idx_to_prompt=lambda idx: "<image>",
# max_model_len=8192,
# max_num_seqs=2,
# auto_cls=AutoModelForImageTextToText,
# hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
# image_size_factors=[(0.25, 0.5, 1.0)],
# vllm_runner_kwargs={
# "model_impl": "transformers",
# },
# marks=[pytest.mark.core_model],
# ),
"idefics3-transformers": VLMTestInfo(
models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>",
max_model_len=8192,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
image_size_factors=[(0.25, 0.5, 1.0)],
vllm_runner_kwargs={
"model_impl": "transformers",
},
marks=[pytest.mark.core_model],
),
# Pixel values from processor are not 4D or 5D arrays
"qwen2_5_vl-transformers": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
@ -322,10 +320,6 @@ VLM_TEST_SETTINGS = {
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
num_logprobs=10,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
# FIXME(Isotr0py): This model is broken in Transformers v4.54.1, we
# should enable this again after the fix is released:
# https://github.com/huggingface/transformers/pull/39915
marks=[pytest.mark.skip("HF model is broken")],
),
"gemma3": VLMTestInfo(
models=["google/gemma-3-4b-it"],

View File

@ -1,12 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Custom input builders for edge-cases in different models."""
from io import BytesIO
from typing import Callable
import requests
from PIL import Image
from vllm.assets.image import ImageAsset
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (rescale_video_size, resize_video,
sample_frames_from_video)
@ -118,9 +115,9 @@ def different_patch_input_cases_internvl():
def windows_attention_image_qwen2_5_vl():
# image from regression issue: https://github.com/vllm-project/vllm/issues/15122
image_url = "https://aomediacodec.github.io/av1-avif/testFiles/Link-U/hato.jpg"
image = Image.open(BytesIO(requests.get(image_url).content))
# image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501
image = ImageAsset("hato").pil_image
question = "Describe the image."
img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"

View File

@ -293,6 +293,7 @@ def _test_processing_correctness_one(
"OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
"OpenGVLab/InternVL3_5-30B-A3B",
"Kwai-Keye/Keye-VL-8B-Preview",
"Kwai-Keye/Keye-VL-1_5-8B",
"moonshotai/Kimi-VL-A3B-Instruct",
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"llava-hf/llava-1.5-7b-hf",

View File

@ -1,30 +1,31 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import tempfile
from collections.abc import Iterable
from contextlib import contextmanager
from functools import partial
from typing import Any, Union
from unittest.mock import patch
import numpy as np
import pytest
import torch.nn as nn
from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
UserMessage)
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from PIL import Image
from vllm.config import ModelConfig
from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment,
initialize_model_parallel)
from vllm.inputs import InputProcessingContext
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
MultiModalKwargs)
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.multimodal.utils import group_mm_kwargs_by_modality
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from vllm.utils import GiB_bytes, is_list_of, set_default_torch_num_threads
from vllm.v1.core.kv_cache_utils import get_kv_cache_config
from vllm.v1.engine.core import EngineCore as V1EngineCore
from vllm.utils import is_list_of
from ....conftest import VllmRunner
from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS
from ...utils import dummy_hf_overrides
@ -137,6 +138,27 @@ def create_batched_mm_kwargs(
return group_mm_kwargs_by_modality(items)
@contextmanager
def initialize_dummy_model(model_cls: nn.Module, model_config: ModelConfig):
temp_file = tempfile.mkstemp()[1]
init_distributed_environment(
world_size=1,
rank=0,
distributed_init_method=f"file://{temp_file}",
local_rank=0,
backend="nccl",
)
initialize_model_parallel(tensor_model_parallel_size=1)
vllm_config = VllmConfig(model_config=model_config)
with set_current_vllm_config(vllm_config=vllm_config):
with set_default_torch_dtype(model_config.dtype):
model = model_cls(vllm_config=vllm_config)
yield model
del model
cleanup_dist_env_and_memory()
def get_model_id_to_test(
model_arch_list: Iterable[str]) -> list[tuple[str, str]]:
filtered_results = []
@ -155,8 +177,7 @@ def get_model_id_to_test(
@pytest.mark.parametrize(
"model_arch, model_id",
get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys()))
def test_model_tensor_schema(model_arch: str, model_id: str,
vllm_runner: type[VllmRunner], monkeypatch):
def test_model_tensor_schema(model_arch: str, model_id: str):
if model_arch in ARCH_TO_SKIP:
pytest.skip(f"Skipping {model_arch} due to {ARCH_TO_SKIP[model_arch]}")
if model_id in REPO_ID_TO_SKIP:
@ -177,14 +198,20 @@ def test_model_tensor_schema(model_arch: str, model_id: str,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
hf_overrides=hf_overrides_fn,
)
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
if not any(
hasattr(model_cls, f"_parse_and_validate_{m}_input")
for m in ["image", "video", "audio"]):
inputs_parse_methods = []
for attr_name in dir(model_cls):
attr = getattr(model_cls, attr_name)
if hasattr(attr, "__annotations__"):
return_type = attr.__annotations__.get("return", None)
if return_type is not None and "Input" in str(return_type):
inputs_parse_methods.append(attr_name)
if not any(inputs_parse_methods):
pytest.skip(f"{model_arch} does not support tensor schema validation.")
ctx = InputProcessingContext(
@@ -197,68 +224,13 @@ def test_model_tensor_schema(model_arch: str, model_id: str,
modality: 3 if limit is None else limit
for modality, limit in supported_mm_limits.items()
}
model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt
processor = factories.build_processor(ctx, cache=None)
# Avoid calling model.forward()
def _initialize_kv_caches_v0(self) -> None:
self.cache_config.num_gpu_blocks = 0
self.cache_config.num_cpu_blocks = 0
def _initialize_kv_caches_v1(self, vllm_config):
kv_cache_specs = self.model_executor.get_kv_cache_specs()
scheduler_kv_cache_config = get_kv_cache_config(
vllm_config,
kv_cache_specs[0],
10 * GiB_bytes,
)
# gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
return 1, 0, scheduler_kv_cache_config
with (patch.object(V0LLMEngine, "_initialize_kv_caches",
_initialize_kv_caches_v0),
patch.object(V1EngineCore, "_initialize_kv_caches",
_initialize_kv_caches_v1), monkeypatch.context() as m):
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
if model_info.v0_only:
m.setenv("VLLM_USE_V1", "0")
# TODO(Isotr0py): Can we avoid initializing engine?
with (
set_default_torch_num_threads(1),
vllm_runner(
model_id,
tokenizer_name=model_info.tokenizer,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
max_model_len=model_info.max_model_len,
load_format="dummy",
hf_overrides=hf_overrides_fn,
limit_mm_per_prompt=limit_mm_per_prompt,
enforce_eager=True,
) as vllm_model,
):
model_config = vllm_model.llm.llm_engine.model_config
llm_engine = vllm_model.llm.llm_engine
if hasattr(llm_engine, "processor"):
# v1 processor
mm_registry = llm_engine.processor.mm_registry
else:
# v0 input_preprocessor
mm_registry = llm_engine.input_preprocessor.mm_registry
processor = mm_registry.create_processor(model_config)
def validate_model_input(model, modality: str,
mm_kwargs: MultiModalKwargs):
method_name = f"_parse_and_validate_{modality}_input"
if hasattr(model, method_name):
getattr(model, method_name)(**mm_kwargs)
for modality, _, mm_kwargs in create_batched_mm_kwargs(
model_config, processor):
valid_func = partial(validate_model_input,
modality=modality,
mm_kwargs=mm_kwargs)
vllm_model.apply_model(valid_func)
with initialize_dummy_model(model_cls, model_config) as model:
for modality, _, mm_kwargs in create_batched_mm_kwargs(
model_config, processor):
for method_name in inputs_parse_methods:
print(f"Testing `{method_name}` with modality={modality} "
f"and mm_kwargs{list(mm_kwargs.keys())}")
getattr(model, method_name)(modality=modality, **mm_kwargs)
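For reference, a minimal sketch of the kind of method the annotation scan above discovers and this loop exercises: any method whose return annotation names an "...Input" type. The class and field names here are hypothetical, not an actual vLLM model:

from typing import Optional, TypedDict

import torch


class DummyImageInput(TypedDict):
    # Hypothetical schema: (num_images, 3, height, width)
    pixel_values: torch.Tensor


class DummyModel:

    # Picked up by the scan because the return annotation contains "Input".
    def _parse_and_validate_image_input(
            self, **kwargs: object) -> Optional[DummyImageInput]:
        pixel_values = kwargs.pop("pixel_values", None)
        if pixel_values is None:
            return None
        if not isinstance(pixel_values, torch.Tensor):
            raise ValueError(
                f"pixel_values must be a torch.Tensor, got {type(pixel_values)}")
        return DummyImageInput(pixel_values=pixel_values)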

View File

@@ -137,6 +137,9 @@ class _HfExamplesInfo:
# yapf: disable
_TEXT_GENERATION_EXAMPLE_MODELS = {
# [Decoder-only]
"ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B",
min_transformers_version="4.56.0",
trust_remote_code=True),
"AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B",
trust_remote_code=True),
"AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B",
@@ -323,8 +326,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
_EMBEDDING_EXAMPLE_MODELS = {
# [Text-only]
"BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5", v0_only=True),
"Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2", v0_only=True), # noqa: E501
"BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"),
"Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"), # noqa: E501
"GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"),
"GteModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
trust_remote_code=True),
@@ -337,9 +340,9 @@ _EMBEDDING_EXAMPLE_MODELS = {
"LlamaModel": _HfExamplesInfo("llama", is_available_online=False),
"MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"),
"ModernBertModel": _HfExamplesInfo("Alibaba-NLP/gte-modernbert-base",
trust_remote_code=True, v0_only=True),
trust_remote_code=True),
"NomicBertModel": _HfExamplesInfo("nomic-ai/nomic-embed-text-v2-moe",
trust_remote_code=True, v0_only=True), # noqa: E501
trust_remote_code=True), # noqa: E501
"Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"),
"Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B",
max_transformers_version="4.53",
@@ -347,9 +350,9 @@ _EMBEDDING_EXAMPLE_MODELS = {
"Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B",
max_transformers_version="4.53",
transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers"), # noqa: E501
"RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2", v0_only=True), # noqa: E501
"RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1", v0_only=True), # noqa: E501
"XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small", v0_only=True), # noqa: E501
"RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"), # noqa: E501
"RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1"), # noqa: E501
"XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small"), # noqa: E501
# [Multimodal]
"LlavaNextForConditionalGeneration": _HfExamplesInfo("royokong/e5-v"),
"Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full",
@@ -364,16 +367,19 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
"GPT2ForSequenceClassification": _HfExamplesInfo("nie3e/sentiment-polish-gpt2-small"), # noqa: E501
# [Cross-encoder]
"BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2", v0_only=True), # noqa: E501
"ModernBertForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-reranker-modernbert-base", v0_only=True), # noqa: E501
"RobertaForSequenceClassification": _HfExamplesInfo("cross-encoder/quora-roberta-base", v0_only=True), # noqa: E501
"XLMRobertaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-m3", v0_only=True), # noqa: E501
"BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2"), # noqa: E501
"GteNewForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-multilingual-reranker-base", # noqa: E501
trust_remote_code=True,
hf_overrides={
"architectures": ["GteNewForSequenceClassification"]}),# noqa: E501
"ModernBertForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-reranker-modernbert-base"), # noqa: E501
"RobertaForSequenceClassification": _HfExamplesInfo("cross-encoder/quora-roberta-base"), # noqa: E501
"XLMRobertaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-m3"), # noqa: E501
}
_AUTOMATIC_CONVERTED_MODELS = {
# Use as_seq_cls_model for automatic conversion
"GemmaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-gemma", # noqa: E501
v0_only=True,
hf_overrides={"architectures": ["GemmaForSequenceClassification"], # noqa: E501
"classifier_from_token": ["Yes"], # noqa: E501
"method": "no_post_processing"}), # noqa: E501
@@ -432,6 +438,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"InternVLForConditionalGeneration": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"), # noqa: E501
"KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
trust_remote_code=True),
"KeyeVL1_5ForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-1_5-8B", # noqa: E501
trust_remote_code=True),
"KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501
trust_remote_code=True),
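For anyone extending this table, a hypothetical entry showing the fields used in the hunks above; it assumes _HfExamplesInfo from tests/models/registry.py is in scope, and the architecture, model ids, and version below are placeholders rather than real additions:

# Hypothetical example only, not part of this diff.
_HYPOTHETICAL_EXAMPLE_MODELS = {
    "MyVLForConditionalGeneration": _HfExamplesInfo(
        "my-org/My-VL-8B-Instruct",
        extras={"thinking": "my-org/My-VL-8B-Thinking"},
        min_transformers_version="4.56.0",
        trust_remote_code=True),
}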

View File

@@ -24,6 +24,9 @@ from .registry import HF_EXAMPLE_MODELS
@pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs())
def test_registry_imports(model_arch):
# Skip if transformers version is incompatible
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
model_info.check_transformers_version(on_fail="skip")
# Ensure all model classes can be imported successfully
model_cls = ModelRegistry._try_load_model_cls(model_arch)
assert model_cls is not None

View File

@@ -3,7 +3,8 @@
import warnings
from collections.abc import Sequence
from typing import Any, NamedTuple, Optional, Union
from dataclasses import dataclass
from typing import Any, Optional, Union
import torch
import torch.nn.functional as F
@@ -339,36 +340,43 @@ def softmax(data):
return F.softmax(data, dim=-1)
class EmbedModelInfo(NamedTuple):
@dataclass
class ModelInfo:
name: str
is_matryoshka: bool = False
matryoshka_dimensions: Optional[list[int]] = None
architecture: str = ""
dtype: str = "auto"
hf_overrides: Optional[dict[str, Any]] = None
default_pooling_type: str = ""
enable_test: bool = True
@dataclass
class EmbedModelInfo(ModelInfo):
is_matryoshka: bool = False
matryoshka_dimensions: Optional[list[int]] = None
@dataclass
class CLSPoolingEmbedModelInfo(EmbedModelInfo):
default_pooling_type: str = "CLS"
@dataclass
class LASTPoolingEmbedModelInfo(EmbedModelInfo):
default_pooling_type: str = "LAST"
class RerankModelInfo(NamedTuple):
name: str
architecture: str = ""
dtype: str = "auto"
default_pooling_type: str = ""
enable_test: bool = True
@dataclass
class RerankModelInfo(ModelInfo):
pass
@dataclass
class CLSPoolingRerankModelInfo(RerankModelInfo):
default_pooling_type: str = "CLS"
@dataclass
class LASTPoolingRerankModelInfo(RerankModelInfo):
default_pooling_type: str = "LAST"
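A short usage sketch of the dataclass hierarchy above; unlike the earlier NamedTuple versions, the subclasses now inherit the shared fields from ModelInfo instead of redeclaring them, and each pooling-specific subclass only overrides default_pooling_type. The concrete values are illustrative:

# Illustrative use of the definitions above.
info = CLSPoolingEmbedModelInfo(
    name="BAAI/bge-base-en-v1.5",
    architecture="BertModel",
)
assert info.default_pooling_type == "CLS"  # subclass default
assert info.dtype == "auto"                # inherited from ModelInfo
assert info.is_matryoshka is False         # inherited from EmbedModelInfo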

Some files were not shown because too many files have changed in this diff.