updated

Signed-off-by: Robert Shaw <robshaw@redhat.com>
updated
2025-07-19 20:02:37 +00:00 · 2025-07-19 19:47:01 +00:00 · 2025-07-19 18:17:53 +00:00 · 2025-07-19 18:13:13 +00:00 · 2025-07-19 17:54:07 +00:00 · 2025-07-19 16:31:48 +00:00
453 changed files with 22250 additions and 24388 deletions
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -24,8 +24,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2

 function cpu_tests() {
  set -e
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@ -6,19 +6,17 @@ set -exuo pipefail

 # Try building the docker image
 cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
-FROM 1.22-413-pt2.7.1:latest
+FROM gaudi-base-image:latest

 COPY ./ /workspace/vllm

 WORKDIR /workspace/vllm

-RUN pip install -v -r requirements/hpu.txt
-RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
-
 ENV no_proxy=localhost,127.0.0.1
 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

-RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
+RUN VLLM_TARGET_DEVICE=empty pip install .
+RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git

 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -70,7 +70,7 @@ export VLLM_XLA_CACHE_PATH=
 echo "Using VLLM V1"

 echo "--- Hardware Information ---"
-tpu-info
+# tpu-info
 echo "--- Starting Tests ---"
 set +e
 overall_script_exit_code=0
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -117,7 +117,7 @@ steps:
  commands:
  - pytest -v -s core

- label: Entrypoints Test # 40min
+- label: Entrypoints Test (LLM) # 40min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  fast_check: true
@ -125,8 +125,6 @@ steps:
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/llm
-  - tests/entrypoints/openai
-  - tests/entrypoints/test_chat_utils
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
@ -135,9 +133,21 @@ steps:
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
  - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+
+- label: Entrypoints Test (API Server) # 40min
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/openai
+  - tests/entrypoints/test_chat_utils
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
  - pytest -v -s entrypoints/test_chat_utils.py
-  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

 - label: Distributed Tests (4 GPUs) # 10min
  mirror_hardwares: [amdexperimental]
@ -149,7 +159,6 @@ steps:
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
-  - tests/spec_decode/e2e/test_integration_dist_tp4
  - tests/compile/test_basic_correctness
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
@ -172,7 +181,6 @@ steps:
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
-  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  - pushd ../examples/offline_inference
@ -320,17 +328,6 @@ steps:
    - pytest -v -s samplers
    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers

- label: Speculative decoding tests # 40min
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-  - vllm/spec_decode
-  - tests/spec_decode
-  - vllm/model_executor/models/eagle.py
-  commands:
-    - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
-    - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
-    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
-
 - label: LoRA Test %N # 15min each
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
@ -630,6 +627,18 @@ steps:
    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
    # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

+- label: Transformers Nightly Models Test
+  working_dir: "/vllm-workspace/"
+  optional: true
+  commands:
+    - pip install --upgrade git+https://github.com/huggingface/transformers
+    - pytest -v -s tests/models/test_initialization.py
+    - pytest -v -s tests/models/multimodal/processing/
+    - pytest -v -s tests/models/multimodal/test_mapping.py
+    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/offline_inference/audio_language.py --model-type whisper
+    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+
 #####  1 GPU test  #####
 #####  multi gpus test  #####

@ -704,7 +713,6 @@ steps:
  - pytest -v -s distributed/test_sequence_parallel.py
  # this test fails consistently.
  # TODO: investigate and fix
-  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
--- a/.gemini/config.yaml
+++ b/.gemini/config.yaml
@ -0,0 +1,6 @@
+# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
+have_fun: false  # Just review the code
+code_review:
+  comment_severity_threshold: HIGH  # Reduce quantity of comments
+  pull_request_opened:
+    summary: false  # Don't summarize the PR in a separate comment
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -16,6 +16,7 @@
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm
 /vllm/entrypoints @aarnphm
+/vllm/compilation @zou3519 @youkaichao @ProExpertProg
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson

 # Any change to the VllmConfig changes can have a large user-facing impact,
@ -42,7 +43,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/multimodal @DarkLight1337 @ywang96
 /tests/prefix_caching @comaniac @KuntaiDu
 /tests/quantization @mgoin @robertgshaw2-redhat
-/tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -164,10 +164,7 @@ pull_request_rules:
  description: Automatically apply speculative-decoding label
  conditions:
    - or:
-      - files~=^vllm/spec_decode/
      - files~=^vllm/v1/spec_decode/
-      - files=vllm/model_executor/layers/spec_decode_base_sampler.py
-      - files~=^tests/spec_decode/
      - files~=^tests/v1/spec_decode/
      - files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py
      - files~=^vllm/model_executor/models/.*eagle.*\.py
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -21,7 +21,7 @@ repos:
  - id: ruff-format
    files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
-  rev: v1.32.0
+  rev: v1.34.0
  hooks:
  - id: typos
 - repo: https://github.com/PyCQA/isort
@ -166,7 +166,7 @@ repos:
    language: python
    types: [python]
    pass_filenames: true
-    files: vllm/config.py|tests/test_config.py
+    files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
  # Keep `suggestion` last
  - id: suggestion
    name: Suggestion
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -45,7 +45,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")
 set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")

 #
@ -553,7 +553,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
    set(SRCS
-      "csrc/attention/mla/cutlass_mla_kernels.cu")
+      "csrc/attention/mla/cutlass_mla_kernels.cu"
+      "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${MLA_ARCHS}")
--- a/README.md
+++ b/README.md
@ -63,8 +63,6 @@ vLLM is fast with:
 - Speculative decoding
 - Chunked prefill

-**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.
-
 vLLM is flexible and easy to use with:

 - Seamless integration with popular Hugging Face models
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@ -0,0 +1,137 @@
+# Automated vLLM Server Parameter Tuning
+
+This script automates the process of finding the optimal server parameter combination (`max-num-seqs` and `max-num-batched-tokens`) to maximize throughput for a vLLM server. It also supports additional constraints such as E2E latency and prefix cache hit rate.
+
+## Table of Contents
+- [Prerequisites](#prerequisites)
+- [Configuration](#configuration)
+- [How to Run](#how-to-run)
+- [Example Use Cases](#example-use-cases)
+- [Output](#output)
+- [How It Works](#how-it-works)
+
+## Prerequisites
+
+Before running the script, please ensure the following steps are completed:
+
+1. **Clone vLLM & Set Up Branch**: Clone the vLLM repository and check out your desired branch.
+
+```bash
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+# git checkout <your-branch>
+```
+
+2. **Install Environment**: Install or update the correct running environment. For TPU usage, activate your `conda` environment and install the corresponding `torch` and `torch_xla` versions.
+
+3. **Model Configuration**: If you are using a customized model, ensure its configuration files are correctly placed and accessible.
+
+## Configuration
+
+You must set the following variables at the top of the script before execution.
+
+| Variable | Description | Example Value |
+| --- | --- | --- |
+| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
+| `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` |
+| `SYSTEM`| **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (On other systems, saving profiles might not be supported.) | `"TPU"` |
+| `TP` | **Required.** The tensor-parallelism size. | `1` |
+| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
+| `INPUT_LEN` | **Required.** Request input length. | `4000` |
+| `OUTPUT_LEN` | **Required.** Request output length. | `16` |
+| `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` |
+| `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. | `500` |
+| `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` |
+| `NUM_BATCHED_TOKENS_LIST` | A space-separated string of `max-num-batched-tokens` values to test. | `"1024 2048 4096"` |
+
+**Note**: The default `NUM_SEQS_LIST` and `NUM_BATCHED_TOKENS_LIST` are set for medium-sized inputs/outputs. For very short contexts (e.g., 20 input, 20 output tokens), you may need to test larger values for `max-num-seqs`.
+
+## How to Run
+
+1. **Configure**: Edit the script and set the variables in the [Configuration](#configuration) section.
+2. **Execute**: Run the script. Since the process can take a long time, it is highly recommended to use a terminal multiplexer like `tmux` or `screen` to prevent the script from stopping if your connection is lost.
+
+```
+cd <FOLDER_OF_THIS_SCRIPT>
+bash auto_tune.sh
+```
+
+    Please note that the path used to invoke `bash auto_tune.sh` must not contain the keyword `vllm`, either in full or in part; otherwise the `pkill -f vllm` command will also kill this script itself.
+
+## Example Use Cases
+
+Here are a few examples of how to configure the script for different goals:
+
+### 1. Maximize Throughput (No Latency Constraint)
+- **Goal**: Find the best `max-num-seqs` and `max-num-batched-tokens` to get the highest possible throughput for 1800 input tokens and 20 output tokens.
+- **Configuration**:
+
+```bash
+INPUT_LEN=1800
+OUTPUT_LEN=20
+MIN_CACHE_HIT_PCT=0
+MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
+```
+
+### 2. Maximize Throughput with a Latency Requirement
+- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
+- **Configuration**:
+
+```bash
+INPUT_LEN=1800
+OUTPUT_LEN=20
+MIN_CACHE_HIT_PCT=0
+MAX_LATENCY_ALLOWED_MS=500
+```
+
+### 3. Maximize Throughput with Prefix Caching and Latency Requirements
+- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
+- **Configuration**:
+
+```bash
+INPUT_LEN=1800
+OUTPUT_LEN=20
+MIN_CACHE_HIT_PCT=60
+MAX_LATENCY_ALLOWED_MS=500
+```
+
+## Output
+
+After the script finishes, you will find the results in a new, timestamped directory created inside `$BASE/auto-benchmark/`.
+
+- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run:
+    - `vllm_log_...txt`: The log output from the vLLM server for each parameter combination.
+    - `bm_log_...txt`: The log output from the `benchmark_serving.py` script for each benchmark run.
+
+- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found.
+
+```
+# Example result.txt content
+hash:a1b2c3d4...
+max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 10.0, e2el: 450.5, throughput: 9.8, goodput: 9.8
+max_num_seqs: 128, max_num_batched_tokens: 4096 does not meet latency requirement 500
+...
+best_max_num_seqs: 256, best_num_batched_tokens: 2048, best_throughput: 12.5, profile saved in: /home/user/vllm/auto-benchmark/2024_08_01_10_30/profile
+```
+
+  If it cannot find the best parameters, the final row will be `best_max_num_seqs: 0, best_num_batched_tokens: 0, best_throughput: 0`. This can be due to either the server not starting properly, or the latency requirement being too strict.
+
+- **Profiler Trace**: A directory named `profile` is created inside the log directory. It contains the profiler trace file (e.g., `.xplane.pb` for TPU or a `.json` trace for GPU) from the single best-performing run.
+
+## How It Works
+
+The script follows a systematic process to find the optimal parameters:
+
+1. **Find Max GPU Memory Utilization**: The script first determines the highest safe `gpu-memory-utilization` (starting from 0.98 and decreasing) that does not cause an Out-Of-Memory (OOM) error when launching the server. This ensures the benchmark runs use the maximum available memory without crashing.
+
+2. **Iterate and Benchmark**: It then enters a nested loop, iterating through every combination of `max-num-seqs` and `max-num-batched-tokens` provided in the configuration lists.
+
+3. **Latency-Aware Throughput Search**: For each parameter combination:
+    - The vLLM server is started.
+    - A benchmark is first run with an infinite request rate (`--request-rate inf`).
+    - If the resulting P99 E2E latency is within the `MAX_LATENCY_ALLOWED_MS` limit, this throughput is considered the maximum for this configuration.
+    - If the latency is too high, the script performs a search by iteratively decreasing the request rate until the latency constraint is met. This finds the highest sustainable throughput for the given parameters and latency requirement.
+
+4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.
+
+5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard.
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@ -1,36 +1,7 @@
 #!/bin/bash

 # This script aims to tune the best server parameter combinations to maximize throughput for given requirement. 
-# The current server parameter combination is  max_num_seqs and max_num_batched_tokens
-# It also supports additional requirement: e2e latency and prefix cache. 
-
-# Pre-requisite:
-# 1. Checkout to your branch, install/ update the correct running env. For TPU, activate conda env and install the corresponding torch, xla version. 
-# 2. If the model is customized, replace the MODEL's config with the customized config.
-# 3. Set variables (ALL REQUIRED)
-#   BASE: your directory for vllm repo
-#   MODEL: the model served by vllm
-#   SYSTEM: the hardware, choice TPU or GPU, for other systems, "get best profile" might not support.
-#   TP: ways of tensor parallelism
-#   DOWNLOAD_DIR: directory to download and load model weights.
-#   INPUT_LEN: request input len
-#   OUTPUT_LEN: request output len
-#   MIN_CACHE_HIT_PCT: prefix cache rate
-#   MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
-#   NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
-#   NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
-#   Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
-# 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
-# 5. The final result will be saved in RESULT file. 
-
-
-# Example use cases 
-# 1. Given input_len=1800, output_len=20, what's the best max_num_seqs and max_num_batched_tokens to get highest throughput?
-# Use INPUT_LEN=1800,  OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000
-# 2. If we have latency requirement to be lower than 500ms, what's the best server parameter?
-# Use INPUT_LEN=1800,  OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500
-# 3. If we want to reach 60% prefix cache, what's the best server parameter? 
-# Use INPUT_LEN=1800,  OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500
+# See details in README (benchmarks/auto_tune/README.md).

 TAG=$(date +"%Y_%m_%d_%H_%M")
 BASE=""
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@ -80,6 +80,11 @@ def bench_run(
        a, score, topk, renormalize=False
    )

+    ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
+    ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
+    c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
+    c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
+
    def run_triton_moe(
        a: torch.Tensor,
        w1: torch.Tensor,
@ -111,6 +116,10 @@ def bench_run(
        w2: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
+        ab_strides1: torch.Tensor,
+        ab_strides2: torch.Tensor,
+        c_strides1: torch.Tensor,
+        c_strides2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        per_act_token: bool,
@ -125,6 +134,10 @@ def bench_run(
                topk_ids,
                w1_scale,
                w2_scale,
+                ab_strides1,
+                ab_strides2,
+                c_strides1,
+                c_strides2,
                per_act_token,
                a1_scale=None,
            )
@ -136,6 +149,10 @@ def bench_run(
        w2_q: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
+        ab_strides1: torch.Tensor,
+        ab_strides2: torch.Tensor,
+        c_strides1: torch.Tensor,
+        c_strides2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
    ):
@ -150,6 +167,10 @@ def bench_run(
                topk_ids,
                w1_scale,
                w2_scale,
+                ab_strides1,
+                ab_strides2,
+                c_strides1,
+                c_strides2,
                per_act_token,
                a1_scale=None,
            )
@ -194,6 +215,10 @@ def bench_run(
            w2_q,
            w1_scale,
            w2_scale,
+            ab_strides1,
+            ab_strides2,
+            c_strides1,
+            c_strides2,
            topk_weights,
            topk_ids,
        )
@ -231,6 +256,10 @@ def bench_run(
        "w1_scale": w1_scale,
        "w2_scale": w2_scale,
        "per_act_token": per_act_token,
+        "ab_strides1": ab_strides1,
+        "ab_strides2": ab_strides2,
+        "c_strides1": c_strides1,
+        "c_strides2": c_strides2,
        # cuda graph params
        "cutlass_graph": cutlass_graph,
        "triton_graph": triton_graph,
@ -289,6 +318,10 @@ def bench_run(
        w2_q,
        w1_scale,
        w2_scale,
+        ab_strides1,
+        ab_strides2,
+        c_strides1,
+        c_strides2,
        topk_weights,
        topk_ids,
        per_act_token,
@ -297,7 +330,7 @@ def bench_run(

    results.append(
        benchmark.Timer(
-            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)",  # noqa: E501
+            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -586,6 +586,11 @@ def main(args: argparse.Namespace):
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
+        E = config.num_experts
+        topk = config.moe_topk[0]
+        intermediate_size = config.moe_intermediate_size[0]
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
    else:
        # Support for llama4
        config = config.get_text_config()
--- a/benchmarks/kv_cache/benchmark_block_pool.py
+++ b/benchmarks/kv_cache/benchmark_block_pool.py
@ -0,0 +1,108 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import gc
+import time
+from typing import Optional
+
+from tabulate import tabulate
+
+from vllm.utils import FlexibleArgumentParser
+from vllm.v1.core.block_pool import BlockPool
+
+
+class Metric:
+    # Running count, sum and maximum of a series of integer samples.
+    def __init__(self) -> None:
+        self.cnt: int = 0  # number of samples recorded so far
+        self.sum_v: int = 0  # sum of all recorded samples
+        self.max_v: Optional[int] = None  # largest sample; None until first update
+
+    def update(self, v: int) -> None:
+        # Record one sample: bump the count, add to the sum, track the maximum.
+        self.cnt += 1
+        self.sum_v += v
+        if self.max_v is None:
+            self.max_v = v
+        else:
+            self.max_v = max(self.max_v, v)
+
+    def avg_v(self) -> float:
+        # Mean of the recorded samples; raises ZeroDivisionError when cnt is 0.
+        return self.sum_v * 1.0 / self.cnt
+
+
+def main(args):
+    # Time BlockPool allocation and release, then print a summary table.
+    #
+    # For every size in args.allocate_blocks a fresh BlockPool with
+    # args.num_gpu_blocks blocks is created, and get_new_blocks / free_blocks
+    # are each timed args.num_iteration times. One table row is produced per
+    # size, holding the average and maximum latency of both calls.
+    rows = []
+    for allocate_block in args.allocate_blocks:
+        # Enforce a GC collect ahead to minimize the impact among runs
+        gc.collect()
+        block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
+
+        get_blocks_metric: Metric = Metric()
+        free_blocks_metric: Metric = Metric()
+        for _ in range(args.num_iteration):
+            # t1 -> t2 brackets the allocation, t2 -> t3 brackets the release.
+            t1 = time.monotonic_ns()
+            blocks = block_pool.get_new_blocks(allocate_block)
+            t2 = time.monotonic_ns()
+            block_pool.free_blocks(blocks)
+            t3 = time.monotonic_ns()
+            get_blocks_metric.update(t2 - t1)
+            free_blocks_metric.update(t3 - t2)
+
+        # max_v is still None only if the timing loop above never ran.
+        if get_blocks_metric.max_v is not None and free_blocks_metric.max_v is not None:
+            # Samples are in nanoseconds; dividing by 1e6 reports milliseconds.
+            rows.append(
+                [
+                    get_blocks_metric.cnt,
+                    args.num_gpu_blocks,
+                    allocate_block,
+                    get_blocks_metric.avg_v() / 1000000,
+                    get_blocks_metric.max_v / 1000000.0,
+                    free_blocks_metric.avg_v() / 1000000,
+                    free_blocks_metric.max_v / 1000000.0,
+                ]
+            )
+        else:
+            print(
+                "No valid metrics found."
+                f" {get_blocks_metric.max_v=} {free_blocks_metric.max_v=}"
+            )
+
+    print(
+        tabulate(
+            rows,
+            headers=[
+                "Iterations",
+                "Total\nBlocks",
+                "Allocated\nBlocks",
+                "Get Blocks\nAvg (ms)",
+                "Get Blocks\nMax (ms)",
+                "Free Blocks\nAvg (ms)",
+                "Free Blocks\nMax (ms)",
+            ],
+            tablefmt="grid",
+            floatfmt=".6f",
+        )
+    )
+
+
+def invoke_main() -> None:
+    parser = FlexibleArgumentParser(
+        description="Benchmark the performance of BlockPool for KV Cache."
+    )
+    parser.add_argument("--num-gpu-blocks", type=int, default=100000)
+    parser.add_argument(
+        "--num-iteration",
+        type=int,
+        default=1000,
+        help="Number of iterations to run to stablize final data readings",
+    )
+    parser.add_argument(
+        "--allocate-blocks",
+        type=int,
+        nargs="*",
+        default=[10, 50, 100, 500, 1000],
+        help="Number of blocks to allocate",
+    )
+    args = parser.parse_args()
+    main(args)
+
+
+if __name__ == "__main__":
+    invoke_main()  # pragma: no cover
--- a/csrc/attention/attention_kernels.cuh
+++ b/csrc/attention/attention_kernels.cuh
@ -24,6 +24,7 @@

 #include "attention_dtypes.h"
 #include "attention_utils.cuh"
+#include "cuda_compat.h"

 #ifdef USE_ROCM
  #include <hip/hip_bf16.h>
@ -33,12 +34,6 @@ typedef __hip_bfloat16 __nv_bfloat16;
  #include "../quantization/fp8/nvidia/quant_utils.cuh"
 #endif

-#ifndef USE_ROCM
-  #define WARP_SIZE 32
-#else
-  #define WARP_SIZE warpSize
-#endif
-
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
@ -670,7 +665,6 @@ __global__ void paged_attention_v2_reduce_kernel(

 }  // namespace vllm

-#undef WARP_SIZE
 #undef MAX
 #undef MIN
 #undef DIVIDE_ROUND_UP
--- a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
@ -0,0 +1,372 @@
+/***************************************************************************************************
+ * Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*
+ * Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929
+ * by Alcanderian JieXin Liang
+ */
+
+/*!
+ \file
+ \brief A universal device layer for cutlass 3.x-style kernels.
+*/
+
+// clang-format off
+#pragma once
+
+// common
+#include "cutlass/cutlass.h"
+#include "cutlass/device_kernel.h"
+
+#if !defined(__CUDACC_RTC__)
+#include "cutlass/cluster_launch.hpp"
+#include "cutlass/trace.h"
+#endif // !defined(__CUDACC_RTC__)
+
+#include "../kernel/sm100_fmha_mla_tma_warpspecialized.hpp"
+#include "../kernel/sm100_fmha_mla_reduction.hpp"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::fmha::device {
+
+using namespace cute;
+using namespace cutlass::fmha::kernel;
+
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////// CUTLASS 3.x API /////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+template<
+    class Kernel_
+>
+class MLA {
+public:
+
+  /// The main kernel wrapped by this device-level handle.
+  using Kernel = Kernel_;
+
+  /// Second-stage kernel that run() launches when split_kv > 1.
+  /// Template arguments, in order: output element, accumulator element,
+  /// scale element (the accumulator type is reused), number of heads,
+  /// latent head dimension, and the compile-time maximum split count.
+  using ReductionKernel = cutlass::fmha::kernel::Sm100FmhaMlaReductionKernel<
+      typename Kernel::ElementOut,
+      typename Kernel::ElementAcc,
+      typename Kernel::ElementAcc,
+      Kernel::TileShapeH::value,
+      Kernel::TileShapeL::value,
+      256 /*Max split*/
+  >;
+
+  /// Argument structure: User API
+  using KernelArguments = typename Kernel::Arguments;
+  using ReductionArguments = typename ReductionKernel::Arguments;
+
+  // Callers only ever supply the main kernel's arguments; the reduction
+  // arguments are derived from them by to_reduction_args().
+  using Arguments = KernelArguments;
+
+  /// Argument structure: Kernel API
+  using KernelParams = typename Kernel::Params;
+  using ReductionParams = typename ReductionKernel::Params;
+  /// Parameters for both launches performed by run().
+  struct Params {
+    KernelParams fmha_params;            // passed to device_kernel<Kernel>
+    ReductionParams reduction_params;    // passed to device_kernel<ReductionKernel>
+  };
+
+private:
+
+  /// Kernel API parameters object
+  Params params_;
+
+  /// One-way latch backed by a function-local static, so the flag is shared by
+  /// every instance of this MLA<Kernel_> instantiation. Called with
+  /// set == true it latches the flag; otherwise it only reports the current
+  /// state. The flag is never cleared.
+  bool is_initialized(bool set = false) {
+    static bool initialized = false;
+    if (!set) {
+      return initialized;
+    }
+    initialized = true;
+    return true;
+  }
+
+  /// Derives the reduction kernel's arguments from the main kernel's arguments.
+  ///
+  /// The initializer is positional, so its order must match the field order of
+  /// ReductionKernel::Arguments: ptr_oaccum, ptr_o, ptr_lseaccum, ptr_lse,
+  /// scale, num_batches, split_kv, dim_k, ptr_seq, ptr_split_kv, tile_shape_s.
+  /// Both accumulator pointers are left null here; initialize() and update()
+  /// fill them in from the main kernel's params when split_kv > 1.
+  static ReductionArguments to_reduction_args(Arguments const& args) {
+    auto [H, K, D, B] = args.problem_shape;
+    return ReductionArguments{
+      nullptr, args.epilogue.ptr_o, nullptr, args.epilogue.ptr_lse,
+      args.mainloop.softmax_scale, B, args.split_kv, K, args.mainloop.ptr_seq,
+      args.ptr_split_kv, Kernel::TileShapeS::value
+    };
+  }
+
+public:
+
+  /// Access the Params structure most recently stored by initialize() or
+  /// update(); it is default-constructed until one of them has run.
+  Params const& params() const {
+    return params_;
+  }
+
+  /// Picks args.split_kv heuristically when the caller has not chosen one.
+  ///
+  /// A caller-supplied value (split_kv >= 1) is left untouched. Otherwise:
+  ///  1. the KV length K is cut into chunks of 128, capped at 16 splits;
+  ///  2. the split count is limited to the SMs available per batch entry;
+  ///  3. the result is rebalanced so the splits are spread evenly over the
+  ///     number of passes that limit implies.
+  static void set_split_kv (KernelArguments& args) {
+    if (args.split_kv >= 1) return;
+    auto [H, K, D, B] = args.problem_shape;
+    int sm_count = args.hw_info.sm_count;
+    int max_splits = ceil_div(K, 128);
+    max_splits = min(16, max_splits);
+    // Keep at least one split so that K == 0 cannot turn split_heur (and then
+    // the divisor of the ceil_div calls below) into zero.
+    max_splits = max(1, max_splits);
+    int sms_per_batch = max(1, sm_count / B);
+    int split_heur = min(max_splits, sms_per_batch);
+    // Number of passes needed when only split_heur splits run at a time.
+    int k_waves = ceil_div(max_splits, split_heur);
+    int split_wave_aware = ceil_div(max_splits, k_waves);
+    args.split_kv = split_wave_aware;
+  }
+
+  /// Reports whether both the main kernel and the reduction kernel accept the
+  /// given arguments. The reduction check is skipped when the main kernel
+  /// already rejects them.
+  static Status
+  can_implement(Arguments const& args) {
+    bool const supported =
+        Kernel::can_implement(args) &&
+        ReductionKernel::can_implement(to_reduction_args(args));
+    return supported ? Status::kSuccess : Status::kInvalid;
+  }
+
+  /// Returns the combined workspace requirement, in bytes, of the main kernel
+  /// and the reduction kernel.
+  static size_t
+  get_workspace_size(Arguments const& args) {
+    size_t const kernel_bytes = Kernel::get_workspace_size(args);
+    size_t const reduction_bytes = ReductionKernel::get_workspace_size(to_reduction_args(args));
+    return kernel_bytes + reduction_bytes;
+  }
+
+  /// Computes the maximum number of active blocks per multiprocessor
+  /// for the main kernel (the reduction kernel is not considered).
+  ///
+  /// The parameter is unnamed and ignored. Returns -1 if either CUDA runtime
+  /// call fails.
+  static int maximum_active_blocks(int /* smem_capacity */ = -1) {
+    CUTLASS_TRACE_HOST("MLA::maximum_active_blocks()");
+    int max_active_blocks = -1;
+    int smem_size = Kernel::SharedStorageSize;
+
+    // first, account for dynamic smem capacity if needed
+    // (only requested once the kernel needs 48 KiB or more)
+    cudaError_t result;
+    if (smem_size >= (48 << 10)) {
+      CUTLASS_TRACE_HOST("  Setting smem size to " << smem_size);
+      result = cudaFuncSetAttribute(
+          device_kernel<Kernel>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize,
+          smem_size);
+      if (cudaSuccess != result) {
+        result = cudaGetLastError(); // to clear the error bit
+        CUTLASS_TRACE_HOST(
+          "  cudaFuncSetAttribute() returned error: "
+          << cudaGetErrorString(result));
+        return -1;
+      }
+    }
+
+    // query occupancy after setting smem size
+    result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks,
+        device_kernel<Kernel>,
+        Kernel::MaxThreadsPerBlock,
+        smem_size);
+
+    if (cudaSuccess != result) {
+      result = cudaGetLastError(); // to clear the error bit
+      CUTLASS_TRACE_HOST(
+        "  cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error: "
+        << cudaGetErrorString(result));
+      return -1;
+    }
+
+    CUTLASS_TRACE_HOST("  max_active_blocks: " << max_active_blocks);
+    return max_active_blocks;
+  }
+
+  /// Initializes the internal Params from arguments.
+  ///
+  /// Initializes the workspace of both kernels, converts the arguments into
+  /// kernel params, wires the main kernel's accumulator buffers into the
+  /// reduction params when split_kv > 1, and stores the result in params_.
+  /// The first successful call additionally raises the main kernel's dynamic
+  /// shared-memory limit when it needs 48 KiB or more.
+  ///
+  /// Returns the first non-success status from workspace initialization,
+  /// Status::kErrorInternal if cudaFuncSetAttribute fails, otherwise
+  /// Status::kSuccess.
+  ///
+  /// NOTE(review): unlike update(), this does not check for a null workspace
+  /// when get_workspace_size(args) is non-zero - confirm callers guarantee it.
+  Status
+  initialize(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) {
+    CUTLASS_TRACE_HOST("MLA::initialize() - workspace "
+      << workspace << ", stream: " << (stream ? "non-null" : "null"));
+
+    // Initialize the workspace
+    Status status = Kernel::initialize_workspace(args, workspace, stream);
+    if (status != Status::kSuccess) {
+      return status;
+    }
+    status = ReductionKernel::initialize_workspace(to_reduction_args(args), workspace, stream);
+    if (status != Status::kSuccess) {
+      return status;
+    }
+    KernelParams kernel_params = Kernel::to_underlying_arguments(args, workspace);
+
+    // The reduction reads the partial results that the main kernel writes, so
+    // its accumulator pointers are taken from the main kernel's params.
+    ReductionArguments reduction_args = to_reduction_args(args);
+    if (reduction_args.split_kv > 1) {
+      reduction_args.ptr_oaccum   = kernel_params.epilogue.ptr_o_acc;
+      reduction_args.ptr_lseaccum = kernel_params.epilogue.ptr_lse_acc;
+    }
+    ReductionParams reduction_params = ReductionKernel::to_underlying_arguments(reduction_args, workspace);
+    // Initialize the Params structure
+    params_ = Params {kernel_params, reduction_params};
+
+    // NOTE(review): is_initialized() is backed by a function-local static, so
+    // the attribute setup below runs once per template instantiation rather
+    // than once per object - confirm that is sufficient if more than one
+    // device is used within a single process.
+    if (is_initialized()) return Status::kSuccess;
+
+    // account for dynamic smem capacity if needed
+    // no dynamic smem is needed for reduction kernel
+    int smem_size = Kernel::SharedStorageSize;
+    if (smem_size >= (48 << 10)) {
+      CUTLASS_TRACE_HOST("  Setting smem size to " << smem_size);
+      cudaError_t result = cudaFuncSetAttribute(
+          device_kernel<Kernel>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize,
+          smem_size);
+      if (cudaSuccess != result) {
+        result = cudaGetLastError(); // to clear the error bit
+        CUTLASS_TRACE_HOST("  cudaFuncSetAttribute() returned error: " << cudaGetErrorString(result));
+        return Status::kErrorInternal;
+      }
+    }
+
+    is_initialized(true);
+
+    return Status::kSuccess;
+  }
+
+  /// Update API is preserved in 3.0, but does not guarantee a lightweight update of params.
+  ///
+  /// Rebuilds params_ from args without touching the workspace contents or the
+  /// kernel's shared-memory attribute. Returns Status::kErrorWorkspaceNull when
+  /// a workspace is required but none is supplied, otherwise Status::kSuccess.
+  Status
+  update(Arguments const& args, void* workspace = nullptr) {
+    CUTLASS_TRACE_HOST("MLA()::update() - workspace: " << workspace);
+
+    size_t workspace_bytes = get_workspace_size(args);
+    if (workspace_bytes > 0 && nullptr == workspace) {
+      return Status::kErrorWorkspaceNull;
+    }
+
+    auto fmha_params = Kernel::to_underlying_arguments(args, workspace);
+
+    // As in initialize(): when the KV range is split, the reduction consumes
+    // the accumulator buffers exposed by the main kernel's params.
+    ReductionArguments reduction_args = to_reduction_args(args);
+    if (reduction_args.split_kv > 1) {
+      reduction_args.ptr_oaccum   = fmha_params.epilogue.ptr_o_acc;
+      reduction_args.ptr_lseaccum = fmha_params.epilogue.ptr_lse_acc;
+    }
+    ReductionParams reduction_params = ReductionKernel::to_underlying_arguments(reduction_args, workspace);
+    // Initialize the Params structure
+    params_ = Params {fmha_params, reduction_params};
+
+    return Status::kSuccess;
+  }
+
+  /// Primary run() entry point API that is static, allowing users to create and manage their own params.
+  /// Both members of the supplied params struct must have been produced by the
+  /// respective kernel's to_underlying_arguments(), as initialize() and update() do.
+  ///
+  /// Launches the main kernel on `stream` and, when split_kv > 1, the reduction
+  /// kernel on the same stream. Returns Status::kErrorInternal if either launch
+  /// reports an error, otherwise Status::kSuccess.
+  static Status
+  run(Params& params, cudaStream_t stream = nullptr) {
+    CUTLASS_TRACE_HOST("MLA::run()");
+    dim3 const block = Kernel::get_block_shape();
+    dim3 const grid = Kernel::get_grid_shape(params.fmha_params);
+
+    // configure smem size and carveout
+    int smem_size = Kernel::SharedStorageSize;
+
+    Status launch_result;
+    // Use extended launch API only for mainloops that use it
+    if constexpr(Kernel::ArchTag::kMinComputeCapability >= 90) {
+      dim3 cluster(cute::size<0>(typename Kernel::ClusterShape{}),
+                   cute::size<1>(typename Kernel::ClusterShape{}),
+                   cute::size<2>(typename Kernel::ClusterShape{}));
+      void const* kernel = (void const*) device_kernel<Kernel>;
+      void* kernel_params[] = {&params.fmha_params};
+      launch_result = ClusterLauncher::launch(grid, cluster, block, smem_size, stream, kernel, kernel_params);
+    }
+    else {
+      launch_result = Status::kSuccess;
+      device_kernel<Kernel><<<grid, block, smem_size, stream>>>(params.fmha_params);
+    }
+
+    cudaError_t result = cudaGetLastError();
+    if (cudaSuccess != result or Status::kSuccess != launch_result) {
+      CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << cudaGetErrorString(result));
+      return Status::kErrorInternal;
+    }
+
+    // No reduction pass is launched unless the KV range was split.
+    if (params.reduction_params.split_kv <= 1) {
+      return Status::kSuccess;
+    }
+
+    // launch reduction kernel (it uses no dynamic shared memory)
+    dim3 const reduction_block = ReductionKernel::get_block_shape();
+    dim3 const reduction_grid  = ReductionKernel::get_grid_shape(params.reduction_params);
+    device_kernel<ReductionKernel><<<reduction_grid, reduction_block, 0, stream>>>(params.reduction_params);
+    result = cudaGetLastError();
+    if (cudaSuccess != result) {
+      CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << cudaGetErrorString(result));
+      return Status::kErrorInternal;
+    }
+    return Status::kSuccess;
+  }
+
+  //
+  // Non-static launch overloads that first create and set the internal params struct of this kernel handle.
+  //
+
+  /// Builds the internal Params from the supplied arguments via initialize()
+  /// and, only if that succeeds, launches the kernels on `stream`.
+  Status
+  run(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) {
+    Status const init_status = initialize(args, workspace, stream);
+    if (Status::kSuccess != init_status) {
+      return init_status;
+    }
+    return run(params_, stream);
+  }
+
+  /// Call-operator form of run(args, workspace, stream): constructs the
+  /// internal Params from the supplied arguments, then launches.
+  Status
+  operator()(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) {
+    return run(args, workspace, stream);
+  }
+
+  /// Overload that allows a user to re-launch the same kernel without updating internal params struct.
+  /// It launches with whatever params_ currently holds, so initialize() or
+  /// update() must have populated it beforehand.
+  Status
+  run(cudaStream_t stream = nullptr) {
+    return run(params_, stream);
+  }
+
+  /// Call-operator form of run(stream): re-launches with the stored params_
+  /// without updating them.
+  Status
+  operator()(cudaStream_t stream = nullptr) {
+    return run(params_, stream);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::fmha::device
+
+////////////////////////////////////////////////////////////////////////////////
--- a/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_reduction.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_reduction.hpp
@ -0,0 +1,203 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*
+ * Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929
+ * by Alcanderian JieXin Liang
+ */
+
+// clang-format off
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/arch/arch.h"
+#include "cute/tensor.hpp"
+
+namespace cutlass::fmha::kernel {
+
+using namespace cute;
+/// Combines the per-split partial attention results into the final output.
+///
+/// One thread block of MaxThreadsPerBlock threads handles one (head, batch)
+/// pair: warp 0 turns the per-split log-sum-exp (LSE) values into one scale
+/// per split, then all threads accumulate the scaled partial outputs.
+///
+/// Template parameters:
+///   ElementOut      element type of the final output
+///   ElementAcc      element type of the partial outputs and LSE values
+///   ElementScale    element type of Arguments::scale
+///   kNumHeads       number of heads (grid x-extent)
+///   kHeadDimLatent  output elements per (head, batch) pair; must be a
+///                   multiple of MaxThreadsPerBlock
+///   kMaxSplits      capacity of the per-split scale table in shared memory
+template<
+    class ElementOut,
+    class ElementAcc,
+    class ElementScale,
+    size_t kNumHeads,
+    size_t kHeadDimLatent,
+    int kMaxSplits
+>
+struct Sm100FmhaMlaReductionKernel {
+
+  static const int SharedStorageSize = 0;
+  static const int MaxThreadsPerBlock = 128;
+  static const int MinBlocksPerMultiprocessor = 1;
+
+  using ArchTag = cutlass::arch::Sm100;
+
+  // Every thread handles kHeadDimLatent / MaxThreadsPerBlock output elements.
+  static_assert(kHeadDimLatent % MaxThreadsPerBlock == 0);
+  struct Arguments {
+    ElementAcc* ptr_oaccum = nullptr;    // partial outputs, one slice per split (input)
+    ElementOut* ptr_o = nullptr;         // final output
+    ElementAcc* ptr_lseaccum = nullptr;  // per-split LSE values (input)
+    ElementAcc* ptr_lse = nullptr;       // combined LSE output; skipped when null
+    ElementScale scale = 1.f;            // carried along; not read by operator() here
+    int num_batches = 0;                 // grid z-extent
+    int split_kv = -1;                   // split count used for buffer strides
+    int dim_k = -1;                      // KV length used when ptr_seq is null
+    int* ptr_seq = nullptr;              // optional per-batch KV length
+    int* ptr_split_kv = nullptr;         // optional per-batch split count
+    int tile_shape_s = 128;              // KV tile size used to count tiles
+  };
+  using Params = Arguments;
+
+  /// Params is an alias of Arguments, so this is a field-by-field copy; the
+  /// workspace pointer is not used.
+  static Params to_underlying_arguments(Arguments const& args, void* workspace) {
+    return {args.ptr_oaccum, args.ptr_o, args.ptr_lseaccum, args.ptr_lse,
+            args.scale, args.num_batches, args.split_kv, args.dim_k, args.ptr_seq,
+            args.ptr_split_kv, args.tile_shape_s};
+  }
+
+  /// The reduction needs no workspace of its own.
+  static size_t get_workspace_size(Arguments const& /*args*/) {
+    return 0;
+  }
+
+  /// Nothing to initialize; always succeeds.
+  static Status initialize_workspace(
+      Arguments const& /*args*/, void* /*ws*/, cudaStream_t /*stream*/) {
+    return Status::kSuccess;
+  }
+
+  /// One block per (head, batch) pair: x indexes the head, z the batch.
+  static dim3 get_grid_shape(Params const& params) {
+    return dim3(kNumHeads, 1, params.num_batches);
+  }
+
+  static dim3 get_block_shape() {
+    return dim3(MaxThreadsPerBlock, 1, 1);
+  }
+
+  /// Host-side argument validation.
+  static bool can_implement(Arguments const& args) {
+    if (args.num_batches <= 0) return false;
+    if (args.split_kv <= 0) return false;
+    // operator() keeps one scale per split in sLseScale[kMaxSplits]; a larger
+    // split count would index past that table. Per-batch values behind
+    // ptr_split_kv live on the device and cannot be validated here.
+    if (args.split_kv > kMaxSplits) return false;
+    return true;
+  }
+
+  /// Device entry point. smem_raw is unused.
+  CUTLASS_DEVICE void operator() (Params const& params, char* smem_raw) {
+    // With a single split there are no partial results to combine.
+    if (params.split_kv <= 1) return;
+    auto blk_coord = make_coord(blockIdx.x, _0{}, blockIdx.z);
+
+    // Per-split weights, written by warp 0 and read by every thread below.
+    __shared__ ElementAcc sLseScale[kMaxSplits];
+    // Per-split LSE values are indexed as [batch][split][head].
+    const size_t offset_lseaccum = get<0>(blk_coord) + kNumHeads * params.split_kv * get<2>(blk_coord);
+    const size_t offset_lse = get<0>(blk_coord) + kNumHeads * get<2>(blk_coord);
+
+    Tensor gLSEaccum = make_tensor(make_gmem_ptr(params.ptr_lseaccum + offset_lseaccum),
+                                   make_shape(params.split_kv), Stride<Int<kNumHeads>>{});
+
+    Tensor gLSE = make_tensor(make_gmem_ptr(params.ptr_lse + offset_lse),
+                              Shape<_1>{}, Stride<_1>{});
+
+    // Per-batch overrides take precedence over the uniform values.
+    auto dim_k = params.ptr_seq == nullptr ?  params.dim_k : params.ptr_seq[get<2>(blk_coord)];
+    auto local_split_kv = params.ptr_split_kv == nullptr ? params.split_kv : params.ptr_split_kv[get<2>(blk_coord)];
+    // Shrink the split count to the number of splits that get at least one KV
+    // tile when the tiles are dealt out k_tile_per_cta at a time.
+    // NOTE(review): presumably mirrors the main kernel's partitioning, and
+    // dim_k == 0 would make k_tile_per_cta zero - confirm both against the
+    // main kernel.
+    auto k_tile_total = ceil_div(dim_k, params.tile_shape_s);
+    auto k_tile_per_cta = ceil_div(k_tile_total, local_split_kv);
+    local_split_kv = ceil_div(k_tile_total, k_tile_per_cta);
+
+    int warp_idx = cutlass::canonical_warp_idx_sync();
+    if (warp_idx == 0) {
+      // Each of the 32 lanes owns every 32nd split.
+      constexpr int kNLsePerThread = cute::ceil_div(kMaxSplits, 32);
+
+      ElementAcc local_lse[kNLsePerThread];
+
+      // Splits beyond local_split_kv contribute -inf, i.e. weight zero.
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kNLsePerThread; ++i) {
+        const int split = i * 32 + threadIdx.x;
+        local_lse[i] = split < local_split_kv ? gLSEaccum(split) : -std::numeric_limits<ElementAcc>::infinity();
+      }
+
+      // Maximum LSE: first per lane, then across the warp via shuffles.
+      ElementAcc lse_max = -std::numeric_limits<ElementAcc>::infinity();
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kNLsePerThread; ++i) {
+        lse_max = max(lse_max, local_lse[i]);
+      }
+      CUTLASS_PRAGMA_UNROLL
+      for (int offset = 16; offset >= 1; offset /= 2) {
+        lse_max = max(lse_max, __shfl_xor_sync(0xffffffff, lse_max, offset));
+      }
+      lse_max = lse_max == -std::numeric_limits<ElementAcc>::infinity() ? 0.0f : lse_max;  // In case all local LSEs are -inf
+      lse_max = __shfl_sync(0xffffffff, lse_max, 0);
+
+      // Sum of exp(lse - max), again per lane and then across the warp.
+      ElementAcc sum_lse = 0;
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kNLsePerThread; ++i) {
+        sum_lse = sum_lse + expf(local_lse[i] - lse_max);
+      }
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int offset = 16; offset >= 1; offset /= 2) {
+        sum_lse = sum_lse + __shfl_xor_sync(0xffffffff, sum_lse, offset);
+      }
+
+      sum_lse = __shfl_sync(0xffffffff, sum_lse, 0);
+
+      // A zero or NaN sum yields +inf, which drives every scale below to zero.
+      ElementAcc global_lse = (sum_lse == 0.f || sum_lse != sum_lse) ? std::numeric_limits<ElementAcc>::infinity() : logf(sum_lse) + lse_max;
+      if (threadIdx.x == 0 and params.ptr_lse != nullptr) {
+        gLSE(0) = global_lse;
+      }
+
+      // Weight of each split relative to the combined LSE.
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kNLsePerThread; ++i) {
+        const int split = i * 32 + threadIdx.x;
+        if (split < local_split_kv) {
+          sLseScale[split] = expf(local_lse[i] - global_lse);
+        }
+      }
+    }
+    // Make warp 0's scales visible to the whole block.
+    __syncthreads();
+
+    constexpr int Elements = kHeadDimLatent / MaxThreadsPerBlock;
+    // Partial outputs are indexed as [batch][head][split][dim].
+    const size_t offset_oaccum = kHeadDimLatent * params.split_kv * (get<0>(blk_coord) + kNumHeads * get<2>(blk_coord));
+    Tensor gOaccum = make_tensor(make_gmem_ptr(params.ptr_oaccum + offset_oaccum),
+                               Shape<Int<kHeadDimLatent>>{}, Stride<_1>{});
+    ElementAcc local_val[Elements] = {0};
+    for (int split = 0; split < local_split_kv; ++split) {
+      ElementAcc lse_scale = sLseScale[split];
+      CUTLASS_PRAGMA_UNROLL
+      for(int i = 0; i < Elements; ++i) {
+        local_val[i] += lse_scale * gOaccum(threadIdx.x + MaxThreadsPerBlock * i);
+      }
+      // Step to the next split's slice.
+      gOaccum.data() = gOaccum.data() + kHeadDimLatent;
+    }
+    auto ptr_o_local = params.ptr_o + (get<0>(blk_coord) + get<2>(blk_coord) * kNumHeads) * kHeadDimLatent;
+    Tensor gO = make_tensor(make_gmem_ptr(ptr_o_local), Shape<Int<kHeadDimLatent>>{}, Stride<_1>{});
+
+    CUTLASS_PRAGMA_UNROLL
+    for(int i = 0; i < Elements; ++i) {
+      gO(threadIdx.x + MaxThreadsPerBlock * i) = static_cast<ElementOut>(local_val[i]);
+    }
+  }
+};
+
+}  // namespace cutlass::fmha::kernel
--- a/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
--- a/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_mla_tile_scheduler.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_mla_tile_scheduler.hpp
@ -0,0 +1,165 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*
+ * Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929
+ * by Alcanderian JieXin Liang
+ */
+
+// clang-format off
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/kernel_hardware_info.h"
+
+namespace cutlass::fmha::kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Non-persistent scheduler: every thread block processes exactly one tile,
+/// identified directly by its block index.
+struct Sm100MlaIndividualTileScheduler {
+
+  /// The launch grid, fixed on the host.
+  struct Params {
+    dim3 grid;
+  };
+
+  // True until the block's single tile has been consumed by operator++.
+  bool valid_ = true;
+
+  CUTLASS_DEVICE
+  Sm100MlaIndividualTileScheduler(Params const&) {}
+
+  /// Builds the grid: x = extent 0 of the cluster shape, y = element 3 of the
+  /// problem shape (batch), z = split_kv. hw_info is not used.
+  template<class ProblemShape, class ClusterShape>
+  static Params to_underlying_arguments(
+      ProblemShape const& problem_shape, KernelHardwareInfo hw_info,
+      ClusterShape const& cluster_shape, int const& split_kv) {
+    using namespace cute;
+    return Params{
+      dim3(get<0>(cluster_shape), get<3>(problem_shape) /* Batch */, split_kv /*Maximum Split KV*/)
+    };
+  }
+
+  static dim3 get_grid_shape(Params const& params) {
+    return params.grid;
+  }
+
+  CUTLASS_DEVICE
+  bool is_valid() {
+    return valid_;
+  }
+
+  /// Coordinate of this block's tile: (blockIdx.x, 0, blockIdx.y, blockIdx.z).
+  CUTLASS_DEVICE
+  auto get_block_coord() {
+    using namespace cute;
+    return make_coord(blockIdx.x, _0{}, blockIdx.y, blockIdx.z);
+  }
+
+  /// Advancing past the only tile invalidates the scheduler.
+  CUTLASS_DEVICE
+  Sm100MlaIndividualTileScheduler& operator++() {
+    valid_ = false;
+    return *this;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Persistent scheduler: the grid is capped at the SM count and every thread
+/// block walks the linear tile index space in steps of gridDim.x.
+struct Sm100MlaPersistentTileScheduler {
+
+  struct Params {
+    int num_blocks;              // total tiles: m-blocks * batch * split_kv
+    FastDivmod divmod_m_block;   // divisor: extent 0 of the cluster shape
+    FastDivmod divmod_b;         // divisor: element 3 of the problem shape (batch)
+    FastDivmod divmod_split_kv;  // divisor: split_kv
+    KernelHardwareInfo hw_info;  // carries the SM count used to size the grid
+  };
+
+  // Linear index of the tile this block is currently working on.
+  int block_idx = 0;
+  Params params;
+
+  CUTLASS_DEVICE
+  Sm100MlaPersistentTileScheduler(Params const& params) : block_idx(blockIdx.x), params(params) {}
+
+  /// Builds the scheduler params on the host. The SM count from hw_info is
+  /// replaced by a device query when it is <= 1 or not a multiple of extent 0
+  /// of the cluster shape.
+  template<class ProblemShape, class ClusterShape>
+  static Params to_underlying_arguments(
+      ProblemShape const& problem_shape, KernelHardwareInfo hw_info,
+      ClusterShape const& cluster_shape, int const& split_kv) {
+    using namespace cute;
+    // Get SM count if needed, otherwise use user supplied SM count
+    int sm_count = hw_info.sm_count;
+    if (sm_count <= 1 || sm_count % size<0>(cluster_shape) != 0) {
+      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
+          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
+      sm_count = KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
+    }
+
+    CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count);
+    hw_info.sm_count = sm_count;
+
+    int num_m_blocks = size<0>(cluster_shape);
+    int num_blocks = num_m_blocks * get<3>(problem_shape)  /* Batch */;
+    num_blocks *= split_kv; /* Maximum Split KV*/
+
+    return Params {
+      num_blocks,
+      { num_m_blocks}, { get<3>(problem_shape) }, {split_kv},
+      hw_info
+    };
+  }
+
+  /// One-dimensional grid with at most one block per SM.
+  static dim3 get_grid_shape(Params const& params) {
+    dim3 grid(std::min(params.num_blocks, params.hw_info.sm_count), 1, 1);
+    return grid;
+  }
+
+  CUTLASS_DEVICE
+  bool is_valid() {
+    return block_idx < params.num_blocks;
+  }
+
+  /// Decodes the linear tile index into (m_block, 0, batch, split).
+  CUTLASS_DEVICE
+  auto get_block_coord() {
+    using namespace cute;
+    int block_decode = block_idx;
+    int m_block, bidb, n_split_kv;
+    // Each step peels one coordinate off block_decode, which is passed as both
+    // input and output, so the order of these calls matters.
+    // NOTE(review): assumes FastDivmod::operator() takes
+    // (quotient, remainder, dividend) - confirm against cutlass/fast_math.h.
+    params.divmod_m_block(block_decode, m_block, block_decode);
+    params.divmod_b(block_decode, bidb, block_decode);
+    params.divmod_split_kv(block_decode, n_split_kv, block_decode);
+    return make_coord(m_block, _0{}, bidb, n_split_kv);
+  }
+
+  /// Moves on to this block's next tile, one grid width further along.
+  CUTLASS_DEVICE
+  Sm100MlaPersistentTileScheduler& operator++() {
+    block_idx += gridDim.x;
+    return *this;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::fmha::kernel
--- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
+++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
@ -0,0 +1,283 @@
+/*
+Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+/*
+ * Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929
+ * by Alcanderian JieXin Liang
+ */
+#include "core/registration.h"
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/kernel_hardware_info.h>
+#include <torch/all.h>
+
+#include <cute/tensor.hpp>
+#include <iostream>
+
+#include "cutlass_sm100_mla/device/sm100_mla.hpp"
+#include "cutlass_sm100_mla/kernel/sm100_mla_tile_scheduler.hpp"
+
+// clang-format off
+#if !defined(CUDA_VERSION) || CUDA_VERSION < 12040
+// Fallback used when the CUDA toolkit is older than 12.4: always raises.
+// The parameter list mirrors the real implementation in the #else branch
+// (including sm_scale) so that the op registration at the bottom of this file
+// binds the same C++ signature in both build configurations.
+void sm100_cutlass_mla_decode(
+    torch::Tensor const& out,
+    torch::Tensor const& q_nope,
+    torch::Tensor const& q_pe,
+    torch::Tensor const& kv_c_and_k_pe_cache,
+    torch::Tensor const& seq_lens,
+    torch::Tensor const& page_table,
+    torch::Tensor const& workspace,
+    double sm_scale,
+    int64_t num_kv_splits) {
+  TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_decode");
+}
+// Fallback used when the CUDA toolkit is older than 12.4: always raises.
+int64_t sm100_cutlass_mla_get_workspace_size(
+    int64_t max_seq_len,
+    int64_t num_batches,
+    int64_t sm_count,
+    int64_t num_kv_splits) {
+  TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_get_workspace_size");
+}
+#else
+
+#define CUTLASS_CHECK(status)                                                       \
+  {                                                                                 \
+    cutlass::Status error = status;                                                 \
+    TORCH_CHECK(error == cutlass::Status::kSuccess, cutlassGetStatusString(error)); \
+  }
+
+using namespace cute;
+using namespace cutlass::fmha::kernel;
+
+// Compile-time tag: MlaSm100 reads ::value to choose between the persistent
+// and the individual tile scheduler.
+template <bool v>
+struct IsPersistent {
+  static constexpr bool value = v;
+};
+
+// Bundle of compile-time types describing one SM100 MLA kernel configuration.
+//   T                 - element type used for the inputs and the output.
+//   IsPaged128        - when false, the kernel is built with kIsCpAsync=true.
+//   PersistenceOption - IsPersistent<...> tag selecting the tile scheduler.
+template <typename T, bool IsPaged128, typename PersistenceOption = IsPersistent<true>>
+struct MlaSm100 {
+  using Element = T;
+  using ElementAcc = float;
+  using ElementOut = T;
+
+  // Fixed tile: mode 0 is H, mode 2 is the (D_latent, D_rope) pair (512, 64).
+  using TileShape = Shape<_128, _128, Shape<_512, _64>>;
+  using TileShapeH = cute::tuple_element_t<0, TileShape>;
+  using TileShapeD = cute::tuple_element_t<2, TileShape>;
+
+  // H K (D_latent D_rope) B
+  // H and D are the static tile extents; K and B are runtime ints.
+  using ProblemShape = cute::tuple<TileShapeH, int, TileShapeD, int>;
+
+  using StrideQ = cute::tuple<int64_t, _1, int64_t>;  // H D B
+  using StrideK = cute::tuple<int64_t, _1, int64_t>;  // K D B
+  using StrideO = StrideK;                            // H D B
+  using StrideLSE = cute::tuple<_1, int>;             // H B
+
+  using TileScheduler =
+      std::conditional_t<PersistenceOption::value, Sm100MlaPersistentTileScheduler, Sm100MlaIndividualTileScheduler>;
+
+  // NOTE(review): the second ElementAcc below is presumably the LSE element
+  // type - confirm against Sm100FmhaMlaKernelTmaWarpspecialized.
+  using FmhaKernel = cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized<
+      TileShape,
+      Element,
+      ElementAcc,
+      ElementOut,
+      ElementAcc,
+      TileScheduler,
+      /*kIsCpAsync=*/!IsPaged128>;
+  using Fmha = cutlass::fmha::device::MLA<FmhaKernel>;
+};
+
+// Translate the Torch tensors of one decode call into the CUTLASS
+// `T::Fmha::Arguments` structure.
+//   T             - an MlaSm100<...> configuration.
+//   q_nope, q_pe  - query tensors; dim 0 is read as the batch count and
+//                   stride(0)/stride(1) feed the batch/head strides.
+//   kv_c_and_k_pe_cache - paged cache; dim 0 is the total page count and
+//                   dim 1 the page size.
+//   page_table    - dim 1 is the number of pages per sequence.
+//   sm_scale      - softmax scale, narrowed to float.
+//   num_kv_splits - forwarded as split_kv before set_split_kv() runs.
+// Returns the populated arguments after T::Fmha::set_split_kv() was applied.
+template <typename T>
+typename T::Fmha::Arguments args_from_options(
+    at::Tensor const& out,
+    at::Tensor const& q_nope,
+    at::Tensor const& q_pe,
+    at::Tensor const& kv_c_and_k_pe_cache,
+    at::Tensor const& seq_lens,
+    at::Tensor const& page_table,
+    double sm_scale,
+    int64_t num_kv_splits) {
+  // The SM count is queried from the device that holds q_nope.
+  cutlass::KernelHardwareInfo hw_info;
+  hw_info.device_id = q_nope.device().index();
+  hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
+
+  int batches = q_nope.sizes()[0];
+  int page_count_per_seq = page_table.sizes()[1];
+  int page_count_total = kv_c_and_k_pe_cache.sizes()[0];
+  int page_size = kv_c_and_k_pe_cache.sizes()[1];
+  // Largest sequence length one page-table row can address.
+  int max_seq_len = page_size * page_count_per_seq;
+  using TileShapeH = typename T::TileShapeH;
+  using TileShapeD = typename T::TileShapeD;
+  auto problem_shape = cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches);
+
+  auto [H, K, D, B] = problem_shape;
+  auto [D_latent, D_rope] = D;
+
+  float scale = float(sm_scale);
+
+  using StrideQ = typename T::StrideQ;
+  using StrideK = typename T::StrideK;
+  using StrideO = typename T::StrideO;
+  using StrideLSE = typename T::StrideLSE;
+
+  // Query strides come straight from the tensors: (head, 1, batch).
+  StrideQ stride_Q_nope = cute::make_tuple(
+      static_cast<int64_t>(q_nope.stride(1)), _1{}, static_cast<int64_t>(q_nope.stride(0)));
+  StrideQ stride_Q_pe = cute::make_tuple(
+      static_cast<int64_t>(q_pe.stride(1)), _1{}, static_cast<int64_t>(q_pe.stride(0)));
+
+  // Cache, LSE and output strides are computed from the shapes, not read from
+  // the tensors. The `0 +` turns the static tile extents into runtime values.
+  // NOTE(review): this presumes a densely packed cache and output - confirm
+  // with the callers.
+  StrideK stride_C = cute::make_tuple(
+      static_cast<int64_t>(0 + D_latent + D_rope), _1{}, static_cast<int64_t>(page_size * (D_latent + D_rope)));
+  StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq);
+  StrideLSE stride_LSE = cute::make_tuple(_1{}, 0 + H);
+  StrideO stride_O = cute::make_tuple(static_cast<int64_t>(0 + D_latent), _1{}, static_cast<int64_t>(0 + H * D_latent));
+
+  using Element = typename T::Element;
+  using ElementOut = typename T::ElementOut;
+  using ElementAcc = typename T::ElementAcc;
+  auto Q_nope_ptr = static_cast<Element*>(q_nope.data_ptr());
+  auto Q_pe_ptr = static_cast<Element*>(q_pe.data_ptr());
+  auto C_ptr = static_cast<Element*>(kv_c_and_k_pe_cache.data_ptr());
+  // The cache pointer is passed twice: once as-is and once advanced by
+  // D_latent elements, both with stride_C.
+  // NOTE(review): presumably the second view is the rope part of each cache
+  // row - confirm against the kernel's argument struct.
+  // The pointer paired with stride_LSE in the output group is null.
+  typename T::Fmha::Arguments arguments{
+      problem_shape,
+      {scale,
+       Q_nope_ptr,
+       stride_Q_nope,
+       Q_pe_ptr,
+       stride_Q_pe,
+       C_ptr,
+       stride_C,
+       C_ptr + D_latent,
+       stride_C,
+       static_cast<int*>(seq_lens.data_ptr()),
+       static_cast<int*>(page_table.data_ptr()),
+       stride_PT,
+       page_count_total,
+       page_size},
+      {static_cast<ElementOut*>(out.data_ptr()), stride_O, static_cast<ElementAcc*>(nullptr), stride_LSE},
+      hw_info,
+      // TODO(trevor-m): Change split_kv back to -1 when
+      // https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will
+      // perform worse with larger context length and smaller batch sizes.
+      // NOTE(review): the TODO above looks stale - the caller-supplied
+      // num_kv_splits is forwarded here, not a hard-coded 1.
+      num_kv_splits, // split_kv
+      nullptr,       // is_var_split_kv
+  };
+  // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
+  // split_kv automatically based on batch size and sequence length to balance
+  // workload across available SMs. Consider using var_split_kv for manual
+  // control if needed.
+  T::Fmha::set_split_kv(arguments);
+  return arguments;
+}
+
+// Build the kernel arguments for one MLA configuration and launch the kernel
+// on `stream`. Each CUTLASS call is wrapped in CUTLASS_CHECK, which raises a
+// Torch error when the returned status is not kSuccess.
+template <typename Element, bool IsPaged128, typename PersistenceOption>
+void runMla(
+    at::Tensor const& out,
+    at::Tensor const& q_nope,
+    at::Tensor const& q_pe,
+    at::Tensor const& kv_c_and_k_pe_cache,
+    at::Tensor const& seq_lens,
+    at::Tensor const& page_table,
+    at::Tensor const& workspace,
+    double sm_scale,
+    int64_t num_kv_splits,
+    cudaStream_t stream) {
+  using Config = MlaSm100<Element, IsPaged128, PersistenceOption>;
+  using Op = typename Config::Fmha;
+
+  Op op;
+  auto args = args_from_options<Config>(
+      out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);
+  void* const workspace_ptr = workspace.data_ptr();
+
+  // Validate, bind the workspace, then launch.
+  CUTLASS_CHECK(op.can_implement(args));
+  CUTLASS_CHECK(op.initialize(args, workspace_ptr, stream));
+  CUTLASS_CHECK(op.run(args, workspace_ptr, stream));
+}
+
+// Turn the runtime condition `expr` into a `constexpr bool` named
+// `const_expr`, then invoke the callable passed as the trailing argument(s)
+// in whichever branch matches, so the callable can use `const_expr` as a
+// template argument. The callable must return a value convertible to bool
+// because the wrapping lambda is declared `-> bool`.
+#define DISPATCH_BOOL(expr, const_expr, ...) \
+  [&]() -> bool {                            \
+    if (expr) {                              \
+      constexpr bool const_expr = true;      \
+      return __VA_ARGS__();                  \
+    } else {                                 \
+      constexpr bool const_expr = false;     \
+      return __VA_ARGS__();                  \
+    }                                        \
+  }()
+
+// Torch entry point for the SM100 CUTLASS MLA decode kernel.
+// Selects the kernel instantiation from three runtime properties:
+//   - page size (dim 1 of kv_c_and_k_pe_cache) equal to 128 or not,
+//   - num_kv_splits <= 1 (persistent scheduler) or a manual split count,
+//   - the dtype of q_nope (Half, BFloat16 or Float8_e4m3fn).
+// Raises via TORCH_CHECK for any other dtype.
+void sm100_cutlass_mla_decode(
+    torch::Tensor const& out,
+    torch::Tensor const& q_nope,
+    torch::Tensor const& q_pe,
+    torch::Tensor const& kv_c_and_k_pe_cache,
+    torch::Tensor const& seq_lens,
+    torch::Tensor const& page_table,
+    torch::Tensor const& workspace,
+    double sm_scale,
+    int64_t num_kv_splits) {
+  auto in_dtype = q_nope.dtype();
+  // Build the guard from the tensor's Device instead of a (char)-cast index:
+  // the cast depended on the signedness of plain char, which differs between
+  // platforms, and on the width of the device index type.
+  at::cuda::CUDAGuard device_guard(q_nope.device());
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(q_nope.get_device());
+  const int page_size = kv_c_and_k_pe_cache.sizes()[1];
+
+  // NOTE(alcanderian): IsPersistent has bug with manual split_kv.
+  // Kernel will hang if batch is too large with large num_kv_splits. (for example bs=8, num_kv_splits=8)
+  // Maybe per batch split kv will fix this.
+  // Hence the persistent scheduler is only selected when num_kv_splits <= 1.
+  DISPATCH_BOOL(page_size == 128, IsPaged128, [&] {
+    DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] {
+      if (in_dtype == at::ScalarType::Half) {
+        runMla<cutlass::half_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+      } else if (in_dtype == at::ScalarType::BFloat16) {
+        runMla<cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+      } else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
+        runMla<cutlass::float_e4m3_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+      } else {
+        TORCH_CHECK(false, "Unsupported input data type of MLA");
+      }
+      // DISPATCH_BOOL requires its callable to return bool.
+      return true;
+    });
+    return true;
+  });
+}
+
+// Report the workspace size the SM100 MLA decode kernel needs for a problem
+// with the given maximum sequence length, batch count and KV split count.
+// A non-positive sm_count means the SM count is queried from the device.
+int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) {
+  // The workspace holds ElementAcc / ElementLSE values, both float, so the
+  // element type picked for this configuration does not affect the result.
+  using Config = MlaSm100<cutlass::half_t, true>;
+  using Op = typename Config::Fmha;
+  using TileShapeH = typename Config::TileShapeH;
+  using TileShapeD = typename Config::TileShapeD;
+
+  // Only the problem shape and SM count are needed to derive split_kv.
+  typename Op::Arguments args;
+  args.problem_shape =
+      cute::make_tuple(TileShapeH{}, static_cast<int>(max_seq_len), TileShapeD{}, static_cast<int>(num_batches));
+
+  // Assumes device 0 when the SM count has to be queried.
+  if (sm_count <= 0) {
+    args.hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(/*device_id=*/0);
+  } else {
+    args.hw_info.sm_count = sm_count;
+  }
+
+  args.split_kv = num_kv_splits;
+  Op::set_split_kv(args);
+
+  return Op::get_workspace_size(args);
+}
+
+#endif
+
+// Register the decode op for the CUDA dispatch key. This sits outside the
+// CUDA_VERSION guard, so either the real implementation or the raising stub
+// is bound depending on the toolkit version.
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("sm100_cutlass_mla_decode", &sm100_cutlass_mla_decode);
+}
+
+// The workspace-size query takes only integer arguments (no tensors), and is
+// registered under CatchAll rather than a device-specific key.
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CatchAll, m) {
+  m.impl("sm100_cutlass_mla_get_workspace_size", &sm100_cutlass_mla_get_workspace_size);
+}
+
+// clang-format on
--- a/csrc/attention/paged_attention_v1.cu
+++ b/csrc/attention/paged_attention_v1.cu
@ -18,12 +18,7 @@
 */

 #include "attention_kernels.cuh"
-
-#ifndef USE_ROCM
-  #define WARP_SIZE 32
-#else
-  #define WARP_SIZE warpSize
-#endif
+#include "cuda_compat.h"

 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -187,7 +182,6 @@ void paged_attention_v1(
                             CALL_V1_LAUNCHER_BLOCK_SIZE)
 }

-#undef WARP_SIZE
 #undef MAX
 #undef MIN
 #undef DIVIDE_ROUND_UP
--- a/csrc/attention/paged_attention_v2.cu
+++ b/csrc/attention/paged_attention_v2.cu
@ -18,12 +18,7 @@
 */

 #include "attention_kernels.cuh"
-
-#ifndef USE_ROCM
-  #define WARP_SIZE 32
-#else
-  #define WARP_SIZE warpSize
-#endif
+#include "cuda_compat.h"

 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -197,7 +192,6 @@ void paged_attention_v2(
                             CALL_V2_LAUNCHER_BLOCK_SIZE)
 }

-#undef WARP_SIZE
 #undef MAX
 #undef MIN
 #undef DIVIDE_ROUND_UP
--- a/csrc/cpu/sgl-kernels/common.h
+++ b/csrc/cpu/sgl-kernels/common.h
@ -58,7 +58,7 @@ namespace {

 #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
 #define CHECK_LAST_DIM_CONTIGUOUS(x) \
-  TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimention")
+  TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimension")

 #define CHECK_INPUT(x) \
  CHECK_CPU(x);        \
--- a/csrc/cpu/sgl-kernels/gemm.h
+++ b/csrc/cpu/sgl-kernels/gemm.h
@ -126,7 +126,7 @@ void fused_experts_int4_w4a16_kernel_impl(
    int64_t topk,
    int64_t num_tokens_post_pad);

-// shared expert implememntation for int8 w8a8
+// shared expert implementation for int8 w8a8
 template <typename scalar_t>
 void shared_expert_int8_kernel_impl(
    scalar_t* __restrict__ output,
--- a/csrc/cpu/sgl-kernels/gemm_int8.cpp
+++ b/csrc/cpu/sgl-kernels/gemm_int8.cpp
@ -41,7 +41,7 @@ struct tinygemm_kernel_nn<at::BFloat16, has_bias, BLOCK_M, BLOCK_N> {
    __m512  vd0;
    __m512  vd1[COLS];

-    // oops! 4x4 spills but luckly we use 4x2
+    // oops! 4x4 spills but luckily we use 4x2
    __m512 vbias[COLS];

    // [NOTE]: s8s8 igemm compensation in avx512-vnni
--- a/csrc/cpu/sgl-kernels/vec.h
+++ b/csrc/cpu/sgl-kernels/vec.h
@ -37,7 +37,7 @@ inline Vectorized<at::BFloat16> convert_from_float_ext<at::BFloat16>(const Vecto
 #define CVT_FP16_TO_FP32(a) \
    _mm512_cvtps_ph(a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))

-// this doesn't hanel NaN.
+// this doesn't handle NaN.
 inline __m512bh cvt_e4m3_bf16_intrinsic_no_nan(__m256i fp8_vec) {
  const __m512i x = _mm512_cvtepu8_epi16(fp8_vec);

--- a/csrc/cuda_compat.h
+++ b/csrc/cuda_compat.h
@ -4,10 +4,10 @@
  #include <hip/hip_runtime.h>
 #endif

-#ifndef USE_ROCM
-  #define WARP_SIZE 32
+#if defined(USE_ROCM) && defined(__GFX9__)
+  #define WARP_SIZE 64
 #else
-  #define WARP_SIZE warpSize
+  #define WARP_SIZE 32
 #endif

 #ifndef USE_ROCM
--- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
+++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
@ -7,7 +7,11 @@

 #include <c10/util/BFloat16.h>
 #include <c10/util/Half.h>
-#include <c10/cuda/CUDAException.h>  // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK
+#ifdef USE_ROCM
+    #include <c10/hip/HIPException.h>  // For C10_HIP_CHECK and C10_HIP_KERNEL_LAUNCH_CHECK
+#else
+    #include <c10/cuda/CUDAException.h>  // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK
+#endif

 #ifndef USE_ROCM
    #include <cub/block/block_load.cuh>
@ -320,8 +324,13 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
                dim3 grid(params.batch, params.dim / kNRows);
                auto kernel = &selective_scan_fwd_kernel<Ktraits>;
                if (kSmemSize >= 48 * 1024) {
+#ifdef USE_ROCM
+                    C10_HIP_CHECK(hipFuncSetAttribute(
+                        reinterpret_cast<const void*>(kernel), hipFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
+#else
                    C10_CUDA_CHECK(cudaFuncSetAttribute(
                        kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
+#endif
                }
                kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
                C10_CUDA_KERNEL_LAUNCH_CHECK();
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@ -160,6 +160,30 @@ __global__ void shuffleInputRowsKernel(const T* input,
  }
 }

+// Fallback row shuffle used when num_cols cannot be aligned to 128 bits.
+// The block with index blockIdx.x copies one destination row element by
+// element:
+//   output[dst, :] = input[dst2src_map[dst], :]
+// num_src_rows is accepted for signature parity with the fast kernel and is
+// not read here.
+template <typename T>
+__global__ void shuffleInputRowsKernelSlow(const T* input,
+                                           const int32_t* dst2src_map,
+                                           T* output, int64_t num_src_rows,
+                                           int64_t num_dst_rows,
+                                           int64_t num_cols) {
+  int64_t const dest_row_idx = blockIdx.x;
+  // Bail out before touching dst2src_map so that surplus blocks never read
+  // past the end of the map.
+  if (dest_row_idx >= num_dst_rows) {
+    return;
+  }
+
+  int64_t const source_row_idx = dst2src_map[dest_row_idx];
+
+  // Duplicate and permute rows
+  auto const* source_row_ptr = input + source_row_idx * num_cols;
+  auto* dest_row_ptr = output + dest_row_idx * num_cols;
+
+  int64_t const start_offset = threadIdx.x;
+  int64_t const stride = blockDim.x;
+
+  // The counter is 64-bit to match num_cols; an int counter could overflow
+  // for very wide rows.
+  for (int64_t elem_index = start_offset; elem_index < num_cols;
+       elem_index += stride) {
+    dest_row_ptr[elem_index] = source_row_ptr[elem_index];
+  }
+}
+
 void shuffle_rows(const torch::Tensor& input_tensor,
                  const torch::Tensor& dst2src_map,
                  torch::Tensor& output_tensor) {
@ -173,17 +197,24 @@ void shuffle_rows(const torch::Tensor& input_tensor,
  int64_t const num_src_rows = input_tensor.size(0);
  int64_t const num_cols = input_tensor.size(1);

-  TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)),
-              "num_cols must be divisible by 128 / "
-              "sizeof(input_tensor.scalar_type()) / 8");
-
-  MOE_DISPATCH(input_tensor.scalar_type(), [&] {
-    shuffleInputRowsKernel<scalar_t><<<blocks, threads, 0, stream>>>(
-        reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
-        dst2src_map.data_ptr<int32_t>(),
-        reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
-        num_dest_rows, num_cols);
-  });
+  if (num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)) {
+    // use slow kernel if num_cols can't be aligned to 128 bits
+    MOE_DISPATCH(input_tensor.scalar_type(), [&] {
+      shuffleInputRowsKernelSlow<scalar_t><<<blocks, threads, 0, stream>>>(
+          reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
+          dst2src_map.data_ptr<int32_t>(),
+          reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
+          num_dest_rows, num_cols);
+    });
+  } else {
+    MOE_DISPATCH(input_tensor.scalar_type(), [&] {
+      shuffleInputRowsKernel<scalar_t><<<blocks, threads, 0, stream>>>(
+          reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
+          dst2src_map.data_ptr<int32_t>(),
+          reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
+          num_dest_rows, num_cols);
+    });
+  }
 }

 #else
--- a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu
@ -29,19 +29,36 @@ struct sm90_fp8_config_default {

 template <typename InType, typename OutType,
          template <typename, typename, typename> typename Epilogue>
-struct sm90_fp8_config_M16 {
-  // M in [1, 16]
+struct sm90_fp8_config_M4 {
+  // M in [1, 4]
  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
  using KernelSchedule =
      cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum;
  using EpilogueSchedule =
      cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
-  using TileShape = cute::Shape<cute::_64, cute::_64, cute::_128>;
-  using ClusterShape = cute::Shape<cute::_1, cute::_4, cute::_1>;
+  using TileShape = cute::Shape<cute::_128, cute::_16, cute::_128>;
+  using ClusterShape = cute::Shape<cute::_1, cute::_1, cute::_1>;

  using Cutlass3xGemm =
      cutlass_3x_group_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
+                            KernelSchedule, EpilogueSchedule, true>;
+};
+
+// SM90 FP8 grouped-GEMM configuration for M in (4, 64]. Only
+// cutlass::float_e4m3_t inputs are accepted (see the static_assert).
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_M64 {
+  // M in (4, 64]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule =
+      cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule =
+      cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
+  using TileShape = cute::Shape<cute::_128, cute::_16, cute::_256>;
+  using ClusterShape = cute::Shape<cute::_2, cute::_1, cute::_1>;
+
+  // The trailing `true` sets cutlass_3x_group_gemm's swap_ab_ parameter.
+  using Cutlass3xGemm =
+      cutlass_3x_group_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                            KernelSchedule, EpilogueSchedule, true>;
 };

 template <typename InType, typename OutType,
@ -102,7 +119,9 @@ void run_cutlass_moe_mm_sm90(
      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
  using Cutlass3xGemmK8192 = typename sm90_fp8_config_K8192<
      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
-  using Cutlass3xGemmM16 = typename sm90_fp8_config_M16<
+  using Cutlass3xGemmM4 = typename sm90_fp8_config_M4<
+      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
+  using Cutlass3xGemmM64 = typename sm90_fp8_config_M64<
      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
  using Cutlass3xGemmDefault = typename sm90_fp8_config_default<
      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
@ -111,7 +130,18 @@ void run_cutlass_moe_mm_sm90(
  uint32_t const n = out_tensors.size(1);
  uint32_t const k = a_tensors.size(1);

-  if (n >= 8192) {
+  // Use swap_ab for M <= 64 by default to reduce padding
+  if (m <= 4) {
+    cutlass_group_gemm_caller<Cutlass3xGemmM4>(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
+  } else if (m <= 64) {
+    cutlass_group_gemm_caller<Cutlass3xGemmM64>(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
+  } else if (n >= 8192) {
    cutlass_group_gemm_caller<Cutlass3xGemmN8192>(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
@ -121,11 +151,6 @@ void run_cutlass_moe_mm_sm90(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
        per_out_ch);
-  } else if (m <= 16) {
-    cutlass_group_gemm_caller<Cutlass3xGemmM16>(
-        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
  } else {
    cutlass_group_gemm_caller<Cutlass3xGemmDefault>(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
--- a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh
+++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh
@ -22,14 +22,23 @@ using ArchTag = cutlass::arch::Sm90;
 using OperatorClass = cutlass::arch::OpClassTensorOp;

 using LayoutA = cutlass::layout::RowMajor;
+using LayoutA_Transpose =
+    typename cutlass::layout::LayoutTranspose<LayoutA>::type;
 using LayoutB = cutlass::layout::ColumnMajor;
-using LayoutC = cutlass::layout::RowMajor;
+using LayoutB_Transpose =
+    typename cutlass::layout::LayoutTranspose<LayoutB>::type;
+using LayoutD = cutlass::layout::RowMajor;
+using LayoutD_Transpose =
+    typename cutlass::layout::LayoutTranspose<LayoutD>::type;
+using LayoutC = LayoutD;
+using LayoutC_Transpose = LayoutD_Transpose;

 template <typename ElementAB_, typename ElementC_,
          template <typename, typename, typename> typename Epilogue_,
          typename TileShape, typename ClusterShape, typename KernelSchedule,
-          typename EpilogueSchedule>
+          typename EpilogueSchedule, bool swap_ab_ = false>
 struct cutlass_3x_group_gemm {
+  static constexpr bool swap_ab = swap_ab_;
  using ElementAB = ElementAB_;
  using ElementC = void;
  using ElementD = ElementC_;
@ -37,9 +46,6 @@ struct cutlass_3x_group_gemm {

  using Epilogue = Epilogue_<ElementAccumulator, ElementD, TileShape>;

-  using StrideC =
-      cute::remove_pointer_t<cute::Stride<int64_t, cute::Int<1>, cute::Int<0>>>;
-
  static constexpr int AlignmentAB =
      128 / cutlass::sizeof_bits<ElementAB>::value;
  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementD>::value;
@ -50,19 +56,26 @@ struct cutlass_3x_group_gemm {
      typename cutlass::epilogue::collective::CollectiveBuilder<
          ArchTag, OperatorClass, TileShape, ClusterShape,
          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
-          ElementAccumulator, ElementC, LayoutC*, AlignmentC, ElementD,
-          LayoutC*, AlignmentC, EpilogueSchedule, EVTCompute>::CollectiveOp;
+          ElementAccumulator, ElementC,
+          conditional_t<swap_ab, LayoutC_Transpose*, LayoutC*>, AlignmentC,
+          ElementD, conditional_t<swap_ab, LayoutD_Transpose*, LayoutD*>,
+          AlignmentC, EpilogueSchedule, EVTCompute>::CollectiveOp;

  static constexpr size_t CEStorageSize =
      sizeof(typename CollectiveEpilogue::SharedStorage);
  using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout<
      static_cast<int>(CEStorageSize)>;

-  using CollectiveMainloop =
+  using CollectiveMainloop = conditional_t<
+      swap_ab,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, ElementAB, LayoutB_Transpose*, AlignmentAB,
+          ElementAB, LayoutA_Transpose*, AlignmentAB, ElementAccumulator,
+          TileShape, ClusterShape, Stages, KernelSchedule>::CollectiveOp,
      typename cutlass::gemm::collective::CollectiveBuilder<
          ArchTag, OperatorClass, ElementAB, LayoutA*, AlignmentAB, ElementAB,
          LayoutB*, AlignmentAB, ElementAccumulator, TileShape, ClusterShape,
-          Stages, KernelSchedule>::CollectiveOp;
+          Stages, KernelSchedule>::CollectiveOp>;

  using KernelType = enable_sm90_only<cutlass::gemm::kernel::GemmUniversal<
      ProblemShape, CollectiveMainloop, CollectiveEpilogue>>;
@ -78,12 +91,12 @@ void cutlass_group_gemm_caller(
    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
    bool per_act_token, bool per_out_ch) {
+  static constexpr bool swap_ab = Gemm::swap_ab;
+
  using ElementAB = typename Gemm::ElementAB;
  using ElementD = typename Gemm::ElementD;

  int num_experts = static_cast<int>(expert_offsets.size(0));
-  int k_size = a_tensors.size(1);
-  int n_size = out_tensors.size(1);

  auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());

@ -110,19 +123,35 @@ void cutlass_group_gemm_caller(
          problem_sizes.data_ptr());
  ProblemShape prob_shape{num_experts, problem_sizes_as_shapes, nullptr};

-  typename GemmKernel::MainloopArguments mainloop_args{
-      static_cast<const ElementAB**>(a_ptrs.data_ptr()),
-      static_cast<StrideA*>(a_strides.data_ptr()),
-      static_cast<const ElementAB**>(b_ptrs.data_ptr()),
-      static_cast<StrideB*>(b_strides.data_ptr())};
+  typename GemmKernel::MainloopArguments mainloop_args;
+  if constexpr (swap_ab) {
+    mainloop_args = typename GemmKernel::MainloopArguments{
+        static_cast<const ElementAB**>(b_ptrs.data_ptr()),
+        static_cast<StrideB*>(b_strides.data_ptr()),
+        static_cast<const ElementAB**>(a_ptrs.data_ptr()),
+        static_cast<StrideA*>(a_strides.data_ptr())};
+  } else {
+    mainloop_args = typename GemmKernel::MainloopArguments{
+        static_cast<const ElementAB**>(a_ptrs.data_ptr()),
+        static_cast<StrideA*>(a_strides.data_ptr()),
+        static_cast<const ElementAB**>(b_ptrs.data_ptr()),
+        static_cast<StrideB*>(b_strides.data_ptr())};
+  }

  // Currently, we are only able to do broadcast on either all or none a_scales
  // and on either all or none b_scales
  typename GemmKernel::EpilogueArguments epilogue_args{
      Gemm::Epilogue::prepare_args(
-          static_cast<const ElementAccumulator**>(a_scales_ptrs.data_ptr()),
-          static_cast<const ElementAccumulator**>(b_scales_ptrs.data_ptr()),
-          per_act_token, per_out_ch),
+          swap_ab ? static_cast<const ElementAccumulator**>(
+                        b_scales_ptrs.data_ptr())
+                  : static_cast<const ElementAccumulator**>(
+                        a_scales_ptrs.data_ptr()),
+          swap_ab ? static_cast<const ElementAccumulator**>(
+                        a_scales_ptrs.data_ptr())
+                  : static_cast<const ElementAccumulator**>(
+                        b_scales_ptrs.data_ptr()),
+          swap_ab ? per_out_ch : per_act_token,
+          swap_ab ? per_act_token : per_out_ch),
      nullptr, static_cast<StrideC*>(c_strides.data_ptr()),
      static_cast<ElementD**>(out_ptrs.data_ptr()),
      static_cast<StrideC*>(c_strides.data_ptr())};
--- a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
@ -6,7 +6,10 @@
 #include <iostream>

 constexpr uint64_t THREADS_PER_EXPERT = 512;
+// threshold must match the dispatch logic in run_cutlass_moe_mm_sm90()
+constexpr int SWAP_AB_THRESHOLD = 64;

+template <bool SWAP_AB>
 __global__ void compute_problem_sizes(const int32_t* __restrict__ topk_ids,
                                      int32_t* problem_sizes1,
                                      int32_t* problem_sizes2,
@ -24,40 +27,53 @@ __global__ void compute_problem_sizes(const int32_t* __restrict__ topk_ids,

  if (threadIdx.x == 0) {
    int final_occurrences = atomic_buffer[expert_id];
-    problem_sizes1[expert_id * 3] = final_occurrences;
-    problem_sizes1[expert_id * 3 + 1] = 2 * n;
-    problem_sizes1[expert_id * 3 + 2] = k;
-    problem_sizes2[expert_id * 3] = final_occurrences;
-    problem_sizes2[expert_id * 3 + 1] = k;
-    problem_sizes2[expert_id * 3 + 2] = n;
+    if constexpr (!SWAP_AB) {
+      problem_sizes1[expert_id * 3] = final_occurrences;
+      problem_sizes1[expert_id * 3 + 1] = 2 * n;
+      problem_sizes1[expert_id * 3 + 2] = k;
+      problem_sizes2[expert_id * 3] = final_occurrences;
+      problem_sizes2[expert_id * 3 + 1] = k;
+      problem_sizes2[expert_id * 3 + 2] = n;
+    } else {
+      problem_sizes1[expert_id * 3] = 2 * n;
+      problem_sizes1[expert_id * 3 + 1] = final_occurrences;
+      problem_sizes1[expert_id * 3 + 2] = k;
+      problem_sizes2[expert_id * 3] = k;
+      problem_sizes2[expert_id * 3 + 1] = final_occurrences;
+      problem_sizes2[expert_id * 3 + 2] = n;
+    }
  }
 }

 __global__ void compute_expert_offsets(
    const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets,
-    int32_t* atomic_buffer, const int num_experts) {
+    int32_t* atomic_buffer, const int num_experts, const int topk_length) {
  int32_t tot_offset = 0;
  expert_offsets[0] = 0;
  for (int i = 0; i < num_experts; ++i) {
    atomic_buffer[i] = tot_offset;
-    tot_offset += problem_sizes1[i * 3];
+    tot_offset += topk_length > SWAP_AB_THRESHOLD ? problem_sizes1[i * 3]
+                                                  : problem_sizes1[i * 3 + 1];
    expert_offsets[i + 1] = tot_offset;
  }
 }

 __global__ void compute_expert_blockscale_offsets(
    const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets,
-    int32_t* blockscale_offsets, int32_t* atomic_buffer,
-    const int num_experts) {
+    int32_t* blockscale_offsets, int32_t* atomic_buffer, const int num_experts,
+    const int topk_length) {
  int32_t tot_offset = 0;
  int32_t tot_offset_round = 0;
  expert_offsets[0] = 0;
  blockscale_offsets[0] = 0;
  for (int i = 0; i < num_experts; ++i) {
+    int32_t cur_offset = topk_length > SWAP_AB_THRESHOLD
+                             ? problem_sizes1[i * 3]
+                             : problem_sizes1[i * 3 + 1];
    atomic_buffer[i] = tot_offset;
-    tot_offset += problem_sizes1[i * 3];
+    tot_offset += cur_offset;
    expert_offsets[i + 1] = tot_offset;
-    tot_offset_round += (problem_sizes1[i * 3] + (128 - 1)) / 128 * 128;
+    tot_offset_round += (cur_offset + (128 - 1)) / 128 * 128;
    blockscale_offsets[i + 1] = tot_offset_round;
  }
 }
@ -102,22 +118,36 @@ void get_cutlass_moe_mm_data_caller(
  torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32);

  int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
-  compute_problem_sizes<<<num_experts, num_threads, 0, stream>>>(
-      static_cast<const int32_t*>(topk_ids.data_ptr()),
-      static_cast<int32_t*>(problem_sizes1.data_ptr()),
-      static_cast<int32_t*>(problem_sizes2.data_ptr()),
-      static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n, k);
+
+  if (topk_ids.numel() > SWAP_AB_THRESHOLD) {
+    compute_problem_sizes<false><<<num_experts, num_threads, 0, stream>>>(
+        static_cast<const int32_t*>(topk_ids.data_ptr()),
+        static_cast<int32_t*>(problem_sizes1.data_ptr()),
+        static_cast<int32_t*>(problem_sizes2.data_ptr()),
+        static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n,
+        k);
+  } else {
+    compute_problem_sizes<true><<<num_experts, num_threads, 0, stream>>>(
+        static_cast<const int32_t*>(topk_ids.data_ptr()),
+        static_cast<int32_t*>(problem_sizes1.data_ptr()),
+        static_cast<int32_t*>(problem_sizes2.data_ptr()),
+        static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n,
+        k);
+  }
+
  if (blockscale_offsets.has_value()) {
    compute_expert_blockscale_offsets<<<1, 1, 0, stream>>>(
        static_cast<const int32_t*>(problem_sizes1.data_ptr()),
        static_cast<int32_t*>(expert_offsets.data_ptr()),
        static_cast<int32_t*>(blockscale_offsets.value().data_ptr()),
-        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
+        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts,
+        topk_ids.numel());
  } else {
    compute_expert_offsets<<<1, 1, 0, stream>>>(
        static_cast<const int32_t*>(problem_sizes1.data_ptr()),
        static_cast<int32_t*>(expert_offsets.data_ptr()),
-        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
+        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts,
+        topk_ids.numel());
  }
  compute_arg_sorts<<<num_experts, num_threads, 0, stream>>>(
      static_cast<const int32_t*>(topk_ids.data_ptr()),
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -20,13 +20,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // vLLM custom ops
  //

-  // The default behavior in PyTorch 2.6 is "requires_contiguous", so we need
+  // The default behavior in PyTorch 2.6 was changed to "requires_contiguous",
+  // so we need
  // to override this for many GEMMs with the following tag. Otherwise,
  // torch.compile will force all input tensors to be contiguous(), which
  // will break many custom ops that require column-major weight matrices.
-  // TODO: remove this for PyTorch 2.8, when the default is planned to switch
-  // to match exact eager-mode strides.
-  at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
+  // This was a bug and PyTorch 2.7 has since fixed this.
+#if TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 6
+  #define stride_tag at::Tag::needs_fixed_stride_order
+#else
+  #define stride_tag
+#endif

  ops.def("weak_ref_tensor(Tensor input) -> Tensor");
  ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);
@ -514,6 +518,22 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "                   Tensor page_table, float scale) -> ()");
  ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode);

+  // SM100 CUTLASS MLA decode
+  ops.def(
+      "sm100_cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe,"
+      "                         Tensor kv_c_and_k_pe_cache, Tensor seq_lens,"
+      "                         Tensor page_table, Tensor workspace, float "
+      "scale,"
+      "                         int num_kv_splits) -> ()");
+  // conditionally compiled so impl in source file
+
+  // SM100 CUTLASS MLA workspace
+  ops.def(
+      "sm100_cutlass_mla_get_workspace_size(int max_seq_len, int num_batches,"
+      "                                     int sm_count, int num_kv_splits) "
+      "-> int");
+  // conditionally compiled so impl in source file
+
  // Compute NVFP4 block quantized tensor.
  ops.def(
      "scaled_fp4_quant(Tensor! output, Tensor input,"
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -63,7 +63,7 @@ ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly
 ARG PIP_KEYRING_PROVIDER=disabled
 ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}

-# Flag enables build-in KV-connector dependency libs into docker images
+# Flag enables built-in KV-connector dependency libs into docker images
 ARG INSTALL_KV_CONNECTORS=false

 #################### BASE BUILD IMAGE ####################
@ -207,6 +207,19 @@ ARG SCCACHE_ENDPOINT
 ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
 ARG SCCACHE_REGION_NAME=us-west-2
 ARG SCCACHE_S3_NO_CREDENTIALS=0
+
+# Flag to control whether to use pre-built vLLM wheels
+ARG VLLM_USE_PRECOMPILED
+# TODO: setup.py checks VLLM_USE_PRECOMPILED for truthiness, so it will take =0 as "true"; this should be fixed
+ENV VLLM_USE_PRECOMPILED=""
+RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \
+        export VLLM_USE_PRECOMPILED=1 && \
+        echo "Using precompiled wheels"; \
+    else \
+        unset VLLM_USE_PRECOMPILED && \
+        echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \
+    fi
+
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
@ -375,48 +388,33 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # -rw-rw-r-- 1 mgoin mgoin 205M Jun  9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
 # $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl

-# Allow specifying a version, Git revision or local .whl file
-ARG FLASHINFER_CUDA128_INDEX_URL="https://download.pytorch.org/whl/cu128/flashinfer"
-ARG FLASHINFER_CUDA128_WHEEL="flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl"
+# Install FlashInfer from source
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 ARG FLASHINFER_GIT_REF="v0.2.8rc1"
-# Flag to control whether to use pre-built FlashInfer wheels (set to false to force build from source)
-# TODO: Currently disabled because the pre-built wheels are not available for FLASHINFER_GIT_REF
-ARG USE_FLASHINFER_PREBUILT_WHEEL=false
 RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
  . /etc/environment
-  if [ "$TARGETPLATFORM" != "linux/arm64" ]; then
-      # FlashInfer already has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
-      if [[ "$CUDA_VERSION" == 12.8* ]] && [[ "$USE_FLASHINFER_PREBUILT_WHEEL" == "true" ]]; then
-          uv pip install --system ${FLASHINFER_CUDA128_INDEX_URL}/${FLASHINFER_CUDA128_WHEEL}
-      else
-          # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
-          # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
-          if [[ "${CUDA_VERSION}" == 11.* ]]; then
-              FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
-          elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
-              FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
-          else
-              # CUDA 12.8+ supports 10.0a and 12.0
-              FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
-          fi
-          echo "🏗️  Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
-
-          git clone --depth 1 --recursive --shallow-submodules \
-            --branch ${FLASHINFER_GIT_REF} \
-            ${FLASHINFER_GIT_REPO} flashinfer
-
-          # Needed to build AOT kernels
-          pushd flashinfer
-            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-              python3 -m flashinfer.aot
-            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-              uv pip install --system --no-build-isolation .
-          popd
-
-          rm -rf flashinfer
-      fi \
-  fi
+    git clone --depth 1 --recursive --shallow-submodules \
+        --branch ${FLASHINFER_GIT_REF} \
+        ${FLASHINFER_GIT_REPO} flashinfer
+    # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
+    # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
+    if [[ "${CUDA_VERSION}" == 11.* ]]; then
+        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
+    elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
+        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
+    else
+        # CUDA 12.8+ supports 10.0a and 12.0
+        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
+    fi
+    echo "🏗️  Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
+    # Needed to build AOT kernels
+    pushd flashinfer
+        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
+            python3 -m flashinfer.aot
+        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
+            uv pip install --system --no-build-isolation .
+    popd
+    rm -rf flashinfer
 BASH
 COPY examples examples
 COPY benchmarks benchmarks
@ -508,10 +506,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \
        uv pip install --system -r requirements/kv_connectors.txt; \
    fi; \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
+        BITSANDBYTES_VERSION="0.42.0"; \
    else \
-        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.46.1' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
-    fi
+        BITSANDBYTES_VERSION="0.46.1"; \
+    fi; \
+    uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]

 ENV VLLM_USAGE_SOURCE production-docker-image

--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@ -95,7 +95,7 @@ WORKDIR /workspace/vllm
 RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
    cp requirements/test.in requirements/cpu-test.in && \
    sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
-    sed -i 's/torch==.*/torch==2.6.0/g' requirements/cpu-test.in && \
+    sed -i 's/^torch==.*/torch==2.6.0/g' requirements/cpu-test.in && \
    sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
    sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
    uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
--- a/docker/Dockerfile.hpu
+++ b/docker/Dockerfile.hpu
@ -1,21 +0,0 @@
-FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
-
-COPY ./ /workspace/vllm
-
-WORKDIR /workspace/vllm
-
-RUN pip install -v -r requirements/hpu.txt
-
-ENV no_proxy=localhost,127.0.0.1
-ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
-
-RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
-
-# install development dependencies (for testing)
-RUN python3 -m pip install -e tests/vllm_test_utils
-
-WORKDIR /workspace/
-
-RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
-
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/docker/Dockerfile.rocm_base
+++ b/docker/Dockerfile.rocm_base
@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
 ARG FA_BRANCH="1a7f4dfa"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="6487649"
+ARG AITER_BRANCH="916bf3c"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"

 FROM ${BASE_IMAGE} AS base
--- a/docker/Dockerfile.tpu
+++ b/docker/Dockerfile.tpu
@ -1,5 +1,5 @@
-ARG NIGHTLY_DATE="20250124"
-ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
+ARG NIGHTLY_DATE="20250714"
+ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE"

 FROM $BASE_IMAGE
 WORKDIR /workspace/vllm
--- a/docs/api/README.md
+++ b/docs/api/README.md
@ -8,7 +8,6 @@ API documentation for vLLM's configuration classes.

 - [vllm.config.ModelConfig][]
 - [vllm.config.CacheConfig][]
- [vllm.config.TokenizerPoolConfig][]
 - [vllm.config.LoadConfig][]
 - [vllm.config.ParallelConfig][]
 - [vllm.config.SchedulerConfig][]
--- a/docs/assets/deployment/open_webui.png
+++ b/docs/assets/deployment/open_webui.png
--- a/docs/cli/README.md
+++ b/docs/cli/README.md
@ -1,3 +1,7 @@
+---
+toc_depth: 4
+---
+
 # vLLM CLI Guide

 The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with:
@ -37,8 +41,15 @@ Start the vLLM OpenAI Compatible API server.

    # To search by keyword
    vllm serve --help=max
+
+    # To view full help with pager (less/more)
+    vllm serve --help=page
    ```

+### Options
+
+--8<-- "docs/argparse/serve.md"
+
 ## chat

 Generate chat completions via the running API server.
--- a/docs/configuration/serve_args.md
+++ b/docs/configuration/serve_args.md
@ -5,7 +5,7 @@ The `vllm serve` command is used to launch the OpenAI-compatible server.
 ## CLI Arguments

 The `vllm serve` command is used to launch the OpenAI-compatible server.
-To see the available CLI arguments, run `vllm serve --help`!
+To see the available options, take a look at the [CLI Reference](../cli/README.md#options)!

 ## Configuration file

--- a/docs/deployment/frameworks/anyscale.md
+++ b/docs/deployment/frameworks/anyscale.md
@ -3,6 +3,15 @@
 [](){ #deployment-anyscale }

 [Anyscale](https://www.anyscale.com) is a managed, multi-cloud platform developed by the creators of Ray.
-It hosts Ray clusters inside your own AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
-without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, or managing observability stacks.
+
+Anyscale automates the entire lifecycle of Ray clusters in your AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
+without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like <gh-file:examples/online_serving/run_cluster.sh>.
+
 When serving large language models with vLLM, Anyscale can rapidly provision [production-ready HTTPS endpoints](https://docs.anyscale.com/examples/deploy-ray-serve-llms) or [fault-tolerant batch inference jobs](https://docs.anyscale.com/examples/ray-data-llm).
+
+## Production-ready vLLM on Anyscale quickstarts
+
+- [Offline batch inference](https://console.anyscale.com/template-preview/llm_batch_inference?utm_source=vllm_docs)
+- [Deploy vLLM services](https://console.anyscale.com/template-preview/llm_serving?utm_source=vllm_docs)
+- [Curate a dataset](https://console.anyscale.com/template-preview/audio-dataset-curation-llm-judge?utm_source=vllm_docs)
+- [Finetune an LLM](https://console.anyscale.com/template-preview/entity-recognition-with-llms?utm_source=vllm_docs)
--- a/docs/deployment/frameworks/open-webui.md
+++ b/docs/deployment/frameworks/open-webui.md
@ -1,26 +1,42 @@
 # Open WebUI

-1. Install the [Docker](https://docs.docker.com/engine/install/)
+[Open WebUI](https://github.com/open-webui/open-webui) is an extensible, feature-rich,
+and user-friendly self-hosted AI platform designed to operate entirely offline.
+It supports various LLM runners like Ollama and OpenAI-compatible APIs,
+with built-in RAG capabilities, making it a powerful AI deployment solution.

-2. Start the vLLM server with the supported chat completion model, e.g.
+To get started with Open WebUI using vLLM, follow these steps:

-```bash
-vllm serve qwen/Qwen1.5-0.5B-Chat
-```
+1. Install [Docker](https://docs.docker.com/engine/install/).

-1. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port):
+2. Start the vLLM server with a supported chat completion model:

-```bash
-docker run -d -p 3000:8080 \
--name open-webui \
-v open-webui:/app/backend/data \
-e OPENAI_API_BASE_URL=http://<vllm serve host>:<vllm serve port>/v1 \
--restart always \
-ghcr.io/open-webui/open-webui:main
-```
+    ```console
+    vllm serve Qwen/Qwen3-0.6B-Chat
+    ```

-1. Open it in the browser: <http://open-webui-host:3000/>
+    !!! note
+        When starting the vLLM server, be sure to specify the host and port using the `--host` and `--port` flags.
+        For example:

-On the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`.
+        ```console
+        python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000
+        ```

-![](../../assets/deployment/open_webui.png)
+3. Start the Open WebUI Docker container:
+
+    ```console
+    docker run -d \
+        --name open-webui \
+        -p 3000:8080 \
+        -v open-webui:/app/backend/data \
+        -e OPENAI_API_BASE_URL=http://0.0.0.0:8000/v1 \
+        --restart always \
+        ghcr.io/open-webui/open-webui:main
+    ```
+
+4. Open it in the browser: <http://open-webui-host:3000/>
+
+    At the top of the page, you should see the model `Qwen/Qwen3-0.6B-Chat`.
+
+    ![Web portal of model Qwen/Qwen3-0.6B-Chat](../../assets/deployment/open_webui.png)
--- a/docs/deployment/integrations/kuberay.md
+++ b/docs/deployment/integrations/kuberay.md
@ -0,0 +1,20 @@
+# KubeRay
+
+[KubeRay](https://github.com/ray-project/kuberay) provides a Kubernetes-native way to run vLLM workloads on Ray clusters.
+A Ray cluster can be declared in YAML, and the operator then handles pod scheduling, networking configuration, restarts, and blue-green deployments — all while preserving the familiar Kubernetes experience.
+
+## Why KubeRay instead of manual scripts?
+
+| Feature | Manual scripts | KubeRay |
+|---------|-----------------------------------------------------------|---------|
+| Cluster bootstrap | Manually SSH into every node and run a script | One command to create or update the whole cluster: `kubectl apply -f cluster.yaml` |
+| Autoscaling | Manual | Automatically patches CRDs for adjusting cluster size |
+| Upgrades | Tear down & re-create manually | Blue/green deployment updates supported |
+| Declarative config | Bash flags & environment variables | Git-ops-friendly YAML CRDs (RayCluster/RayService) |
+
+Using KubeRay reduces the operational burden and simplifies integration of Ray + vLLM with existing Kubernetes workflows (CI/CD, secrets, storage classes, etc.).
+
+## Learn more
+
+* ["Serve a Large Language Model using Ray Serve LLM on Kubernetes"](https://docs.ray.io/en/master/cluster/kubernetes/examples/rayserve-llm-example.html) - An end-to-end example of how to serve a model using vLLM, KubeRay, and Ray Serve.
+* [KubeRay documentation](https://docs.ray.io/en/latest/cluster/kubernetes/index.html)
--- a/docs/deployment/k8s.md
+++ b/docs/deployment/k8s.md
@ -13,6 +13,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
 - [Helm](frameworks/helm.md)
 - [InftyAI/llmaz](integrations/llmaz.md)
 - [KServe](integrations/kserve.md)
+- [KubeRay](integrations/kuberay.md)
 - [kubernetes-sigs/lws](frameworks/lws.md)
 - [meta-llama/llama-stack](integrations/llamastack.md)
 - [substratusai/kubeai](integrations/kubeai.md)
--- a/docs/design/v1/metrics.md
+++ b/docs/design/v1/metrics.md
@ -61,7 +61,7 @@ These are documented under [Inferencing and Serving -> Production Metrics](../..

 ### Grafana Dashboard

-vLLM also provides [a reference example](https://docs.vllm.ai/en/latest/examples/prometheus_grafana.html) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.
+vLLM also provides [a reference example](https://docs.vllm.ai/en/stable/examples/online_serving/prometheus_grafana.html) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.

 The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important:

--- a/docs/design/v1/p2p_nccl_connector.md
+++ b/docs/design/v1/p2p_nccl_connector.md
@ -31,7 +31,7 @@ Each P/D instance periodically sends a heartbeat packet to the Proxy/Router (cur

 ## KV Cache Transfer Methods

-There are three methods for KVcache transfer: PUT, GET, and PUT_ASYNC. These methods can be specified using the `--kv-transfer-config` and `kv_connector_extra_config` parameters, specifically through the `send_type` field. Both PUT and PUT_ASYNC involve the P instance actively sending KVcache to the D instance. The difference is that PUT is a synchronous transfer method that blocks the main process, while PUT_ASYNC is an asynchronous transfer method. PUT_ASYNC uses a dedicated thread for sending KVcache, which means it does not block the main process. In contrast, the GET method involves the P instance saving the KVcache to the memory buffer after computing the prefill. The D instance then actively retrieves the computed KVcache from the P instance once it has allocated space for the KVcache.
+There are three methods for KVCache transfer: PUT, GET, and PUT_ASYNC. These methods can be specified using the `--kv-transfer-config` and `kv_connector_extra_config` parameters, specifically through the `send_type` field. Both PUT and PUT_ASYNC involve the P instance actively sending KVCache to the D instance. The difference is that PUT is a synchronous transfer method that blocks the main process, while PUT_ASYNC is an asynchronous transfer method. PUT_ASYNC uses a dedicated thread for sending KVCache, which means it does not block the main process. In contrast, the GET method involves the P instance saving the KVCache to the memory buffer after computing the prefill. The D instance then actively retrieves the computed KVCache from the P instance once it has allocated space for the KVCache.

 Experimental results have shown that the performance of these methods, from highest to lowest, is as follows: PUT_ASYNC → GET → PUT.

@ -39,13 +39,13 @@ Experimental results have shown that the performance of these methods, from high

 As long as the address of the counterpart is known, point-to-point KV cache transfer (using NCCL) can be performed, without being constrained by rank and world size. This supports dynamic scaling (expansion and contraction) of instances with PD disaggregation, which means that adding or removing P/D instances does not require a full system restart.

-Each P/D instance only needs to create a single `P2pNcclEngine` instance. This instance maintains a ZMQ Server, which runs a dedicated thread to listen on the `zmq_addr` address and receive control flow requests from other instances. These requests include requests to establish an NCCL connection and requests to send KVcache metadata (such as tensor shapes and data types). However, it does not actually transmit the KVcache data itself.
+Each P/D instance only needs to create a single `P2pNcclEngine` instance. This instance maintains a ZMQ Server, which runs a dedicated thread to listen on the `zmq_addr` address and receive control flow requests from other instances. These requests include requests to establish an NCCL connection and requests to send KVCache metadata (such as tensor shapes and data types). However, it does not actually transmit the KVCache data itself.

-When a P instance and a D instance transmit KVcache for the first time, they need to establish a ZMQ connection and an NCCL group. For subsequent KVcache transmissions, this ZMQ connection and NCCL group are reused. The NCCL group consists of only two ranks, meaning the world size is equal to 2. This design is intended to support dynamic scaling, which means that adding or removing P/D instances does not require a full system restart. As long as the address of the counterpart is known, point-to-point KVcache transmission can be performed, without being restricted by rank or world size.
+When a P instance and a D instance transmit KVCache for the first time, they need to establish a ZMQ connection and an NCCL group. For subsequent KVCache transmissions, this ZMQ connection and NCCL group are reused. The NCCL group consists of only two ranks, meaning the world size is equal to 2. This design is intended to support dynamic scaling, which means that adding or removing P/D instances does not require a full system restart. As long as the address of the counterpart is known, point-to-point KVCache transmission can be performed, without being restricted by rank or world size.

 ## NCCL Group Topology

-Currently, only symmetric TP (Tensor Parallelism) methods are supported for KVcache transmission. Asymmetric TP and PP (Pipeline Parallelism) methods will be supported in the future. Figure 2 illustrates the 1P2D setup, where each instance has a TP (Tensor Parallelism) degree of 2. There are a total of 7 NCCL groups: three vLLM instances each have one NCCL group with TP=2. Additionally, the 0th GPU card of the P instance establishes an NCCL group with the 0th GPU card of each D instance. Similarly, the 1st GPU card of the P instance establishes an NCCL group with the 1st GPU card of each D instance.
+Currently, only symmetric TP (Tensor Parallelism) methods are supported for KVCache transmission. Asymmetric TP and PP (Pipeline Parallelism) methods will be supported in the future. Figure 2 illustrates the 1P2D setup, where each instance has a TP (Tensor Parallelism) degree of 2. There are a total of 7 NCCL groups: three vLLM instances each have one NCCL group with TP=2. Additionally, the 0th GPU card of the P instance establishes an NCCL group with the 0th GPU card of each D instance. Similarly, the 1st GPU card of the P instance establishes an NCCL group with the 1st GPU card of each D instance.

 ![image2](https://github.com/user-attachments/assets/837e61d6-365e-4cbf-8640-6dd7ab295b36)

@ -53,33 +53,17 @@ Each NCCL group occupies a certain amount of GPU memory buffer for communication

 ## GPU Memory Buffer and Tensor Memory Pool

-The trade-off in the size of the memory buffer is as follows: For P instances, the memory buffer is not required in PUT and PUT_ASYNC modes, but it is necessary in GET mode. For D instances, a memory buffer is needed in all three modes. The memory buffer for D instances should not be too large. Similarly, for P instances in GET mode, the memory buffer should also not be too large. The memory buffer of D instances is used to temporarily store KVcache sent by P instances. If it is too large, it will reduce the KVcache space available for normal inference by D instances, thereby decreasing the inference batch size and ultimately leading to a reduction in output throughput. The size of the memory buffer is configured by the parameter `kv_buffer_size`, measured in bytes, and is typically set to 5%～10% of the memory size.
+The trade-off in the size of the memory buffer is as follows: For P instances, the memory buffer is not required in PUT and PUT_ASYNC modes, but it is necessary in GET mode. For D instances, a memory buffer is needed in all three modes. The memory buffer for D instances should not be too large. Similarly, for P instances in GET mode, the memory buffer should also not be too large. The memory buffer of D instances is used to temporarily store KVCache sent by P instances. If it is too large, it will reduce the KVCache space available for normal inference by D instances, thereby decreasing the inference batch size and ultimately leading to a reduction in output throughput. The size of the memory buffer is configured by the parameter `kv_buffer_size`, measured in bytes, and is typically set to 5%–10% of the memory size.

-If the `--max-num-seqs` parameter for P instances is set to a large value, due to the large batch size, P instances will generate a large amount of KVcache simultaneously. This may exceed the capacity of the memory buffer of D instances, resulting in KVcache loss. Once KVcache is lost, D instances need to recompute Prefill, which is equivalent to performing Prefill twice. Consequently, the time-to-first-token (TTFT) will significantly increase, leading to degraded performance.
+If the `--max-num-seqs` parameter for P instances is set to a large value, due to the large batch size, P instances will generate a large amount of KVCache simultaneously. This may exceed the capacity of the memory buffer of D instances, resulting in KVCache loss. Once KVCache is lost, D instances need to recompute Prefill, which is equivalent to performing Prefill twice. Consequently, the time-to-first-token (TTFT) will significantly increase, leading to degraded performance.

-To address the above issues, I have designed and developed a local Tensor memory pool for storing KVcache, inspired by the buddy system used in Linux memory modules. Since the memory is sufficiently large, typically in the TB range on servers, there is no need to consider prefix caching or using block-based designs to reuse memory, thereby saving space. When the memory buffer is insufficient, KVcache can be directly stored in the Tensor memory pool, and D instances can subsequently retrieve KVcache from it. The read and write speed is that of PCIe, with PCIe 4.0 having a speed of approximately 21 GB/s, which is usually faster than the Prefill speed. Otherwise, solutions like Mooncake and lmcache would not be necessary. The Tensor memory pool acts as a flood diversion area, typically unused except during sudden traffic surges. In the worst-case scenario, my solution performs no worse than the normal situation with a Cache store.
+To address the above issues, I have designed and developed a local Tensor memory pool for storing KVCache, inspired by the buddy system used in Linux memory modules. Since the memory is sufficiently large, typically in the TB range on servers, there is no need to consider prefix caching or using block-based designs to reuse memory, thereby saving space. When the memory buffer is insufficient, KVCache can be directly stored in the Tensor memory pool, and D instances can subsequently retrieve KVCache from it. The read and write speed is that of PCIe, with PCIe 4.0 having a speed of approximately 21 GB/s, which is usually faster than the Prefill speed. Otherwise, solutions like Mooncake and lmcache would not be necessary. The Tensor memory pool acts as a flood diversion area, typically unused except during sudden traffic surges. In the worst-case scenario, my solution performs no worse than the normal situation with a Cache store.

 # Install vLLM

-??? console "Commands"
-
-    ```shell
-    # Enter the home directory or your working directory.
-    cd /home
-
-    # Download the installation package, and I will update the commit-id in time. You can directly copy the command.
-    wget https://vllm-wheels.s3.us-west-2.amazonaws.com/9112b443a042d8d815880b8780633882ad32b183/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
-
-    # Download the code repository.
-    git clone -b xpyd-v1 https://github.com/Abatom/vllm.git
-    cd vllm
-
-    # Set the installation package path.
-    export VLLM_PRECOMPILED_WHEEL_LOCATION=/home/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
-
-    # installation
-    pip install -e . -v
-    ```
+```shell
+pip install "vllm>=0.9.2"
+```

 # Run xPyD

@ -90,7 +74,7 @@ To address the above issues, I have designed and developed a local Tensor memory
 - You may need to modify the `kv_buffer_size` and `port` in the following commands (if there is a conflict).
 - `PUT_ASYNC` offers the best performance and should be prioritized.
 - The `--port` must be consistent with the `http_port` in the `--kv-transfer-config`.
- The `disagg_prefill_proxy_xpyd.py` script will use port 10001 (for receiving client requests) and port 30001 (for receiving service discovery from P and D instances).
+- The `disagg_proxy_p2p_nccl_xpyd.py` script will use port 10001 (for receiving client requests) and port 30001 (for receiving service discovery from P and D instances).
 - The node running the proxy must have `quart` installed.
 - Supports multiple nodes; you just need to modify the `proxy_ip` and `proxy_port` in `--kv-transfer-config`.
 - In the following examples, it is assumed that **the proxy's IP is 10.0.1.1**.
@ -100,8 +84,8 @@ To address the above issues, I have designed and developed a local Tensor memory
 ### Proxy (e.g. 10.0.1.1)

 ```shell
-cd {your vllm directory}/examples/online_serving/disagg_xpyd/
-python3 disagg_prefill_proxy_xpyd.py &
+cd {your vllm directory}/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/
+python3 disagg_proxy_p2p_nccl_xpyd.py &
 ```

 ### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)
@ -111,7 +95,7 @@ python3 disagg_prefill_proxy_xpyd.py &
    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
        --host 0.0.0.0 \
-        --port 20005 \
+        --port 20001 \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --served-model-name base_model \
@ -123,7 +107,7 @@ python3 disagg_prefill_proxy_xpyd.py &
        --gpu-memory-utilization 0.9 \
        --disable-log-request \
        --kv-transfer-config \
-        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
+        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20001"}}' > /var/vllm.log 2>&1 &
    ```

 ### Decode1 (e.g. 10.0.1.3 or 10.0.1.1)
@ -133,7 +117,7 @@ python3 disagg_prefill_proxy_xpyd.py &
    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
        --host 0.0.0.0 \
-        --port 20009 \
+        --port 20002 \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --served-model-name base_model \
@ -145,7 +129,7 @@ python3 disagg_prefill_proxy_xpyd.py &
        --gpu-memory-utilization 0.7 \
        --disable-log-request \
        --kv-transfer-config \
-        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
+        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20002"}}' > /var/vllm.log 2>&1 &
    ```

 ### Decode2 (e.g. 10.0.1.4 or 10.0.1.1)
@ -167,7 +151,7 @@ python3 disagg_prefill_proxy_xpyd.py &
        --gpu-memory-utilization 0.7 \
        --disable-log-request \
        --kv-transfer-config \
-        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
+        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003"}}' > /var/vllm.log 2>&1 &
    ```

 ### Decode3 (e.g. 10.0.1.5 or 10.0.1.1)
@ -177,7 +161,7 @@ python3 disagg_prefill_proxy_xpyd.py &
    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
        --host 0.0.0.0 \
-        --port 20008 \
+        --port 20004 \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --served-model-name base_model \
@ -189,7 +173,7 @@ python3 disagg_prefill_proxy_xpyd.py &
        --gpu-memory-utilization 0.7 \
        --disable-log-request \
        --kv-transfer-config \
-        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
+        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20004"}}' > /var/vllm.log 2>&1 &
    ```

 ## Run 3P1D
@ -197,8 +181,8 @@ python3 disagg_prefill_proxy_xpyd.py &
 ### Proxy (e.g. 10.0.1.1)

 ```shell
-cd {your vllm directory}/examples/online_serving/disagg_xpyd/
-python3 disagg_prefill_proxy_xpyd.py &
+cd {your vllm directory}/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/
+python3 disagg_proxy_p2p_nccl_xpyd.py &
 ```

 ### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)
@ -208,7 +192,7 @@ python3 disagg_prefill_proxy_xpyd.py &
    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
        --host 0.0.0.0 \
-        --port 20005 \
+        --port 20001 \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --served-model-name base_model \
@ -220,7 +204,7 @@ python3 disagg_prefill_proxy_xpyd.py &
        --gpu-memory-utilization 0.9 \
        --disable-log-request \
        --kv-transfer-config \
-        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
+        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20001"}}' > /var/vllm.log 2>&1 &
    ```

 ### Prefill2 (e.g. 10.0.1.3 or 10.0.1.1)
@ -230,7 +214,7 @@ python3 disagg_prefill_proxy_xpyd.py &
    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
        --host 0.0.0.0 \
-        --port 20009 \
+        --port 20002 \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --served-model-name base_model \
@ -242,7 +226,7 @@ python3 disagg_prefill_proxy_xpyd.py &
        --gpu-memory-utilization 0.9 \
        --disable-log-request \
        --kv-transfer-config \
-        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
+        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20002"}}' > /var/vllm.log 2>&1 &
    ```

 ### Prefill3 (e.g. 10.0.1.4 or 10.0.1.1)
@ -264,7 +248,7 @@ python3 disagg_prefill_proxy_xpyd.py &
        --gpu-memory-utilization 0.9 \
        --disable-log-request \
        --kv-transfer-config \
-        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
+        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003"}}' > /var/vllm.log 2>&1 &
    ```

 ### Decode1 (e.g. 10.0.1.5 or 10.0.1.1)
@ -274,7 +258,7 @@ python3 disagg_prefill_proxy_xpyd.py &
    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
        --host 0.0.0.0 \
-        --port 20008 \
+        --port 20004 \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --served-model-name base_model \
@ -286,7 +270,7 @@ python3 disagg_prefill_proxy_xpyd.py &
        --gpu-memory-utilization 0.7 \
        --disable-log-request \
        --kv-transfer-config \
-        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
+        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20004"}}' > /var/vllm.log 2>&1 &
    ```

 # Single request
@ -334,24 +318,6 @@ pgrep python | xargs kill -9 && pkill -f python

 # Test data

-## **Scenario 1**: 1K input & 1K output tokens, E2E P99 latency ~20s
- **1P5D (6×A800) vs vLLM (1×A800)**:
-  - Throughput ↑7.2% (1085 → 6979/6)
-  - ITL (P99) ↓81.3% (120ms → 22.9ms)
-  - TTFT (P99) ↑26.8% (175ms → 222ms)
-  - TPOT: No change
+## **Scenario**: 1K input & 200 output tokens, E2E P99 latency ~2s

- **1P6D (7×A800) vs vLLM (1×A800)**:
-  - Throughput ↑9.6% (1085 → 8329/7)
-  - ITL (P99) ↓81.0% (120ms → 22.7ms)
-  - TTFT (P99) ↑210% (175ms →543ms)
-  - TPOT: No change
-
-## **Scenario 2**: 1K input & 200 output tokens, E2E P99 latency ~4s
- **1P1D (2×A800) vs vLLM (1×A800)**:
-  - Throughput ↑37.4% (537 → 1476/2)
-  - ITL (P99) ↓81.8% (127ms → 23.1ms)
-  - TTFT (P99) ↑41.8% (160ms → 227ms)
-  - TPOT: No change
-
-![testdata](https://github.com/user-attachments/assets/f791bfc7-9f3d-4e5c-9171-a42f9f4da627)
+![testdata](https://github.com/user-attachments/assets/cef0953b-4567-4bf9-b940-405b92a28eb1)
--- a/docs/features/quantization/README.md
+++ b/docs/features/quantization/README.md
@ -10,6 +10,7 @@ Contents:
 - [BitBLAS](bitblas.md)
 - [GGUF](gguf.md)
 - [GPTQModel](gptqmodel.md)
+- [INC](inc.md)
 - [INT4 W4A16](int4.md)
 - [INT8 W8A8](int8.md)
 - [FP8 W8A8](fp8.md)
--- a/docs/features/quantization/inc.md
+++ b/docs/features/quantization/inc.md
@ -0,0 +1,56 @@
+---
+title: FP8 INC
+---
+[](){ #inc }
+
+vLLM supports FP8 (8-bit floating point) weight and activation quantization using Intel® Neural Compressor (INC) on Intel® Gaudi® 2 and Intel® Gaudi® 3 AI accelerators.
+Currently, quantization has been validated only on Llama models.
+
+Intel Gaudi supports quantization of various modules and functions, including, but not limited to `Linear`, `KVCache`, `Matmul` and `Softmax`. For more information, please refer to:
+[Supported Modules\\Supported Functions\\Custom Patched Modules](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html#supported-modules).
+
+!!! note
+    Measurement files are required to run quantized models with vLLM on Gaudi accelerators. The FP8 model calibration procedure is described in the [vllm-hpu-extension](https://github.com/HabanaAI/vllm-hpu-extension/tree/main/calibration/README.md) package.
+
+!!! note
+    `QUANT_CONFIG` is an environment variable that points to the measurement or quantization [JSON config file](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html#supported-json-config-file-options).
+    The measurement configuration file is used during the calibration procedure to collect measurements for a given model. The quantization configuration is used during inference.
+
+## Run Online Inference Using FP8
+
+Once you've completed the model calibration process and collected the measurements, you can run FP8 inference with vLLM using the following command:
+
+```bash
+export QUANT_CONFIG=/path/to/quant/config/inc/meta-llama-3.1-405b-instruct/maxabs_measure_g3.json
+vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --tensor-parallel-size 8
+```
+
+!!! tip
+    If you are just prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which can take a long time. However, we do not recommend disabling this feature in production environments as it causes a significant performance drop.
+
+!!! tip
+    When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this problem, you can use the below environment variables:
+    `VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes.
+    `VLLM_RPC_TIMEOUT` - to adjust the RPC protocol timeout used by the OpenAI-compatible API. This value is in milliseconds, e.g., 600000 equals 10 minutes.
+
+## Run Offline Inference Using FP8
+
+To run offline inference (after completing the model calibration process):
+
+* Set the "QUANT_CONFIG" environment variable to point to a JSON configuration file with QUANTIZE mode.
+* Pass `quantization=inc` and `kv_cache_dtype=fp8_inc` as parameters to the `LLM` object.
+* Call the `shutdown` method of the `model_executor` at the end of the run.
+
+```python
+from vllm import LLM
+llm = LLM("llama3.1/Meta-Llama-3.1-8B-Instruct", quantization="inc", kv_cache_dtype="fp8_inc")
+...
+# Call llm.generate on the required prompts and sampling params.
+...
+llm.llm_engine.model_executor.shutdown()
+```
+
+## Device for the Model's Weights Uploading
+
+The unquantized weights are first loaded onto the CPU, then quantized and transferred to the target device (HPU) for model execution.
+This reduces the device memory footprint of model weights, as only quantized weights are stored in the device memory.
--- a/docs/features/quantization/supported_hardware.md
+++ b/docs/features/quantization/supported_hardware.md
@ -2,18 +2,19 @@

 The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:

-| Implementation        | Volta   | Turing   | Ampere   | Ada   | Hopper   | AMD GPU   | Intel GPU   | x86 CPU   | AWS Neuron   | Google TPU   |
-|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-----------|------------------|--------------|
-| AWQ                   | ❌       | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ✅︎        | ❌                | ❌            |
-| GPTQ                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ✅︎        | ❌                | ❌            |
-| Marlin (GPTQ/AWQ/FP8) | ❌       | ❌        | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
-| INT8 (W8A8)           | ❌       | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ✅︎        | ✅︎                | ✅︎           |
-| FP8 (W8A8)            | ❌       | ❌        | ❌        | ✅︎    | ✅︎       | ✅︎        | ❌           | ❌         | ✅︎                | ❌            |
-| BitBLAS (GPTQ)        | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
-| AQLM                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
-| bitsandbytes          | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
-| DeepSpeedFP           | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
-| GGUF                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ✅︎        | ❌           | ❌         | ❌                | ❌            |
+| Implementation        | Volta   | Turing   | Ampere   | Ada   | Hopper   | AMD GPU   | Intel GPU   | Intel Gaudi | x86 CPU   | AWS Neuron   | Google TPU   |
+|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------|--------------|
+| AWQ                   | ❌      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌        | ✅︎          | ❌         | ✅︎        | ❌           | ❌           |
+| GPTQ                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎        | ❌        | ✅︎          | ❌         | ✅︎        | ❌           | ❌           |
+| Marlin (GPTQ/AWQ/FP8) | ❌      | ❌      | ✅︎       | ✅︎    | ✅︎       | ❌        | ❌          | ❌         | ❌        | ❌          | ❌           |
+| INT8 (W8A8)           | ❌      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌        | ❌          | ❌         | ✅︎        | ✅︎           | ✅︎            |
+| FP8 (W8A8)            | ❌      | ❌      | ❌       | ✅︎    | ✅︎      | ✅︎         | ❌          | ❌         | ❌        | ✅︎           | ❌           |
+| BitBLAS (GPTQ)        | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
+| AQLM                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
+| bitsandbytes          | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
+| DeepSpeedFP           | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
+| GGUF                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ✅︎         | ❌          | ❌         | ❌         | ❌          | ❌           |
+| INC (W8A8)            | ❌      | ❌      | ❌      | ❌    | ❌      | ❌        | ❌          | ✅︎         | ❌         | ❌           | ❌          |

 - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
 - ✅︎ indicates that the quantization method is supported on the specified hardware.
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@ -14,6 +14,7 @@ vLLM currently supports the following reasoning models:
 | [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ |
 | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
 | [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ |
+| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `guided_json`, `guided_regex` | ✅ |

 !!! note
    IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@ -103,9 +103,7 @@ When tool_choice='required' is set, the model is guaranteed to generate one or m

 vLLM supports the `tool_choice='none'` option in the chat completion API. When this option is set, the model will not generate any tool calls and will respond with regular text content only, even if tools are defined in the request.

-By default, when `tool_choice='none'` is specified, vLLM excludes tool definitions from the prompt to optimize context usage. To include tool definitions even with `tool_choice='none'`, use the `--expand-tools-even-if-tool-choice-none` option.
-
-Note: This behavior will change in v0.10.0, where tool definitions will be included by default even with `tool_choice='none'`.
+However, when `tool_choice='none'` is specified, vLLM still includes the tool definitions in the prompt.

 ## Automatic Function Calling

@ -290,6 +288,16 @@ Supported models:

 Flags: `--tool-call-parser kimi_k2`

+### Hunyuan Models (`hunyuan_a13b`)
+
+Supported models:
+
+* `tencent/Hunyuan-A13B-Instruct` (the chat template is already included in the Hugging Face model files.)
+
+Flags:
+* For non-reasoning: `--tool-call-parser hunyuan_a13b`
+* For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b --enable_reasoning`
+
 ### Models with Pythonic Tool Calls (`pythonic`)

 A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@ -94,8 +94,8 @@ Currently, there are no pre-built CPU wheels.
 ## Related runtime environment variables

 - `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. Default value is `0`.
- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node. By setting to `all`, the OpenMP threads of each rank uses all CPU cores available on the system. Default value is `auto`.
- `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. Default value is `0`.
+- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. It can be set to CPU id lists or to `auto` (the default). For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores of their respective NUMA node.
+- `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. Default value is `None`. If the value is not set and `auto` thread binding is used, no CPU is reserved when `world_size == 1`, and 1 CPU per rank is reserved when `world_size > 1`.
 - `VLLM_CPU_MOE_PREPACK` (x86 only): whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).
 - `VLLM_CPU_SGL_KERNEL` (x86 only, Experimental): whether to use small-batch optimized kernels for linear layer and MoE layer, especially for low-latency requirements like online serving. The kernels require AMX instruction set, BFloat16 weight type and weight shapes divisible by 32. Default is `0` (False).

@ -123,9 +123,13 @@ export VLLM_CPU_NUM_OF_RESERVED_CPU=1
 vllm serve facebook/opt-125m --dtype=bfloat16
 ```

+Note, it is recommended to manually reserve 1 CPU for vLLM front-end process when `world_size == 1`.
+
 ### How to decide `VLLM_CPU_OMP_THREADS_BIND`?

- Bind each OpenMP thread to a dedicated physical CPU core respectively, or use auto thread binding feature by default. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
+- The default `auto` thread binding is recommended for most cases. Ideally, each OpenMP thread is bound to its own dedicated physical core, the threads of each rank are bound to the same NUMA node, and 1 CPU per rank is reserved for other vLLM components when `world_size > 1`. If you encounter performance problems or unexpected binding behaviour, try binding threads manually as follows.
+
+- On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:

 ??? console "Commands"

--- a/docs/getting_started/installation/google_tpu.md
+++ b/docs/getting_started/installation/google_tpu.md
@ -37,7 +37,7 @@ information, see [Storage options for Cloud TPU data](https://cloud.devsite.corp

 - Google Cloud TPU VM
 - TPU versions: v6e, v5e, v5p, v4
- Python: 3.10 or newer
+- Python: 3.11 or newer

 ### Provision Cloud TPUs

@ -117,7 +117,7 @@ source ~/.bashrc
 Create and activate a Conda environment for vLLM:

 ```bash
-conda create -n vllm python=3.10 -y
+conda create -n vllm python=3.12 -y
 conda activate vllm
 ```

--- a/docs/getting_started/installation/intel_gaudi.md
+++ b/docs/getting_started/installation/intel_gaudi.md
@ -28,7 +28,7 @@ To verify that the Intel Gaudi software was correctly installed, run:
 hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
 apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
 pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
-pip list | grep neural # verify that neural_compressor is installed
+pip list | grep neural # verify that neural_compressor_pt is installed
 ```

 Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade)
@ -120,12 +120,13 @@ docker run \
 - Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html)
  for accelerating low-batch latency and throughput
 - Attention with Linear Biases (ALiBi)
+- INC quantization

 ### Unsupported features

 - Beam search
 - LoRA adapters
- Quantization
+- AWQ quantization
 - Prefill chunking (mixed-batch inferencing)

 ### Supported configurations
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@ -16,6 +16,7 @@ sys.modules["blake3"] = MagicMock()
 sys.modules["vllm._C"] = MagicMock()

 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs  # noqa: E402
+from vllm.entrypoints.openai.cli_args import make_arg_parser  # noqa: E402
 from vllm.utils import FlexibleArgumentParser  # noqa: E402

 logger = logging.getLogger("mkdocs")
@ -24,15 +25,18 @@ logger = logging.getLogger("mkdocs")
 class MarkdownFormatter(HelpFormatter):
    """Custom formatter that generates markdown for argument groups."""

-    def __init__(self, prog):
+    def __init__(self, prog, starting_heading_level=3):
        super().__init__(prog,
                         max_help_position=float('inf'),
                         width=float('inf'))
+        self._section_heading_prefix = "#" * starting_heading_level
+        self._argument_heading_prefix = "#" * (starting_heading_level + 1)
        self._markdown_output = []

    def start_section(self, heading):
        if heading not in {"positional arguments", "options"}:
-            self._markdown_output.append(f"\n### {heading}\n\n")
+            heading_md = f"\n{self._section_heading_prefix} {heading}\n\n"
+            self._markdown_output.append(heading_md)

    def end_section(self):
        pass
@ -46,9 +50,13 @@ class MarkdownFormatter(HelpFormatter):

    def add_arguments(self, actions):
        for action in actions:
+            if (len(action.option_strings) == 0
+                    or "--help" in action.option_strings):
+                continue

            option_strings = f'`{"`, `".join(action.option_strings)}`'
-            self._markdown_output.append(f"#### {option_strings}\n\n")
+            heading_md = f"{self._argument_heading_prefix} {option_strings}\n\n"
+            self._markdown_output.append(heading_md)

            if choices := action.choices:
                choices = f'`{"`, `".join(str(c) for c in choices)}`'
@ -81,6 +89,14 @@ def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
        return cls.add_cli_args(parser, **kwargs)


+def create_serve_parser() -> FlexibleArgumentParser:
+    """Create a parser for the serve command with markdown formatting."""
+    parser = FlexibleArgumentParser()
+    parser.formatter_class = lambda prog: MarkdownFormatter(
+        prog, starting_heading_level=4)
+    return make_arg_parser(parser)
+
+
 def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
    logger.info("Generating argparse documentation")
    logger.debug("Root directory: %s", ROOT_DIR.resolve())
@ -95,6 +111,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
        "engine_args": create_parser(EngineArgs),
        "async_engine_args": create_parser(AsyncEngineArgs,
                                           async_args_only=True),
+        "serve": create_serve_parser(),
    }

    # Generate documentation for each parser
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@ -316,6 +316,7 @@ Specified using `--task generate`.
 | `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ | ✅︎ |
 | `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `BailingMoeForCausalLM` | Ling | `inclusionAI/Ling-lite-1.5`, `inclusionAI/Ling-plus`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | ✅︎ |
 | `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | |
 | `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | |
@ -327,9 +328,11 @@ Specified using `--task generate`.
 | `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | | ✅︎ | ✅︎ |
 | `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3`, etc. | | ✅︎ | ✅︎ |
 | `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ |
-| `Ernie4_5_ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | | ✅︎ | ✅︎ |
-| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. | | ✅︎ | ✅︎ |
+| `Ernie4_5_ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ |
 | `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Exaone4ForCausalLM` | EXAONE-4 | `LGAI-EXAONE/EXAONE-4.0-32B`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Fairseq2LlamaForCausalLM` | Llama (fairseq2 format) | `mgleize/fairseq2-dummy-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ | ✅︎ |
 | `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | | ✅︎ | ✅︎ |
 | `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
@ -349,7 +352,7 @@ Specified using `--task generate`.
 | `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | ✅︎ |
 | `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | |
 | `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ |
-| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | | | ✅︎ |
+| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ |
 | `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
@ -358,6 +361,7 @@ Specified using `--task generate`.
 | `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | |
 | `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ | ✅︎ |
+| `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
@ -379,9 +383,9 @@ Specified using `--task generate`.
 | `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | | |
 | `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | | ✅︎ | ✅︎ |
+| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | | ✅︎ | ✅︎ |
+| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | ✅︎ |
 | `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ | ✅︎ |
 | `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
@ -572,7 +576,7 @@ Specified using `--task generate`.
 | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ |
 | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ |
 | `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `THUDM/GLM-4.1V-9B-Thinkg`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `THUDM/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
 | `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
 | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
@ -580,14 +584,15 @@ Specified using `--task generate`.
 | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
 | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
 | `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
-| `LlavaForConditionalGeneration` | LLaVA-1.5 | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | | ✅︎ | ✅︎ |
+| `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + I<sup>E+</sup> | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ | ✅︎ |
+| `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ | ✅︎ |
 | `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | ✅︎ |
 | `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ |
 | `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ |
 | `MiniCPMO` | MiniCPM-O | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup> | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. | ✅︎ | | ✅︎ |
 | `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ |
-| `Mistral3ForConditionalGeneration` | Mistral3 | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `MllamaForConditionalGeneration` | Llama 3.2 | T + I<sup>+</sup> | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | |
 | `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ |
@ -595,7 +600,7 @@ Specified using `--task generate`.
 | `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
 | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
 | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `PixtralForConditionalGeneration` | Pixtral | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc. | | ✅︎ | ✅︎ |
+| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | ✅︎ |
 | `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |
 | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
--- a/docs/serving/data_parallel_deployment.md
+++ b/docs/serving/data_parallel_deployment.md
@ -57,12 +57,20 @@ vllm serve $MODEL --headless --data-parallel-size 4 --data-parallel-size-local 4
                  --data-parallel-address 10.99.48.128 --data-parallel-rpc-port 13345
 ```

-This DP mode can also be used with Ray, in which case only a single launch command is needed irrespective of the number of nodes:
+This DP mode can also be used with Ray by specifying `--data-parallel-backend=ray`:

 ```bash
-vllm serve $MODEL --data-parallel-size 16 --tensor-parallel-size 2 --data-parallel-backend=ray
+vllm serve $MODEL --data-parallel-size 4 --data-parallel-size-local 2 \
+                  --data-parallel-backend=ray
 ```

+There are several notable differences when using Ray:
+
+- Only a single launch command (on any node) is needed to start all local and remote DP ranks, which is more convenient than launching on each node.
+- There is no need to specify `--data-parallel-address`; the node where the command is run is used as the data-parallel address.
+- There is no need to specify `--data-parallel-rpc-port`.
+- Remote DP ranks are allocated based on the node resources of the Ray cluster.
+
 Currently, the internal DP load balancing is done within the API server process(es) and is based on the running and waiting queues in each of the engines. This could be made more sophisticated in future by incorporating KV cache aware logic.

 When deploying large DP sizes using this method, the API server process can become a bottleneck. In this case, the orthogonal `--api-server-count` command line option can be used to scale this out (for example `--api-server-count=4`). This is transparent to users - a single HTTP endpoint / port is still exposed. Note that this API server scale-out is "internal" and still confined to the "head" node.
--- a/docs/serving/offline_inference.md
+++ b/docs/serving/offline_inference.md
@ -30,8 +30,31 @@ This API adds several batteries-included capabilities that simplify large-scale,
 - Automatic sharding, load balancing, and autoscaling distribute work across a Ray cluster with built-in fault tolerance.
 - Continuous batching keeps vLLM replicas saturated and maximizes GPU utilization.
 - Transparent support for tensor and pipeline parallelism enables efficient multi-GPU inference.
+- Reading from and writing to most popular file formats and cloud object storage.
+- Scaling up the workload without code changes.

-The following example shows how to run batched inference with Ray Data and vLLM:
-<gh-file:examples/offline_inference/batch_llm_inference.py>
+??? code
+
+    ```python
+    import ray  # Requires ray>=2.44.1
+    from ray.data.llm import vLLMEngineProcessorConfig, build_llm_processor
+
+    config = vLLMEngineProcessorConfig(model_source="unsloth/Llama-3.2-1B-Instruct")
+    processor = build_llm_processor(
+        config,
+        preprocess=lambda row: {
+            "messages": [
+                {"role": "system", "content": "You are a bot that completes unfinished haikus."},
+                {"role": "user", "content": row["item"]},
+            ],
+            "sampling_params": {"temperature": 0.3, "max_tokens": 250},
+        },
+        postprocess=lambda row: {"answer": row["generated_text"]},
+    )
+
+    ds = ray.data.from_items(["An old silent pond..."])
+    ds = processor(ds)
+    ds.write_parquet("local:///tmp/data/")
+    ```

 For more information about the Ray Data LLM API, see the [Ray Data LLM documentation](https://docs.ray.io/en/latest/data/working-with-llms.html).
--- a/docs/usage/v1_guide.md
+++ b/docs/usage/v1_guide.md
@ -106,14 +106,13 @@ to enable simultaneous generation and embedding using the same engine instance i

 Models using selective state-space mechanisms instead of standard transformer attention are partially supported.
 Models that use Mamba-2 layers (e.g., `Mamba2ForCausalLM`) are supported, but models that use older Mamba-1 layers
-(e.g., `MambaForCausalLM`, `JambaForCausalLM`) are not yet suported. Please note that these models currently require
+(e.g., `MambaForCausalLM`, `JambaForCausalLM`) are not yet supported. Please note that these models currently require
 enforcing eager mode and disabling prefix caching in V1.

 Models that combine Mamba-2 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
 `Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`). Please note that
 these models currently require enforcing eager mode, disabling prefix caching, and using the FlashInfer attention
-backend in V1. It is also necessary to pass a non-standard block size for attention layers (this is not possible
-using the `vllm serve` CLI yet).
+backend in V1.

 #### Encoder-Decoder Models

--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@ -10,7 +10,7 @@ on HuggingFace model repository.

 import os
 from dataclasses import asdict
-from typing import NamedTuple, Optional
+from typing import Any, NamedTuple, Optional

 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
@ -30,7 +30,9 @@ question_per_audio_count = {

 class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
-    prompt: str
+    prompt: Optional[str] = None
+    prompt_token_ids: Optional[dict[str, list[int]]] = None
+    multi_modal_data: Optional[dict[str, Any]] = None
    stop_token_ids: Optional[list[int]] = None
    lora_requests: Optional[list[LoRARequest]] = None

@ -40,6 +42,60 @@ class ModelRequestData(NamedTuple):
 # Unless specified, these settings have been tested to work on a single L4.


+# Voxtral
+def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
+    from mistral_common.audio import Audio
+    from mistral_common.protocol.instruct.messages import (
+        AudioChunk,
+        RawAudio,
+        TextChunk,
+        UserMessage,
+    )
+    from mistral_common.protocol.instruct.request import ChatCompletionRequest
+    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
+
+    model_name = "mistralai/Voxtral-Mini-3B-2507"
+    tokenizer = MistralTokenizer.from_hf_hub(model_name)
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=2,
+        limit_mm_per_prompt={"audio": audio_count},
+        config_format="mistral",
+        load_format="mistral",
+        tokenizer_mode="mistral",
+        enforce_eager=True,
+        enable_chunked_prefill=False,
+    )
+
+    text_chunk = TextChunk(text=question)
+    audios = [
+        Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
+        for i in range(audio_count)
+    ]
+    audio_chunks = [
+        AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
+    ]
+
+    messages = [UserMessage(content=[*audio_chunks, text_chunk])]
+
+    req = ChatCompletionRequest(messages=messages, model=model_name)
+
+    tokens = tokenizer.encode_chat_completion(req)
+    prompt_ids, audios = tokens.tokens, tokens.audios
+
+    audios_and_sr = [(au.audio_array, au.sampling_rate) for au in audios]
+
+    multi_modal_data = {"audio": audios_and_sr}
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt_token_ids=prompt_ids,
+        multi_modal_data=multi_modal_data,
+    )
+
+
 # Granite Speech
 def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
    # NOTE - the setting in this example are somehat different than what is
@ -243,6 +299,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:


 model_example_map = {
+    "voxtral": run_voxtral,
    "granite_speech": run_granite_speech,
    "minicpmo": run_minicpmo,
    "phi4_mm": run_phi4mm,
@ -311,16 +368,24 @@ def main(args):
        temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
    )

-    mm_data = {}
-    if audio_count > 0:
-        mm_data = {
-            "audio": [
-                asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
-            ]
-        }
+    mm_data = req_data.multi_modal_data
+    if not mm_data:
+        mm_data = {}
+        if audio_count > 0:
+            mm_data = {
+                "audio": [
+                    asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
+                ]
+            }

    assert args.num_prompts > 0
-    inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data}
+    inputs = {"multi_modal_data": mm_data}
+
+    if req_data.prompt:
+        inputs["prompt"] = req_data.prompt
+    else:
+        inputs["prompt_token_ids"] = req_data.prompt_token_ids
+
    if args.num_prompts > 1:
        # Batch inference
        inputs = [inputs] * args.num_prompts
--- a/examples/offline_inference/rlhf.py
+++ b/examples/offline_inference/rlhf.py
@ -1,17 +1,31 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
-a simple demonstration of RLHF with vLLM, inspired by
-the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF .
-It follows the design that, training processes and inference processes
-are different, and they live on different GPUs.
-Training processes send prompts to inference processes to generate data,
-and also synchronize the weights of the model by broadcasting the weights
-from the training process to the inference process.
-Note that this is a simple demonstration of one training instance and one
-inference instance. In practice, there could be multiple training instances
-and multiple inference instances. For the full implementation, please refer
-to the OpenRLHF framework.
+Demonstrates reinforcement learning from human feedback (RLHF) using vLLM and Ray.
+
+The script separates training and inference workloads onto distinct GPUs
+so that Ray can manage process placement and inter-process communication.
+A Hugging Face Transformer model occupies GPU 0 for training, whereas a
+tensor-parallel vLLM inference engine occupies GPU 1–2.
+
+The example performs the following steps:
+
+* Load the training model on GPU 0.
+* Split the inference model across GPUs 1–2 using vLLM's tensor parallelism
+  and Ray placement groups.
+* Generate text from a list of prompts using the inference engine.
+* Update the weights of the training model and broadcast the updated weights
+  to the inference engine by using a Ray collective RPC group. Note that
+  for demonstration purposes we simply zero out the weights.
+
+For a production-ready implementation that supports multiple training and
+inference replicas, see the OpenRLHF framework:
+https://github.com/OpenRLHF/OpenRLHF
+
+This example assumes a single-node cluster with three GPUs, but Ray
+supports multi-node clusters. vLLM expects the GPUs to be used only for vLLM
+workloads. Residual GPU activity interferes with vLLM memory profiling and
+causes unexpected behavior.
 """

 import os
@ -28,29 +42,27 @@ from vllm.utils import get_ip, get_open_port


 class MyLLM(LLM):
+    """Configure the vLLM worker for Ray placement group execution."""
+
    def __init__(self, *args, **kwargs):
-        # a hack to make the script work.
-        # stop ray from manipulating CUDA_VISIBLE_DEVICES
-        # at the top-level
+        # Remove the top-level CUDA_VISIBLE_DEVICES variable set by Ray
+        # so that vLLM can manage its own device placement within the worker.
        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        super().__init__(*args, **kwargs)


-"""
-Start the training process, here we use huggingface transformers 
-as an example to hold a model on GPU 0.
-"""
-
+# Load the OPT-125M model onto GPU 0 for the training workload.
 train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
 train_model.to("cuda:0")
-"""
-Start the inference process, here we use vLLM to hold a model on GPU 1 and 
-GPU 2. For the details on how to use ray, please refer to the ray 
-documentation https://docs.ray.io/en/latest/ .
-"""
+
+# Initialize Ray and set the visible devices. The vLLM engine will
+# be placed on GPUs 1 and 2.
 os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
 ray.init()

+# Create a placement group that reserves GPU 1–2 for the vLLM inference engine.
+# Learn more about Ray placement groups:
+# https://docs.ray.io/en/latest/placement-groups.html
 pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2)
 ray.get(pg_inference.ready())
 scheduling_inference = PlacementGroupSchedulingStrategy(
@ -58,10 +70,9 @@ scheduling_inference = PlacementGroupSchedulingStrategy(
    placement_group_capture_child_tasks=True,
    placement_group_bundle_index=0,
 )
-"""
-launch the vLLM inference engine.
-here we use `enforce_eager` to reduce the start time.
-"""
+
+# Launch the vLLM inference engine. The `enforce_eager` flag reduces
+# start-up latency.
 llm = ray.remote(
    num_cpus=0,
    num_gpus=0,
@ -74,7 +85,7 @@ llm = ray.remote(
    distributed_executor_backend="ray",
 )

-# Generate texts from the prompts.
+# Generate text from the prompts.
 prompts = [
    "Hello, my name is",
    "The president of the United States is",
@ -93,8 +104,8 @@ for output in outputs:
    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
    print("-" * 50)

-# set up the communication between the training process
-# and the inference engine.
+# Set up the communication channel between the training process and the
+# inference engine.
 master_address = get_ip()
 master_port = get_open_port()

@ -107,21 +118,23 @@ model_update_group = stateless_init_process_group(
 )
 ray.get(handle)

-# simulate training, modify the weights of the model.
+# Simulate a training step by zeroing out all model weights.
+# In a real RLHF training loop the weights would be updated using the gradient
+# from an RL objective such as PPO on a reward model.
 for name, p in train_model.named_parameters():
    p.data.zero_()

-# sync weight from the training process to the inference engine.
+# Synchronize the updated weights to the inference engine.
 for name, p in train_model.named_parameters():
    handle = llm.collective_rpc.remote("update_weight", args=(name, p.dtype, p.shape))
    model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream())
    ray.get(handle)

-# check if the weights are updated.
+# Verify that the inference weights have been updated.
 assert all(ray.get(llm.collective_rpc.remote("check_weights_changed")))

-# use the updated model to generate texts, they will be nonsense
-# because the weights are all zeros.
+# Generate text with the updated model. The output is expected to be nonsense
+# because the weights are zero.
 outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
 print("-" * 50)
 for output in outputs_updated:
--- a/examples/offline_inference/skip_loading_weights_in_engine_init.py
+++ b/examples/offline_inference/skip_loading_weights_in_engine_init.py
@ -0,0 +1,53 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm import LLM, RequestOutput, SamplingParams
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+
+def print_prompts_and_outputs(outputs: list[RequestOutput]) -> None:
+    print("-" * 60)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt:    {prompt!r}")
+        print(f"Output:    {generated_text!r}")
+        print("-" * 60)
+
+
+def main():
+    # Create an LLM without loading real weights
+    llm = LLM(
+        model="Qwen/Qwen3-0.6B",
+        load_format="dummy",
+        enforce_eager=True,
+        tensor_parallel_size=4,
+    )
+    outputs = llm.generate(prompts, sampling_params)
+    print("\nOutputs do not make sense:")
+    print_prompts_and_outputs(outputs)
+
+    # Update load format from `dummy` to `auto`
+    llm.collective_rpc(
+        "update_config", args=({"load_config": {"load_format": "auto"}},)
+    )
+    # Now reload the real weights in place
+    llm.collective_rpc("reload_weights")
+
+    # Check outputs make sense
+    outputs = llm.generate(prompts, sampling_params)
+    print("\nOutputs make sense after loading real weights:")
+    print_prompts_and_outputs(outputs)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@ -84,6 +84,7 @@ def main():
        gpu_memory_utilization=0.8,
        speculative_config=speculative_config,
        disable_log_stats=False,
+        max_model_len=16384,
    )

    sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len)
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@ -429,6 +429,44 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    )


+# Nemotron_VL
+def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData:
+    model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    assert modality == "image"
+    placeholder = "<image>"
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    messages = [
+        [{"role": "user", "content": f"{placeholder}\n{question}"}]
+        for question in questions
+    ]
+    prompts = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    # Stop tokens taken from the InternVL conversation template.
+    # Model variants may have different stop tokens; please refer to the
+    # model card for the correct "stop words" (NOTE: confirm for Nemotron VL):
+    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
+    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+    stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )
+
+
 # Keye-VL
 def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "Kwai-Keye/Keye-VL-8B-Preview"
@ -1186,6 +1224,7 @@ model_example_map = {
    "h2ovl_chat": run_h2ovl,
    "idefics3": run_idefics3,
    "internvl_chat": run_internvl,
+    "nemotron_vl": run_nemotron_vl,
    "keye_vl": run_keye_vl,
    "kimi_vl": run_kimi_vl,
    "llava": run_llava,
--- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py
+++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py
@ -4,7 +4,9 @@
 import os
 import socket
 import threading
+import time
 import uuid
+from typing import Any

 import aiohttp
 import msgpack
@ -12,12 +14,25 @@ import zmq
 from quart import Quart, make_response, request

 count = 0
-prefill_instances: dict[str, str] = {}  # http_address: zmq_address
-decode_instances: dict[str, str] = {}  # http_address: zmq_address
+prefill_instances: dict[str, Any] = {}  # http_address: (zmq_address, stamp)
+decode_instances: dict[str, Any] = {}  # http_address: (zmq_address, stamp)

 prefill_cv = threading.Condition()
 decode_cv = threading.Condition()

+DEFAULT_PING_SECONDS = 5
+
+
+def _remove_oldest_instances(instances: dict[str, Any]) -> None:
+    oldest_key = next(iter(instances), None)
+    while oldest_key is not None:
+        value = instances[oldest_key]
+        if value[1] > time.time():
+            break
+        print(f"🔴Remove [HTTP:{oldest_key}, ZMQ:{value[0]}, stamp:{value[1]}]")
+        instances.pop(oldest_key, None)
+        oldest_key = next(iter(instances), None)
+

 def _listen_for_register(poller, router_socket):
    while True:
@ -31,12 +46,23 @@ def _listen_for_register(poller, router_socket):
                global prefill_instances
                global prefill_cv
                with prefill_cv:
-                    prefill_instances[data["http_address"]] = data["zmq_address"]
+                    node = prefill_instances.pop(data["http_address"], None)
+                    prefill_instances[data["http_address"]] = (
+                        data["zmq_address"],
+                        time.time() + DEFAULT_PING_SECONDS,
+                    )
+                    _remove_oldest_instances(prefill_instances)
+
            elif data["type"] == "D":
                global decode_instances
                global decode_cv
                with decode_cv:
-                    decode_instances[data["http_address"]] = data["zmq_address"]
+                    node = decode_instances.pop(data["http_address"], None)
+                    decode_instances[data["http_address"]] = (
+                        data["zmq_address"],
+                        time.time() + DEFAULT_PING_SECONDS,
+                    )
+                    _remove_oldest_instances(decode_instances)
            else:
                print(
                    "Unexpected, Received message from %s, data: %s",
@ -44,6 +70,9 @@ def _listen_for_register(poller, router_socket):
                    data,
                )

+            if node is None:
+                print(f"🔵Add [HTTP:{data['http_address']}, ZMQ:{data['zmq_address']}]")
+

 def start_service_discovery(hostname, port):
    if not hostname:
@ -105,12 +134,14 @@ async def handle_request():
        with prefill_cv:
            prefill_list = list(prefill_instances.items())
            prefill_addr, prefill_zmq_addr = prefill_list[count % len(prefill_list)]
+            prefill_zmq_addr = prefill_zmq_addr[0]

        global decode_instances
        global decode_cv
        with decode_cv:
            decode_list = list(decode_instances.items())
            decode_addr, decode_zmq_addr = decode_list[count % len(decode_list)]
+            decode_zmq_addr = decode_zmq_addr[0]

        print(
            f"handle_request count: {count}, [HTTP:{prefill_addr}, "
--- a/examples/online_serving/elastic_ep/bench.sh
+++ b/examples/online_serving/elastic_ep/bench.sh
@ -0,0 +1,57 @@
+#!/bin/bash
+
+MODEL_NAME="deepseek-ai/DeepSeek-V2-Lite"
+LOCAL_MODEL_PATH="/models/models--deepseek-ai--DeepSeek-V2-Lite/snapshots/604d5664dddd88a0433dbae533b7fe9472482de0"
+HOST="localhost"
+PORT=8006
+NUM_PROMPTS=20
+REQUEST_RATE=5
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            MODEL_NAME="$2"
+            shift 2
+            ;;
+        --local-model)
+            MODEL_NAME=$LOCAL_MODEL_PATH
+            shift
+            ;;
+        --host)
+            HOST="$2"
+            shift 2
+            ;;
+        --port)
+            PORT="$2"
+            shift 2
+            ;;
+        --num-prompts)
+            NUM_PROMPTS="$2"
+            shift 2
+            ;;
+        --request-rate)
+            REQUEST_RATE="$2"
+            shift 2
+            ;;
+        -h|--help)
+            echo "Usage: $0 [OPTIONS]"
+            echo "Options:"
+            echo "  --model MODEL_NAME           Set model name or path (default: deepseek-ai/DeepSeek-V2-Lite)"
+            echo "  --local-model                Use local model path (convenience option)"
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Use -h or --help for usage information"
+            exit 1
+            ;;
+    esac
+done
+
+vllm bench serve \
+    --model $MODEL_NAME \
+    --host $HOST \
+    --port $PORT \
+    --num-prompts $NUM_PROMPTS \
+    --request-rate $REQUEST_RATE
--- a/examples/online_serving/elastic_ep/scale.py
+++ b/examples/online_serving/elastic_ep/scale.py
@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import argparse
+import json
+import sys
+
+import requests
+
+
+def scale(host, port, new_dp_size):
+    """POST the requested data-parallel size to /scale_elastic_ep.
+
+    Return True only on HTTP 200; any other outcome is printed, then False.
+    """
+    endpoint = f"http://{host}:{port}/scale_elastic_ep"
+    body = {"new_data_parallel_size": new_dp_size}
+    json_header = {"Content-Type": "application/json"}
+    print(f"Sending scale request to {endpoint}")
+    print(f"Payload: {json.dumps(body, indent=2)}")
+
+    try:
+        reply = requests.post(endpoint, json=body, headers=json_header, timeout=300)
+        print(f"Status Code: {reply.status_code}")
+        print(f"Response: {reply.text}")
+    except requests.exceptions.RequestException as e:
+        print(f"Request failed: {e}")
+        return False
+
+    if reply.status_code != 200:
+        print("Scale up/down request failed!")
+        return False
+    print("Scale up/down request successful!")
+    return True
+
+
+def main():
+    """Parse CLI options, send the scale request, exit 0 on success else 1."""
+    cli = argparse.ArgumentParser(description="Test scale up/down functionality")
+    cli.add_argument("--host", default="localhost", help="API server host")
+    cli.add_argument("--port", type=int, default=8006, help="API server port")
+    cli.add_argument(
+        "--new-dp-size", type=int, default=2, help="New data parallel size"
+    )
+    opts = cli.parse_args()
+    # Mirror the request outcome in the process exit status.
+    exit_status = 0 if scale(opts.host, opts.port, opts.new_dp_size) else 1
+    sys.exit(exit_status)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
+++ b/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
@ -0,0 +1,100 @
+#!/bin/bash
+#
+# Launch a vLLM server for the elastic EP example: data parallel on the Ray
+# backend with expert parallelism and EPLB enabled.
+
+HOST="0.0.0.0"
+PORT=8006
+DATA_PARALLEL_SIZE=4
+REDUNDANT_EXPERTS=0
+LOCAL_MODEL_PATH="/models/models--deepseek-ai--DeepSeek-V2-Lite/snapshots/604d5664dddd88a0433dbae533b7fe9472482de0"
+MODEL_NAME="deepseek-ai/DeepSeek-V2-Lite"
+
+# Print the supported options.
+usage() {
+    echo "Usage: $0 [OPTIONS]"
+    echo "Options:"
+    echo "  --dp SIZE                    Set data parallel size (default: 4)"
+    echo "  --re SIZE                    Set redundant experts (default: 0)"
+    echo "  --host HOST                  Set host address (default: 0.0.0.0)"
+    echo "  --port PORT                  Set port number (default: 8006)"
+    echo "  --model MODEL_NAME           Set model name or path (default: deepseek-ai/DeepSeek-V2-Lite)"
+    echo "  --local-model                Use local model path (convenience option)"
+    echo "  -h, --help                   Show this help message"
+}
+
+# Exit with an error unless the option in $1 is followed by a value.
+# `shift 2` fails without shifting when only the option itself is left, so
+# without this guard a trailing option makes the parsing loop spin forever.
+require_value() {
+    if [[ $# -lt 2 ]]; then
+        echo "Error: $1 requires a value"
+        echo "Use -h or --help for usage information"
+        exit 1
+    fi
+}
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --dp)
+            require_value "$@"
+            DATA_PARALLEL_SIZE="$2"
+            shift 2
+            ;;
+        --re)
+            require_value "$@"
+            REDUNDANT_EXPERTS="$2"
+            shift 2
+            ;;
+        --host)
+            require_value "$@"
+            HOST="$2"
+            shift 2
+            ;;
+        --port)
+            require_value "$@"
+            PORT="$2"
+            shift 2
+            ;;
+        --model)
+            require_value "$@"
+            MODEL_NAME="$2"
+            shift 2
+            ;;
+        --local-model)
+            MODEL_NAME=$LOCAL_MODEL_PATH
+            shift
+            ;;
+        -h|--help)
+            usage
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Use -h or --help for usage information"
+            exit 1
+            ;;
+    esac
+done
+
+echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS"
+
+export RAY_DEDUP_LOGS=0
+export VLLM_USE_V1=1
+export VLLM_ALL2ALL_BACKEND="pplx"
+export VLLM_USE_DEEP_GEMM=1
+
+# Quote every expansion so values containing spaces or glob characters reach
+# vllm as single arguments.
+vllm serve "$MODEL_NAME" \
+    --data-parallel-size "$DATA_PARALLEL_SIZE" \
+    --data-parallel-size-local "$DATA_PARALLEL_SIZE" \
+    --data-parallel-backend ray \
+    --enforce-eager \
+    --enable-expert-parallel \
+    --enable-eplb \
+    --num-redundant-experts "$REDUNDANT_EXPERTS" \
+    --trust-remote-code \
+    --host "$HOST" \
+    --port "$PORT"
--- a/examples/online_serving/run_cluster.sh
+++ b/examples/online_serving/run_cluster.sh
@ -1,35 +1,81 @@
 #!/bin/bash
+#
+# Launch a Ray cluster inside Docker for vLLM inference.
+#
+# This script can start either a head node or a worker node, depending on the
+# --head or --worker flag provided as the third positional argument.
+#
+# Usage:
+# 1. Designate one machine as the head node and execute:
+#    bash run_cluster.sh \
+#         vllm/vllm-openai \
+#         <head_node_ip> \
+#         --head \
+#         /abs/path/to/huggingface/cache \
+#         -e VLLM_HOST_IP=<head_node_ip>
+#
+# 2. On every worker machine, execute:
+#    bash run_cluster.sh \
+#         vllm/vllm-openai \
+#         <head_node_ip> \
+#         --worker \
+#         /abs/path/to/huggingface/cache \
+#         -e VLLM_HOST_IP=<worker_node_ip>
+# 
+# Each worker requires a unique VLLM_HOST_IP value.
+# Keep each terminal session open. Closing a session stops the associated Ray
+# node and thereby shuts down the entire cluster.
+# Every machine must be reachable at the supplied IP address.
+#
+# The container is named "node-<random_suffix>". To open a shell inside
+# a container after launch, use:
+#       docker exec -it node-<random_suffix> /bin/bash
+#
+# Then, you can execute vLLM commands on the Ray cluster as if it were a
+# single machine, e.g. vllm serve ...
+#
+# To stop the container, use:
+#       docker stop node-<random_suffix>

-# Check for minimum number of required arguments
+# Check for minimum number of required arguments.
 if [ $# -lt 4 ]; then
-    echo "Usage: $0 docker_image head_node_address --head|--worker path_to_hf_home [additional_args...]"
+    echo "Usage: $0 docker_image head_node_ip --head|--worker path_to_hf_home [additional_args...]"
    exit 1
 fi

-# Assign the first three arguments and shift them away
+# Extract the mandatory positional arguments and remove them from $@.
 DOCKER_IMAGE="$1"
 HEAD_NODE_ADDRESS="$2"
-NODE_TYPE="$3"  # Should be --head or --worker
+NODE_TYPE="$3"  # Should be --head or --worker.
 PATH_TO_HF_HOME="$4"
 shift 4

-# Additional arguments are passed directly to the Docker command
+# Preserve any extra arguments so they can be forwarded to Docker.
 ADDITIONAL_ARGS=("$@")

-# Validate node type
+# Validate the NODE_TYPE argument.
 if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then
    echo "Error: Node type must be --head or --worker"
    exit 1
 fi

-# Define a function to cleanup on EXIT signal
+# Generate a unique container name with random suffix.
+# Docker container names must be unique on each host.
+# The random suffix allows multiple Ray containers to run simultaneously on the same machine,
+# for example, on a multi-GPU machine.
+CONTAINER_NAME="node-${RANDOM}"
+
+# Define a cleanup routine that removes the container when the script exits.
+# This prevents orphaned containers from accumulating if the script is interrupted.
 cleanup() {
-    docker stop node
-    docker rm node
+    docker stop "${CONTAINER_NAME}"
+    docker rm "${CONTAINER_NAME}"
 }
 trap cleanup EXIT

-# Command setup for head or worker node
+# Build the Ray start command based on the node role.
+# The head node manages the cluster and accepts connections on port 6379, 
+# while workers connect to the head's address.
 RAY_START_CMD="ray start --block"
 if [ "${NODE_TYPE}" == "--head" ]; then
    RAY_START_CMD+=" --head --port=6379"
@ -37,11 +83,15 @@ else
    RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
 fi

-# Run the docker command with the user specified parameters and additional arguments
+# Launch the container with the assembled parameters.
+# --network host: Allows Ray nodes to communicate directly via host networking
+# --shm-size 10.24g: Increases shared memory
+# --gpus all: Gives container access to all GPUs on the host
+# -v HF_HOME: Mounts HuggingFace cache to avoid re-downloading models
 docker run \
    --entrypoint /bin/bash \
    --network host \
-    --name node \
+    --name "${CONTAINER_NAME}" \
    --shm-size 10.24g \
    --gpus all \
    -v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
--- a/examples/tool_chat_template_hunyuan_a13b.jinja
+++ b/examples/tool_chat_template_hunyuan_a13b.jinja
@ -0,0 +1,113 @@
+{% set loop_messages = messages %}
+{% if tools %}
+    {% set weekday_map = {'Monday': '星期一', 'Tuesday': '星期二', 'Wednesday': '星期三', 'Thursday': '星期四', 'Friday': '星期五', 'Saturday': '星期六', 'Sunday': '星期日'} %}
+    {% set weekday_cn = weekday_map[strftime_now('%A')] %}
+    {% set datetime_str = strftime_now('%Y-%m-%d %H:%M:%S') %}
+    {% set datetime_str = datetime_str + ' ' + weekday_cn %}
+    {% for message in loop_messages %}
+        {% if 'content' in message %}
+            {% set content = message['content'] %}
+        {% else %}
+            {% set content = '' %}
+        {% endif %}
+        {% if loop.index0 == 0 %}
+            {% set content_tmp = '你是一位函数组合专家。你会得到一个问题和一组可能的函数。根据问题，你需要进行一个或多个函数/工具调用以实现目的。
+如果没有一个函数可以使用，请直接使用自然语言回复用户，以助手：开头。
+如果给定的问题缺少函数所需的参数，请使用自然语言进行提问，向用户询问必要信息，以助手：开头。
+如果调用结果已经足够回答用户问题，请对历史结果进行总结，使用自然语言回复用户，以助手：开头。
+你应该只在工具调用部分返回函数调用。如果你决定调用任何函数，你必须将其格式化为<tool_calls>[{"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}},...]</tool_calls>。你不应该在回复中包含任何其他文本。以下是你可以调用的函数列表，格式为JSON。
+' %}
+            {% set content_tmp = content_tmp + '
+' + tools | tojson + '
+' %}
+            {% if message['role'] == 'system' %}
+                {% set content_tmp = content_tmp + '
+额外要求：
+' + content + '
+
+如果你决定返回函数调用，请将其格式化为<tool_calls>[{"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}},...]</tool_calls>，不得包含其他文本。如果额外要求里有格式要求，请忽略，以此处为准。
+否则，请参考开头说的三种情况，以助手：开头进行回复。
+
+如果额外要求里有时间信息，就以额外要求里的时间为准，否则，参考当前时间：' + datetime_str %}
+                {% set content = '<|startoftext|>' + content_tmp + '<|extra_4|>' %}
+            {% elif message['role'] == 'user' %}
+                {% set content_tmp = content_tmp + '
+如果你决定返回函数调用，请将其格式化为<tool_calls>[{"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}},...]</tool_calls>，不得包含其他文本。
+否则，请参考开头说的三种情况，以助手：开头进行回复。
+
+当前时间：' + datetime_str %}
+                {% set content_tmp = '<|startoftext|>' + content_tmp + '<|extra_4|>'%}
+                {% set content = content_tmp + '用户：' + content + '<|extra_0|>' %}
+            {% endif %}
+        {% else %}
+            {% if message['role'] == 'user' %}
+                {% set content = '用户：' + content + '<|extra_0|>' %}
+            {% elif message['role'] == 'assistant' %}
+                {% if 'tool_calls' in message %}
+                    {% set tool_calls = message['tool_calls'] %}
+                    {% set ns = namespace(tool_calls="[") %}
+                    {% for tool_call in tool_calls %}
+                        {% set function = tool_call['function'] %}
+                        {% set name = function['name'] %}
+                        {% set ns.tool_calls = ns.tool_calls + '{"name": "' + name + '", '%}
+                        {% set arguments = function['arguments'] %}
+                        {% if arguments is not string %}
+                            {% set arguments = arguments | tojson %}
+                        {% endif %}
+                        {% set ns.tool_calls = ns.tool_calls + '"arguments": ' + arguments + '}' %}
+                        {% if not loop.last %}
+                            {% set ns.tool_calls = ns.tool_calls + ', '%}
+                        {% endif %}
+                    {% endfor %}
+                    {% set ns.tool_calls = ns.tool_calls + ']' %}
+                    {% set content = content + '<tool_calls>' + ns.tool_calls + '</tool_calls>' %}
+                {% else %}
+                    {% set content = '助手：' + content %}
+                {% endif %}
+                {% set content = content + '<|eos|>' %}
+            {% elif message['role'] == 'tool' %}
+                {% if content is not string %}
+                    {% set content = content | tojson %}{# serialize structured tool output so the string concatenation below works #}
+                {% endif %}
+                {% set content = '<tool_response>' + content + '</tool_response>' %}
+                {% set content = content + '<|extra_0|>' %}
+            {% endif %}
+        {% endif %}
+    {{- content -}}
+    {% endfor %}
+{% else %}
+    {% set context = {'has_head': true} %}
+    {% for message in loop_messages %}
+        {% if 'content' in message %}
+            {% set content = message['content'] %}
+        {% else %}
+            {% set content = '' %}
+        {% endif %}
+        {% if loop.index0 == 0 %}
+            {% if content == '' %}
+                {% set _ = context.update({'has_head': false}) %}
+            {% elif message['role'] == 'system' %}
+                {% set content = '<|startoftext|>' + content + '<|extra_4|>' %}
+            {% endif %}
+        {% endif %}
+        {% if message['role'] == 'user' %}
+            {% if loop.index0 == 1 and not context.has_head %}
+                {% set content = '<|startoftext|>' + content %}
+            {% endif %}
+            {% if loop.index0 == 1 and context.has_head %}
+                {% set content = content + '<|extra_0|>' %}
+            {% else %}
+                {% set content = '<|startoftext|>' + content + '<|extra_0|>' %}
+            {% endif %}
+        {% elif message['role'] == 'assistant' %}
+            {% set content = content + '<|eos|>' %}
+        {% elif message['role'] == 'tool' %}
+            {% set content = content + '<|extra_0|>' %}
+        {% endif %}
+        {{- content -}}
+    {% endfor %}
+{% endif %}
+{%- if enable_thinking is defined and enable_thinking is false %}
+    {{- '<think>\n\n</think>\n' }}
+{%- endif %}
+
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@ -61,6 +61,7 @@ plugins:
  - search
  - autorefs
  - awesome-nav
+  - glightbox
  # For API reference generation
  - api-autonav:
      modules: ["vllm"]
--- a/pyproject.toml
+++ b/pyproject.toml
@ -6,7 +6,7 @@ requires = [
    "packaging>=24.2",
    "setuptools>=77.0.3,<80.0.0",
    "setuptools-scm>=8.0",
-    "torch == 2.7.0",
+    "torch == 2.7.1",
    "wheel",
    "jinja2",
 ]
@ -73,7 +73,6 @@ line-length = 80
 "vllm/engine/**/*.py" = ["UP006", "UP035"]
 "vllm/executor/**/*.py" = ["UP006", "UP035"]
 "vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"]
-"vllm/spec_decode/**/*.py" = ["UP006", "UP035"]
 "vllm/worker/**/*.py" = ["UP006", "UP035"]
 # Python 3.8 typing - skip utils for ROCm
 "vllm/utils/__init__.py" = ["UP006", "UP035"]
@ -174,3 +173,186 @@ respect-ignore-files = true

 [tool.ty.environment]
 python = "./.venv"
+
+[tool.typos.files]
+# These files may contain non-English words.
+extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
+    "benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
+    "vllm/third_party/*"]
+ignore-hidden = true
+ignore-files = true
+ignore-dot = true
+ignore-vcs = true
+ignore-global = true
+ignore-parent = true
+
+[tool.typos.default]
+binary = false
+check-filename = false
+check-file = true
+unicode = true
+ignore-hex = true
+identifier-leading-digits = false
+locale = "en"
+extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw",
+    ".*UE8M0.*", ".*(UE4M3|ue4m3).*", ".*eles.*",  # group: [..|..] would be a character class
+     ".*[Tt]h[rR].*"]
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.default.extend-identifiers]
+bbc5b7ede = "bbc5b7ede"
+womens_doubles = "womens_doubles"
+v_2nd = "v_2nd"
+# splitted_input = "splitted_input"
+NOOPs = "NOOPs"
+typ = "typ"
+nin_shortcut = "nin_shortcut"
+UperNetDecoder = "UperNetDecoder"
+subtile = "subtile"
+cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin"
+SFOuput = "SFOuput"
+# huggingface transformers repo uses these words
+depthwise_seperable_out_channel = "depthwise_seperable_out_channel"
+DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d"
+depthwise_seperable_CNN = "depthwise_seperable_CNN"
+
+[tool.typos.default.extend-words]
+iy = "iy"
+tendencias = "tendencias"
+# intel cpu features
+tme = "tme"
+dout = "dout"
+Pn = "Pn"
+arange = "arange"
+
+[tool.typos.type.py]
+extend-glob = []
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.py.extend-identifiers]
+arange = "arange"
+NDArray = "NDArray"
+EOFError = "EOFError"
+fo = "fo"
+ba = "ba"
+
+[tool.typos.type.py.extend-words]
+
+[tool.typos.type.cpp]
+extend-glob = ["*.cu"]
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.cpp.extend-identifiers]
+countr_one = "countr_one"
+k_ot = "k_ot"
+ot = "ot"
+
+[tool.typos.type.cpp.extend-words]
+
+[tool.typos.type.rust]
+extend-glob = []
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.rust.extend-identifiers]
+flate2 = "flate2"
+
+[tool.typos.type.rust.extend-words]
+ser = "ser"
+
+[tool.typos.type.lock]
+extend-glob = []
+check-file = false
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.lock.extend-identifiers]
+
+[tool.typos.type.lock.extend-words]
+
+[tool.typos.type.jl]
+extend-glob = []
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.jl.extend-identifiers]
+
+[tool.typos.type.jl.extend-words]
+modul = "modul"
+egals = "egals"
+usig = "usig"
+egal = "egal"
+
+[tool.typos.type.go]
+extend-glob = []
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.go.extend-identifiers]
+flate = "flate"
+
+[tool.typos.type.go.extend-words]
+
+[tool.typos.type.css]
+extend-glob = []
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.css.extend-identifiers]
+nd = "nd"
+
+[tool.typos.type.css.extend-words]
+
+[tool.typos.type.man]
+extend-glob = []
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.man.extend-identifiers]
+Nd = "Nd"
+
+[tool.typos.type.man.extend-words]
+
+[tool.typos.type.cert]
+extend-glob = []
+check-file = false
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.cert.extend-identifiers]
+
+[tool.typos.type.cert.extend-words]
+
+[tool.typos.type.sh]
+extend-glob = []
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.sh.extend-identifiers]
+ot = "ot"
+
+[tool.typos.type.sh.extend-words]
+
+[tool.typos.type.vimscript]
+extend-glob = []
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.vimscript.extend-identifiers]
+windo = "windo"
+
+[tool.typos.type.vimscript.extend-words]
--- a/requirements/build.txt
+++ b/requirements/build.txt
@ -4,7 +4,7 @@ ninja
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8
-torch==2.7.0
+torch==2.7.1
 wheel
 jinja2>=3.1.6
 regex
--- a/requirements/common.txt
+++ b/requirements/common.txt
@ -7,7 +7,7 @@ requests >= 2.26.0
 tqdm
 blake3
 py-cpuinfo
-transformers >= 4.51.1
+transformers >= 4.53.2
 huggingface-hub[hf_xet] >= 0.33.0  # Required for Xet downloads.
 tokenizers >= 0.21.1  # Required for fast incremental detokenization.
 protobuf # Required by LlamaTokenizer.
@ -25,7 +25,7 @@ outlines_core == 0.2.10
 # required for outlines backend disk cache
 diskcache == 5.6.3
 lark == 1.2.2
-xgrammar == 0.1.19; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
+xgrammar == 0.1.21; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs
@ -33,7 +33,7 @@ pyzmq >= 25.0.0
 msgspec
 gguf >= 0.13.0
 importlib_metadata; python_version < '3.10'
-mistral_common[opencv] >= 1.6.2
+mistral_common[opencv] >= 1.8.0
 opencv-python-headless >= 4.11.0    # required for video IO
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
@ -47,3 +47,4 @@ python-json-logger # Used by logging as per examples/others/logging_configuratio
 scipy # Required for phi-4-multimodal-instruct
 ninja # Required for xgrammar, rocm, tpu, xpu
 pybase64 # fast base64 implementation
+cbor2 # Required for cross-language serialization of hashable objects
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@ -24,6 +24,4 @@ datasets # for benchmark scripts
 # Intel Extension for PyTorch, only for x86_64 CPUs
 intel-openmp==2024.2.1; platform_machine == "x86_64"
 intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
-py-libnuma; platform_system != "Darwin"
-psutil; platform_system != "Darwin"
 triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile.
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@ -6,9 +6,9 @@ numba == 0.61.2; python_version > '3.9'

 # Dependencies for NVIDIA GPUs
 ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1.
-torch==2.7.0
-torchaudio==2.7.0
+torch==2.7.1
+torchaudio==2.7.1
 # These must be updated alongside torch
-torchvision==0.22.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-# https://github.com/facebookresearch/xformers/releases/tag/v0.0.30
-xformers==0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch >= 2.7
+torchvision==0.22.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+# https://github.com/facebookresearch/xformers/releases/tag/v0.0.31
+xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch >= 2.7
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@ -4,6 +4,7 @@ mkdocs-material
 mkdocstrings-python
 mkdocs-gen-files
 mkdocs-awesome-nav
+mkdocs-glightbox
 python-markdown-math
 regex
 ruff
@ -11,10 +12,12 @@ ruff
 # Required for argparse hook only
 -f https://download.pytorch.org/whl/cpu
 cachetools
+cbor2
 cloudpickle
 fastapi
 msgspec
 openai
+partial-json-parser
 pillow
 psutil
 pybase64
--- a/requirements/hpu.txt
+++ b/requirements/hpu.txt
@ -1,12 +0,0 @@
-# Common dependencies
-r common.txt
-
-# Dependencies for HPU code
-ray
-triton==3.1.0
-pandas
-numpy==1.26.4
-tabulate
-setuptools>=77.0.3,<80.0.0
-setuptools-scm>=8
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@f1f6624
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@ -23,7 +23,7 @@ jiwer # required for audio tests
 timm # required for internvl test
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
-mistral_common[opencv] >= 1.6.2 # required for pixtral test
+mistral_common[opencv] >= 1.8.0 # required for voxtral test
 num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
--- a/requirements/test.in
+++ b/requirements/test.in
@ -22,14 +22,15 @@ sentence-transformers # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests
 timm # required for internvl test
-torch==2.7.0
-torchaudio==2.7.0
-torchvision==0.22.0
+torch==2.7.1
+torchaudio==2.7.1
+torchvision==0.22.1
 transformers_stream_generator # required for qwen-vl test
 mamba_ssm # required for plamo2 test
 matplotlib # required for qwen-vl test
-mistral_common[opencv] >= 1.6.2 # required for pixtral test
+mistral_common[opencv] >= 1.8.0 # required for voxtral test
 num2words # required for smolvlm test
+open_clip_torch==2.32.0 # Required for nemotron_vl test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.8 # required for model evaluation test
--- a/requirements/test.txt
+++ b/requirements/test.txt
@ -174,6 +174,8 @@ fsspec==2024.9.0
    #   fastparquet
    #   huggingface-hub
    #   torch
+ftfy==6.3.1
+    # via open-clip-torch
 genai-perf==0.0.8
    # via -r requirements/test.in
 genson==1.3.0
@ -208,6 +210,7 @@ huggingface-hub==0.33.0
    #   accelerate
    #   datasets
    #   evaluate
+    #   open-clip-torch
    #   peft
    #   sentence-transformers
    #   timm
@ -305,7 +308,7 @@ mbstrdecoder==1.1.3
    #   typepy
 mdurl==0.1.2
    # via markdown-it-py
-mistral-common==1.6.2
+mistral-common==1.8.0
    # via -r requirements/test.in
 more-itertools==10.5.0
    # via lm-eval
@ -414,6 +417,8 @@ nvidia-nvjitlink-cu12==12.8.61
    #   torch
 nvidia-nvtx-cu12==12.8.55
    # via torch
+open-clip-torch==2.32.0
+    # via -r requirements/test.in
 opencensus==0.11.4
    # via ray
 opencensus-context==0.1.3
@ -518,6 +523,8 @@ pyasn1-modules==0.4.2
    # via google-auth
 pybind11==2.13.6
    # via lm-eval
+pycountry==24.6.1
+    # via pydantic-extra-types
 pycparser==2.22
    # via cffi
 pycryptodomex==3.22.0
@ -528,9 +535,12 @@ pydantic==2.11.5
    #   datamodel-code-generator
    #   mistral-common
    #   mteb
+    #   pydantic-extra-types
    #   ray
 pydantic-core==2.33.2
    # via pydantic
+pydantic-extra-types==2.10.5
+    # via mistral-common
 pygments==2.18.0
    # via rich
 pyparsing==3.2.0
@ -610,6 +620,7 @@ referencing==0.35.1
 regex==2024.9.11
    # via
    #   nltk
+    #   open-clip-torch
    #   sacrebleu
    #   tiktoken
    #   transformers
@ -660,6 +671,7 @@ sacrebleu==2.4.3
 safetensors==0.4.5
    # via
    #   accelerate
+    #   open-clip-torch
    #   peft
    #   timm
    #   transformers
@ -748,7 +760,9 @@ tiktoken==0.7.0
    #   lm-eval
    #   mistral-common
 timm==1.0.11
-    # via -r requirements/test.in
+    # via
+    #   -r requirements/test.in
+    #   open-clip-torch
 tokenizers==0.21.1
    # via
    #   -r requirements/test.in
@ -757,7 +771,7 @@ tomli==2.2.1
    # via schemathesis
 tomli-w==1.2.0
    # via schemathesis
-torch==2.7.0+cu128
+torch==2.7.1+cu128
    # via
    #   -r requirements/test.in
    #   accelerate
@ -767,6 +781,7 @@ torch==2.7.0+cu128
    #   lm-eval
    #   mamba-ssm
    #   mteb
+    #   open-clip-torch
    #   peft
    #   runai-model-streamer
    #   sentence-transformers
@ -776,14 +791,15 @@ torch==2.7.0+cu128
    #   torchvision
    #   vector-quantize-pytorch
    #   vocos
-torchaudio==2.7.0+cu128
+torchaudio==2.7.1+cu128
    # via
    #   -r requirements/test.in
    #   encodec
    #   vocos
-torchvision==0.22.0+cu128
+torchvision==0.22.1+cu128
    # via
    #   -r requirements/test.in
+    #   open-clip-torch
    #   timm
 tqdm==4.66.6
    # via
@ -793,6 +809,7 @@ tqdm==4.66.6
    #   lm-eval
    #   mteb
    #   nltk
+    #   open-clip-torch
    #   peft
    #   pqdm
    #   sentence-transformers
@ -811,7 +828,7 @@ transformers==4.53.2
    #   transformers-stream-generator
 transformers-stream-generator==0.0.5
    # via -r requirements/test.in
-triton==3.3.0
+triton==3.3.1
    # via torch
 tritonclient==2.51.0
    # via
@ -835,6 +852,7 @@ typing-extensions==4.12.2
    #   pqdm
    #   pydantic
    #   pydantic-core
+    #   pydantic-extra-types
    #   torch
    #   typer
    #   typing-inspection
@ -857,6 +875,8 @@ virtualenv==20.31.2
    # via ray
 vocos==0.1.0
    # via -r requirements/test.in
+wcwidth==0.2.13
+    # via ftfy
 webcolors==24.11.1
    # via jsonschema
 werkzeug==3.1.3
--- a/requirements/tpu.txt
+++ b/requirements/tpu.txt
@ -18,9 +18,8 @@ setuptools==78.1.0
 --find-links https://storage.googleapis.com/libtpu-releases/index.html
 --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
 --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-torch==2.9.0.dev20250703
-torchvision==0.24.0.dev20250703
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250703-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250703-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250703-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch==2.9.0.dev20250716
+torchvision==0.24.0.dev20250716
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp312-cp312-linux_x86_64.whl ; python_version == "3.12"

--- a/setup.py
+++ b/setup.py
@ -410,29 +410,6 @@ class repackage_wheel(build_ext):
                package_data[package_name].append(file_name)


-def _is_hpu() -> bool:
-    # if VLLM_TARGET_DEVICE env var was set explicitly, skip HPU autodetection
-    if os.getenv("VLLM_TARGET_DEVICE", None) == VLLM_TARGET_DEVICE:
-        return VLLM_TARGET_DEVICE == "hpu"
-
-    # if VLLM_TARGET_DEVICE was not set explicitly, check if hl-smi succeeds,
-    # and if it doesn't, check if habanalabs driver is loaded
-    is_hpu_available = False
-    try:
-        out = subprocess.run(["hl-smi"], capture_output=True, check=True)
-        is_hpu_available = out.returncode == 0
-    except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
-        if sys.platform.startswith("linux"):
-            try:
-                output = subprocess.check_output(
-                    'lsmod | grep habanalabs | wc -l', shell=True)
-                is_hpu_available = int(output) > 0
-            except (ValueError, FileNotFoundError, PermissionError,
-                    subprocess.CalledProcessError):
-                pass
-    return is_hpu_available
-
-
 def _no_device() -> bool:
    return VLLM_TARGET_DEVICE == "empty"

@ -440,7 +417,7 @@ def _no_device() -> bool:
 def _is_cuda() -> bool:
    has_cuda = torch.version.cuda is not None
    return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
-            and not (_is_neuron() or _is_tpu() or _is_hpu()))
+            and not (_is_neuron() or _is_tpu()))


 def _is_hip() -> bool:
@ -573,12 +550,6 @@ def get_vllm_version() -> str:
        if neuron_version != MAIN_CUDA_VERSION:
            neuron_version_str = neuron_version.replace(".", "")[:3]
            version += f"{sep}neuron{neuron_version_str}"
-    elif _is_hpu():
-        # Get the Intel Gaudi Software Suite version
-        gaudi_sw_version = str(get_gaudi_sw_version())
-        if gaudi_sw_version != MAIN_CUDA_VERSION:
-            gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
-            version += f"{sep}gaudi{gaudi_sw_version}"
    elif _is_tpu():
        version += f"{sep}tpu"
    elif _is_cpu():
@ -625,8 +596,6 @@ def get_requirements() -> list[str]:
        requirements = _read_requirements("rocm.txt")
    elif _is_neuron():
        requirements = _read_requirements("neuron.txt")
-    elif _is_hpu():
-        requirements = _read_requirements("hpu.txt")
    elif _is_tpu():
        requirements = _read_requirements("tpu.txt")
    elif _is_cpu():
@ -635,8 +604,7 @@ def get_requirements() -> list[str]:
        requirements = _read_requirements("xpu.txt")
    else:
        raise ValueError(
-            "Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
-            "or CPU.")
+            "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")
    return requirements


@ -692,7 +660,8 @@ setup(
        "tensorizer": ["tensorizer==2.10.1"],
        "fastsafetensors": ["fastsafetensors >= 0.1.10"],
        "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
-        "audio": ["librosa", "soundfile"],  # Required for audio processing
+        "audio": ["librosa", "soundfile",
+                  "mistral_common[audio]"],  # Required for audio processing
        "video": []  # Kept for backwards compatibility
    },
    cmdclass=cmdclass,
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@ -29,7 +29,7 @@ def _query_server_long(prompt: str) -> dict:


@pytest.fixture
-def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
+def api_server(distributed_executor_backend: str):
    script_path = Path(__file__).parent.joinpath(
        "api_server_async_engine.py").absolute()
    commands = [
@ -40,8 +40,6 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
        "facebook/opt-125m",
        "--host",
        "127.0.0.1",
-        "--tokenizer-pool-size",
-        str(tokenizer_pool_size),
        "--distributed-executor-backend",
        distributed_executor_backend,
    ]
@ -54,10 +52,8 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
    uvicorn_process.terminate()


-@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
-def test_api_server(api_server, tokenizer_pool_size: int,
-                    distributed_executor_backend: str):
+def test_api_server(api_server, distributed_executor_backend: str):
    """
    Run the API server and test it.

--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@ -26,6 +26,30 @@ def test_use_cudagraphs_dynamic(monkeypatch):
    assert not vllm_config.compilation_config.use_cudagraph


+# NB: We don't test VLLM_DISABLE_COMPILE_CACHE=0 because that depends
+# on the state of the cache directory on the current machine, which
+# may be influenced by other tests.
+@pytest.mark.parametrize("val", ["1"])
+def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
+    """Load a model with VLLM_DISABLE_COMPILE_CACHE set, inside a
+    compilation_counter.expect() asking for zero cache-entry updates and
+    zero saved compiled artifacts."""
+    assert vllm.envs.VLLM_USE_V1
+
+    # spawn means that the counters are in the same process.
+    monkeypatch.setenv('VLLM_WORKER_MULTIPROC_METHOD', "spawn")
+    monkeypatch.setenv('VLLM_DISABLE_COMPILE_CACHE', val)
+
+    no_cudagraph = {"use_cudagraph": False}  # speed things up a bit
+    unchanged_cache = compilation_counter.expect(
+        num_cache_entries_updated=0, num_compiled_artifacts_saved=0)
+    # Loading the model causes compilation (if enabled) to happen.
+    with unchanged_cache, vllm_runner('facebook/opt-125m',
+                                      compilation_config=no_cudagraph,
+                                      gpu_memory_utilization=0.4):
+        pass
+
+
@pytest.mark.parametrize("enabled", [True, False])
 def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
    assert vllm.envs.VLLM_USE_V1
--- a/tests/compile/test_fusion_all_reduce.py
+++ b/tests/compile/test_fusion_all_reduce.py
@ -132,9 +132,7 @@ def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int,
                                           dtype=dtype,
                                           seed=42)

-    all_reduce_fusion_pass = AllReduceFusionPass(
-        vllm_config, vllm_config.compilation_config.pass_config.
-        fi_allreduce_fusion_max_token_num)
+    all_reduce_fusion_pass = AllReduceFusionPass(vllm_config)
    backend = TestBackend(all_reduce_fusion_pass)

    model = test_model_cls(hidden_size)
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -804,7 +804,7 @@ class VllmRunner:

    def get_inputs(
        self,
-        prompts: Union[list[str], list[torch.Tensor]],
+        prompts: Union[list[str], list[torch.Tensor], list[int]],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
@ -826,11 +826,16 @@ class VllmRunner:
            if audios is not None and (audio := audios[i]) is not None:
                multi_modal_data["audio"] = audio

-            text_prompt_kwargs = {
-                ("prompt" if isinstance(prompt, str) else "prompt_embeds"):
-                prompt,
+            text_prompt_kwargs: dict[str, Any] = {
                "multi_modal_data": multi_modal_data or None
            }
+            if isinstance(prompt, str):
+                text_prompt_kwargs["prompt"] = prompt
+            elif isinstance(prompt, list):
+                text_prompt_kwargs["prompt_token_ids"] = prompt
+            else:
+                text_prompt_kwargs["prompt_embeds"] = prompt
+
            inputs.append(TextPrompt(**text_prompt_kwargs))

        return inputs
--- a/tests/core/test_serialization.py
+++ b/tests/core/test_serialization.py
@ -6,7 +6,7 @@ import msgspec
 from vllm.executor.msgspec_utils import decode_hook, encode_hook
 from vllm.sequence import ExecuteModelRequest

-from ..spec_decode.utils import create_batch
+from .utils import create_batch


 def test_msgspec_serialization():
--- a/tests/core/utils.py
+++ b/tests/core/utils.py
@ -4,15 +4,16 @@
 import time
 from collections import defaultdict
 from collections.abc import Sequence as GenericSequence
-from typing import Any, Optional
+from itertools import count
+from typing import Any, Optional, Union

 import torch

-from vllm import SamplingParams
 from vllm.core.scheduler import Scheduler, SchedulerOutputs
 from vllm.inputs import EncoderDecoderInputs, embeds_inputs, token_inputs
 from vllm.lora.request import LoRARequest
-from vllm.sequence import (Logprob, Sequence, SequenceGroup,
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import (Logprob, Sequence, SequenceData, SequenceGroup,
                           SequenceGroupMetadata)


@ -262,3 +263,130 @@ class SchedulerProxy:
        self, ) -> tuple[list[SequenceGroupMetadata], SchedulerOutputs, Any]:
        _, _, ret = self.call_history["schedule"][-1]
        return ret
+
+
+def create_seq_group_metadata_from_prompts(
+    prompts: list[list[int]],
+    num_gpu_blocks: int,
+    block_size: int,
+    final_prompt_lens: list[int],
+    continuations: Optional[list[list[int]]] = None,
+    seq_ids: Optional[list[int]] = None,
+) -> list[SequenceGroupMetadata]:
+    """Build one SequenceGroupMetadata per prompt (test helper).
+
+    Sequence i is given round_up_to_next_block(final_prompt_lens[i],
+    block_size) block ids popped off the end of range(num_gpu_blocks), and
+    is flagged as a prompt only when its continuation is empty.
+
+    NOTE(review): seq_ids is accepted but never read; sequences are keyed
+    by their index in prompts. Confirm that no caller relies on it.
+    """
+    if continuations is None:
+        continuations = [[] for _ in prompts]
+    free_gpu_blocks = list(range(num_gpu_blocks))
+    block_allocations = {
+        i: [free_gpu_blocks.pop()
+            for _ in range(round_up_to_next_block(final_len, block_size))]
+        for i, final_len in enumerate(final_prompt_lens)
+    }
+
+    seq_group_metadata_list = []
+    for i, (prompt_token_ids,
+            cont_token_ids) in enumerate(zip(prompts, continuations)):
+        data = SequenceData.from_seqs(prompt_token_ids, cont_token_ids)
+        data.update_num_computed_tokens(
+            len(prompt_token_ids) + len(cont_token_ids) - 1)
+        seq_group_metadata_list.append(
+            SequenceGroupMetadata(
+                request_id=str(i),
+                is_prompt=len(cont_token_ids) == 0,
+                seq_data={i: data},
+                sampling_params=SamplingParams(temperature=0.0),
+                block_tables={i: block_allocations[i][:]},
+            ))
+    return seq_group_metadata_list
+
+
+def create_chunked_seq_group_metadata_from_prompt(
+        prompt: list[int],
+        num_gpu_blocks: int,
+        chunk_size: int,
+        block_size: int,
+        seq_id: Optional[int] = None) -> list[SequenceGroupMetadata]:
+    """Split one prompt into chunk_size-token prefill steps (test helper).
+
+    Each step wraps the full prompt, with update_num_computed_tokens() given
+    the chunk's start offset; only the final step has do_sample=True. All
+    steps share one block list popped off the end of range(num_gpu_blocks).
+
+    NOTE(review): seq_data and block_tables are keyed by chunk index; seq_id
+    (default 0) only feeds request_id. Confirm this is intended.
+    """
+    request_id = str(0 if seq_id is None else seq_id)
+    free_gpu_blocks = list(range(num_gpu_blocks))
+    num_blocks = round_up_to_next_block(len(prompt), block_size)
+    block_allocations = [free_gpu_blocks.pop() for _ in range(num_blocks)]
+    seq_group_metadata_list = []
+    for i, start in enumerate(range(0, len(prompt), chunk_size)):
+        data = SequenceData.from_seqs(prompt)
+        data.update_num_computed_tokens(start)
+        seq_group_metadata_list.append(
+            SequenceGroupMetadata(
+                request_id=request_id,
+                is_prompt=True,
+                do_sample=start + chunk_size >= len(prompt),  # terminal chunk
+                seq_data={i: data},
+                sampling_params=SamplingParams(temperature=0.0),
+                block_tables={i: block_allocations},
+                token_chunk_size=len(prompt[start:start + chunk_size])))
+    return seq_group_metadata_list
+
+
+def create_batch(batch_size,
+                 k,
+                 prompt_len: Union[int, list[int]] = 10,
+                 prev_output_token_len: int = 10,
+                 seq_ids: Optional[list[int]] = None,
+                 num_gpu_blocks: Optional[int] = None,
+                 block_size: Optional[int] = None,
+                 prefill_chunk_size: Optional[int] = None):
+    """Create a test batch of sequence-group metadata with synthetic tokens.
+
+    Token ids are drawn from one shared counter, so none repeats in the
+    batch. With a truthy prefill_chunk_size each prompt is expanded into
+    chunked-prefill entries (the list is cut to batch_size items) and no
+    previous outputs exist; otherwise each sequence also gets
+    prev_output_token_len output tokens and blocks for k + 1 further ones.
+
+    Returns (seq_group_metadata_list, prompts, prev_output_tokens).
+    """
+    block_size = 8 if block_size is None else block_size
+    if num_gpu_blocks is None:
+        num_gpu_blocks = 2048 // block_size
+    token_ids = count()
+    prompt_lens = ([prompt_len] * batch_size
+                   if isinstance(prompt_len, int) else prompt_len)
+    prompts = [[next(token_ids) for _ in range(n)] for n in prompt_lens]
+
+    if not prefill_chunk_size:
+        prev_output_tokens = [[
+            next(token_ids) for _ in range(prev_output_token_len)
+        ] for _ in range(batch_size)]
+        final_prompt_lens = [
+            len(p) + len(out) + k + 1
+            for p, out in zip(prompts, prev_output_tokens)
+        ]
+        seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+            prompts, num_gpu_blocks, block_size, final_prompt_lens,
+            prev_output_tokens, seq_ids)
+        return seq_group_metadata_list, prompts, prev_output_tokens
+
+    # Chunked prefill: one metadata entry per chunk of every prompt.
+    chunked_ids = seq_ids if seq_ids else list(range(len(prompts)))
+    seq_group_metadata_list = []
+    for p, sid in zip(prompts, chunked_ids):
+        seq_group_metadata_list.extend(
+            create_chunked_seq_group_metadata_from_prompt(
+                p, num_gpu_blocks, prefill_chunk_size, block_size, sid))
+    return seq_group_metadata_list[:batch_size], prompts, []
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@ -14,8 +14,9 @@ from typing import Literal, NamedTuple, Optional

 import pytest

-from vllm.config import TaskOption
+from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, TaskOption
 from vllm.logger import init_logger
+from vllm.transformers_utils.config import get_config

 from ..models.registry import HF_EXAMPLE_MODELS
 from ..utils import compare_two_settings, create_new_process_for_each_test
@ -158,7 +159,7 @@ TEXT_GENERATION_MODELS = {
    "databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
    "Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
    "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
-    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(),
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(tp_base=2),
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
    "tiiuae/falcon-7b": PPTestSettings.fast(),
    "google/gemma-1.1-2b-it": PPTestSettings.fast(),
@ -210,9 +211,11 @@ TEXT_GENERATION_MODELS = {

 EMBEDDING_MODELS = {  # type: ignore[var-annotated]
    # [Text-only]
-    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
-    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
-    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(load_format="dummy"),
+    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(task="embed"),
+    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(task="embed"),
+    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
+        load_format="dummy", task="embed"
+    ),
 }

 MULTIMODAL_MODELS = {
@ -248,6 +251,7 @@ TEST_MODELS = [
    "meta-llama/Llama-3.2-1B-Instruct",
    "ArthurZ/Ilama-3.2-1B",
    "ibm/PowerLM-3b",
+    "deepseek-ai/DeepSeek-V2-Lite-Chat",
    # [LANGUAGE EMBEDDING]
    "intfloat/e5-mistral-7b-instruct",
    "BAAI/bge-multilingual-gemma2",
@ -287,6 +291,11 @@ def _compare_tp(
    trust_remote_code = model_info.trust_remote_code
    tokenizer_mode = model_info.tokenizer_mode
    hf_overrides = model_info.hf_overrides
+    hf_config = get_config(model_id, trust_remote_code)
+
+    dtype = "float16"
+    if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
+        dtype = "bfloat16"

    if load_format == "dummy":
        # Avoid OOM
@ -316,7 +325,7 @@ def _compare_tp(
    common_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
-        "float16",
+        dtype,
        "--max-model-len",
        "2048",
        "--max-num-seqs",
@ -338,6 +347,7 @@ def _compare_tp(
        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])

    specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
+    testing_ray_compiled_graph = False
    if distributed_backend == "ray" and (vllm_major_version == "1"
                                         or specific_case):
        # For V1, test Ray Compiled Graph for all the tests
@ -351,6 +361,7 @@ def _compare_tp(
        # Temporary. Currently when zeromq + SPMD is used, it does not properly
        # terminate because of a Ray Compiled Graph issue.
        common_args.append("--disable-frontend-multiprocessing")
+        testing_ray_compiled_graph = True
    elif distributed_backend == "mp":
        # Both V0/V1 of multiprocessing executor support PP
        pp_env = {
@ -394,7 +405,6 @@ def _compare_tp(
                             tp_env,
                             method=method)
    except Exception:
-        testing_ray_compiled_graph = pp_env is not None
        if testing_ray_compiled_graph and vllm_major_version == "0":
            # Ray Compiled Graph tests are flaky for V0,
            # so we don't want to fail the test
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import json
-from argparse import ArgumentError, ArgumentTypeError
+from argparse import ArgumentError
 from contextlib import nullcontext
 from dataclasses import dataclass, field
 from typing import Annotated, Literal, Optional
@ -12,8 +12,8 @@ import pytest
 from vllm.config import CompilationConfig, config
 from vllm.engine.arg_utils import (EngineArgs, contains_type, get_kwargs,
                                   get_type, get_type_hints, is_not_builtin,
-                                   is_type, literal_to_kwargs, nullable_kvs,
-                                   optional_type, parse_type)
+                                   is_type, literal_to_kwargs, optional_type,
+                                   parse_type)
 from vllm.utils import FlexibleArgumentParser


@ -25,18 +25,10 @@ from vllm.utils import FlexibleArgumentParser
        "foo": 1,
        "bar": 2
    }),
-    (json.loads, "foo=1,bar=2", {
-        "foo": 1,
-        "bar": 2
-    }),
 ])
 def test_parse_type(type, value, expected):
    parse_type_func = parse_type(type)
-    context = nullcontext()
-    if value == "foo=1,bar=2":
-        context = pytest.warns(DeprecationWarning)
-    with context:
-        assert parse_type_func(value) == expected
+    assert parse_type_func(value) == expected


 def test_optional_type():
@ -203,34 +195,6 @@ def test_get_kwargs():
    assert kwargs["from_cli_config2"]["type"]('{"field": 2}').field == 4


-@pytest.mark.parametrize(("arg", "expected"), [
-    (None, dict()),
-    ("image=16", {
-        "image": 16
-    }),
-    ("image=16,video=2", {
-        "image": 16,
-        "video": 2
-    }),
-    ("Image=16, Video=2", {
-        "image": 16,
-        "video": 2
-    }),
-])
-def test_limit_mm_per_prompt_parser(arg, expected):
-    """This functionality is deprecated and will be removed in the future.
-    This argument should be passed as JSON string instead.
-    
-    TODO: Remove with nullable_kvs."""
-    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
-    if arg is None:
-        args = parser.parse_args([])
-    else:
-        args = parser.parse_args(["--limit-mm-per-prompt", arg])
-
-    assert args.limit_mm_per_prompt == expected
-
-
@pytest.mark.parametrize(
    ("arg", "expected"),
    [
@ -326,18 +290,6 @@ def test_prefix_cache_default():
    assert not engine_args.enable_prefix_caching


-@pytest.mark.parametrize(
-    ("arg"),
-    [
-        "image",  # Missing =
-        "image=4,image=5",  # Conflicting values
-        "image=video=4"  # Too many = in tokenized arg
-    ])
-def test_bad_nullable_kvs(arg):
-    with pytest.raises(ArgumentTypeError):
-        nullable_kvs(arg)
-
-
 # yapf: disable
@pytest.mark.parametrize(("arg", "expected", "option"), [
    (None, None, "mm-processor-kwargs"),
--- a/tests/entrypoints/llm/test_accuracy.py
+++ b/tests/entrypoints/llm/test_accuracy.py
@ -71,9 +71,8 @@ def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
            # Limit compilation time for TPU V1

            if model == "google/gemma-3-1b-it":
-                pytest.skip(
-                    "Temporarily disabled due to test failures"
-                    "(timeout or accuracy mismatch). Re-enable once fixed.")
+                # TPU + google/gemma-3-1b-it + xet doesn't work well.
+                m.setenv("HF_HUB_DISABLE_XET", "1")

            more_args = "max_model_len=2048,max_num_seqs=64"

--- a/tests/entrypoints/openai/test_classification.py
+++ b/tests/entrypoints/openai/test_classification.py
@ -176,4 +176,8 @@ async def test_invocations(server: RemoteOpenAIServer):
    invocation_output = invocation_response.json()

    assert classification_output.keys() == invocation_output.keys()
-    assert classification_output["data"] == invocation_output["data"]
+    for classification_data, invocation_data in zip(
+            classification_output["data"], invocation_output["data"]):
+        assert classification_data.keys() == invocation_data.keys()
+        assert classification_data["probs"] == pytest.approx(
+            invocation_data["probs"], rel=0.01)
--- a/tests/entrypoints/openai/test_cli_args.py
+++ b/tests/entrypoints/openai/test_cli_args.py
@ -153,3 +153,13 @@ def test_chat_template_validation_for_sad_paths(serve_parser):
    args = serve_parser.parse_args(args=["--chat-template", "does/not/exist"])
    with pytest.raises(ValueError):
        validate_parsed_serve_args(args)
+
+
+@pytest.mark.parametrize(("cli_args", "expected_middleware"), [
+    (["--middleware", "middleware1", "--middleware", "middleware2"],
+     ["middleware1", "middleware2"]),
+    ([], [])])
+def test_middleware(serve_parser, cli_args, expected_middleware):
+    """Check repeated --middleware flags parse to an ordered list, or []."""
+    parsed = serve_parser.parse_args(args=cli_args)
+    assert parsed.middleware == expected_middleware
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@ -14,6 +14,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer

 from ...models.language.pooling.embed_utils import (
    run_embedding_correctness_test)
+from ...models.utils import check_embeddings_close
 from ...utils import RemoteOpenAIServer

 MODEL_NAME = "intfloat/multilingual-e5-small"
@ -321,7 +322,13 @@ async def test_invocations(server: RemoteOpenAIServer,
    invocation_output = invocation_response.json()

    assert completion_output.keys() == invocation_output.keys()
-    assert completion_output["data"] == invocation_output["data"]
+    for completion_data, invocation_data in zip(completion_output["data"],
+                                                invocation_output["data"]):
+        assert completion_data.keys() == invocation_data.keys()
+        check_embeddings_close(embeddings_0_lst=[completion_data["embedding"]],
+                               embeddings_1_lst=[invocation_data["embedding"]],
+                               name_0="completion",
+                               name_1="invocation")


@pytest.mark.asyncio
@ -355,4 +362,10 @@ async def test_invocations_conversation(server: RemoteOpenAIServer):
    invocation_output = invocation_response.json()

    assert chat_output.keys() == invocation_output.keys()
-    assert chat_output["data"] == invocation_output["data"]
+    for chat_data, invocation_data in zip(chat_output["data"],
+                                          invocation_output["data"]):
+        assert chat_data.keys() == invocation_data.keys()
+        check_embeddings_close(embeddings_0_lst=[chat_data["embedding"]],
+                               embeddings_1_lst=[invocation_data["embedding"]],
+                               name_0="chat",
+                               name_1="invocation")
--- a/tests/entrypoints/openai/test_openai_schema.py
+++ b/tests/entrypoints/openai/test_openai_schema.py
@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
 from typing import Final

 import pytest
@ -29,7 +30,7 @@ def server():
        "--enforce-eager",
        "--trust-remote-code",
        "--limit-mm-per-prompt",
-        f"image={MAXIMUM_IMAGES}",
+        json.dumps({"image": MAXIMUM_IMAGES}),
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
--- a/tests/entrypoints/openai/test_pooling.py
+++ b/tests/entrypoints/openai/test_pooling.py
@ -281,7 +281,13 @@ async def test_invocations(server: RemoteOpenAIServer):
    invocation_output = invocation_response.json()

    assert completion_output.keys() == invocation_output.keys()
-    assert completion_output["data"] == invocation_output["data"]
+    for completion_data, invocation_data in zip(completion_output["data"],
+                                                invocation_output["data"]):
+        assert completion_data.keys() == invocation_data.keys()
+        check_embeddings_close(embeddings_0_lst=completion_data["data"],
+                               embeddings_1_lst=invocation_data["data"],
+                               name_0="completion",
+                               name_1="invocation")


@pytest.mark.asyncio
@ -314,4 +320,10 @@ async def test_invocations_conversation(server: RemoteOpenAIServer):
    invocation_output = invocation_response.json()

    assert chat_output.keys() == invocation_output.keys()
-    assert chat_output["data"] == invocation_output["data"]
+    for chat_data, invocation_data in zip(chat_output["data"],
+                                          invocation_output["data"]):
+        assert chat_data.keys() == invocation_data.keys()
+        check_embeddings_close(embeddings_0_lst=chat_data["data"],
+                               embeddings_1_lst=invocation_data["data"],
+                               name_0="chat",
+                               name_1="invocation")
--- a/Show More
+++ b/Show More