updated

Signed-off-by: Robert Shaw <robshaw@redhat.com>
Upgrade FlashInfer to v0.3.0 (#24086)
2025-09-08 21:06:29 +00:00 · 2025-09-04 09:49:20 -07:00 · 2025-09-04 16:06:51 +00:00 · 2025-09-04 08:52:17 -07:00 · 2025-09-04 22:55:23 +08:00 · 2025-09-04 09:25:40 -04:00
744 changed files with 28700 additions and 10804 deletions
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@ -5,11 +5,11 @@ import os
 import sys
 import zipfile

-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
-# Note that we have 400 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/3792 .
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Note that we have 800 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))


 def print_top_10_largest_files(zip_file):
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@ -141,7 +141,7 @@ When run, benchmark script generates results under `benchmark/results` folder, a
 `compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.  
 If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.

-Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output lenght, max concurrency and qps.
+Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
 `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`

 |   | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps  | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio        |
--- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
@ -218,7 +218,7 @@ if __name__ == "__main__":
        "--xaxis",
        type=str,
        default="# of max concurrency.",
-        help="column name to use as X Axis in comparision graph",
+        help="column name to use as X Axis in comparison graph",
    )
    args = parser.parse_args()

--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -1,21 +1,24 @@
 steps:
-  # aarch64 + CUDA builds
-  - label: "Build arm64 wheel - CUDA 12.8"
-    id: build-wheel-arm64-cuda-12-8
+  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+  - label: "Build arm64 wheel - CUDA 12.9"
+    id: build-wheel-arm64-cuda-12-9
    agents:
      queue: arm64_cpu_queue_postmerge
    commands:
      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

-  # x86 + CUDA builds
+  - block: "Build CUDA 12.8 wheel"
+    key: block-build-cu128-wheel
+
  - label: "Build wheel - CUDA 12.8"
+    depends_on: block-build-cu128-wheel
    id: build-wheel-cuda-12-8
    agents:
      queue: cpu_queue_postmerge
@ -44,44 +47,63 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"

-  # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
-  # However, this block can be uncommented to save some compute hours.
-  # - block: "Build CUDA 11.8 wheel"
-  #   key: block-build-cu118-wheel
-
-  - label: "Build wheel - CUDA 11.8"
-    # depends_on: block-build-cu118-wheel
-    id: build-wheel-cuda-11-8
+  # x86 + CUDA builds
+  - label: "Build wheel - CUDA 12.9"
+    depends_on: ~
+    id: build-wheel-cuda-12-9
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

-  - block: "Build release image"
+  - label: "Build release image (x86)"
    depends_on: ~
-    key: block-release-image-build
-
-  - label: "Build release image"
-    depends_on: block-release-image-build
-    id: build-release-image
+    id: build-release-image-x86
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+      # re-tag to default image tag and push, just in case arm64 build fails
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

+  # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+  - label: "Build release image (arm64)"
+    depends_on: ~
+    id: build-release-image-arm64
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+
+  # Add job to create multi-arch manifest
+  - label: "Create multi-arch manifest"
+    depends_on:
+      - build-release-image-x86
+      - build-release-image-arm64
+    id: create-multi-arch-manifest
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
+      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
  - label: "Annotate release workflow"
    depends_on:
-      - build-release-image
+      - create-multi-arch-manifest
      - build-wheel-cuda-12-8
      - build-wheel-cuda-12-6
-      - build-wheel-cuda-11-8
+      - build-wheel-cuda-12-9
    id: annotate-release-workflow
    agents:
      queue: cpu_queue_postmerge
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@ -164,7 +164,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
  --ignore=entrypoints/llm/test_chat.py \
  --ignore=entrypoints/llm/test_accuracy.py \
  --ignore=entrypoints/llm/test_init.py \
-  --ignore=entrypoints/llm/test_generate_multiple_loras.py \
  --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi

--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -25,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2

 function cpu_tests() {
  set -e
@ -49,23 +49,23 @@ function cpu_tests() {
  # Run kernel tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
-    pytest -v -s tests/kernels/test_onednn.py"
+    pytest -x -v -s tests/kernels/test_onednn.py"

  # Run basic model test
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    # Note: disable until supports V1
-    # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model

    # Note: disable Bart until supports V1
-    pytest -v -s tests/models/language/generation -m cpu_model \
+    pytest -x -v -s tests/models/language/generation -m cpu_model \
                --ignore=tests/models/language/generation/test_bart.py
-    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
+    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
                --ignore=tests/models/language/generation/test_bart.py

-    pytest -v -s tests/models/language/pooling -m cpu_model
-    pytest -v -s tests/models/multimodal/generation \
+    pytest -x -v -s tests/models/language/pooling -m cpu_model
+    pytest -x -v -s tests/models/multimodal/generation \
                --ignore=tests/models/multimodal/generation/test_mllama.py \
                --ignore=tests/models/multimodal/generation/test_pixtral.py \
                -m cpu_model"
@ -73,33 +73,49 @@ function cpu_tests() {
  # Run compressed-tensor test
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
-    pytest -s -v \
+    pytest -x -s -v \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"

  # Note: disable it until supports V1
  # Run AWQ test
  # docker exec cpu-test-"$NUMA_NODE" bash -c "
  #   set -e
-  #   VLLM_USE_V1=0 pytest -s -v \
+  #   VLLM_USE_V1=0 pytest -x -s -v \
  #   tests/quantization/test_ipex_quant.py"

  # Run multi-lora tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
-    pytest -s -v \
+    pytest -x -s -v \
    tests/lora/test_qwen2vl.py"

-  # online serving
+  # online serving: tp+pp
  docker exec cpu-test-"$NUMA_NODE" bash -c '
    set -e
    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
+    server_pid=$!
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
    vllm bench serve \
      --backend vllm \
      --dataset-name random \
      --model meta-llama/Llama-3.2-3B-Instruct \
      --num-prompts 20 \
-      --endpoint /v1/completions'
+      --endpoint /v1/completions
+    kill -s SIGTERM $server_pid &'
+
+  # online serving: tp+dp
+  docker exec cpu-test-"$NUMA_NODE" bash -c '
+    set -e
+    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
+    server_pid=$!
+    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+    vllm bench serve \
+      --backend vllm \
+      --dataset-name random \
+      --model meta-llama/Llama-3.2-3B-Instruct \
+      --num-prompts 20 \
+      --endpoint /v1/completions
+    kill -s SIGTERM $server_pid &'
 }

 # All of CPU tests are expected to be finished less than 40 mins.
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
+    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
    && python3 -m pip install --progress-bar off hf-transfer
 echo "--- Python dependencies installed ---"
 export VLLM_USE_V1=1
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
+    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
    && python3 -m pip install --progress-bar off hf-transfer
 echo "--- Python dependencies installed ---"
 export VLLM_USE_V1=1
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -30,9 +30,11 @@ docker run \
    bash -c '
    set -e
    echo $ZE_AFFINITY_MASK
-    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
-    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+    VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
    cd tests
    pytest -v -s v1/core
    pytest -v -s v1/engine
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@ -58,14 +58,15 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"

-if [[ $normal_wheel == *"cu118"* ]]; then
-    # if $normal_wheel matches cu118, do not upload the index.html
-    echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
+if [[ $normal_wheel == *"cu126"* ]]; then
    # if $normal_wheel matches cu126, do not upload the index.html
    echo "Skipping index files for cu126 wheels"
+elif [[ $normal_wheel == *"cu128"* ]]; then
+    # if $normal_wheel matches cu128, do not upload the index.html
+    echo "Skipping index files for cu128 wheels"
 else
-    # only upload index.html for cu128 wheels (default wheels)
+    # only upload index.html for cu129 wheels (the default wheels), as
+    # they are available on both x86 and arm64
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 fi
@ -74,14 +75,15 @@ fi
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"

-if [[ $normal_wheel == *"cu118"* ]]; then
-    # if $normal_wheel matches cu118, do not upload the index.html
-    echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
+if [[ $normal_wheel == *"cu126"* ]]; then
    # if $normal_wheel matches cu126, do not upload the index.html
    echo "Skipping index files for cu126 wheels"
+elif [[ $normal_wheel == *"cu128"* ]]; then
+    # if $normal_wheel matches cu128, do not upload the index.html
+    echo "Skipping index files for cu128 wheels"
 else
-    # only upload index.html for cu128 wheels (default wheels)
+    # only upload index.html for cu129 wheels (the default wheels), as
+    # they are available on both x86 and arm64
    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 fi

--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -109,10 +109,9 @@ steps:
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

 - label: Entrypoints Test (API Server) # 40min
@ -234,7 +233,26 @@ steps:
  # OOM in the CI unless we run this separately
  - pytest -v -s tokenization

- label: V1 Test
+- label: V1 Test e2e + engine
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    # TODO: accuracy does not match on H100, whether or not
+    # VLLM_USE_FLASHINFER_SAMPLER is set.
+    - pytest -v -s v1/e2e
+    - pytest -v -s v1/engine
+
+- label: V1 Test entrypoints
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    - pytest -v -s v1/entrypoints
+
+- label: V1 Test others
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
    - vllm/
@ -242,8 +260,6 @@ steps:
  commands:
    # split the test to avoid interference
    - pytest -v -s v1/core
-    - pytest -v -s v1/engine
-    - pytest -v -s v1/entrypoints
    - pytest -v -s v1/executor
    - pytest -v -s v1/sample
    - pytest -v -s v1/logits_processors
@ -256,9 +272,6 @@ steps:
    - pytest -v -s v1/test_utils.py
    - pytest -v -s v1/test_oracle.py
    - pytest -v -s v1/test_metrics_reader.py
-    # TODO: accuracy does not match, whether setting
-    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-    - pytest -v -s v1/e2e
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@ -312,7 +325,7 @@ steps:
  source_file_dependencies:
  - vllm/lora
  - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py
  parallelism: 4

 - label: PyTorch Compilation Unit Tests
@ -390,6 +403,7 @@ steps:
  - csrc/moe/
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
+  - vllm/distributed/device_communicators/
  commands:
    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2
@ -448,8 +462,8 @@ steps:
  - tests/quantization
  commands:
  # temporary install here since we need nightly, will move to requirements/test.in
-  # after torchao 0.12 release
-  - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+  # after torchao 0.12 release; until then, pin a known-working torchao nightly here
+  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

 - label: LM Eval Small Models # 53min
@ -552,8 +566,7 @@ steps:
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
-    - pytest -v -s models/multimodal/processing/test_tensor_schema.py
+    - pytest -v -s models/multimodal/processing

 - label: Multi-Modal Models Test (Standard)
  mirror_hardwares: [amdexperimental]
@ -653,7 +666,9 @@ steps:
    # Quantization
    - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
    - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+    # - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
    - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
@ -661,6 +676,7 @@ steps:
    - pytest -v -s tests/compile/test_fusion_all_reduce.py
    - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
+    # - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py

 #####  1 GPU test  #####
 #####  multi gpus test  #####
@ -753,6 +769,11 @@ steps:
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
  # end platform plugin tests
+  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+  - pip install -e ./plugins/prithvi_io_processor_plugin
+  - pytest -v -s plugins_tests/test_io_processor_plugins.py
+  - pip uninstall prithvi_io_processor_plugin -y 
+  # end io_processor plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
@ -789,13 +810,14 @@ steps:
    # requires multi-GPU testing for validation.
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
-    - pytest -v -s -x lora/test_multi_loras_with_tp.py
+    - pytest -v -s -x lora/test_llm_with_multi_loras.py


 - label: Weight Loading Multiple GPU Test  # 33min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_gpus: 2 
+  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -79,4 +79,10 @@ mkdocs.yaml @hmellor
 /vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
 /vllm/attention/ops/triton_unified_attention.py @tdoublep

+# ROCm related: specify owner with write access to notify AMD folks for careful code review
+/docker/Dockerfile.rocm* @gshtras
+/vllm/v1/attention/backends/rocm*.py @gshtras
+/vllm/v1/attention/backends/mla/rocm*.py @gshtras
+/vllm/attention/ops/rocm*.py @gshtras
+/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras

--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -7,8 +7,6 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT

 ## Test Result

-## (Optional) Documentation Update
-
 ---
 <details>
 <summary> Essential Elements of an Effective PR Description Checklist </summary>
@ -17,6 +15,7 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT
 - [ ] The test plan, such as providing test command.
 - [ ] The test results, such as pasting the results comparison before and after, or e2e results
 - [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
+- [ ] (Optional) Release notes update. If your change is user-facing, please update the release notes draft in the [Google Doc](https://docs.google.com/document/d/1YyVqrgX4gHTtrstbq8oWUImOyPCKSGnJ7xtTpmXzlRs/edit?tab=t.0).
 </details>

 **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
--- a/.github/scale-config.yml
+++ b/.github/scale-config.yml
@ -0,0 +1,21 @@
+# scale-config.yml:
+#   Defines which instance types are available for GHA auto-scaled
+#   runners. Runners listed here will be available as self-hosted
+#   runners; configuration is pulled directly from the main branch.
+# runner_types:
+#   runner_label:
+#     instance_type: m4.large
+#     os: linux
+#     # min_available defaults to the global cfg in the ALI Terraform
+#     min_available: undefined
+#     # when max_available is not defined, no maximum number of runners is enforced
+#     max_available: undefined
+#     disk_size: 50
+#     is_ephemeral: true
+
+runner_types:
+  linux.2xlarge:
+    disk_size: 150
+    instance_type: c5.2xlarge
+    is_ephemeral: true
+    os: linux
--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@ -0,0 +1,309 @@
+name: Label issues based on keywords
+on:
+  issues:
+    types: [opened, edited, reopened]
+permissions:
+  issues: write          # needed so the workflow can add labels
+  contents: read
+concurrency:
+  group: issue-labeler-${{ github.event.issue.number }}
+  cancel-in-progress: true
+jobs:
+  add-labels:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Label issues based on keywords
+        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea  # v7.0.1
+        with:
+          script: |
+            // Configuration: Add new labels and keywords here
+            const labelConfig = {
+              rocm: {
+                // Keyword search - matches whole words only (with word boundaries)
+                keywords: [
+                  {
+                    term: "composable kernel",
+                    searchIn: "both"
+                  },
+                  {
+                    term: "rccl",
+                    searchIn: "body"  // only search in body
+                  },
+                  {
+                    term: "migraphx",
+                    searchIn: "title"  // only search in title
+                  },
+                  {
+                    term: "hipgraph",
+                    searchIn: "both"
+                  },
+                  {
+                    term: "ROCm System Management Interface",
+                    searchIn: "body"
+                  },
+                ],
+                
+                // Substring search - matches anywhere in text (partial matches)
+                substrings: [
+                  {
+                    term: "VLLM_ROCM_",
+                    searchIn: "both"
+                  },
+                  {
+                    term: "aiter",
+                    searchIn: "title"
+                  },
+                  {
+                    term: "rocm",
+                    searchIn: "title"
+                  },
+                  {
+                    term: "amd",
+                    searchIn: "title"
+                  },
+                  {
+                    term: "hip-",
+                    searchIn: "both"
+                  },
+                  {
+                    term: "gfx",
+                    searchIn: "both"
+                  },
+                  {
+                    term: "cdna",
+                    searchIn: "both"
+                  },
+                  {
+                    term: "rdna",
+                    searchIn: "both"
+                  },
+                  {
+                    term: "torch_hip",
+                    searchIn: "body"  // only in body
+                  },
+                  {
+                    term: "_hip",
+                    searchIn: "both"
+                  },
+                  {
+                    term: "hip_",
+                    searchIn: "both"
+                  },
+                  
+                  // ROCm tools and libraries
+                  {
+                    term: "hipify",
+                    searchIn: "both"
+                  },
+                ],
+                
+                // Regex patterns - for complex pattern matching
+                regexPatterns: [
+                  {
+                    pattern: "\\bmi\\d{3}[a-z]*\\b",
+                    description: "AMD GPU names (mi + 3 digits + optional letters)",
+                    flags: "gi",
+                    searchIn: "both"  // "title", "body", or "both"
+                  }
+                ],
+              },
+            };
+            
+            // Helper function to create regex based on search type
+            function createSearchRegex(term, type) {
+              // Escape special regex characters in the term
+              const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+              
+              switch (type) {
+                case 'keyword':
+                  // Word boundary search - matches whole words only
+                  return new RegExp(`\\b${escapedTerm}\\b`, "gi");
+                case 'substring':
+                  // Substring search - matches anywhere in the text
+                  return new RegExp(escapedTerm, "gi");
+                default:
+                  throw new Error(`Unknown search type: ${type}`);
+              }
+            }
+            
+            // Helper function to find matching terms in text with line information
+            function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
+              const matches = [];
+              const lines = text.split('\n');
+              
+              for (const termConfig of searchTerms) {
+                let regex;
+                let term, searchIn, pattern, description, flags;
+                
+                // Handle different input formats (string or object)
+                if (typeof termConfig === 'string') {
+                  term = termConfig;
+                  searchIn = 'both'; // default
+                } else {
+                  term = termConfig.term;
+                  searchIn = termConfig.searchIn || 'both';
+                  pattern = termConfig.pattern;
+                  description = termConfig.description;
+                  flags = termConfig.flags;
+                }
+                
+                // Skip if this term shouldn't be searched in the current location
+                if (searchIn !== 'both' && searchIn !== searchLocation) {
+                  continue;
+                }
+                
+                // Create appropriate regex
+                if (searchType === 'regex') {
+                  regex = new RegExp(pattern, flags || "gi");
+                } else {
+                  regex = createSearchRegex(term, searchType);
+                }
+                
+                const termMatches = [];
+                
+                // Check each line for matches
+                lines.forEach((line, lineIndex) => {
+                  const lineMatches = line.match(regex);
+                  if (lineMatches) {
+                    lineMatches.forEach(match => {
+                      termMatches.push({
+                        match: match,
+                        lineNumber: lineIndex + 1,
+                        lineContent: line.trim(),
+                        searchType: searchType,
+                        searchLocation: searchLocation,
+                        originalTerm: term || pattern,
+                        description: description,
+                        // Show context around the match in the line
+                        context: line.length > 100 ? 
+                          line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), 
+                                       line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' 
+                          : line.trim()
+                      });
+                    });
+                  }
+                });
+                
+                if (termMatches.length > 0) {
+                  matches.push({
+                    term: term || (description || pattern),
+                    searchType: searchType,
+                    searchLocation: searchLocation,
+                    searchIn: searchIn,
+                    pattern: pattern,
+                    matches: termMatches,
+                    count: termMatches.length
+                  });
+                }
+              }
+              
+              return matches;
+            }
+            
+            // Helper function to decide whether a label applies and add it if not already present
+            async function processLabel(labelName, config) {
+              const body = context.payload.issue.body || "";
+              const title = context.payload.issue.title || "";
+              
+              core.notice(`Processing label: ${labelName}`);
+              core.notice(`Issue Title: "${title}"`);
+              core.notice(`Issue Body length: ${body.length} characters`);
+              
+              let shouldAddLabel = false;
+              let allMatches = [];
+              let reason = '';
+              
+              const keywords = config.keywords || [];
+              const substrings = config.substrings || [];
+              const regexPatterns = config.regexPatterns || [];
+              
+              core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
+              
+              // Search in title
+              if (title.trim()) {
+                core.notice(`Searching in title: "${title}"`);
+                
+                const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
+                const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
+                const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
+                
+                allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
+              }
+              
+              // Search in body
+              if (body.trim()) {
+                core.notice(`Searching in body (${body.length} characters)`);
+                
+                const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
+                const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
+                const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
+                
+                allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
+              }
+              
+              if (allMatches.length > 0) {
+                core.notice(`Found ${allMatches.length} matching term(s):`);
+                
+                for (const termMatch of allMatches) {
+                  const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
+                  const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
+                  
+                  if (termMatch.searchType === 'regex') {
+                    core.notice(`  📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
+                  } else {
+                    core.notice(`  📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
+                  }
+                  
+                  // Show details for each match
+                  termMatch.matches.forEach((match, index) => {
+                    core.notice(`    ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
+                    if (match.description) {
+                      core.notice(`       Description: ${match.description}`);
+                    }
+                    core.notice(`       Context: ${match.context}`);
+                    if (match.lineContent !== match.context) {
+                      core.notice(`       Full line: ${match.lineContent}`);
+                    }
+                  });
+                }
+                
+                shouldAddLabel = true;
+                const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
+                const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
+                const bodyMatches = allMatches.filter(t => t.searchLocation === 'body').reduce((sum, t) => sum + t.count, 0);
+                const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
+                const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
+                const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
+                
+                reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
+              }
+              
+              core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
+              core.notice(`Reason: ${reason || 'No matching terms found'}`);
+              
+              if (shouldAddLabel) {
+                const existingLabels = context.payload.issue.labels.map(l => l.name);
+                if (!existingLabels.includes(labelName)) {
+                  await github.rest.issues.addLabels({
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    issue_number: context.issue.number,
+                    labels: [labelName],
+                  });
+                  core.notice(`Label "${labelName}" added. ${reason}`);
+                  return true;
+                }
+                core.notice(`Label "${labelName}" already present.`);
+                return false;
+              }
+              
+              core.notice(`No matching terms found for label "${labelName}".`);
+              return false;
+            }
+            
+            // Process all configured labels
+            const processLabels = Object.entries(labelConfig)
+              .map(([labelName, config]) => processLabel(labelName, config));
+            const labelsAdded = await Promise.all(processLabels);
+            const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
+            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -21,7 +21,7 @@ repos:
  - id: ruff-format
    files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
-  rev: v1.34.0
+  rev: v1.35.5
  hooks:
  - id: typos
 - repo: https://github.com/PyCQA/isort
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -30,7 +30,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
 #
-set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12", "3.13")
+set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")

 # Supported AMD GPU architectures.
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
@ -45,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")

 #
 # Try to find python package with an executable that exactly matches
@ -541,6 +541,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+      "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
      "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@ -559,6 +560,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+      "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
      "csrc/quantization/fp4/nvfp4_experts_quant.cu"
      "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
      "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
@ -817,7 +819,9 @@ set(VLLM_MOE_EXT_SRC
  "csrc/moe/topk_softmax_kernels.cu")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
-  list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
+  list(APPEND VLLM_MOE_EXT_SRC
+    "csrc/moe/moe_wna16.cu"
+    "csrc/moe/grouped_topk_kernels.cu")
 endif()

 if(VLLM_GPU_LANG STREQUAL "CUDA")
--- a/README.md
+++ b/README.md
@ -18,7 +18,9 @@ Easy, fast, and cheap LLM serving for everyone

 *Latest News* 🔥

+- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
 - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
+- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
 - [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
 - [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
--- a/SECURITY.md
+++ b/SECURITY.md
@ -42,4 +42,9 @@ For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we ma

 * If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis.

+* Organizations and vendors who either ship or use vLLM are eligible to join the prenotification group if they meet at least one of the following qualifications:
+    * Substantial internal deployment leveraging the upstream vLLM project.
+    * Established internal security teams and comprehensive compliance measures.
+    * Active and consistent contributions to the upstream vLLM project.
+
 * We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included.
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -110,7 +110,12 @@ become available.

 🚧: to be supported

-**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
+**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`.
+For a local `dataset-path`, please set `hf-name` to its Hugging Face ID, for example:
+
+```bash
+--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
+```

 ## 🚀 Example - Online Benchmark

@ -749,7 +754,7 @@ vllm serve Qwen/Qwen2.5-VL-3B-Instruct \

 Benchmark. It is recommended to use the flag `--ignore-eos` to simulate real responses. You can set the size of the output via the arg `random-output-len`.

-Ex.1: Fixed number of items and a single image resolutionm, enforcing generation of approx 40 tokens:
+Ex.1: Fixed number of items and a single image resolution, enforcing generation of approx 40 tokens:

 ```bash
 vllm bench serve \
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@ -31,6 +31,12 @@ cd vllm

 You must set the following variables at the top of the script before execution.

+   Note: You can also override the default values below via environment variables when running the script.
+
+```bash
+MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh
+```
+
 | Variable | Description | Example Value |
 | --- | --- | --- |
 | `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@ -5,25 +5,41 @@

 TAG=$(date +"%Y_%m_%d_%H_%M")
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-BASE="$SCRIPT_DIR/../../.."
-MODEL="meta-llama/Llama-3.1-8B-Instruct"
-SYSTEM="TPU"
-TP=1
-DOWNLOAD_DIR=""
-INPUT_LEN=4000
-OUTPUT_LEN=16
-MAX_MODEL_LEN=4096
-MIN_CACHE_HIT_PCT=0
-MAX_LATENCY_ALLOWED_MS=100000000000
-NUM_SEQS_LIST="128 256"
-NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
+VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
+BASE=${BASE:-"$SCRIPT_DIR/../../.."}
+MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
+SYSTEM=${SYSTEM:-"TPU"}
+TP=${TP:-1}
+DOWNLOAD_DIR=${DOWNLOAD_DIR:-""}
+INPUT_LEN=${INPUT_LEN:-4000}
+OUTPUT_LEN=${OUTPUT_LEN:-16}
+MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
+MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
+MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
+NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
+NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}

 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
 PROFILE_PATH="$LOG_FOLDER/profile"

-echo "result file: $RESULT"
-echo "model: $MODEL"
+echo "====================== AUTO TUNE PARAMETERS ===================="
+echo "SCRIPT_DIR=$SCRIPT_DIR"
+echo "BASE=$BASE"
+echo "MODEL=$MODEL"
+echo "SYSTEM=$SYSTEM"
+echo "TP=$TP"
+echo "DOWNLOAD_DIR=$DOWNLOAD_DIR"
+echo "INPUT_LEN=$INPUT_LEN"
+echo "OUTPUT_LEN=$OUTPUT_LEN"
+echo "MAX_MODEL_LEN=$MAX_MODEL_LEN"
+echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT"
+echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS"
+echo "NUM_SEQS_LIST=$NUM_SEQS_LIST"
+echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST"
+echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
+echo "RESULT_FILE=$RESULT"
+echo "====================== AUTO TUNE PARAMETERS ===================="

 rm -rf $LOG_FOLDER
 rm -rf $PROFILE_PATH
@ -213,7 +229,7 @@ run_benchmark() {

    pkill -if vllm
    sleep 10
-    printf '=%.0s' $(seq 1 20)
+    echo "===================="
    return 0
 }

--- a/benchmarks/benchmark_block_pool.py
+++ b/benchmarks/benchmark_block_pool.py
@ -57,7 +57,7 @@ def invoke_main() -> None:
        "--num-iteration",
        type=int,
        default=1000,
-        help="Number of iterations to run to stablize final data readings",
+        help="Number of iterations to run to stabilize final data readings",
    )
    parser.add_argument(
        "--allocate-blocks",
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@ -403,7 +403,7 @@ class RandomDataset(BenchmarkDataset):
            # [6880, 6881] -> ['Ġcalls', 'here'] ->
            # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
            # To avoid uncontrolled change of the prompt length,
-            # the encoded sequence is truncated before being decode again.
+            # the encoded sequence is truncated before being decoded again.
            total_input_len = prefix_len + int(input_lens[i])
            re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
                :total_input_len
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@ -77,7 +77,7 @@ def invoke_main() -> None:
        "--num-iteration",
        type=int,
        default=100,
-        help="Number of iterations to run to stablize final data readings",
+        help="Number of iterations to run to stabilize final data readings",
    )
    parser.add_argument(
        "--num-req", type=int, default=128, help="Number of requests in the batch"
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@ -1104,7 +1104,7 @@ def create_argument_parser():
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
-        help="Comma-separated list of selected metrics to report percentils. "
+        help="Comma-separated list of selected metrics to report percentiles. "
        "This argument specifies the metrics to report percentiles. "
        'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
        'Default value is "ttft,tpot,itl".',
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@ -998,7 +998,7 @@ def create_argument_parser():
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
-        help="Comma-separated list of selected metrics to report percentils. "
+        help="Comma-separated list of selected metrics to report percentiles. "
        "This argument specifies the metrics to report percentiles. "
        'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
        'Default value is "ttft,tpot,itl".',
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@ -96,7 +96,6 @@ def run_vllm(
        end = time.perf_counter()
    else:
        assert lora_requests is None, "BeamSearch API does not support LoRA"
-        prompts = [request.prompt for request in requests]
        # output_len should be the same for all requests.
        output_len = requests[0].expected_output_len
        for request in requests:
@ -720,7 +719,7 @@ def create_argument_parser():
        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
    )

-    # hf dtaset
+    # hf dataset
    parser.add_argument(
        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
    )
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@ -62,7 +62,7 @@ benchmark() {
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &


  CUDA_VISIBLE_DEVICES=1 python3 \
@ -72,7 +72,7 @@ benchmark() {
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

  wait_for_server 8100
  wait_for_server 8200
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@ -69,7 +69,7 @@ launch_disagg_prefill() {
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

  CUDA_VISIBLE_DEVICES=1 python3 \
    -m vllm.entrypoints.openai.api_server \
@ -78,7 +78,7 @@ launch_disagg_prefill() {
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

  wait_for_server 8100
  wait_for_server 8200
--- a/benchmarks/kernels/bench_block_fp8_gemm.py
+++ b/benchmarks/kernels/bench_block_fp8_gemm.py
@ -0,0 +1,114 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    w8a8_block_fp8_matmul,
+)
+from vllm.platforms import current_platform
+from vllm.triton_utils import triton as vllm_triton
+
+assert current_platform.is_cuda(), (
+    "Only support benchmarking w8a8 block fp8 kernel on CUDA device."
+)
+
+# DeepSeek-V3 weight shapes
+DEEPSEEK_V3_SHAPES = [
+    (512 + 64, 7168),
+    (2112, 7168),
+    ((128 + 64) * 128, 7168),
+    (128 * (128 + 128), 512),
+    (7168, 16384),
+    (7168, 18432),
+    (18432 * 2, 7168),
+    (24576, 1536),
+    (12288, 7168),
+    (4096, 7168),
+    (7168, 2048),
+]
+
+
+def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
+    """Build runner function for w8a8 block fp8 matmul."""
+    factor_for_scale = 1e-2
+
+    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+    # Create random FP8 tensors
+    A_fp32 = (torch.rand(M, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
+    A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+    B_fp32 = (torch.rand(N, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
+    B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+    # Create scales
+    block_n, block_k = block_size[0], block_size[1]
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+
+    As = torch.rand(M, k_tiles, dtype=torch.float32, device=device) * factor_for_scale
+    Bs = (
+        torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device)
+        * factor_for_scale
+    )
+
+    def run():
+        return w8a8_block_fp8_matmul(A, B, As, Bs, block_size, torch.bfloat16)
+
+    return run
+
+
+@vllm_triton.testing.perf_report(
+    vllm_triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
+        x_log=False,
+        line_arg="provider",
+        line_vals=["torch-bf16", "w8a8-block-fp8"],
+        line_names=["torch-bf16", "w8a8-block-fp8"],
+        ylabel="TFLOP/s (larger is better)",
+        plot_name="BF16 vs W8A8 Block FP8 GEMMs",
+        args={},
+    )
+)
+def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)):
+    M = batch_size
+    device = "cuda"
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "torch-bf16":
+        a = torch.randn((M, K), device=device, dtype=torch.bfloat16)
+        b = torch.randn((N, K), device=device, dtype=torch.bfloat16)
+        ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+            lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
+        )
+    else:  # w8a8-block-fp8
+        run_w8a8 = build_w8a8_block_fp8_runner(M, N, K, block_size, device)
+        ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+            lambda: run_w8a8(), quantiles=quantiles
+        )
+
+    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
+
+
+if __name__ == "__main__":
+    block_size = (128, 128)
+
+    for N, K in DEEPSEEK_V3_SHAPES:
+        print(f"\nBenchmarking DeepSeek-V3, N={N} K={K}")
+
+        print(f"TFLOP/s comparison (block_size={block_size}):")
+        benchmark_tflops.run(
+            print_data=True,
+            # show_plots=False,
+            # save_path=f"bench_w8a8_block_fp8_tflops_n{N}_k{K}",
+            N=N,
+            K=K,
+            block_size=block_size,
+        )
+
+    print("\nBenchmark finished!")
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@ -637,7 +637,7 @@ def bench_optype(
    # Clear LoRA optimization hash-maps.
    _LORA_A_PTR_DICT.clear()
    _LORA_B_PTR_DICT.clear()
-    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup
+    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
    for kwargs in kwargs_list:
        op_type.bench_fn()(**kwargs)
    torch.cuda.synchronize()
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -419,8 +419,10 @@ class BenchmarkWorker:
        )
        # NOTE(woosuk): The current naming convention uses w2.shape[2], which
        # is the intermediate size after silu_and_mul.
+        block_n = block_quant_shape[0] if block_quant_shape else None
+        block_k = block_quant_shape[1] if block_quant_shape else None
        op_config = get_moe_configs(
-            num_experts, shard_intermediate_size // 2, dtype_str
+            num_experts, shard_intermediate_size // 2, dtype_str, block_n, block_k
        )
        if op_config is None:
            config = get_default_config(
@ -430,6 +432,7 @@ class BenchmarkWorker:
                hidden_size,
                topk,
                dtype_str,
+                block_quant_shape,
            )
        else:
            config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))]
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@ -141,6 +141,7 @@ def get_weight_shapes(tp_size):
    # cannot TP
    total = [
        (512 + 64, 7168),
+        (2112, 7168),
        ((128 + 64) * 128, 7168),
        (128 * (128 + 128), 512),
        (7168, 16384),
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@ -962,7 +962,7 @@ async def main_mp(

    # At this point all the clients finished,
    # collect results (TTFT, TPOT, etc.) from all the clients.
-    # This needs to happens before calling join on the clients
+    # This needs to happen before calling join on the clients
    # (result_queue should be emptied).
    while not result_queue.empty():
        client_metrics.append(result_queue.get())
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@ -1,6 +1,7 @@
 include(FetchContent)

 set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_EXTENSIONS ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

@ -87,6 +88,7 @@ is_avx512_disabled(AVX512_DISABLED)

 if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
    message(STATUS "Apple Silicon Detected")
+    set(APPLE_SILICON_FOUND TRUE)
    set(ENABLE_NUMA OFF)
    check_sysctl(hw.optional.neon ASIMD_FOUND)
    check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND)
@ -188,7 +190,7 @@ else()
    set(USE_ACL OFF)
 endif()

-if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
+if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
    FetchContent_Declare(
        oneDNN
        GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 57b4e68b9f9d94750b46de8f8dbd2bfcc86edd4f
+          GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
--- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
+++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
@ -64,11 +64,11 @@ struct IsPersistent {
  static const bool value = v;
 };

-template <typename T, bool IsPaged128, typename PersistenceOption = IsPersistent<true>>
+template <typename T, typename TOut, bool IsPaged128, typename PersistenceOption = IsPersistent<true>>
 struct MlaSm100 {
  using Element = T;
  using ElementAcc = float;
-  using ElementOut = T;
+  using ElementOut = TOut;

  using TileShape = Shape<_128, _128, Shape<_512, _64>>;
  using TileShapeH = cute::tuple_element_t<0, TileShape>;
@ -178,7 +178,7 @@ typename T::Fmha::Arguments args_from_options(
  return arguments;
 }

-template <typename Element, bool IsPaged128, typename PersistenceOption>
+template <typename Element, typename ElementOut, bool IsPaged128, typename PersistenceOption>
 void runMla(
    at::Tensor const& out,
    at::Tensor const& q_nope,
@ -190,7 +190,7 @@ void runMla(
    double sm_scale,
    int64_t num_kv_splits,
    cudaStream_t stream) {
-  using MlaSm100Type = MlaSm100<Element, IsPaged128, PersistenceOption>;
+  using MlaSm100Type = MlaSm100<Element, ElementOut, IsPaged128, PersistenceOption>;
  typename MlaSm100Type::Fmha fmha;
  auto arguments = args_from_options<MlaSm100Type>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);

@ -233,13 +233,13 @@ void sm100_cutlass_mla_decode(
  DISPATCH_BOOL(page_size == 128, IsPaged128, [&] {
    DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] {
      if (in_dtype == at::ScalarType::Half) {
-        runMla<cutlass::half_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+        runMla<cutlass::half_t, cutlass::half_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
      } else if (in_dtype == at::ScalarType::BFloat16) {
-        runMla<cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+        runMla<cutlass::bfloat16_t, cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
      } else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
-        runMla<cutlass::float_e4m3_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+        runMla<cutlass::float_e4m3_t, cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
      } else {
        TORCH_CHECK(false, "Unsupported input data type of MLA");
@ -253,7 +253,7 @@ void sm100_cutlass_mla_decode(
 int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) {
  // Workspace size depends on ElementAcc and ElementLSE (same as ElementAcc)
  // which are float, so Element type here doesn't matter.
-  using MlaSm100Type = MlaSm100<cutlass::half_t, true>;
+  using MlaSm100Type = MlaSm100<cutlass::half_t, cutlass::half_t, true>;

  // Get split kv. Requires problem shape and sm_count only.
  typename MlaSm100Type::Fmha::Arguments arguments;
--- a/csrc/cache.h
+++ b/csrc/cache.h
@ -36,6 +36,13 @@ void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
                          const std::string& kv_cache_dtype,
                          torch::Tensor& scale);

+// Variant of concat_and_cache_mla that picks the source rows of kv_c and k_pe
+// through cp_local_token_select_indices before writing them into kv_cache at
+// the slots given by slot_mapping.
+void cp_fused_concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
+                                   torch::Tensor& cp_local_token_select_indices,
+                                   torch::Tensor& kv_cache,
+                                   torch::Tensor& slot_mapping,
+                                   const std::string& kv_cache_dtype,
+                                   torch::Tensor& scale);
+
 // Just for unittest
 void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
                 const double scale, const std::string& kv_cache_dtype);
@ -47,4 +54,12 @@ void gather_and_maybe_dequant_cache(
    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
    int64_t batch_size, const std::string& kv_cache_dtype,
    torch::Tensor const& scale,
-    std::optional<torch::Tensor> seq_starts = std::nullopt);
+    std::optional<torch::Tensor> seq_starts = std::nullopt);
+
+// TODO(hc): cp_gather_cache needs to support a scaled kv-cache in the future.
+void cp_gather_cache(
+    torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
+    torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
+    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
+    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
+    int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@ -1,6 +1,7 @@
 #include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
+#include <c10/cuda/CUDAException.h>

 #include "cuda_utils.h"
 #include "cuda_compat.h"
@ -395,6 +396,51 @@ __global__ void concat_and_cache_mla_kernel(
  copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
 }

+// Copies one selected token's kv_c and k_pe rows into its paged-cache slot.
+// Thread block blockIdx.x handles source row
+// cp_local_token_select_indices[blockIdx.x] and destination slot
+// slot_mapping[blockIdx.x]; its threads stride over the row's elements.
+// When kv_dt is kAuto elements are assigned directly, otherwise each one goes
+// through fp8::scaled_convert with *scale.
+template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
+__global__ void cp_fused_concat_and_cache_mla_kernel(
+    const scalar_t* __restrict__ kv_c,  // [num_full_tokens, kv_lora_rank]
+    const scalar_t* __restrict__ k_pe,  // [num_full_tokens, pe_dim]
+    const int64_t* __restrict__ cp_local_token_select_indices,  // [num_tokens]
+    cache_t* __restrict__ kv_cache,  // [num_blocks, block_size, (kv_lora_rank
+                                     // + pe_dim)]
+    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
+    const int block_stride,                    //
+    const int entry_stride,                    //
+    const int kv_c_stride,                     //
+    const int k_pe_stride,                     //
+    const int kv_lora_rank,                    //
+    const int pe_dim,                          //
+    const int block_size,                      //
+    const float* scale                         //
+) {
+  // Source row to read, and flat destination slot to write, for this block.
+  const int64_t token_idx = cp_local_token_select_indices[blockIdx.x];
+  const int64_t slot_idx = slot_mapping[blockIdx.x];
+  // NOTE: slot_idx can be -1 if the token is padded
+  if (slot_idx < 0) {
+    return;
+  }
+  // Split the flat slot index into (cache block, position inside the block).
+  const int64_t block_idx = slot_idx / block_size;
+  const int64_t block_offset = slot_idx % block_size;
+
+  // Copies `size` elements of row token_idx from src into the cache entry,
+  // starting `offset` elements into that entry. The dst_stride parameter is
+  // accepted but unused: the captured block_stride is what indexes dst.
+  auto copy = [&](const scalar_t* __restrict__ src, cache_t* __restrict__ dst,
+                  int src_stride, int dst_stride, int size, int offset) {
+    for (int i = threadIdx.x; i < size; i += blockDim.x) {
+      const int64_t src_idx = token_idx * src_stride + i;
+      const int64_t dst_idx =
+          block_idx * block_stride + block_offset * entry_stride + i + offset;
+      if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+        dst[dst_idx] = src[src_idx];
+      } else {
+        dst[dst_idx] =
+            fp8::scaled_convert<cache_t, scalar_t, kv_dt>(src[src_idx], *scale);
+      }
+    }
+  };
+
+  // kv_c fills the first kv_lora_rank elements of the entry; k_pe follows it.
+  copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
+  copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
+}
+
 }  // namespace vllm

 // KV_T is the data type of key and value tensors.
@ -508,6 +554,20 @@ void reshape_and_cache_flash(
          kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size,   \
          reinterpret_cast<const float*>(scale.data_ptr()));

+// Launches cp_fused_concat_and_cache_mla_kernel for one type combination.
+// KV_T is the data type of key and value tensors.
+// CACHE_T is the stored data type of kv-cache.
+// KV_DTYPE is the real data type of kv-cache.
+// The expansion site must have grid, block, stream, the tensor arguments and
+// the stride/size locals named below in scope.
+#define CALL_CP_FUSED_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE)     \
+  vllm::cp_fused_concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE>   \
+      <<<grid, block, 0, stream>>>(                                     \
+          reinterpret_cast<KV_T*>(kv_c.data_ptr()),                     \
+          reinterpret_cast<KV_T*>(k_pe.data_ptr()),                     \
+          cp_local_token_select_indices.data_ptr<int64_t>(),            \
+          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),              \
+          slot_mapping.data_ptr<int64_t>(), block_stride, entry_stride, \
+          kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size,   \
+          reinterpret_cast<const float*>(scale.data_ptr()));
+
 void concat_and_cache_mla(
    torch::Tensor& kv_c,          // [num_tokens, kv_lora_rank]
    torch::Tensor& k_pe,          // [num_tokens, pe_dim]
@ -546,6 +606,50 @@ void concat_and_cache_mla(
                             CALL_CONCAT_AND_CACHE_MLA);
 }

+// Note(hc): cp_fused_concat_and_cache_mla fuses the following three kernel
+// calls into one:
+//   k_c_normed.index_select(0, cp_local_token_select_indices)
+//   k_pe.squeeze(1).index_select(0, cp_local_token_select_indices)
+//   concat_and_cache_mla
+// One thread block is launched per entry of slot_mapping.
+// NOTE(review): the kernel indexes cp_local_token_select_indices with the same
+// block index as slot_mapping, and nothing here checks that it has at least
+// slot_mapping.size(0) entries -- confirm callers guarantee this.
+void cp_fused_concat_and_cache_mla(
+    torch::Tensor& kv_c,  // [num_total_tokens, kv_lora_rank]
+    torch::Tensor& k_pe,  // [num_total_tokens, pe_dim]
+    torch::Tensor& cp_local_token_select_indices,  // [num_tokens]
+    torch::Tensor& kv_cache,      // [num_blocks, block_size, (kv_lora_rank +
+                                  // pe_dim)]
+    torch::Tensor& slot_mapping,  // [num_tokens] or [num_actual_tokens]
+    const std::string& kv_cache_dtype, torch::Tensor& scale) {
+  // NOTE(woosuk): In vLLM V1, key.size(0) can be different from
+  // slot_mapping.size(0) because of padding for CUDA graphs.
+  // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
+  // both include padding.
+  // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
+  // since key includes padding for CUDA graphs, while slot_mapping does not.
+  // In this case, slot_mapping.size(0) represents the actual number of tokens
+  // before padding.
+  // For compatibility with both cases, we use slot_mapping.size(0) as the
+  // number of tokens.
+  // NOTE(review): this function has no `key` tensor; the note above looks
+  // carried over from another function and presumably refers to kv_c/k_pe.
+  int num_tokens = slot_mapping.size(0);
+  int kv_lora_rank = kv_c.size(1);
+  int pe_dim = k_pe.size(1);
+  int block_size = kv_cache.size(1);
+
+  // Each cache entry must hold exactly one kv_c row followed by one k_pe row.
+  TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
+
+  int kv_c_stride = kv_c.stride(0);
+  int k_pe_stride = k_pe.stride(0);
+  int block_stride = kv_cache.stride(0);
+  int entry_stride = kv_cache.stride(1);
+
+  // One block per token; at most 512 threads stride over each row.
+  dim3 grid(num_tokens);
+  dim3 block(std::min(kv_lora_rank, 512));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
+                             CALL_CP_FUSED_CONCAT_AND_CACHE_MLA);
+}
+
 namespace vllm {

 template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
@ -779,3 +883,145 @@ void gather_and_maybe_dequant_cache(

  DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype, CALL_GATHER_CACHE);
 }
+
+namespace vllm {
+// Gathers the cached entries of each sequence from the paged src_cache into
+// the contiguous dst tensor. blockIdx.x selects the batch entry; the tokens of
+// that sequence are divided among gridDim.y splits, and the threads of a block
+// stride over the elements of one entry at a time.
+// Note(hc): The cp_gather_cache allows seq_starts to no longer be divisible by
+// block_size.
+template <typename scalar_t>
+__global__ void cp_gather_cache(
+    const scalar_t* __restrict__ src_cache,   // [NUM_BLOCKS, BLOCK_SIZE,
+                                              // ENTRY_SIZE]
+    scalar_t* __restrict__ dst,               // [TOT_TOKENS, ENTRY_SIZE]
+    const int32_t* __restrict__ block_table,  // [BATCH, BLOCK_INDICES]
+    const int32_t* __restrict__ cu_seq_lens,  // [BATCH+1]
+    const int32_t block_size, const int32_t entry_size,
+    const int64_t block_table_stride, const int64_t cache_block_stride,
+    const int64_t cache_entry_stride, const int64_t dst_entry_stride,
+    const int32_t* __restrict__ seq_starts  // Optional: starting offsets per
+                                            // batch
+) {
+  const int64_t bid = blockIdx.x;  // Batch ID
+  const int32_t num_splits = gridDim.y;
+  const int32_t split = blockIdx.y;
+  // Token range [seq_start, seq_end) of this sequence inside dst.
+  const int32_t seq_start = cu_seq_lens[bid];
+  const int32_t seq_end = cu_seq_lens[bid + 1];
+  const int32_t seq_len = seq_end - seq_start;
+  const int32_t tot_slots = seq_len;
+  const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
+
+  // Sequence-relative token range [split_start, split_end) for this split.
+  const int32_t split_start = split * split_slots;
+  const int32_t split_end = min((split + 1) * split_slots, tot_slots);
+
+  const bool is_active_split = (split_start < tot_slots);
+
+  if (!is_active_split) return;
+
+  // Adjust the pointer for the block_table for this batch.
+  // If seq_starts is provided, compute an offset based on it
+  const int32_t batch_offset = bid * block_table_stride;
+  int32_t offset = split_start;
+  if (seq_starts != nullptr) {
+    offset += seq_starts[bid];
+  }
+  // offset_div indexes this sequence's block-table row; offset becomes the
+  // position inside that cache block.
+  int32_t offset_div = offset / block_size;
+  offset = offset % block_size;
+  const int32_t* batch_block_table = block_table + batch_offset;
+
+  // Adjust dst pointer based on the cumulative sequence lengths.
+  dst += seq_start * dst_entry_stride;
+
+  // Copies one entry of entry_size elements, strided across the block's
+  // threads.
+  auto copy_entry = [&](const scalar_t* __restrict__ _src,
+                        scalar_t* __restrict__ _dst) {
+    for (int i = threadIdx.x; i < entry_size; i += blockDim.x)
+      _dst[i] = _src[i];
+  };
+
+  for (int pid = split_start; pid < split_end; ++pid) {
+    auto block_id = batch_block_table[offset_div];
+    auto block_start_ptr = src_cache + block_id * cache_block_stride;
+    auto block_dst_ptr = dst + pid * dst_entry_stride;
+    copy_entry(block_start_ptr + offset * cache_entry_stride, block_dst_ptr);
+    offset += 1;
+    // bump to next block
+    if (offset == block_size) {
+      offset_div += 1;
+      offset = 0;
+    }
+  }
+}
+}  // namespace vllm
+
+// Macro to dispatch the kernel based on the data type.
+// CPY_DTYPE is the element type used for the copy. The expansion site must
+// have grid, block, stream, the tensor arguments, the stride/size locals and
+// seq_starts_ptr in scope.
+#define CALL_CP_GATHER_CACHE(CPY_DTYPE)                                 \
+  vllm::cp_gather_cache<CPY_DTYPE><<<grid, block, 0, stream>>>(         \
+      reinterpret_cast<CPY_DTYPE*>(src_cache.data_ptr()),               \
+      reinterpret_cast<CPY_DTYPE*>(dst.data_ptr()),                     \
+      block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
+      block_size, entry_size, block_table_stride, cache_block_stride,   \
+      cache_entry_stride, dst_entry_stride, seq_starts_ptr);
+
+// Gather sequences from the cache into the destination tensor.
+//  - cu_seq_lens contains the cumulative sequence lengths for each batch
+//  - block_table contains the cache block indices for each sequence
+//  - Optionally, seq_starts (if provided) offsets the starting slot index by
+//  seq_starts[bid]
+// block_table, cu_seq_lens and seq_starts must be int32, all tensors must be
+// on the same device, and src_cache and dst must share a dtype; violations
+// raise through TORCH_CHECK.
+// NOTE(review): a batch_size of 0 would request a zero-sized grid; confirm
+// callers never pass it.
+void cp_gather_cache(
+    torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
+    torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
+    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
+    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
+    int64_t batch_size,
+    std::optional<torch::Tensor> seq_starts = std::nullopt) {
+  at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  int32_t block_size = src_cache.size(1);
+  // All dimensions after the first two are flattened into one entry.
+  int32_t entry_size = src_cache.flatten(2, -1).size(2);
+
+  TORCH_CHECK(block_table.dtype() == torch::kInt32,
+              "block_table must be int32");
+  TORCH_CHECK(cu_seq_lens.dtype() == torch::kInt32,
+              "cu_seq_lens must be int32");
+  if (seq_starts.has_value()) {
+    TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
+                "seq_starts must be int32");
+  }
+
+  TORCH_CHECK(src_cache.device() == dst.device(),
+              "src_cache and dst must be on the same device");
+  TORCH_CHECK(src_cache.device() == block_table.device(),
+              "src_cache and block_table must be on the same device");
+  TORCH_CHECK(src_cache.device() == cu_seq_lens.device(),
+              "src_cache and cu_seq_lens must be on the same device");
+  if (seq_starts.has_value()) {
+    TORCH_CHECK(src_cache.device() == seq_starts.value().device(),
+                "src_cache and seq_starts must be on the same device");
+  }
+
+  int64_t block_table_stride = block_table.stride(0);
+  int64_t cache_block_stride = src_cache.stride(0);
+  int64_t cache_entry_stride = src_cache.stride(1);
+  int64_t dst_entry_stride = dst.stride(0);
+
+  // Decide on the number of splits based on the batch size.
+  // Larger batches get fewer splits per sequence: 2, 4 or 16.
+  int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
+  dim3 grid(batch_size, num_splits);
+  dim3 block(1024);
+
+  TORCH_CHECK(src_cache.dtype() == dst.dtype(),
+              "src_cache and dst must have the same dtype");
+
+  const int dtype_bits = src_cache.element_size() * 8;
+  const int32_t* seq_starts_ptr =
+      seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
+
+  // The kernel only copies elements, so an unsigned integer type of the same
+  // width stands in for the real element type.
+  if (dtype_bits == 32) {
+    CALL_CP_GATHER_CACHE(uint32_t);
+  } else if (dtype_bits == 16) {
+    CALL_CP_GATHER_CACHE(uint16_t);
+  } else if (dtype_bits == 8) {
+    CALL_CP_GATHER_CACHE(uint8_t);
+  } else {
+    TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits);
+  }
+}
--- a/csrc/cpu/dnnl_helper.cpp
+++ b/csrc/cpu/dnnl_helper.cpp
@ -22,6 +22,23 @@ void release_dnnl_matmul_handler(int64_t handler) {
  delete ptr;
 }

+// Starts with no buffer, then reserves an initial scratchpad of 128
+// allocation units.
+DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) {
+  constexpr size_t initial_units = 128;
+  realloc(allocation_unit * initial_units);
+}
+
+// Grows the shared scratchpad so that it holds at least new_size bytes,
+// rounded up to a multiple of allocation_unit. The buffer never shrinks, and
+// its previous contents are not preserved across a resize.
+// The handlers in this file re-read the pointer through get_data() on every
+// execute(), after any realloc triggered by get_matmul_cache(), so the old
+// buffer can be released here instead of being leaked.
+// NOTE(review): size_ and ptr_ are not synchronized; confirm that no other
+// thread can be executing a primitive while the scratchpad is resized.
+void DNNLScratchPadManager::realloc(size_t new_size) {
+  new_size = round(new_size);
+  if (new_size <= size_) {
+    return;
+  }
+  // std::free(nullptr) is a no-op, which covers the very first allocation.
+  std::free(ptr_);
+  // The rounded size is a multiple of allocation_unit and therefore of the
+  // 64-byte alignment, as std::aligned_alloc requires.
+  ptr_ = std::aligned_alloc(64, new_size);
+  size_ = new_size;
+}
+
+// Returns the single function-local static manager, built on first use.
+DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() {
+  static DNNLScratchPadManager instance;
+  DNNLScratchPadManager* const instance_ptr = &instance;
+  return instance_ptr;
+}
+
 template <typename KT, typename VT>
 class DNNLPrimitiveCache {
 public:
@ -166,6 +183,23 @@ struct hash<W8A8MatMulPrimitiveHandler::MSizeCacheKey> {
           hash<int>()(static_cast<int>(val.bias_type));
  }
 };
+
+// Hash for the per-weight-shape cache key: XOR of the hashes of the N and K
+// sizes.
+template <>
+struct hash<MatMulPrimitiveHandler::ClassMatmulCacheKey> {
+  size_t operator()(
+      const MatMulPrimitiveHandler::ClassMatmulCacheKey& key) const {
+    const hash<dnnl_dim_t> dim_hasher{};
+    const size_t n_hash = dim_hasher(key.b_n_size);
+    const size_t k_hash = dim_hasher(key.b_k_size);
+    return n_hash ^ k_hash;
+  }
+};
+
+// Hash for the per-call cache key: XOR of the hashes of the M size, the M
+// stride, the bias flag and the bias data type.
+template <>
+struct hash<MatMulPrimitiveHandler::MSizeCacheKey> {
+  size_t operator()(const MatMulPrimitiveHandler::MSizeCacheKey& key) const {
+    const hash<dnnl_dim_t> dim_hasher{};
+    size_t combined = dim_hasher(key.a_m_size);
+    combined ^= dim_hasher(key.a_m_stride);
+    combined ^= hash<bool>()(key.use_bias);
+    combined ^= hash<int>()(static_cast<int>(key.bias_type));
+    return combined;
+  }
+};
 }  // namespace std

 bool operator==(const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
@ -181,6 +215,17 @@ bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l,
         l.bias_type == r.bias_type;
 }

+// Two weight-shape keys are equal when both their N and K sizes match.
+bool operator==(const MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
+                const MatMulPrimitiveHandler::ClassMatmulCacheKey& r) {
+  if (l.b_n_size != r.b_n_size) {
+    return false;
+  }
+  return l.b_k_size == r.b_k_size;
+}
+
+// Two per-call keys are equal when the M size, M stride, bias flag and bias
+// data type all match.
+bool operator==(const MatMulPrimitiveHandler::MSizeCacheKey& l,
+                const MatMulPrimitiveHandler::MSizeCacheKey& r) {
+  if (l.a_m_size != r.a_m_size) {
+    return false;
+  }
+  if (l.a_m_stride != r.a_m_stride) {
+    return false;
+  }
+  if (l.use_bias != r.use_bias) {
+    return false;
+  }
+  return l.bias_type == r.bias_type;
+}
+
 static std::shared_ptr<W8A8MatMulPrimitiveHandler::MSizeCache>
 get_w8a8_class_primitive_cache(
    const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& key,
@ -239,6 +284,11 @@ void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) {
  }

  dnnl::matmul matmul = get_matmul_cache(args);
+
+  auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(5);
+  scratchpad_storage->set_data_handle(
+      DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());
+
  matmul.execute(default_stream(), memory_cache_);
  default_stream().wait();
 }
@ -257,6 +307,8 @@ dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache(

  return m_size_cache_->get_or_create(key, [&]() {
    dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
+    auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager();
+    manager->realloc(desc.scratchpad_desc().get_size());
    return dnnl::matmul(desc);
  });
 }
@ -300,6 +352,11 @@ void W8A8MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
      dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                   default_engine(), nullptr);
  set_runtime_memory_ptr(4, memory_cache_[DNNL_ARG_BIAS].get());
+
+  memory_cache_[DNNL_ARG_SCRATCHPAD] =
+      dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
+                   default_engine(), nullptr);
+  set_runtime_memory_ptr(5, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
 }

 dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc(
@ -319,6 +376,9 @@ dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc(
                          dnnl::memory::format_tag::ab);

  dnnl::primitive_attr attr;
+
+  attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
+
  // For PER_TOKEN, scales will be applied in outside epilogue
  if (a_qs_ == QuantizationStrategy::PER_TENSOR) {
    attr.set_scales_mask(DNNL_ARG_SRC, 0);
@ -344,3 +404,120 @@ dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc(
                                        attr);
  }
 }
+
+// Builds a handler for one weight matrix: asks oneDNN for a weights layout
+// using a descriptor with runtime-sized M and no bias, hands that layout to
+// prepack_weight together with the raw weight pointer, then creates the
+// reusable runtime memory objects.
+MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
+    : DNNLMatMulPrimitiveHandler(
+          static_cast<DNNLMatMulPrimitiveHandler::Args>(args), args.ab_type),
+      m_size_cache_(nullptr) {
+  // NOTE(review): the other methods in this file read b_type_; confirm that
+  // ab_type_ is a real member (this only compiles when asserts are enabled).
+  assert(ab_type_ == dnnl::memory::data_type::f32 ||
+         ab_type_ == dnnl::memory::data_type::bf16 ||
+         ab_type_ == dnnl::memory::data_type::f16);
+
+  const MSizeCacheKey layout_query_key{
+      .a_m_size = DNNL_RUNTIME_DIM_VAL,
+      .a_m_stride = DNNL_RUNTIME_DIM_VAL,
+      .use_bias = false,
+      .bias_type = dnnl::memory::data_type::undef};
+  const dnnl::matmul::primitive_desc layout_query_desc =
+      create_primitive_desc(layout_query_key, true);
+  prepack_weight(args.b_ptr, layout_query_desc.weights_desc());
+
+  init_runtime_memory_cache(args);
+}
+
+// Returns the per-M-size primitive cache shared by all handlers whose weights
+// have the shape described by key, creating it with capacity cache_size on
+// first request. The outer cache is a function-local static built with 128.
+static std::shared_ptr<MatMulPrimitiveHandler::MSizeCache>
+get_matul_class_primitive_cache(
+    const MatMulPrimitiveHandler::ClassMatmulCacheKey& key,
+    int64_t cache_size) {
+  using MSizeCache = MatMulPrimitiveHandler::MSizeCache;
+
+  static MatMulPrimitiveHandler::ClassMatmulCache cache(128);
+  assert(cache_size > 0);
+
+  auto make_m_size_cache = [&]() {
+    return std::make_shared<MSizeCache>(cache_size);
+  };
+  return cache.get_or_create(key, make_m_size_cache);
+}
+
+// Runs one matmul call. Rebinds the cached runtime memory objects to this
+// call's pointers and M size, fetches (or builds) the primitive matching the
+// key fields of args, points the scratchpad memory at the shared scratchpad
+// buffer, then executes on default_stream() and waits for completion.
+void MatMulPrimitiveHandler::execute(ExecArgs& args) {
+  // Runtime slots are registered in init_runtime_memory_cache:
+  // 0 = source (A), 1 = destination (C), 2 = bias, 3 = scratchpad.
+  auto&& [a_storage, a_mem_desc] = get_runtime_memory_ptr(0);
+  auto&& [c_storage, c_mem_desc] = get_runtime_memory_ptr(1);
+  a_storage->set_data_handle((void*)args.a_ptr);
+  // Patch the row count and row stride of A for this call.
+  a_mem_desc->dims[0] = args.a_m_size;
+  a_mem_desc->format_desc.blocking.strides[0] = args.a_m_stride;
+  c_storage->set_data_handle((void*)args.c_ptr);
+  c_mem_desc->dims[0] = args.a_m_size;
+
+  if (args.use_bias) {
+    auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2);
+    bias_storage->set_data_handle((void*)args.bias_ptr);
+  }
+
+  // May grow the shared scratchpad when a new primitive is created, so the
+  // scratchpad pointer is read only after this call.
+  dnnl::matmul matmul = get_matmul_cache(args);
+
+  auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
+  scratchpad_storage->set_data_handle(
+      DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());
+
+  matmul.execute(default_stream(), memory_cache_);
+  default_stream().wait();
+}
+
+// Returns the matmul primitive for key, creating and caching it on a miss.
+// The per-M-size cache is looked up lazily by weight shape on first use.
+// Creating a primitive also grows the shared scratchpad to the size reported
+// by the new primitive descriptor.
+dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache(
+    const MSizeCacheKey& key) {
+  if (!m_size_cache_) {
+    const ClassMatmulCacheKey class_key = {.b_n_size = b_n_size_,
+                                           .b_k_size = b_k_size_};
+    m_size_cache_ =
+        get_matul_class_primitive_cache(class_key, primitive_cache_size_);
+  }
+
+  auto build_primitive = [&]() {
+    dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
+    DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc(
+        desc.scratchpad_desc().get_size());
+    return dnnl::matmul(desc);
+  };
+  return m_size_cache_->get_or_create(key, build_primitive);
+}
+
+// Builds the oneDNN matmul primitive descriptor for the given key.
+// With first_time set, the weights descriptor uses format_tag::any and A uses
+// the plain `ab` layout. Otherwise A uses the row stride from the key and the
+// weights use the stored b_target_mem_desc_. A bias descriptor is included
+// only when key.use_bias is set. The scratchpad is always user-managed.
+dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
+    const MSizeCacheKey& key, bool first_time) {
+  dnnl::memory::desc a_md;
+  dnnl::memory::desc b_md;
+  if (first_time) {
+    a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
+                              dnnl::memory::format_tag::ab);
+    b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_,
+                              dnnl::memory::format_tag::any);
+  } else {
+    a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
+                              {key.a_m_stride, 1});
+    b_md = b_target_mem_desc_;
+  }
+  dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
+                          dnnl::memory::format_tag::ab);
+
+  dnnl::primitive_attr attr;
+  // The scratchpad buffer is supplied by the caller at execution time.
+  attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
+
+  if (key.use_bias) {
+    // Bias is described as a single row of b_n_size_ elements.
+    dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
+    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
+                                        c_md, attr);
+  } else {
+    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
+                                        attr);
+  }
+}
+
+// Creates the dnnl::memory objects reused by every execute() call and
+// registers them under runtime slots 0..3 (source, destination, bias,
+// scratchpad). All of them start with a null data handle; execute() binds
+// the real pointers per call.
+void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
+  // Placeholder one-row descriptors; execute() patches the M dimension.
+  memory_cache_[DNNL_ARG_SRC] = dnnl::memory(
+      {{1, b_k_size_}, b_type_, {b_k_size_, 1}}, default_engine(), nullptr);
+  set_runtime_memory_ptr(0, memory_cache_[DNNL_ARG_SRC].get());
+  memory_cache_[DNNL_ARG_DST] =
+      dnnl::memory({{1, b_n_size_}, c_type_, dnnl::memory::format_tag::ab},
+                   default_engine(), nullptr);
+  set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());
+
+  memory_cache_[DNNL_ARG_BIAS] =
+      dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
+                   default_engine(), nullptr);
+  set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get());
+
+  // NOTE(review): this descriptor ({b_n_size_} f32) does not come from the
+  // primitive's scratchpad_desc(); presumably only the data handle matters
+  // here -- confirm against oneDNN's user-scratchpad requirements.
+  memory_cache_[DNNL_ARG_SCRATCHPAD] =
+      dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
+                   default_engine(), nullptr);
+  set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
+}
--- a/csrc/cpu/dnnl_helper.h
+++ b/csrc/cpu/dnnl_helper.h
@ -59,6 +59,30 @@ constexpr inline dnnl::memory::data_type get_dnnl_type() {
  return DNNLType<std::decay_t<T>>::type;
 }

+// Owns a single grow-only scratchpad buffer that the matmul handlers pass to
+// oneDNN as the user-provided scratchpad.
+class DNNLScratchPadManager {
+ public:
+  // Granularity, in bytes, to which requested sizes are rounded up.
+  static constexpr size_t allocation_unit = 4 * 1024 * 1024;  // 4 MiB
+
+  // Returns the shared manager instance.
+  static DNNLScratchPadManager* get_dnnl_scratchpad_manager();
+
+  DNNLScratchPadManager();
+
+  // Returns the current buffer reinterpreted as T*.
+  template <typename T>
+  T* get_data() {
+    return reinterpret_cast<T*>(ptr_);
+  }
+
+  // Rounds size up to the next multiple of allocation_unit.
+  static size_t round(size_t size) {
+    return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit;
+  }
+
+  // Ensures the buffer can hold at least new_size bytes; never shrinks it.
+  void realloc(size_t new_size);
+
+ private:
+  size_t size_;  // Current buffer size in bytes.
+  void* ptr_;    // Current buffer, or nullptr before the first allocation.
+};
+
 class DNNLMatMulPrimitiveHandler {
 public:
  virtual ~DNNLMatMulPrimitiveHandler() = default;
@ -166,4 +190,54 @@ class W8A8MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {
  std::shared_ptr<MSizeCache> m_size_cache_;
 };

+// oneDNN matmul handler for a fixed weight matrix whose activations and
+// weights share one data type (Args::ab_type). Primitives are cached per
+// weight shape and, within that, per call key (M size, M stride, bias).
+class MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {
+ public:
+  // Construction arguments: the base handler arguments plus the data type
+  // used for both A and B.
+  struct Args : public DNNLMatMulPrimitiveHandler::Args {
+    dnnl::memory::data_type ab_type;
+  };
+
+  // Key of the outer cache: the weight shape.
+  struct ClassMatmulCacheKey {
+    dnnl_dim_t b_n_size;
+    dnnl_dim_t b_k_size;
+
+    friend bool operator==(const ClassMatmulCacheKey& l,
+                           const ClassMatmulCacheKey& r);
+  };
+
+  // Key of the inner cache: the per-call properties that select a primitive.
+  struct MSizeCacheKey {
+    dnnl_dim_t a_m_size;
+    dnnl_dim_t a_m_stride;
+    bool use_bias;
+    dnnl::memory::data_type bias_type;
+
+    friend bool operator==(const MSizeCacheKey& l, const MSizeCacheKey& r);
+  };
+
+  using MSizeCache = DNNLPrimitiveCache<MSizeCacheKey, dnnl::matmul>;
+  using ClassMatmulCache =
+      DNNLPrimitiveCache<ClassMatmulCacheKey, std::shared_ptr<MSizeCache>>;
+
+  // Per-call arguments: the cache key fields plus the data pointers.
+  struct ExecArgs : public MSizeCacheKey {
+    const void* a_ptr;
+    const void* bias_ptr;
+    void* c_ptr;
+  };
+
+ public:
+  MatMulPrimitiveHandler(const Args& args);
+
+  // Executes one matmul using the pointers and key in args.
+  void execute(ExecArgs& args);
+
+ private:
+  // Builds the primitive descriptor for key; first_time selects the variant
+  // that lets oneDNN pick the weights layout.
+  dnnl::matmul::primitive_desc create_primitive_desc(const MSizeCacheKey& key,
+                                                     bool first_time);
+
+  // Creates the reusable runtime memory objects.
+  void init_runtime_memory_cache(const Args& args);
+
+  // Returns the cached primitive for key, creating it on a miss.
+  dnnl::matmul get_matmul_cache(const MSizeCacheKey& key);
+
+ private:
+  // Per-M-size primitive cache, obtained lazily on first use.
+  std::shared_ptr<MSizeCache> m_size_cache_;
+};
+
 #endif
--- a/csrc/cpu/dnnl_kernels.cpp
+++ b/csrc/cpu/dnnl_kernels.cpp
@ -379,6 +379,7 @@ void onednn_scaled_mm(
  exec_args.a_ptr = a.data_ptr<int8_t>();
  exec_args.a_m_size = a.size(0);
  exec_args.bias_ptr = nullptr;
+  exec_args.bias_type = get_dnnl_type<void>();
  exec_args.use_bias = false;
  exec_args.a_scales_ptr = nullptr;
  exec_args.a_zero_points_ptr = nullptr;
@ -492,3 +493,56 @@ void dynamic_scaled_int8_quant(
        }
      });
 }
+
+// Builds a MatMulPrimitiveHandler for the 2-D tensor `b` and returns the
+// heap-allocated handler's address encoded as an int64_t. The sizes and
+// strides of `b` are forwarded as-is (dim 0 -> k, dim 1 -> n), and both
+// data-type fields are derived from b's scalar type.
+int64_t create_onednn_mm_handler(const torch::Tensor& b,
+                                 int64_t primitive_cache_size) {
+  TORCH_CHECK(b.dim() == 2);
+
+  MatMulPrimitiveHandler::Args handler_args;
+  handler_args.primitive_cache_size = primitive_cache_size;
+
+  // Raw view of b: data pointer plus (k, n) extents and strides.
+  handler_args.b_ptr = b.data_ptr();
+  handler_args.b_k_size = b.size(0);
+  handler_args.b_n_size = b.size(1);
+  handler_args.b_k_stride = b.stride(0);
+  handler_args.b_n_stride = b.stride(1);
+
+  // One oneDNN data type, taken from b, is used for both type fields.
+  VLLM_DISPATCH_FLOATING_TYPES(
+      b.scalar_type(), "create_onednn_mm_handler", [&] {
+        const auto dnnl_type = get_dnnl_type<scalar_t>();
+        handler_args.c_type = dnnl_type;
+        handler_args.ab_type = dnnl_type;
+      });
+
+  // Ownership passes to the caller through the integer handle.
+  MatMulPrimitiveHandler* created = new MatMulPrimitiveHandler(handler_args);
+  return reinterpret_cast<int64_t>(created);
+}
+
+// Runs a oneDNN matmul through a handler created by
+// create_onednn_mm_handler().
+//
+//   c       : output buffer, must be contiguous; its leading dimensions must
+//             flatten to a.size(0) rows.
+//   a       : 2-D input whose last dimension has stride 1.
+//   bias    : optional bias; when present it must have the same dtype as `a`
+//             (enforced by data_ptr<scalar_t>()).
+//   handler : integer handle returned by create_onednn_mm_handler().
+//
+// `a` and `c` must share a floating dtype (enforced by data_ptr<scalar_t>()).
+void onednn_mm(torch::Tensor& c,        // [M, OC], row-major
+               const torch::Tensor& a,  // [M, IC], row-major
+               const std::optional<torch::Tensor>& bias, int64_t handler) {
+  CPU_KERNEL_GUARD_IN(onednn_mm)
+  TORCH_CHECK(a.dim() == 2);
+  TORCH_CHECK(a.stride(-1) == 1);
+  TORCH_CHECK(c.is_contiguous());
+  // The kernel writes a.size(0) rows into c through a raw pointer, so a `c`
+  // with a different number of rows would be written out of bounds.
+  TORCH_CHECK(c.dim() >= 1 && c.numel() == a.size(0) * c.size(-1),
+              "onednn_mm: output rows must match a.size(0)");
+  MatMulPrimitiveHandler* ptr =
+      reinterpret_cast<MatMulPrimitiveHandler*>(handler);
+  // A zero handle would otherwise be dereferenced as a null pointer below.
+  TORCH_CHECK(ptr != nullptr, "onednn_mm: handler must not be null");
+
+  MatMulPrimitiveHandler::ExecArgs exec_args;
+  exec_args.a_m_size = a.size(0);
+  exec_args.a_m_stride = a.stride(0);
+
+  VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] {
+    if (bias.has_value()) {
+      exec_args.use_bias = true;
+      exec_args.bias_type = get_dnnl_type<scalar_t>();
+      exec_args.bias_ptr = bias->data_ptr<scalar_t>();
+    } else {
+      // No bias: use the <void> data type and a null pointer.
+      exec_args.use_bias = false;
+      exec_args.bias_type = get_dnnl_type<void>();
+      exec_args.bias_ptr = nullptr;
+    }
+    exec_args.a_ptr = a.data_ptr<scalar_t>();
+    exec_args.c_ptr = c.data_ptr<scalar_t>();
+
+    ptr->execute(exec_args);
+  });
+}
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@ -21,6 +21,12 @@ void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
                      const std::optional<torch::Tensor>& bias,
                      int64_t handler);

+int64_t create_onednn_mm_handler(const torch::Tensor& b,  // defined in csrc/cpu/dnnl_kernels.cpp; returns a handler pointer encoded as int64_t
+                                 int64_t primitive_cache_size);
+
+void onednn_mm(torch::Tensor& c, const torch::Tensor& a,  // c is the output, a the input; defined in csrc/cpu/dnnl_kernels.cpp
+               const std::optional<torch::Tensor>& bias, int64_t handler);  // handler: value returned by create_onednn_mm_handler()
+
 void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
                        torch::Tensor& kv_cache, double scale,
                        torch::Tensor& block_tables, torch::Tensor& seq_lens);
@ -153,6 +159,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def("release_dnnl_matmul_handler(int handler) -> ()",
          &release_dnnl_matmul_handler);

+  // oneDNN GEMM handler factory: schema and function are bound in one call.
+  ops.def(
+      "create_onednn_mm_handler(Tensor b, int primitive_cache_size) -> int",
+      &create_onednn_mm_handler);
+
+  // oneDNN GEMM: the schema is declared first (c is the mutated output), then
+  // the CPU implementation is attached.
+  ops.def("onednn_mm(Tensor! c, Tensor a, Tensor? bias, int handler) -> ()");
+  ops.impl("onednn_mm", torch::kCPU, &onednn_mm);
+
  // Create oneDNN W8A8 handler
  ops.def(
      "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType "
--- a/csrc/dispatch_utils.h
+++ b/csrc/dispatch_utils.h
@ -19,6 +19,13 @@
 #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))

+#define VLLM_DISPATCH_CASE_HALF_TYPES(...)            \
+  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)  // dispatch cases for exactly two scalar types: Half and BFloat16
+
+#define VLLM_DISPATCH_HALF_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_HALF_TYPES(__VA_ARGS__))  // switch on TYPE over the Half/BFloat16 cases; NAME is forwarded to AT_DISPATCH_SWITCH
+
 // ROCm devices might use either fn or fnuz, so set up dispatch table for both.
 // A host-based check at runtime will create a preferred FP8 type for ROCm
 // such that the correct kernel is dispatched.
@ -45,6 +52,15 @@
 #define VLLM_DISPATCH_FP8_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FP8_TYPES(__VA_ARGS__))

+#define AT_DISPATCH_BYTE_CASE(enum_type, ...) \
+  AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, byte_t, __VA_ARGS__)  // dispatch case that passes `byte_t` (not the usual scalar_t) as the type-alias hint
+
+#define VLLM_DISPATCH_CASE_BYTE_TYPES(...) \
+  AT_DISPATCH_BYTE_CASE(at::ScalarType::Byte, __VA_ARGS__)  // single case: at::ScalarType::Byte
+
+#define VLLM_DISPATCH_BYTE_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_BYTE_TYPES(__VA_ARGS__))  // switch on TYPE over the Byte case only
+
 #define VLLM_DISPATCH_QUANT_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_QUANT_TYPES(__VA_ARGS__))

--- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
+++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
@ -27,11 +27,12 @@

 template<int kNThreads_, int kNItems_, int kNRows_, bool kIsEvenLen_,
         bool kIsVariableB_, bool kIsVariableC_,
-         bool kHasZ_, bool kVarlen_, typename input_t_, typename weight_t_>
+         bool kHasZ_, bool kVarlen_, typename input_t_, typename weight_t_, typename state_t_>
 struct Selective_Scan_fwd_kernel_traits {
    static_assert(kNItems_ % 4 == 0);
    using input_t = input_t_;
    using weight_t = weight_t_;
+    using state_t = state_t_;
    static constexpr int kNThreads = kNThreads_;
    // Setting MinBlocksPerMP to be 3 (instead of 2) for 128 threads improves occupancy.
    static constexpr int kMinBlocks = kNThreads < 128 ? 5 : 3;
@ -132,7 +133,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
    input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride;
    weight_t *C = reinterpret_cast<weight_t *>(params.C_ptr) + dim_id * kNRows * params.C_d_stride;
    input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride;
-    input_t *ssm_states = reinterpret_cast<input_t *>(params.ssm_states_ptr) + 
+    typename Ktraits::state_t *ssm_states = reinterpret_cast<typename Ktraits::state_t *>(params.ssm_states_ptr) + 
    cache_index * params.ssm_states_batch_stride + 
    dim_id * kNRows * params.ssm_states_dim_stride;
    
@ -261,7 +262,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                if (threadIdx.x == 0) {
                    smem_running_prefix[state_idx] = prefix_op.running_prefix;
                    if (chunk == n_chunks - 1) {
-                        ssm_states[state_idx * params.ssm_states_dstate_stride] = input_t(prefix_op.running_prefix.y);
+                        ssm_states[state_idx * params.ssm_states_dstate_stride] = typename Ktraits::state_t(prefix_op.running_prefix.y);
                    }
                }
                #pragma unroll
@ -310,7 +311,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
    }
 }

-template<int kNThreads, int kNItems, typename input_t, typename weight_t>
+template<int kNThreads, int kNItems, typename input_t, typename weight_t, typename state_t>
 void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
    // Only kNRows == 1 is tested for now, which ofc doesn't differ from previously when we had each block
    // processing 1 row.
@ -321,7 +322,7 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
    BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] {
        BOOL_SWITCH(params.z_ptr != nullptr , kHasZ, [&] {
            BOOL_SWITCH(params.query_start_loc_ptr != nullptr , kVarlen, [&] {
-                using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ,  kVarlen, input_t, weight_t>;
+                using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ,  kVarlen, input_t, weight_t, state_t>;
                constexpr int kSmemSize = Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(typename Ktraits::scan_t);
                dim3 grid(params.batch, params.dim / kNRows);
                auto kernel = &selective_scan_fwd_kernel<Ktraits>;
@ -341,59 +342,78 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
    });
 }

-template<typename input_t, typename weight_t>
+template<typename input_t, typename weight_t, typename state_t>  // state_t is forwarded to the launch and becomes the element type of ssm_states
 void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream) {  // picks a <kNThreads, kNItems> launch configuration from params.seqlen

     #ifndef USE_ROCM
         if (params.seqlen <= 128) {           
-            selective_scan_fwd_launch<32, 4, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<32, 4, input_t, weight_t, state_t>(params, stream);
         } else if (params.seqlen <= 256) {
-            selective_scan_fwd_launch<32, 8, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<32, 8, input_t, weight_t, state_t>(params, stream);
         } else if (params.seqlen <= 512) {
-            selective_scan_fwd_launch<32, 16, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<32, 16, input_t, weight_t, state_t>(params, stream);
         } else if (params.seqlen <= 1024) {
-            selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
         } else {
-            selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream);
         }
     #else
         if (params.seqlen <= 256) {
-            selective_scan_fwd_launch<64, 4, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<64, 4, input_t, weight_t, state_t>(params, stream);  // the ROCm path never uses fewer than 64 threads
         } else if (params.seqlen <= 512) {
-            selective_scan_fwd_launch<64, 8, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<64, 8, input_t, weight_t, state_t>(params, stream);
         } else if (params.seqlen <= 1024) {
-            selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
         } else {
-            selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream);
         }
     #endif
 }

-template void selective_scan_fwd_cuda<at::BFloat16, float>(SSMParamsBase &params, cudaStream_t stream);
-template void selective_scan_fwd_cuda<at::Half, float>(SSMParamsBase &params, cudaStream_t stream);
-template void selective_scan_fwd_cuda<float, float>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<at::BFloat16, float, at::BFloat16>(SSMParamsBase &params, cudaStream_t stream);  // explicit instantiations: <input_t, weight_t, state_t>
+template void selective_scan_fwd_cuda<at::BFloat16, float, float>(SSMParamsBase &params, cudaStream_t stream);  // 16-bit input with float state
+template void selective_scan_fwd_cuda<at::Half, float, at::Half>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<at::Half, float, float>(SSMParamsBase &params, cudaStream_t stream);  // 16-bit input with float state
+template void selective_scan_fwd_cuda<float, float, float>(SSMParamsBase &params, cudaStream_t stream);  // float input is only instantiated with float state

 #define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")

-#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, NAME, ...)              \
+#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, STYPE, NAME, ...)       \
     if (ITYPE == at::ScalarType::Half) {                                            \
         using input_t = at::Half;                                                   \
         using weight_t = float;                                                     \
-        __VA_ARGS__();                                                              \
+        if (STYPE == at::ScalarType::Half) {                                        \
+            using state_t = at::Half;                                               \
+            __VA_ARGS__();                                                          \
+        } else if (STYPE == at::ScalarType::Float) {                                \
+            using state_t = float;                                                  \
+            __VA_ARGS__();                                                          \
+        } else {                                                                    \
+            AT_ERROR(#NAME, " not implemented for state type '", toString(STYPE), "'"); \
+        }                                                                           \
     } else if (ITYPE == at::ScalarType::BFloat16) {                                 \
         using input_t = at::BFloat16;                                               \
         using weight_t = float;                                                     \
-        __VA_ARGS__();                                                              \
+        if (STYPE == at::ScalarType::BFloat16) {                                    \
+            using state_t = at::BFloat16;                                           \
+            __VA_ARGS__();                                                          \
+        } else if (STYPE == at::ScalarType::Float) {                                \
+            using state_t = float;                                                  \
+            __VA_ARGS__();                                                          \
+        } else {                                                                    \
+            AT_ERROR(#NAME, " not implemented for state type '", toString(STYPE), "'"); \
+        }                                                                           \
     } else if (ITYPE == at::ScalarType::Float)  {                                   \
         using input_t = float;                                                      \
         using weight_t = float;                                                     \
+        using state_t = float;                                                      \
         __VA_ARGS__();                                                              \
     } else {                                                                        \
         AT_ERROR(#NAME, " not implemented for input type '", toString(ITYPE), "'"); \
     }  // Binds input_t/weight_t/state_t from ITYPE/STYPE, then invokes __VA_ARGS__(). The Float-input branch does not inspect STYPE and always binds state_t = float, so callers must validate STYPE themselves. NAME is stringized with #NAME, so a string-literal NAME appears with its quotes in the error text.


-template<typename input_t, typename weight_t>
+template<typename input_t, typename weight_t, typename state_t>
 void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream);

 void set_ssm_params_fwd(SSMParamsBase &params,
@ -648,7 +668,9 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,

    // Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout
    at::Tensor out = delta;
-    TORCH_CHECK(ssm_states.scalar_type() == input_type);
+    // ssm_states can now be either the same as input_type or float32
+    auto state_type = ssm_states.scalar_type();
+    TORCH_CHECK(state_type == input_type || state_type == at::ScalarType::Float);
    TORCH_CHECK(ssm_states.is_cuda());
    TORCH_CHECK(ssm_states.stride(-1) == 1);

@ -670,7 +692,7 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
    
    const at::cuda::OptionalCUDAGuard device_guard(device_of(u));
    auto stream = at::cuda::getCurrentCUDAStream().stream();
-    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] {
-        selective_scan_fwd_cuda<input_t, weight_t>(params, stream);
+    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), ssm_states.scalar_type(), "selective_scan_fwd", [&] {
+        selective_scan_fwd_cuda<input_t, weight_t, state_t>(params, stream);
    });
 }
--- a/csrc/moe/grouped_topk_kernels.cu
+++ b/csrc/moe/grouped_topk_kernels.cu
@ -0,0 +1,758 @@
+/*
+ * Adapted from
+ * https://github.com/NVIDIA/TensorRT-LLM/blob/v0.21.0/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu
+ * Copyright (c) 2025, The vLLM team.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <c10/cuda/CUDAStream.h>
+#include <torch/all.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+namespace cg = cooperative_groups;
+
+namespace vllm {
+namespace moe {
+
+constexpr float kNegInfinity = INFINITY * -1;  // negative infinity; initial value for running maxima
+constexpr unsigned FULL_WARP_MASK = 0xffffffff;  // member mask passed to the *_sync warp intrinsics (all 32 bits set)
+constexpr int32_t WARP_SIZE = 32;
+constexpr int32_t BLOCK_SIZE = 512;  // NOTE(review): presumably the thread-block size used at launch - launch code is not visible here
+constexpr int32_t NUM_WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE;  // 16; the kernels use it to turn (blockIdx.x, warp_id) into a case index
+
+namespace warp_topk {
+
+// Rounds `len` up to the nearest multiple of `size`; 0 stays 0.
+template <int size, typename T>
+__host__ __device__ constexpr T round_up_to_multiple_of(T len) {
+  // The (len - 1) form avoids overflow near the top of T's range but needs
+  // the explicit zero case.
+  return (len == 0) ? static_cast<T>(0) : ((len - 1) / size + 1) * size;
+}
+
+// True when `v` is non-zero and has exactly one bit set.
+template <typename T>
+constexpr __host__ __device__ bool isPowerOf2(T v) {
+  // Clearing the lowest set bit of a power of two leaves zero.
+  return v != 0 && (v & (v - 1)) == 0;
+}
+
+// Strict comparison in the direction selected by `greater`:
+// val > baseline when greater is true, val < baseline otherwise.
+template <bool greater, typename T>
+__forceinline__ __device__ bool is_better_than(T val, T baseline) {
+  if constexpr (greater) {
+    return val > baseline;
+  } else {
+    return val < baseline;
+  }
+}
+
+// Index-aware variant used by the stable paths. Values are compared in the
+// direction selected by `greater`; equal values are decided by the smaller
+// index.
+// NOTE(review): the tie-break is `index < baseline_index` for both values of
+// `greater`, whereas the 32-lane Bitonic specializations flip their index
+// comparison with the sort direction - confirm this asymmetry is intended.
+template <bool greater, typename T, typename idxT>
+__forceinline__ __device__ bool is_better_than(T val, T baseline, idxT index,
+                                               idxT baseline_index) {
+  if (val == baseline) {
+    return index < baseline_index;
+  }
+  if constexpr (greater) {
+    return val > baseline;
+  } else {
+    return val < baseline;
+  }
+}
+
+// Returns the larger of two byte counts:
+//  - (sizeof(T) + sizeof(idxT)) for k entries per warp across all warps;
+//  - `elems` values, rounded up to a 256-byte multiple, followed by `elems`
+//    indices, where elems = max(num_of_warp / 2 * k, num_of_warp * WARP_SIZE).
+template <typename T, typename idxT>
+int calc_smem_size_for_block_wide(int num_of_warp, int64_t k) {
+  int64_t topk_cache_bytes = (sizeof(T) + sizeof(idxT)) * num_of_warp * k;
+  int64_t elems = std::max<int>(num_of_warp / 2 * k, num_of_warp * WARP_SIZE);
+  auto const value_bytes = round_up_to_multiple_of<256>(elems * sizeof(T));
+  auto const buffer_bytes = value_bytes + elems * sizeof(idxT);
+  return max(topk_cache_bytes, buffer_bytes);
+}
+
+// Generic merge step for `size` > WARP_SIZE. Each lane holds
+// size / WARP_SIZE elements in val_arr/idx_arr. The two halves of that
+// per-lane array are compare-exchanged element by element, then each half is
+// merged recursively with size / 2.
+template <int size, bool ascending, bool reverse, typename T, typename idxT,
+          bool is_stable>
+struct BitonicMerge {
+  // input should be a bitonic sequence, and sort it to be a monotonic sequence
+  __device__ static void merge(T* __restrict__ val_arr,
+                               idxT* __restrict__ idx_arr) {
+    static_assert(isPowerOf2(size));
+    static_assert(size >= 2 * WARP_SIZE);
+    constexpr int arr_len = size / WARP_SIZE;
+    constexpr int half_len = arr_len / 2;
+
+    for (int lo = 0; lo < half_len; ++lo) {
+      int const hi = lo + half_len;
+
+      bool needs_swap;
+      if constexpr (is_stable) {
+        needs_swap = is_better_than<ascending>(val_arr[lo], val_arr[hi],
+                                               idx_arr[lo], idx_arr[hi]);
+      } else {
+        needs_swap = is_better_than<ascending>(val_arr[lo], val_arr[hi]);
+      }
+
+      if (needs_swap) {
+        // Exchange the value pair and the matching index pair.
+        T const held_val = val_arr[lo];
+        val_arr[lo] = val_arr[hi];
+        val_arr[hi] = held_val;
+
+        idxT const held_idx = idx_arr[lo];
+        idx_arr[lo] = idx_arr[hi];
+        idx_arr[hi] = held_idx;
+      }
+    }
+
+    using HalfMerge =
+        BitonicMerge<size / 2, ascending, reverse, T, idxT, is_stable>;
+    HalfMerge::merge(val_arr, idx_arr);
+    HalfMerge::merge(val_arr + half_len, idx_arr + half_len);
+  }
+};
+
+// Generic sort for `size` > WARP_SIZE: sorts the lower half with
+// ascending = true and the upper half with ascending = false, then merges
+// the whole range in the requested direction.
+template <int size, bool ascending, typename T, typename idxT, bool is_stable>
+struct BitonicSort {
+  __device__ static void sort(T* __restrict__ val_arr,
+                              idxT* __restrict__ idx_arr) {
+    static_assert(isPowerOf2(size));
+    static_assert(size >= 2 * WARP_SIZE);
+    constexpr int half_len = (size / WARP_SIZE) / 2;
+
+    using LowerSort = BitonicSort<size / 2, true, T, idxT, is_stable>;
+    using UpperSort = BitonicSort<size / 2, false, T, idxT, is_stable>;
+    using FullMerge =
+        BitonicMerge<size, ascending, ascending, T, idxT, is_stable>;
+
+    LowerSort::sort(val_arr, idx_arr);
+    UpperSort::sort(val_arr + half_len, idx_arr + half_len);
+    FullMerge::merge(val_arr, idx_arr);
+  }
+};
+
+template <bool ascending, typename T, typename idxT, bool is_stable>
+struct BitonicSort<32, ascending, T, idxT, is_stable> {  // base case: one (value, index) pair per lane, exchanged with warp shuffles
+  __device__ static void sort(T* __restrict__ val_arr,  // only *val_arr / *idx_arr (a single element per lane) are read and written
+                              idxT* __restrict__ idx_arr) {
+    int const lane = threadIdx.x % WARP_SIZE;
+
+    // ascending doesn't matter before merging since all we need is a bitonic
+    // sequence
+    for (int stage = 0; stage < 4; ++stage) {
+      for (int stride = (1 << stage); stride > 0; stride /= 2) {
+        bool reverse = (lane >> stage) & 2;  // tests bit (stage + 1) of the lane id
+        bool is_second = lane & stride;      // true for the higher-numbered lane of the pair
+
+        T other = __shfl_xor_sync(FULL_WARP_MASK, *val_arr, stride);  // partner is lane ^ stride
+        idxT other_idx = __shfl_xor_sync(FULL_WARP_MASK, *idx_arr, stride);
+
+        bool is_better;  // despite the name: true means this lane adopts the partner's pair
+        if constexpr (is_stable) {
+          if constexpr (ascending) {
+            is_better = ((*val_arr > other) ||
+                         ((*val_arr == other) && (*idx_arr < other_idx))) !=
+                        (reverse != is_second);
+          } else {
+            is_better = ((*val_arr > other) ||
+                         ((*val_arr == other) && (*idx_arr > other_idx))) !=
+                        (reverse != is_second);
+          }
+        } else {
+          is_better = (*val_arr != other &&  // equal values are never exchanged in the unstable path
+                       (*val_arr > other) != (reverse != is_second));
+        }
+        if (is_better) {
+          *val_arr = other;
+          *idx_arr = other_idx;
+        }
+      }
+    }
+
+    BitonicMerge<32, ascending, ascending, T, idxT, is_stable>::merge(val_arr,  // final 32-wide merge in the requested direction
+                                                                      idx_arr);
+  }
+};
+
+template <bool ascending, bool reverse, typename T, typename idxT,
+          bool is_stable>
+struct BitonicMerge<32, ascending, reverse, T, idxT, is_stable> {  // base case: merges 32 elements, one per lane, using warp shuffles
+  __device__ static void merge(T* __restrict__ val_arr,
+                               idxT* __restrict__ idx_arr) {
+    int const lane = threadIdx.x % WARP_SIZE;
+    for (int stride = WARP_SIZE / 2; stride > 0; stride /= 2) {  // strides 16, 8, 4, 2, 1
+      bool is_second = lane & stride;  // true for the higher-numbered lane of the pair
+      T& val = *val_arr;
+      T other = __shfl_xor_sync(FULL_WARP_MASK, val, stride);  // partner is lane ^ stride
+      idxT& idx = *idx_arr;
+      idxT other_idx = __shfl_xor_sync(FULL_WARP_MASK, idx, stride);
+
+      bool is_better;  // despite the name: true means this lane adopts the partner's pair
+      if constexpr (is_stable) {
+        if constexpr (ascending) {
+          is_better = ((*val_arr > other) ||
+                       ((*val_arr == other) && (*idx_arr < other_idx))) ==
+                      (reverse != is_second);  // for min
+        } else {
+          is_better = ((*val_arr > other) ||
+                       ((*val_arr == other) && (*idx_arr > other_idx))) ==
+                      (reverse != is_second);  // for max
+        }
+      } else {
+        is_better =  // the unstable path keys on `ascending`, the stable path on `reverse`
+            (val != other && ((val > other) == (ascending != is_second)));
+      }
+
+      if (is_better) {
+        val = other;
+        idx = other_idx;
+      }
+    }
+  }
+};
+
+template <int capacity, bool greater, typename T, typename idxT, bool is_stable>
+class WarpSort {  // per-warp container of `capacity` (value, index) pairs, capacity / WARP_SIZE per lane, held in member arrays
+ public:
+  __device__ WarpSort(idxT k, T dummy)  // k: number of results kept; dummy: sentinel value that fills every slot initially
+      : lane_(threadIdx.x % WARP_SIZE), k_(k), dummy_(dummy) {
+    static_assert(capacity >= WARP_SIZE && isPowerOf2(capacity));
+
+    for (int i = 0; i < max_arr_len_; ++i) {
+      val_arr_[i] = dummy_;
+      idx_arr_[i] = 0;  // sentinel slots carry index 0
+    }
+  }
+
+  // load and merge k sorted values
+  __device__ void load_sorted(T const* __restrict__ in,
+                              idxT const* __restrict__ in_idx, idxT start) {  // reads in[start .. start + k_) and the matching in_idx entries
+    idxT idx = start + WARP_SIZE - 1 - lane_;  // lanes walk the input in reverse lane order
+    for (int i = max_arr_len_ - 1; i >= 0; --i, idx += WARP_SIZE) {
+      if (idx < start + k_) {
+        T t = in[idx];
+        bool is_better;
+        if constexpr (is_stable) {
+          is_better =
+              is_better_than<greater>(t, val_arr_[i], in_idx[idx], idx_arr_[i]);
+        } else {
+          is_better = is_better_than<greater>(t, val_arr_[i]);
+        }
+        if (is_better) {  // keep whichever of the incoming and the held element is better
+          val_arr_[i] = t;
+          idx_arr_[i] = in_idx[idx];
+        }
+      }
+    }
+
+    BitonicMerge<capacity, greater, !greater, T, idxT, is_stable>::merge(
+        val_arr_, idx_arr_);
+  }
+
+  __device__ void dump(T* __restrict__ out, idxT* __restrict__ out_idx) const {  // writes the first k_ values and indices
+    for (int i = 0; i < max_arr_len_; ++i) {
+      idxT out_i = i * WARP_SIZE + lane_;  // slot i of lane L maps to output position i * WARP_SIZE + L
+      if (out_i < k_) {
+        out[out_i] = val_arr_[i];
+        out_idx[out_i] = idx_arr_[i];
+      }
+    }
+  }
+
+  __device__ void dumpIdx(idxT* __restrict__ out_idx) const {  // same as dump() but writes indices only
+    for (int i = 0; i < max_arr_len_; ++i) {
+      idxT out_i = i * WARP_SIZE + lane_;
+      if (out_i < k_) {
+        out_idx[out_i] = idx_arr_[i];
+      }
+    }
+  }
+
+ protected:
+  static constexpr int max_arr_len_ = capacity / WARP_SIZE;  // elements held by each lane
+
+  T val_arr_[max_arr_len_];
+  idxT idx_arr_[max_arr_len_];
+
+  int const lane_;  // threadIdx.x % WARP_SIZE
+  idxT const k_;
+  T const dummy_;
+
+};  // end class WarpSort
+
+// Streaming top-k selector for one warp, built on WarpSort.
+//
+// Candidates are filtered against the current k-th best element (k_th_).
+// Survivors are staged in a per-warp shared-memory buffer of WARP_SIZE slots;
+// whenever the buffer fills it is sorted and merged into the register-held
+// result. Call done() after the last add() to flush the remaining staged
+// candidates.
+//
+// Shared memory layout (dynamic `smem_buf`): a value region of
+// num_of_warp * WARP_SIZE elements of T, then, at the next 256-byte multiple,
+// an index region of the same element count; each warp owns WARP_SIZE
+// consecutive slots in each region.
+template <int capacity, bool greater, typename T, typename idxT, bool is_stable>
+class WarpSelect : public WarpSort<capacity, greater, T, idxT, is_stable> {
+ public:
+  // k: number of results to keep; dummy: sentinel value that also serves as
+  // the initial k-th threshold.
+  __device__ WarpSelect(idxT k, T dummy)
+      : WarpSort<capacity, greater, T, idxT, is_stable>(k, dummy),
+        k_th_(dummy),
+        k_th_lane_((k - 1) % WARP_SIZE) {
+    extern __shared__ char smem_buf[];  // extern __shared__ T smem_buf[];
+
+    int const num_of_warp = blockDim.x / WARP_SIZE;
+    int const warp_id = threadIdx.x / WARP_SIZE;
+    val_smem_ = reinterpret_cast<T*>(smem_buf);
+    val_smem_ += warp_id * WARP_SIZE;
+    idx_smem_ = reinterpret_cast<idxT*>(
+        smem_buf +
+        round_up_to_multiple_of<256>(num_of_warp * sizeof(T) * WARP_SIZE));
+    idx_smem_ += warp_id * WARP_SIZE;
+  }
+
+  // Feeds in[start, end) to the selector. The range is padded up to a whole
+  // number of warps so that every lane calls add(val, idx) the same number of
+  // times; padding lanes pass dummy_.
+  __device__ void add(T const* in, idxT start, idxT end) {
+    idxT const end_for_fullwarp =
+        round_up_to_multiple_of<WARP_SIZE>(end - start) + start;
+    for (idxT i = start + lane_; i < end_for_fullwarp; i += WARP_SIZE) {
+      T val = (i < end) ? in[i] : dummy_;
+      add(val, i);
+    }
+  }
+
+  // Offers one candidate per lane. Must be called by all 32 lanes together
+  // (it uses full-mask warp intrinsics).
+  __device__ void add(T val, idxT idx) {
+    bool do_add;
+    if constexpr (is_stable) {
+      do_add = is_better_than<greater>(val, k_th_, idx, k_th_idx_);
+    } else {
+      do_add = is_better_than<greater>(val, k_th_);
+    }
+
+    uint32_t mask = __ballot_sync(FULL_WARP_MASK, do_add);
+    if (mask == 0) {
+      return;
+    }
+
+    // Slot for this lane: current fill level plus the number of lower lanes
+    // that are also adding.
+    int pos = smem_buf_len_ + __popc(mask & ((0x1u << lane_) - 1));
+    if (do_add && pos < WARP_SIZE) {
+      val_smem_[pos] = val;
+      idx_smem_[pos] = idx;
+      do_add = false;
+    }
+    smem_buf_len_ += __popc(mask);
+    if (smem_buf_len_ >= WARP_SIZE) {
+      // Buffer is full: fold it into the running result.
+      __syncwarp();
+      merge_buf_(val_smem_[lane_], idx_smem_[lane_]);
+      smem_buf_len_ -= WARP_SIZE;
+    }
+    if (do_add) {
+      // Candidates that did not fit go to the front of the emptied buffer.
+      pos -= WARP_SIZE;
+      val_smem_[pos] = val;
+      idx_smem_[pos] = idx;
+    }
+    __syncwarp();
+  }
+
+  // Flushes any staged candidates into the result, then synchronizes the
+  // whole block.
+  __device__ void done() {
+    if (smem_buf_len_) {
+      T val = (lane_ < smem_buf_len_) ? val_smem_[lane_] : dummy_;
+      idxT idx = (lane_ < smem_buf_len_) ? idx_smem_[lane_] : 0;
+      merge_buf_(val, idx);
+    }
+
+    // after done(), smem is used for merging results among warps
+    __syncthreads();
+  }
+
+ private:
+  // Broadcasts the current k-th element (and, in stable mode, its index) from
+  // lane k_th_lane_ of the last per-lane slot to every lane.
+  __device__ void set_k_th_() {
+    k_th_ = __shfl_sync(FULL_WARP_MASK, val_arr_[max_arr_len_ - 1], k_th_lane_);
+    if constexpr (is_stable) {
+      k_th_idx_ =
+          __shfl_sync(FULL_WARP_MASK, idx_arr_[max_arr_len_ - 1], k_th_lane_);
+    }
+  }
+
+  // Sorts the 32 staged candidates (one per lane), lets each lane keep the
+  // better of its candidate and its last slot, re-merges the register array
+  // and refreshes the k-th threshold.
+  __device__ void merge_buf_(T val, idxT idx) {
+    BitonicSort<WARP_SIZE, greater, T, idxT, is_stable>::sort(&val, &idx);
+
+    T& old = val_arr_[max_arr_len_ - 1];
+
+    bool is_better;
+    if constexpr (is_stable) {
+      is_better =
+          is_better_than<greater>(val, old, idx, idx_arr_[max_arr_len_ - 1]);
+    } else {
+      is_better = is_better_than<greater>(val, old);
+    }
+
+    if (is_better) {
+      old = val;
+      idx_arr_[max_arr_len_ - 1] = idx;
+    }
+
+    BitonicMerge<capacity, greater, !greater, T, idxT, is_stable>::merge(
+        val_arr_, idx_arr_);
+
+    set_k_th_();
+  }
+
+  using WarpSort<capacity, greater, T, idxT, is_stable>::max_arr_len_;
+  using WarpSort<capacity, greater, T, idxT, is_stable>::val_arr_;
+  using WarpSort<capacity, greater, T, idxT, is_stable>::idx_arr_;
+  using WarpSort<capacity, greater, T, idxT, is_stable>::lane_;
+  using WarpSort<capacity, greater, T, idxT, is_stable>::k_;
+  using WarpSort<capacity, greater, T, idxT, is_stable>::dummy_;
+
+  T* val_smem_;
+  idxT* idx_smem_;
+  int smem_buf_len_ = 0;
+
+  T k_th_;
+  // Starts at 0, matching the index WarpSort gives its sentinel slots. The
+  // stable path of add() reads this before the first merge_buf_() whenever a
+  // candidate equals the initial threshold (e.g. the dummy_ padding lanes), so
+  // it must not be left uninitialized.
+  idxT k_th_idx_ = 0;
+  int const k_th_lane_;
+};  // end class WarpSelect
+}  // namespace warp_topk
+
+template <typename T_OUT, typename T_IN>
+__device__ inline T_OUT cuda_cast(T_IN val) {  // generic case: relies on the implicit T_IN -> T_OUT conversion
+  return val;
+}
+
+template <>
+__device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) {  // bf16 -> float goes through the __bfloat162float intrinsic
+  return __bfloat162float(val);
+}
+
+// Warp-level "sum of the two largest" over input[0, num_experts_per_group).
+// Lane 0 writes max1 + max2 to *output. When more than one lane holds the
+// maximum, the maximum is counted twice.
+// Expects lane_id in [0, WARP_SIZE) and all 32 lanes of the warp to call it.
+template <typename T>
+__device__ void topk_with_k2(T* output, T const* input,
+                             cg::thread_block_tile<32> const& tile,
+                             int32_t const lane_id,
+                             int const num_experts_per_group) {
+  // Per-lane best and runner-up.
+  T largest = -INFINITY;
+  T second_largest = -INFINITY;
+
+  if (num_experts_per_group > WARP_SIZE) {
+    // Each lane scans a strided subset and tracks its own top two.
+    for (int pos = lane_id; pos < num_experts_per_group; pos += WARP_SIZE) {
+      T const candidate = input[pos];
+      if (candidate > largest) {
+        second_largest = largest;
+        largest = candidate;
+      } else if (candidate > second_largest) {
+        second_largest = candidate;
+      }
+    }
+  } else if (lane_id < num_experts_per_group) {
+    // At most one element per lane: take it as-is.
+    largest = input[lane_id];
+  }
+
+  __syncwarp();  // Ensure all threads have valid data before reduction
+
+  // Warp-wide maximum and the number of lanes holding it.
+  T const max1 = cg::reduce(tile, largest, cg::greater<T>());
+  bool const holds_max1 = (max1 == largest);
+  int const count_max1 = __popc(__ballot_sync(FULL_WARP_MASK, holds_max1));
+
+  T max2 = max1;
+  if (count_max1 == 1) {
+    // Unique winner: that lane falls back to its runner-up and the warp
+    // reduces again to find the second-largest value.
+    if (holds_max1) {
+      largest = second_largest;
+    }
+    max2 = cg::reduce(tile, largest, cg::greater<T>());
+  }
+
+  if (lane_id == 0) {
+    *output = max1 + max2;
+  }
+}
+
+template <typename T>
+__global__ void topk_with_k2_kernel(T* output, T* input,  // each warp handles one "case": reads one input slice, writes output[case_id]
+                                    int64_t const num_tokens,  // not referenced in the kernel body
+                                    int64_t const num_cases,
+                                    int64_t const n_group,  // not referenced in the kernel body
+                                    int64_t const num_experts_per_group) {
+  int32_t warp_id = threadIdx.x / WARP_SIZE;
+  int32_t lane_id = threadIdx.x % WARP_SIZE;
+
+  int32_t case_id = blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id;  // NOTE(review): assumes blockDim.x == BLOCK_SIZE - confirm at the launch site
+  if (case_id < num_cases) {  // the condition depends only on warp_id, so a warp enters or skips as a whole
+    input += case_id * num_experts_per_group;  // slice of num_experts_per_group consecutive elements
+    output += case_id;
+
+    cg::thread_block block = cg::this_thread_block();
+    cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+    asm volatile("griddepcontrol.wait;");  // sm_90+ only: PTX grid-dependency wait, issued before input is read
+#endif
+    topk_with_k2(output, input, tile, lane_id, num_experts_per_group);
+  }
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  asm volatile("griddepcontrol.launch_dependents;");  // sm_90+ only: executed by every thread, including warps that skipped the work above
+#endif
+}
+
+// Second stage of grouped top-k routing. One warp handles one token:
+//   1. find the threshold group score (the topk_group-th largest entry of
+//      group_scores for this token) with repeated warp-wide max reductions;
+//   2. feed the experts of every selected group (read from scores_with_bias)
+//      into a warp-level top-k queue and dump the winning indices to shared
+//      memory;
+//   3. gather the matching entries of `scores`, optionally renormalize them
+//      by their sum, scale by routed_scaling_factor and write
+//      topk_values / topk_indices.
+// If no finite threshold is found, the first `topk` experts are emitted with
+// uniform weight 1 / topk.
+// The lane-indexed read of group_scores below only covers lane_id < n_group,
+// i.e. at most WARP_SIZE groups are considered.
+template <typename T, typename IdxT>
+__global__ void group_idx_and_topk_idx_kernel(
+    T* scores, T const* group_scores, T* topk_values, IdxT* topk_indices,
+    T* scores_with_bias, int64_t const num_tokens, int64_t const n_group,
+    int64_t const topk_group, int64_t const topk, int64_t const num_experts,
+    int64_t const num_experts_per_group, bool renormalize,
+    double routed_scaling_factor) {
+  int32_t warp_id = threadIdx.x / WARP_SIZE;
+  int32_t lane_id = threadIdx.x % WARP_SIZE;
+  int32_t case_id =
+      blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id;  // one per token
+  // Advance every pointer to this token's row (pointer arithmetic only; no
+  // memory is touched until after the case_id bounds check).
+  scores_with_bias += case_id * num_experts;
+  scores += case_id * num_experts;
+  group_scores += case_id * n_group;
+  topk_values += case_id * topk;
+  topk_indices += case_id * topk;
+
+  // Per-group expert count rounded up to a multiple of WARP_SIZE so that all
+  // lanes execute the same number of queue.add() iterations.
+  int32_t align_num_experts_per_group =
+      warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);
+
+  cg::thread_block block = cg::this_thread_block();
+  cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
+
+  extern __shared__ char smem_buf[];  // NOTE: reuse the shared memory here to
+                                      // store the target topk idx
+  // Layout: [NUM_WARPS_PER_BLOCK * topk int32 indices][per-warp T values];
+  // each warp then offsets into its own `topk`-sized slice of both regions.
+  int32_t* s_topk_idx = reinterpret_cast<int32_t*>(smem_buf);
+  T* s_topk_value =
+      reinterpret_cast<T*>(s_topk_idx + NUM_WARPS_PER_BLOCK * topk) +
+      warp_id * topk;
+  s_topk_idx += warp_id * topk;
+
+  T value = kNegInfinity;
+  T topk_group_value = kNegInfinity;
+  // Only assigned (and only read) when case_id < num_tokens.
+  int32_t num_equalto_topkth_group;
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  asm volatile("griddepcontrol.wait;");  // I think all prolog can be put before
+                                         // acqbulk because it's ptr arithmetic
+#endif
+
+  if (case_id < num_tokens) {
+    // calculate group_idx
+    // Lanes >= n_group keep `value` at -inf, so WARP_SIZE - n_group lanes are
+    // "eliminated" from the start; the loop below runs until topk_group more
+    // lanes have been eliminated.
+    int32_t target_num_min = WARP_SIZE - n_group + topk_group;
+    if (lane_id < n_group &&
+        (isfinite(cuda_cast<float, T>(
+            group_scores[lane_id]))))  // The check is necessary to avoid
+                                       // abnormal input
+    {
+      value = group_scores[lane_id];
+    }
+
+    int count_equal_to_top_value = WARP_SIZE - n_group;
+    int pre_count_equal_to_top_value = 0;
+    // Use a loop to find the largest top_group values: each iteration takes
+    // the warp-wide max, eliminates every lane holding it (ties included) and
+    // recounts the eliminated lanes.
+    // NOTE(review): the count can never exceed WARP_SIZE, so this loop only
+    // terminates if topk_group <= n_group - callers must guarantee that.
+    while (count_equal_to_top_value < target_num_min) {
+      __syncwarp();  // Ensure all threads have valid data before reduction
+      topk_group_value = cg::reduce(tile, value, cg::greater<T>());
+      if (value == topk_group_value) {
+        value = kNegInfinity;
+      }
+      pre_count_equal_to_top_value = count_equal_to_top_value;
+      count_equal_to_top_value = __popc(__ballot_sync(
+          FULL_WARP_MASK, (value == cuda_cast<T, float>(kNegInfinity))));
+    }
+    // How many groups tied at the threshold value may still be admitted.
+    num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
+  }
+  __syncthreads();
+
+  warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
+                        /* is_stable */ true>
+      queue((int32_t)topk, -INFINITY);
+
+  int count_equalto_topkth_group = 0;
+  // A -inf threshold means no finite group score was found (or the loop above
+  // did not run); in that case the fallback branch at the end is used.
+  bool if_proceed_next_topk =
+      (topk_group_value != cuda_cast<T, float>(kNegInfinity));
+  if (case_id < num_tokens && if_proceed_next_topk) {
+    for (int i_group = 0; i_group < n_group; i_group++) {
+      // Select groups strictly above the threshold, plus the first
+      // num_equalto_topkth_group groups that are exactly equal to it.
+      if ((group_scores[i_group] > topk_group_value) ||
+          ((group_scores[i_group] == topk_group_value) &&
+           (count_equalto_topkth_group < num_equalto_topkth_group))) {
+        int32_t offset = i_group * num_experts_per_group;
+        for (int32_t i = lane_id; i < align_num_experts_per_group;
+             i += WARP_SIZE) {
+          // Padding lanes and non-finite scores are inserted as -inf.
+          T candidates =
+              (i < num_experts_per_group) && isfinite(cuda_cast<float, T>(
+                                                 scores_with_bias[offset + i]))
+                  ? scores_with_bias[offset + i]
+                  : cuda_cast<T, float>(kNegInfinity);
+          queue.add(candidates, offset + i);
+        }
+        if (group_scores[i_group] == topk_group_value) {
+          count_equalto_topkth_group++;
+        }
+      }
+    }
+    queue.done();
+    __syncwarp();
+    // Get the topk_idx
+    queue.dumpIdx(s_topk_idx);
+    __syncwarp();
+  }
+
+  // Load the valid score value
+  // Calculate the summation
+  // The sum starts at a tiny positive value rather than zero (presumably to
+  // keep the renormalizing division below well defined).
+  float topk_sum = 1e-20;
+  if (case_id < num_tokens && if_proceed_next_topk) {
+    // The trip count is rounded up to a multiple of WARP_SIZE so that every
+    // lane takes part in each warp-wide reduce(); lanes beyond topk add 0.
+    for (int i = lane_id;
+         i < warp_topk::round_up_to_multiple_of<WARP_SIZE>(topk);
+         i += WARP_SIZE) {
+      // Weights come from `scores`, not from `scores_with_bias`, which was
+      // only used for the selection above.
+      T value =
+          i < topk
+              ? scores[s_topk_idx[i]]
+              : cuda_cast<T, float>(0.0f);  // Load the valid value of expert
+      if (i < topk) {
+        s_topk_value[i] = value;
+      }
+      topk_sum += reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
+    }
+  }
+
+  __syncthreads();
+
+  if (case_id < num_tokens) {
+    if (if_proceed_next_topk) {
+      for (int i = lane_id; i < topk; i += WARP_SIZE) {
+        float value;
+        if (renormalize) {
+          value = cuda_cast<float, T>(s_topk_value[i]) / topk_sum *
+                  routed_scaling_factor;
+        } else {
+          value = cuda_cast<float, T>(s_topk_value[i]) * routed_scaling_factor;
+        }
+        topk_indices[i] = s_topk_idx[i];
+        topk_values[i] = cuda_cast<T, float>(value);
+      }
+    } else {
+      for (int i = lane_id; i < topk; i += WARP_SIZE) {
+        topk_indices[i] = i;
+        topk_values[i] = cuda_cast<T, float>(1.0f / topk);
+      }
+    }
+    // Note: when if_proceed_next_topk==false, the first `topk` experts are
+    // chosen as the default result, each with weight 1 / topk.
+  }
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  asm volatile("griddepcontrol.launch_dependents;");
+#endif
+}
+
+// Host-side launcher for grouped top-k routing. Enqueues two kernels on
+// `stream`:
+//   1. topk_with_k2_kernel: one warp per (token, group) pair, writing one
+//      score per group into `group_scores`;
+//   2. group_idx_and_topk_idx_kernel: one warp per token, writing
+//      `topk_values` / `topk_indices`.
+// `enable_pdl` is forwarded as the programmatic-stream-serialization launch
+// attribute of both launches.
+// Raises a c10::Error (via TORCH_CHECK) if either launch is rejected; the
+// status used to be dropped, which left the outputs uninitialized without
+// any diagnostic.
+template <typename T, typename IdxT>
+void invokeNoAuxTc(T* scores, T* group_scores, T* topk_values,
+                   IdxT* topk_indices, T* scores_with_bias,
+                   int64_t const num_tokens, int64_t const num_experts,
+                   int64_t const n_group, int64_t const topk_group,
+                   int64_t const topk, bool const renormalize,
+                   double const routed_scaling_factor, bool enable_pdl = false,
+                   cudaStream_t const stream = 0) {
+  // Launch settings shared by both kernels.
+  cudaLaunchAttribute attrs[1];
+  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+  attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
+
+  cudaLaunchConfig_t config;
+  config.blockDim = BLOCK_SIZE;
+  config.stream = stream;
+  config.numAttrs = 1;
+  config.attrs = attrs;
+
+  int64_t const num_experts_per_group = num_experts / n_group;
+
+  // Stage 1: per-group scores, one warp per (token, group) case.
+  int64_t const num_cases = num_tokens * n_group;
+  int64_t const topk_with_k2_num_blocks =
+      (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
+  auto* kernel_instance1 = &topk_with_k2_kernel<T>;
+  config.gridDim = topk_with_k2_num_blocks;
+  config.dynamicSmemBytes = 0;
+  cudaError_t status = cudaLaunchKernelEx(
+      &config, kernel_instance1, group_scores, scores_with_bias, num_tokens,
+      num_cases, n_group, num_experts_per_group);
+  TORCH_CHECK(status == cudaSuccess,
+              "topk_with_k2_kernel launch failed: ",
+              cudaGetErrorString(status));
+
+  // Stage 2: group selection and expert top-k, one warp per token. The
+  // dynamic shared memory holds the per-warp top-k indices and values.
+  int64_t const topk_with_k_group_num_blocks =
+      (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
+  size_t const dynamic_smem_in_bytes =
+      warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
+                                                           topk);
+  auto* kernel_instance2 = &group_idx_and_topk_idx_kernel<T, IdxT>;
+  config.gridDim = topk_with_k_group_num_blocks;
+  config.dynamicSmemBytes = dynamic_smem_in_bytes;
+  status = cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
+                              topk_values, topk_indices, scores_with_bias,
+                              num_tokens, n_group, topk_group, topk,
+                              num_experts, num_experts_per_group, renormalize,
+                              routed_scaling_factor);
+  TORCH_CHECK(status == cudaSuccess,
+              "group_idx_and_topk_idx_kernel launch failed: ",
+              cudaGetErrorString(status));
+}
+
+// Explicitly instantiates invokeNoAuxTc for one (score type, index type)
+// pair. The parameter list must stay in sync with the template definition.
+#define INSTANTIATE_NOAUX_TC(T, IdxT)                                       \
+  template void invokeNoAuxTc<T, IdxT>(                                     \
+      T * scores, T * group_scores, T * topk_values, IdxT * topk_indices,   \
+      T * scores_with_bias, int64_t const num_tokens,                       \
+      int64_t const num_experts, int64_t const n_group,                     \
+      int64_t const topk_group, int64_t const topk, bool const renormalize, \
+      double const routed_scaling_factor, bool enable_pdl,                  \
+      cudaStream_t const stream);
+
+// Instantiated score types: float, half and bfloat16, all with int32 indices.
+INSTANTIATE_NOAUX_TC(float, int32_t);
+INSTANTIATE_NOAUX_TC(half, int32_t);
+INSTANTIATE_NOAUX_TC(__nv_bfloat16, int32_t);
+}  // end namespace moe
+}  // namespace vllm
+
+// Grouped top-k expert routing (CUDA entry point).
+//
+// scores           : [num_tokens, num_experts] values the returned weights
+//                    are gathered from.
+// scores_with_bias : [num_tokens, num_experts] values used to rank groups
+//                    and experts; same dtype/shape/device as `scores`.
+// n_group          : number of expert groups (1..32, must divide
+//                    num_experts).
+// topk_group       : number of groups kept per token (1..n_group).
+// topk             : number of experts kept per token (1..32).
+// renormalize      : divide the selected weights by their sum.
+// routed_scaling_factor : final multiplier applied to every weight.
+//
+// Returns (topk_values [num_tokens, topk] in the input dtype,
+//          topk_indices [num_tokens, topk] int32).
+// Raises c10::Error on invalid arguments and std::invalid_argument on an
+// unsupported dtype.
+std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
+    torch::Tensor const& scores, torch::Tensor const& scores_with_bias,
+    int64_t n_group, int64_t topk_group, int64_t topk, bool renormalize,
+    double routed_scaling_factor) {
+  // Validate the rank before indexing into sizes().
+  TORCH_CHECK(scores_with_bias.dim() == 2,
+              "scores_with_bias must be a 2D Tensor");
+  auto data_type = scores_with_bias.scalar_type();
+  auto input_size = scores_with_bias.sizes();
+  int64_t num_tokens = input_size[0];
+  int64_t num_experts = input_size[1];
+
+  // Both tensors are reinterpreted as the same element type and indexed as
+  // dense row-major [num_tokens, num_experts] buffers by the kernels.
+  TORCH_CHECK(scores.scalar_type() == data_type,
+              "scores and scores_with_bias must have the same dtype");
+  TORCH_CHECK(scores.sizes() == input_size,
+              "scores and scores_with_bias must have the same shape");
+  TORCH_CHECK(scores.device() == scores_with_bias.device(),
+              "scores and scores_with_bias must be on the same device");
+  TORCH_CHECK(scores.is_contiguous() && scores_with_bias.is_contiguous(),
+              "scores and scores_with_bias must be contiguous");
+
+  TORCH_CHECK(n_group > 0, "n_group should be positive");
+  TORCH_CHECK(num_experts % n_group == 0,
+              "num_experts should be divisible by n_group");
+  TORCH_CHECK(n_group <= 32,
+              "n_group should be smaller than or equal to 32 for now");
+  // The group-selection loop in the kernel cannot terminate when more groups
+  // are requested than exist.
+  TORCH_CHECK(topk_group > 0 && topk_group <= n_group,
+              "topk_group should be in the range [1, n_group]");
+  TORCH_CHECK(topk > 0, "topk should be positive");
+  TORCH_CHECK(topk <= 32, "topk should be smaller than or equal to 32 for now");
+  TORCH_CHECK(topk <= num_experts,
+              "topk should be smaller than or equal to num_experts");
+
+  // Allocate on the input's device (not merely "some CUDA device") so the
+  // buffers match the stream selected below.
+  auto const device = scores_with_bias.device();
+  torch::Tensor group_scores = torch::empty(
+      {num_tokens, n_group}, torch::dtype(data_type).device(device));
+  torch::Tensor topk_values =
+      torch::empty({num_tokens, topk}, torch::dtype(data_type).device(device));
+  torch::Tensor topk_indices = torch::empty(
+      {num_tokens, topk}, torch::dtype(torch::kInt32).device(device));
+
+  auto stream = c10::cuda::getCurrentCUDAStream(scores_with_bias.get_device());
+
+  switch (data_type) {
+    case torch::kFloat16:
+      // Handle Float16
+      vllm::moe::invokeNoAuxTc<half, int32_t>(
+          reinterpret_cast<half*>(scores.mutable_data_ptr()),
+          reinterpret_cast<half*>(group_scores.mutable_data_ptr()),
+          reinterpret_cast<half*>(topk_values.mutable_data_ptr()),
+          reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
+          reinterpret_cast<half*>(scores_with_bias.data_ptr()), num_tokens,
+          num_experts, n_group, topk_group, topk, renormalize,
+          routed_scaling_factor, false, stream);
+      break;
+    case torch::kFloat32:
+      // Handle Float32
+      vllm::moe::invokeNoAuxTc<float, int32_t>(
+          reinterpret_cast<float*>(scores.mutable_data_ptr()),
+          reinterpret_cast<float*>(group_scores.mutable_data_ptr()),
+          reinterpret_cast<float*>(topk_values.mutable_data_ptr()),
+          reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
+          reinterpret_cast<float*>(scores_with_bias.data_ptr()), num_tokens,
+          num_experts, n_group, topk_group, topk, renormalize,
+          routed_scaling_factor, false, stream);
+      break;
+    case torch::kBFloat16:
+      // Handle BFloat16
+      vllm::moe::invokeNoAuxTc<__nv_bfloat16, int32_t>(
+          reinterpret_cast<__nv_bfloat16*>(scores.mutable_data_ptr()),
+          reinterpret_cast<__nv_bfloat16*>(group_scores.mutable_data_ptr()),
+          reinterpret_cast<__nv_bfloat16*>(topk_values.mutable_data_ptr()),
+          reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
+          reinterpret_cast<__nv_bfloat16*>(scores_with_bias.data_ptr()),
+          num_tokens, num_experts, n_group, topk_group, topk, renormalize,
+          routed_scaling_factor, false, stream);
+      break;
+    default:
+      // Handle other data types
+      throw std::invalid_argument(
+          "Invalid dtype, only supports float16, float32, and bfloat16");
+  }
+  return {topk_values, topk_indices};
+}
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@ -22,6 +22,11 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                             torch::Tensor num_tokens_post_pad, int64_t top_k,
                             int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N,
                             int64_t BLOCK_SIZE_K, int64_t bit);
+
+// Grouped top-k expert routing. Returns (topk_values, topk_indices), each
+// holding `topk` entries per row of `scores_with_bias`.
+std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
+    torch::Tensor const& scores, torch::Tensor const& scores_with_bias,
+    int64_t n_group, int64_t topk_group, int64_t topk, bool renormalize,
+    double routed_scaling_factor);
 #endif

 bool moe_permute_unpermute_supported();
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@ -573,7 +573,7 @@ void topk_softmax(
            stream);
    }
    else {
-        assert(topk_indices.scalar_type() == at::ScalarType::Int64);
+        TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
        vllm::moe::topkGatingSoftmaxKernelLauncher(
            gating_output.data_ptr<float>(),
            topk_weights.data_ptr<float>(),
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@ -78,6 +78,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      "output_tensor) -> ()");
  m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows);

+  // Apply grouped topk routing to select experts.
+  m.def(
+      "grouped_topk(Tensor scores, Tensor scores_with_bias, int n_group, int "
+      "topk_group, int topk, bool renormalize, float "
+      "routed_scaling_factor) -> (Tensor, Tensor)");
+  m.impl("grouped_topk", torch::kCUDA, &grouped_topk);
 #endif
 }

--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -130,6 +130,14 @@ void silu_and_mul(torch::Tensor& out, torch::Tensor& input);
 void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input,
                        torch::Tensor& scale);

+// Fused SiLU-and-multiply followed by NVFP4 quantization. Only declared when
+// the build enables NVFP4 for SM100 or SM120.
+#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
+    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+void silu_and_mul_nvfp4_quant(torch::Tensor& out,
+                              torch::Tensor& output_block_scale,
+                              torch::Tensor& input,
+                              torch::Tensor& input_global_scale);
+#endif
+
 void mul_and_silu(torch::Tensor& out, torch::Tensor& input);

 void gelu_and_mul(torch::Tensor& out, torch::Tensor& input);
--- a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
+++ b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
@ -11,6 +11,7 @@
 #include "core/registration.h"

 #include "cutlass/cutlass.h"
+#include <limits>

 #include "cute/tensor.hpp"
 #include "cutlass/gemm/collective/collective_builder.hpp"
@ -169,6 +170,11 @@ struct W4A8GemmKernel {
    int k = A.size(1);
    int n = B.size(1);

+    // safely cast group_size to int
+    TORCH_CHECK(group_size > 0 && group_size <= std::numeric_limits<int>::max(),
+                "group_size out of supported range for int: ", group_size);
+    int const group_size_int = static_cast<int>(group_size);
+
    // Allocate output
    const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
    auto device = A.device();
@ -181,7 +187,7 @@ struct W4A8GemmKernel {
    auto A_ptr = static_cast<MmaType const*>(A.const_data_ptr());
    auto B_ptr = static_cast<QuantType const*>(B.const_data_ptr());
    auto D_ptr = static_cast<ElementD*>(D.data_ptr());
-    // can we avoid harcode the 8 here
+    // can we avoid hardcode the 8 here
    auto S_ptr =
        static_cast<cutlass::Array<ElementScale, ScalePackSize> const*>(
            group_scales.const_data_ptr());
@ -192,7 +198,7 @@ struct W4A8GemmKernel {
        cute::tile_to_shape(LayoutAtomQuant{}, shape_B);

    // strides
-    int const scale_k = cutlass::ceil_div(k, group_size);
+    int const scale_k = cutlass::ceil_div(k, group_size_int);
    StrideA stride_A =
        cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
    // Reverse stride here due to swap and transpose
@ -211,8 +217,8 @@ struct W4A8GemmKernel {
    using EpilogueArguments = typename GemmKernelShuffled::EpilogueArguments;

    MainloopArguments mainloop_arguments{
-        B_ptr, layout_B_reordered, A_ptr,     stride_A,
-        S_ptr, stride_S,           group_size};
+        B_ptr, layout_B_reordered, A_ptr,         stride_A,
+        S_ptr, stride_S,           group_size_int};

    EpilogueArguments epilogue_arguments{
        ChTokScalesEpilogue::prepare_args(channel_scales, token_scales),
--- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
+++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/all.h>
+
+#include <cuda_runtime_api.h>
+#include <cuda_runtime.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <cuda_fp8.h>
+#include "dispatch_utils.h"
+
+#include "cuda_utils.h"
+
+namespace vllm {
+
+// Compile-time mapping between a 16-bit scalar type and its packed two-lane
+// vector type, in both directions (half and bfloat16 only).
+// The primary template falls back to half2 and is kept for generality.
+template <typename T>
+struct TypeConverter {
+  using Type = half2;
+};
+
+// Scalar -> packed pair.
+template <>
+struct TypeConverter<c10::Half> {
+  using Type = half2;
+};
+
+template <>
+struct TypeConverter<c10::BFloat16> {
+  using Type = __nv_bfloat162;
+};
+
+// Packed pair -> scalar.
+template <>
+struct TypeConverter<half2> {
+  using Type = c10::Half;
+};
+
+template <>
+struct TypeConverter<__nv_bfloat162> {
+  using Type = c10::BFloat16;
+};
+
+// Elements handled per thread; used by the host launcher to size the block.
+#define ELTS_PER_THREAD 8
+
+// Elements each thread converts per iteration (one PackedVec, packed into a
+// single uint32_t of FP4 values).
+constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
+// Number of consecutive elements that share one scale factor (SF).
+constexpr int CVT_FP4_SF_VEC_SIZE = 16;
+
+// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
+// Each cvt instruction converts one pair of floats into one byte; note that
+// the source operands are passed in swapped order (e.g. %2, %1), presumably
+// so that the lower-indexed element lands in the low nibble - see the PTX
+// documentation of cvt.e2m1x2 to confirm.
+// Only implemented for __CUDA_ARCH__ >= 1000; other targets return 0.
+inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+  uint32_t val;
+  asm volatile(
+      "{\n"
+      ".reg .b8 byte0;\n"
+      ".reg .b8 byte1;\n"
+      ".reg .b8 byte2;\n"
+      ".reg .b8 byte3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
+      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
+      "}"
+      : "=r"(val)
+      : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]),
+        "f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7]));
+  return val;
+#else
+  // No conversion available on this target.
+  return 0;
+#endif
+}
+
+// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
+// Same conversion as the float[8] overload, with the inputs taken from the
+// .x / .y members of each float2; one output byte is produced per float2.
+// Only implemented for __CUDA_ARCH__ >= 1000; other targets return 0.
+inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+  uint32_t val;
+  asm volatile(
+      "{\n"
+      ".reg .b8 byte0;\n"
+      ".reg .b8 byte1;\n"
+      ".reg .b8 byte2;\n"
+      ".reg .b8 byte3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
+      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
+      "}"
+      : "=r"(val)
+      : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
+        "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
+  return val;
+#else
+  // No conversion available on this target.
+  return 0;
+#endif
+}
+
+// Approximate 1 / a using the PTX rcp.approx.ftz.f32 instruction.
+inline __device__ float reciprocal_approximate_ftz(float a) {
+  float inverse;
+  asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(inverse) : "f"(a));
+  return inverse;
+}
+
+// Returns the byte address inside `SFout` where the scale factor for element
+// (rowIdx, colIdx) must be stored, or nullptr for threads that do not write
+// a scale factor (and on targets below __CUDA_ARCH__ 1000).
+//
+// The scale factors use the tiled layout
+//   [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4 (kTile)]
+// addressed as [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx].
+template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
+__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx,
+                                                       int numCols,
+                                                       SFType* SFout) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+  static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
+                CVT_FP4_NUM_THREADS_PER_SF == 2);
+
+  // Only the first thread of each group sharing a scale factor writes it.
+  // TODO: stage through smem for packed STG.32
+  // is it better than STG.8 from 4 threads ?
+  if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF != 0) {
+    return nullptr;
+  }
+
+  // Coordinates of the scale factor: 16 elements along K share one SF.
+  int32_t const sfCol = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
+  int32_t const sfRow = rowIdx;
+
+  // One K tile spans 4 scale factors, i.e. 4 * 16 columns.
+  int const colsPerKTile = CVT_FP4_SF_VEC_SIZE * 4;
+  int32_t const numKTiles = (numCols + colsPerKTile - 1) / colsPerKTile;
+
+  // Strides of the five layout dimensions, outermost first.
+  int64_t const mTileStride = numKTiles * 32 * 4 * 4;
+  int64_t const kTileStride = 32 * 4 * 4;
+  int64_t const outerMStride = 4 * 4;
+  int64_t const innerMStride = 4;
+  int64_t const innerKStride = 1;
+
+  // Matching indices. The [32, 4] M tile is column-major, hence the
+  // modulo / divide split of the row index.
+  int32_t const mTileIdx = sfRow / (32 * 4);
+  int32_t const kTileIdx = sfCol / 4;
+  int32_t const outerMIdx = sfRow % 32;
+  int32_t const innerMIdx = (sfRow % (32 * 4)) / 32;
+  int32_t const innerKIdx = sfCol % 4;
+
+  int64_t const sfOffset = mTileIdx * mTileStride + kTileIdx * kTileStride +
+                           outerMIdx * outerMStride +
+                           innerMIdx * innerMStride + innerKIdx * innerKStride;
+
+  return reinterpret_cast<uint8_t*>(SFout) + sfOffset;
+#else
+  return nullptr;
+#endif
+}
+
+// Define a 16 bytes packed data type: four two-lane vectors (8 elements of
+// `Type`), where the vector type is looked up through TypeConverter.
+template <class Type>
+struct PackedVec {
+  typename TypeConverter<Type>::Type elts[4];
+};
+
+// FP8 variant: eight two-lane e4m3 vectors.
+template <>
+struct PackedVec<__nv_fp8_e4m3> {
+  __nv_fp8x2_e4m3 elts[8];
+};
+
+// Element-wise SiLU(vec) * vec2 on packed 16-bit pairs.
+// SiLU is evaluated as x * (0.5 * tanh(0.5 * x) + 0.5), i.e. x * sigmoid(x)
+// expressed through tanh. c10::Half uses half2 arithmetic; every other Type
+// goes through __nv_bfloat162.
+template <class Type>
+__inline__ __device__ PackedVec<Type> compute_silu(PackedVec<Type>& vec,
+                                                   PackedVec<Type>& vec2) {
+  using Vec2 = std::conditional_t<std::is_same_v<Type, c10::Half>, half2,
+                                  __nv_bfloat162>;
+  PackedVec<Type> result;
+#pragma unroll
+  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; ++i) {
+    Vec2 half_const(0.5f, 0.5f);
+    Vec2 scaled = __hmul2(vec.elts[i], half_const);
+    Vec2 sigmoid = __hfma2(h2tanh(scaled), half_const, half_const);
+    Vec2 activated = __hmul2(vec.elts[i], sigmoid);
+    result.elts[i] = __hmul2(activated, vec2.elts[i]);
+  }
+  return result;
+}
+
+// Applies SiLU(vec) * vec2 to 8 elements and quantizes the result to FP4.
+//
+// vec, vec2  : the two packed inputs (8 elements each) for this thread.
+// SFScaleVal : global scale applied to the per-block scale factor.
+// SFout      : where to store the 8-bit scale factor, or nullptr if this
+//              thread must not store one.
+// Returns the 8 e2m1 values packed into one uint32_t. Only implemented for
+// __CUDA_ARCH__ >= 1000; other targets return 0.
+//
+// Two neighbouring threads (lane ^ 1) share one scale factor, so the
+// absolute maximum is taken over 16 values.
+template <class Type, bool UE8M0_SF = false>
+__device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec<Type>& vec,
+                                                  PackedVec<Type>& vec2,
+                                                  float SFScaleVal,
+                                                  uint8_t* SFout) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+  PackedVec<Type> out_silu = compute_silu(vec, vec2);
+  // Get absolute maximum values among the local 8 values.
+  auto localMax = __habs2(out_silu.elts[0]);
+
+  // Local maximum value.
+  #pragma unroll
+  for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+    localMax = __hmax2(localMax, __habs2(out_silu.elts[i]));
+  }
+
+  // Get the absolute maximum among all 16 values (two threads).
+  // NOTE(review): the shuffle names all 32 lanes in its mask; confirm that
+  // callers never reach this point with a partially active warp.
+  localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
+  // Get the final absolute maximum values.
+  float vecMax = float(__hmax(localMax.x, localMax.y));
+
+  // Get the SF (max value of the vector / max value of e2m1).
+  // maximum value of e2m1 = 6.0.
+  // TODO: use half as compute data type.
+  float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
+  // 8 bits representation of the SF.
+  uint8_t fp8SFVal;
+  // Round the SF to its 8-bit storage format and keep the rounded value in
+  // SFValue so that the quantization below uses exactly what is stored.
+  if constexpr (UE8M0_SF) {
+    // Extract the 8 exponent bits from float32.
+    // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits.
+    uint32_t tmp = reinterpret_cast<uint32_t&>(SFValue) >> 23;
+    fp8SFVal = tmp & 0xff;
+    // Convert back to fp32.
+    reinterpret_cast<uint32_t&>(SFValue) = tmp << 23;
+  } else {
+    // Here SFValue is always positive, so E4M3 is the same as UE4M3.
+    __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
+    reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp;
+    // Convert back to fp32.
+    SFValue = float(tmp);
+  }
+  // Get the output scale.
+  // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
+  //                       reciprocal(SFScaleVal))
+  // A zero scale factor yields a zero output scale instead of dividing by 0.
+  float outputScale =
+      SFValue != 0 ? reciprocal_approximate_ftz(
+                         SFValue * reciprocal_approximate_ftz(SFScaleVal))
+                   : 0.0f;
+
+  if (SFout) {
+    // Write the SF to global memory (STG.8).
+    *SFout = fp8SFVal;
+  }
+
+  // Convert the input to float.
+  float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
+
+  #pragma unroll
+  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+    if constexpr (std::is_same_v<Type, c10::Half>) {
+      fp2Vals[i] = __half22float2(out_silu.elts[i]);
+    } else {
+      fp2Vals[i] = __bfloat1622float2(out_silu.elts[i]);
+    }
+    fp2Vals[i].x *= outputScale;
+    fp2Vals[i].y *= outputScale;
+  }
+
+  // Convert to e2m1 values.
+  uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
+
+  // Return the packed e2m1 values; the caller stores them to global memory.
+  return e2m1Vec;
+#else
+  return 0;
+#endif
+}
+
+// Fused SiLU-and-multiply + FP4 quantization kernel.
+//
+// numRows, numCols : shape of the OUTPUT in elements; every input row holds
+//                    2 * numCols elements (first half goes through SiLU, the
+//                    second half is the multiplier).
+// in               : input, read as 8-element PackedVec chunks.
+// SFScale          : optional pointer to the global scale (1.0 if null).
+// out              : packed FP4 output, one uint32_t per 8 elements.
+// SFout            : scale-factor output buffer (tiled layout, see
+//                    cvt_quant_to_fp4_get_sf_out_offset).
+//
+// Blocks stride over rows and threads stride over 8-element column chunks.
+// The body is only compiled for __CUDA_ARCH__ >= 1000.
+// Use UE4M3 by default.
+template <class Type, bool UE8M0_SF = false>
+__global__ void
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+__launch_bounds__(1024, 4) silu_and_cvt_fp16_to_fp4(
+#else
+silu_and_cvt_fp16_to_fp4(
+#endif
+    int32_t numRows, int32_t numCols, Type const* in, float const* SFScale,
+    uint32_t* out, uint32_t* SFout) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+  using PackedVec = PackedVec<Type>;
+  static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
+      (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
+  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
+                "Vec size is not matched.");
+
+  // Get the global scaling factor, which will be applied to the SF.
+  // Note SFScale is the same as next GEMM's alpha, which is
+  // (448.f / (Alpha_A / 6.f)).
+  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
+
+  // Row pitches in PackedVec units. They are kept in 64 bits so that the
+  // row * pitch products below cannot overflow 32-bit int on large inputs.
+  int const numColVecs = numCols / CVT_FP4_ELTS_PER_THREAD;
+  int64_t const inRowPitch = numCols * 2 / CVT_FP4_ELTS_PER_THREAD;
+  int64_t const outRowPitch = numColVecs;
+
+  // Input tensor row/col loops.
+  for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
+    for (int colIdx = threadIdx.x; colIdx < numColVecs; colIdx += blockDim.x) {
+      // First half of the row feeds SiLU, second half is the multiplier.
+      int64_t inOffset = rowIdx * inRowPitch + colIdx;
+      int64_t inOffset2 = rowIdx * inRowPitch + numColVecs + colIdx;
+      PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+      PackedVec in_vec2 = reinterpret_cast<PackedVec const*>(in)[inOffset2];
+
+      // Get the output tensor offset: 8 elements are packed into one
+      // uint32_t, and an output row is half as wide as an input row.
+      int64_t outOffset = rowIdx * outRowPitch + colIdx;
+      auto& out_pos = out[outOffset];
+
+      auto sf_out =
+          cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
+                                             CVT_FP4_NUM_THREADS_PER_SF>(
+              rowIdx, colIdx, numCols, SFout);
+
+      out_pos = silu_and_cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(
+          in_vec, in_vec2, SFScaleVal, sf_out);
+    }
+  }
+#endif
+}
+
+}  // namespace vllm
+
+// Host entry point: fused SiLU-and-multiply followed by NVFP4 quantization.
+//
+// output    : quantized result buffer (dispatched as a byte dtype).
+// output_sf : per-block scale factors written by the kernel.
+// input     : 2-D [m, 2 * d] tensor of float16 or bfloat16; d must be a
+//             multiple of 16.
+// input_sf  : global input scale, read as float.
+//
+// Empty inputs (m == 0 or d == 0) return immediately without launching.
+void silu_and_mul_nvfp4_quant(torch::Tensor& output,  // quantized result
+                              torch::Tensor& output_sf,
+                              torch::Tensor& input,  // [m, 2 * d]
+                              torch::Tensor& input_sf) {
+  TORCH_CHECK(input.dtype() == torch::kFloat16 ||
+              input.dtype() == torch::kBFloat16);
+  // Only size(0) and size(1) are consulted below, so other ranks would be
+  // processed incorrectly.
+  TORCH_CHECK(input.dim() == 2, "The input must be a 2D tensor.");
+  int32_t m = input.size(0);
+  int32_t n = input.size(1) / 2;
+  TORCH_CHECK(n % 16 == 0, "The N dimension must be multiple of 16.");
+
+  // Nothing to do; this also avoids a zero-sized block (division by zero
+  // below) and a zero-sized grid.
+  if (m == 0 || n == 0) {
+    return;
+  }
+
+  // Switch to the input's device before querying device attributes: the
+  // query below is issued with device id -1 (presumably "current device").
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  int multiProcessorCount =
+      get_device_attribute(cudaDevAttrMultiProcessorCount, -1);
+  auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
+
+  auto input_sf_ptr = static_cast<float const*>(input_sf.data_ptr());
+  auto sf_out = reinterpret_cast<uint32_t*>(output_sf.data_ptr());
+  auto output_ptr = reinterpret_cast<uint32_t*>(output.data_ptr());
+
+  // One thread per 8 output elements, capped at 1024 threads per block; the
+  // grid is capped at the number of blocks that fit 2048 threads per SM.
+  dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
+  int const numBlocksPerSM = 2048 / block.x;
+  dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
+  VLLM_DISPATCH_HALF_TYPES(
+      input.scalar_type(), "act_and_mul_quant_kernel", [&] {
+        auto input_ptr = reinterpret_cast<scalar_t const*>(input.data_ptr());
+        VLLM_DISPATCH_BYTE_TYPES(
+            output.scalar_type(), "fused_act_and_mul_quant_kernel_nvfp4_type",
+            [&] {
+              vllm::silu_and_cvt_fp16_to_fp4<scalar_t>
+                  <<<grid, block, 0, stream>>>(m, n, input_ptr, input_sf_ptr,
+                                               output_ptr, sf_out);
+            });
+      });
+}
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -115,6 +115,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()");
  ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant);

+#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
+    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+  ops.def(
+      "silu_and_mul_nvfp4_quant(Tensor! result, Tensor! result_block_scale, "
+      "Tensor input, Tensor input_global_scale) -> ()");
+  ops.impl("silu_and_mul_nvfp4_quant", torch::kCUDA, &silu_and_mul_nvfp4_quant);
+#endif
+
  ops.def("mul_and_silu(Tensor! out, Tensor input) -> ()");
  ops.impl("mul_and_silu", torch::kCUDA, &mul_and_silu);

@ -686,6 +694,16 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
      "                     Tensor scale) -> ()");
  cache_ops.impl("concat_and_cache_mla", torch::kCUDA, &concat_and_cache_mla);

+  cache_ops.def(
+      "cp_fused_concat_and_cache_mla(Tensor kv_c, Tensor k_pe,"
+      "                              Tensor cp_local_token_select_indices,"
+      "                              Tensor! kv_cache,"
+      "                              Tensor slot_mapping,"
+      "                              str kv_cache_dtype,"
+      "                              Tensor scale) -> ()");
+  cache_ops.impl("cp_fused_concat_and_cache_mla", torch::kCUDA,
+                 &cp_fused_concat_and_cache_mla);
+
  // Convert the key and value cache to fp8 data type.
  cache_ops.def(
      "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
@ -702,6 +720,11 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
      "                               Tensor scale, Tensor? seq_starts) -> ()");
  cache_ops.impl("gather_and_maybe_dequant_cache", torch::kCUDA,
                 &gather_and_maybe_dequant_cache);
+
+  cache_ops.def(
+      "cp_gather_cache(Tensor src_cache, Tensor! dst, Tensor block_table, "
+      "Tensor cu_seq_lens, int batch_size, Tensor? seq_starts) -> ()");
+  cache_ops.impl("cp_gather_cache", torch::kCUDA, &cp_gather_cache);
 }

 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -237,7 +237,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
 # sync the default value with .buildkite/check-wheel-size.py
-ARG VLLM_MAX_SIZE_MB=400
+ARG VLLM_MAX_SIZE_MB=450
 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
 ARG RUN_WHEEL_CHECK=true
 RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
@ -261,6 +261,8 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
 # Use copy mode to avoid hardlink failures with Docker cache mounts
 ENV UV_LINK_MODE=copy

+# Install libnuma-dev, required by fastsafetensors (fixes #20384)
+RUN apt-get update && apt-get install -y libnuma-dev && rm -rf /var/lib/apt/lists/*
 COPY requirements/lint.txt requirements/lint.txt
 COPY requirements/test.txt requirements/test.txt
 COPY requirements/dev.txt requirements/dev.txt
@ -373,7 +375,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # Install FlashInfer from source
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 # Keep this in sync with "flashinfer" extra in setup.py
-ARG FLASHINFER_GIT_REF="v0.2.12"
+ARG FLASHINFER_GIT_REF="v0.3.0"
 # Flag to control whether to compile FlashInfer AOT kernels
 # Set to "true" to enable AOT compilation:
 # docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
@ -432,11 +434,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 # Install DeepGEMM from source
-ARG DEEPGEMM_GIT_REF="7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c"
+ARG DEEPGEMM_GIT_REF
 COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
 RUN --mount=type=cache,target=/root/.cache/uv \
-    VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "${DEEPGEMM_GIT_REF}" \
-    && rm /tmp/install_deepgemm.sh
+    VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} 

 # Install EP kernels(pplx-kernels and DeepEP), NixL
 COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh
--- a/docs/assets/design/hybrid_kv_cache_manager/basic_grouping_example.png
+++ b/docs/assets/design/hybrid_kv_cache_manager/basic_grouping_example.png
--- a/docs/assets/design/hybrid_kv_cache_manager/full_attn.png
+++ b/docs/assets/design/hybrid_kv_cache_manager/full_attn.png
--- a/docs/assets/design/hybrid_kv_cache_manager/memory_layout.png
+++ b/docs/assets/design/hybrid_kv_cache_manager/memory_layout.png
--- a/docs/assets/design/hybrid_kv_cache_manager/overview.png
+++ b/docs/assets/design/hybrid_kv_cache_manager/overview.png
--- a/docs/assets/design/hybrid_kv_cache_manager/sw_attn.png
+++ b/docs/assets/design/hybrid_kv_cache_manager/sw_attn.png
--- a/docs/community/meetups.md
+++ b/docs/community/meetups.md
@ -2,7 +2,9 @@

 We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:

+- [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet), August 27th 2025. [[Slides]](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing)
 - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH)
+- [vLLM Korea Meetup](https://luma.com/cgcgprmh), August 19th 2025. [[Slides]](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
 - [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA), August 2nd 2025. [[Slides]](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) [[Recording]](https://www.chaspark.com/#/live/1166916873711665152).
 - [NYC vLLM Meetup](https://lu.ma/c1rqyf1f), May 7th, 2025. [[Slides]](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing)
 - [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
--- a/docs/configuration/conserving_memory.md
+++ b/docs/configuration/conserving_memory.md
@ -86,7 +86,7 @@ llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",

 If you run out of CPU RAM, try the following options:

- (Multi-modal models only) you can set the size of multi-modal processor cache by setting `mm_processor_cache_gb` engine argument (default 4 GiB per API process + 4 GiB per engine core process)
+- (Multi-modal models only) you can set the size of multi-modal cache by setting `mm_processor_cache_gb` engine argument (default 4 GiB).
 - (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB).

 ## Multi-modal input limits
--- a/docs/configuration/optimization.md
+++ b/docs/configuration/optimization.md
@ -164,15 +164,20 @@ llm = LLM(
 )
 ```

-!! important
+!!! important
    Batch-level DP is not to be confused with API request-level DP
    (which is instead controlled by `data_parallel_size`).

-The availablilty of batch-level DP is based on model implementation.
-Currently, the following models support `mm_encoder_tp_mode="data"`:
+Batch-level DP needs to be implemented on a per-model basis,
+and enabled by setting `supports_encoder_tp_data = True` in the model class.
+Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to use this feature.

+Known supported models:
+
+- GLM-4.5V, GLM-4.1V (<gh-pr:23168>)
+- Kimi-VL (<gh-pr:23817>)
 - Llama4 (<gh-pr:18368>)
- MiniCPM-V-4 (<gh-pr:23327>)
+- MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
 - Qwen2.5-VL (<gh-pr:22742>)
 - Step3 (<gh-pr:22697>)

@ -204,20 +209,33 @@ vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -dp 2
    to avoid CPU resource exhaustion.

 !!! note
-    [Multi-modal processor cache](#processor-cache) is disabled when API server scale-out is enabled
-    because it requires a one-to-one correspondance between API and engine core processes.
+    API server scale-out disables [multi-modal IPC caching](#ipc-caching)
+    because it requires a one-to-one correspondence between API and engine core processes.
+
+    This does not impact [multi-modal processor caching](#processor-caching).

 ## Multi-Modal Caching

-### Processor Cache
-
-By default, the multi-modal processor cache is enabled to avoid repeatedly processing
-the same multi-modal inputs via Hugging Face `AutoProcessor`,
+Multi-modal caching avoids repeated transfer or processing of the same multi-modal data,
 which commonly occurs in multi-turn conversations.

-You can adjust the size of the cache by setting the value of `mm_processor_cache_gb`
-(default 4 GiB per API process + 4 GiB per engine core process).
-If you do not benefit much from the cache, you can disable it completely via `mm_processor_cache_gb=0`.
+### Processor Caching
+
+Multi-modal processor caching is automatically enabled
+to avoid repeatedly processing the same multi-modal inputs in `BaseMultiModalProcessor`.
+
+### IPC Caching
+
+Multi-modal IPC caching is automatically enabled when
+there is a one-to-one correspondence between API (`P0`) and engine core (`P1`) processes,
+to avoid repeatedly transferring the same multi-modal inputs between them.
+
+### Configuration
+
+You can adjust the size of the cache by setting the value of `mm_processor_cache_gb` (default 4 GiB).
+
+If you do not benefit much from the cache, you can disable both IPC
+and processor caching completely via `mm_processor_cache_gb=0`.

 Examples:

@ -230,3 +248,16 @@ llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
 llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
          mm_processor_cache_gb=0)
 ```
+
+### Cache Placement
+
+Based on the configuration, the contents of the multi-modal caches on `P0` and `P1` are as follows:
+
+| Processor Caching | IPC Caching | `P0` Cache | `P1` Cache | Max. Memory |
+|-------------------|-------------|------------|------------|-------------|
+| ✅ | ✅ | K | K + V | `mm_processor_cache_gb * data_parallel_size` |
+| ✅ | ❌ | K + V | N/A | `mm_processor_cache_gb * api_server_count` |
+| ❌ | ❌ | N/A | N/A | `0` |
+
+K: Stores the hashes of multi-modal items  
+V: Stores the processed tensor data of multi-modal items
--- a/docs/configuration/tpu.md
+++ b/docs/configuration/tpu.md
@ -45,32 +45,32 @@ This initial compilation time ranges significantly and is impacted by many of th

 ### Optimize based on your data

-#### max model len vs. most model len
+#### max-model-len vs. most-model-len

 ![most_model_len](../assets/design/tpu/most_model_len.png)

-If most of your requests are shorter than the maximum model length but you still need to accommodate occasional longer requests, setting a high maximum model length can negatively impact performance. In these cases, you can try introducing most model len by specifying the `VLLM_TPU_MOST_MODEL_LEN` environment variable.
+If most of your requests are shorter than the maximum model length but you still need to accommodate occasional longer requests, setting a high maximum model length can negatively impact performance. In these cases, you can try introducing most-model-len by specifying the `VLLM_TPU_MOST_MODEL_LEN` environment variable.

 For example, 1% requests are 32k length and 99% requests are 2k length. You can pass 32k into `--max-model-len 32768` and use `VLLM_TPU_MOST_MODEL_LEN=2048`.

-The requests get subdivided into max-model-len and most-model-len categories, for the latter category, we can gain better performance since the server can process more requests at a time.
+The requests get subdivided into max-model-len and most-model-len categories; for the latter category, you can gain better performance since the server can process more requests at a time.

 #### Padding

-For online serving with latency requirements, consider switching to bucket padding by setting the `VLLM_TPU_BUCKET_PADDING_GAP` environment variable. Because of the layout of the TPU, try using increments of 128: 128, 256, etc.
+For online serving with latency requirements, consider switching to bucket padding by setting the `VLLM_TPU_BUCKET_PADDING_GAP` environment variable. Because of the layout of the TPU, try using increments of 128 (e.g., 128, 256).

-The server pads the requests into fixed lengths before sending them to the model to avoid recompilation. To read more about tpu padding, see [here](https://cloud.google.com/tpu/docs/performance-guide#xla-efficiencies). Currently, there are 2 ways to pad the requests:
+The server pads the requests into fixed lengths before sending them to the model to avoid recompilation. To read more about TPU padding, see [here](https://cloud.google.com/tpu/docs/performance-guide#xla-efficiencies). Currently, there are 2 ways to pad the requests:

-1) the default exponential padding (pad to the nearest power of 2)
-2) bucket padding (pad to the nearest linearly increasing bucket).
+1. the default exponential padding (pad to the nearest power of 2)
+2. bucket padding (pad to the nearest linearly increasing bucket).

 When using bucket padding, the buckets start from 16, end at max_model_len, and increment by `VLLM_TPU_BUCKET_PADDING_GAP`.

 For example, max_model_len=512, padding_gap=64, the buckets will be [16, 32, 64, 128, 192, 256, 320, 384, 448, 512].

-The fewer tokens we pad, the less unnecessary computation TPU does, the better performance we can get. For example, if num_tokens=300, with exponential padding, we pad to 512, with the bucket_padding above, we pad to 320.
+The fewer tokens you pad, the less unnecessary computation the TPU does, and the better performance you can get. For example, if num_tokens=300, with exponential padding, you pad to 512; with the bucket_padding above, you pad to 320.

-However, you need to be careful to choose the padding gap. If the gap is too small, it means the number of buckets is large, leading to increased warmup (precompile) time and higher memory to store the compiled graph. Too many compilaed graphs may lead to HBM OOM. Conversely, an overly large gap yields no performance improvement compared to the default exponential padding.
+However, you need to be careful to choose the padding gap. If the gap is too small, it means the number of buckets is large, leading to increased warmup (precompile) time and higher memory to store the compiled graph. Too many compiled graphs may lead to HBM OOM. Conversely, an overly large gap yields no performance improvement compared to the default exponential padding.

 #### Quantization

--- a/docs/contributing/ci/update_pytorch_version.md
+++ b/docs/contributing/ci/update_pytorch_version.md
@ -90,7 +90,7 @@ address the long build time at its source, the current workaround is to set `VLL
 to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`)
 when manually triggering a build on Buildkite. This branch accomplishes two things:

-1. Increase the timeout limit to 10 hours so that the build doesn't timeout.
+1. Increase the timeout limit to 10 hours so that the build doesn't time out.
 2. Allow the compiled artifacts to be written to the vLLM sccache S3 bucket
 to warm it up so that future builds are faster.

--- a/docs/contributing/model/basic.md
+++ b/docs/contributing/model/basic.md
@ -121,3 +121,31 @@ To support a model with interleaving sliding windows, we need to take care of th
 - In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171).

 With these two steps, interleave sliding windows should work with the model.
+
+### How to support models that use Mamba?
+
+We consider 3 different scenarios:
+
+1. Models that use Mamba layers (either Mamba-1 or Mamba-2) but do not use attention layers.
+2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers.
+3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers.
+
+For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](gh-file:vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](gh-file:vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
+The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
+For the mamba layers themselves, please use the [`MambaMixer`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
+Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
+V0-only classes and code will be removed in the very near future.
+The model should also be added to the `MODELS_CONFIG_MAP` dictionary in <gh-file:vllm/model_executor/models/config.py> to ensure that the runtime defaults are optimized.
+
+For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](gh-file:vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](gh-file:vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
+These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol).
+
+For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](gh-file:vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](gh-file:vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
+Please follow the same guidelines as case (2) for implementing these models.
+We use "mamba-like" to refer to layers that possess a state that is updated in-place, rather than being appended to (like KV cache for attention).
+For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
+It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
+Please see [`LinearAttentionMetadata`](gh-file:vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](gh-file:vllm/v1/attention/backends/short_conv_attn.py) for examples of this.
+Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it.
+Please see the calls to `direct_register_custom_op` in <gh-file:vllm/model_executor/models/minimax_text_01.py> or <gh-file:vllm/model_executor/layers/mamba/short_conv.py> for examples of this.
+The new custom op should then be added to the list `_attention_ops` in <gh-file:vllm/config/compilation.py> to ensure that piecewise CUDA graphs work as intended.
--- a/docs/contributing/model/multimodal.md
+++ b/docs/contributing/model/multimodal.md
@ -855,7 +855,7 @@ Examples:

 ### Custom HF processor

-Some models don't define a HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to [_call_hf_processor][vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor].
+Some models don't define an HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to [_call_hf_processor][vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor].

 Examples:

--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@ -73,6 +73,8 @@ apt install nsight-systems-cli

 ### Example commands and usage

+When profiling with `nsys`, it is advisable to set the environment variable `VLLM_WORKER_MULTIPROC_METHOD=spawn`. The default is to use the `fork` method instead of `spawn`. More information on the topic can be found in the [Nsight Systems release notes](https://docs.nvidia.com/nsight-systems/ReleaseNotes/index.html#general-issues).
+
 #### Offline Inference

 For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` before any existing script you would run for offline inference.
--- a/docs/deployment/frameworks/lobe-chat.md
+++ b/docs/deployment/frameworks/lobe-chat.md
@ -6,6 +6,6 @@ Supports speech-synthesis, multi-modal, and extensible (function call) plugin sy

 One-click FREE deployment of your private OpenAI ChatGPT/Claude/Gemini/Groq/Ollama chat application.

-It supports vLLM as a AI model provider to efficiently serve large language models.
+It supports vLLM as an AI model provider to efficiently serve large language models.

 For details, see the tutorial [Using vLLM in LobeChat](https://lobehub.com/docs/usage/providers/vllm).
--- a/docs/deployment/frameworks/lws.md
+++ b/docs/deployment/frameworks/lws.md
@ -22,7 +22,7 @@ Deploy the following yaml file `lws.yaml`
    metadata:
      name: vllm
    spec:
-      replicas: 2
+      replicas: 1
      leaderWorkerTemplate:
        size: 2
        restartPolicy: RecreateGroupOnPodRestart
@ -41,7 +41,7 @@ Deploy the following yaml file `lws.yaml`
                  - sh
                  - -c
                  - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); 
-                    python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
+                    vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2"
                resources:
                  limits:
                    nvidia.com/gpu: "8"
@ -126,8 +126,6 @@ Should get an output similar to this:
 NAME       READY   STATUS    RESTARTS   AGE
 vllm-0     1/1     Running   0          2s
 vllm-0-1   1/1     Running   0          2s
-vllm-1     1/1     Running   0          2s
-vllm-1-1   1/1     Running   0          2s
 ```

 Verify that the distributed tensor-parallel inference works:
--- a/docs/deployment/k8s.md
+++ b/docs/deployment/k8s.md
@ -380,7 +380,7 @@ INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)

 ### Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated"

-If the startup or readiness probe failureThreshold is too low for the time needed to startup the server, Kubernetes scheduler will kill the container. A couple of indications that this has happened:
+If the startup or readiness probe failureThreshold is too low for the time needed to start up the server, Kubernetes scheduler will kill the container. A couple of indications that this has happened:

 1. container log contains "KeyboardInterrupt: terminated"
 2. `kubectl get events` shows message `Container $NAME failed startup probe, will be restarted`
--- a/docs/design/fused_moe_modular_kernel.md
+++ b/docs/design/fused_moe_modular_kernel.md
@ -54,8 +54,8 @@ The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExperts

 ### FusedMoEPrepareAndFinalize

-The `FusedMoEPrepareAndFinalize` abstract class exposes `prepare` and `finalize` functions.
-The `prepare` function is responsible for input activation Quantization and All2All Dispatch. The `finalize` function is responsible for invoking the All2All Combine. Additionally the `finalize` function may or may not do the TopK weight application and reduction (Please refer to the TopKWeightAndReduce section)
+The `FusedMoEPrepareAndFinalize` abstract class exposes `prepare`, `prepare_no_receive`  and `finalize` functions.
+The `prepare` function is responsible for input activation Quantization and All2All Dispatch. If implemented, the `prepare_no_receive` function is like `prepare` except it does not wait to receive results from other workers. Instead, it returns a "receiver" callback that must be invoked to wait for the final results of the worker. It is not required that this method is supported by all `FusedMoEPrepareAndFinalize` classes, but if it is available, it can be used to interleave work with the initial all-to-all communication, e.g. interleaving shared experts with fused experts. The `finalize` function is responsible for invoking the All2All Combine. Additionally the `finalize` function may or may not do the TopK weight application and reduction (Please refer to the TopKWeightAndReduce section)

 ![](../assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png "FusedMoEPrepareAndFinalize Blocks")

@ -133,12 +133,12 @@ class FusedMoEModularKernel:
 Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & Combine implementation / kernel. For example,

 * PplxPrepareAndFinalize type is backed by Pplx All2All kernels,
-* DeepEPHTPrepareAndFinalize type is backed by DeepEP High-Throughtput All2All kernels, and
+* DeepEPHTPrepareAndFinalize type is backed by DeepEP High-Throughput All2All kernels, and
 * DeepEPLLPrepareAndFinalize type is backed by DeepEP Low-Latency All2All kernels.

 #### Step 1: Add an All2All manager

-The purpose of the All2All Manager is to setup the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](gh-file:vllm/distributed/device_communicators/all2all.py).
+The purpose of the All2All Manager is to set up the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](gh-file:vllm/distributed/device_communicators/all2all.py).

 #### Step 2: Add a FusedMoEPrepareAndFinalize Type

@ -146,6 +146,10 @@ This section describes the significance of the various functions exposed by the

 `FusedMoEPrepareAndFinalize::prepare()`: The prepare method implements the Quantization and All2All Dispatch. Typically the Dispatch function from the relevant All2All Manager is invoked.

+`FusedMoEPrepareAndFinalize::has_prepare_no_receive()`: Indicates whether or not this subclass implements `prepare_no_receive`. Defaults to False.
+
+`FusedMoEPrepareAndFinalize::prepare_no_receive()`: The prepare_no_receive method implements the Quantization and All2All Dispatch. It does not wait for the result of the dispatch operation but instead returns a thunk that can be invoked to wait for the final results. Typically the Dispatch function from the relevant All2All Manager is invoked.
+
 `FusedMoEPrepareAndFinalize::finalize()`: Maybe perform TopK Weight Application and Reduction and All2All Combine. Typically the Combine function from the relevant All2AllManager is invoked.

 `FusedMoEPrepareAndFinalize::activation_format()`: Return `FusedMoEActivationFormat.BatchedExperts` if the output of the prepare method (i.e. the All2All dispatch) is Batched. Return `FusedMoEActivationFormat.Standard` otherwise.
@ -183,7 +187,7 @@ implementations that input `FusedMoEActivationFormat.Standard` support chunking

 #### maybe_make_prepare_finalize

-The `maybe_make_prepare_finalize` method is responsbile for constructing an instance of `FusedMoEPrepareAndFinalize` when appropriate based on the current all2all backend, e.g. when EP + DP is enabled.  The base class method currently constructs all the `FusedMoEPrepareAndFinalize` objects for the EP+DP case.  Derived classes can override this method to construct prepare/finalize objects for different scenarios, e.g. `ModelOptNvFp4FusedMoE` can construct a `FlashInferCutlassMoEPrepareAndFinalize` for the EP+TP case.
+The `maybe_make_prepare_finalize` method is responsible for constructing an instance of `FusedMoEPrepareAndFinalize` when appropriate based on the current all2all backend, e.g. when EP + DP is enabled.  The base class method currently constructs all the `FusedMoEPrepareAndFinalize` objects for the EP+DP case.  Derived classes can override this method to construct prepare/finalize objects for different scenarios, e.g. `ModelOptNvFp4FusedMoE` can construct a `FlashInferCutlassMoEPrepareAndFinalize` for the EP+TP case.
 Please refer to the implementations in,

 * `ModelOptNvFp4FusedMoE`
@ -198,7 +202,7 @@ Please refer to the implementations in,
 * `CompressedTensorsW8A8Fp8MoECutlassMethod`
 * `Fp8MoEMethod`
 * `ModelOptNvFp4FusedMoE`
-dervied classes.
+derived classes.

 #### init_prepare_finalize

--- a/docs/design/hybrid_kv_cache_manager.md
+++ b/docs/design/hybrid_kv_cache_manager.md
@ -0,0 +1,245 @@
+# Hybrid KV Cache Manager
+
+!!! warning
+    This document was written based on commit [458e74](https://github.com/vllm-project/vllm/commit/458e74eb907f96069e6d8a4f3c9f457001fef2ea). This feature is still in its early stage and things may change.
+
+## What is a hybrid model?
+
+Many recent "hybrid" LLMs combine multiple attention types within one model. For example:
+
+1. Sliding window attention (sw) + full attention (full): gpt-oss, Gemma 2/3, Ministral, cohere, etc.
+2. Mamba + full: Bamba, Jamba, Minimax, etc.
+3. Local chunked attention + full: Llama4
+
+To serve these models efficiently, our [KVCacheManager][vllm.v1.core.kv_cache_manager.KVCacheManager] must:
+
+1. Allocate different slots to different layer types, for example:
+    - Full attention layers: reserve slots for **all** tokens.
+    - Sliding window layers: reserve slots only for the most recent **`sliding_window_size`** tokens.
+2. Support layer-specific prefix-cache rules, for example:
+    - Full attention: a cache hit prefix requires **all** tokens remain in the KV cache.
+    - Sliding window: a cache hit prefix only requires the last **`sliding_window_size`** tokens remain in the KV cache.
+
+## Definitions
+
+1. **kv hidden size**: The number of bytes to store one token's KV cache for a single layer.
+2. **block**: the memory reserved for kv cache is divided into multiple *blocks* with the same *page size* (defined below)
+3. **block size**: number of tokens inside a block
+4. **page size**: the physical memory size of a block, defined as:
+
+    $$
+    \text{num_layers} \times \text{block_size} \times \text{kv_hidden_size}
+    $$
+
+    `num_layers` doesn't mean the total number of layers in the model. The exact number depends on the context in this doc.
+
+    !!! note
+        This is different from `KVCacheSpec.page_size_bytes` in the code, which is defined as:
+
+        $$
+        \text{block_size} \times \text{kv_hidden_size}
+        $$
+
+## Allocation
+
+### High level idea
+
+We use a single memory pool for all layer types. The memory pool is split into multiple blocks with the same page size. [KVCacheManager][vllm.v1.core.kv_cache_manager.KVCacheManager] allocates different numbers of blocks to different layers according to their attention types.
+
+The core challenge is ensuring every layer type uses the same **page size**.  For full-attention-only models, the page size is straightforward, defined as:
+
+$$
+\text{page_size} = \text{block_size} \times \text{num_hidden_layers} \times \text{kv_hidden_size}
+$$
+
+However, in hybrid models, `num_hidden_layers` varies by attention type, which would normally produce mismatched page sizes. The cases below show how we unify them.
+
+### Case 1: toy model
+
+Let's start with a toy example: a model has 1 full attention layer and 3 sliding window attention layers. All layers have the same `kv_hidden_size`.
+
+We let each block hold `block_size` tokens for one layer, so:
+
+$$
+\text{page_size} = \text{kv_hidden_size} \times \text{block_size}
+$$
+
+[KVCacheManager][vllm.v1.core.kv_cache_manager.KVCacheManager] allocates a different number of blocks to each layer.
+
+This case is only a toy example. For real models, please refer to the following cases.
+
+### Case 2: same `kv_hidden_size` and a regular pattern
+
+When the model has more layers, e.g., 20 sliding window attention layers and 10 full attention layers with the same `kv_hidden_size`, calling the allocator once per layer (30 calls) is OK but becomes inefficient. As a solution, we group the allocation of layers that need the same number of blocks to reduce the number of calls.
+
+The grouping is feasible because there is usually a beautiful ratio between the number of different types of layers. For example:
+
+- Gemma-2: 1 sw : 1 full
+- Llama 4: 3 local : 1 full
+
+Our example can be regarded as 2 sw : 1 full. We can allocate blocks as if there are 2 sw and 1 full in the model, and repeat the result 10 times to generate the `block_ids` for the 30 layers. The page size becomes:
+
+$$
+10 \times \text{kv_hidden_size} \times \text{block_size}
+$$
+
+Assume `block_size` 16, sliding window size 32, request length 112, then for the above example model, we need to allocate 11 blocks (0-6 for full, 7-8 for sw group 1, 9-10 for sw group 2).
+
+![Allocation Result](../assets/design/hybrid_kv_cache_manager/basic_grouping_example.png)
+
+Here, "/" denotes no block needed (sliding‑window layers don't need slots for early tokens).
+
+See the formal definition below. The layers are divided into multiple *KV Cache Groups* so that there is:
+
+1. **Identical attention type inside each group**: Each group only contains layers with the same attention type and thus needs the same number of blocks for a given request. This enables layers in the same group to share the same block ids without memory waste.
+2. **Identical page size across groups**: Because our memory pool only has one page size.
+
+Our example model is divided into 3 KV cache groups:
+
+- Group 0: 10 full attention layers (full.0 - full.9)
+- Group 1: 10 sliding window attention layers (sw.0 - sw.9)
+- Group 2: 10 sliding window attention layers (sw.10 - sw.19)
+
+Obviously, it satisfies rule 1. For rule 2, all 3 groups have
+
+$$
+10 \times \text{kv_hidden_size} \times \text{block_size}
+$$
+
+as their page size.
+
+### Case 3: same `kv_hidden_size` and no regular pattern
+
+Unfortunately, not all models have such a beautiful ratio, and the approach in Case 2 will produce too many small groups. For example, Gemma-3-27b has 52 sliding window attention layers and 10 full attention layers. With the constraints in case 2, it would be 26 sliding window groups and 5 full attention groups, each containing 2 layers. The allocation is still inefficient. To reduce the number of kv cache groups, we group layers using the smallest layer count among all attention types. For example, min(52, 10)=10 layers per group in Gemma-3-27b. Then the grouping result is:
+
+- Group 0: 10 full attention layers (full.0 - full.9)
+- Group 1: 10 sliding window attention layers (sw.0 - sw.9)
+- Group 2: 10 sliding window attention layers (sw.10 - sw.19)
+- ...
+- Group 6: 10 sliding window attention layers (sw.40 - sw.49)
+- Group 7: 2 sliding window attention layers (sw.50 - sw.51) and 8 padding layers
+
+We will update this algorithm if this heuristic leads to a bad result when a new model comes out (e.g., 20 full + 30 sw, the group size should be 10 instead of 20).
+
+This case happens in Gemma-3 series models, and models in case 2 but with eagle speculative decoding which introduces one full attention layer. The solution has some memory waste and is not perfect. Please report any cases where padding overhead becomes unacceptable so we can refine the algorithm.
+
+### Case 4: different `kv_hidden_size` (mainly hybrid mamba models)
+
+Some architectures (e.g., Bamba, Jamba, Minimax) interleave standard attention layers with Mamba layers, where each Mamba layer's state size per token can be much larger than the attention layers' `kv_hidden_size`. Because we only support a single page size across all groups, we must reconcile these differing hidden sizes.
+
+The current algorithm is:
+
+1. Increase the `block_size` of attention layers until
+    $$
+    \text{block_size} \times \text{kv_hidden_size}_{\text{att}} \ge \text{state_size}_{\text{mamba}}
+    $$
+2. Pad the mamba state per layer to
+    $$
+    \text{block_size} \times \text{kv_hidden_size}_{\text{att}}
+    $$
+3. Apply the grouping strategy in case 3.
+
+!!! note
+    This can lead to a `block_size` of more than 400 for attention layers, which is too large. Another padding strategy is to increase `block_size` until
+
+    $$
+    \text{block_size} \times \text{kv_hidden_size}_{\text{att}} \times \text{num_attn_layers} \ge \text{state_size}_{\text{mamba}}
+    $$
+
+    This padding strategy is still a work in progress.
+
+### Case 5: KV sharing
+
+KV sharing refers to a layer using the KV cache of another layer, e.g., gemma-3n.
+In these models, [KVCacheManager][vllm.v1.core.kv_cache_manager.KVCacheManager] ignores all layers with kv sharing and only allocates KV cache for layers that need kv cache, and some patches are made in model runner to apply the allocation result to kv sharing layers.
+
+## Prefix caching
+
+For simplicity, we assume `block_size=1` in this section.
+
+### High level idea
+
+The block pool uses a dict similar to `tuple(block_hash, group_id) -> block` to cache the full blocks. That means the same tokens of different groups are cached and evicted independently.
+
+When a new request comes in, we check the cache hit prefix of each group, and return the intersection of these groups as the cached prefix of the request. See below for the detailed algorithm for checking the cache hit of one group & performing the intersection.
+
+### Case 0: full attention only models
+
+For full attention layers, blocks are allocated for all tokens in the request. For details on the underlying design, see [Prefix Caching](prefix_caching.md)
+
+To find the longest cache hit prefix of a request, we enumerate from left (the first block) to right (the last block), checking whether the block is cached, and exit when cache misses. For example, we will return the first 7 tokens (0-6) as the cache hit prefix in the below example (blue blocks are cached):
+
+![Prefix Caching of Full Attention](../assets/design/hybrid_kv_cache_manager/full_attn.png)
+
+### Case 1: sliding window attention only models
+
+For sliding window attention layers, a naive implementation for memory allocation is to allocate `sliding_window_size` blocks and fill in the blocks in a round-robin way. But this naive implementation is not compatible with prefix caching so we didn't pick this design. In vLLM,  we allocate different blocks for different tokens and free blocks that are outside the sliding window.
+
+For a new request, the cache hit prefix only requires the last `sliding_window_size - 1` tokens being cached.
+Let's say `sliding_window_size = 4` and `block_size = 1`, and the request is a 15-token prompt (blue blocks are cached):
+
+![Prefix Caching of Sliding Window Attention](../assets/design/hybrid_kv_cache_manager/sw_attn.png)
+
+There are 3 possible cache hit prefixes:
+
+- cache hit length 5, compute prefill with [2, 3, 4] → [5, 6, …, 14]
+- cache hit length 6, compute prefill with [3, 4, 5] → [6, 7, …, 14]
+- cache hit length 14, compute prefill with [11, 12, 13] → [14] (most efficient)
+
+We can check the cache hit from right to left, and early exit when we find a match. This is opposite from full attention, where we check from left to right and early exit when the match fails. One potential drawback (compared to full attention) is that we end up iterating over the entire list of tokens when there's no match, which is often a common case. This could potentially cause non-negligible overheads, but is fine with full + swa, as discussed below.
+
+### Case 2: sliding window attention + full attention models
+
+The first problem is how to find the cache hit prefix. We need to "intersect" the cache hits of global and sliding window attention layers by:
+
+1. Get the longest cache hit for full attention (scanning from left to right)
+2. Get the longest cache hit for sliding window attention that is within that length. Implemented by checking cache hits from right to left starting from the cache hit length of full attention.
+
+It can be ensured that the resulting cache hit of sliding window attention layers is also a cache hit of full attention layers. This is more efficient than finding all possible prefixes of each group and doing the intersection, because our approach can exit early if there is no cache hit.
+
+The algorithm applies to models with exactly two attention types: full attention + X, where X can be an arbitrary efficient attention algorithm like sliding window, llama 4 local attention, and mamba. It doesn't support models without full attention layers, and models with more than 2 types of attention. This is enough for most hybrid models at the moment of writing this doc.
+
+The second question is the cache eviction policy. For now, we use one LRU queue for all kv cache groups. The blocks are added to the LRU queue when freed, either because the request is finished or the block is out of the sliding window.
+
+### Case 3: mamba models
+
+The prefix caching support of the mamba model is a work in progress. Once implemented, models with mamba layer + full attention layer can be supported via the full attention + X algorithm in case 2.
+
+## Implementation
+
+### Overview
+
+![Overview of Hybrid KV Cache Manager](../assets/design/hybrid_kv_cache_manager/overview.png)
+
+The `KVCacheManager` is organized into 3 layers:
+
+- **[KVCacheManager][vllm.v1.core.kv_cache_manager.KVCacheManager]**: The interface between the scheduler and kv cache management system.
+- **[KVCacheCoordinator][vllm.v1.core.kv_cache_coordinator.KVCacheCoordinator]**: coordinate per-group SingleTypeKVCacheManagers to generate the allocation result of a request. Depending on the model's configuration, one of these coordinators is chosen:
+    - **[KVCacheCoordinatorNoPrefixCache][vllm.v1.core.kv_cache_coordinator.KVCacheCoordinatorNoPrefixCache]**: Used when prefix caching is disabled.
+    - **[UnitaryKVCacheCoordinator][vllm.v1.core.kv_cache_coordinator.UnitaryKVCacheCoordinator]**: Used when there is only one KV cache group. The prefix caching logic is simplified as no intersection is needed.
+    - **[HybridKVCacheCoordinator][vllm.v1.core.kv_cache_coordinator.HybridKVCacheCoordinator]**: Handles exactly two KV cache groups (must include one full‑attention group plus one other efficient‑attention group). Other cases are not implemented. You can disable prefix caching to use the KVCacheCoordinatorNoPrefixCache.
+- **[SingleTypeKVCacheManager][vllm.v1.core.single_type_kv_cache_manager.SingleTypeKVCacheManager]**: Each instance manages allocation and prefix caching for one KV cache group, implementing the attention‑type–specific logic (e.g., full attention, sliding window, Mamba).
+
+The blue box in the above figure shows the case with 10 full attention layers and 20 sliding window attention layers, thus:
+
+- use `HybridKVCacheCoordinator`
+- use 1 `FullAttentionManager` and 2 `SlidingWindowManager` for the 3 `KVCacheGroup`s.
+
+### Memory Layout
+
+For a model with n `KVCacheGroup`s, each with m layers, we allocate m buffers. Each buffer is shared by n layers, one from each group.
+
+The following figure is for a model with 10 full attention layers (full.0 - full.9) and 20 sliding window attention layers (sw.0-sw.19). It follows "case 2" in "Allocation" section and is divided into 3 groups:
+
+- Group 0: 10 full attention layers (full.0 - full.9)
+- Group 1: 10 sliding window attention layers (sw.0 - sw.9)
+- Group 2: 10 sliding window attention layers (sw.10 - sw.19)
+
+And for a request, we allocate 11 blocks with `block_id` 0-6 to group 0, 7-8 to group 1, and 9-10 to group 2.
+
+With such an example, the physical memory is divided into 10 buffers (`KVCacheTensor` 0 - `KVCacheTensor` 9). Each buffer is shared by 3 layers (e.g., `KVCacheTensor` 0 is shared by full.0 from group 0, sw.0 from group 1, and sw.10 from group 2) and is divided into pieces with size `block_size * kv_hidden_size`. The KV cache of these 3 attention layers are saved to different pieces of the buffer based on the allocated `block_ids`:
+
+![Example Memory Layout](../assets/design/hybrid_kv_cache_manager/memory_layout.png)
+
+!!! note
+    One logical "block" is mapped to 10 pieces in the 10 buffers of the physical memory.
--- a/docs/design/io_processor_plugins.md
+++ b/docs/design/io_processor_plugins.md
@ -0,0 +1,78 @@
+# IO Processor Plugins
+
+IO Processor plugins are a feature that allows pre and post processing of the model input and output for pooling models. The idea is that users are allowed to pass a custom input to vLLM that is converted into one or more model prompts and fed to the model `encode` method. One potential use-case of such plugins is that of using vLLM for generating multi-modal data. Say users feed an image to vLLM and get an image in output.
+
+When performing an inference with IO Processor plugins, the prompt type is defined by the plugin and the same is valid for the final request output. vLLM does not perform any validation of input/output data, and it is up to the plugin to ensure the correct data is being fed to the model and returned to the user. As of now these plugins support only pooling models and can be triggered via the `encode` method in `LLM` and `AsyncLLM`, or in online serving mode via the `/pooling` endpoint.
+
+## Writing an IO Processor Plugin
+
+IO Processor plugins implement the `IOProcessor` interface (<gh-file:vllm/plugins/io_processors/interface.py>):
+
+```python
+IOProcessorInput = TypeVar('IOProcessorInput')
+IOProcessorOutput = TypeVar('IOProcessorOutput')
+
+class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
+
+    def __init__(self, vllm_config: VllmConfig):
+        self.vllm_config = vllm_config
+
+    @abstractmethod
+    def pre_process(
+        self,
+        prompt: IOProcessorInput,
+        request_id: Optional[str] = None,
+        **kwargs,
+    ) -> Union[PromptType, Sequence[PromptType]]:
+        raise NotImplementedError
+
+    async def pre_process_async(
+        self,
+        prompt: IOProcessorInput,
+        request_id: Optional[str] = None,
+        **kwargs,
+    ) -> Union[PromptType, Sequence[PromptType]]:
+        return self.pre_process(prompt, request_id, **kwargs)
+
+    @abstractmethod
+    def post_process(self,
+                     model_output: Sequence[PoolingRequestOutput],
+                     request_id: Optional[str] = None,
+                     **kwargs) -> IOProcessorOutput:
+        raise NotImplementedError
+
+    async def post_process_async(
+        self,
+        model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
+        request_id: Optional[str] = None,
+        **kwargs,
+    ) -> IOProcessorOutput:
+        collected_output = [item async for i, item in model_output]
+        return self.post_process(collected_output, request_id, **kwargs)
+
+    @abstractmethod
+    def parse_request(self, request: Any) -> IOProcessorInput:
+        raise NotImplementedError
+
+    @abstractmethod
+    def output_to_response(
+            self, plugin_output: IOProcessorOutput) -> IOProcessorResponse:
+        raise NotImplementedError
+```
+
+The `parse_request` method is used for validating the user prompt and converting it into the input expected by the `pre_process`/`pre_process_async` methods.
+The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference.
+The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output.
+
+The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/io_processor_pooling` serving endpoint is available here <gh-file:vllm/entrypoints/openai/serving_pooling_with_io_plugin.py>.
+
+An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please also refer to our online (<gh-file:examples/online_serving/prithvi_geospatial_mae.py>) and offline (<gh-file:examples/offline_inference/prithvi_geospatial_mae_io_processor.py>) inference examples.
+
+## Using an IO Processor plugin
+
+IO Processor plugins are loaded at engine startup and there are two methods for specifying the name of the plugin to be loaded:
+
+1. Via vLLM's `EngineArgs`: setting the `io_processor_plugin` argument in the `EngineArgs` used to initialize the `AsyncLLM`. The same can be achieved by passing the `io_processor_plugin` argument to `LLM` in offline mode, or by passing the `--io-processor-plugin` argument in serving mode.
+2. Via the model HF configuration: adding an `io_processor_plugin` field to the model config (config.json).
+
+The order also determines method priority, i.e., setting the plugin name via `EngineArgs` will override any plugin name specified in the model HF config (config.json).
--- a/docs/design/metrics.md
+++ b/docs/design/metrics.md
@ -99,11 +99,11 @@ http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201

 ### Multi-process Mode

-In v0, metrics are collected in the engine core process and we use multi-process mode to make them available in the API server process. See <gh-pr:7279>.
+In v0, metrics are collected in the engine core process and we use multiprocess mode to make them available in the API server process. See <gh-pr:7279>.

 ### Built in Python/Process Metrics

-The following metrics are supported by default by `prometheus_client`, but they are not exposed when multi-process mode is used:
+The following metrics are supported by default by `prometheus_client`, but they are not exposed when multiprocess mode is used:

 - `python_gc_objects_collected_total`
 - `python_gc_objects_uncollectable_total`
--- a/docs/design/plugin_system.md
+++ b/docs/design/plugin_system.md
@ -49,6 +49,8 @@ Every plugin has three parts:

 - **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported.

+- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for pooling models. The plugin function returns the IOProcessor's class fully qualified name.
+
 ## Guidelines for Writing Plugins

 - **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes.
--- a/docs/examples/README.md
+++ b/docs/examples/README.md
@ -2,6 +2,6 @@

 vLLM's examples are split into three categories:

- If you are using vLLM from within Python code, see [Offline Inference](./offline_inference/)
- If you are using vLLM from an HTTP application or client, see [Online Serving](./online_serving/)
- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see [Others](./others/)
+- If you are using vLLM from within Python code, see [Offline Inference](./offline_inference)
+- If you are using vLLM from an HTTP application or client, see [Online Serving](./online_serving)
+- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see [Others](./others)
--- a/docs/features/lora.md
+++ b/docs/features/lora.md
@ -52,7 +52,7 @@ Check out <gh-file:examples/offline_inference/multilora_inference.py> for an exa
 ## Serving LoRA Adapters

 LoRA adapted models can also be served with the Open-AI compatible vLLM server. To do so, we use
-`--lora-modules {name}={path} {name}={path}` to specify each LoRA module when we kickoff the server:
+`--lora-modules {name}={path} {name}={path}` to specify each LoRA module when we kick off the server:

 ```bash
 vllm serve meta-llama/Llama-2-7b-hf \
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@ -13,6 +13,41 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:
 - `prompt`: The prompt should follow the format that is documented on HuggingFace.
 - `multi_modal_data`: This is a dictionary that follows the schema defined in [vllm.multimodal.inputs.MultiModalDataDict][].

+### Stable UUIDs for Caching (multi_modal_uuids)
+
+When using multi-modal inputs, vLLM normally hashes each media item by content to enable caching across requests. You can optionally pass `multi_modal_uuids` to provide your own stable IDs for each item so caching can reuse work across requests without rehashing the raw content.
+
+??? code
+
+    ```python
+    from vllm import LLM
+    from PIL import Image
+
+    # Qwen2.5-VL example with two images
+    llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct")
+
+    prompt = "USER: <image><image>\nDescribe the differences.\nASSISTANT:"
+    img_a = Image.open("/path/to/a.jpg")
+    img_b = Image.open("/path/to/b.jpg")
+
+    outputs = llm.generate({
+        "prompt": prompt,
+        "multi_modal_data": {"image": [img_a, img_b]},
+        # Provide stable IDs for caching.
+        # Requirements (matched by this example):
+        #  - Include every modality present in multi_modal_data.
+        #  - For lists, provide the same number of entries.
+        #  - Use None to fall back to content hashing for that item.
+        "multi_modal_uuids": {"image": ["sku-1234-a", None]},
+    })
+
+    for o in outputs:
+        print(o.outputs[0].text)
+    ```
+
+!!! warning
+    If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored.
+
 ### Image Inputs

 You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
--- a/docs/features/quantization/README.md
+++ b/docs/features/quantization/README.md
@ -4,7 +4,6 @@ Quantization trades off model precision for smaller memory footprint, allowing l

 Contents:

- [Supported Hardware](supported_hardware.md)
 - [AutoAWQ](auto_awq.md)
 - [AutoRound](auto_round.md)
 - [BitsAndBytes](bnb.md)
@ -19,3 +18,50 @@ Contents:
 - [AMD Quark](quark.md)
 - [Quantized KV Cache](quantized_kvcache.md)
 - [TorchAO](torchao.md)
+
+## Supported Hardware
+
+The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
+
+<style>
+td:not(:first-child) {
+  text-align: center !important;
+}
+td {
+  padding: 0.5rem !important;
+  white-space: nowrap;
+}
+
+th {
+  padding: 0.5rem !important;
+  min-width: 0 !important;
+}
+
+th:not(:first-child) {
+  writing-mode: vertical-lr;
+  transform: rotate(180deg)
+}
+</style>
+
+| Implementation        | Volta   | Turing   | Ampere   | Ada   | Hopper   | AMD GPU   | Intel GPU   | Intel Gaudi | x86 CPU   | AWS Neuron   | Google TPU   |
+|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------|--------------|
+| AWQ                   | ❌      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ❌         | ✅︎        | ❌          | ❌           |
+| GPTQ                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ❌         | ✅︎        | ❌          | ❌           |
+| Marlin (GPTQ/AWQ/FP8) | ❌      | ❌       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
+| INT8 (W8A8)           | ❌      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ✅︎        | ✅︎          | ✅︎           |
+| FP8 (W8A8)            | ❌      | ❌       | ❌       | ✅︎    | ✅︎       | ✅︎         | ❌          | ❌         | ❌        | ✅︎          | ❌           |
+| BitBLAS               | ✅︎      | ✅       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
+| BitBLAS (GPTQ)        | ❌      | ❌       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
+| bitsandbytes          | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
+| DeepSpeedFP           | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
+| GGUF                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ✅︎         | ❌          | ❌         | ❌        | ❌          | ❌           |
+| INC (W8A8)            | ❌      | ❌       | ❌       | ❌    | ❌       | ❌         | ❌          | ✅︎         | ❌        | ❌          | ❌           |
+
+- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
+- ✅︎ indicates that the quantization method is supported on the specified hardware.
+- ❌ indicates that the quantization method is not supported on the specified hardware.
+
+!!! note
+    This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
+
+    For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team.
--- a/docs/features/quantization/bitblas.md
+++ b/docs/features/quantization/bitblas.md
@ -5,7 +5,7 @@ vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more effic
 !!! note
    Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`).
    Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper.
-    For details see [supported hardware](supported_hardware.md).
+    For details see [supported hardware](README.md#supported-hardware).

 Below are the steps to utilize BitBLAS with vLLM.

--- a/docs/features/quantization/supported_hardware.md
+++ b/docs/features/quantization/supported_hardware.md
@ -1,32 +0,0 @@
-# Supported Hardware
-
-The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
-
-<style>
-th {
-  white-space: nowrap;
-  min-width: 0 !important;
-}
-</style>
-
-| Implementation        | Volta   | Turing   | Ampere   | Ada   | Hopper   | AMD GPU   | Intel GPU   | Intel Gaudi | x86 CPU   | AWS Neuron   | Google TPU   |
-|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------|--------------|
-| AWQ                   | ❌      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ❌         | ✅︎        | ❌          | ❌           |
-| GPTQ                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ❌         | ✅︎        | ❌          | ❌           |
-| Marlin (GPTQ/AWQ/FP8) | ❌      | ❌       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| INT8 (W8A8)           | ❌      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ✅︎        | ✅︎          | ✅︎           |
-| FP8 (W8A8)            | ❌      | ❌       | ❌       | ✅︎    | ✅︎       | ✅︎         | ❌          | ❌         | ❌        | ✅︎          | ❌           |
-| BitBLAS (GPTQ)        | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| bitsandbytes          | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| DeepSpeedFP           | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| GGUF                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ✅︎         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| INC (W8A8)            | ❌      | ❌       | ❌       | ❌    | ❌       | ❌         | ❌          | ✅︎         | ❌        | ❌          | ❌           |
-
- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
- ✅︎ indicates that the quantization method is supported on the specified hardware.
- ❌ indicates that the quantization method is not supported on the specified hardware.
-
-!!! note
-    This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
-
-    For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team.
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@ -143,7 +143,7 @@ OpenAI Python client library does not officially support `reasoning_content` att
            print(content, end="", flush=True)
    ```

-Remember to check whether the `reasoning_content` exists in the response before accessing it. You could checkout the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
+Remember to check whether the `reasoning_content` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).

 ## Tool Calling

--- a/docs/features/structured_outputs.md
+++ b/docs/features/structured_outputs.md
@ -205,7 +205,7 @@ This section covers the OpenAI beta wrapper over the `client.chat.completions.cr

 At the time of writing (`openai==1.54.4`), this is a "beta" feature in the OpenAI client library. Code reference can be found [here](https://github.com/openai/openai-python/blob/52357cff50bee57ef442e94d78a0de38b4173fc2/src/openai/resources/beta/chat/completions.py#L100-L104).

-For the following examples, vLLM was setup using `vllm serve meta-llama/Llama-3.1-8B-Instruct`
+For the following examples, vLLM was set up using `vllm serve meta-llama/Llama-3.1-8B-Instruct`

 Here is a simple example demonstrating how to get structured output using Pydantic models:

--- a/docs/getting_started/installation/README.md
+++ b/docs/getting_started/installation/README.md
@ -12,7 +12,6 @@ vLLM supports the following hardware platforms:
    - [Apple silicon](cpu.md#apple-silicon)
    - [IBM Z (S390X)](cpu.md#ibm-z-s390x)
 - [Google TPU](google_tpu.md)
- [Intel Gaudi](intel_gaudi.md)
 - [AWS Neuron](aws_neuron.md)

 ## Hardware Plugins
--- a/docs/getting_started/installation/aws_neuron.md
+++ b/docs/getting_started/installation/aws_neuron.md
@ -140,8 +140,8 @@ Alternatively, users can directly call the NxDI library to trace and compile you

 - `NEURON_COMPILED_ARTIFACTS`: set this environment variable to point to your pre-compiled model artifacts directory to avoid
  compilation time upon server initialization. If this variable is not set, the Neuron module will perform compilation and save the
-  artifacts under `neuron-compiled-artifacts/{unique_hash}/` sub-directory in the model path. If this environment variable is set,
-  but the directory does not exist, or the contents are invalid, Neuron will also fallback to a new compilation and store the artifacts
+  artifacts under `neuron-compiled-artifacts/{unique_hash}/` subdirectory in the model path. If this environment variable is set,
+  but the directory does not exist, or the contents are invalid, Neuron will also fall back to a new compilation and store the artifacts
  under this specified path.
 - `NEURON_CONTEXT_LENGTH_BUCKETS`: Bucket sizes for context encoding. (Only applicable to `transformers-neuronx` backend).
 - `NEURON_TOKEN_GEN_BUCKETS`: Bucket sizes for token generation. (Only applicable to `transformers-neuronx` backend).
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@ -96,6 +96,7 @@ Currently, there are no pre-built CPU wheels.
 - `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. Default value is `0`.
 - `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads, can be set as CPU id lists or `auto` (by default). For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node respectively.
 - `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. Default value is `None`. If the value is not set and use `auto` thread binding, no CPU will be reserved for `world_size == 1`, 1 CPU per rank will be reserved for `world_size > 1`.
+- `CPU_VISIBLE_MEMORY_NODES`: specify visible NUMA memory nodes for vLLM CPU workers, similar to `CUDA_VISIBLE_DEVICES`. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. The variable provides more control over the auto thread-binding feature, such as masking nodes and changing the node binding sequence.
 - `VLLM_CPU_MOE_PREPACK` (x86 only): whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).
 - `VLLM_CPU_SGL_KERNEL` (x86 only, Experimental): whether to use small-batch optimized kernels for linear layer and MoE layer, especially for low-latency requirements like online serving. The kernels require AMX instruction set, BFloat16 weight type and weight shapes divisible by 32. Default is `0` (False).

@ -179,7 +180,7 @@ Inference batch size is an important parameter for the performance. Larger batch
    - Offline Inference: `256 * world_size`
    - Online Serving: `128 * world_size`

-vLLM CPU supports tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use TP and PP together if there are enough CPU sockets and memory nodes.
+vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommended to use DP, TP and PP together if there are enough CPU sockets and memory nodes.

 ### Which quantization configs does vLLM CPU support?

@ -193,3 +194,35 @@ vLLM CPU supports tensor parallel (TP) and pipeline parallel (PP) to leverage mu
 - Both of them require `amx` CPU flag.
    - `VLLM_CPU_MOE_PREPACK` can provides better performance for MoE models
    - `VLLM_CPU_SGL_KERNEL` can provides better performance for MoE models and small-batch scenarios.
+
+### Why do I see `get_mempolicy: Operation not permitted` when running in Docker?
+
+In some container environments (like Docker), NUMA-related syscalls used by vLLM (e.g., `get_mempolicy`, `migrate_pages`) are blocked/denied in the runtime's default seccomp/capabilities settings. This may lead to warnings like `get_mempolicy: Operation not permitted`. Functionality is not affected, but NUMA memory binding/migration optimizations may not take effect and performance can be suboptimal.
+
+To enable these optimizations inside Docker with the least privilege, you can follow the tips below:
+
+```bash
+docker run ... --cap-add SYS_NICE --security-opt seccomp=unconfined  ...
+
+# 1) `--cap-add SYS_NICE` is to address `get_mempolicy` EPERM issue.
+
+# 2) `--security-opt seccomp=unconfined` is to enable `migrate_pages` for `numa_migrate_pages()`.
+# Note that `seccomp=unconfined` disables seccomp filtering for the container entirely.
+# If that is unacceptable, you can create a custom seccomp profile
+# based on the Docker/runtime default.json and add `migrate_pages` to its `SCMP_ACT_ALLOW` list.
+
+# reference : https://docs.docker.com/engine/security/seccomp/
+```
+
+Alternatively, running with `--privileged=true` also works but is broader and not generally recommended.
+
+In Kubernetes (K8s), the following configuration can be added to the workload YAML to achieve the same effect as above:
+
+```yaml
+securityContext:
+  seccompProfile:
+    type: Unconfined
+  capabilities:
+    add:
+    - SYS_NICE
+```
--- a/docs/getting_started/installation/cpu/apple.inc.md
+++ b/docs/getting_started/installation/cpu/apple.inc.md
@ -1,6 +1,6 @@
 # --8<-- [start:installation]

-vLLM has experimental support for macOS with Apple silicon. For now, users must build from source to natively run on macOS.
+vLLM has experimental support for macOS with Apple Silicon. For now, users must build from source to natively run on macOS.

 Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.

--- a/docs/getting_started/installation/cpu/arm.inc.md
+++ b/docs/getting_started/installation/cpu/arm.inc.md
@ -48,6 +48,10 @@ docker run --rm \
            --dtype=bfloat16 \
            other vLLM OpenAI server arguments
 ```
+
+!!! tip
+    An alternative to `--privileged=true` is `--cap-add SYS_NICE --security-opt seccomp=unconfined`.
+
 # --8<-- [end:build-image-from-source]
 # --8<-- [start:extra-information]
 # --8<-- [end:extra-information]
--- a/docs/getting_started/installation/cpu/build.inc.md
+++ b/docs/getting_started/installation/cpu/build.inc.md
@ -16,8 +16,8 @@ cd vllm_source
 Third, install required dependencies:

 ```bash
-uv pip install -r requirements/cpu-build.txt --torch-backend auto
-uv pip install -r requirements/cpu.txt --torch-backend auto
+uv pip install -r requirements/cpu-build.txt --torch-backend cpu
+uv pip install -r requirements/cpu.txt --torch-backend cpu
 ```

 ??? console "pip"
--- a/docs/getting_started/installation/cpu/s390x.inc.md
+++ b/docs/getting_started/installation/cpu/s390x.inc.md
@ -89,6 +89,9 @@ docker run --rm \
    other vLLM OpenAI server arguments
 ```

+!!! tip
+    An alternative to `--privileged true` is `--cap-add SYS_NICE --security-opt seccomp=unconfined`.
+
 # --8<-- [end:build-image-from-source]
 # --8<-- [start:extra-information]
 # --8<-- [end:extra-information]
--- a/docs/getting_started/installation/cpu/x86.inc.md
+++ b/docs/getting_started/installation/cpu/x86.inc.md
@ -43,7 +43,8 @@ docker build -f docker/Dockerfile.cpu \

 # Launching OpenAI server
 docker run --rm \
-            --privileged=true \
+            --security-opt seccomp=unconfined \
+            --cap-add SYS_NICE \
            --shm-size=4g \
            -p 8000:8000 \
            -e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
--- a/docs/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/getting_started/installation/gpu/cuda.inc.md
@ -48,7 +48,7 @@ uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VE

 #### Install the latest code

-LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`.
+LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`.

 ```bash
 uv pip install -U vllm \
--- a/docs/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/getting_started/installation/gpu/rocm.inc.md
@ -149,7 +149,7 @@ Build a docker image from <gh-file:docker/Dockerfile.rocm_base> which setup ROCm
 **This step is optional as this rocm_base image is usually prebuilt and store at [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev) under tag `rocm/vllm-dev:base` to speed up user experience.**
 If you choose to build this rocm_base image yourself, the steps are as follows.

-It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
+It is important that the user kicks off the docker build using buildkit. Either the user sets DOCKER_BUILDKIT=1 as an environment variable when calling the docker build command, or the user needs to set up buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:

 ```json
 {
@ -170,7 +170,7 @@ DOCKER_BUILDKIT=1 docker build \
 #### Build an image with vLLM

 First, build a docker image from <gh-file:docker/Dockerfile.rocm> and launch a docker container from the image.
-It is important that the user kicks off the docker build using buildkit. Either the user put `DOCKER_BUILDKIT=1` as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
+It is important that the user kicks off the docker build using buildkit. Either the user sets `DOCKER_BUILDKIT=1` as an environment variable when calling the docker build command, or the user needs to set up buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:

 ```bash
 {
--- a/docs/getting_started/installation/intel_gaudi.md
+++ b/docs/getting_started/installation/intel_gaudi.md
@ -1,388 +0,0 @@
-# Intel Gaudi
-
-This page provides instructions on running vLLM with Intel Gaudi devices.
-
-!!! warning
-    There are no pre-built wheels or images for this device, so you must build vLLM from source.
-
-## Requirements
-
- OS: Ubuntu 22.04 LTS
- Python: 3.10
- Intel Gaudi accelerator
- Intel Gaudi software version 1.18.0
-
-Please follow the instructions provided in the
-[Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html)
-to set up the execution environment. To achieve the best performance,
-please follow the methods outlined in the
-[Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html).
-
-## Configure a new environment
-
-### Environment verification
-
-To verify that the Intel Gaudi software was correctly installed, run:
-
-```bash
-hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
-apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
-pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
-pip list | grep neural # verify that neural_compressor_pt is installed
-```
-
-Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade)
-for more details.
-
-### Run Docker Image
-
-It is highly recommended to use the latest Docker image from Intel Gaudi
-vault. Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers)
-for more details.
-
-Use the following commands to run a Docker image:
-
-```bash
-docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
-docker run \
-  -it \
-  --runtime=habana \
-  -e HABANA_VISIBLE_DEVICES=all \
-  -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
-  --cap-add=sys_nice \
-  --net=host \
-  --ipc=host \
-  vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
-```
-
-## Set up using Python
-
-### Pre-built wheels
-
-Currently, there are no pre-built Intel Gaudi wheels.
-
-### Build wheel from source
-
-To build and install vLLM from source, run:
-
-```bash
-git clone https://github.com/vllm-project/vllm.git
-cd vllm
-pip install -r requirements/hpu.txt
-python setup.py develop
-```
-
-Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following:
-
-```bash
-git clone https://github.com/HabanaAI/vllm-fork.git
-cd vllm-fork
-git checkout habana_main
-pip install -r requirements/hpu.txt
-python setup.py develop
-```
-
-## Set up using Docker
-
-### Pre-built images
-
-Currently, there are no pre-built Intel Gaudi images.
-
-### Build image from source
-
-```bash
-docker build -f docker/Dockerfile.hpu -t vllm-hpu-env  .
-docker run \
-  -it \
-  --runtime=habana \
-  -e HABANA_VISIBLE_DEVICES=all \
-  -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
-  --cap-add=sys_nice \
-  --net=host \
-  --rm vllm-hpu-env
-```
-
-!!! tip
-    If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered.
-
-## Extra information
-
-### Supported features
-
- [Offline inference](../../serving/offline_inference.md)
- Online serving via [OpenAI-Compatible Server](../../serving/openai_compatible_server.md)
- HPU autodetection - no need to manually select device within vLLM
- Paged KV cache with algorithms enabled for Intel Gaudi accelerators
- Custom Intel Gaudi implementations of Paged Attention, KV cache ops,
-  prefill attention, Root Mean Square Layer Normalization, Rotary
-  Positional Encoding
- Tensor parallelism support for multi-card inference
- Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html)
-  for accelerating low-batch latency and throughput
- Attention with Linear Biases (ALiBi)
- INC quantization
-
-### Unsupported features
-
- Beam search
- LoRA adapters
- AWQ quantization
- Prefill chunking (mixed-batch inferencing)
-
-### Supported configurations
-
-The following configurations have been validated to function with
-Gaudi2 devices. Configurations that are not listed may or may not work.
-
-| Model | TP Size| dtype | Sampling |
-|-------|--------|--------|----------|
-| [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) | 1, 2, 8 | BF16 | Random / Greedy |
-| [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1, 2, 8 | BF16 | Random / Greedy |
-| [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | 1, 2, 8 | BF16 | Random / Greedy |
-| [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | 1, 2, 8 | BF16 | Random / Greedy |
-| [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) | 1, 2, 8 | BF16 | Random / Greedy |
-| [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) | 1, 2, 8 | BF16 | Random / Greedy |
-| [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) | 8 | BF16 | Random / Greedy |
-| [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 8 | BF16 | Random / Greedy |
-| [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 8 | BF16 | Random / Greedy |
-| [meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 8 | BF16 | Random / Greedy |
-| [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) | 8 | BF16 | Random / Greedy |
-| [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 8 | BF16 | Random / Greedy |
-
-## Performance tuning
-
-### Execution modes
-
-Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag.
-
-|   `PT_HPU_LAZY_MODE` |   `enforce_eager` | execution mode     |
-|----------------------|-------------------|--------------------|
-|                    0 |                 0 | torch.compile      |
-|                    0 |                 1 | PyTorch eager mode |
-|                    1 |                 0 | HPU Graphs         |
-
-!!! warning
-    In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode.
-
-[](){ #gaudi-bucketing-mechanism }
-
-### Bucketing mechanism
-
-Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution.
-In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - `batch_size` and `sequence_length`.
-
-!!! note
-    Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase.
-
-Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup:
-
-```text
-INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
-INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
-INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
-INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
-```
-
-| Parameter      | Description                                                                 |
-|----------------|-----------------------------------------------------------------------------|
-| `min`          | Determines the lowest value of the bucket.                                  |
-| `step`         | Determines the interval between buckets.                                     |
-| `max`          | Determines the upper bound of the bucket.                                    |
-| Ramp-up phase  | A special handling phase applied between `min` and `step`:<br/>- `min` is multiplied by consecutive powers of two until `step` is reached.<br/>- Minimizes resource wastage for small batch sizes.<br/>- Allows larger padding for larger batches. |
-
-Example (with ramp-up):
-
-```text
-min = 2, step = 32, max = 64
-=> ramp_up = (2, 4, 8, 16)
-=> stable = (32, 64)
-=> buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64)
-```
-
-Example (without ramp-up):
-
-```text
-min = 128, step = 128, max = 512
-=> ramp_up = ()
-=> stable = (128, 256, 384, 512)
-=> buckets = ramp_up + stable => (128, 256, 384, 512)
-```
-
-In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket.
-
-!!! warning
-    If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario.
-
-As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket.
-
-!!! note
-    Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
-
-### Warmup
-
-Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup:
-
-??? console "Logs"
-
-    ```text
-    INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB
-    INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB
-    INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB
-    ...
-    INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
-    INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB
-    INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB
-    INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB
-    ...
-    INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB
-    INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
-    ```
-
-This example uses the same buckets as in the [Bucketing Mechanism][gaudi-bucketing-mechanism] section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
-
-!!! tip
-    Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment.
-
-### HPU Graph capture
-
-[HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management.
-
-When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by `gpu_memory_utilization` flag (`0.9` by default).
-Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage.
-Only after that, `gpu_memory_utilization` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable.
-Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured.
-Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of memory reserved for HPU Graphs capture.
-With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache.
-Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages have equal memory constraints.
-Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs.
-
-!!! note
-    `gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory.
-
-User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented:
-
- `max_bs` - graph capture queue will be sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode
- `min_tokens` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), default strategy for prompt
-
-When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in `min_tokens` strategy.
-
-!!! note
-    `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up the entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt to do the same for decode graphs and the usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within the usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding the reserved memory pool. The behavior of that mechanism can be observed in the example below.
-
-Each described step is logged by vLLM server, as follows (negative values correspond to memory being released):
-
-??? console "Logs"
-
-    ```text
-    INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
-    INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
-    INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
-    INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
-    INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
-    INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used)
-    INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
-    INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used)
-    INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache
-    INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0
-    INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used)
-    INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB
-    ...
-    INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
-    INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3)
-    INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
-    ...
-    INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB
-    INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB
-    ...
-    INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB
-    INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB
-    INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB
-    INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB
-    INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB
-    INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)]
-    INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
-    INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory
-    INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used)
-    ```
-
-### Recommended vLLM Parameters
-
- We recommend running inference on Gaudi 2 with `block_size` of 128
-  for BF16 data type. Using default values (16, 32) might lead to
-  sub-optimal performance due to Matrix Multiplication Engine
-  under-utilization (see [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)).
- For max throughput on Llama 7B, we recommend running with batch size
-  of 128 or 256 and max context length of 2048 with HPU Graphs enabled.
-  If you encounter out-of-memory issues, see troubleshooting section.
-
-### Environment variables
-
-**Diagnostic and profiling knobs:**
-
- `VLLM_PROFILER_ENABLED`: If `true`, enable the high level profiler. Resulting JSON traces can be viewed in [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). `false` by default.
- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: If `true`, log graph compilations for each vLLM engine step when any occurs. Highly recommended to use with `PT_HPU_METRICS_GC_DETAILS=1`. `false` by default.
- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: If `true`, always log graph compilations for each vLLM engine step even if none occurred. `false` by default.
- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: If `true`, log CPU fallbacks for each vLLM engine step when any occurs. `false` by default.
- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: If `true`, always log CPU fallbacks for each vLLM engine step even if none occurred. `false` by default.
-
-**Performance tuning knobs:**
-
- `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by default
-
- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for HPUGraph capture, `0.1` by default
-
- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory dedicated for prompt graphs, `0.3` by default
-
- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt graph capture, `min_tokens` or `max_bs`, `min_tokens` by default
-
- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode graph capture, `min_tokens` or `max_bs`, `max_bs` by default
-
- `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism
-
-    - `{phase}` is either `PROMPT` or `DECODE`
-
-    - `{dim}` is either `BS`, `SEQ` or `BLOCK`
-
-    - `{param}` is either `MIN`, `STEP` or `MAX`
-
-    - Default values:
-
-| `{phase}` | Parameter | Env Variable | Value Expression |
-|-----------|-----------|--------------|------------------|
-| Prompt | Batch size min | `VLLM_PROMPT_BS_BUCKET_MIN` | `1` |
-| Prompt | Batch size step | `VLLM_PROMPT_BS_BUCKET_STEP` | `min(max_num_seqs, 32)` |
-| Prompt | Batch size max | `VLLM_PROMPT_BS_BUCKET_MAX` | `min(max_num_seqs, 64)` |
-| Prompt | Sequence length min | `VLLM_PROMPT_SEQ_BUCKET_MIN` | `block_size` |
-| Prompt | Sequence length step | `VLLM_PROMPT_SEQ_BUCKET_STEP` | `block_size` |
-| Prompt | Sequence length max | `VLLM_PROMPT_SEQ_BUCKET_MAX` | `max_model_len` |
-| Decode | Batch size min | `VLLM_DECODE_BS_BUCKET_MIN` | `1` |
-| Decode | Batch size step | `VLLM_DECODE_BS_BUCKET_STEP` | `min(max_num_seqs, 32)` |
-| Decode | Batch size max | `VLLM_DECODE_BS_BUCKET_MAX` | `max_num_seqs` |
-| Decode | Sequence length min | `VLLM_DECODE_BLOCK_BUCKET_MIN` | `block_size` |
-| Decode | Sequence length step | `VLLM_DECODE_BLOCK_BUCKET_STEP` | `block_size` |
-| Decode | Sequence length max | `VLLM_DECODE_BLOCK_BUCKET_MAX` | `max(128, (max_num_seqs*max_model_len)/block_size)` |
-
-Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution:
-
- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used; if `1`, PyTorch Lazy backend for Gaudi will be used. `1` is default.
- `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs
-
-## Troubleshooting: tweaking HPU graphs
-
-If you experience device out-of-memory issues or want to attempt
-inference at higher batch sizes, try tweaking HPU Graphs by following
-the below:
-
- Tweak `gpu_memory_utilization` knob. It will decrease the
-  allocation of KV cache, leaving some headroom for capturing graphs
-  with larger batch size. By default `gpu_memory_utilization` is set
-  to 0.9. It attempts to allocate ~90% of HBM left for KV cache after
-  short profiling run. Note that decreasing it reduces the number of KV
-  cache blocks you have available, and therefore reduces the effective
-  maximum number of tokens you can handle at a given time.
- If this method is not efficient, you can disable `HPUGraph`
-  completely. With HPU Graphs disabled, you are trading latency and
-  throughput at lower batches for potentially higher throughput on
-  higher batches. You can do that by adding `--enforce-eager` flag to
-  server (for online serving), or by passing `enforce_eager=True`
-  argument to LLM constructor (for offline inference).
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import importlib
 import logging
 import sys
 from argparse import SUPPRESS, HelpFormatter
@ -7,25 +8,52 @@ from pathlib import Path
 from typing import Literal
 from unittest.mock import MagicMock, patch

+from pydantic_core import core_schema
+
+logger = logging.getLogger("mkdocs")
+
 ROOT_DIR = Path(__file__).parent.parent.parent.parent
 ARGPARSE_DOC_DIR = ROOT_DIR / "docs/argparse"

 sys.path.insert(0, str(ROOT_DIR))
-sys.modules["aiohttp"] = MagicMock()
-sys.modules["blake3"] = MagicMock()
 sys.modules["vllm._C"] = MagicMock()

-from vllm.benchmarks import latency  # noqa: E402
-from vllm.benchmarks import serve  # noqa: E402
-from vllm.benchmarks import throughput  # noqa: E402
-from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs  # noqa: E402
-from vllm.entrypoints.cli.openai import ChatCommand  # noqa: E402
-from vllm.entrypoints.cli.openai import CompleteCommand  # noqa: E402
-from vllm.entrypoints.openai import cli_args  # noqa: E402
-from vllm.entrypoints.openai import run_batch  # noqa: E402
-from vllm.utils import FlexibleArgumentParser  # noqa: E402

-logger = logging.getLogger("mkdocs")
+class PydanticMagicMock(MagicMock):
+    """`MagicMock` that's able to generate pydantic-core schemas."""
+
+    def __get_pydantic_core_schema__(self, source_type, handler):
+        return core_schema.any_schema()
+
+
+def auto_mock(module, attr, max_mocks=50):
+    """Function that automatically mocks missing modules during imports."""
+    logger.info("Importing %s from %s", attr, module)
+    for _ in range(max_mocks):
+        try:
+            # First treat attr as an attr, then as a submodule
+            return getattr(importlib.import_module(module), attr,
+                           importlib.import_module(f"{module}.{attr}"))
+        except importlib.metadata.PackageNotFoundError as e:
+            raise e
+        except ModuleNotFoundError as e:
+            logger.info("Mocking %s for argparse doc generation", e.name)
+            sys.modules[e.name] = PydanticMagicMock()
+
+    raise ImportError(
+        f"Failed to import {module}.{attr} after mocking {max_mocks} imports")
+
+
+latency = auto_mock("vllm.benchmarks", "latency")
+serve = auto_mock("vllm.benchmarks", "serve")
+throughput = auto_mock("vllm.benchmarks", "throughput")
+AsyncEngineArgs = auto_mock("vllm.engine.arg_utils", "AsyncEngineArgs")
+EngineArgs = auto_mock("vllm.engine.arg_utils", "EngineArgs")
+ChatCommand = auto_mock("vllm.entrypoints.cli.openai", "ChatCommand")
+CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand")
+cli_args = auto_mock("vllm.entrypoints.openai", "cli_args")
+run_batch = auto_mock("vllm.entrypoints.openai", "run_batch")
+FlexibleArgumentParser = auto_mock("vllm.utils", "FlexibleArgumentParser")


 class MarkdownFormatter(HelpFormatter):
--- a/docs/mkdocs/hooks/generate_examples.py
+++ b/docs/mkdocs/hooks/generate_examples.py
@ -70,6 +70,10 @@ class Example:
        self.other_files = self.determine_other_files()
        self.title = self.determine_title()

+    @property
+    def is_code(self) -> bool:
+        return self.main_file.suffix != ".md"
+
    def determine_main_file(self) -> Path:
        """
        Determines the main file in the given path.
@ -101,6 +105,12 @@ class Example:
        return [file for file in self.path.rglob("*") if is_other_file(file)]

    def determine_title(self) -> str:
+        if not self.is_code:
+            with open(self.main_file) as f:
+                first_line = f.readline().strip()
+            match = re.match(r'^#\s+(?P<title>.+)$', first_line)
+            if match:
+                return match.group('title')
        return fix_case(self.path.stem.replace("_", " ").title())

    def generate(self) -> str:
@ -110,11 +120,13 @@ class Example:
        # Use long code fence to avoid issues with
        # included files containing code fences too
        code_fence = "``````"
-        is_code = self.main_file.suffix != ".md"
-        if is_code:
+        # Skip the title from md snippets as it's been included above
+        start_line = 2
+        if self.is_code:
            content += f"{code_fence}{self.main_file.suffix[1:]}\n"
-        content += f'--8<-- "{self.main_file}"\n'
-        if is_code:
+            start_line = 1
+        content += f'--8<-- "{self.main_file}:{start_line}"\n'
+        if self.is_code:
            content += f"{code_fence}\n"
        content += "\n"

--- a/docs/mkdocs/javascript/mathjax.js
+++ b/docs/mkdocs/javascript/mathjax.js
@ -0,0 +1,20 @@
+// Enables MathJax rendering
+// Global configuration object; presumably read by the MathJax script when it
+// loads (confirm against the MathJax startup documentation).
+window.MathJax = {
+  tex: {
+    // Inline math is delimited by \( ... \) and display math by \[ ... \].
+    inlineMath: [["\\(", "\\)"]],
+    displayMath: [["\\[", "\\]"]],
+    processEscapes: true,
+    processEnvironments: true
+  },
+  options: {
+    // The ignore pattern matches every class name, while the process pattern
+    // names "arithmatex"; presumably this limits typesetting to elements
+    // carrying that class (verify against the MathJax document options).
+    ignoreHtmlClass: ".*|",
+    processHtmlClass: "arithmatex"
+  }
+};
+
+// NOTE(review): `document$` is not defined in this file; it looks like the
+// page-load observable supplied by the docs theme. Confirm it is in scope.
+// On each emission, MathJax state is cleared and the page is typeset again.
+document$.subscribe(() => { 
+  MathJax.startup.output.clearCache()
+  MathJax.typesetClear()
+  MathJax.texReset()
+  MathJax.typesetPromise()
+})
--- a/docs/models/generative_models.md
+++ b/docs/models/generative_models.md
@ -19,7 +19,7 @@ Run a model in generation mode via the option `--runner generate`.
 ## Offline Inference

 The [LLM][vllm.LLM] class provides various methods for offline inference.
-See [configuration](../api/summary.md#configuration) for a list of options when initializing the model.
+See [configuration](../api/README.md#configuration) for a list of options when initializing the model.

 ### `LLM.generate`

--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@ -81,7 +81,7 @@ which takes priority over both the model's and Sentence Transformers's defaults.
 ## Offline Inference

 The [LLM][vllm.LLM] class provides various methods for offline inference.
-See [configuration](../api/summary.md#configuration) for a list of options when initializing the model.
+See [configuration](../api/README.md#configuration) for a list of options when initializing the model.

 ### `LLM.embed`

@ -205,12 +205,12 @@ Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides

 There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json`, it is allowed to change the output to arbitrary dimensions. Using `matryoshka_dimensions` can control the allowed output dimensions.

-For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}`, `hf_overrides={"matryoshka_dimensions": [<allowed output dimensions>]}` (offline) or `--hf_overrides '{"is_matryoshka": true}'`,  `--hf_overrides '{"matryoshka_dimensions": [<allowed output dimensions>]}'`(online).
+For models that support Matryoshka Embeddings but are not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}`, `hf_overrides={"matryoshka_dimensions": [<allowed output dimensions>]}` (offline) or `--hf-overrides '{"is_matryoshka": true}'`, `--hf-overrides '{"matryoshka_dimensions": [<allowed output dimensions>]}'` (online).

 Here is an example to serve a model with Matryoshka Embeddings enabled.

 ```text
-vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"matryoshka_dimensions":[256]}'
+vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}'
 ```

 ### Offline Inference
@ -258,4 +258,4 @@ Expected output:
 {"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}}
 ```

-A openai client example can be found here: <gh-file:examples/online_serving/openai_embedding_matryoshka_fy.py>
+An OpenAI client example can be found here: <gh-file:examples/online_serving/openai_embedding_matryoshka_fy.py>
--- a/Show More
+++ b/Show More