[Quantization/NVFP4] Speed up TRTLLM NVFP4 MOE weight loading and fix K/V scale loading for MLA Attn (#25968)

Signed-off-by: Pavani Majety <pmajety@nvidia.com> Signed-off-by: yewentao256 <zhyanwentao@126.com>
Fix V1 engine serialization error with Ray distributed executor (#26148)
2025-10-03 13:35:58 -07:00 · 2025-10-03 13:35:58 -07:00 · 2025-10-03 13:35:58 -07:00 · 2025-10-03 13:35:58 -07:00 · 2025-10-03 13:35:58 -07:00 · 2025-10-03 13:35:58 -07:00
1840 changed files with 117922 additions and 165235 deletions
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@ -5,11 +5,11 @@ import os
 import sys
 import zipfile
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
 # Note that we have 800 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
 def print_top_10_largest_files(zip_file):
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@ -1,12 +0,0 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
 model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.419
  - name: "exact_match,flexible-extract"
    value: 0.416
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
@ -1,11 +0,0 @@
 # For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 100 -t 8
 model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
 backend: "vllm-vlm"
 tasks:
 - name: "chartqa"
  metrics:
  - name: "relaxed_accuracy,none"
    value: 0.90
 limit: 100
 num_fewshot: 0
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@ -1,11 +0,0 @@
 # For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -t 8 -f 5
 model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
 backend: "vllm-vlm"
 tasks:
 - name: "mmlu_pro"
  metrics:
  - name: "exact_match,custom-extract"
    value: 0.80
 limit: 250 # will run on 250 * 14 subjects = 3500 samples
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
@ -1,5 +1,4 @@
-# For vllm script, with -t option (tensor parallel size)
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
@ -1,12 +0,0 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
 model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
 backend: "vllm-vlm"
 tasks:
 - name: "chartqa"
  metrics:
  - name: "relaxed_accuracy,none"
    value: 0.855
 limit: 2500
 num_fewshot: 0
--- a/.buildkite/lm-eval-harness/configs/models-large-h100.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large-h100.txt
@ -1 +0,0 @@
 Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
--- a/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
+++ b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
@ -1 +0,0 @@
 Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
--- a/.buildkite/lm-eval-harness/configs/models-mm-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-mm-small.txt
@ -1 +0,0 @@
 Qwen2.5-VL-7B-Instruct.yaml
--- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@ -1,44 +0,0 @@
 #!/bin/bash
 # We can use this script to compute baseline accuracy on chartqa for vllm.
 #
 # Make sure you have lm-eval-harness installed:
 #   pip install lm-eval==0.4.9
 usage() {
    echo``
    echo "Runs lm eval harness on ChartQA using multimodal vllm."
    echo "This pathway is intended to be used to create baselines for "
    echo "our correctness tests in vllm's CI."
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -l    - limit number of samples to run"
    echo "  -t    - tensor parallel size to run at"
    echo
 }
 while getopts "m:l:t:" OPT; do
  case ${OPT} in
    m ) 
        MODEL="$OPTARG"
        ;;
    l ) 
        LIMIT="$OPTARG"
        ;;
    t ) 
        TP_SIZE="$OPTARG"
        ;;
    \? ) 
        usage
        exit 1
        ;;
  esac
 done
 lm_eval --model vllm-vlm \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
  --tasks chartqa \
  --batch_size auto \
  --apply_chat_template \
  --limit $LIMIT
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
--- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@ -1,50 +0,0 @@
 #!/bin/bash
 # We can use this script to compute baseline accuracy on MMLUPRO for vllm.
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
 #   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
 usage() {
    echo``
    echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -l    - limit number of samples to run"
    echo "  -f    - number of fewshot samples to use"
    echo "  -t    - tensor parallel size to run at"
    echo
 }
 while getopts "m:b:l:f:t:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    b )
        BATCH_SIZE="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    f )
        FEWSHOT="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
 done
 lm_eval --model vllm \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
  --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
  --batch_size auto
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -19,27 +19,21 @@ RTOL = 0.08
 def launch_lm_eval(eval_config, tp_size):
    trust_remote_code = eval_config.get("trust_remote_code", False)
    max_model_len = eval_config.get("max_model_len", 4096)
    batch_size = eval_config.get("batch_size", "auto")
    backend = eval_config.get("backend", "vllm")
    model_args = (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
        f"enforce_eager=true,"
        f"add_bos_token=true,"
        f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len},"
+        f"max_model_len={max_model_len}"
    )
    results = lm_eval.simple_evaluate(
-        model=backend,
+        model="vllm",
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
-        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+        batch_size="auto",
        # text models. however, this is regressing measured strict-match for
        # existing text models in CI, so only apply it for mm.
        apply_chat_template=backend == "vllm-vlm",
        batch_size=batch_size,
    )
    return results
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@ -368,7 +368,7 @@ if __name__ == "__main__":
        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
        # we want to turn it into "8xGPUTYPE"
        df["GPU"] = df["GPU"].apply(
-            lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
+            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
        )
    # get markdown tables
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -454,6 +454,11 @@ main() {
  fi
  check_hf_token
  # Set to v1 to run v1 benchmark
  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
    export VLLM_USE_V1=1
  fi
  # dependencies
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
  (which jq) || (apt-get update && apt-get -y install jq)
--- a/.buildkite/pyproject.toml
+++ b/.buildkite/pyproject.toml
@ -0,0 +1,46 @@
 # This local pyproject file is part of the migration from yapf to ruff format.
 # It uses the same core rules as the main pyproject.toml file, but with the
 # following differences:
 # - ruff line length is overridden to 88
 # - deprecated typing ignores (UP006, UP035) have been removed
 [tool.ruff]
 line-length = 88
 [tool.ruff.lint.per-file-ignores]
 "vllm/third_party/**" = ["ALL"]
 "vllm/version.py" = ["F401"]
 "vllm/_version.py" = ["ALL"]
 [tool.ruff.lint]
 select = [
    # pycodestyle
    "E",
    # Pyflakes
    "F",
    # pyupgrade
    "UP",
    # flake8-bugbear
    "B",
    # flake8-simplify
    "SIM",
    # isort
    "I",
    # flake8-logging-format
    "G",
 ]
 ignore = [
    # star imports
    "F405", "F403",
    # lambda expression assignment
    "E731",
    # Loop control variable not used within loop body
    "B007",
    # f-string format
    "UP032",
    # Can remove once 3.10+ is the minimum Python version
    "UP007",
 ]
 [tool.ruff.format]
 docstring-code-format = true
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -8,7 +8,7 @@ steps:
    commands:
      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
@ -48,7 +48,7 @@ steps:
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
@ -76,7 +76,7 @@ steps:
      queue: arm64_cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
  # Add job to create multi-arch manifest
@ -150,16 +150,11 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
+      - "docker push vllm/vllm-openai:nightly"
-      - "docker push vllm/vllm-openai:nightly-x86_64"
+      - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
      - "docker push vllm/vllm-openai:nightly-aarch64"
      - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
      - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
      - "docker manifest push vllm/vllm-openai:nightly"
      - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
      # Clean up old nightly builds (keep only last 14)
      - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
    plugins:
@ -168,4 +163,3 @@ steps:
          password-env: DOCKERHUB_TOKEN
    env:
      DOCKER_BUILDKIT: "1"
      DOCKERHUB_USERNAME: "vllmbot"
--- a/.buildkite/scripts/cleanup-nightly-builds.sh
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@ -8,41 +8,20 @@ set -ex
 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
-# Get DockerHub credentials from environment
+# Get DockerHub token from environment
 if [ -z "$DOCKERHUB_TOKEN" ]; then
    echo "Error: DOCKERHUB_TOKEN environment variable is not set"
    exit 1
 fi
 if [ -z "$DOCKERHUB_USERNAME" ]; then
    echo "Error: DOCKERHUB_USERNAME environment variable is not set"
    exit 1
 fi
 # Get DockerHub bearer token
 echo "Getting DockerHub bearer token..."
 set +x
 BEARER_TOKEN=$(curl -s -X POST \
    -H "Content-Type: application/json" \
    -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
    "https://hub.docker.com/v2/users/login" | jq -r '.token')
 set -x
 if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
    echo "Error: Failed to get DockerHub bearer token"
    exit 1
 fi
 # Function to get all tags from DockerHub
 get_all_tags() {
    local page=1
    local all_tags=""
    while true; do
-        set +x
+        local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
        local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
            "$REPO_API_URL?page=$page&page_size=100")
        set -x
        # Get both last_updated timestamp and tag name, separated by |
        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
@ -64,9 +43,7 @@ delete_tag() {
    echo "Deleting tag: $tag_name"
    local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
-    set +x
+    local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
    local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
    set -x
    if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
        echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@ -25,28 +25,25 @@ function cpu_tests() {
  # offline inference
  podman exec -it "$container_id" bash -c "
-    set -xve
+    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
  # Run basic model test
  podman exec -it "$container_id" bash -c "
-    set -evx
+    set -e
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
    pip install sentence-transformers datamodel_code_generator
-
+    pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
    # Note: disable Bart until supports V1
    # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-    # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
+    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
 }
 # All of CPU tests are expected to be finished less than 40 mins.
 export container_id
 export -f cpu_tests
-timeout 120m bash -c cpu_tests
+timeout 40m bash -c cpu_tests
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@ -64,9 +64,10 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-
+export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
 echo "Using VLLM V1"
 echo "--- Hardware Information ---"
 # tpu-info
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -64,9 +64,10 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-
+export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
 echo "Using VLLM V1"
 echo "--- Hardware Information ---"
 # tpu-info
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -44,5 +44,6 @@ docker run \
    pytest -v -s v1/structured_output
    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
    pytest -v -s v1/test_metrics
    pytest -v -s v1/test_serial_utils.py
 '
--- a/.buildkite/scripts/tpu/quantized_v6e_1.env
+++ b/.buildkite/scripts/tpu/quantized_v6e_1.env
@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
 TENSOR_PARALLEL_SIZE=1
 MAX_MODEL_LEN=2048
 DOWNLOAD_DIR=/mnt/disks/persist
-EXPECTED_THROUGHPUT=8.7
+EXPECTED_THROUGHPUT=10.0
 INPUT_LEN=1800
 OUTPUT_LEN=128
--- a/.buildkite/scripts/tpu/run_bm.sh
+++ b/.buildkite/scripts/tpu/run_bm.sh
@ -42,7 +42,7 @@ echo "lanching vllm..."
 echo "logging to $VLLM_LOG"
 echo
-vllm serve $MODEL \
+VLLM_USE_V1=1 vllm serve $MODEL \
 --seed 42 \
 --max-num-seqs $MAX_NUM_SEQS \
 --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -296,7 +296,6 @@ steps:
    - tests/v1
  commands:
    # split the test to avoid interference
    - pytest -v -s -m 'not cpu_test' v1/core
    - pytest -v -s v1/executor
    - pytest -v -s v1/kv_offload
    - pytest -v -s v1/sample
@ -318,7 +317,7 @@ steps:
  no_gpu: true
  commands:
    # split the test to avoid interference
-    - pytest -v -s -m 'cpu_test' v1/core
+    - pytest -v -s v1/core
    - pytest -v -s v1/structured_output
    - pytest -v -s v1/test_serial_utils.py
    - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
@ -398,12 +397,12 @@ steps:
    - pytest -v -s compile/test_pass_manager.py
    - pytest -v -s compile/test_fusion.py
    - pytest -v -s compile/test_fusion_attn.py
    - pytest -v -s compile/test_functionalization.py
    - pytest -v -s compile/test_silu_mul_quant_fusion.py
    - pytest -v -s compile/test_sequence_parallelism.py
    - pytest -v -s compile/test_async_tp.py
    - pytest -v -s compile/test_fusion_all_reduce.py
    - pytest -v -s compile/test_decorator.py
    - pytest -v -s compile/test_noop_elimination.py
    - pytest -v -s compile/test_aot_compile.py
 - label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
@ -432,9 +431,8 @@ steps:
  source_file_dependencies:
  - csrc/
  - tests/kernels/core
  - tests/kernels/test_top_k_per_row.py
  commands:
-    - pytest -v -s kernels/core kernels/test_top_k_per_row.py
+    - pytest -v -s kernels/core
 - label: Kernels Attention Test %N # 23min
  timeout_in_minutes: 35
@ -478,7 +476,6 @@ steps:
  source_file_dependencies:
  - csrc/mamba/
  - tests/kernels/mamba
  - vllm/model_executor/layers/mamba/ops
  commands:
    - pytest -v -s kernels/mamba
@ -527,9 +524,8 @@ steps:
  # since torchao nightly is only compatible with torch nightly currently
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
-  # TODO(jerryzh168): resolve the above comment
+  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
-  - uv pip install --system torchao==0.13.0
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 - label: LM Eval Small Models # 53min
  timeout_in_minutes: 75
@ -734,16 +730,6 @@ steps:
    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
 - label: Multi-Modal Accuracy Eval (Small Models) # 50min
  timeout_in_minutes: 70
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - vllm/multimodal/
  - vllm/inputs/
  - vllm/v1/core/
  commands:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
 - label: Multi-Modal Models Test (Extended) 1
  mirror_hardwares: [amdexperimental]
  optional: true
@ -840,20 +826,18 @@ steps:
    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
-    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+    - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
    # Fusion
    - pytest -v -s tests/compile/test_fusion_all_reduce.py
    - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
    - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
    - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- label: Blackwell GPT-OSS Eval
+- label: GPT-OSS Eval (Blackwell)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
-  optional: true # run on nightlies
+  optional: true # disable while debugging
  source_file_dependencies:
  - tests/evals/gpt_oss
  - vllm/model_executor/models/gpt_oss.py
@ -880,16 +864,6 @@ steps:
  commands:
    - pytest -s -v tests/quantization/test_blackwell_moe.py
 - label: Blackwell LM Eval Small Models
  timeout_in_minutes: 120
  gpu: b200
  optional: true # run on nightlies
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
 #####  1 GPU test  #####
 #####  multi gpus test  #####
@ -1106,8 +1080,6 @@ steps:
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
    - pytest -v -s tests/compile/test_async_tp.py
    - pytest -v -s tests/compile/test_sequence_parallelism.py
    - pytest -v -s tests/distributed/test_context_parallel.py
    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
--- a/.coveragerc
+++ b/.coveragerc
@ -1,10 +1,5 @@
 [run]
-# Track the installed vllm package (this is what actually gets imported during tests)
+source = vllm
 # Use wildcard pattern to match the installed location
 source =
    vllm
    */dist-packages/vllm
    */site-packages/vllm
 omit =
    */tests/*
    */test_*
@ -17,16 +12,6 @@ omit =
    */benchmarks/*
    */docs/*
 [paths]
 # Map all possible vllm locations to a canonical "vllm" path
 # This ensures coverage.combine properly merges data from different test runs
 source =
    vllm
    /vllm-workspace/src/vllm
    /vllm-workspace/vllm
    */site-packages/vllm
    */dist-packages/vllm
 [report]
 exclude_lines =
    pragma: no cover
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@ -1,4 +0,0 @@
 # Migrate from `yapf` & `isort` to `ruff`
 d6953beb91da4e9c99be4c0a1304a2d24189535c
 # Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
 8fcaaf6a165e661f63fc51be906bc05b0767332f
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -5,7 +5,9 @@
 /vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/model_executor/layers/fused_moe @mgoin
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
@ -21,9 +23,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
 /vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
 /vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/attention @LucasWilkinson
 /vllm/v1/attention/backends/flashinfer.py @mgoin
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
@ -57,7 +59,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/v1/offloading @ApostaC
 # Transformers backend
-/vllm/model_executor/models/transformers @hmellor
+/vllm/model_executor/models/transformers.py @hmellor
 /tests/models/test_transformers.py @hmellor
 # Docs
@ -118,11 +120,3 @@ mkdocs.yaml @hmellor
 # KVConnector installation files
 /requirements/kv_connectors.txt @NickLucche
 # Pooling models
 /examples/*/pooling/ @noooop
 /tests/models/*/pooling* @noooop
 /tests/entrypoints/pooling @noooop
 /vllm/config/pooler.py @noooop
 /vllm/pooling_params.py @noooop
 /vllm/model_executor/layers/pooler.py @noooop
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -11,8 +11,6 @@ pull_request_rules:
    label:
      add:
        - documentation
    comment:
      message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
 - name: label-ci-build
  description: Automatically apply ci/build label
--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@ -13,7 +13,6 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Label issues based on keywords
        id: label-step
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0
        with:
          script: |
@ -43,6 +42,7 @@ jobs:
                    searchIn: "body"
                  },
                ],
                // Substring search - matches anywhere in text (partial matches)
                substrings: [
                  {
@ -89,12 +89,14 @@ jobs:
                    term: "hip_",
                    searchIn: "both"
                  },
                  // ROCm tools and libraries
                  {
                    term: "hipify",
                    searchIn: "both"
                  },
                ],
                // Regex patterns - for complex pattern matching
                regexPatterns: [
                  {
@ -105,17 +107,13 @@ jobs:
                  }
                ],
              },
              // Add more label configurations here as needed
              // example: {
              //   keywords: [...],
              //   substrings: [...],
              //   regexPatterns: [...]
              // },
            };
            // Helper function to create regex based on search type
            function createSearchRegex(term, type) {
              // Escape special regex characters in the term
              const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
              switch (type) {
                case 'keyword':
                  // Word boundary search - matches whole words only
@ -127,13 +125,16 @@ jobs:
                  throw new Error(`Unknown search type: ${type}`);
              }
            }
            // Helper function to find matching terms in text with line information
            function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
              const matches = [];
              const lines = text.split('\n');
              for (const termConfig of searchTerms) {
                let regex;
                let term, searchIn, pattern, description, flags;
                // Handle different input formats (string or object)
                if (typeof termConfig === 'string') {
                  term = termConfig;
@ -145,17 +146,21 @@ jobs:
                  description = termConfig.description;
                  flags = termConfig.flags;
                }
                // Skip if this term shouldn't be searched in the current location
                if (searchIn !== 'both' && searchIn !== searchLocation) {
                  continue;
                }
                // Create appropriate regex
                if (searchType === 'regex') {
                  regex = new RegExp(pattern, flags || "gi");
                } else {
                  regex = createSearchRegex(term, searchType);
                }
                const termMatches = [];
                // Check each line for matches
                lines.forEach((line, lineIndex) => {
                  const lineMatches = line.match(regex);
@ -170,14 +175,15 @@ jobs:
                        originalTerm: term || pattern,
                        description: description,
                        // Show context around the match in the line
-                        context: line.length > 100 ?
+                        context: line.length > 100 ? 
-                          line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
+                          line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), 
-                                       line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
+                                       line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' 
                          : line.trim()
                      });
                    });
                  }
                });
                if (termMatches.length > 0) {
                  matches.push({
                    term: term || (description || pattern),
@ -190,48 +196,64 @@ jobs:
                  });
                }
              }
              return matches;
            }
            // Helper function to check if label should be added
            async function processLabel(labelName, config) {
              const body = context.payload.issue.body || "";
              const title = context.payload.issue.title || "";
              core.notice(`Processing label: ${labelName}`);
              core.notice(`Issue Title: "${title}"`);
              core.notice(`Issue Body length: ${body.length} characters`);
              let shouldAddLabel = false;
              let allMatches = [];
              let reason = '';
              const keywords = config.keywords || [];
              const substrings = config.substrings || [];
              const regexPatterns = config.regexPatterns || [];
              core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
              // Search in title
              if (title.trim()) {
                core.notice(`Searching in title: "${title}"`);
                const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
                const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
                const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
                allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
              }
              // Search in body
              if (body.trim()) {
                core.notice(`Searching in body (${body.length} characters)`);
                const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
                const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
                const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
                allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
              }
              if (allMatches.length > 0) {
                core.notice(`Found ${allMatches.length} matching term(s):`);
                for (const termMatch of allMatches) {
                  const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
                  const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
                  if (termMatch.searchType === 'regex') {
                    core.notice(`  📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                  } else {
                    core.notice(`  📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                  }
                  // Show details for each match
                  termMatch.matches.forEach((match, index) => {
                    core.notice(`    ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
@ -244,6 +266,7 @@ jobs:
                    }
                  });
                }
                shouldAddLabel = true;
                const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
                const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
@ -251,10 +274,13 @@ jobs:
                const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
                const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
                const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
                reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
              }
              core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
              core.notice(`Reason: ${reason || 'No matching terms found'}`);
              if (shouldAddLabel) {
                const existingLabels = context.payload.issue.labels.map(l => l.name);
                if (!existingLabels.includes(labelName)) {
@ -270,92 +296,14 @@ jobs:
                core.notice(`Label "${labelName}" already present.`);
                return false;
              }
              core.notice(`No matching terms found for label "${labelName}".`);
              return false;
            }
            // Process all configured labels
-            const labelsAddedResults = await Promise.all(
+            const processLabels = Object.entries(labelConfig)
-              Object.entries(labelConfig).map(([labelName, config]) => 
+              .map(([labelName, config]) => processLabel(labelName, config));
-                processLabel(labelName, config).then(added => ({ labelName, added }))
+            const labelsAdded = await Promise.all(processLabels);
-              )
+            const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
-            );
+            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
            const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
            // Return which labels were added for the next step
            const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
            core.setOutput('labels_added', JSON.stringify(addedLabels));
            return addedLabels;
      - name: CC users for labeled issues
        if: steps.label-step.outputs.labels_added != '[]'
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0
        with:
          script: |
            // Configuration: Map labels to GitHub users to CC
            // You can add multiple users per label, and multiple label configurations
            const ccConfig = {
              rocm: {
                users: ['hongxiayang', 'tjtanaa', 'vllmellm'],  // Add more users as needed: ['user1', 'user2', 'user3']
                message: 'CC {users} for ROCm-related issue'  // {users} will be replaced with @mentions
              },
              // Add more label -> user mappings here
              // Example:
              // cuda: {
              //   users: ['user1', 'user2'],
              //   message: 'CC {users} for CUDA-related issue'
              // },
              // performance: {
              //   users: ['perfexpert'],
              //   message: 'CC {users} for performance issue'
              // },
            };
            const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
            core.notice(`Labels added: ${labelsAdded.join(', ')}`);
            // Get existing comments to check for already mentioned users
            const comments = await github.rest.issues.listComments({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
            });
            const issueBody = context.payload.issue.body || '';
            const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
            // Process each label that was added
            for (const label of labelsAdded) {
              if (ccConfig[label]) {
                const config = ccConfig[label];
                const usersToMention = [];
                // Check which users haven't been mentioned yet
                for (const user of config.users) {
                  const mentionPattern = new RegExp(`@${user}\\b`, 'i');
                  if (!mentionPattern.test(allExistingText)) {
                    usersToMention.push(user);
                  } else {
                    core.notice(`@${user} already mentioned for label "${label}", skipping`);
                  }
                }
                // Post comment if there are users to mention
                if (usersToMention.length > 0) {
                  const mentions = usersToMention.map(u => `@${u}`).join(' ');
                  const message = config.message.replace('{users}', mentions);
                  await github.rest.issues.createComment({
                    owner: context.repo.owner,
                    repo: context.repo.repo,
                    issue_number: context.issue.number,
                    body: message
                  });
                  core.notice(`CC comment added for label "${label}": ${mentions}`);
                } else {
                  core.notice(`All users for label "${label}" already mentioned, skipping comment`);
                }
              }
            }
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@ -13,7 +13,7 @@ jobs:
      actions: write
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
+      - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -6,19 +6,30 @@ default_stages:
  - manual # Run in CI
 exclude: 'vllm/third_party/.*'
 repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
+- repo: https://github.com/google/yapf
-  rev: v0.14.0
+  rev: v0.43.0
  hooks:
-  - id: ruff-check
+  - id: yapf
    args: [--in-place, --verbose]
    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
    exclude: '(.buildkite|benchmarks|build|examples)/.*'
 - repo: https://github.com/astral-sh/ruff-pre-commit
  rev: v0.11.7
  hooks:
  - id: ruff
    args: [--output-format, github, --fix]
  - id: ruff-format
    files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
-  rev: v1.38.1
+  rev: v1.35.5
  hooks:
  - id: typos
-    args: [--force-exclude]
+- repo: https://github.com/PyCQA/isort
  rev: 6.0.1
  hooks:
  - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v21.1.2
+  rev: v20.1.3
  hooks:
  - id: clang-format
    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
@ -35,7 +46,7 @@ repos:
  hooks:
  - id: actionlint
 - repo: https://github.com/astral-sh/uv-pre-commit
-  rev: 0.9.1
+  rev: 0.6.17
  hooks:
    - id: pip-compile
      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
@ -56,6 +67,11 @@ repos:
      types_or: [python, pyi]
      require_serial: true
      additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.9
    entry: python tools/pre_commit/mypy.py 1 "3.9"
    <<: *mypy_common
    stages: [manual] # Only run in CI
  - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.10
    entry: python tools/pre_commit/mypy.py 1 "3.10"
@ -71,11 +87,6 @@ repos:
    entry: python tools/pre_commit/mypy.py 1 "3.12"
    <<: *mypy_common
    stages: [manual] # Only run in CI
  - id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.13
    entry: python tools/pre_commit/mypy.py 1 "3.13"
    <<: *mypy_common
    stages: [manual] # Only run in CI
  - id: shellcheck
    name: Lint shell scripts
    entry: tools/shellcheck.sh
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -34,7 +34,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
 #
-set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
+set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
 # Supported AMD GPU architectures.
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
@ -1007,7 +1007,6 @@ endif()
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
    include(cmake/external_projects/flashmla.cmake)
    include(cmake/external_projects/qutlass.cmake)
    # vllm-flash-attn should be last as it overwrites some CMake functions
    include(cmake/external_projects/vllm_flash_attn.cmake)
--- a/README.md
+++ b/README.md
@ -149,7 +149,6 @@ Compute Resources:
 - Trainy
 - UC Berkeley
 - UC San Diego
 - Volcengine
 Slack Sponsor: Anyscale
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@ -74,7 +74,7 @@ start_server() {
    local vllm_log=$4
    local profile_dir=$5
-    pkill -if "vllm serve" || true
+    pkill -if vllm
    # Define the common arguments as a bash array.
    # Each argument and its value are separate elements.
@ -96,11 +96,11 @@ start_server() {
    # This correctly passes each element as a separate argument.
    if [[ -n "$profile_dir" ]]; then
        # Start server with profiling enabled
-        VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
+        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
    else
        # Start server without profiling
-        VLLM_SERVER_DEV_MODE=1 \
+        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
    fi
    local server_pid=$!
@ -139,7 +139,7 @@ run_benchmark() {
    echo "vllm_log: $vllm_log"
    echo
    rm -f $vllm_log
-    pkill -if "vllm serve" || true
+    pkill -if vllm
    echo "starting server..."
    # Call start_server without a profile_dir to avoid profiling overhead
@ -232,7 +232,7 @@ run_benchmark() {
    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
-    pkill -if "vllm serve" || true
+    pkill -if vllm
    sleep 10
    echo "===================="
    return 0
@ -308,6 +308,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
 else
    echo "No configuration met the latency requirements. Skipping final profiling run."
 fi
-pkill -if "vllm serve" || true
+pkill -if vllm
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@ -8,6 +8,7 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
 from typing import Optional, Union
 import aiohttp
 import huggingface_hub.constants
@ -27,13 +28,13 @@ class RequestFuncInput:
    prompt_len: int
    output_len: int
    model: str
-    model_name: str | None = None
+    model_name: Optional[str] = None
-    logprobs: int | None = None
+    logprobs: Optional[int] = None
-    extra_body: dict | None = None
+    extra_body: Optional[dict] = None
-    multi_modal_content: dict | list[dict] | None = None
+    multi_modal_content: Optional[dict | list[dict]] = None
    ignore_eos: bool = False
-    language: str | None = None
+    language: Optional[str] = None
-    request_id: str | None = None
+    request_id: Optional[str] = None
@dataclass
@ -51,7 +52,7 @@ class RequestFuncOutput:
 async def async_request_tgi(
    request_func_input: RequestFuncInput,
-    pbar: tqdm | None = None,
+    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")
@ -132,7 +133,7 @@ async def async_request_tgi(
 async def async_request_trt_llm(
    request_func_input: RequestFuncInput,
-    pbar: tqdm | None = None,
+    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")
@ -203,7 +204,7 @@ async def async_request_trt_llm(
 async def async_request_deepspeed_mii(
    request_func_input: RequestFuncInput,
-    pbar: tqdm | None = None,
+    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(("completions", "profile")), (
@ -266,7 +267,7 @@ async def async_request_deepspeed_mii(
 async def async_request_openai_completions(
    request_func_input: RequestFuncInput,
-    pbar: tqdm | None = None,
+    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(("completions", "profile")), (
@ -366,7 +367,7 @@ async def async_request_openai_completions(
 async def async_request_openai_chat_completions(
    request_func_input: RequestFuncInput,
-    pbar: tqdm | None = None,
+    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(("chat/completions", "profile")), (
@ -475,7 +476,7 @@ async def async_request_openai_chat_completions(
 async def async_request_openai_audio(
    request_func_input: RequestFuncInput,
-    pbar: tqdm | None = None,
+    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
    # Lazy import without PlaceholderModule to avoid vllm dep.
    import soundfile
@ -609,7 +610,7 @@ def get_tokenizer(
    tokenizer_mode: str = "auto",
    trust_remote_code: bool = False,
    **kwargs,
-) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    if pretrained_model_name_or_path is not None and not os.path.exists(
        pretrained_model_name_or_path
    ):
--- a/benchmarks/benchmark_block_pool.py
+++ b/benchmarks/benchmark_block_pool.py
@ -2,9 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
 from benchmark_utils import TimeCollector
 from tabulate import tabulate
 from benchmark_utils import TimeCollector
 from vllm.utils import FlexibleArgumentParser
 from vllm.v1.core.block_pool import BlockPool
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@ -5,9 +5,9 @@ import time
 from unittest import mock
 import numpy as np
 from benchmark_utils import TimeCollector
 from tabulate import tabulate
 from benchmark_utils import TimeCollector
 from vllm.config import (
    CacheConfig,
    DeviceConfig,
@ -164,7 +164,7 @@ def invoke_main() -> None:
    )
    parser.add_argument(
        "--batched", action="store_true", help="consider time to prepare batch"
-    )
+    )  # noqa: E501
    parser.add_argument(
        "--num-iteration",
        type=int,
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@ -32,6 +32,7 @@ import dataclasses
 import json
 import random
 import time
 from typing import Optional
 from transformers import PreTrainedTokenizerBase
@ -79,7 +80,7 @@ def sample_requests_from_dataset(
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    input_length_range: tuple[int, int],
-    fixed_output_len: int | None,
+    fixed_output_len: Optional[int],
 ) -> list[Request]:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")
@ -127,7 +128,7 @@ def sample_requests_from_random(
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    input_length_range: tuple[int, int],
-    fixed_output_len: int | None,
+    fixed_output_len: Optional[int],
    prefix_len: int,
 ) -> list[Request]:
    requests = []
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@ -7,6 +7,7 @@ import dataclasses
 import json
 import random
 import time
 from typing import Optional
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
@ -23,7 +24,7 @@ def sample_requests(
    dataset_path: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
-    fixed_output_len: int | None,
+    fixed_output_len: Optional[int],
 ) -> list[tuple[str, int, int, int]]:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@ -31,19 +31,20 @@ import time
 import uuid
 import warnings
 from collections.abc import AsyncGenerator
 from contextlib import nullcontext
 from dataclasses import dataclass
 from typing import Optional
 import datasets
 import numpy as np
 import pandas as pd
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 from backend_request_func import (
    ASYNC_REQUEST_FUNCS,
    RequestFuncInput,
    RequestFuncOutput,
 )
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 try:
    from vllm.transformers_utils.tokenizer import get_tokenizer
@ -316,7 +317,7 @@ def calculate_metrics(
    tokenizer: PreTrainedTokenizerBase,
    selected_percentile_metrics: list[str],
    selected_percentiles: list[float],
-    goodput_config_dict: dict[str, float] | None = None,
+    goodput_config_dict: Optional[dict[str, float]] = None,
 ) -> tuple[BenchmarkMetrics, list[int]]:
    actual_output_lens: list[int] = []
    total_input = 0
@ -436,9 +437,9 @@ async def benchmark(
    selected_percentile_metrics: list[str],
    selected_percentiles: list[str],
    ignore_eos: bool,
-    max_concurrency: int | None,
+    max_concurrency: Optional[int],
    structured_output_ratio: float,
-    goodput_config_dict: dict[str, float] | None = None,
+    goodput_config_dict: Optional[dict[str, float]] = None,
 ):
    if backend in ASYNC_REQUEST_FUNCS:
        request_func = ASYNC_REQUEST_FUNCS[backend]
@ -502,9 +503,15 @@ async def benchmark(
    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
-    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
+    # This can be used once the minimum Python version is 3.10 or higher,
    # and it will simplify the code in limited_request_func.
    #    semaphore = (asyncio.Semaphore(max_concurrency)
    #                 if max_concurrency else contextlib.nullcontext())
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
    async def limited_request_func(request_func_input, pbar):
        if semaphore is None:
            return await request_func(request_func_input=request_func_input, pbar=pbar)
        async with semaphore:
            return await request_func(request_func_input=request_func_input, pbar=pbar)
@ -903,13 +910,13 @@ def create_argument_parser():
    parser.add_argument(
        "--tokenizer",
        type=str,
-        help="Name or path of the tokenizer, if not using the default tokenizer.",
+        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument(
        "--tokenizer-mode",
        type=str,
        default="auto",
-        help="Name or path of the tokenizer, if not using the default tokenizer.",
+        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument(
        "--num-prompts",
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@ -6,7 +6,7 @@ import math
 import os
 import time
 from types import TracebackType
-from typing import Any
+from typing import Any, Optional, Union
 def convert_to_pytorch_benchmark_format(
@ -92,7 +92,7 @@ class TimeCollector:
    def __init__(self, scale: int) -> None:
        self.cnt: int = 0
        self._sum: int = 0
-        self._max: int | None = None
+        self._max: Optional[int] = None
        self.scale = scale
        self.start_time: int = time.monotonic_ns()
@ -104,13 +104,13 @@ class TimeCollector:
        else:
            self._max = max(self._max, v)
-    def avg(self) -> float | str:
+    def avg(self) -> Union[float, str]:
        return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
-    def max(self) -> float | str:
+    def max(self) -> Union[float, str]:
        return self._max / self.scale if self._max else "N/A"
-    def dump_avg_max(self) -> list[float | str]:
+    def dump_avg_max(self) -> list[Union[float, str]]:
        return [self.avg(), self.max()]
    def __enter__(self) -> None:
@ -118,8 +118,8 @@ class TimeCollector:
    def __exit__(
        self,
-        exc_type: type[BaseException] | None,
+        exc_type: Optional[type[BaseException]],
-        exc_value: BaseException | None,
+        exc_value: Optional[BaseException],
-        exc_traceback: TracebackType | None,
+        exc_traceback: Optional[TracebackType],
    ) -> None:
        self.collect(time.monotonic_ns() - self.start_time)
--- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@ -6,7 +6,8 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Callable, Iterable
+from collections.abc import Iterable
 from typing import Callable
 import torch
 import torch.utils.benchmark as TBenchmark
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@ -6,7 +6,8 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Callable, Iterable
+from collections.abc import Iterable
 from typing import Callable, Optional
 import torch
 import torch.utils.benchmark as TBenchmark
@ -52,7 +53,7 @@ def bench_int8(
    n: int,
    label: str,
    sub_label: str,
-    bench_kernels: list[str] | None = None,
+    bench_kernels: Optional[list[str]] = None,
 ) -> Iterable[TMeasurement]:
    """Benchmark INT8-based kernels."""
    assert dtype == torch.int8
@ -107,7 +108,7 @@ def bench_fp8(
    n: int,
    label: str,
    sub_label: str,
-    bench_kernels: list[str] | None = None,
+    bench_kernels: Optional[list[str]] = None,
 ) -> Iterable[TMeasurement]:
    """Benchmark FP8-based kernels."""
    assert dtype == torch.float8_e4m3fn
@ -182,7 +183,7 @@ def bench(
    n: int,
    label: str,
    sub_label: str,
-    bench_kernels: list[str] | None = None,
+    bench_kernels: Optional[list[str]] = None,
 ) -> Iterable[TMeasurement]:
    if dtype == torch.int8:
        return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@ -200,7 +201,7 @@ def print_timers(timers: Iterable[TMeasurement]):
 def run(
    dtype: torch.dtype,
    MKNs: Iterable[tuple[int, int, int]],
-    bench_kernels: list[str] | None = None,
+    bench_kernels: Optional[list[str]] = None,
 ) -> Iterable[TMeasurement]:
    results = []
    for m, k, n in MKNs:
--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@ -3,9 +3,10 @@
 import pickle as pkl
 import time
-from collections.abc import Callable, Iterable
+from collections.abc import Iterable
 from dataclasses import dataclass
 from itertools import product
 from typing import Callable, Optional
 import torch
 import torch.utils.benchmark as TBenchmark
@ -50,7 +51,7 @@ def get_bench_params() -> list[bench_params_t]:
 def unfused_int8_impl(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
-    residual: torch.Tensor | None,
+    residual: Optional[torch.Tensor],
    quant_dtype: torch.dtype,
 ):
    # Norm
@ -67,7 +68,7 @@ def unfused_int8_impl(
 def unfused_fp8_impl(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
-    residual: torch.Tensor | None,
+    residual: Optional[torch.Tensor],
    quant_dtype: torch.dtype,
 ):
    # Norm
@ -84,7 +85,7 @@ def unfused_fp8_impl(
 def fused_impl(
    rms_norm_layer: RMSNorm,  # this stores the weights
    x: torch.Tensor,
-    residual: torch.Tensor | None,
+    residual: Optional[torch.Tensor],
    quant_dtype: torch.dtype,
 ):
    out, _ = ops.rms_norm_dynamic_per_token_quant(
--- a/benchmarks/kernels/bench_mxfp4_qutlass.py
+++ b/benchmarks/kernels/bench_mxfp4_qutlass.py
@ -1,191 +0,0 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 #
 # Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
 # All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #       http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 import argparse
 import copy
 import itertools
 import torch
 from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
 from weight_shapes import WEIGHT_SHAPES
 from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
 from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
 from vllm.triton_utils import triton
 PROVIDER_CFGS = {
    "torch-bf16": dict(enabled=True),
    "mxfp4": dict(no_a_quant=False, enabled=True),
    "mxfp4-noquant": dict(no_a_quant=True, enabled=True),
 }
 _enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
 def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
    """Return the deterministic Hadamard matrix for ``group_size``, scaled.

    The matrix comes from ``deterministic_hadamard_matrix`` and is multiplied
    by ``group_size ** -0.5`` (presumably to normalize it - confirm).
    """
    hadamard = deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
    normalization = group_size**-0.5
    return hadamard * normalization
 def _quant_weight_mxfp4(
    b: torch.Tensor, forward_hadamard_matrix: torch.Tensor, device: str
 ):
    """Quantize weight ``b`` with ``fusedQuantizeMx`` and block its scales.

    Returns the quantized weight together with its scales after they have
    been passed through ``to_blocked`` (triton backend). ``device`` is not
    used by this helper.
    """
    packed_weight, raw_scales = fusedQuantizeMx(
        b, forward_hadamard_matrix, method="abs_max"
    )
    return packed_weight, to_blocked(raw_scales, backend="triton")
 def build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device):
    """Build a zero-argument callable that runs one MXFP4 GEMM of ``a`` and ``b``.

    The weight ``b`` is quantized once, up front. When ``cfg["no_a_quant"]``
    is true the activation ``a`` is also quantized once here and the returned
    callable performs only the matmul; otherwise the callable re-quantizes
    ``a`` on every invocation before the matmul.

    Args:
        cfg: provider config; only ``cfg["no_a_quant"]`` is read.
        a: activation tensor to quantize.
        b: weight tensor to quantize.
        forward_hadamard_matrix: matrix handed to ``fusedQuantizeMx``.
        dtype: unused; kept so existing callers keep working.
        device: device on which ``alpha`` is allocated; also forwarded to
            ``_quant_weight_mxfp4``.

    Returns:
        A callable taking no arguments that returns the result of
        ``matmul_mxf4_bf16_tn``.
    """
    weight_hf_e2m1, weight_hf_scale_block = _quant_weight_mxfp4(
        b, forward_hadamard_matrix, device
    )
    # Honor the caller's device instead of a hard-coded "cuda" so alpha is
    # allocated where the caller asked.
    alpha = torch.tensor([1.0], device=device)
    def quantize_activation():
        # Shared by both modes: quantize `a`, then block its scales.
        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
            a, forward_hadamard_matrix, method="abs_max"
        )
        return input_hf_e2m1, to_blocked(input_hf_e8m0, backend="triton")
    def matmul(input_hf_e2m1, input_hf_scale_block):
        return matmul_mxf4_bf16_tn(
            input_hf_e2m1,
            weight_hf_e2m1,
            input_hf_scale_block,
            weight_hf_scale_block,
            alpha,
        )
    if cfg["no_a_quant"]:
        # Pre-quantize the activation; the callable then runs the GEMM only.
        input_hf_e2m1, input_hf_scale_block = quantize_activation()
        def run():
            return matmul(input_hf_e2m1, input_hf_scale_block)
        return run
    # Quantize the activation on the fly, inside the returned callable.
    def run():
        return matmul(*quantize_activation())
    return run
@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[
            1, 4, 8, 16, 32, 64, 128, 256,
            512, 1024, 2048, 4096, 8192, 16384, 24576, 32768,
        ],
        x_log=False,
        line_arg="provider",
        line_vals=_enabled,
        line_names=_enabled,
        ylabel="TFLOP/s (larger is better)",
        plot_name="BF16 vs MXFP4 GEMMs",
        args={},
    )
)
def benchmark(batch_size, provider, N, K, had_size):
    """Time one (M=batch_size, N, K) GEMM for ``provider`` and report TFLOP/s.

    Random bf16 operands of shape (M, K) and (N, K) are created on "cuda".
    The "torch-bf16" provider times ``torch.nn.functional.linear``; every
    other provider times the callable from ``build_mxfp4_runner``.

    Returns a 3-tuple of TFLOP/s derived from the 0.5, 0.8 and 0.2 timing
    quantiles, in that order. The 0.8 timing comes second because a longer
    time means lower throughput (presumably perf_report reads the tuple as
    (y, y_min, y_max) - confirm against the Triton docs).
    """
    M = batch_size
    device = "cuda"
    dtype = torch.bfloat16
    a = torch.randn((M, K), device=device, dtype=dtype)
    b = torch.randn((N, K), device=device, dtype=dtype)
    forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
    if provider == "torch-bf16":
        def timed_fn():
            return torch.nn.functional.linear(a, b)
    else:
        timed_fn = build_mxfp4_runner(
            PROVIDER_CFGS[provider], a, b, forward_hadamard_matrix, dtype, device
        )
    median_ms, low_ms, high_ms = triton.testing.do_bench_cudagraph(
        timed_fn, rep=200, quantiles=[0.5, 0.2, 0.8]
    )
    def to_tflops(t_ms):
        # 2*M*N*K floating-point operations per GEMM; t_ms is in milliseconds.
        return (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
    return to_tflops(median_ms), to_tflops(high_ms), to_tflops(low_ms)
 def prepare_shapes(args):
    """Expand ``args.models`` x ``args.tp_sizes`` into per-shard GEMM shapes.

    For every (model, tp_size) pair, each ``(dims, tp_dim)`` entry of
    ``WEIGHT_SHAPES[model]`` has ``dims[tp_dim]`` floor-divided by
    ``tp_size``. The result is a list of ``[*dims, model]`` lists.
    """
    shapes = []
    for model, tp_size in itertools.product(args.models, args.tp_sizes):
        # Work on a deep copy so the shared WEIGHT_SHAPES table is not mutated.
        for dims, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
            dims[tp_dim] = dims[tp_dim] // tp_size
            shapes.append([*dims, model])
    return shapes
 if __name__ == "__main__":
    # CLI entry point: sweep every selected model/TP shape and Hadamard size.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=["meta-llama/Llama-3.3-70B-Instruct"],
        choices=list(WEIGHT_SHAPES.keys()),
    )
    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
    args = parser.parse_args()
    for K, N, model in prepare_shapes(args):
        for had_size in [32, 64, 128]:
            print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs MXFP4 GEMMs TFLOP/s:")
            benchmark.run(
                print_data=True,
                show_plots=True,
                # The report files are named after the fixed plot_name, so the
                # directory must differ per Hadamard size; otherwise each
                # had_size run overwrites the previous one's results.
                save_path=f"bench_mxfp4_res_n{N}_k{K}_had{had_size}",
                N=N,
                K=K,
                had_size=had_size,
            )
    print("Benchmark finished!")
--- a/benchmarks/kernels/bench_nvfp4_qutlass.py
+++ b/benchmarks/kernels/bench_nvfp4_qutlass.py
@ -1,207 +0,0 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 #
 # Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
 # All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #       http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 import argparse
 import copy
 import itertools
 import torch
 from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
 from weight_shapes import WEIGHT_SHAPES
 from vllm import _custom_ops as ops  # use existing nvfp4 gemm in vllm
 from vllm._custom_ops import fusedQuantizeNv
 from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
 from vllm.triton_utils import triton
 PROVIDER_CFGS = {
    "torch-bf16": dict(enabled=True),
    "nvfp4": dict(no_a_quant=False, enabled=True),
    "nvfp4-noquant": dict(no_a_quant=True, enabled=True),
 }
 _enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
 def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
    """Build the ``group_size`` deterministic Hadamard matrix and rescale it.

    The result of ``deterministic_hadamard_matrix`` is multiplied by
    ``group_size ** -0.5`` (presumably a normalization factor - confirm).
    """
    unscaled = deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
    factor = group_size**-0.5
    return unscaled * factor
 def _quant_weight_nvfp4(
    b: torch.Tensor,
    forward_hadamard_matrix: torch.Tensor,
    global_scale: torch.Tensor,
    device: str,
    M: int,
    N: int,
    K: int,
 ):
    """Quantize weight ``b`` with ``fusedQuantizeNv`` and block its scales.

    The scales are passed through ``to_blocked`` (triton backend) and then
    viewed as ``(-1, K // 16)``. ``device``, ``M`` and ``N`` are not used by
    this helper.
    """
    packed_weight, raw_scales = fusedQuantizeNv(
        b, forward_hadamard_matrix, global_scale
    )
    blocked_scales = to_blocked(raw_scales, backend="triton")
    return packed_weight, blocked_scales.view(-1, K // 16)
 def build_nvfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K):
    """Build a zero-argument callable that runs one NVFP4 GEMM of ``a`` and ``b``.

    The weight ``b`` is quantized once, up front. When ``cfg["no_a_quant"]``
    is true the activation ``a`` is also quantized once here and the returned
    callable performs only the matmul; otherwise the callable re-quantizes
    ``a`` on every invocation before the matmul.

    Args:
        cfg: provider config; only ``cfg["no_a_quant"]`` is read.
        a: activation tensor to quantize.
        b: weight tensor to quantize.
        forward_hadamard_matrix: matrix handed to ``fusedQuantizeNv``.
        dtype: unused; kept so existing callers keep working.
        device: device on which ``alpha`` and ``global_scale`` are allocated;
            also forwarded to ``_quant_weight_nvfp4``.
        M, N: forwarded to ``_quant_weight_nvfp4`` only.
        K: forwarded as well, and used to view the activation scales as
            ``(-1, K // 16)``.

    Returns:
        A callable taking no arguments that returns the result of
        ``ops.cutlass_scaled_fp4_mm`` with a ``torch.bfloat16`` output dtype.
    """
    # Honor the caller's device instead of a hard-coded "cuda" so both
    # scalars are allocated where the caller asked.
    alpha = torch.tensor([1.0], device=device)
    global_scale = torch.tensor([1.0], device=device)
    weight_hf_e2m1, weight_hf_scale_block = _quant_weight_nvfp4(
        b, forward_hadamard_matrix, global_scale, device, M, N, K
    )
    def quantize_activation():
        # Shared by both modes: quantize `a`, then block and reshape its scales.
        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
            a, forward_hadamard_matrix, global_scale
        )
        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
            -1, K // 16
        )
        return input_hf_e2m1, input_hf_scale_block
    def scaled_mm(input_hf_e2m1, input_hf_scale_block):
        return ops.cutlass_scaled_fp4_mm(
            input_hf_e2m1,
            weight_hf_e2m1,
            input_hf_scale_block,
            weight_hf_scale_block,
            alpha,
            torch.bfloat16,
        )
    if cfg["no_a_quant"]:
        # Pre-quantize the activation; the callable then runs the GEMM only.
        input_hf_e2m1, input_hf_scale_block = quantize_activation()
        def run():
            return scaled_mm(input_hf_e2m1, input_hf_scale_block)
        return run
    # Quantize the activation on the fly, inside the returned callable.
    def run():
        return scaled_mm(*quantize_activation())
    return run
@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[
            1, 4, 8, 16, 32, 64, 128, 256,
            512, 1024, 2048, 4096, 8192, 16384, 24576, 32768,
        ],
        x_log=False,
        line_arg="provider",
        line_vals=_enabled,
        line_names=_enabled,
        ylabel="TFLOP/s (larger is better)",
        plot_name="BF16 vs NVFP4 GEMMs",
        args={},
    )
)
def benchmark(batch_size, provider, N, K, had_size):
    """Time one (M=batch_size, N, K) GEMM for ``provider`` and report TFLOP/s.

    Random bf16 operands of shape (M, K) and (N, K) are created on "cuda".
    The "torch-bf16" provider times ``torch.nn.functional.linear``; every
    other provider times the callable from ``build_nvfp4_runner``.

    Returns a 3-tuple of TFLOP/s derived from the 0.5, 0.8 and 0.2 timing
    quantiles, in that order. The 0.8 timing comes second because a longer
    time means lower throughput (presumably perf_report reads the tuple as
    (y, y_min, y_max) - confirm against the Triton docs).
    """
    M = batch_size
    device = "cuda"
    dtype = torch.bfloat16
    a = torch.randn((M, K), device=device, dtype=dtype)
    b = torch.randn((N, K), device=device, dtype=dtype)
    forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
    if provider == "torch-bf16":
        def timed_fn():
            return torch.nn.functional.linear(a, b)
    else:
        timed_fn = build_nvfp4_runner(
            PROVIDER_CFGS[provider],
            a,
            b,
            forward_hadamard_matrix,
            dtype,
            device,
            M,
            N,
            K,
        )
    median_ms, low_ms, high_ms = triton.testing.do_bench_cudagraph(
        timed_fn, rep=200, quantiles=[0.5, 0.2, 0.8]
    )
    def to_tflops(t_ms):
        # 2*M*N*K floating-point operations per GEMM; t_ms is in milliseconds.
        return (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
    return to_tflops(median_ms), to_tflops(high_ms), to_tflops(low_ms)
 def prepare_shapes(args):
    """Expand ``args.models`` x ``args.tp_sizes`` into per-shard GEMM shapes.

    For every (model, tp_size) pair, each ``(dims, tp_dim)`` entry of
    ``WEIGHT_SHAPES[model]`` has ``dims[tp_dim]`` floor-divided by
    ``tp_size``. The result is a list of ``[*dims, model]`` lists.
    """
    sharded = []
    for model, tp_size in itertools.product(args.models, args.tp_sizes):
        # Work on a deep copy so the shared WEIGHT_SHAPES table is not mutated.
        for dims, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
            dims[tp_dim] = dims[tp_dim] // tp_size
            sharded.append([*dims, model])
    return sharded
 if __name__ == "__main__":
    # CLI entry point: sweep every selected model/TP shape and Hadamard size.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=["meta-llama/Llama-3.3-70B-Instruct"],
        choices=list(WEIGHT_SHAPES.keys()),
    )
    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
    args = parser.parse_args()
    for K, N, model in prepare_shapes(args):
        for had_size in [16, 32, 64, 128]:
            print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs NVFP4 GEMMs TFLOP/s:")
            benchmark.run(
                print_data=True,
                show_plots=True,
                # The report files are named after the fixed plot_name, so the
                # directory must differ per Hadamard size; otherwise each
                # had_size run overwrites the previous one's results.
                save_path=f"bench_nvfp4_res_n{N}_k{K}_had{had_size}",
                N=N,
                K=K,
                had_size=had_size,
            )
    print("Benchmark finished!")
--- a/benchmarks/kernels/bench_per_token_quant_fp8.py
+++ b/benchmarks/kernels/bench_per_token_quant_fp8.py
@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
-from collections.abc import Callable
+from typing import Callable
 from unittest.mock import patch
 import pandas as pd
--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@ -22,8 +22,8 @@ Example:
 import json
 import os
 import time
 from collections.abc import Callable
 from contextlib import nullcontext
 from typing import Callable, Optional
 import torch
 import torch.distributed as dist
@ -264,12 +264,12 @@ class CommunicatorBenchmark:
    def benchmark_allreduce_single(
        self,
        sequence_length: int,
-        allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
+        allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
        should_use_fn: Callable[[torch.Tensor], bool],
        context,
        num_warmup: int,
        num_trials: int,
-    ) -> float | None:
+    ) -> Optional[float]:
        """Benchmark method with CUDA graph optimization."""
        try:
            # Create test tensor (2D: sequence_length x hidden_size)
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@ -6,12 +6,11 @@ import copy
 import json
 import pickle
 import time
 from collections.abc import Callable
 from dataclasses import dataclass
 from enum import Enum, auto
 from itertools import product
 from pathlib import Path
-from typing import Any
+from typing import Any, Callable, Optional
 import torch
 import torch.utils.benchmark as TBenchmark
@ -159,7 +158,7 @@ def ref_group_gemm(
    seq_lens_cpu: torch.Tensor,
    prompt_lora_mapping_cpu: torch.Tensor,
    scaling: float,
-    add_inputs: bool | None,
+    add_inputs: Optional[bool],
 ):
    """
    Torch group gemm reference implementation to test correctness of
@ -317,8 +316,8 @@ class BenchmarkContext:
    lora_rank: int
    sort_by_lora_id: bool
    dtype: torch.dtype
-    seq_length: int | None = None
+    seq_length: Optional[int] = None
-    num_slices: int | None = None  # num_slices for slice based ops
+    num_slices: Optional[int] = None  # num_slices for slice based ops
    def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
        ctx = copy.copy(self)
@ -562,7 +561,7 @@ class BenchmarkTensors:
        }
    def bench_fn_kwargs(
-        self, op_type: OpType, add_inputs: bool | None = None
+        self, op_type: OpType, add_inputs: Optional[bool] = None
    ) -> dict[str, Any]:
        if op_type.is_shrink_fn():
            assert add_inputs is None
@ -576,7 +575,7 @@ class BenchmarkTensors:
        raise ValueError(f"Unrecognized optype {self}")
    def test_correctness(
-        self, op_type: OpType, expand_fn_add_inputs: bool | None
+        self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
    ) -> bool:
        """
        Test correctness of op_type implementation against a grouped gemm
@ -612,8 +611,8 @@ def bench_optype(
    ctx: BenchmarkContext,
    arg_pool_size: int,
    op_type: OpType,
-    cuda_graph_nops: int | None = None,
+    cuda_graph_nops: Optional[int] = None,
-    expand_fn_add_inputs: bool | None = None,
+    expand_fn_add_inputs: Optional[bool] = None,
    test_correctness: bool = False,
 ) -> TMeasurement:
    assert arg_pool_size >= 1
@ -680,7 +679,7 @@ def bench_torch_mm(
    ctx: BenchmarkContext,
    arg_pool_size: int,
    op_type: OpType,
-    cuda_graph_nops: int | None = None,
+    cuda_graph_nops: Optional[int] = None,
 ) -> TMeasurement:
    """
    Benchmark basic torch.mm as a roofline.
@ -745,7 +744,7 @@ def use_cuda_graph_recommendation() -> str:
            """
-def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
+def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
    compare = TBenchmark.Compare(timers)
    compare.print()
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@ -8,9 +8,10 @@ import math
 import os
 import pickle as pkl
 import time
-from collections.abc import Callable, Iterable
+from collections.abc import Iterable
 from dataclasses import dataclass
 from itertools import product
 from typing import Callable, Optional
 import pandas as pd
 import torch
@ -62,23 +63,23 @@ class BenchmarkTensors:
    a: torch.Tensor
    w_q: torch.Tensor
-    group_size: int | None
+    group_size: Optional[int]
    wtype: ScalarType
    w_g_s: torch.Tensor
-    w_g_zp: torch.Tensor | None
+    w_g_zp: Optional[torch.Tensor]
-    w_ch_s: torch.Tensor | None
+    w_ch_s: Optional[torch.Tensor]
-    w_tok_s: torch.Tensor | None
+    w_tok_s: Optional[torch.Tensor]
@dataclass
 class TypeConfig:
    act_type: torch.dtype
    weight_type: ScalarType
-    output_type: torch.dtype | None
+    output_type: Optional[torch.dtype]
-    group_scale_type: torch.dtype | None
+    group_scale_type: Optional[torch.dtype]
-    group_zero_type: torch.dtype | None
+    group_zero_type: Optional[torch.dtype]
-    channel_scale_type: torch.dtype | None
+    channel_scale_type: Optional[torch.dtype]
-    token_scale_type: torch.dtype | None
+    token_scale_type: Optional[torch.dtype]
 def rand_data(shape, dtype=torch.float16, scale=1):
@ -92,8 +93,8 @@ def quantize_and_pack(
    atype: torch.dtype,
    w: torch.Tensor,
    wtype: ScalarType,
-    stype: torch.dtype | None,
+    stype: Optional[torch.dtype],
-    group_size: int | None,
+    group_size: Optional[int],
    zero_points: bool = False,
 ):
    assert wtype.is_integer(), "TODO: support floating point weights"
@ -112,7 +113,7 @@ def quantize_and_pack(
 def create_bench_tensors(
-    shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
+    shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
 ) -> list[BenchmarkTensors]:
    m, n, k = shape
@ -330,8 +331,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
    return res
-_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
+_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
-_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None
+_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
 def bench(
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -579,12 +579,10 @@ def main(args: argparse.Namespace):
        E = config.ffn_config.moe_num_experts
        topk = config.ffn_config.moe_top_k
        intermediate_size = config.ffn_config.ffn_hidden_size
        hidden_size = config.hidden_size
    elif config.architectures[0] == "JambaForCausalLM":
        E = config.num_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.intermediate_size
        hidden_size = config.hidden_size
    elif config.architectures[0] in (
        "DeepseekV2ForCausalLM",
        "DeepseekV3ForCausalLM",
@ -594,7 +592,6 @@ def main(args: argparse.Namespace):
        E = config.n_routed_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
        hidden_size = config.hidden_size
    elif config.architectures[0] in (
        "Qwen2MoeForCausalLM",
        "Qwen3MoeForCausalLM",
@ -603,18 +600,10 @@ def main(args: argparse.Namespace):
        E = config.num_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
        hidden_size = config.hidden_size
    elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
        text_config = config.get_text_config()
        E = text_config.num_experts
        topk = text_config.num_experts_per_tok
        intermediate_size = text_config.moe_intermediate_size
        hidden_size = text_config.hidden_size
    elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
        E = config.num_experts
        topk = config.moe_topk[0]
        intermediate_size = config.moe_intermediate_size[0]
        hidden_size = config.hidden_size
    else:
        # Support for llama4
        config = config.get_text_config()
@ -622,7 +611,6 @@ def main(args: argparse.Namespace):
        E = config.num_local_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.intermediate_size
        hidden_size = config.hidden_size
    enable_ep = bool(args.enable_expert_parallel)
    if enable_ep:
        ensure_divisibility(E, args.tp_size, "Number of experts")
@ -631,7 +619,8 @@ def main(args: argparse.Namespace):
    else:
        ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
+    hidden_size = config.hidden_size
    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
    block_quant_shape = get_weight_block_size_safety(config)
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
        topk = config.num_experts_per_tok
    hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
    use_customized_permute = args.use_customized_permute
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@ -3,6 +3,7 @@
 import random
 import time
 from typing import Optional
 import torch
@ -36,7 +37,7 @@ def main(
    seed: int,
    do_profile: bool,
    device: str = "cuda",
-    kv_cache_dtype: str | None = None,
+    kv_cache_dtype: Optional[str] = None,
 ) -> None:
    current_platform.seed_everything(seed)
--- a/benchmarks/kernels/benchmark_per_token_group_quant.py
+++ b/benchmarks/kernels/benchmark_per_token_group_quant.py
@ -3,8 +3,8 @@
 import argparse
 import math
 from collections.abc import Callable
 from contextlib import contextmanager
 from typing import Callable
 from unittest.mock import patch
 import torch
--- a/benchmarks/kernels/benchmark_reshape_and_cache.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache.py
@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 import random
 import time
--- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 import random
 import time
--- a/benchmarks/kernels/benchmark_rmsnorm.py
+++ b/benchmarks/kernels/benchmark_rmsnorm.py
@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
 from typing import Optional, Union
 import torch
 from flashinfer.norm import fused_add_rmsnorm, rmsnorm
@ -20,8 +21,8 @@ class HuggingFaceRMSNorm(nn.Module):
    def forward(
        self,
        x: torch.Tensor,
-        residual: torch.Tensor | None = None,
+        residual: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        orig_dtype = x.dtype
        x = x.to(torch.float32)
        if residual is not None:
@ -40,7 +41,7 @@ class HuggingFaceRMSNorm(nn.Module):
 def rmsnorm_naive(
    x: torch.Tensor,
    weight: torch.Tensor,
-    residual: torch.Tensor | None = None,
+    residual: Optional[torch.Tensor] = None,
    eps: float = 1e-6,
 ):
    naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
@ -64,7 +65,7 @@ def rmsnorm_naive(
 def rmsnorm_flashinfer(
    x: torch.Tensor,
    weight: torch.Tensor,
-    residual: torch.Tensor | None = None,
+    residual: Optional[torch.Tensor] = None,
    eps: float = 1e-6,
 ):
    orig_shape = x.shape
@ -88,7 +89,7 @@ def rmsnorm_flashinfer(
 def rmsnorm_vllm(
    x: torch.Tensor,
    weight: torch.Tensor,
-    residual: torch.Tensor | None = None,
+    residual: Optional[torch.Tensor] = None,
    eps: float = 1e-6,
 ):
    orig_shape = x.shape
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from itertools import accumulate
 from typing import Optional
 import nvtx
 import torch
@ -17,7 +18,7 @@ def benchmark_rope_kernels_multi_lora(
    seq_len: int,
    num_heads: int,
    head_size: int,
-    rotary_dim: int | None,
+    rotary_dim: Optional[int],
    dtype: torch.dtype,
    seed: int,
    device: str,
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@ -1,19 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Comprehensive 3-way SiLU Benchmark Suite
 This benchmark compares three SiLU implementations:
 1. SiLU V2 (CUDA) - Optimized CUDA kernel implementation
 2. Triton Kernel - Triton-based implementation
 The suite generates detailed performance comparisons including:
 - Memory bandwidth utilization
 - Speedup ratios (baseline vs optimized implementations)
 - Performance across different expert configurations and token distributions
 """
 from collections.abc import Callable
 import matplotlib.pyplot as plt
@ -21,7 +7,7 @@ import numpy as np
 import torch
 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
-    persistent_masked_m_silu_mul_quant,
+    silu_mul_fp8_quant_deep_gemm_cuda,
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
@ -108,7 +94,6 @@ def silu_mul_fp8_quant_deep_gemm_triton(
    num_parallel_tokens,
    group_size: int = 128,
    eps: float = 1e-10,
    expert_offsets: torch.Tensor = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
    """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
@ -189,7 +174,7 @@ def silu_mul_fp8_quant_deep_gemm_triton(
 # Parse generation strategies
-strategies = ["random_imbalanced", "uniform", "max_t"]
+strategies = ["uniform", "max_t", "first_t"]
 def benchmark(
@ -210,27 +195,15 @@ def benchmark(
        current_platform.seed_everything(42 + seed_offset)
        y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
-        if gen_strategy == "random_imbalanced":
+        if gen_strategy == "uniform":
-
+            r = torch.rand(size=(E,), device="cuda")
            def generate_expert_loads(n_e, total_tokens, ratio, device="cuda"):
                mean = total_tokens // n_e
                min_max = mean // ratio
                e = torch.ones(size=(E,), dtype=torch.int64, device=device) * mean
                e[0] = min_max
                r = torch.rand(size=(E - 1,))
                r /= r.sum()
                r *= total_tokens - min_max
                r = r.round().long()
                e[1:] = r.to(device=device)
                return e
            tokens_per_expert = generate_expert_loads(E, total_tokens, 0.7, "cuda")
        elif gen_strategy == "uniform":
            r = torch.rand(size=(E,))
            r /= r.sum()
            r *= total_tokens
-            r = r.round().long()
+            tokens_per_expert = r.int()
-            tokens_per_expert = r
+            tokens_per_expert = torch.minimum(
                tokens_per_expert,
                torch.ones((E,), device=r.device, dtype=torch.int) * T,
            )
        elif gen_strategy == "max_t":
            tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
            tokens_per_expert.fill_(total_tokens / E)
@ -308,34 +281,40 @@ def benchmark(
 def create_comparison_plot(
-    ratios, silu_v2_times, triton_times, config_labels, strategy_name, id
+    ratio, cuda_times, baseline_times, config_labels, strategy_name, id
 ):
-    fig, ax = plt.subplots(1, 1, figsize=(18, 6))
+    """Create a comparison plot for a specific generation strategy"""
    fig, ax = plt.subplots(1, 1, figsize=(16, 6))
    # Configure x-axis positions
    x = np.arange(len(config_labels))
-    width = 0.25
+    width = 0.35
    # Execution Time plot (lower is better)
    ax.bar(x, silu_v2_times, width, label="SiLU V2 (CUDA)", alpha=0.8, color="blue")
    ax.bar(
-        x + width, triton_times, width, label="Triton Kernel", alpha=0.8, color="green"
+        x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue"
    )
    ax.bar(
        x + width / 2,
        baseline_times,
        width,
        label="Baseline",
        alpha=0.8,
        color="orange",
    )
-    # Add speedup labels over each bar trio
+    # Add speedup labels over each bar pair
    for i in range(len(x)):
-        triton_v2_speedup = ratios[i][1]  # triton/v2
+        speedup = ratio[i]
-        max_height = max(silu_v2_times[i], triton_times[i])
+        max_height = max(cuda_times[i], baseline_times[i])
        # Triton/V2 speedup
        ax.text(
-            x[i] + width / 2,
+            x[i],
            max_height + max_height * 0.02,
-            f"{triton_v2_speedup:.2f}x",
+            f"{speedup:.2f}x",
            ha="center",
            va="bottom",
            fontweight="bold",
-            fontsize=8,
+            fontsize=9,
        )
    ax.set_xlabel("Configuration")
@ -353,75 +332,56 @@ def create_comparison_plot(
 def create_combined_plot(all_results):
    """Create a combined plot with all strategies in one PNG"""
    num_strategies = len(all_results)
-    fig, axes = plt.subplots(num_strategies, 1, figsize=(22, 7 * num_strategies))
+    fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies))
    if num_strategies == 1:
        axes = [axes]
    for idx, (
        strategy_name,
-        all_ratios,
+        ratio,
-        all_silu_v2_results,
+        cuda_times,
-        all_triton_results,
+        baseline_times,
        config_labels,
        config_x_axis,
    ) in enumerate(all_results):
        ax = axes[idx]
        # Flatten the nested results to get bandwidth percentages for plotting
        silu_v2_bandwidths = []
        triton_bandwidths = []
        flat_ratios = []
        for config_results in all_silu_v2_results:
            for result in config_results:
                silu_v2_bandwidths.append(result[3])  # bandwidth percentage
        for config_results in all_triton_results:
            for result in config_results:
                triton_bandwidths.append(result[3])  # bandwidth percentage
        for config_ratios in all_ratios:
            for ratio in config_ratios:
                flat_ratios.append(ratio)
        # Configure x-axis positions
        x = np.arange(len(config_labels))
-        width = 0.25
+        width = 0.35
-        # Bandwidth utilization plot (higher is better)
+        # Execution Time plot (lower is better)
        ax.bar(
-            x,
+            x - width / 2,
-            silu_v2_bandwidths,
+            cuda_times,
            width,
-            label="SiLU V2 (CUDA)",
+            label="CUDA Kernel",
            alpha=0.8,
            color="blue",
        )
        ax.bar(
-            x + width,
+            x + width / 2,
-            triton_bandwidths,
+            baseline_times,
            width,
-            label="Triton Kernel",
+            label="Baseline",
            alpha=0.8,
-            color="green",
+            color="orange",
        )
-        # Add speedup labels over each bar trio
+        # Add speedup labels over each bar pair
        for i in range(len(x)):
-            triton_v2_speedup = flat_ratios[i]  # triton/v2
+            speedup = ratio[i]
-            max_height = max(silu_v2_bandwidths[i], triton_bandwidths[i])
+            max_height = max(cuda_times[i], baseline_times[i])
            # Triton/V2 speedup
            ax.text(
-                x[i] + width / 2,
+                x[i],
                max_height + max_height * 0.02,
-                f"{triton_v2_speedup:.2f}x",
+                f"{speedup:.2f}x",
                ha="center",
                va="bottom",
                fontweight="bold",
-                fontsize=8,
+                fontsize=9,
            )
        ax.set_xlabel("Configuration")
@ -435,7 +395,7 @@ def create_combined_plot(all_results):
        ax.grid(True, alpha=0.3)
    plt.tight_layout()
-    filename = "silu_benchmark_combined_3way.png"
+    filename = "../../silu_bench/silu_benchmark_combined.png"
    plt.savefig(filename, dpi=300, bbox_inches="tight")
    plt.show()
@ -445,9 +405,7 @@ def create_combined_plot(all_results):
 outer_dim = 7168
 configs = [
    # DeepSeekV3 Configs
    # (1, 56, 7168),
    (8, 1024, 7168),
    # (32, 56, 7168),
    # DeepSeekV3 Configs
    (32, 1024, 7168),
    # DeepSeekV3 Configs
@ -459,7 +417,6 @@ num_warmups = 20
 strategy_descriptions = {
    "uniform": "Uniform Random",
    "random_imbalanced": "Imbalanced Random",
    "max_t": "Even Assignment",
    "first_t": "experts[0] = T, experts[1:] = 0",
 }
@ -476,31 +433,28 @@ for id, strategy in enumerate(strategies):
    print(f"Testing strategy: {strategy_descriptions[strategy]}")
    print(f"{'=' * 60}")
-    # Collect benchmark data for all three algorithms
+    # Collect benchmark data for both algorithms
    config_labels = []
    config_x_axis = []
-    all_silu_v2_results = []
+    all_cuda_results = []
-    all_triton_results = []
+    all_baseline_results = []
    all_ratios = []
    for E, T, H in configs:
-        total_tokens_config = []
+        total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E]
        for i in [8, 16, 32, 64, 128, 256, 512]:
            if i <= T:
                total_tokens_config.append(i * E)
        config_x_axis.append(total_tokens_config)
-        silu_v2_results = []
+        cuda_results = []
-        triton_results = []
+        baseline_results = []
        ratios = []
        for total_tokens in total_tokens_config:
            config_label = f"E={E},T={T},H={H},TT={total_tokens}"
            config_labels.append(config_label)
-            # SiLU V2 (CUDA kernel) results
+            # CUDA kernel results
-            time_ms_silu_v2, gflops, gbps, perc = benchmark(
+            time_ms_cuda, gflops, gbps, perc = benchmark(
-                persistent_masked_m_silu_mul_quant,
+                silu_mul_fp8_quant_deep_gemm_cuda,
                E,
                T,
                H,
@ -509,9 +463,9 @@ for id, strategy in enumerate(strategies):
                num_warmups=num_warmups,
                gen_strategy=strategy,
            )
-            silu_v2_results.append((time_ms_silu_v2, gflops, gbps, perc))
+            cuda_results.append((time_ms_cuda, gflops, gbps, perc))
-            # Triton kernel results
+            # Baseline results
            time_ms_triton, gflops, gbps, perc = benchmark(
                silu_mul_fp8_quant_deep_gemm_triton,
                E,
@ -522,20 +476,12 @@ for id, strategy in enumerate(strategies):
                num_warmups=num_warmups,
                gen_strategy=strategy,
            )
-            triton_results.append((time_ms_triton, gflops, gbps, perc))
+            baseline_results.append((time_ms_triton, gflops, gbps, perc))
            ratios.append(time_ms_triton / time_ms_cuda)
-            # Calculate speedup ratios (triton baseline / implementation)
+            print(f"Completed: {config_label}")
-            triton_v2_ratio = time_ms_triton / time_ms_silu_v2
+        all_cuda_results.append(cuda_results)
-            ratios.append(triton_v2_ratio)
+        all_baseline_results.append(baseline_results)
            print(
                f"Completed: {config_label}:"
                f" V2: {time_ms_silu_v2:.3f}ms,"
                f" Triton: {time_ms_triton:.3f}ms"
            )
        all_silu_v2_results.append(silu_v2_results)
        all_triton_results.append(triton_results)
        all_ratios.append(ratios)
    # Store results for combined plotting
@ -543,8 +489,8 @@ for id, strategy in enumerate(strategies):
        (
            strategy_descriptions[strategy],
            all_ratios,
-            all_silu_v2_results,
+            all_cuda_results,
-            all_triton_results,
+            all_baseline_results,
            config_labels,
            config_x_axis,
        )
@ -552,18 +498,15 @@ for id, strategy in enumerate(strategies):
    # Print summary table for this strategy
    print(f"\nSummary Table - {strategy_descriptions[strategy]}:")
-    print(f" {'V2 Time(ms)':<12} {'Triton Time(ms)':<14} {'Triton/V2':<10}")
+    print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}")
-    print("-" * 90)
+    print("-" * 60)
    for i, (E, T, H) in enumerate(configs):
-        # Get the first result for each config (simplifying for summary)
+        speedup = baseline_results[i][0] / cuda_results[i][0]
        v2_time = silu_v2_results[i][0]
        triton_time = triton_results[i][0]
        triton_v2_speedup = triton_time / v2_time
        config_label = f"E={E:3d},T={T:4d},H={H:4d}"
        print(
-            f"{config_label:<20} {v2_time:8.5f} {triton_time:10.5f} "
+            f"{config_label:<20} {cuda_results[i][0]:8.5f} "
-            f"{triton_v2_speedup:8.2f}x"
+            f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x"
        )
@ -571,14 +514,15 @@ def create_total_tokens_plot(all_results):
    num_strategies = len(all_results)
    num_configs = len(configs)
    # Create side-by-side subplots: 2 columns for speedup and bandwidth percentage
    fig, axs = plt.subplots(
-        num_strategies, num_configs * 2, figsize=(32, 8 * num_strategies)
+        num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies)
    )
    # Add main title to the entire figure
    fig.suptitle(
-        "Performance Analysis: Speedup vs Bandwidth Utilization (SiLU V2, and Triton)",
+        "Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)",
-        fontsize=18,
+        fontsize=16,
        fontweight="bold",
        y=0.98,
    )
@ -595,8 +539,8 @@ def create_total_tokens_plot(all_results):
        (
            strategy_name,
            all_ratios,
-            all_silu_v2_results,
+            all_cuda_results,
-            all_triton_results,
+            all_baseline_results,
            config_labels,
            config_x_axis,
        ) = result
@ -611,54 +555,42 @@ def create_total_tokens_plot(all_results):
            ratios = all_ratios[config_idx]
            total_tokens_values = config_x_axis[config_idx]
-            # Extract speedup ratios
+            # Extract CUDA and Triton bandwidth percentages
-            triton_v2_ratios = [ratio for ratio in ratios]
+            cuda_bandwidth_percentages = [
-
+                result[3] for result in all_cuda_results[config_idx]
            # Extract bandwidth percentages for all implementations
            v2_bandwidth_percentages = [
                result[3] for result in all_silu_v2_results[config_idx]
            ]
            triton_bandwidth_percentages = [
-                result[3] for result in all_triton_results[config_idx]
+                result[3] for result in all_baseline_results[config_idx]
            ]
            # Plot speedup ratios vs total tokens (left plot)
            ax_speedup.plot(
-                total_tokens_values,
+                total_tokens_values, ratios, "bo-", linewidth=3, markersize=8
                triton_v2_ratios,
                "go-",
                linewidth=3,
                markersize=8,
                label="Triton/V2 Speedup",
            )
            ax_speedup.set_title(
-                f"{strategy_name}\nSpeedup vs Baseline (Triton)\nE={E}, T={T}, H={H}",
+                f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}",
                fontsize=12,
                fontweight="bold",
            )
            ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
            ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11)
            ax_speedup.legend(prop={"weight": "bold"})
            ax_speedup.grid(True, alpha=0.3)
            # Plot bandwidth utilization (right plot)
            ax_bandwidth.plot(
                total_tokens_values,
-                v2_bandwidth_percentages,
+                cuda_bandwidth_percentages,
-                "o-",
+                "ro-",
                linewidth=3,
                markersize=8,
-                label="SiLU V2",
+                label="CUDA",
                color="blue",
            )
            ax_bandwidth.plot(
                total_tokens_values,
                triton_bandwidth_percentages,
-                "o-",
+                "go-",
                linewidth=3,
                markersize=8,
                label="Triton",
                color="green",
            )
            ax_bandwidth.set_title(
                f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}",
@ -686,12 +618,38 @@ def create_total_tokens_plot(all_results):
                for label in ax.get_xticklabels() + ax.get_yticklabels():
                    label.set_fontweight("bold")
-            # Add value labels on Triton/V2 speedup points
+            # Add value labels on speedup points
-            for x, y in zip(total_tokens_values, triton_v2_ratios):
+            for x, y in zip(total_tokens_values, ratios):
                ax_speedup.annotate(
                    f"{y:.2f}x",
                    (x, y),
                    textcoords="offset points",
                    xytext=(0, 12),
                    ha="center",
                    fontsize=10,
                    fontweight="bold",
                    bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7),
                )
            # Add value labels on CUDA bandwidth points
            for x, y in zip(total_tokens_values, cuda_bandwidth_percentages):
                ax_bandwidth.annotate(
                    f"{y:.1f}%",
                    (x, y),
                    textcoords="offset points",
                    xytext=(0, 12),
                    ha="center",
                    fontsize=9,
                    fontweight="bold",
                    bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3),
                )
            # Add value labels on Triton bandwidth points
            for x, y in zip(total_tokens_values, triton_bandwidth_percentages):
                ax_bandwidth.annotate(
                    f"{y:.1f}%",
                    (x, y),
                    textcoords="offset points",
                    xytext=(0, -15),
                    ha="center",
                    fontsize=9,
@ -701,20 +659,17 @@ def create_total_tokens_plot(all_results):
    plt.tight_layout()
    plt.subplots_adjust(top=0.93)  # Make room for main title
-    filename = "silu_benchmark_total_tokens_3way.png"
+    filename = "silu_benchmark_total_tokens.png"
    plt.savefig(filename, dpi=300, bbox_inches="tight")
    plt.show()
    return filename
-# Create comprehensive 3-way comparison plots
+# Create combined plot with all strategies
-combined_plot_filename = create_combined_plot(all_results)
+combined_plot_filename = create_total_tokens_plot(all_results)
 total_tokens_plot_filename = create_total_tokens_plot(all_results)
-print(f"\n{'=' * 80}")
+print(f"\n{'=' * 60}")
-print("3-Way Benchmark Suite Complete!")
+print("Benchmark Complete!")
-print(f"Generated combined comparison plot: {combined_plot_filename}")
+print(f"Generated combined plot: {combined_plot_filename}")
-print(f"Generated total tokens analysis plot: {total_tokens_plot_filename}")
+print(f"{'=' * 60}")
 print("Compared: SiLU V2 (CUDA), and Triton implementations")
 print(f"{'=' * 80}")
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@ -4,6 +4,7 @@
 import csv
 import os
 from datetime import datetime
 from typing import Optional
 import flashinfer
 import torch
@ -27,7 +28,9 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad()
 def benchmark_decode(
    dtype: torch.dtype,
-    quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
+    quant_dtypes: tuple[
        Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
    ],
    batch_size: int,
    max_seq_len: int,
    num_heads: tuple[int, int] = (64, 8),
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@ -4,6 +4,7 @@
 import csv
 import os
 from datetime import datetime
 from typing import Optional
 import flashinfer
 import torch
@ -27,7 +28,9 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad()
 def benchmark_prefill(
    dtype: torch.dtype,
-    quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
+    quant_dtypes: tuple[
        Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
    ],
    batch_size: int,
    max_seq_len: int,
    num_heads: tuple[int, int] = (64, 8),
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@ -14,7 +14,7 @@ import torch
 from tqdm import tqdm
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    _w8a8_triton_block_scaled_mm,
+    _w8a8_block_fp8_matmul,
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
@ -83,7 +83,7 @@ def w8a8_block_matmul(
        )
    if A.dtype == torch.float8_e4m3fn:
-        kernel = _w8a8_triton_block_scaled_mm
+        kernel = _w8a8_block_fp8_matmul
    else:
        raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.")
--- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # fmt: off
 # ruff: noqa: E501
 import time
@ -19,21 +20,19 @@ from vllm.utils.deep_gemm import (
 )
-def benchmark_shape(
+def benchmark_shape(m: int,
-    m: int,
+                    n: int,
-    n: int,
+                    k: int,
-    k: int,
+                    warmup: int = 100,
-    warmup: int = 100,
+                    repeat: int = 10000,
-    repeat: int = 10000,
+                    verbose: bool = False) -> dict:
    verbose: bool = False,
 ) -> dict:
    """Benchmark all implementations for a specific (m, n, k) shape."""
    if verbose:
        print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")
    # Create test tensors
-    A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
+    A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16)
-    B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
+    B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16)
    # Reference result in BF16
    torch.cuda.synchronize()
@ -50,39 +49,34 @@ def benchmark_shape(
    # Pre-quantize A for all implementations
    A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
    A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
-    C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
+    C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
    A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
    A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
-        A, block_size[1], column_major_scales=True
+        A, block_size[1], column_major_scales=True)
    )
    # === DeepGEMM Implementation ===
    def deepgemm_gemm():
-        fp8_gemm_nt(
+        fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
-            (A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm
+                                       (B_deepgemm, B_scale_deepgemm),
-        )
+                                       C_deepgemm)
        return C_deepgemm
    # === vLLM Triton Implementation ===
    def vllm_triton_gemm():
-        return w8a8_triton_block_scaled_mm(
+        return w8a8_triton_block_scaled_mm(A_vllm,
-            A_vllm,
+                                     B_vllm,
-            B_vllm,
+                                     A_scale_vllm,
-            A_scale_vllm,
+                                     B_scale_vllm,
-            B_scale_vllm,
+                                     block_size,
-            block_size,
+                                     output_dtype=torch.bfloat16)
            output_dtype=torch.bfloat16,
        )
    # === vLLM CUTLASS Implementation ===
    def vllm_cutlass_gemm():
-        return ops.cutlass_scaled_mm(
+        return ops.cutlass_scaled_mm(A_vllm_cutlass,
-            A_vllm_cutlass,
+                                     B_vllm.T,
-            B_vllm.T,
+                                     scale_a=A_scale_vllm_cutlass,
-            scale_a=A_scale_vllm_cutlass,
+                                     scale_b=B_scale_vllm.T,
-            scale_b=B_scale_vllm.T,
+                                     out_dtype=torch.bfloat16)
            out_dtype=torch.bfloat16,
        )
    # Run correctness check first
    if verbose:
@ -99,23 +93,26 @@ def benchmark_shape(
        print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
        print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
        print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
-        print(
+        print("vLLM Triton vs DeepGEMM difference: "
-            "vLLM Triton vs DeepGEMM difference: "
+              f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}")
-            f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}"
+        print("vLLM CUTLASS vs DeepGEMM difference: "
-        )
+              f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}")
        print(
            "vLLM CUTLASS vs DeepGEMM difference: "
            f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}"
        )
    # Benchmark implementations
    implementations = {
        "DeepGEMM": deepgemm_gemm,
        "vLLM Triton": vllm_triton_gemm,
-        "vLLM CUTLASS": vllm_cutlass_gemm,
+        "vLLM CUTLASS": vllm_cutlass_gemm
    }
-    benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}}
+    benchmark_results = {
        "shape": {
            "m": m,
            "n": n,
            "k": k
        },
        "implementations": {}
    }
    for name, func in implementations.items():
        # Warmup
@ -143,36 +140,38 @@ def benchmark_shape(
            "tflops": tflops,
            "gb_s": gb_s,
            "diff": {
-                "DeepGEMM": 0.0
+                "DeepGEMM":
-                if name == "DeepGEMM"
+                0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm),
-                else calc_diff(func(), C_deepgemm),
+                "Reference":
-                "Reference": deepgemm_diff
+                deepgemm_diff if name == "DeepGEMM" else
-                if name == "DeepGEMM"
+                (vllm_triton_diff
-                else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff),
+                 if name == "vLLM Triton" else vllm_cutlass_diff)
-            },
+            }
        }
        if verbose:
-            print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s")
+            print(
                f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s"
            )
    # Calculate speedups
    baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
    for name, data in benchmark_results["implementations"].items():
        if name != "DeepGEMM":
            speedup = baseline / data["time_ms"]
-            benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup
+            benchmark_results["implementations"][name][
                "speedup_vs_deepgemm"] = speedup
            if verbose:
-                print(
+                print(f"DeepGEMM is {1/speedup:.2f}x "
-                    f"DeepGEMM is {1 / speedup:.2f}x "
+                      f"{'faster' if 1/speedup > 1 else 'slower'} than {name}")
                    f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}"
                )
-    vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"]
+    vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][
-    vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"]
+        "time_ms"]
    vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][
        "time_ms"]
    cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
-    benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = (
+    benchmark_results["implementations"]["vLLM CUTLASS"][
-        cutlass_vs_triton
+        "speedup_vs_triton"] = cutlass_vs_triton
    )
    if verbose:
        print(
            f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
@ -184,7 +183,8 @@ def benchmark_shape(
 def format_table_row(values, widths):
    """Format a row with specified column widths."""
-    return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |"
+    return "| " + " | ".join(f"{val:{w}}"
                             for val, w in zip(values, widths)) + " |"
 def print_table(headers, rows, title=None):
@ -292,50 +292,38 @@ def run_benchmarks(verbose: bool = False):
    for result in all_results:
        shape = result["shape"]
        impl_data = result["implementations"]["DeepGEMM"]
-        deepgemm_rows.append(
+        deepgemm_rows.append([
-            [
+            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
-                shape["m"],
+            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}"
-                shape["n"],
+        ])
                shape["k"],
                f"{impl_data['time_us']:.1f}",
                f"{impl_data['tflops']:.1f}",
                f"{impl_data['gb_s']:.1f}",
            ]
        )
-    print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:")
+    print_table(deepgemm_headers,
                deepgemm_rows,
                title="DeepGEMM Implementation:")
    # Print vLLM Triton table
-    triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"]
+    triton_headers = [
        "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"
    ]
    triton_rows = []
    for result in all_results:
        shape = result["shape"]
        impl_data = result["implementations"]["vLLM Triton"]
        speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
-        triton_rows.append(
+        triton_rows.append([
-            [
+            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
-                shape["m"],
+            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
-                shape["n"],
+            format_speedup(speedup)
-                shape["k"],
+        ])
                f"{impl_data['time_us']:.1f}",
                f"{impl_data['tflops']:.1f}",
                f"{impl_data['gb_s']:.1f}",
                format_speedup(speedup),
            ]
        )
-    print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:")
+    print_table(triton_headers,
                triton_rows,
                title="vLLM Triton Implementation:")
    # Print vLLM CUTLASS table
    cutlass_headers = [
-        "m",
+        "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM",
-        "n",
+        "vs Triton"
        "k",
        "Time (μs)",
        "TFLOPS",
        "GB/s",
        "vs DeepGEMM",
        "vs Triton",
    ]
    cutlass_rows = []
    for result in all_results:
@ -343,27 +331,28 @@ def run_benchmarks(verbose: bool = False):
        impl_data = result["implementations"]["vLLM CUTLASS"]
        vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
        vs_triton = impl_data.get("speedup_vs_triton", 1.0)
-        cutlass_rows.append(
+        cutlass_rows.append([
-            [
+            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
-                shape["m"],
+            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
-                shape["n"],
+            format_speedup(vs_deepgemm),
-                shape["k"],
+            format_speedup(vs_triton)
-                f"{impl_data['time_us']:.1f}",
+        ])
                f"{impl_data['tflops']:.1f}",
                f"{impl_data['gb_s']:.1f}",
                format_speedup(vs_deepgemm),
                format_speedup(vs_triton),
            ]
        )
-    print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:")
+    print_table(cutlass_headers,
                cutlass_rows,
                title="vLLM CUTLASS Implementation:")
    # Calculate and print averages
    print("\n===== AVERAGE PERFORMANCE =====")
    implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
    avg_metrics = {
-        impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations
+        impl: {
            "tflops": 0,
            "gb_s": 0,
            "time_ms": 0
        }
        for impl in implementations
    }
    for result in all_results:
@ -381,9 +370,9 @@ def run_benchmarks(verbose: bool = False):
        avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
        avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
        avg_time = avg_metrics[impl]["time_ms"] / num_shapes
-        avg_rows.append(
+        avg_rows.append([
-            [impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"]
+            impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"
-        )
+        ])
    print_table(avg_headers, avg_rows)
@ -391,19 +380,21 @@ def run_benchmarks(verbose: bool = False):
    avg_speedups = {
        "DeepGEMM vs vLLM Triton": 0,
        "DeepGEMM vs vLLM CUTLASS": 0,
-        "vLLM CUTLASS vs vLLM Triton": 0,
+        "vLLM CUTLASS vs vLLM Triton": 0
    }
    for result in all_results:
        deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
        vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
-        vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"]
+        vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][
            "time_ms"]
-        avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
+        avg_speedups[
-        avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
+            "DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
-        avg_speedups["vLLM CUTLASS vs vLLM Triton"] += (
+        avg_speedups[
-            vllm_triton_time / vllm_cutlass_time
+            "DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
-        )
+        avg_speedups[
            "vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time
    print("\n===== AVERAGE SPEEDUPS =====")
    speedup_headers = ["Comparison", "Speedup"]
@ -421,7 +412,8 @@ def run_benchmarks(verbose: bool = False):
    for result in all_results:
        for impl in implementations:
-            avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"]
+            avg_diff[impl] += result["implementations"][impl]["diff"][
                "Reference"]
    diff_headers = ["Implementation", "Avg Diff vs Reference"]
    diff_rows = []
--- a/benchmarks/kernels/utils.py
+++ b/benchmarks/kernels/utils.py
@ -2,8 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import dataclasses
-from collections.abc import Callable, Iterable
+from collections.abc import Iterable
-from typing import Any
+from typing import Any, Callable, Optional
 import torch
 import torch.utils.benchmark as TBenchmark
@ -55,7 +55,7 @@ class Bench:
    def __init__(
        self,
-        cuda_graph_params: CudaGraphBenchParams | None,
+        cuda_graph_params: Optional[CudaGraphBenchParams],
        label: str,
        sub_label: str,
        description: str,
--- a/benchmarks/multi_turn/bench_dataset.py
+++ b/benchmarks/multi_turn/bench_dataset.py
@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from statistics import mean
-from typing import Any, NamedTuple
+from typing import Any, NamedTuple, Optional, Union
 import numpy as np  # type: ignore
 import pandas as pd  # type: ignore
@ -35,8 +35,8 @@ class Distribution(ABC):
 class UniformDistribution(Distribution):
    def __init__(
        self,
-        min_val: int | float,
+        min_val: Union[int, float],
-        max_val: int | float,
+        max_val: Union[int, float],
        is_integer: bool = True,
    ) -> None:
        self.min_val = min_val
@ -56,7 +56,7 @@ class UniformDistribution(Distribution):
 class ConstantDistribution(Distribution):
-    def __init__(self, value: int | float) -> None:
+    def __init__(self, value: Union[int, float]) -> None:
        self.value = value
        self.max_val = value
@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):
 class ZipfDistribution(Distribution):
-    def __init__(self, alpha: float, max_val: int | None = None) -> None:
+    def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
        self.alpha = alpha
        self.max_val = max_val
@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):
 class PoissonDistribution(Distribution):
-    def __init__(self, alpha: float, max_val: int | None = None) -> None:
+    def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
        self.alpha = alpha
        self.max_val = max_val
@ -100,11 +100,11 @@ class PoissonDistribution(Distribution):
 class LognormalDistribution(Distribution):
    def __init__(
        self,
-        mean: float | None = None,
+        mean: Optional[float] = None,
-        sigma: float | None = None,
+        sigma: Optional[float] = None,
-        average: int | None = None,
+        average: Optional[int] = None,
-        median_ratio: float | None = None,
+        median_ratio: Optional[float] = None,
-        max_val: int | None = None,
+        max_val: Optional[int] = None,
    ) -> None:
        self.average = average
        self.median_ratio = median_ratio
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@ -13,7 +13,7 @@ from datetime import datetime
 from enum import Enum
 from http import HTTPStatus
 from statistics import mean
-from typing import NamedTuple
+from typing import NamedTuple, Optional, Union
 import aiohttp  # type: ignore
 import numpy as np  # type: ignore
@ -46,9 +46,9 @@ class ConversationSampling(str, Enum):
 class ClientArgs(NamedTuple):
    seed: int
-    max_num_requests: int | None
+    max_num_requests: Optional[int]
    skip_first_turn: bool
-    max_turns: int | None
+    max_turns: Optional[int]
    max_active_conversations: int
    verbose: bool
    print_content: bool
@ -109,9 +109,9 @@ class RequestStats(NamedTuple):
 class MetricStats:
    def __init__(self) -> None:
-        self.min: float | None = None
+        self.min: Optional[float] = None
-        self.max: float | None = None
+        self.max: Optional[float] = None
-        self.avg: float | None = None
+        self.avg: Optional[float] = None
        self.sum = 0.0
        self.count = 0
@ -143,7 +143,7 @@ class MovingAverage:
        self.index = 0
        self.sum = 0.0
        self.count = 0
-        self.avg: float | None = None
+        self.avg: Optional[float] = None
    def update(self, new_value: float) -> None:
        if self.count < self.window_size:
@ -169,7 +169,7 @@ class MovingAverage:
 class DebugStats:
    def __init__(self, logger: logging.Logger, window_size: int) -> None:
        self.logger = logger
-        self.metrics: dict[str, MovingAverage | MetricStats] = {
+        self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
            "moving_avg_ttft_ms": MovingAverage(window_size),
            "moving_avg_tpot_ms": MovingAverage(window_size),
            "ttft_ms": MetricStats(),
@ -198,6 +198,14 @@ class DebugStats:
        self.logger.info("-" * 50)
 # Must support Python 3.8, we can't use str.removeprefix(prefix)
 # introduced in Python 3.9
 def remove_prefix(text: str, prefix: str) -> str:
    if text.startswith(prefix):
        return text[len(prefix) :]
    return text
 def nanosec_to_millisec(value: float) -> float:
    return value / 1000000.0
@ -212,8 +220,8 @@ async def send_request(
    chat_url: str,
    model: str,
    stream: bool = True,
-    min_tokens: int | None = None,
+    min_tokens: Optional[int] = None,
-    max_tokens: int | None = None,
+    max_tokens: Optional[int] = None,
 ) -> ServerResponse:
    payload = {
        "model": model,
@ -242,9 +250,9 @@ async def send_request(
    timeout = aiohttp.ClientTimeout(total=timeout_sec)
    valid_response = True
-    ttft: float | None = None
+    ttft: Optional[float] = None
    chunk_delay: list[int] = []
-    latency: float | None = None
+    latency: Optional[float] = None
    first_chunk = ""
    generated_text = ""
@ -261,7 +269,7 @@ async def send_request(
                if not chunk_bytes:
                    continue
-                chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
+                chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
                if chunk == "[DONE]":
                    # End of stream
                    latency = time.perf_counter_ns() - start_time
@ -356,7 +364,7 @@ async def send_turn(
    req_args: RequestArgs,
    verbose: bool,
    verify_output: bool,
-) -> RequestStats | None:
+) -> Optional[RequestStats]:
    assert messages_to_use > 0
    assert messages_to_use <= len(conversation_messages)
@ -636,7 +644,7 @@ async def client_main(
            if args.verbose:
                curr_time_sec: float = time.perf_counter()
-                time_since_last_turn: str | float = "N/A"
+                time_since_last_turn: Union[str, float] = "N/A"
                if conv_id in time_of_last_turn:
                    time_since_last_turn = round(
                        curr_time_sec - time_of_last_turn[conv_id], 3
@ -761,7 +769,7 @@ def get_client_config(
            "Number of conversations must be equal or larger than the number of clients"
        )
-    max_req_per_client: int | None = None
+    max_req_per_client: Optional[int] = None
    if args.max_num_requests is not None:
        # Max number of requests per client
        req_per_client = args.max_num_requests // args.num_clients
@ -928,13 +936,13 @@ async def main_mp(
                    f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}"  # noqa: E501
                )
-                rps: str | float = round(len(client_metrics) / runtime_sec, 3)
+                rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
                if len(client_metrics) < (5 * bench_args.num_clients):
                    # Do not estimate the RPS if the number of samples is very low
                    # (threshold can be tuned if needed)
                    rps = "N/A"
-                runtime_left_sec: str | float = round(
+                runtime_left_sec: Union[str, float] = round(
                    (runtime_sec / finished_convs) * (total_convs - finished_convs), 3
                )
                if percent < 0.05:
@ -1024,7 +1032,7 @@ def process_statistics(
    warmup_percentages: list[float],
    test_params: dict,
    verbose: bool,
-    gen_conv_args: GenConvArgs | None = None,
+    gen_conv_args: Optional[GenConvArgs] = None,
    excel_output: bool = False,
 ) -> None:
    if len(client_metrics) == 0:
--- a/benchmarks/multi_turn/convert_sharegpt_to_openai.py
+++ b/benchmarks/multi_turn/convert_sharegpt_to_openai.py
@ -13,7 +13,7 @@ import argparse
 import json
 import random
 from statistics import mean
-from typing import Any
+from typing import Any, Optional
 import pandas as pd  # type: ignore
 import tqdm  # type: ignore
@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:
 def content_is_valid(
-    content: str, min_content_len: int | None, max_content_len: int | None
+    content: str, min_content_len: Optional[int], max_content_len: Optional[int]
 ) -> bool:
    if min_content_len and len(content) < min_content_len:
        return False
@ -37,7 +37,7 @@ def content_is_valid(
 def print_stats(
-    conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
+    conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
 ) -> None:
    # Collect statistics
    stats = []
@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
    seed: int,
    input_file: str,
    output_file: str,
-    max_items: int | None,
+    max_items: Optional[int],
-    min_content_len: int | None = None,
+    min_content_len: Optional[int] = None,
-    max_content_len: int | None = None,
+    max_content_len: Optional[int] = None,
-    min_turns: int | None = None,
+    min_turns: Optional[int] = None,
-    max_turns: int | None = None,
+    max_turns: Optional[int] = None,
-    model: str | None = None,
+    model: Optional[str] = None,
 ) -> None:
    if min_turns and max_turns:
        assert min_turns <= max_turns
--- a/benchmarks/pyproject.toml
+++ b/benchmarks/pyproject.toml
@ -0,0 +1,49 @@
 # This local pyproject file is part of the migration from yapf to ruff format.
 # It uses the same core rules as the main pyproject.toml file, but with the
 # following differences:
 # - ruff line length is overridden to 88
 # - deprecated typing ignores (UP006, UP035) have been removed
 [tool.ruff]
 line-length = 88
 [tool.ruff.lint.per-file-ignores]
 "vllm/third_party/**" = ["ALL"]
 "vllm/version.py" = ["F401"]
 "vllm/_version.py" = ["ALL"]
 [tool.ruff.lint]
 select = [
    # pycodestyle
    "E",
    # Pyflakes
    "F",
    # pyupgrade
    "UP",
    # flake8-bugbear
    "B",
    # flake8-simplify
    "SIM",
    # isort
    "I",
    # flake8-logging-format
    "G",
 ]
 ignore = [
    # star imports
    "F405", "F403",
    # lambda expression assignment
    "E731",
    # Loop control variable not used within loop body
    "B007",
    # f-string format
    "UP032",
    # Can remove once 3.10+ is the minimum Python version
    "UP007",
 ]
 [tool.ruff.lint.isort]
 known-first-party = ["vllm"]
 [tool.ruff.format]
 docstring-code-format = true
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@ -198,24 +198,13 @@ else()
 endif()
 if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
-    set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
+    FetchContent_Declare(
-
+        oneDNN
-    if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
+        GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-        message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}")
+        GIT_TAG v3.9
-        FetchContent_Declare(
+        GIT_PROGRESS TRUE
-            oneDNN
+        GIT_SHALLOW TRUE
-            SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN}
+    )
        )
    else()
        message(STATUS "Downloading oneDNN from GitHub")
        FetchContent_Declare(
            oneDNN
            GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
            GIT_TAG v3.9
            GIT_PROGRESS TRUE
            GIT_SHALLOW TRUE
        )
    endif()
    if(USE_ACL)
        find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
@ -224,7 +213,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
        endif()
        set(ONEDNN_AARCH64_USE_ACL "ON")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
        add_compile_definitions(VLLM_USE_ACL)
    endif()
    set(ONEDNN_LIBRARY_TYPE "STATIC")
@ -320,4 +308,4 @@ define_gpu_extension_target(
    WITH_SOABI
 )
-message(STATUS "Enabling C extension.")
+message(STATUS "Enabling C extension.")
--- a/cmake/external_projects/qutlass.cmake
+++ b/cmake/external_projects/qutlass.cmake
@ -1,97 +0,0 @@
 include(FetchContent)
 set(CUTLASS_INCLUDE_DIR "${CUTLASS_INCLUDE_DIR}" CACHE PATH "Path to CUTLASS include/ directory")
 if(DEFINED ENV{QUTLASS_SRC_DIR})
  set(QUTLASS_SRC_DIR $ENV{QUTLASS_SRC_DIR})
 endif()
 if(QUTLASS_SRC_DIR)
  FetchContent_Declare(
    qutlass
    SOURCE_DIR ${QUTLASS_SRC_DIR}
    CONFIGURE_COMMAND ""
    BUILD_COMMAND ""
  )
 else()
  FetchContent_Declare(
    qutlass
    GIT_REPOSITORY https://github.com/IST-DASLab/qutlass.git
    GIT_TAG 830d2c4537c7396e14a02a46fbddd18b5d107c65
    GIT_PROGRESS TRUE
    CONFIGURE_COMMAND ""
    BUILD_COMMAND ""
  )
 endif()
 FetchContent_Populate(qutlass)
 if(NOT qutlass_SOURCE_DIR)
  message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
 endif()
 message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}")
 cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a" "${CUDA_ARCHS}")
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND QUTLASS_ARCHS)
  if(QUTLASS_ARCHS MATCHES "10\\.0a")
    set(QUTLASS_TARGET_CC 100)
  elseif(QUTLASS_ARCHS MATCHES "12\\.0a")
    set(QUTLASS_TARGET_CC 120)
  else()
    message(FATAL_ERROR "[QUTLASS] internal error parsing CUDA_ARCHS='${QUTLASS_ARCHS}'.")
  endif()
  set(QUTLASS_SOURCES
    ${qutlass_SOURCE_DIR}/qutlass/csrc/bindings.cpp
    ${qutlass_SOURCE_DIR}/qutlass/csrc/gemm.cu
    ${qutlass_SOURCE_DIR}/qutlass/csrc/gemm_ada.cu
    ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx.cu
    ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv.cu
    ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx_sm100.cu
    ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv_sm100.cu
  )
  set(QUTLASS_INCLUDES
    ${qutlass_SOURCE_DIR}
    ${qutlass_SOURCE_DIR}/qutlass
    ${qutlass_SOURCE_DIR}/qutlass/csrc/include
    ${qutlass_SOURCE_DIR}/qutlass/csrc/include/cutlass_extensions
  )
  if(CUTLASS_INCLUDE_DIR AND EXISTS "${CUTLASS_INCLUDE_DIR}/cutlass/cutlass.h")
    list(APPEND QUTLASS_INCLUDES "${CUTLASS_INCLUDE_DIR}")
  elseif(EXISTS "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include/cutlass/cutlass.h")
    list(APPEND QUTLASS_INCLUDES "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include")
    message(STATUS "[QUTLASS] Using QuTLASS vendored CUTLASS headers (no vLLM CUTLASS detected).")
  else()
    message(FATAL_ERROR "[QUTLASS] CUTLASS headers not found. "
                        "Set -DCUTLASS_INCLUDE_DIR=/path/to/cutlass/include")
  endif()
  set_gencode_flags_for_srcs(
    SRCS "${QUTLASS_SOURCES}"
    CUDA_ARCHS "${QUTLASS_ARCHS}"
  )
  target_sources(_C PRIVATE ${QUTLASS_SOURCES})
  target_include_directories(_C PRIVATE ${QUTLASS_INCLUDES})
  target_compile_definitions(_C PRIVATE
    QUTLASS_DISABLE_PYBIND=1
    TARGET_CUDA_ARCH=${QUTLASS_TARGET_CC}
  )
  set_property(SOURCE ${QUTLASS_SOURCES} APPEND PROPERTY COMPILE_OPTIONS
    $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr --use_fast_math -O3>
  )
 else()
  if("${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS "12.8")
    message(STATUS
      "[QUTLASS] Skipping build: CUDA 12.8 or newer is required (found ${CMAKE_CUDA_COMPILER_VERSION}).")
  else()
    message(STATUS
      "[QUTLASS] Skipping build: no supported arch (12.0a / 10.0a) found in "
      "CUDA_ARCHS='${CUDA_ARCHS}'.")
  endif()
 endif()
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 8f468e7da54a8e2f98abfa7c38636aac91c0cba1
+          GIT_TAG 4695e6bed5366c41e28c06cd86170166e4f43d00
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
--- a/cmake/hipify.py
+++ b/cmake/hipify.py
@ -16,7 +16,7 @@ import shutil
 from torch.utils.hipify.hipify_python import hipify
-if __name__ == "__main__":
+if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Project directory where all the source + include files live.
@ -34,14 +34,15 @@ if __name__ == "__main__":
    )
    # Source files to convert.
-    parser.add_argument(
+    parser.add_argument("sources",
-        "sources", help="Source files to hipify.", nargs="*", default=[]
+                        help="Source files to hipify.",
-    )
+                        nargs="*",
                        default=[])
    args = parser.parse_args()
    # Limit include scope to project_dir only
-    includes = [os.path.join(args.project_dir, "*")]
+    includes = [os.path.join(args.project_dir, '*')]
    # Get absolute path for all source files.
    extra_files = [os.path.abspath(s) for s in args.sources]
@ -50,31 +51,25 @@ if __name__ == "__main__":
    # The directory might already exist to hold object files so we ignore that.
    shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
-    hipify_result = hipify(
+    hipify_result = hipify(project_directory=args.project_dir,
-        project_directory=args.project_dir,
+                           output_directory=args.output_dir,
-        output_directory=args.output_dir,
+                           header_include_dirs=[],
-        header_include_dirs=[],
+                           includes=includes,
-        includes=includes,
+                           extra_files=extra_files,
-        extra_files=extra_files,
+                           show_detailed=True,
-        show_detailed=True,
+                           is_pytorch_extension=True,
-        is_pytorch_extension=True,
+                           hipify_extra_files_only=True)
        hipify_extra_files_only=True,
    )
    hipified_sources = []
    for source in args.sources:
        s_abs = os.path.abspath(source)
-        hipified_s_abs = (
+        hipified_s_abs = (hipify_result[s_abs].hipified_path if
-            hipify_result[s_abs].hipified_path
+                          (s_abs in hipify_result
-            if (
+                           and hipify_result[s_abs].hipified_path is not None)
-                s_abs in hipify_result
+                          else s_abs)
                and hipify_result[s_abs].hipified_path is not None
            )
            else s_abs
        )
        hipified_sources.append(hipified_s_abs)
-    assert len(hipified_sources) == len(args.sources)
+    assert (len(hipified_sources) == len(args.sources))
    # Print hipified source files.
    print("\n".join(hipified_sources))
--- a/codecov.yml
+++ b/codecov.yml
@ -1,12 +0,0 @@
 codecov:
  require_ci_to_pass: false
 fixes:
  # Map source code paths to repository root paths
  # Wildcards match any Python version (python3.*)
  - "/vllm-workspace/src/vllm/::vllm/"
  - "/vllm-workspace/vllm/::vllm/"
  - "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/"
  - "/usr/local/lib/python3.*/site-packages/vllm/::vllm/"
  - "/usr/lib/python3.*/dist-packages/vllm/::vllm/"
  - "/usr/lib/python3.*/site-packages/vllm/::vllm/"
--- a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
@ -125,37 +125,32 @@ public:
  }
  static void set_split_kv (KernelArguments& args) {
    // printf("set_split_kv start");
    if (args.split_kv >= 1) return;
    auto [H, K, D, B] = args.problem_shape;
    // std::cout << H << " " << K << " " << D << " " << B << "\n";      
    int sm_count = args.hw_info.sm_count;
-    float seq_length_k = static_cast<float>(K) / 1024.0f;
+    // printf("    sm_count = %d\n", sm_count);
-    int max_splits = 1;
+    int max_splits = ceil_div(K, 128);
    max_splits = min(16, max_splits);
-    if (B <= 4 && seq_length_k >= 16) {
+    // TODO: This avoids a hang when the batch size larger than 1 and 
-      max_splits = 16;
+    // there is more than 1 kv_splits. 
    // Discuss with NVIDIA how this can be fixed.
    if (B > 1) {
      max_splits = min(1, max_splits);
    }
-    else if (B <= 8 && seq_length_k >= 4) {
+    
-      max_splits = 8;
+    // printf("    max_splits = %d\n", max_splits);
    }
    else if ((B <= 16 && seq_length_k >= 8) ||
             (B == 48 && seq_length_k >= 32)) {
      max_splits = 4;
    }
    else if ((B <= 32 && seq_length_k >= 16) ||
             (B == 96 && seq_length_k >= 16)) {
      max_splits = 2;
    }
    else {
      max_splits = 1;
    }
    // Wave-aware scheduling: ensure integer number of waves in K dimension
    int sms_per_batch = max(1, sm_count / B);
    // printf("    sms_per_batch = %d\n", sms_per_batch);
    int split_heur = min(max_splits, sms_per_batch);
    int waves = ceil_div(B * split_heur, sm_count);
    int k_waves = ceil_div(max_splits, split_heur);
    int split_wave_aware = ceil_div(max_splits, k_waves);
    args.split_kv = split_wave_aware;
    // printf("    args.split_kv = %d\n", args.split_kv);
  }
  /// Determines whether the GEMM can execute the given problem.
--- a/csrc/cache.h
+++ b/csrc/cache.h
@ -64,11 +64,3 @@ void indexer_k_quant_and_cache(
    torch::Tensor& slot_mapping,  // [num_tokens]
    int64_t quant_block_size,     // quantization block size
    const std::string& scale_fmt);
 // Extract function to gather quantized K cache
 void cp_gather_indexer_k_quant_cache(
    const torch::Tensor& kv_cache,  // [num_blocks, block_size, cache_stride]
    torch::Tensor& dst_k,           // [num_tokens, head_dim]
    torch::Tensor& dst_scale,  // [num_tokens, head_dim / quant_block_size * 4]
    const torch::Tensor& block_table,   // [batch_size, num_blocks]
    const torch::Tensor& cu_seq_lens);  // [batch_size + 1]
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@ -572,70 +572,6 @@ __global__ void indexer_k_quant_and_cache_kernel(
  }
 }
 template <int BLOCK_Y_SIZE>
 __global__ void cp_gather_indexer_k_quant_cache_kernel(
    const char* __restrict__ kv_cache,  // [num_blocks, block_size,
                                        // cache_stride]
    char* __restrict__ dst_k,           // [num_tokens, head_dim]
    char* __restrict__ dst_scale,  // [num_tokens, head_dim / quant_block_size *
                                   // 4]
    const int* __restrict__ block_table,  // [batch_size, num_blocks]
    const int* __restrict__ cu_seq_lens,  // [batch_size + 1]
    const int batch_size,                 // batch size
    const int64_t token_stride,           // stride for each token in dst_k
    const int64_t head_dim,               // dimension of each head
    const int64_t block_stride,           // stride for each block in kv_cache
    const int64_t cache_token_stride,     // stride for each token in kv_cache
    const int64_t cache_block_size,  // num_tokens for each block in kv_cache
    const int num_blocks,            // number of blocks
    const int num_tokens,            // number of tokens
    const int quant_block_size       // quantization block size
 ) {
  constexpr int VEC_SIZE = sizeof(float4) / sizeof(char);
  const int token_idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int head_idx = (blockIdx.y * blockDim.x + threadIdx.x) * VEC_SIZE;
  // Find batch index within a block
  __shared__ int batch_idx[BLOCK_Y_SIZE];
  for (int iter = 0; iter < cuda_utils::ceil_div(batch_size, int(blockDim.x));
       iter++) {
    int tid = iter * blockDim.x + threadIdx.x;
    if (tid < batch_size) {
      const int seq_start = cu_seq_lens[tid];
      const int seq_end = cu_seq_lens[tid + 1];
      if (token_idx >= seq_start && token_idx < seq_end) {
        batch_idx[threadIdx.y] = tid;
      }
    }
  }
 #ifndef USE_ROCM
  __syncwarp();
 #endif
  if (head_idx >= head_dim || token_idx >= num_tokens) {
    return;
  }
  const int inbatch_seq_idx = token_idx - cu_seq_lens[batch_idx[threadIdx.y]];
  const int block_idx = block_table[batch_idx[threadIdx.y] * num_blocks +
                                    inbatch_seq_idx / cache_block_size];
  const int64_t src_block_offset = block_idx * block_stride;
  const int64_t cache_inblock_offset =
      (inbatch_seq_idx % cache_block_size) * head_dim + head_idx;
  const int64_t src_inblock_offset = src_block_offset + cache_inblock_offset;
  const int64_t dst_inblock_offset = token_idx * token_stride + head_idx;
  reinterpret_cast<float4*>(dst_k)[dst_inblock_offset / VEC_SIZE] =
      reinterpret_cast<const float4*>(kv_cache)[src_inblock_offset / VEC_SIZE];
  ;
  if (threadIdx.x == 0) {
    const int64_t src_scale_offset =
        src_block_offset + cache_block_size * head_dim +
        cache_inblock_offset * 4 / quant_block_size;
    reinterpret_cast<float*>(dst_scale)[dst_inblock_offset / quant_block_size] =
        reinterpret_cast<const float*>(kv_cache)[src_scale_offset / 4];
  }
 }
 }  // namespace vllm
 // KV_T is the data type of key and value tensors.
@ -1237,59 +1173,3 @@ void indexer_k_quant_and_cache(
  DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3",
                             CALL_INDEXER_K_QUANT_AND_CACHE);
 }
 // Macro to dispatch the kernel based on the data amount.
 #define CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(BLOCK_Y_SIZE)                  \
  vllm::cp_gather_indexer_k_quant_cache_kernel<BLOCK_Y_SIZE>                \
      <<<dim3((num_tokens + BLOCK_Y_SIZE - 1) / BLOCK_Y_SIZE,               \
              (head_dim + 8 * vec_size - 1) / (8 * vec_size)),              \
         dim3(8, BLOCK_Y_SIZE), 0, stream>>>(                               \
          reinterpret_cast<char*>(kv_cache.data_ptr()),                     \
          reinterpret_cast<char*>(dst_k.data_ptr()),                        \
          reinterpret_cast<char*>(dst_scale.data_ptr()),                    \
          block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
          batch_size, dst_k.stride(0), dst_k.size(1), kv_cache.stride(0),   \
          kv_cache.stride(1), kv_cache.size(1), block_table.size(1),        \
          num_tokens, quant_block_size);
 void cp_gather_indexer_k_quant_cache(
    const torch::Tensor& kv_cache,  // [num_blocks, block_size, cache_stride]
    torch::Tensor& dst_k,           // [num_tokens, head_dim]
    torch::Tensor& dst_scale,  // [num_tokens, head_dim / quant_block_size * 4]
    const torch::Tensor& block_table,  // [batch_size, num_blocks]
    const torch::Tensor& cu_seq_lens   // [batch_size + 1]
 ) {
  int batch_size = block_table.size(0);
  int num_tokens = dst_k.size(0);
  int head_dim = dst_k.size(1);
  int quant_block_size = head_dim * 4 / dst_scale.size(1);
  TORCH_CHECK(kv_cache.device() == dst_k.device(),
              "kv_cache and dst_k must be on the same device");
  TORCH_CHECK(kv_cache.device() == dst_scale.device(),
              "kv_cache and dst_scale must be on the same device");
  TORCH_CHECK(kv_cache.device() == block_table.device(),
              "kv_cache and block_table must be on the same device");
  TORCH_CHECK(kv_cache.device() == cu_seq_lens.device(),
              "kv_cache and cu_seq_lens must be on the same device");
  TORCH_CHECK(head_dim % quant_block_size == 0,
              "head_dim must be divisible by quant_block_size");
  constexpr int vec_size = 16;
  const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_cache));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  if (num_tokens < 32) {
    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(1);
  } else if (num_tokens < 64) {
    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(2);
  } else if (num_tokens < 128) {
    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(4);
  } else if (num_tokens < 256) {
    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(8);
  } else if (num_tokens < 512) {
    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(16);
  } else {
    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
  }
 }
--- a/csrc/core/batch_invariant.hpp
+++ b/csrc/core/batch_invariant.hpp
@ -5,15 +5,12 @@
 namespace vllm {
-// vllm_is_batch_invariant(); returns true
+// vllm_kernel_override_batch_invariant(); returns true
-// if env VLLM_BATCH_INVARIANT=1
+// if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
-inline bool vllm_is_batch_invariant() {
+inline bool vllm_kernel_override_batch_invariant() {
-  static bool cached = []() {
+  std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
-    std::string env_key = "VLLM_BATCH_INVARIANT";
+  const char* val = std::getenv(env_key.c_str());
-    const char* val = std::getenv(env_key.c_str());
+  return (val && std::atoi(val) != 0) ? 1 : 0;
    return (val && std::atoi(val) != 0) ? 1 : 0;
  }();
  return cached;
 }
 }  // namespace vllm
--- a/csrc/cpu/dnnl_helper.cpp
+++ b/csrc/cpu/dnnl_helper.cpp
@ -137,8 +137,9 @@ DNNLMatMulPrimitiveHandler::DNNLMatMulPrimitiveHandler(
 }
 void DNNLMatMulPrimitiveHandler::prepack_weight(
-    void* original_b_ptr, dnnl::memory::desc original_b_md,
+    void* original_b_ptr, dnnl::memory::desc b_target_mem_desc) {
-    dnnl::memory::desc b_target_mem_desc) {
+  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
                                   {b_k_stride_, b_n_stride_});
  dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr);
  dnnl::memory packed_weight(b_target_mem_desc, default_engine());
  {
@ -249,9 +250,7 @@ W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
  if (a_qs_ == QuantizationStrategy::PER_TOKEN) {
    assert(!use_azp_);
  };
-  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
+  prepack_weight(args.b_ptr,
                                   {b_k_stride_, b_n_stride_});
  prepack_weight(args.b_ptr, original_b_md,
                 create_primitive_desc(
                     MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
                                   .use_bias = false,
@ -413,25 +412,12 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
  assert(ab_type_ == dnnl::memory::data_type::f32 ||
         ab_type_ == dnnl::memory::data_type::bf16 ||
         ab_type_ == dnnl::memory::data_type::f16);
-
+  prepack_weight(args.b_ptr,
  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
                                   {b_k_stride_, b_n_stride_});
  prepack_weight(args.b_ptr, original_b_md,
                 create_primitive_desc(
-                     MSizeCacheKey{
+                     MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
-#ifdef VLLM_USE_ACL
+                                   .a_m_stride = DNNL_RUNTIME_DIM_VAL,
-                         // Arm Compute Library (ACL) backend for oneDNN does
+                                   .use_bias = false,
-                         // not support runtime
+                                   .bias_type = dnnl::memory::data_type::undef},
                         // dimensions, so we set M to a default value
                         .a_m_size = 128,
                         .a_m_stride = b_k_size_,
 #else
                         .a_m_size = DNNL_RUNTIME_DIM_VAL,
                         .a_m_stride = DNNL_RUNTIME_DIM_VAL,
 #endif
                         .use_bias = false,
                         .bias_type = dnnl::memory::data_type::undef},
                     true)
                     .weights_desc());
  init_runtime_memory_cache(args);
@ -457,30 +443,12 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) {
  c_storage->set_data_handle((void*)args.c_ptr);
  c_mem_desc->dims[0] = args.a_m_size;
 #ifndef VLLM_USE_ACL
  // We do not support in ACL backend of oneDNN, we handle bias by:
  // 1. copying it into the result tensor
  // 2. attaching a fused-sum post-op to the matmul primitive
  if (args.use_bias) {
    auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2);
    bias_storage->set_data_handle((void*)args.bias_ptr);
  }
 #endif
  dnnl::matmul matmul = get_matmul_cache(args);
-// With ACL backend of oneDNN, the required memory format might change when the
+  dnnl::matmul matmul = get_matmul_cache(args);
 // source tensor dims change. This does not really happen in practice, so isn't
 // a performance hit, but we need to support it because the API allows for it.
 #ifdef VLLM_USE_ACL
  auto new_expected_wei_desc =
      dnnl::matmul::primitive_desc(
          const_cast<dnnl_primitive_desc_t>(matmul.get_primitive_desc()))
          .weights_desc();
  if (new_expected_wei_desc != b_target_mem_desc_) {
    prepack_weight(memory_cache_[DNNL_ARG_WEIGHTS].get_data_handle(),
                   b_target_mem_desc_, new_expected_wei_desc);
  }
 #endif
  auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
  scratchpad_storage->set_data_handle(
@ -516,13 +484,7 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
  } else {
    a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
                              {key.a_m_stride, 1});
 #ifdef VLLM_USE_ACL
    // ACL's backend of oneDNN always expects the weight format to be "any"
    b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_,
                              dnnl::memory::format_tag::any);
 #else
    b_md = b_target_mem_desc_;
 #endif
  }
  dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
                          dnnl::memory::format_tag::ab);
@ -532,18 +494,8 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
  if (key.use_bias) {
    dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
 // Since ACL's matmuls don't support passing a bias_md, we apply the bias
 // through a fused-sum post-op
 #ifdef VLLM_USE_ACL
    dnnl::post_ops post_ops;
    post_ops.append_sum();
    attr.set_post_ops(post_ops);
    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
                                        attr);
 #else
    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
                                        c_md, attr);
 #endif
  } else {
    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
                                        attr);
@ -559,23 +511,13 @@ void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
                   default_engine(), nullptr);
  set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());
 // ACL matmuls don't support bias_md, so we don't need these
 #ifndef VLLM_USE_ACL
  memory_cache_[DNNL_ARG_BIAS] =
      dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                   default_engine(), nullptr);
  set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get());
-#endif
+
  memory_cache_[DNNL_ARG_SCRATCHPAD] =
      dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                   default_engine(), nullptr);
  set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
 }
 // Reports whether this translation unit was compiled with VLLM_USE_ACL
 // defined. The answer is fixed at compile time by the preprocessor; nothing
 // is probed at runtime.
 bool is_onednn_acl_supported() {
 #ifdef VLLM_USE_ACL
  return true;
 #else
  return false;
 #endif
 }
--- a/csrc/cpu/dnnl_helper.h
+++ b/csrc/cpu/dnnl_helper.h
@ -101,7 +101,7 @@ class DNNLMatMulPrimitiveHandler {
 protected:
  DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type);
-  void prepack_weight(void* original_b_ptr, dnnl::memory::desc original_b_md,
+  void prepack_weight(void* original_b_ptr,
                      dnnl::memory::desc b_target_mem_desc);
  void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr);
--- a/csrc/cpu/dnnl_kernels.cpp
+++ b/csrc/cpu/dnnl_kernels.cpp
@ -527,42 +527,21 @@ void onednn_mm(torch::Tensor& c,        // [M, OC], row-major
  MatMulPrimitiveHandler* ptr =
      reinterpret_cast<MatMulPrimitiveHandler*>(handler);
 // ACL matmuls expect contiguous source tensors
 #ifdef VLLM_USE_ACL
  torch::Tensor a_contig = a.contiguous();
 #endif
  MatMulPrimitiveHandler::ExecArgs exec_args;
 #ifdef VLLM_USE_ACL
  exec_args.a_m_size = a_contig.size(0);
  exec_args.a_m_stride = a_contig.stride(0);
 #else
  exec_args.a_m_size = a.size(0);
  exec_args.a_m_stride = a.stride(0);
-#endif
+
  VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] {
    if (bias.has_value()) {
      exec_args.use_bias = true;
      exec_args.bias_type = get_dnnl_type<scalar_t>();
 #ifdef VLLM_USE_ACL
      // ACL matmuls in oneDNN do not support a bias.
      // We handle a matmul with bias by doing: c = bias; c += matmul(a, b)
      c.copy_(bias.value());
 #else
      exec_args.bias_ptr = bias->data_ptr<scalar_t>();
 #endif
    } else {
      exec_args.use_bias = false;
      exec_args.bias_type = get_dnnl_type<void>();
      exec_args.bias_ptr = nullptr;
    }
 #ifdef VLLM_USE_ACL
    exec_args.a_ptr = a_contig.data_ptr<scalar_t>();
 #else
    exec_args.a_ptr = a.data_ptr<scalar_t>();
 #endif
    exec_args.c_ptr = c.data_ptr<scalar_t>();
    ptr->execute(exec_args);
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@ -27,8 +27,6 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b,
 void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
               const std::optional<torch::Tensor>& bias, int64_t handler);
 bool is_onednn_acl_supported();
 void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
                        torch::Tensor& kv_cache, double scale,
                        torch::Tensor& block_tables, torch::Tensor& seq_lens);
@ -183,9 +181,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "int handler) -> ()");
  ops.impl("onednn_mm", torch::kCPU, &onednn_mm);
  // Check if oneDNN was built with ACL backend
  ops.def("is_onednn_acl_supported() -> bool", &is_onednn_acl_supported);
  // Create oneDNN W8A8 handler
  ops.def(
      "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType "
--- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
+++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import enum
 from typing import Union
 from cutlass_library import *
@ -21,31 +22,31 @@ class MixedInputKernelScheduleType(enum.Enum):
    TmaWarpSpecializedCooperative = enum_auto()
-VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = {
+VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
    **DataTypeNames,  # type: ignore
    **{
        VLLMDataType.u4b8: "u4b8",
        VLLMDataType.u8b128: "u8b128",
-    },
+    }
 }
-VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = {
+VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
    **DataTypeTag,  # type: ignore
    **{
        VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
        VLLMDataType.u8b128: "cutlass::vllm_uint8b128_t",
-    },
+    }
 }
-VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = {
+VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
    **DataTypeSize,  # type: ignore
    **{
        VLLMDataType.u4b8: 4,
        VLLMDataType.u8b128: 8,
-    },
+    }
 }
-VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = {
+VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
    VLLMDataType.u4b8: "vllm::kU4B8",
    VLLMDataType.u8b128: "vllm::kU8B128",
    DataType.u4: "vllm::kU4",
@ -56,7 +57,7 @@ VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = {
    DataType.bf16: "vllm::kBfloat16",
 }
-VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = {
+VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
    DataType.u8: "at::ScalarType::Byte",
    DataType.s8: "at::ScalarType::Char",
    DataType.e4m3: "at::ScalarType::Float8_e4m3fn",
@ -66,11 +67,15 @@ VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = {
    DataType.f32: "at::ScalarType::Float",
 }
-VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = {
+VLLMKernelScheduleTag: dict[Union[
-    **KernelScheduleTag,  # type: ignore
+    MixedInputKernelScheduleType, KernelScheduleType], str] = {
-    **{
+        **KernelScheduleTag,  # type: ignore
-        MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized",  # noqa: E501
+        **{
-        MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong",  # noqa: E501
+            MixedInputKernelScheduleType.TmaWarpSpecialized:
-        MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative",  # noqa: E501
+            "cutlass::gemm::KernelTmaWarpSpecialized",
-    },
+            MixedInputKernelScheduleType.TmaWarpSpecializedPingpong:
-}
+            "cutlass::gemm::KernelTmaWarpSpecializedPingpong",
            MixedInputKernelScheduleType.TmaWarpSpecializedCooperative:
            "cutlass::gemm::KernelTmaWarpSpecializedCooperative",
        }
    }
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@ -2,7 +2,6 @@
 #include "dispatch_utils.h"
 #include "cub_helpers.h"
 #include "core/batch_invariant.hpp"
 #include "quantization/vectorization_utils.cuh"
 #include <torch/cuda.h>
 #include <c10/cuda/CUDAGuard.h>
@ -19,22 +18,11 @@ __global__ void rms_norm_kernel(
    const float epsilon, const int num_tokens, const int hidden_size) {
  __shared__ float s_variance;
  float variance = 0.0f;
  const scalar_t* input_row = input + blockIdx.x * input_stride;
-  constexpr int VEC_SIZE = 8;
+  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-  auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
+    const float x = (float)input[blockIdx.x * input_stride + idx];
 #pragma unroll
    for (int i = 0; i < VEC_SIZE; ++i) {
      float x = static_cast<float>(vec.val[i]);
      variance += x * x;
    }
  };
  auto scalar_op = [&variance](const scalar_t& val) {
    float x = static_cast<float>(val);
    variance += x * x;
-  };
+  }
  vllm::vectorize_read_with_alignment<VEC_SIZE>(
      input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
@ -426,7 +414,7 @@ void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
                          wt_ptr % req_alignment_bytes == 0;
  bool offsets_are_multiple_of_vector_width =
      hidden_size % vector_width == 0 && input_stride % vector_width == 0;
-  bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
+  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
  if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
      !batch_invariant_launch) {
    LAUNCH_FUSED_ADD_RMS_NORM(8);
@ -474,7 +462,7 @@ void poly_norm(torch::Tensor& out,     // [..., hidden_size]
  auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
  auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
  bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
-  bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
+  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
  if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
    LAUNCH_FUSED_POLY_NORM(8);
  } else {
--- a/csrc/layernorm_quant_kernels.cu
+++ b/csrc/layernorm_quant_kernels.cu
@ -10,7 +10,6 @@
 #include "dispatch_utils.h"
 #include "cub_helpers.h"
 #include "core/batch_invariant.hpp"
 #include "quantization/vectorization_utils.cuh"
 #include <torch/cuda.h>
 #include <c10/cuda/CUDAGuard.h>
@ -29,22 +28,10 @@ __global__ void rms_norm_static_fp8_quant_kernel(
  __shared__ float s_variance;
  float variance = 0.0f;
-  const scalar_t* input_row = input + blockIdx.x * input_stride;
+  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-
+    const float x = (float)input[blockIdx.x * input_stride + idx];
  constexpr int VEC_SIZE = 8;
  auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
 #pragma unroll
    for (int i = 0; i < VEC_SIZE; ++i) {
      float x = static_cast<float>(vec.val[i]);
      variance += x * x;
    }
  };
  auto scalar_op = [&variance](const scalar_t& val) {
    float x = static_cast<float>(val);
    variance += x * x;
-  };
+  }
  vllm::vectorize_read_with_alignment<VEC_SIZE>(
      input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
@ -254,7 +241,7 @@ void fused_add_rms_norm_static_fp8_quant(
  auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
  bool ptrs_are_aligned =
      inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
-  bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
+  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
  if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
      !batch_invariant_launch) {
    LAUNCH_FUSED_ADD_RMS_NORM(8);
--- a/csrc/moe/marlin_moe_wna16/generate_kernels.py
+++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py
@ -17,30 +17,25 @@ FILE_HEAD = """
 namespace MARLIN_NAMESPACE_NAME {
 """.strip()
-TEMPLATE = (
+TEMPLATE = ("template __global__ void Marlin<"
-    "template __global__ void Marlin<"
+            "{{scalar_t}}, "
-    "{{scalar_t}}, "
+            "{{w_type_id}}, "
-    "{{w_type_id}}, "
+            "{{s_type_id}}, "
-    "{{s_type_id}}, "
+            "{{threads}}, "
-    "{{threads}}, "
+            "{{thread_m_blocks}}, "
-    "{{thread_m_blocks}}, "
+            "{{thread_n_blocks}}, "
-    "{{thread_n_blocks}}, "
+            "{{thread_k_blocks}}, "
-    "{{thread_k_blocks}}, "
+            "{{'true' if m_block_size_8 else 'false'}}, "
-    "{{'true' if m_block_size_8 else 'false'}}, "
+            "{{stages}}, "
-    "{{stages}}, "
+            "{{group_blocks}}, "
-    "{{group_blocks}}, "
+            "{{'true' if is_zp_float else 'false'}}>"
-    "{{'true' if is_zp_float else 'false'}}>"
+            "( MARLIN_KERNEL_PARAMS );")
    "( MARLIN_KERNEL_PARAMS );"
 )
 # int8 with zero point case (vllm::kU8) is also supported,
 # we don't add it to reduce wheel size.
 SCALAR_TYPES = [
-    "vllm::kU4",
+    "vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn",
-    "vllm::kU4B8",
+    "vllm::kFE2M1f"
    "vllm::kU8B128",
    "vllm::kFE4M3fn",
    "vllm::kFE2M1f",
 ]
 THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
@ -63,12 +58,11 @@ def generate_new_kernels():
        all_template_str_list = []
        for group_blocks, m_blocks, thread_configs in itertools.product(
-            GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS
+                GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
-        ):
+
            # act order case only support gptq-int4 and gptq-int8
            if group_blocks == 0 and scalar_type not in [
-                "vllm::kU4B8",
+                    "vllm::kU4B8", "vllm::kU8B128"
                "vllm::kU8B128",
            ]:
                continue
            if thread_configs[2] == 256:
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@ -8,77 +8,12 @@
 #include "../cuda_compat.h"
 #include "../dispatch_utils.h"
 #include "core/math.hpp"
 #define CEILDIV(x, y) (((x) + (y) - 1) / (y))
 namespace vllm {
 namespace moe {
 namespace batched_moe_align_block_size {
 // Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
 static constexpr int32_t num_threads = 1024;
 static constexpr int32_t num_blocks = 1;
 // Lays out the tokens of every batch in block_size-aligned segments.
 //
 // The thread with index `batch_id` (threadIdx.x) owns batch `batch_id`. Each
 // batch's token count is rounded up to a multiple of block_size, an exclusive
 // prefix sum of those padded counts gives the batch's start offset, and then:
 //   - sorted_ids[start .. start + count) receives the batch's token indices
 //     (batch_id * max_tokens_per_batch + i); padding slots keep SENTINEL.
 //   - block_ids gets `batch_id` for every block that belongs to the batch;
 //     unused blocks keep -1.
 //   - *num_tokens_post_pad receives the total padded token count.
 //
 // NOTE(review): batch_id is taken from threadIdx.x only and the scan is
 // instantiated for 1024 threads, so this presumably expects a single block of
 // 1024 threads with num_batches <= 1024 -- confirm against the launcher.
 __global__ void batched_moe_align_block_size_kernel(
    int32_t const num_batches, int32_t const max_tokens_per_batch,
    int32_t const block_size, int32_t const* __restrict__ batch_num_tokens,
    int32_t* __restrict__ sorted_ids, int32_t* __restrict__ block_ids,
    int32_t* __restrict__ num_tokens_post_pad) {
  // TODO(varun): This is a naive implementation. Could be optimized.
  size_t const batch_id = threadIdx.x;
  size_t const stride = blockDim.x * gridDim.x;
  int32_t const num_blocks_per_batch =
      CEILDIV(max_tokens_per_batch, block_size);
  int32_t const sorted_ids_size =
      num_blocks_per_batch * num_batches * block_size;
  int32_t const block_ids_size = sorted_ids_size / block_size;
  int32_t const SENTINEL =
      num_batches * max_tokens_per_batch;  // To denote invalid entries.
  // Initialize sorted_ids with SENTINEL
  for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
    sorted_ids[i] = SENTINEL;
  }
  // Initialize block_ids with -1
  for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
    block_ids[i] = -1;
  }
  // Threads without a batch contribute 0 tokens to the scan below.
  int32_t b_num_tokens = 0;
  if (batch_id < num_batches) {
    b_num_tokens = batch_num_tokens[batch_id];
  }
  // Token count of this batch rounded up to a multiple of block_size.
  int32_t const ceil_b_num_tokens =
      CEILDIV(b_num_tokens, block_size) * block_size;
  // Compute an exclusive prefix sum over the padded token counts per batch;
  // cumsum_val is the offset in sorted_ids where this batch's segment starts.
  using BlockScan = cub::BlockScan<int32_t, 1024>;
  __shared__ typename BlockScan::TempStorage temp_storage;
  int cumsum_val;
  BlockScan(temp_storage).ExclusiveSum(ceil_b_num_tokens, cumsum_val);
  __syncthreads();
  // The last batch's start offset plus its own padded count is the total.
  bool const is_last_batch = batch_id == (num_batches - 1);
  if (is_last_batch) {
    *num_tokens_post_pad = cumsum_val + ceil_b_num_tokens;
  }
  if (batch_id < num_batches) {
    // Only the real (unpadded) tokens are written; padding keeps SENTINEL.
    int32_t const batch_offset = batch_id * max_tokens_per_batch;
    for (size_t i = 0; i < b_num_tokens; ++i) {
      sorted_ids[cumsum_val + i] = batch_offset + i;
    }
    // Tag every block of this batch's padded segment with the batch id.
    int32_t const block_start = cumsum_val / block_size;
    int32_t const num_blocks = ceil_b_num_tokens / block_size;
    for (size_t i = 0; i < num_blocks; ++i) {
      block_ids[block_start + i] = batch_id;
    }
  }
 }
 }  // namespace batched_moe_align_block_size
 template <typename scalar_t>
 __global__ void moe_align_block_size_kernel(
    const scalar_t* __restrict__ topk_ids,
@ -345,33 +280,6 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
      });
 }
 // Host-side launcher for batched_moe_align_block_size_kernel.
 //
 // Derives the expected output sizes from the number of batches
 // (batch_num_tokens.size(0)), max_tokens_per_batch and block_size, checks the
 // output tensors against them, and launches the kernel on the current CUDA
 // stream. All tensors are accessed through data_ptr<int32_t>().
 //
 //   batch_num_tokens     number of tokens in each batch (input)
 //   sorted_ids           output, one slot per padded token position
 //   batch_ids            output, one entry per block (the kernel's block_ids)
 //   num_tokens_post_pad  output, single element
 void batched_moe_align_block_size(int64_t max_tokens_per_batch,
                                  int64_t block_size,
                                  torch::Tensor const& batch_num_tokens,
                                  torch::Tensor sorted_ids,
                                  torch::Tensor batch_ids,
                                  torch::Tensor num_tokens_post_pad) {
  namespace batched_kernel = vllm::moe::batched_moe_align_block_size;
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  int32_t const B = batch_num_tokens.size(0);
  // Every batch is given room for max_tokens_per_batch rounded up to whole
  // blocks.
  int32_t const num_blocks_per_batch =
      round_to_next_multiple_of(max_tokens_per_batch, block_size) / block_size;
  int32_t const num_blocks = num_blocks_per_batch * B;
  int64_t const sorted_ids_size = num_blocks * block_size;
  TORCH_CHECK(sorted_ids.size(0) == sorted_ids_size);
  TORCH_CHECK(batch_ids.size(0) == sorted_ids_size / block_size);
  TORCH_CHECK(num_tokens_post_pad.size(0) == 1);
  // The kernel maps one thread to one batch, so B must fit in the launch's
  // thread count.
  TORCH_CHECK(B <= batched_kernel::num_threads);
  batched_kernel::batched_moe_align_block_size_kernel<<<
      batched_kernel::num_blocks, batched_kernel::num_threads, 0, stream>>>(
      B, max_tokens_per_batch, block_size, batch_num_tokens.data_ptr<int32_t>(),
      sorted_ids.data_ptr<int32_t>(), batch_ids.data_ptr<int32_t>(),
      num_tokens_post_pad.data_ptr<int32_t>());
 }
 void moe_sum(torch::Tensor& input,   // [num_tokens, topk, hidden_size]
             torch::Tensor& output)  // [num_tokens, hidden_size]
 {
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@ -12,14 +12,6 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                          int64_t block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
                          torch::Tensor num_tokens_post_pad);
 void batched_moe_align_block_size(int64_t max_tokens_per_batch,
                                  int64_t block_size,
                                  torch::Tensor const& expert_num_tokens,
                                  torch::Tensor sorted_ids,
                                  torch::Tensor expert_ids,
                                  torch::Tensor num_tokens_post_pad);
 #ifndef USE_ROCM
 torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                             torch::Tensor b_qweight, torch::Tensor b_scales,
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@ -21,6 +21,7 @@
 #include <c10/cuda/CUDAGuard.h>
 #include "../cuda_compat.h"
 #include "../cub_helpers.h"
 #include "../core/batch_invariant.hpp"
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -405,7 +406,8 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
    using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
    static constexpr int VPT = Constants::VPT;
    static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
-    const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
+    const bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
    const int num_warps = batch_invariant_launch ? 32 : (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
    const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
    dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@ -22,17 +22,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      "                     Tensor! num_tokens_post_pad) -> ()");
  m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
  // Aligning the number of tokens to be processed by each expert such
  // that it is divisible by the block size, but for the batched case.
  m.def(
      "batched_moe_align_block_size(int max_tokens_per_batch,"
      "                     int block_size, Tensor expert_num_tokens,"
      "                     Tensor! sorted_token_ids,"
      "                     Tensor! experts_ids,"
      "                     Tensor! num_tokens_post_pad) -> ()");
  m.impl("batched_moe_align_block_size", torch::kCUDA,
         &batched_moe_align_block_size);
 #ifndef USE_ROCM
  m.def(
      "moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -100,11 +100,6 @@ void apply_repetition_penalties_(torch::Tensor& logits,
                                 const torch::Tensor& output_mask,
                                 const torch::Tensor& repetition_penalties);
 void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
                   const torch::Tensor& rowEnds, torch::Tensor& indices,
                   torch::Tensor& values, int64_t numRows, int64_t stride0,
                   int64_t stride1);
 void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                               torch::Tensor& weight, torch::Tensor& scale,
                               double epsilon);
@ -138,12 +133,12 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& out,
                              torch::Tensor& input,
                              torch::Tensor& input_global_scale);
 #endif
-void persistent_masked_m_silu_mul_quant(
+void silu_mul_fp8_quant_deep_gemm_cuda(
    const at::Tensor& input,   // (E, T, 2*H)
    const at::Tensor& counts,  // (E)
    at::Tensor& y_q,           // (E, T, H) [OUT]
    at::Tensor& y_s,           // (E, T, H//group_size) [OUT]
-    bool use_ue8m0);
+    int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens);
 void mul_and_silu(torch::Tensor& out, torch::Tensor& input);
--- a/csrc/quantization/activation_kernels.cu
+++ b/csrc/quantization/activation_kernels.cu
@ -114,22 +114,13 @@ __global__ void act_and_mul_quant_kernel(
 }
 __device__ __forceinline__ float silu(float x) {
-  return __fdividef(x, (1.f + expf(-x)));
+  return (__fdividef(x, (1.f + expf(-x))));
 }
 // Applies silu() independently to the .x and .y components of a float2.
 __device__ __forceinline__ float2 silu2(float2 x) {
  return make_float2(silu(x.x), silu(x.y));
 }
 // Applies silu() to both components of a float2 and packs the two results
 // into a single __nv_bfloat162 using the `_rn` conversion intrinsics.
 // The non-ROCm path converts each component separately; the ROCm path
 // converts the float2 pair in one call.
 __device__ __forceinline__ __nv_bfloat162 silu2_v2(float2 x) {
 #ifndef USE_ROCM
  return make_bfloat162(__float2bfloat16_rn(silu(x.x)),
                        __float2bfloat16_rn(silu(x.y)));
 #else
  return __float22bfloat162_rn(make_float2(silu(x.x), silu(x.y)));
 #endif
 }
 #ifndef USE_ROCM
 __device__ __forceinline__ float warp_max(float v) {
  static constexpr unsigned FULL_MASK = 0xffffffffu;
@ -232,308 +223,224 @@ constexpr __nv_bfloat16 get_fp8_min() {
    return __nv_bfloat16(__nv_bfloat16_raw{.x = 50032});
  }
 }
-
+#ifndef USE_ROCM
-template <typename Idx_t>
+template <typename fp8_type, int32_t NUM_WARPS, typename Idx_t,
-__device__ __forceinline__ int warp_expert_search(
+          int NUM_PARALLEL_TOKENS, bool USE_UE8M0, int GROUP_SIZE = 128,
    int idx, int n, const Idx_t* __restrict__ input, Idx_t val) {
  const Idx_t* input_ptr = input + idx;
  int base_offset = 0;
  for (;;) {
    bool move_on = (idx < n && *input_ptr <= val);
    unsigned mask = __ballot_sync(0xffffffff, move_on);
    if (mask != 0xffffffffu) {
      int last_lane = 31 - __clz(mask);
      return base_offset + last_lane;
    }
    input_ptr += 32;
    base_offset += 32;
    idx += 32;
  }
 }
 // Computes the half-open token range [n_tokens_lower, n_tokens_upper) that
 // worker `worker_id` handles when n_tokens tokens are divided among
 // num_parallel_tokens workers.
 //
 //   n_tokens        total number of tokens to distribute
 //   worker_id       index of the worker asking for its range
 //   n_tokens_lower  [out] first token of the range
 //   n_tokens_upper  [out] one past the last token of the range
 //
 // The range is empty (lower == upper) when the worker has nothing to do.
 template <int num_parallel_tokens>
 __device__ __forceinline__ void token_bounds(int32_t n_tokens,
                                             int32_t worker_id,
                                             int32_t& n_tokens_lower,
                                             int32_t& n_tokens_upper) {
  if (n_tokens < num_parallel_tokens && worker_id < n_tokens) {
    // Fewer tokens than workers: this worker takes exactly one token.
    // NOTE(review): the guard below cannot fire, because the enclosing
    // condition already implies worker_id < n_tokens < num_parallel_tokens.
    if (worker_id >= num_parallel_tokens) return;
    n_tokens_lower = worker_id;
    n_tokens_upper = worker_id + 1;
  } else {
    // Even split: the first `residual` workers get chunk_size + 1 tokens,
    // the remaining workers get chunk_size tokens.
    int32_t chunk_size = n_tokens / num_parallel_tokens;
    int32_t residual = n_tokens - chunk_size * num_parallel_tokens;
    // Start offset of worker `id`, clamped to n_tokens.
    auto calc_id = [&](int32_t id) {
      if (id < residual)
        return min(n_tokens, id * (chunk_size + 1));
      else
        return min(n_tokens, id * chunk_size + residual);
    };
    n_tokens_lower = calc_id(worker_id);
    n_tokens_upper = calc_id(worker_id + 1);
  }
 }
 template <int BLOCK_COUNT, int SMEM_SIZE_BYTES_Y, typename fp8_type,
          int THREADS, typename Idx_t, bool USE_UE8M0, int GROUP_SIZE = 128,
          int NUM_STAGES = 3>
 __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
    const __nv_bfloat16* __restrict__ _input, fp8_type* __restrict__ _y_q,
-    float* __restrict__ _y_s, const int32_t* __restrict__ tokens_per_expert,
+    float* __restrict__ _y_s, const int32_t* __restrict__ counts,
    // sizes
-    Idx_t E, Idx_t T, Idx_t H,
+    int H, int G,
    // strides (in elements)
    Idx_t stride_i_e, Idx_t stride_i_t, Idx_t stride_i_h, Idx_t stride_yq_e,
    Idx_t stride_yq_t, Idx_t stride_yq_h, Idx_t stride_ys_e, Idx_t stride_ys_t,
    Idx_t stride_ys_g, Idx_t stride_counts_e) {
 #ifndef USE_ROCM
  static constexpr int NUM_WARPS = THREADS / WARP_SIZE;
  static constexpr int LOAD_STAGE_SIZE = 2 * GROUP_SIZE / 8;
  static constexpr int LOAD_STAGE_MOD = NUM_STAGES * LOAD_STAGE_SIZE;
  static constexpr int COMPUTE_STAGE_SIZE = 2 * GROUP_SIZE / 4;
  static constexpr int COMPUTE_STAGE_MOD = COMPUTE_STAGE_SIZE * NUM_STAGES;
  extern __shared__ __align__(16) __int128_t smem_128[];
  int* s_expert_offsets =
      reinterpret_cast<int*>(smem_128 + (SMEM_SIZE_BYTES_Y / 16));
  static constexpr __nv_bfloat16 fp8_min = get_fp8_min<fp8_type>();
  static constexpr __nv_bfloat16 fp8_max = get_fp8_max<fp8_type>();
-  // We assign EPS with it's 16-bit unsigned counterpart to allow constexpr.
+  // We assign EPS with its 16-bit unsigned counterpart to allow constexpr.
  static constexpr __nv_bfloat16 EPS = (__nv_bfloat16_raw{.x = 11996});
  int tid = threadIdx.x;
  int warp_id = tid >> 5;
  int lane_id = tid & 0x1f;
-  int running_sum{};
+  // We pack 8 16-bit bfloat16 values into a 128-bit __int128_t.
-  if (!warp_id) {
+  static constexpr int32_t BFLOAT16_PER_GROUP = 8;
    for (int i = 0; i < E; i += WARP_SIZE) {
      bool valid = (i + threadIdx.x) < E;
      int value =
          (valid ? tokens_per_expert[i + threadIdx.x * stride_counts_e] : 0) +
          (!lane_id ? running_sum : 0);
-      for (int offset = 1; offset < 32; offset *= 2) {
+  // We split the shared memory in half, corresponding to gate and up matrices:
-        int n = __shfl_up_sync(0xFFFFFFFFu, value, offset);
+  // [...gate_i, ...up_i]  where 0 <= i < stages.
-        if (lane_id >= offset) value += n;
+  static constexpr int32_t S_NUM_128 =
-      }
+      2u * (GROUP_SIZE / BFLOAT16_PER_GROUP) * NUM_WARPS * NUM_STAGES;
  static constexpr auto THREAD_COUNT = NUM_WARPS * WARP_SIZE;
  static constexpr int HALF_THREAD_COUNT = THREAD_COUNT / 2;
  static constexpr int32_t S_NUM_64 = S_NUM_128 * 2;
  __shared__ __int128_t __align__(16) s_buff_128[S_NUM_128];
-      if (valid) {
+  const int32_t tid = threadIdx.x;
-        s_expert_offsets[i + threadIdx.x + 1] = value;
+  const int32_t warp_id = tid / WARP_SIZE;
-      }
+  const int32_t lane_id = tid % WARP_SIZE;
-      running_sum = __shfl_sync(0xFFFFFFFFu, value, WARP_SIZE - 1);
+  auto s_buff_compute_32 = reinterpret_cast<__nv_bfloat162*>(s_buff_128);
    }
-    if (!lane_id) {
+  // block handles one (expert e, group g)
-      s_expert_offsets[0] = 0;
+  int32_t pid = blockIdx.x;
-    }
+  int32_t e = pid / G;
  int32_t g = pid % G;
  const int32_t n_tokens = counts[e * stride_counts_e];
  if (!n_tokens) {
    return;  // Exit ASAP.
  }
-  __syncthreads();
+  const Idx_t stride_i_t_128 = stride_i_t / 8u;
-  int32_t total_tokens = s_expert_offsets[E];
+  int32_t n_tokens_lower, n_tokens_upper;
  const int warp_position_yq = warp_id * (H / NUM_WARPS);
  const int warp_position_scales = warp_id * (H / (GROUP_SIZE * NUM_WARPS));
  // A single block will handle tokens_per_block tokens.
  // Each block i iterates over tokens of a slice of n_tokens =
  // expert_counts[i], with the size of chunk being
  // (n_tokens / NUM_PARALLEL_TOKENS) + residual, instead of
  // updiv(n_tokens, NUM_PARALLEL_TOKENS) for better scheduling.
-
+  if (n_tokens < NUM_PARALLEL_TOKENS && blockIdx.y < n_tokens) {
-  // Each warp will get space to store its hidden dim for gate and up.
+    // Specialize this, but can be likely fused.
-  __int128_t* s_hidden_load = smem_128 + warp_id * ((2 * 128 / 8) * NUM_STAGES);
+    if (blockIdx.y >= NUM_PARALLEL_TOKENS) {
-  __int128_t* smem_load_ptr = s_hidden_load + lane_id;
+      return;
-
+    }
-  const __nv_bfloat16 fp8_inv = __hdiv(__float2bfloat16(1.f), fp8_max);
+    n_tokens_lower = blockIdx.y;
-
+    n_tokens_upper = blockIdx.y + 1;
  int32_t compute_pipeline_offset_64 = 0;
  int32_t load_stage_offset{};
  const __nv_bfloat16 one_bf16 = __float2bfloat16_rn(1.f);
  __int64_t* smem_compute_ptr = reinterpret_cast<__int64_t*>(smem_128) +
                                warp_id * (2 * (GROUP_SIZE / 4) * NUM_STAGES) +
                                lane_id;
  __int64_t* s_gate64_ptr = smem_compute_ptr;
  __int64_t* s_up64_ptr = smem_compute_ptr + GROUP_SIZE / 4;
  int tokens_lower, tokens_upper;
  token_bounds<BLOCK_COUNT>(total_tokens, blockIdx.x, tokens_lower,
                            tokens_upper);
  Idx_t expert_id{}, expert_offset{}, next_expert_offset{};
  int token_id = tokens_lower;
  int32_t t_load{};
  if (token_id < tokens_upper) {
    expert_id = warp_expert_search<int>(lane_id, E, s_expert_offsets, token_id);
    expert_offset = s_expert_offsets[expert_id];
    next_expert_offset = s_expert_offsets[expert_id + 1];
  } else {
-    // This thread block has no work to do.
+    auto chunk_size = n_tokens / NUM_PARALLEL_TOKENS;
    auto residual = n_tokens - chunk_size * NUM_PARALLEL_TOKENS;
    auto calc_id = [&](int32_t id) {
      if (id < residual) {
        return min(n_tokens, id * (chunk_size + 1));
      } else {
        return min(n_tokens, id * chunk_size + residual);
      }
    };
    n_tokens_lower = calc_id(blockIdx.y);
    n_tokens_upper = calc_id(blockIdx.y + 1);
  }
  if (n_tokens_lower >= n_tokens_upper) {
    return;
  }
-  int t_load_bound = H / (GROUP_SIZE * NUM_WARPS);
+  // We do calculations here, using constexpr wherever possible.
  const Idx_t base_i = e * stride_i_e + NUM_WARPS * g * GROUP_SIZE * stride_i_h;
  const Idx_t base_ys = e * stride_ys_e + NUM_WARPS * g * stride_ys_g;
  const Idx_t base_yq =
      e * stride_yq_e + NUM_WARPS * g * GROUP_SIZE * stride_yq_h;
  Idx_t gate_off_128 = (base_i / static_cast<Idx_t>(8u));
  auto input_128_ptr = reinterpret_cast<const __int128_t*>(_input);
  auto gate_128_ptr = input_128_ptr + gate_off_128 + (tid % HALF_THREAD_COUNT) +
                      stride_i_t_128 * n_tokens_lower;
  auto up_128_ptr = gate_128_ptr + (H * stride_i_h) / 8u;
  auto y_s_ptr =
      _y_s + base_ys + warp_id * stride_ys_g + n_tokens_lower * stride_ys_t;
  auto y_q_ptr = _y_q + base_yq + warp_id * GROUP_SIZE +
                 stride_yq_t * n_tokens_lower + 4 * lane_id;
  int32_t t_load = n_tokens_lower, load_stage_id = 0;
  auto s_buff_gate_load_128 = s_buff_128 + (tid % HALF_THREAD_COUNT);
  auto s_buff_up_load_128 = s_buff_gate_load_128 + S_NUM_128 / 2u;
  int32_t stage_offset{};
-  Idx_t base_i = ((expert_id * stride_i_e) / 8) +
+  static constexpr int32_t LOAD_STAGE_SIZE = (NUM_WARPS * WARP_SIZE / 2);
-                 (token_id - expert_offset) * stride_i_t / 8;
+  static constexpr int32_t LOAD_STAGE_MOD =
-  const Idx_t gate_warp_offset =
+      NUM_STAGES * (NUM_WARPS * WARP_SIZE / 2);
      warp_id * ((stride_i_h * H) / (8 * NUM_WARPS)) + (lane_id & 0b1111);
  const __int128_t* input_128_ptr =
      reinterpret_cast<const __int128_t*>(_input) + gate_warp_offset +
      ((lane_id < 16) ? 0 : ((H * stride_i_h) / 8));
  __int128_t* load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
  auto token_offset = token_id - expert_offset;
  // Two halves of all threads in a block conduct global loads for gate and up,
  // repsectively.
  auto load_and_advance_y_pred = [&] {
-    if (t_load < t_load_bound) {
+    if (t_load < n_tokens_upper) {
-      // Here we are simply continuing to load data
+      auto s_gate_stage_128_staged_ptr = s_buff_gate_load_128 + stage_offset;
-      // from the current token.
+      auto s_up_stage_128_staged_ptr = s_buff_up_load_128 + stage_offset;
      auto smem_load_ptr_staged = smem_load_ptr + load_stage_offset;
      // It is very important that LOAD_STAGE_SIZE is constexpr to avoid
      // unnecessary ALU ops.
-      load_stage_offset += LOAD_STAGE_SIZE;
+      stage_offset += LOAD_STAGE_SIZE;
-      load_stage_offset %= LOAD_STAGE_MOD;
+      stage_offset %= LOAD_STAGE_MOD;
-      cp_async4(smem_load_ptr_staged, load_ptr);
+      if (tid < HALF_THREAD_COUNT) {
-      load_ptr += GROUP_SIZE / 8;
+        cp_async4(s_gate_stage_128_staged_ptr, gate_128_ptr);
-      ++t_load;
+        gate_128_ptr += stride_i_t_128;
    } else if (token_id + 1 < tokens_upper) {
      // We loaded everything from the current token, let's move on
      // to the next one, and we checked that we have more tokens to load.
      ++token_id;
      t_load = 0;
      if (token_id >= next_expert_offset) {
        // We need to find the next expert.
        do {
          // This is a loop because it's possible
          // that some experts are assigned 0 tokens.
          // NOTE: We are guaranteed that there's at least
          // one more token left so we don't have to check for
          // expert_id bounds.
          ++expert_id;
          // This skips 1 memory read.
          expert_offset = next_expert_offset;
          next_expert_offset = s_expert_offsets[expert_id + 1];
        } while (next_expert_offset == expert_offset);
        base_i = expert_id * (stride_i_e / 8);
        token_offset = 0;
        load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
      } else {
-        // We remain within the same expert, so just
+        cp_async4(s_up_stage_128_staged_ptr, up_128_ptr);
-        // move by H/4 __int128_t (2 * H/8).
+        up_128_ptr += stride_i_t_128;
        base_i += stride_yq_t / 4;
        token_offset++;
      }
      load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
      auto smem_load_ptr_staged = smem_load_ptr + load_stage_offset;
      // It is very important that LOAD_STAGE_SIZE is constexpr to avoid
      // unnecessary ALU ops.
      load_stage_offset += LOAD_STAGE_SIZE;
      load_stage_offset %= LOAD_STAGE_MOD;
      cp_async4(smem_load_ptr_staged, load_ptr);
      load_ptr += GROUP_SIZE / 8;
      ++t_load;
      ++load_stage_id;
    }
    // We fence even if there is nothing to load to simplify pipelining.
    cp_async_fence();
  };
  // We need to warm-up the pipeline.
  #pragma unroll
  for (int i = 0; i < NUM_STAGES - 1; i++) {
    load_and_advance_y_pred();
  }
-  __nv_fp8x4_e4m3* y_q_base_ptr =
+  __int64_t* s_gate_ptr = reinterpret_cast<__int64_t*>(
-      reinterpret_cast<__nv_fp8x4_e4m3*>(_y_q) + lane_id;
+                              s_buff_compute_32 + warp_id * (GROUP_SIZE / 2)) +
-  auto y_scale_base_ptr = _y_s + warp_position_scales * stride_ys_g;
+                          lane_id;
  __int64_t* s_up_ptr = s_gate_ptr + S_NUM_64 / 2;
-  for (auto j = tokens_lower; j < tokens_upper; j++) {
+  static constexpr int32_t STAGE_SIZE = (GROUP_SIZE * NUM_WARPS) / 4u;
-    const Idx_t base_ys = expert_id * stride_ys_e;
+  static constexpr int32_t STAGE_MOD = STAGE_SIZE * NUM_STAGES;
    auto y_s_ptr = y_scale_base_ptr + base_ys + token_offset * stride_ys_t;
    __nv_fp8x4_e4m3* y_q_ptr =
        y_q_base_ptr + (expert_id * stride_yq_e + token_offset * stride_yq_t +
                        warp_position_yq * stride_yq_h) /
                           4;
    const int COMPUTE_LIMIT = H / (GROUP_SIZE * NUM_WARPS);
-    for (int i = 0; i < COMPUTE_LIMIT; i++) {
+  int32_t compute_pipeline_offset_64 = 0;
      cp_async_wait<NUM_STAGES - 2>();
      __syncthreads();
      load_and_advance_y_pred();
-      __int64_t* gate64_ptr = s_gate64_ptr + compute_pipeline_offset_64;
+  for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
-      __int64_t* up64_ptr = s_up64_ptr + compute_pipeline_offset_64;
+    __nv_bfloat162 results_bf162[2];
-      // COMPUTE_STAGE_SIZE/MOD must also be constexpr!
+    cp_async_wait<NUM_STAGES - 2>();
-      compute_pipeline_offset_64 += COMPUTE_STAGE_SIZE;
+    __syncthreads();
      compute_pipeline_offset_64 %= COMPUTE_STAGE_MOD;
-      __int64_t gate64 = *gate64_ptr;
+    // We double-buffer pipelined loads so that the next load will
-      __int64_t up64 = *up64_ptr;
+    // concurrently run with compute without overwrites.
    load_and_advance_y_pred();
-      // Compute
+    auto s_gate_compute_64 = s_gate_ptr + compute_pipeline_offset_64;
-      __nv_bfloat162 res[2];
+    auto s_up_compute_64 = s_up_ptr + compute_pipeline_offset_64;
-      __nv_bfloat162* s_up_comp = reinterpret_cast<__nv_bfloat162*>(&up64);
+
-      __nv_bfloat162* s_gate_comp = reinterpret_cast<__nv_bfloat162*>(&gate64);
+    // STAGE_SIZE must also be constexpr!
    compute_pipeline_offset_64 += STAGE_SIZE;
    compute_pipeline_offset_64 %= STAGE_MOD;
    // Each thread loads (gate/up) 2X 4X bfloat16 values into registers.
    __int64_t gate64 = *s_gate_compute_64;
    __nv_bfloat162* s_gate_compute_32 =
        reinterpret_cast<__nv_bfloat162*>(&gate64);
    __int64_t up64 = *s_up_compute_64;
    __nv_bfloat162* s_up_compute_32 = reinterpret_cast<__nv_bfloat162*>(&up64);
  #pragma unroll
-      for (int32_t k = 0; k < 2; ++k) {
+    for (int i = 0; i < 2; i++) {
-        __nv_bfloat162 gate = silu2_v2(__bfloat1622float2(s_gate_comp[k]));
+      // For silu, we make sure that div is emitted.
-        res[k] = __hmul2(gate, s_up_comp[k]);
+      float2 gate = silu2(__bfloat1622float2(s_gate_compute_32[i]));
-      }
+      results_bf162[i] = __float22bfloat162_rn(gate);
-
+    }
      auto _y_max2 = __hmax2(__habs2(res[0]), __habs2(res[1]));
      _y_max2.x = __hmax(__hmax(_y_max2.x, _y_max2.y), EPS);
      __nv_bfloat16 y_s = __hmul(warp_max(_y_max2.x), fp8_inv);
      if constexpr (USE_UE8M0) {
        y_s = hexp2(hceil(hlog2(y_s)));
      }
      __nv_bfloat16 inv_y = __hdiv(one_bf16, y_s);
      auto y_s2 = make_bfloat162(inv_y, inv_y);
  #pragma unroll
-      for (int32_t k = 0; k < 2; ++k) {
+    for (int i = 0; i < 2; i++) {
-        res[k] = clip(__hmul2(res[k], y_s2), __bfloat162bfloat162(fp8_min),
+      results_bf162[i] = __hmul2(results_bf162[i], s_up_compute_32[i]);
-                      __bfloat162bfloat162(fp8_max));
+    }
      }
-      *y_q_ptr = __nv_fp8x4_e4m3(res[0], res[1]);
+    auto _y_max2 =
-      y_q_ptr += WARP_SIZE * stride_yq_h;
+        __hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));
-      if (!lane_id) {
+    __nv_bfloat16 y_max_bf16 = __hmax(EPS, __hmax(_y_max2.x, _y_max2.y));
-        *y_s_ptr = y_s;
+
-        y_s_ptr += stride_ys_g;
+    // An entire group is assigned to a single warp, so a simple warp reduce
-      }
+    // is used.
    __nv_bfloat16 y_s = warp_max(y_max_bf16) / fp8_max;
    if constexpr (USE_UE8M0) {
      y_s = hexp2(hceil(hlog2(y_s)));
    }
    auto inv_y = __float2bfloat16_rn(1.f) / y_s;
    auto y_s2 = make_bfloat162(inv_y, inv_y);
  #pragma unroll
    for (int32_t i = 0; i < 2; ++i) {
      results_bf162[i] =
          clip(__hmul2(results_bf162[i], y_s2), __bfloat162bfloat162(fp8_min),
               __bfloat162bfloat162(fp8_max));
    }
    auto fp8x4 = __nv_fp8x4_e4m3(results_bf162[0], results_bf162[1]);
    *reinterpret_cast<__nv_fp8x4_e4m3*>(y_q_ptr) = fp8x4;
    y_q_ptr += stride_yq_t;
    if (lane_id == 0) {
      *y_s_ptr = y_s;
      y_s_ptr += stride_ys_t;
    }
  }
 #endif
 }
 #endif
 }  // namespace vllm
@ -568,14 +475,14 @@ void silu_and_mul_quant(torch::Tensor& out,    // [..., d]
  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
 }
-void persistent_masked_m_silu_mul_quant(
+void silu_mul_fp8_quant_deep_gemm_cuda(
-    const at::Tensor& input,              // (E, T, 2*H)
+    const at::Tensor& input,   // (E, T, 2*H)
-    const at::Tensor& tokens_per_expert,  // (E)
+    const at::Tensor& counts,  // (E)
-    at::Tensor& y_q,                      // (E, T, H) [OUT]
+    at::Tensor& y_q,           // (E, T, H) [OUT]
-    at::Tensor& y_s,                      // (E, T, H//group_size) [OUT]
+    at::Tensor& y_s,           // (E, T, H//group_size) [OUT]
-    bool use_ue8m0) {
+    int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens) {
 #ifndef USE_ROCM
-
+  // This kernel relies heavily on cp.async and fp8 support.
  // This kernel currently only supports H % 128 == 0 and assumes a
  // fixed GROUP_SIZE of 128.
  TORCH_CHECK(input.dtype() == torch::kBFloat16);
@ -584,6 +491,10 @@ void persistent_masked_m_silu_mul_quant(
  TORCH_CHECK(y_s.dtype() == torch::kFloat32);
  TORCH_CHECK(input.size(-1) % 256 == 0);
  // Check that num_parallel_tokens is of power of 2 and between 1 and 64.
  TORCH_CHECK(1 <= num_parallel_tokens && num_parallel_tokens <= 64);
  TORCH_CHECK(!(num_parallel_tokens & (num_parallel_tokens - 1)));
  using Idx_t = int64_t;
  Idx_t E = input.size(0);
@ -599,54 +510,81 @@ void persistent_masked_m_silu_mul_quant(
  Idx_t stride_ys_t = y_s.stride(1);
  Idx_t stride_ys_g = y_s.stride(2);
-  Idx_t stride_counts_e = tokens_per_expert.stride(0);
+  Idx_t stride_counts_e = counts.stride(0);
  static constexpr int GROUP_SIZE = 128;
  #define KERNEL_FN                                                         \
    if (use_ue8m0) {                                                        \
      vllm::silu_mul_fp8_quant_deep_gemm_kernel<fp8_t, NUM_WARPS, Idx_t,    \
                                                NUM_PARALLEL_TOKENS, true>  \
          <<<grid, block, 0, stream>>>(                                     \
              reinterpret_cast<__nv_bfloat16*>(input.data_ptr()),           \
              (fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(),                \
              reinterpret_cast<int32_t*>(counts.data_ptr<int>()), H, G,     \
              stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t, \
              stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g,           \
              stride_counts_e);                                             \
    } else {                                                                \
      vllm::silu_mul_fp8_quant_deep_gemm_kernel<fp8_t, NUM_WARPS, Idx_t,    \
                                                NUM_PARALLEL_TOKENS, false> \
          <<<grid, block, 0, stream>>>(                                     \
              reinterpret_cast<__nv_bfloat16*>(input.data_ptr()),           \
              (fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(),                \
              reinterpret_cast<int32_t*>(counts.data_ptr<int>()), H, G,     \
              stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t, \
              stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g,           \
              stride_counts_e);                                             \
    }
  #define KERNEL_CALL_H                                       \
    if (H % (4 * GROUP_SIZE) == 0) {                          \
      static constexpr int NUM_WARPS = 4;                     \
      populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \
      KERNEL_FN                                               \
    } else {                                                  \
      static constexpr int NUM_WARPS = 1;                     \
      populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \
      KERNEL_FN                                               \
    }
  #define KERNEL_CALL_TOP_LEVEL                      \
    if (num_parallel_tokens == 1) {                  \
      static constexpr int NUM_PARALLEL_TOKENS = 1;  \
      KERNEL_CALL_H                                  \
    } else if (num_parallel_tokens == 2) {           \
      static constexpr int NUM_PARALLEL_TOKENS = 2;  \
      KERNEL_CALL_H                                  \
    } else if (num_parallel_tokens == 4) {           \
      static constexpr int NUM_PARALLEL_TOKENS = 4;  \
      KERNEL_CALL_H                                  \
    } else if (num_parallel_tokens == 8) {           \
      static constexpr int NUM_PARALLEL_TOKENS = 8;  \
      KERNEL_CALL_H                                  \
    } else if (num_parallel_tokens == 16) {          \
      static constexpr int NUM_PARALLEL_TOKENS = 16; \
      KERNEL_CALL_H                                  \
    } else if (num_parallel_tokens == 32) {          \
      static constexpr int NUM_PARALLEL_TOKENS = 32; \
      KERNEL_CALL_H                                  \
    } else if (num_parallel_tokens == 64) {          \
      static constexpr int NUM_PARALLEL_TOKENS = 64; \
      KERNEL_CALL_H                                  \
    }
  Idx_t G;
  dim3 block, grid;
  auto populate_launch_params = [&](int num_warps, int _num_parallel_tokens) {
    G = H / Idx_t(group_size * num_warps);
    grid = dim3(E * G, _num_parallel_tokens);
    block = dim3(num_warps * WARP_SIZE);
  };
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
-  #define KERNEL(BLOCK_COUNT, USE_UE8M0, THREAD_COUNT, STAGES)                 \
+  VLLM_DISPATCH_FP8_TYPES(y_q.scalar_type(),
-    static constexpr int NUM_WARPS = THREAD_COUNT / WARP_SIZE;                 \
+                          "silu_mul_fp8_quant_deep_gemm_kernel",
-    int sms = SILU_V2_BLOCK_COUNT;                                             \
+                          [&] { KERNEL_CALL_TOP_LEVEL });
    static constexpr int max_shared_mem_bytes =                                \
        GROUP_SIZE * 2 * STAGES * NUM_WARPS * 2;                               \
    dim3 grid(sms), block(THREAD_COUNT);                                       \
    const at::cuda::OptionalCUDAGuard device_guard(device_of(input));          \
    VLLM_DISPATCH_FP8_TYPES(                                                   \
        y_q.scalar_type(), "silu_mul_fp8_quant_deep_gemm_kernel", [&] {        \
          vllm::silu_mul_fp8_quant_deep_gemm_kernel<                           \
              BLOCK_COUNT, max_shared_mem_bytes, fp8_t, THREAD_COUNT, Idx_t,   \
              USE_UE8M0, GROUP_SIZE, STAGES>                                   \
              <<<grid, block, max_shared_mem_bytes + (E + 1) * 16, stream>>>(  \
                  reinterpret_cast<__nv_bfloat16*>(input.data_ptr()),          \
                  (fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(),               \
                  reinterpret_cast<int32_t*>(tokens_per_expert.data_ptr()), E, \
                  T, H, stride_i_e, stride_i_t, stride_i_h, stride_yq_e,       \
                  stride_yq_t, stride_yq_h, stride_ys_e, stride_ys_t,          \
                  stride_ys_g, stride_counts_e);                               \
        });
  static constexpr int SILU_V2_BLOCK_COUNT = 132 * 32;
  if (!use_ue8m0) {
    if (H >= 4096) {
      static constexpr int NUM_STAGES = 4;
      static constexpr int THREAD_COUNT = 256;
      KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, NUM_STAGES);
    } else {
      static constexpr int THREAD_COUNT = 32;
      KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, 2);
    }
  } else {
    if (H >= 4096) {
      static constexpr int NUM_STAGES = 4;
      static constexpr int THREAD_COUNT = 256;
      KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, NUM_STAGES);
    } else {
      static constexpr int THREAD_COUNT = 32;
      KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, 2);
    }
  }
 #endif
 }
--- a/csrc/quantization/gptq_marlin/generate_kernels.py
+++ b/csrc/quantization/gptq_marlin/generate_kernels.py
@ -17,32 +17,28 @@ FILE_HEAD = """
 namespace MARLIN_NAMESPACE_NAME {
 """.strip()
-TEMPLATE = (
+TEMPLATE = ("template __global__ void Marlin<"
-    "template __global__ void Marlin<"
+            "{{scalar_t}}, "
-    "{{scalar_t}}, "
+            "{{w_type_id}}, "
-    "{{w_type_id}}, "
+            "{{s_type_id}}, "
-    "{{s_type_id}}, "
+            "{{threads}}, "
-    "{{threads}}, "
+            "{{thread_m_blocks}}, "
-    "{{thread_m_blocks}}, "
+            "{{thread_n_blocks}}, "
-    "{{thread_n_blocks}}, "
+            "{{thread_k_blocks}}, "
-    "{{thread_k_blocks}}, "
+            "{{'true' if m_block_size_8 else 'false'}}, "
-    "{{'true' if m_block_size_8 else 'false'}}, "
+            "{{stages}}, "
-    "{{stages}}, "
+            "{{group_blocks}}, "
-    "{{group_blocks}}, "
+            "{{'true' if is_zp_float else 'false'}}>"
-    "{{'true' if is_zp_float else 'false'}}>"
+            "( MARLIN_KERNEL_PARAMS );")
    "( MARLIN_KERNEL_PARAMS );"
 )
 # int8 with zero point case (vllm::kU8) is also supported,
 # we don't add it to reduce wheel size.
 SCALAR_TYPES = [
-    "vllm::kU4",
+    "vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn",
-    "vllm::kU4B8",
+    "vllm::kFE2M1f"
    "vllm::kU8B128",
    "vllm::kFE4M3fn",
    "vllm::kFE2M1f",
 ]
-THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128), (128, 64, 128)]
+THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128),
                  (128, 64, 128)]
 THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
 # group_blocks:
@ -63,12 +59,11 @@ def generate_new_kernels():
        all_template_str_list = []
        for group_blocks, m_blocks, thread_configs in itertools.product(
-            GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS
+                GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
-        ):
+
            # act order case only support gptq-int4 and gptq-int8
            if group_blocks == 0 and scalar_type not in [
-                "vllm::kU4B8",
+                    "vllm::kU4B8", "vllm::kU8B128"
                "vllm::kU8B128",
            ]:
                continue
            if thread_configs[2] == 256:
@ -98,7 +93,8 @@ def generate_new_kernels():
            c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"
            is_zp_float_list = [False]
-            if dtype == "fp16" and scalar_type == "vllm::kU4" and group_blocks == 4:
+            if dtype == "fp16" and scalar_type == "vllm::kU4" and \
                    group_blocks == 4:
                # HQQ (is_zp_float = true) only supports
                # 4bit quantization and fp16
                is_zp_float_list.append(True)
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@ -9,23 +9,23 @@ from collections.abc import Iterable
 from copy import deepcopy
 from dataclasses import dataclass, fields
 from functools import reduce
 from typing import Optional, Union
 import jinja2
-from vllm_cutlass_library_extension import (
+# yapf conflicts with isort for this block
-    DataType,
+# yapf: disable
-    EpilogueScheduleTag,
+from vllm_cutlass_library_extension import (DataType, EpilogueScheduleTag,
-    EpilogueScheduleType,
+                                            EpilogueScheduleType,
-    MixedInputKernelScheduleType,
+                                            MixedInputKernelScheduleType,
-    TileSchedulerTag,
+                                            TileSchedulerTag,
-    TileSchedulerType,
+                                            TileSchedulerType, VLLMDataType,
-    VLLMDataType,
+                                            VLLMDataTypeNames,
-    VLLMDataTypeNames,
+                                            VLLMDataTypeSize, VLLMDataTypeTag,
-    VLLMDataTypeSize,
+                                            VLLMDataTypeTorchDataTypeTag,
-    VLLMDataTypeTag,
+                                            VLLMDataTypeVLLMScalarTypeTag,
-    VLLMDataTypeTorchDataTypeTag,
+                                            VLLMKernelScheduleTag)
-    VLLMDataTypeVLLMScalarTypeTag,
+
-    VLLMKernelScheduleTag,
+# yapf: enable
 )
 #
 #   Generator templating
@ -258,7 +258,7 @@ class ScheduleConfig:
@dataclass(frozen=True)
 class TypeConfig:
    a: DataType
-    b: DataType | VLLMDataType
+    b: Union[DataType, VLLMDataType]
    b_group_scale: DataType
    b_group_zeropoint: DataType
    b_channel_scale: DataType
@ -279,30 +279,25 @@ class PrepackTypeConfig:
 class ImplConfig:
    types: TypeConfig
    schedules: list[ScheduleConfig]
-    heuristic: list[tuple[str | None, ScheduleConfig]]
+    heuristic: list[tuple[Optional[str], ScheduleConfig]]
 def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
    tile_shape = (
        f"{schedule_config.tile_shape_mn[0]}x{schedule_config.tile_shape_mn[1]}"
    )
-    cluster_shape = (
+    cluster_shape = (f"{schedule_config.cluster_shape_mnk[0]}" +
-        f"{schedule_config.cluster_shape_mnk[0]}"
+                     f"x{schedule_config.cluster_shape_mnk[1]}" +
-        + f"x{schedule_config.cluster_shape_mnk[1]}"
+                     f"x{schedule_config.cluster_shape_mnk[2]}")
-        + f"x{schedule_config.cluster_shape_mnk[2]}"
+    kernel_schedule = VLLMKernelScheduleTag[schedule_config.kernel_schedule]\
-    )
+        .split("::")[-1]
-    kernel_schedule = VLLMKernelScheduleTag[schedule_config.kernel_schedule].split(
+    epilogue_schedule = EpilogueScheduleTag[
-        "::"
+        schedule_config.epilogue_schedule].split("::")[-1]
-    )[-1]
+    tile_scheduler = TileSchedulerTag[schedule_config.tile_scheduler]\
-    epilogue_schedule = EpilogueScheduleTag[schedule_config.epilogue_schedule].split(
+        .split("::")[-1]
        "::"
    )[-1]
    tile_scheduler = TileSchedulerTag[schedule_config.tile_scheduler].split("::")[-1]
-    return (
+    return (f"{tile_shape}_{cluster_shape}_{kernel_schedule}" +
-        f"{tile_shape}_{cluster_shape}_{kernel_schedule}"
+            f"_{epilogue_schedule}_{tile_scheduler}")
        + f"_{epilogue_schedule}_{tile_scheduler}"
    )
 # mostly unique shorter sch_sig
@ -321,24 +316,18 @@ def generate_terse_sch_sig(schedule_config: ScheduleConfig) -> str:
 # unique type_name
 def generate_type_signature(kernel_types: TypeConfig):
-    return str(
+    return str("".join([
-        "".join(
+        VLLMDataTypeNames[getattr(kernel_types, field.name)]
-            [
+        for field in fields(TypeConfig)
-                VLLMDataTypeNames[getattr(kernel_types, field.name)]
+    ]))
                for field in fields(TypeConfig)
            ]
        )
    )
 def generate_type_option_name(kernel_types: TypeConfig):
-    return ", ".join(
+    return ", ".join([
-        [
+        f"{field.name.replace('b_', 'with_')+'_type'}=" +
-            f"{field.name.replace('b_', 'with_') + '_type'}="
+        VLLMDataTypeNames[getattr(kernel_types, field.name)]
-            + VLLMDataTypeNames[getattr(kernel_types, field.name)]
+        for field in fields(TypeConfig)
-            for field in fields(TypeConfig)
+    ])
        ]
    )
 def is_power_of_two(n):
@ -346,6 +335,7 @@ def is_power_of_two(n):
 def to_cute_constant(value: list[int]):
    def _to_cute_constant(value: int):
        if is_power_of_two(value):
            return f"_{value}"
@ -360,11 +350,11 @@ def to_cute_constant(value: list[int]):
 def unique_schedules(impl_configs: list[ImplConfig]):
    # Use dict over set for deterministic ordering
-    return list(
+    return list({
-        {
+        sch: None
-            sch: None for impl_config in impl_configs for sch in impl_config.schedules
+        for impl_config in impl_configs
-        }.keys()
+        for sch in impl_config.schedules
-    )
+    }.keys())
 def unsigned_type_with_bitwidth(num_bits):
@ -390,7 +380,7 @@ template_globals = {
    "gen_type_sig": generate_type_signature,
    "unique_schedules": unique_schedules,
    "unsigned_type_with_bitwidth": unsigned_type_with_bitwidth,
-    "gen_type_option_name": generate_type_option_name,
+    "gen_type_option_name": generate_type_option_name
 }
@ -408,28 +398,23 @@ prepack_dispatch_template = create_template(PREPACK_TEMPLATE)
 def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
    sources = []
-    sources.append(
+    sources.append((
-        (
+        "machete_mm_dispatch",
-            "machete_mm_dispatch",
+        mm_dispatch_template.render(impl_configs=impl_configs),
-            mm_dispatch_template.render(impl_configs=impl_configs),
+    ))
        )
    )
    prepack_types = []
    for impl_config in impl_configs:
-        convert_type = (
+        convert_type = impl_config.types.a \
-            impl_config.types.a
+             if impl_config.types.b_group_scale == DataType.void \
-            if impl_config.types.b_group_scale == DataType.void
+             else impl_config.types.b_group_scale
            else impl_config.types.b_group_scale
        )
        prepack_types.append(
            PrepackTypeConfig(
                a=impl_config.types.a,
                b_num_bits=VLLMDataTypeSize[impl_config.types.b],
                convert=convert_type,
                accumulator=impl_config.types.accumulator,
-            )
+            ))
        )
    def prepacked_type_key(prepack_type: PrepackTypeConfig):
        # For now, we can just use the first accumulator type seen since
@ -445,14 +430,10 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
            unique_prepack_types.append(prepack_type)
            prepack_types_seen.add(key)
-    sources.append(
+    sources.append((
-        (
+        "machete_prepack",
-            "machete_prepack",
+        prepack_dispatch_template.render(types=unique_prepack_types, ),
-            prepack_dispatch_template.render(
+    ))
                types=unique_prepack_types,
            ),
        )
    )
    # Split up impls across files
    num_impls = reduce(lambda x, y: x + len(y.schedules), impl_configs, 0)
@ -485,12 +466,10 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
        curr_impl_in_file += len(files_impls[-1][-1].schedules)
    for part, file_impls in enumerate(files_impls):
-        sources.append(
+        sources.append((
-            (
+            f"machete_mm_impl_part{part+1}",
-                f"machete_mm_impl_part{part + 1}",
+            mm_impl_template.render(impl_configs=file_impls),
-                mm_impl_template.render(impl_configs=file_impls),
+        ))
            )
        )
    return sources
@ -535,7 +514,8 @@ def generate():
    # For now we use the same heuristic for all types
    # Heuristic is currently tuned for H100s
    default_heuristic = [
-        (cond, ScheduleConfig(*tile_config, **sch_common_params))  # type: ignore
+        (cond, ScheduleConfig(*tile_config,
                              **sch_common_params))  # type: ignore
        for cond, tile_config in default_tile_heuristic_config.items()
    ]
@ -561,18 +541,14 @@ def generate():
            a_token_scale=DataType.void,
            out=a,
            accumulator=DataType.f32,
-        )
+        ) for b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
-        for b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
+        for a in (DataType.f16, DataType.bf16))
        for a in (DataType.f16, DataType.bf16)
    )
    impl_configs += [
        ImplConfig(x[0], x[1], x[2])
-        for x in zip(
+        for x in zip(GPTQ_kernel_type_configs,
-            GPTQ_kernel_type_configs,
+                     itertools.repeat(get_unique_schedules(default_heuristic)),
-            itertools.repeat(get_unique_schedules(default_heuristic)),
+                     itertools.repeat(default_heuristic))
            itertools.repeat(default_heuristic),
        )
    ]
    AWQ_kernel_type_configs = list(
@ -585,18 +561,14 @@ def generate():
            a_token_scale=DataType.void,
            out=a,
            accumulator=DataType.f32,
-        )
+        ) for b in (DataType.u4, DataType.u8)
-        for b in (DataType.u4, DataType.u8)
+        for a in (DataType.f16, DataType.bf16))
        for a in (DataType.f16, DataType.bf16)
    )
    impl_configs += [
        ImplConfig(x[0], x[1], x[2])
-        for x in zip(
+        for x in zip(AWQ_kernel_type_configs,
-            AWQ_kernel_type_configs,
+                     itertools.repeat(get_unique_schedules(default_heuristic)),
-            itertools.repeat(get_unique_schedules(default_heuristic)),
+                     itertools.repeat(default_heuristic))
            itertools.repeat(default_heuristic),
        )
    ]
    # TODO: Support W4A8 when ready
--- a/csrc/quickreduce/quick_reduce.h
+++ b/csrc/quickreduce/quick_reduce.h
@ -22,14 +22,13 @@ template <typename AllReduceKernel, typename T>
 __global__ __quickreduce_launch_bounds_two_shot__ static void
 allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
                            int rank, uint8_t** dbuffer_list,
-                            uint32_t data_offset, uint32_t flag_color,
+                            uint32_t data_offset, uint32_t flag_color) {
                            int64_t data_size_per_phase) {
  int block = blockIdx.x;
  int grid = gridDim.x;
  while (block < num_blocks) {
    AllReduceKernel::run(A, B, N, block, rank, dbuffer_list, data_offset,
-                         flag_color, data_size_per_phase);
+                         flag_color);
    block += grid;
    flag_color++;
  }
@ -42,21 +41,21 @@ allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
    hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>),   \
                       dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
                       num_blocks, rank, dbuffer_list, data_offset,         \
-                       flag_color, this->kMaxProblemSize);                  \
+                       flag_color);                                         \
  } else if (world_size == 4) {                                             \
    using LineCodec = __codec<T, 4>;                                        \
    using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>;   \
    hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>),   \
                       dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
                       num_blocks, rank, dbuffer_list, data_offset,         \
-                       flag_color, this->kMaxProblemSize);                  \
+                       flag_color);                                         \
  } else if (world_size == 8) {                                             \
    using LineCodec = __codec<T, 8>;                                        \
    using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>;   \
    hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>),   \
                       dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
                       num_blocks, rank, dbuffer_list, data_offset,         \
-                       flag_color, this->kMaxProblemSize);                  \
+                       flag_color);                                         \
  }
 enum QuickReduceQuantLevel {
--- a/csrc/quickreduce/quick_reduce_impl.cuh
+++ b/csrc/quickreduce/quick_reduce_impl.cuh
@ -553,12 +553,13 @@ struct AllReduceTwoshot {
      int const rank,                      // rank index
      uint8_t** __restrict__ buffer_list,  // communication buffers
      uint32_t const data_offset,          // offset to start of the data buffer
-      uint32_t flag_color, int64_t data_size_per_phase) {
+      uint32_t flag_color) {
    // Topology
    int thread = threadIdx.x + threadIdx.y * kWavefront;
    uint8_t* rank_buffer = buffer_list[rank];
    Codec codec(thread, rank);
    int block_id = blockIdx.x;
    int grid_size = gridDim.x;
    // --------------------------------------------------------
    // Read input into registers
    int32x4_t tA[kAtoms];
@ -587,10 +588,12 @@ struct AllReduceTwoshot {
    // rank responsible for this segment.
    uint32_t comm_data0_offset =
        data_offset + block_id * Codec::kTransmittedTileSize;
-    uint32_t comm_data1_offset = data_size_per_phase + comm_data0_offset;
+    uint32_t comm_data1_offset =
        grid_size * Codec::kTransmittedTileSize + comm_data0_offset;
    uint32_t comm_flags0_offset = block_id * (kWorldSize * sizeof(uint32_t));
-    uint32_t comm_flags1_offset = (data_offset / 2) + comm_flags0_offset;
+    uint32_t comm_flags1_offset =
        grid_size * (kWorldSize * sizeof(uint32_t)) + comm_flags0_offset;
    for (int r = 0; r < kWorldSize; r++) {
      int32x4_t* send_buffer =
--- a/csrc/sampler.cu
+++ b/csrc/sampler.cu
@ -44,245 +44,6 @@ __global__ void apply_repetition_penalties_kernel(
  }
 }
 // Maps a float to one of 512 histogram bins (0..511) such that larger values
 // land in smaller bin indices.
 //
 // The value is rounded to half precision and its 16-bit pattern is turned
 // into an unsigned key: all bits are flipped for negative inputs, and the top
 // bit is set for non-negative inputs. The upper 9 bits of that key are then
 // mirrored (511 - key) to produce the bin index.
 static inline __device__ uint16_t extractBinIdx(float x) {
  // View the half-precision encoding of x as raw bits.
  union {
    __half h;
    uint16_t u16;
  } bits;
  bits.h = __float2half_rn(x);
  if (x < 0.f) {
    // Negative input: invert every bit of the encoding.
    bits.u16 = static_cast<uint16_t>(~bits.u16 & 0xffff);
  } else {
    // Non-negative input: force the most significant bit on.
    bits.u16 = static_cast<uint16_t>(bits.u16 | 0x8000);
  }
  // Keep the 9 most significant bits and reverse the bin order.
  return 511 - (bits.u16 >> 7);
 }
 // Per-row top-k selection kernel. Each thread block processes one row
 // (rowIdx = blockIdx.x) and writes kTopK (= 2048) entries for that row.
 //
 // Parameters:
 //   logits     - input values, read as logits[rowIdx * stride0 + col * stride1].
 //   rowStarts  - per-row first column (inclusive) of the range to consider.
 //   rowEnds    - per-row end column (exclusive) of the range to consider.
 //   outIndices - output, kTopK ints per row; column indices relative to the
 //                row's start, or -1 for padding slots of short rows.
 //   outLogits  - output, kTopK floats per row; the logits matching outIndices,
 //                or -FLT_MAX for padding slots of short rows.
 //   stride0    - element stride of `logits` between rows.
 //   stride1    - element stride of `logits` between columns.
 //
 // Outline:
 //   1. Rows with at most kTopK entries are copied through unsorted and padded.
 //   2. Otherwise a kNumBins-bin histogram over extractBinIdx() is built and
 //      prefix-summed to find the "threshold" bin in which the cumulative count
 //      reaches kTopK.
 //   3. Entries in bins below the threshold are kept directly; entries in the
 //      threshold bin (at most kNumFinalItems are retained) are sorted to fill
 //      the remaining slots.
 //   4. The kTopK selected entries are sorted in descending order and stored.
 //
 // The histogram clear/read/update steps use one thread per bin, so the code
 // relies on kNumThreadsPerBlock >= kNumBins.
 template <int kNumThreadsPerBlock = 512>
 static __global__ void topKPerRow(const float* logits, const int* rowStarts,
                                  const int* rowEnds, int* outIndices,
                                  float* outLogits, int stride0, int stride1) {
  // The number of bins in the histogram.
  static constexpr int kNumBins = 512;
  // The top-k width.
  static constexpr int kTopK = 2048;
  // The number of elements per thread for the final top-k sort.
  static constexpr int kNumTopKItemsPerThread = kTopK / kNumThreadsPerBlock;
  // The class to sort the elements during the final top-k sort.
  using TopKSort = cub::BlockRadixSort<float, kNumThreadsPerBlock,
                                       kNumTopKItemsPerThread, int>;
  // The number of slots for the final pass.
  static constexpr int kNumFinalItems = 3072;
  // The number of elements per thread for the final sort.
  static constexpr int kNumFinalItemsPerThread =
      kNumFinalItems / kNumThreadsPerBlock;
  // The class to sort the elements during the final pass.
  using FinalSort = cub::BlockRadixSort<float, kNumThreadsPerBlock,
                                        kNumFinalItemsPerThread, int>;
  // The class to compute the inclusive prefix-sum over the histogram.
  // (The call below is ExclusiveSum, i.e. an exclusive prefix sum.)
  using Scan = cub::BlockScan<int, kNumThreadsPerBlock>;
  // Shared memory to compute the block scan.
  __shared__ typename Scan::TempStorage smemScan;
  // The structure to store the final items (for the final pass).
  struct FinalItems {
    // Shared memory to store the indices for the final pass.
    int indices[kNumFinalItems];
    // Shared memory to store the logits for the final pass.
    float logits[kNumFinalItems];
  };
  // Shared memory to compute the block sort.
  // The candidate buffers and both sort scratch areas share one union, which
  // is why barriers separate each use of smemFinal below.
  __shared__ union {
    FinalItems items;
    typename FinalSort::TempStorage finalSort;
    typename TopKSort::TempStorage topKSort;
  } smemFinal;
  // Shared memory to store the histogram.
  __shared__ int smemHistogram[kNumBins];
  // Shared memory to store the selected indices.
  __shared__ int smemIndices[kTopK];
  // Shared memory to store the selected logits.
  __shared__ float smemLogits[kTopK];
  // Shared memory to store the threshold bin.
  __shared__ int smemThresholdBinIdx[1];
  // Shared memory counter to register the candidates for the final phase.
  __shared__ int smemFinalDstIdx[1];
  // The row computed by this block.
  int rowIdx = blockIdx.x;
  // The range of logits within the row.
  int rowStart = rowStarts[rowIdx], rowEnd = rowEnds[rowIdx];
  // The length of the row.
  int rowLen = rowEnd - rowStart;
  // Shortcut if the length of the row is smaller than Top-K. Indices are not
  // sorted by their corresponding logit.
  if (rowLen <= kTopK) {
    for (int rowIt = threadIdx.x; rowIt < rowLen;
         rowIt += kNumThreadsPerBlock) {
      int idx = rowStart + rowIt;
      outIndices[rowIdx * kTopK + rowIt] = idx - rowStart;
      outLogits[rowIdx * kTopK + rowIt] =
          logits[rowIdx * stride0 + idx * stride1];
    }
    // Pad the unused tail of the output row with sentinel values.
    for (int rowIt = rowLen + threadIdx.x; rowIt < kTopK;
         rowIt += kNumThreadsPerBlock) {
      outIndices[rowIdx * kTopK + rowIt] = -1;
      outLogits[rowIdx * kTopK + rowIt] = -FLT_MAX;
    }
    // rowLen is the same for every thread of the block, so the whole block
    // leaves here together, before any barrier.
    return;
  }
  // Clear the histogram.
  if (threadIdx.x < kNumBins) {
    smemHistogram[threadIdx.x] = 0;
  }
  // Make sure the histogram is ready.
  __syncthreads();
  // Fetch elements one-by-one.
  for (int rowIt = rowStart + threadIdx.x; rowIt < rowEnd;
       rowIt += kNumThreadsPerBlock) {
    uint16_t idx = extractBinIdx(logits[rowIdx * stride0 + rowIt * stride1]);
    atomicAdd(&smemHistogram[idx], 1);
  }
  // Make sure the histogram is ready.
  __syncthreads();
  // Read the values from SMEM.
  int binCount{0};
  if (threadIdx.x < kNumBins) {
    binCount = smemHistogram[threadIdx.x];
  }
  // Make sure each thread has read its value.
  __syncthreads();
  // Compute the prefix sum.
  int prefixSum{0}, totalSum{0};
  Scan(smemScan).ExclusiveSum(binCount, prefixSum, totalSum);
  // Update the histogram with the prefix sums.
  if (threadIdx.x < kNumBins) {
    smemHistogram[threadIdx.x] = prefixSum;
  }
  // Make sure the data is in shared memory.
  __syncthreads();
  // Find the last valid bin.
  // That is the bin whose exclusive prefix is below kTopK while the next
  // bin's prefix (or the total, for the last bin) is at least kTopK.
  if (threadIdx.x < kNumBins) {
    int nextPrefixSum =
        threadIdx.x == kNumBins - 1 ? totalSum : smemHistogram[threadIdx.x + 1];
    if (prefixSum < kTopK && nextPrefixSum >= kTopK) {
      smemThresholdBinIdx[0] = threadIdx.x;
    }
  }
  // Clear the counter to store the items for the final phase.
  if (threadIdx.x == 0) {
    smemFinalDstIdx[0] = 0;
  }
  // Make sure the data is in shared memory.
  __syncthreads();
  // The threshold bin.
  int thresholdBinIdx = smemThresholdBinIdx[0];
  // Fetch elements one-by-one and populate the shared memory buffers.
  for (int rowIt = rowStart + threadIdx.x; rowIt < rowEnd;
       rowIt += kNumThreadsPerBlock) {
    float logit = logits[rowIdx * stride0 + rowIt * stride1];
    uint16_t idx = extractBinIdx(logit);
    if (idx < thresholdBinIdx) {
      // smemHistogram[idx] starts at the bin's exclusive prefix sum, so each
      // atomicAdd hands out the next free output slot for that bin.
      int dstIdx = atomicAdd(&smemHistogram[idx], 1);
      smemLogits[dstIdx] = logit;
      smemIndices[dstIdx] = rowIt;
    } else if (idx == thresholdBinIdx) {
      int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1);
      // Threshold-bin candidates beyond kNumFinalItems are dropped.
      if (dstIdx < kNumFinalItems) {
        smemFinal.items.logits[dstIdx] = logit;
        smemFinal.items.indices[dstIdx] = rowIt;
      }
    }
  }
  // Make sure the elements are in shared memory.
  __syncthreads();
  // The logits of the elements to be sorted in the final pass.
  float finalLogits[kNumFinalItemsPerThread];
  // The indices of the elements to be sorted in the final pass.
  // NOTE(review): only finalLogits is initialized below; slots that are not
  // loaded from shared memory keep an indeterminate index paired with
  // -FLT_MAX - confirm these can never reach the output.
  int finalIndices[kNumFinalItemsPerThread];
 // Init.
 #pragma unroll
  for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
    finalLogits[ii] = -FLT_MAX;
  }
 // Read the elements from SMEM.
 #pragma unroll
  for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
    int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
    if (srcIdx < smemFinalDstIdx[0]) {
      finalLogits[ii] = smemFinal.items.logits[srcIdx];
      finalIndices[ii] = smemFinal.items.indices[srcIdx];
    }
  }
  // Make sure the shared memory has been read.
  __syncthreads();
  // Sort the elements.
  FinalSort(smemFinal.finalSort)
      .SortDescendingBlockedToStriped(finalLogits, finalIndices);
  // Copy the data back to the shared memory storage.
  // The scatter loop above advanced smemHistogram[b] by the count of bin b, so
  // the previous bin's entry now equals the number of slots already filled by
  // bins below the threshold.
  int baseIdx = thresholdBinIdx > 0 ? smemHistogram[thresholdBinIdx - 1] : 0;
 #pragma unroll
  for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
    int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
    int dstIdx = baseIdx + srcIdx;
    // Only the best threshold-bin candidates that still fit into kTopK slots
    // are kept.
    if (dstIdx < kTopK) {
      smemLogits[dstIdx] = finalLogits[ii];
      smemIndices[dstIdx] = finalIndices[ii];
    }
  }
  // Make sure the data is in shared memory.
  __syncthreads();
  // The topK logits.
  float topKLogits[kNumTopKItemsPerThread];
  // The topK indices.
  int topKIndices[kNumTopKItemsPerThread];
 // Load from shared memory.
 #pragma unroll
  for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
    topKLogits[ii] = smemLogits[ii * kNumThreadsPerBlock + threadIdx.x];
    topKIndices[ii] = smemIndices[ii * kNumThreadsPerBlock + threadIdx.x];
  }
  // Sort the elements.
  TopKSort(smemFinal.topKSort)
      .SortDescendingBlockedToStriped(topKLogits, topKIndices);
 // Store to global memory.
 #pragma unroll
  for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
    int offset = rowIdx * kTopK + ii * kNumThreadsPerBlock + threadIdx.x;
    // Stored indices are made relative to the start of the row.
    outIndices[offset] = topKIndices[ii] - rowStart;
    outLogits[offset] = topKLogits[ii];
  }
 }
 }  // namespace vllm
 void apply_repetition_penalties_(
@ -324,20 +85,4 @@ void apply_repetition_penalties_(
                repetition_penalties.data_ptr<scalar_t>(), num_seqs, vocab_size,
                tile_size);
      });
-}
+}
 // Host-side launcher for vllm::topKPerRow.
 //
 // Enqueues the kernel on the current CUDA stream with `numRows` blocks of 512
 // threads each. `logits` is read as float, `rowStarts`/`rowEnds` as int; the
 // results are written into `indices` (int) and `values` (float). The two
 // strides are narrowed to int before being passed to the kernel.
 void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
                    const torch::Tensor& rowEnds, torch::Tensor& indices,
                    torch::Tensor& values, int64_t numRows, int64_t stride0,
                    int64_t stride1) {
  // Launch configuration: fixed block size, current stream.
  constexpr int kNumThreadsPerBlock = 512;
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // Raw device pointers for the kernel arguments.
  const float* logitsPtr = logits.data_ptr<float>();
  const int* rowStartsPtr = rowStarts.data_ptr<int>();
  const int* rowEndsPtr = rowEnds.data_ptr<int>();
  int* indicesPtr = indices.data_ptr<int>();
  float* valuesPtr = values.data_ptr<float>();

  // The kernel takes 32-bit strides.
  const int rowStride = static_cast<int>(stride0);
  const int colStride = static_cast<int>(stride1);

  // Compute the results on the device.
  vllm::topKPerRow<kNumThreadsPerBlock>
      <<<numRows, kNumThreadsPerBlock, 0, stream>>>(
          logitsPtr, rowStartsPtr, rowEndsPtr, indicesPtr, valuesPtr, rowStride,
          colStride);
 }
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -33,11 +33,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 #endif
  ops.def(
-      "persistent_masked_m_silu_mul_quant(Tensor input, Tensor counts, Tensor! "
+      "silu_mul_fp8_quant_deep_gemm_cuda(Tensor input, Tensor counts, Tensor! "
-      "y_q, Tensor! y_s,"
+      "y_q, Tensor! y_s, int group_size, "
-      "bool use_ue8m0) -> ()");
+      "bool use_ue8m0, int num_parallel_tokens) -> ()");
-  ops.impl("persistent_masked_m_silu_mul_quant", torch::kCUDA,
+  ops.impl("silu_mul_fp8_quant_deep_gemm_cuda", torch::kCUDA,
-           &persistent_masked_m_silu_mul_quant);
+           &silu_mul_fp8_quant_deep_gemm_cuda);
  ops.def("weak_ref_tensor(Tensor input) -> Tensor");
  ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);
@ -188,13 +188,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.impl("apply_repetition_penalties_", torch::kCUDA,
           &apply_repetition_penalties_);
  // Optimized top-k per row operation
  ops.def(
      "top_k_per_row(Tensor logits, Tensor rowStarts, Tensor rowEnds, "
      "Tensor! indices, Tensor! values, int numRows, int stride0, "
      "int stride1) -> ()");
  ops.impl("top_k_per_row", torch::kCUDA, &top_k_per_row);
  // Layernorm-quant
  // Apply Root Mean Square (RMS) Normalization to the input tensor.
  ops.def(
@ -727,12 +720,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
      "int quant_block_size, str kv_cache_dtype) -> ()");
  cache_ops.impl("indexer_k_quant_and_cache", torch::kCUDA,
                 &indexer_k_quant_and_cache);
  cache_ops.def(
      "cp_gather_indexer_k_quant_cache(Tensor kv_cache, Tensor! dst_k, Tensor! "
      "dst_scale, Tensor block_table, Tensor cu_seq_lens) -> ()");
  cache_ops.impl("cp_gather_indexer_k_quant_cache", torch::kCUDA,
                 &cp_gather_indexer_k_quant_cache);
 }
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -15,7 +15,7 @@ ARG PYTHON_VERSION=3.12
 # Example:
 # docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
-# Important: We build with an old version of Ubuntu to maintain broad
+# Important: We build with an old version of Ubuntu to maintain broad 
 # compatibility with other Linux OSes. The main reason for this is that the
 # glibc version is baked into the distro, and binaries built with one glibc
 # version are not backwards compatible with OSes that use an earlier version.
@ -229,7 +229,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
 # sync the default value with .buildkite/check-wheel-size.py
-ARG VLLM_MAX_SIZE_MB=500
+ARG VLLM_MAX_SIZE_MB=450
 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
 ARG RUN_WHEEL_CHECK=true
 RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
@ -356,14 +356,75 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
    uv pip install --system dist/*.whl --verbose \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
-# Install FlashInfer pre-compiled kernel cache and binaries
+# If we need to build FlashInfer wheel before its release:
-# https://docs.flashinfer.ai/installation.html
+# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
-RUN --mount=type=cache,target=/root/.cache/uv \
+# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0'
-    uv pip install --system flashinfer-cubin==0.4.1 \
+# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
-    && uv pip install --system flashinfer-jit-cache==0.4.1 \
+# $ cd flashinfer
-        --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
+# $ git checkout v0.2.6.post1
-    && flashinfer show-config
+# $ python -m flashinfer.aot
 # $ python -m build --no-isolation --wheel
 # $ ls -la dist
 # -rw-rw-r-- 1 mgoin mgoin 205M Jun  9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
 # $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
 # Install FlashInfer from source
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 # Keep this in sync with "flashinfer" extra in setup.py
 ARG FLASHINFER_GIT_REF="v0.3.1"
 # Flag to control whether to compile FlashInfer AOT kernels
 # Set to "true" to enable AOT compilation:
 # docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
 ARG FLASHINFER_AOT_COMPILE=false
 # Clone FlashInfer at FLASHINFER_GIT_REF and install it in one of three ways:
 #   * CUDA 12.8.x on linux/amd64: install the pre-compiled wheel (and, when
 #     FLASHINFER_AOT_COMPILE=true, also try to download pre-compiled cubins);
 #   * otherwise, FLASHINFER_AOT_COMPILE=true: build AOT kernels, then install
 #     from source without build isolation;
 #   * otherwise: plain source install (JIT mode).
 # The clone is removed again at the end of the step.
 # NOTE(review): the script does not enable `set -e`, so a failing command
 # does not by itself abort the heredoc - confirm this is intended.
 RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
  . /etc/environment
    git clone --depth 1 --recursive --shallow-submodules \
        --branch ${FLASHINFER_GIT_REF} \
        ${FLASHINFER_GIT_REPO} flashinfer
    # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
    # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
    if [[ "${CUDA_VERSION}" == 11.* ]]; then
        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
    elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
    else
        # CUDA 12.8+ supports 10.0a and 12.0
        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
    fi
    pushd flashinfer
        if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ]; then
            # NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh
            echo "🏗️  Installing FlashInfer from pre-compiled wheel"
            uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \
                --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
            if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
                # Download pre-compiled cubins
                # A failed download only prints a warning.
                TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
                    python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
            fi
        elif [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
            echo "🏗️  Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
            export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
            # HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
            uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) nvidia-nvshmem-cu$(echo $CUDA_VERSION | cut -d. -f1)
            # Build AOT kernels
            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
                python3 -m flashinfer.aot
            # Install with no-build-isolation since we already built AOT kernels
            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
                uv pip install --system --no-build-isolation . \
                --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
            # Download pre-compiled cubins
            # A failed download only prints a warning.
            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
                python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
        else
            echo "🏗️  Installing FlashInfer without AOT compilation in JIT mode"
            uv pip install --system . \
                --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
        fi
    popd
    # Remove the clone once the install step is done.
    rm -rf flashinfer
 BASH
 COPY examples examples
 COPY benchmarks benchmarks
 COPY ./vllm/collect_env.py .
@ -400,7 +461,7 @@ RUN set -eux; \
 # Install EP kernels(pplx-kernels and DeepEP)
 COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh
 ENV CUDA_HOME=/usr/local/cuda
-RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a 10.0a+PTX}" \
+RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a+PTX}" \
    && bash install_python_libraries.sh
 # CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
@ -481,7 +542,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    else \
        BITSANDBYTES_VERSION="0.46.1"; \
    fi; \
-    uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.14.0'
+    uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3]>=0.14.0'
 ENV VLLM_USAGE_SOURCE production-docker-image
--- a/Show More
+++ b/Show More
		`@ -1 +0,0 @@`
			`Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml`