Compare commits


1 Commit

728c365e4d  Use uv to install python in Dockerfile
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-02 11:05:47 -04:00
1703 changed files with 104329 additions and 141319 deletions

View File

@@ -368,7 +368,7 @@ if __name__ == "__main__":
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
# we want to turn it into "8xGPUTYPE"
df["GPU"] = df["GPU"].apply(
-     lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
+     lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
)
# get markdown tables
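Aside (not part of the diff): str.splitlines() and str.split('\n') disagree when the GPU listing ends with a trailing newline, which is why the line count can differ between the two variants above. A quick standalone illustration, using a made-up GPU name:

raw = "NVIDIA H100 80GB HBM3\nNVIDIA H100 80GB HBM3\n"  # hypothetical nvidia-smi-style output with a trailing newline

print(len(raw.split("\n")))    # 3: the trailing newline yields an extra empty element
print(len(raw.splitlines()))   # 2: only the actual GPU lines are counted

print(f"{len(raw.splitlines())}x{raw.splitlines()[0]}")  # 2xNVIDIA H100 80GB HBM3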

View File

@@ -181,14 +181,18 @@ launch_vllm_server() {
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
- server_command="vllm serve $model \
+ server_command="python3 \
+ -m vllm.entrypoints.openai.api_server \
-tp $tp \
+ --model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
- server_command="vllm serve $model \
+ server_command="python3 \
+ -m vllm.entrypoints.openai.api_server \
-tp $tp \
+ --model $model \
--port $port \
$server_args"
fi

View File

@@ -365,7 +365,8 @@ run_serving_tests() {
continue
fi
- server_command="$server_envs vllm serve \
+ server_command="$server_envs python3 \
+ -m vllm.entrypoints.openai.api_server \
$server_args"
# run the server
@@ -454,6 +455,11 @@ main() {
fi
check_hf_token
+ # Set to v1 to run v1 benchmark
+ if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
+ export VLLM_USE_V1=1
+ fi
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)

.buildkite/pyproject.toml (new file, 46 lines)
View File

@@ -0,0 +1,46 @@
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed
[tool.ruff]
line-length = 88
[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
[tool.ruff.lint]
select = [
# pycodestyle
"E",
# Pyflakes
"F",
# pyupgrade
"UP",
# flake8-bugbear
"B",
# flake8-simplify
"SIM",
# isort
"I",
# flake8-logging-format
"G",
]
ignore = [
# star imports
"F405", "F403",
# lambda expression assignment
"E731",
# Loop control variable not used within loop body
"B007",
# f-string format
"UP032",
# Can remove once 3.10+ is the minimum Python version
"UP007",
]
[tool.ruff.format]
docstring-code-format = true

View File

@@ -48,7 +48,7 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
@@ -150,16 +150,11 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
- - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
- - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
- - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
- - "docker push vllm/vllm-openai:nightly-x86_64"
- - "docker push vllm/vllm-openai:nightly-aarch64"
- - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
- - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
- - "docker manifest push vllm/vllm-openai:nightly"
- - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+ - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+ - "docker push vllm/vllm-openai:nightly"
+ - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
# Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
plugins:
@@ -168,4 +163,3 @@ steps:
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
- DOCKERHUB_USERNAME: "vllmbot"

View File

@@ -8,41 +8,20 @@ set -ex
# DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
- # Get DockerHub credentials from environment
+ # Get DockerHub token from environment
if [ -z "$DOCKERHUB_TOKEN" ]; then
echo "Error: DOCKERHUB_TOKEN environment variable is not set"
exit 1
fi
- if [ -z "$DOCKERHUB_USERNAME" ]; then
- echo "Error: DOCKERHUB_USERNAME environment variable is not set"
- exit 1
- fi
- # Get DockerHub bearer token
- echo "Getting DockerHub bearer token..."
- set +x
- BEARER_TOKEN=$(curl -s -X POST \
- -H "Content-Type: application/json" \
- -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
- "https://hub.docker.com/v2/users/login" | jq -r '.token')
- set -x
- if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
- echo "Error: Failed to get DockerHub bearer token"
- exit 1
- fi
# Function to get all tags from DockerHub
get_all_tags() {
local page=1
local all_tags=""
while true; do
- set +x
- local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
+ local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
"$REPO_API_URL?page=$page&page_size=100")
- set -x
# Get both last_updated timestamp and tag name, separated by |
local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
@@ -64,9 +43,7 @@ delete_tag() {
echo "Deleting tag: $tag_name"
local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
- set +x
- local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
- set -x
+ local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"

View File

@@ -1,191 +0,0 @@
#!/bin/bash
# This script build the Ascend NPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Base ubuntu image with basic ascend development libraries and python installed
VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
TEST_RUN_CONFIG_FILE="vllm_test.cfg"
VLLM_ASCEND_TMP_DIR=
# Get the test run configuration file from the vllm-ascend repository
fetch_vllm_test_cfg() {
VLLM_ASCEND_TMP_DIR=$(mktemp -d)
# Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
cleanup() {
rm -rf "${VLLM_ASCEND_TMP_DIR}"
}
trap cleanup EXIT
GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
exit 1
fi
# If the file already exists locally, just overwrite it
cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
# Since the trap will be overwritten later, and when it is executed here, the task of cleaning up resources
# when the trap is abnormal has been completed, so the temporary resources are manually deleted here.
rm -rf "${VLLM_ASCEND_TMP_DIR}"
trap - EXIT
}
# Downloads test run configuration file from a remote URL.
# Loads the configuration into the current script environment.
get_config() {
if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
exit 1
fi
source "${TEST_RUN_CONFIG_FILE}"
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
return 0
}
# get test running configuration.
fetch_vllm_test_cfg
get_config
# Check if the function call was successful. If not, exit the script.
if [ $? -ne 0 ]; then
exit 1
fi
image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}"
builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p ${builder_cache_dir}
# Try building the docker image
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
--cache-to type=local,dest=${builder_cache_dir},mode=max \
--progress=plain --load -t ${image_name} -f - .
FROM ${BASE_IMAGE_NAME}
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
# Install for pytest to make the docker build cache layer always valid
RUN --mount=type=cache,target=/root/.cache/pip \
pip install pytest>=6.0 modelscope
WORKDIR /workspace/vllm
# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
COPY requirements/common.txt /workspace/vllm/requirements/common.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements/common.txt
COPY . .
# Install vLLM
RUN --mount=type=cache,target=/root/.cache/pip \
VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton
# Install vllm-ascend
WORKDIR /workspace
ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
ARG VLLM_ASCEND_TAG=main
RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r /workspace/vllm-ascend/requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV VLLM_USE_MODELSCOPE=True
WORKDIR /workspace/vllm-ascend
CMD ["/bin/bash"]
EOF
# Setup cleanup
remove_docker_container() {
docker rm -f "${container_name}" || true;
docker image rm -f "${image_name}" || true;
docker system prune -f || true;
}
trap remove_docker_container EXIT
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
# returns --device /dev/davinci0 --device /dev/davinci1
parse_and_gen_devices() {
local input="$1"
local index cards_num
if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
index="${BASH_REMATCH[1]}"
cards_num="${BASH_REMATCH[2]}"
else
echo "parse error" >&2
return 1
fi
local devices=""
local i=0
while (( i < cards_num )); do
local dev_idx=$(((index - 1)*cards_num + i ))
devices="$devices --device /dev/davinci${dev_idx}"
((i++))
done
# trim leading space
devices="${devices#"${devices%%[![:space:]]*}"}"
# Output devices: assigned to the caller variable
printf '%s' "$devices"
}
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend.
model_cache_dir=/mnt/modelscope${agent_idx}
mkdir -p ${model_cache_dir}
docker run \
${devices} \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v ${model_cache_dir}:/root/.cache/modelscope \
--entrypoint="" \
--name "${container_name}" \
"${image_name}" \
bash -c '
set -e
pytest -v -s tests/e2e/vllm_interface/
'
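The removed script's parse_and_gen_devices helper turns an agent name of the form {hostname}-{agent_idx}-{npu_card_num}cards (agent_idx starting at 1) into --device flags. A small Python sketch of the same parsing and index arithmetic, for illustration only:

import re

def gen_devices(agent_name: str) -> str:
    # e.g. "atlas-a2-001-1-2cards" -> agent 1 with 2 NPU cards
    m = re.search(r"(\d+)-(\d+)cards$", agent_name)
    if not m:
        raise ValueError("parse error")
    index, cards_num = int(m.group(1)), int(m.group(2))
    # agent N owns cards [(N-1)*cards_num, N*cards_num)
    return " ".join(
        f"--device /dev/davinci{(index - 1) * cards_num + i}" for i in range(cards_num)
    )

print(gen_devices("atlas-a2-001-1-2cards"))  # --device /dev/davinci0 --device /dev/davinci1
print(gen_devices("atlas-a2-001-2-2cards"))  # --device /dev/davinci2 --device /dev/davinci3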

View File

@@ -64,9 +64,10 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
+ export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
+ echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info

View File

@@ -64,9 +64,10 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
+ export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
+ echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info

View File

@@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite
- vllm serve meta-llama/Llama-2-7b-chat-hf &
+ python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
server_pid=$!
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

View File

@@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
- EXPECTED_THROUGHPUT=8.7
+ EXPECTED_THROUGHPUT=10.0
INPUT_LEN=1800
OUTPUT_LEN=128

View File

@@ -42,7 +42,7 @@ echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo
- vllm serve $MODEL \
+ VLLM_USE_V1=1 vllm serve $MODEL \
--seed 42 \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \

View File

@@ -296,7 +296,6 @@ steps:
- tests/v1
commands:
# split the test to avoid interference
- - pytest -v -s -m 'not cpu_test' v1/core
- pytest -v -s v1/executor
- pytest -v -s v1/kv_offload
- pytest -v -s v1/sample
@@ -318,7 +317,7 @@ steps:
no_gpu: true
commands:
# split the test to avoid interference
- - pytest -v -s -m 'cpu_test' v1/core
+ - pytest -v -s v1/core
- pytest -v -s v1/structured_output
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
@@ -398,8 +397,9 @@ steps:
- pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py
- pytest -v -s compile/test_fusion_attn.py
- - pytest -v -s compile/test_functionalization.py
- pytest -v -s compile/test_silu_mul_quant_fusion.py
+ - pytest -v -s compile/test_sequence_parallelism.py
+ - pytest -v -s compile/test_async_tp.py
- pytest -v -s compile/test_fusion_all_reduce.py
- pytest -v -s compile/test_decorator.py
- pytest -v -s compile/test_noop_elimination.py
@@ -431,9 +431,8 @@ steps:
source_file_dependencies:
- csrc/
- tests/kernels/core
- - tests/kernels/test_top_k_per_row.py
commands:
- - pytest -v -s kernels/core kernels/test_top_k_per_row.py
+ - pytest -v -s kernels/core
- label: Kernels Attention Test %N # 23min
timeout_in_minutes: 35
@@ -477,7 +476,6 @@ steps:
source_file_dependencies:
- csrc/mamba/
- tests/kernels/mamba
- - vllm/model_executor/layers/mamba/ops
commands:
- pytest -v -s kernels/mamba
@@ -828,20 +826,18 @@ steps:
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+ - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
# Fusion
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
- - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
- - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- - label: Blackwell GPT-OSS Eval
+ - label: GPT-OSS Eval (Blackwell)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
- optional: true # run on nightlies
+ optional: true # disable while debugging
source_file_dependencies:
- tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py
@@ -868,16 +864,6 @@ steps:
commands:
- pytest -s -v tests/quantization/test_blackwell_moe.py
- - label: Blackwell LM Eval Small Models
- timeout_in_minutes: 120
- gpu: b200
- optional: true # run on nightlies
- source_file_dependencies:
- - csrc/
- - vllm/model_executor/layers/quantization
- commands:
- - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
##### 1 GPU test #####
##### multi gpus test #####
@@ -1094,8 +1080,6 @@ steps:
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- - pytest -v -s tests/compile/test_async_tp.py
- - pytest -v -s tests/compile/test_sequence_parallelism.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048

.github/CODEOWNERS (vendored, 1 changed line)
View File

@@ -23,7 +23,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
- /vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat

.github/mergify.yml (vendored, 32 changed lines)
View File

@@ -2,7 +2,6 @@ pull_request_rules:
- name: label-documentation
description: Automatically apply documentation label
conditions:
- - label != stale
- or:
- files~=^[^/]+\.md$
- files~=^docs/
@@ -11,13 +10,10 @@ pull_request_rules:
label:
add:
- documentation
- comment:
- message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
- name: label-ci-build
description: Automatically apply ci/build label
conditions:
- - label != stale
- or:
- files~=^\.github/
- files~=\.buildkite/
@@ -34,7 +30,6 @@ pull_request_rules:
- name: label-deepseek
description: Automatically apply deepseek label
conditions:
- - label != stale
- or:
- files~=^examples/.*deepseek.*\.py
- files~=^tests/.*deepseek.*\.py
@@ -51,7 +46,6 @@ pull_request_rules:
- name: label-frontend
description: Automatically apply frontend label
conditions:
- - label != stale
- files~=^vllm/entrypoints/
actions:
label:
@@ -61,7 +55,6 @@ pull_request_rules:
- name: label-llama
description: Automatically apply llama label
conditions:
- - label != stale
- or:
- files~=^examples/.*llama.*\.py
- files~=^tests/.*llama.*\.py
@@ -77,7 +70,6 @@ pull_request_rules:
- name: label-multi-modality
description: Automatically apply multi-modality label
conditions:
- - label != stale
- or:
- files~=^vllm/multimodal/
- files~=^tests/multimodal/
@@ -91,7 +83,6 @@ pull_request_rules:
- name: label-new-model
description: Automatically apply new-model label
conditions:
- - label != stale
- and:
- files~=^vllm/model_executor/models/
- files=vllm/model_executor/models/registry.py
@@ -103,7 +94,6 @@ pull_request_rules:
- name: label-performance
description: Automatically apply performance label
conditions:
- - label != stale
- or:
- files~=^benchmarks/
- files~=^vllm/benchmarks/
@@ -117,7 +107,6 @@ pull_request_rules:
- name: label-qwen
description: Automatically apply qwen label
conditions:
- - label != stale
- or:
- files~=^examples/.*qwen.*\.py
- files~=^tests/.*qwen.*\.py
@@ -132,7 +121,6 @@ pull_request_rules:
- name: label-gpt-oss
description: Automatically apply gpt-oss label
conditions:
- - label != stale
- or:
- files~=^examples/.*gpt[-_]?oss.*\.py
- files~=^tests/.*gpt[-_]?oss.*\.py
@@ -154,7 +142,6 @@ pull_request_rules:
- name: label-rocm
description: Automatically apply rocm label
conditions:
- - label != stale
- or:
- files~=^csrc/rocm/
- files~=^docker/Dockerfile.rocm
@@ -175,7 +162,6 @@ pull_request_rules:
- name: label-structured-output
description: Automatically apply structured-output label
conditions:
- - label != stale
- or:
- files~=^benchmarks/structured_schemas/
- files=benchmarks/benchmark_serving_structured_output.py
@@ -195,7 +181,6 @@ pull_request_rules:
- name: label-speculative-decoding
description: Automatically apply speculative-decoding label
conditions:
- - label != stale
- or:
- files~=^vllm/v1/spec_decode/
- files~=^tests/v1/spec_decode/
@@ -211,7 +196,6 @@ pull_request_rules:
- name: label-v1
description: Automatically apply v1 label
conditions:
- - label != stale
- or:
- files~=^vllm/v1/
- files~=^tests/v1/
@@ -224,7 +208,6 @@ pull_request_rules:
description: Automatically apply tpu label
# Keep this list in sync with `label-tpu-remove` conditions
conditions:
- - label != stale
- or:
- files~=tpu.py
- files~=_tpu
@@ -240,7 +223,6 @@ pull_request_rules:
description: Automatically remove tpu label
# Keep this list in sync with `label-tpu` conditions
conditions:
- - label != stale
- and:
- -files~=tpu.py
- -files~=_tpu
@@ -255,7 +237,6 @@ pull_request_rules:
- name: label-tool-calling
description: Automatically add tool-calling label
conditions:
- - label != stale
- or:
- files~=^tests/tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
@@ -274,9 +255,8 @@ pull_request_rules:
- name: ping author on conflicts and add 'needs-rebase' label
conditions:
- - label != stale
- - conflict
- - -closed
+ - conflict
+ - -closed
actions:
label:
add:
@@ -290,8 +270,6 @@ pull_request_rules:
- name: assign reviewer for tensorizer changes
conditions:
- - label != stale
- - or:
- files~=^vllm/model_executor/model_loader/tensorizer.py
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
@@ -303,7 +281,6 @@ pull_request_rules:
- name: assign reviewer for modelopt changes
conditions:
- - label != stale
- or:
- files~=^vllm/model_executor/layers/quantization/modelopt\.py$
- files~=^vllm/model_executor/layers/quantization/__init__\.py$
@@ -318,8 +295,8 @@ pull_request_rules:
- name: remove 'needs-rebase' label when conflict is resolved
conditions:
- -conflict
- -closed
actions:
label:
remove:
@@ -328,7 +305,6 @@ pull_request_rules:
- name: label-kv-connector
description: Automatically apply kv-connector label
conditions:
- - label != stale
- or:
- files~=^examples/online_serving/disaggregated[^/]*/.*
- files~=^examples/offline_inference/disaggregated[^/]*/.*

View File

@@ -13,7 +13,7 @@ jobs:
actions: write
runs-on: ubuntu-latest
steps:
- - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
+ - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
with:
# Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months

View File

@@ -6,18 +6,30 @@ default_stages:
- manual # Run in CI
exclude: 'vllm/third_party/.*'
repos:
- - repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.14.0
+ - repo: https://github.com/google/yapf
+ rev: v0.43.0
hooks:
- - id: ruff-check
+ - id: yapf
+ args: [--in-place, --verbose]
+ # Keep the same list from yapfignore here to avoid yapf failing without any inputs
+ exclude: '(.buildkite|benchmarks|build|examples)/.*'
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.11.7
+ hooks:
+ - id: ruff
args: [--output-format, github, --fix]
- id: ruff-format
+ files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/crate-ci/typos
- rev: v1.38.1
+ rev: v1.35.5
hooks:
- id: typos
+ - repo: https://github.com/PyCQA/isort
+ rev: 6.0.1
+ hooks:
+ - id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format
- rev: v21.1.2
+ rev: v20.1.3
hooks:
- id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
@@ -34,7 +46,7 @@ repos:
hooks:
- id: actionlint
- repo: https://github.com/astral-sh/uv-pre-commit
- rev: 0.9.1
+ rev: 0.6.17
hooks:
- id: pip-compile
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
@@ -55,6 +67,11 @@ repos:
types_or: [python, pyi]
require_serial: true
additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
+ - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+ name: Run mypy for Python 3.9
+ entry: python tools/pre_commit/mypy.py 1 "3.9"
+ <<: *mypy_common
+ stages: [manual] # Only run in CI
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10
entry: python tools/pre_commit/mypy.py 1 "3.10"
@@ -70,11 +87,6 @@ repos:
entry: python tools/pre_commit/mypy.py 1 "3.12"
<<: *mypy_common
stages: [manual] # Only run in CI
- - id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
- name: Run mypy for Python 3.13
- entry: python tools/pre_commit/mypy.py 1 "3.13"
- <<: *mypy_common
- stages: [manual] # Only run in CI
- id: shellcheck
name: Lint shell scripts
entry: tools/shellcheck.sh

View File

@@ -34,7 +34,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
- set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
+ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
@@ -269,8 +269,8 @@ set(VLLM_EXT_SRC
"csrc/sampler.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu"
- "csrc/quantization/w8a8/int8/scaled_quant.cu"
- "csrc/quantization/w8a8/fp8/common.cu"
+ "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
+ "csrc/quantization/fp8/common.cu"
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/activation_kernels.cu"
@@ -314,13 +314,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/permute_cols.cu"
- "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
+ "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/cutlass_extensions/common.cpp"
- "csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
- "csrc/quantization/w8a8/int8/per_token_group_quant.cu")
+ "csrc/quantization/fp8/per_token_group_quant.cu")
set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
@@ -424,11 +423,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
set(SRCS
- "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
- "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
- "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
- "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
- "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
+ "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
+ "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
+ "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
+ "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
+ "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -459,9 +458,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS
- "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
- "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
- "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
+ "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
+ "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
+ "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@@ -493,9 +492,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS
- "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
- "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
- "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
+ "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
+ "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
+ "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@@ -526,7 +525,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
- set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
+ set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
@@ -649,7 +648,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# if it's possible to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
- set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
+ set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -668,12 +667,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
- set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
+ set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -698,7 +697,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
- set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
+ set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@@ -721,7 +720,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
- set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
+ set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -1007,7 +1006,6 @@ endif()
# For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/flashmla.cmake)
- include(cmake/external_projects/qutlass.cmake)
# vllm-flash-attn should be last as it overwrites some CMake functions
include(cmake/external_projects/vllm_flash_attn.cmake)

View File

@@ -149,7 +149,6 @@ Compute Resources:
- Trainy
- UC Berkeley
- UC San Diego
- - Volcengine
Slack Sponsor: Anyscale

View File

@@ -74,7 +74,7 @@ start_server() {
local vllm_log=$4
local profile_dir=$5
- pkill -if "vllm serve" || true
+ pkill -if vllm
# Define the common arguments as a bash array.
# Each argument and its value are separate elements.
@@ -96,11 +96,11 @@ start_server() {
# This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled
- VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
+ VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else
# Start server without profiling
- VLLM_SERVER_DEV_MODE=1 \
+ VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
fi
local server_pid=$!
@@ -139,7 +139,7 @@ run_benchmark() {
echo "vllm_log: $vllm_log"
echo
rm -f $vllm_log
- pkill -if "vllm serve" || true
+ pkill -if vllm
echo "starting server..."
# Call start_server without a profile_dir to avoid profiling overhead
@@ -232,7 +232,7 @@ run_benchmark() {
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
- pkill -if "vllm serve" || true
+ pkill -if vllm
sleep 10
echo "===================="
return 0
@@ -308,6 +308,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
else
echo "No configuration met the latency requirements. Skipping final profiling run."
fi
- pkill -if "vllm serve" || true
+ pkill -if vllm
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"

View File

@@ -2,9 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
- from benchmark_utils import TimeCollector
from tabulate import tabulate
+ from benchmark_utils import TimeCollector
from vllm.utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool

View File

@@ -5,9 +5,9 @@ import time
from unittest import mock
import numpy as np
- from benchmark_utils import TimeCollector
from tabulate import tabulate
+ from benchmark_utils import TimeCollector
from vllm.config import (
CacheConfig,
DeviceConfig,
@@ -164,7 +164,7 @@ def invoke_main() -> None:
)
parser.add_argument(
"--batched", action="store_true", help="consider time to prepare batch"
- )
+ ) # noqa: E501
parser.add_argument(
"--num-iteration",
type=int,

View File

@@ -37,13 +37,14 @@ from typing import Optional
import datasets
import numpy as np
import pandas as pd
- from tqdm.asyncio import tqdm
- from transformers import PreTrainedTokenizerBase
from backend_request_func import (
ASYNC_REQUEST_FUNCS,
RequestFuncInput,
RequestFuncOutput,
)
+ from tqdm.asyncio import tqdm
+ from transformers import PreTrainedTokenizerBase
try:
from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -909,13 +910,13 @@ def create_argument_parser():
parser.add_argument(
"--tokenizer",
type=str,
- help="Name or path of the tokenizer, if not using the default tokenizer.",
+ help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
)
parser.add_argument(
"--tokenizer-mode",
type=str,
default="auto",
- help="Name or path of the tokenizer, if not using the default tokenizer.",
+ help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
)
parser.add_argument(
"--num-prompts",

View File

@@ -17,7 +17,7 @@ from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
- w8a8_triton_block_scaled_mm,
+ w8a8_block_fp8_matmul,
)
from vllm.utils import FlexibleArgumentParser, cdiv
@@ -158,7 +158,7 @@ def bench_fp8(
"cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
),
- "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
+ "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
),
"cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(

View File

@@ -55,7 +55,9 @@ benchmark() {
output_len=$2
- CUDA_VISIBLE_DEVICES=0 vllm serve $model \
+ CUDA_VISIBLE_DEVICES=0 python3 \
+ -m vllm.entrypoints.openai.api_server \
+ --model $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
@@ -63,7 +65,9 @@ benchmark() {
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
- CUDA_VISIBLE_DEVICES=1 vllm serve $model \
+ CUDA_VISIBLE_DEVICES=1 python3 \
+ -m vllm.entrypoints.openai.api_server \
+ --model $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \

View File

@@ -38,12 +38,16 @@ wait_for_server() {
launch_chunked_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
- CUDA_VISIBLE_DEVICES=0 vllm serve $model \
+ CUDA_VISIBLE_DEVICES=0 python3 \
+ -m vllm.entrypoints.openai.api_server \
+ --model $model \
--port 8100 \
--max-model-len 10000 \
--enable-chunked-prefill \
--gpu-memory-utilization 0.6 &
- CUDA_VISIBLE_DEVICES=1 vllm serve $model \
+ CUDA_VISIBLE_DEVICES=1 python3 \
+ -m vllm.entrypoints.openai.api_server \
+ --model $model \
--port 8200 \
--max-model-len 10000 \
--enable-chunked-prefill \
@@ -58,14 +62,18 @@ launch_chunked_prefill() {
launch_disagg_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
- CUDA_VISIBLE_DEVICES=0 vllm serve $model \
+ CUDA_VISIBLE_DEVICES=0 python3 \
+ -m vllm.entrypoints.openai.api_server \
+ --model $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
- CUDA_VISIBLE_DEVICES=1 vllm serve $model \
+ CUDA_VISIBLE_DEVICES=1 python3 \
+ -m vllm.entrypoints.openai.api_server \
+ --model $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \

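Note: both launch helpers above start two OpenAI-compatible API servers and wait for them before benchmarking (wait_for_server). A minimal Python version of such a readiness probe, assuming only the standard /v1/models route exposed by the api_server and an arbitrary timeout, might look like:

import time
import urllib.error
import urllib.request

def wait_for_server(port: int, timeout_s: float = 1200.0) -> bool:
    """Poll the OpenAI-compatible server until /v1/models answers or we time out."""
    deadline = time.time() + timeout_s
    url = f"http://localhost:{port}/v1/models"
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    return True
        except (urllib.error.URLError, OSError):
            time.sleep(1)
    return False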
View File

@ -1,191 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import copy
import itertools
import torch
from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
from weight_shapes import WEIGHT_SHAPES
from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.triton_utils import triton
PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"mxfp4": dict(no_a_quant=False, enabled=True),
"mxfp4-noquant": dict(no_a_quant=True, enabled=True),
}
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
return (
deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
* group_size**-0.5
)
def _quant_weight_mxfp4(
b: torch.Tensor, forward_hadamard_matrix: torch.Tensor, device: str
):
weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeMx(
b, forward_hadamard_matrix, method="abs_max"
)
weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton")
return weight_hf_e2m1, weight_hf_scale_block
def build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device):
weight_hf_e2m1, weight_hf_scale_block = _quant_weight_mxfp4(
b, forward_hadamard_matrix, device
)
alpha = torch.tensor([1.0], device="cuda")
if cfg["no_a_quant"]:
# Pre-quantize activation
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
a, forward_hadamard_matrix, method="abs_max"
)
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
def run():
return matmul_mxf4_bf16_tn(
input_hf_e2m1,
weight_hf_e2m1,
input_hf_scale_block,
weight_hf_scale_block,
alpha,
)
return run
# Quantize activation on-the-fly
def run():
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
a, forward_hadamard_matrix, method="abs_max"
)
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
return matmul_mxf4_bf16_tn(
input_hf_e2m1,
weight_hf_e2m1,
input_hf_scale_block,
weight_hf_scale_block,
alpha,
)
return run
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[
1,
4,
8,
16,
32,
64,
128,
256,
512,
1024,
2048,
4096,
8192,
16384,
24576,
32768,
],
x_log=False,
line_arg="provider",
line_vals=_enabled,
line_names=_enabled,
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs MXFP4 GEMMs",
args={},
)
)
def benchmark(batch_size, provider, N, K, had_size):
M = batch_size
device = "cuda"
dtype = torch.bfloat16
a = torch.randn((M, K), device=device, dtype=dtype)
b = torch.randn((N, K), device=device, dtype=dtype)
forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch-bf16":
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
)
else:
cfg = PROVIDER_CFGS[provider]
run_quant = build_mxfp4_runner(
cfg, a, b, forward_hadamard_matrix, dtype, device
)
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: run_quant(), rep=200, quantiles=quantiles
)
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
def prepare_shapes(args):
out = []
for model, tp_size in itertools.product(args.models, args.tp_sizes):
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_dim] //= tp_size
KN.append(model)
out.append(KN)
return out
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--models",
nargs="+",
type=str,
default=["meta-llama/Llama-3.3-70B-Instruct"],
choices=list(WEIGHT_SHAPES.keys()),
)
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
args = parser.parse_args()
for K, N, model in prepare_shapes(args):
for had_size in [32, 64, 128]:
print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs MXFP4 GEMMs TFLOP/s:")
benchmark.run(
print_data=True,
show_plots=True,
save_path=f"bench_mxfp4_res_n{N}_k{K}",
N=N,
K=K,
had_size=had_size,
)
print("Benchmark finished!")
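Note: the deleted MXFP4 benchmark above rotates both GEMM operands with a Hadamard matrix scaled by group_size**-0.5. That scaling makes the rotation orthonormal, so rotating both operands leaves the underlying product unchanged, (a @ h) @ (b @ h).T == a @ b.T. A self-contained sketch of that property, using a Sylvester construction as a stand-in for the compressed_tensors helper:

import torch

def sylvester_hadamard(n: int) -> torch.Tensor:
    # Sylvester construction; valid when n is a power of two.
    h = torch.ones(1, 1)
    while h.shape[0] < n:
        h = torch.cat([torch.cat([h, h], dim=1), torch.cat([h, -h], dim=1)], dim=0)
    return h

group_size = 32
h = sylvester_hadamard(group_size) * group_size**-0.5
# An n x n Hadamard matrix satisfies H @ H.T = n * I, so the scaled matrix is orthonormal.
assert torch.allclose(h @ h.T, torch.eye(group_size))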

View File

@ -1,207 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import copy
import itertools
import torch
from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm
from vllm._custom_ops import fusedQuantizeNv
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.triton_utils import triton
PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"nvfp4": dict(no_a_quant=False, enabled=True),
"nvfp4-noquant": dict(no_a_quant=True, enabled=True),
}
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
return (
deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
* group_size**-0.5
)
def _quant_weight_nvfp4(
b: torch.Tensor,
forward_hadamard_matrix: torch.Tensor,
global_scale: torch.Tensor,
device: str,
M: int,
N: int,
K: int,
):
weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeNv(
b, forward_hadamard_matrix, global_scale
)
weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton").view(
-1, K // 16
)
return weight_hf_e2m1, weight_hf_scale_block
def build_nvfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K):
alpha = torch.tensor([1.0], device="cuda")
global_scale = torch.tensor([1.0], device="cuda")
weight_hf_e2m1, weight_hf_scale_block = _quant_weight_nvfp4(
b, forward_hadamard_matrix, global_scale, device, M, N, K
)
if cfg["no_a_quant"]:
# Pre-quantize activation
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
a, forward_hadamard_matrix, global_scale
)
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
-1, K // 16
)
def run():
return ops.cutlass_scaled_fp4_mm(
input_hf_e2m1,
weight_hf_e2m1,
input_hf_scale_block,
weight_hf_scale_block,
alpha,
torch.bfloat16,
)
return run
# Quantize activation on-the-fly
def run():
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
a, forward_hadamard_matrix, global_scale
)
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
-1, K // 16
)
return ops.cutlass_scaled_fp4_mm(
input_hf_e2m1,
weight_hf_e2m1,
input_hf_scale_block,
weight_hf_scale_block,
alpha,
torch.bfloat16,
)
return run
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[
1,
4,
8,
16,
32,
64,
128,
256,
512,
1024,
2048,
4096,
8192,
16384,
24576,
32768,
],
x_log=False,
line_arg="provider",
line_vals=_enabled,
line_names=_enabled,
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs NVFP4 GEMMs",
args={},
)
)
def benchmark(batch_size, provider, N, K, had_size):
M = batch_size
device = "cuda"
dtype = torch.bfloat16
a = torch.randn((M, K), device=device, dtype=dtype)
b = torch.randn((N, K), device=device, dtype=dtype)
forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch-bf16":
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
)
else:
cfg = PROVIDER_CFGS[provider]
run_quant = build_nvfp4_runner(
cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K
)
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: run_quant(), rep=200, quantiles=quantiles
)
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
def prepare_shapes(args):
out = []
for model, tp_size in itertools.product(args.models, args.tp_sizes):
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_dim] //= tp_size
KN.append(model)
out.append(KN)
return out
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--models",
nargs="+",
type=str,
default=["meta-llama/Llama-3.3-70B-Instruct"],
choices=list(WEIGHT_SHAPES.keys()),
)
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
args = parser.parse_args()
for K, N, model in prepare_shapes(args):
for had_size in [16, 32, 64, 128]:
print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs NVFP4 GEMMs TFLOP/s:")
benchmark.run(
print_data=True,
show_plots=True,
save_path=f"bench_nvfp4_res_n{N}_k{K}",
N=N,
K=K,
had_size=had_size,
)
print("Benchmark finished!")
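Note: both deleted benchmarks report throughput via to_tflops, i.e. 2*M*N*K floating-point operations divided by the measured kernel time. A small worked example of that conversion (the numbers are illustrative only):

# do_bench_cudagraph reports time in milliseconds; a GEMM costs 2*M*N*K FLOPs.
M = N = K = 4096
t_ms = 1.0  # hypothetical kernel time
tflops = (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
print(f"{tflops:.1f} TFLOP/s")  # ~137.4 TFLOP/s for a 1 ms 4096^3 GEMM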

View File

@ -579,12 +579,10 @@ def main(args: argparse.Namespace):
E = config.ffn_config.moe_num_experts E = config.ffn_config.moe_num_experts
topk = config.ffn_config.moe_top_k topk = config.ffn_config.moe_top_k
intermediate_size = config.ffn_config.ffn_hidden_size intermediate_size = config.ffn_config.ffn_hidden_size
hidden_size = config.hidden_size
elif config.architectures[0] == "JambaForCausalLM": elif config.architectures[0] == "JambaForCausalLM":
E = config.num_experts E = config.num_experts
topk = config.num_experts_per_tok topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size intermediate_size = config.intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] in ( elif config.architectures[0] in (
"DeepseekV2ForCausalLM", "DeepseekV2ForCausalLM",
"DeepseekV3ForCausalLM", "DeepseekV3ForCausalLM",
@ -594,7 +592,6 @@ def main(args: argparse.Namespace):
E = config.n_routed_experts E = config.n_routed_experts
topk = config.num_experts_per_tok topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size intermediate_size = config.moe_intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] in ( elif config.architectures[0] in (
"Qwen2MoeForCausalLM", "Qwen2MoeForCausalLM",
"Qwen3MoeForCausalLM", "Qwen3MoeForCausalLM",
@ -603,18 +600,10 @@ def main(args: argparse.Namespace):
E = config.num_experts E = config.num_experts
topk = config.num_experts_per_tok topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size intermediate_size = config.moe_intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
text_config = config.get_text_config()
E = text_config.num_experts
topk = text_config.num_experts_per_tok
intermediate_size = text_config.moe_intermediate_size
hidden_size = text_config.hidden_size
elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"): elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
E = config.num_experts E = config.num_experts
topk = config.moe_topk[0] topk = config.moe_topk[0]
intermediate_size = config.moe_intermediate_size[0] intermediate_size = config.moe_intermediate_size[0]
hidden_size = config.hidden_size
else: else:
# Support for llama4 # Support for llama4
config = config.get_text_config() config = config.get_text_config()
@ -622,7 +611,6 @@ def main(args: argparse.Namespace):
E = config.num_local_experts E = config.num_local_experts
topk = config.num_experts_per_tok topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size intermediate_size = config.intermediate_size
hidden_size = config.hidden_size
enable_ep = bool(args.enable_expert_parallel) enable_ep = bool(args.enable_expert_parallel)
if enable_ep: if enable_ep:
ensure_divisibility(E, args.tp_size, "Number of experts") ensure_divisibility(E, args.tp_size, "Number of experts")
@ -631,6 +619,7 @@ def main(args: argparse.Namespace):
else: else:
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size") ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
shard_intermediate_size = 2 * intermediate_size // args.tp_size shard_intermediate_size = 2 * intermediate_size // args.tp_size
hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16" use_int8_w8a16 = args.dtype == "int8_w8a16"
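Note: the tail of this hunk keeps the tensor-parallel sharding logic: with expert parallelism the expert count must divide by tp_size, otherwise the fused gate/up intermediate dimension is split across tensor-parallel ranks. A worked example of the non-EP branch, with illustrative sizes:

# Illustrative Mixtral-style sizes; not taken from the diff.
intermediate_size = 14336
tp_size = 4
assert intermediate_size % tp_size == 0  # what ensure_divisibility enforces
# The factor of 2 accounts for the fused gate_proj + up_proj weight.
shard_intermediate_size = 2 * intermediate_size // tp_size
print(shard_intermediate_size)  # 7168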

View File

@ -1,174 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import random
import time
import torch
from tabulate import tabulate
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random,
)
logger = init_logger(__name__)
@torch.inference_mode()
def run_benchmark(
num_tokens: int,
num_heads: int,
head_size: int,
block_size: int,
num_blocks: int,
dtype: torch.dtype,
kv_cache_dtype: str,
num_iters: int,
benchmark_mode: str,
device: str = "cuda",
) -> float:
"""Return latency (seconds) for given num_tokens."""
if kv_cache_dtype == "fp8" and head_size % 16:
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
current_platform.seed_everything(42)
torch.set_default_device(device)
# create random key / value tensors [T, H, D].
key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
value = torch.randn_like(key)
# prepare the slot mapping.
# each token is assigned a unique slot in the KV-cache.
num_slots = block_size * num_blocks
if num_tokens > num_slots:
raise ValueError("num_tokens cannot exceed the total number of cache slots")
slot_mapping_lst = random.sample(range(num_slots), num_tokens)
slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
key_caches, value_caches = create_kv_caches_with_random(
num_blocks,
block_size,
1, # num_layers
num_heads,
head_size,
kv_cache_dtype,
dtype,
device=device,
)
key_cache, value_cache = key_caches[0], value_caches[0]
# to free unused memory
del key_caches, value_caches
# compute per-kernel scaling factors for fp8 conversion (if used).
k_scale = (key.amax() / 64.0).to(torch.float32)
v_scale = (value.amax() / 64.0).to(torch.float32)
function_under_test = lambda: ops.reshape_and_cache(
key, # noqa: F821
value, # noqa: F821
key_cache, # noqa: F821
value_cache, # noqa: F821
slot_mapping, # noqa: F821
kv_cache_dtype,
k_scale,
v_scale,
)
if benchmark_mode == "cudagraph":
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
function_under_test()
torch.cuda.synchronize()
function_under_test = lambda: g.replay()
def run_cuda_benchmark(n_iters: int) -> float:
nonlocal key, value, key_cache, value_cache, slot_mapping
torch.cuda.synchronize()
start = time.perf_counter()
for _ in range(n_iters):
function_under_test()
torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / n_iters
# warm-up
run_cuda_benchmark(3)
lat = run_cuda_benchmark(num_iters)
# free tensors to mitigate OOM when sweeping
del key, value, key_cache, value_cache, slot_mapping
torch.cuda.empty_cache()
return lat
def main(args):
rows = []
for exp in range(1, 17):
n_tok = 2**exp
lat = run_benchmark(
num_tokens=n_tok,
num_heads=args.num_heads,
head_size=args.head_size,
block_size=args.block_size,
num_blocks=args.num_blocks,
dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
kv_cache_dtype=args.kv_cache_dtype,
num_iters=args.iters,
benchmark_mode=args.mode,
device="cuda",
)
rows.append([n_tok, lat * 1e6]) # convert to microseconds
print(f"Benchmark results for implementation cuda (measuring with {args.mode}):")
print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f"))
if __name__ == "__main__":
parser = FlexibleArgumentParser()
parser.add_argument("--num-heads", type=int, default=128)
parser.add_argument(
"--head-size",
type=int,
choices=[64, 80, 96, 112, 120, 128, 192, 256],
default=128,
)
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
parser.add_argument("--num-blocks", type=int, default=128 * 128)
parser.add_argument(
"--dtype",
type=str,
choices=["half", "bfloat16", "float"],
default="bfloat16",
)
parser.add_argument(
"--kv-cache-dtype",
type=str,
choices=["auto", "fp8"],
default="auto",
)
parser.add_argument("--iters", type=int, default=200)
parser.add_argument(
"--mode",
type=str,
choices=["cudagraph", "no_graph"],
default="cudagraph",
)
args = parser.parse_args()
main(args)
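Note: the deleted reshape_and_cache benchmark assigns each token a unique slot in a paged KV cache of block_size * num_blocks slots. For reference, a flat slot index decomposes into a (block, offset) pair as in this minimal sketch; the cache layout comment is conceptual, not the exact tensor shape used by the kernel.

import random

block_size = 16
num_blocks = 128 * 128
num_tokens = 8

num_slots = block_size * num_blocks
slot_mapping = random.sample(range(num_slots), num_tokens)
for slot in slot_mapping:
    block_idx, block_offset = divmod(slot, block_size)
    # Conceptually, this token's K/V vectors land in block block_idx at position block_offset.
    assert block_idx < num_blocks and 0 <= block_offset < block_size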

View File

@ -1,19 +1,5 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Comprehensive 3-way SiLU Benchmark Suite
This benchmark compares three SiLU implementations:
1. SiLU V2 (CUDA) - Optimized CUDA kernel implementation
2. Triton Kernel - Triton-based implementation
The suite generates detailed performance comparisons including:
- Memory bandwidth utilization
- Speedup ratios (baseline vs optimized implementations)
- Performance across different expert configurations and token distributions
"""
from collections.abc import Callable from collections.abc import Callable
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
@ -21,7 +7,7 @@ import numpy as np
import torch import torch
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
persistent_masked_m_silu_mul_quant, silu_mul_fp8_quant_deep_gemm_cuda,
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton from vllm.triton_utils import tl, triton
@ -108,7 +94,6 @@ def silu_mul_fp8_quant_deep_gemm_triton(
num_parallel_tokens, num_parallel_tokens,
group_size: int = 128, group_size: int = 128,
eps: float = 1e-10, eps: float = 1e-10,
expert_offsets: torch.Tensor = None,
) -> tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
"""Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
@ -189,7 +174,7 @@ def silu_mul_fp8_quant_deep_gemm_triton(
# Parse generation strategies # Parse generation strategies
strategies = ["random_imbalanced", "uniform", "max_t"] strategies = ["uniform", "max_t", "first_t"]
def benchmark( def benchmark(
@ -210,27 +195,15 @@ def benchmark(
current_platform.seed_everything(42 + seed_offset) current_platform.seed_everything(42 + seed_offset)
y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous() y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
if gen_strategy == "random_imbalanced": if gen_strategy == "uniform":
r = torch.rand(size=(E,), device="cuda")
def generate_expert_loads(n_e, total_tokens, ratio, device="cuda"):
mean = total_tokens // n_e
min_max = mean // ratio
e = torch.ones(size=(E,), dtype=torch.int64, device=device) * mean
e[0] = min_max
r = torch.rand(size=(E - 1,))
r /= r.sum()
r *= total_tokens - min_max
r = r.round().long()
e[1:] = r.to(device=device)
return e
tokens_per_expert = generate_expert_loads(E, total_tokens, 0.7, "cuda")
elif gen_strategy == "uniform":
r = torch.rand(size=(E,))
r /= r.sum() r /= r.sum()
r *= total_tokens r *= total_tokens
r = r.round().long() tokens_per_expert = r.int()
tokens_per_expert = r tokens_per_expert = torch.minimum(
tokens_per_expert,
torch.ones((E,), device=r.device, dtype=torch.int) * T,
)
elif gen_strategy == "max_t": elif gen_strategy == "max_t":
tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda") tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
tokens_per_expert.fill_(total_tokens / E) tokens_per_expert.fill_(total_tokens / E)
@ -308,34 +281,40 @@ def benchmark(
def create_comparison_plot( def create_comparison_plot(
ratios, silu_v2_times, triton_times, config_labels, strategy_name, id ratio, cuda_times, baseline_times, config_labels, strategy_name, id
): ):
fig, ax = plt.subplots(1, 1, figsize=(18, 6)) """Create a comparison plot for a specific generation strategy"""
fig, ax = plt.subplots(1, 1, figsize=(16, 6))
# Configure x-axis positions # Configure x-axis positions
x = np.arange(len(config_labels)) x = np.arange(len(config_labels))
width = 0.25 width = 0.35
# Execution Time plot (lower is better) # Execution Time plot (lower is better)
ax.bar(x, silu_v2_times, width, label="SiLU V2 (CUDA)", alpha=0.8, color="blue")
ax.bar( ax.bar(
x + width, triton_times, width, label="Triton Kernel", alpha=0.8, color="green" x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue"
)
ax.bar(
x + width / 2,
baseline_times,
width,
label="Baseline",
alpha=0.8,
color="orange",
) )
# Add speedup labels over each bar trio # Add speedup labels over each bar pair
for i in range(len(x)): for i in range(len(x)):
triton_v2_speedup = ratios[i][1] # triton/v2 speedup = ratio[i]
max_height = max(silu_v2_times[i], triton_times[i]) max_height = max(cuda_times[i], baseline_times[i])
# Triton/V2 speedup
ax.text( ax.text(
x[i] + width / 2, x[i],
max_height + max_height * 0.02, max_height + max_height * 0.02,
f"{triton_v2_speedup:.2f}x", f"{speedup:.2f}x",
ha="center", ha="center",
va="bottom", va="bottom",
fontweight="bold", fontweight="bold",
fontsize=8, fontsize=9,
) )
ax.set_xlabel("Configuration") ax.set_xlabel("Configuration")
@ -353,75 +332,56 @@ def create_comparison_plot(
def create_combined_plot(all_results): def create_combined_plot(all_results):
"""Create a combined plot with all strategies in one PNG"""
num_strategies = len(all_results) num_strategies = len(all_results)
fig, axes = plt.subplots(num_strategies, 1, figsize=(22, 7 * num_strategies)) fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies))
if num_strategies == 1: if num_strategies == 1:
axes = [axes] axes = [axes]
for idx, ( for idx, (
strategy_name, strategy_name,
all_ratios, ratio,
all_silu_v2_results, cuda_times,
all_triton_results, baseline_times,
config_labels, config_labels,
config_x_axis,
) in enumerate(all_results): ) in enumerate(all_results):
ax = axes[idx] ax = axes[idx]
# Flatten the nested results to get bandwidth percentages for plotting
silu_v2_bandwidths = []
triton_bandwidths = []
flat_ratios = []
for config_results in all_silu_v2_results:
for result in config_results:
silu_v2_bandwidths.append(result[3]) # bandwidth percentage
for config_results in all_triton_results:
for result in config_results:
triton_bandwidths.append(result[3]) # bandwidth percentage
for config_ratios in all_ratios:
for ratio in config_ratios:
flat_ratios.append(ratio)
# Configure x-axis positions # Configure x-axis positions
x = np.arange(len(config_labels)) x = np.arange(len(config_labels))
width = 0.25 width = 0.35
# Bandwidth utilization plot (higher is better) # Execution Time plot (lower is better)
ax.bar( ax.bar(
x, x - width / 2,
silu_v2_bandwidths, cuda_times,
width, width,
label="SiLU V2 (CUDA)", label="CUDA Kernel",
alpha=0.8, alpha=0.8,
color="blue", color="blue",
) )
ax.bar( ax.bar(
x + width, x + width / 2,
triton_bandwidths, baseline_times,
width, width,
label="Triton Kernel", label="Baseline",
alpha=0.8, alpha=0.8,
color="green", color="orange",
) )
# Add speedup labels over each bar trio # Add speedup labels over each bar pair
for i in range(len(x)): for i in range(len(x)):
triton_v2_speedup = flat_ratios[i] # triton/v2 speedup = ratio[i]
max_height = max(silu_v2_bandwidths[i], triton_bandwidths[i]) max_height = max(cuda_times[i], baseline_times[i])
# Triton/V2 speedup
ax.text( ax.text(
x[i] + width / 2, x[i],
max_height + max_height * 0.02, max_height + max_height * 0.02,
f"{triton_v2_speedup:.2f}x", f"{speedup:.2f}x",
ha="center", ha="center",
va="bottom", va="bottom",
fontweight="bold", fontweight="bold",
fontsize=8, fontsize=9,
) )
ax.set_xlabel("Configuration") ax.set_xlabel("Configuration")
@ -435,7 +395,7 @@ def create_combined_plot(all_results):
ax.grid(True, alpha=0.3) ax.grid(True, alpha=0.3)
plt.tight_layout() plt.tight_layout()
filename = "silu_benchmark_combined_3way.png" filename = "../../silu_bench/silu_benchmark_combined.png"
plt.savefig(filename, dpi=300, bbox_inches="tight") plt.savefig(filename, dpi=300, bbox_inches="tight")
plt.show() plt.show()
@ -445,9 +405,7 @@ def create_combined_plot(all_results):
outer_dim = 7168 outer_dim = 7168
configs = [ configs = [
# DeepSeekV3 Configs # DeepSeekV3 Configs
# (1, 56, 7168),
(8, 1024, 7168), (8, 1024, 7168),
# (32, 56, 7168),
# DeepSeekV3 Configs # DeepSeekV3 Configs
(32, 1024, 7168), (32, 1024, 7168),
# DeepSeekV3 Configs # DeepSeekV3 Configs
@ -459,7 +417,6 @@ num_warmups = 20
strategy_descriptions = { strategy_descriptions = {
"uniform": "Uniform Random", "uniform": "Uniform Random",
"random_imbalanced": "Imbalanced Random",
"max_t": "Even Assignment", "max_t": "Even Assignment",
"first_t": "experts[0] = T, experts[1:] = 0", "first_t": "experts[0] = T, experts[1:] = 0",
} }
@ -476,31 +433,28 @@ for id, strategy in enumerate(strategies):
print(f"Testing strategy: {strategy_descriptions[strategy]}") print(f"Testing strategy: {strategy_descriptions[strategy]}")
print(f"{'=' * 60}") print(f"{'=' * 60}")
# Collect benchmark data for all three algorithms # Collect benchmark data for both algorithms
config_labels = [] config_labels = []
config_x_axis = [] config_x_axis = []
all_silu_v2_results = [] all_cuda_results = []
all_triton_results = [] all_baseline_results = []
all_ratios = [] all_ratios = []
for E, T, H in configs: for E, T, H in configs:
total_tokens_config = [] total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E]
for i in [8, 16, 32, 64, 128, 256, 512]:
if i <= T:
total_tokens_config.append(i * E)
config_x_axis.append(total_tokens_config) config_x_axis.append(total_tokens_config)
silu_v2_results = [] cuda_results = []
triton_results = [] baseline_results = []
ratios = [] ratios = []
for total_tokens in total_tokens_config: for total_tokens in total_tokens_config:
config_label = f"E={E},T={T},H={H},TT={total_tokens}" config_label = f"E={E},T={T},H={H},TT={total_tokens}"
config_labels.append(config_label) config_labels.append(config_label)
# SiLU V2 (CUDA kernel) results # CUDA kernel results
time_ms_silu_v2, gflops, gbps, perc = benchmark( time_ms_cuda, gflops, gbps, perc = benchmark(
persistent_masked_m_silu_mul_quant, silu_mul_fp8_quant_deep_gemm_cuda,
E, E,
T, T,
H, H,
@ -509,9 +463,9 @@ for id, strategy in enumerate(strategies):
num_warmups=num_warmups, num_warmups=num_warmups,
gen_strategy=strategy, gen_strategy=strategy,
) )
silu_v2_results.append((time_ms_silu_v2, gflops, gbps, perc)) cuda_results.append((time_ms_cuda, gflops, gbps, perc))
# Triton kernel results # Baseline results
time_ms_triton, gflops, gbps, perc = benchmark( time_ms_triton, gflops, gbps, perc = benchmark(
silu_mul_fp8_quant_deep_gemm_triton, silu_mul_fp8_quant_deep_gemm_triton,
E, E,
@ -522,20 +476,12 @@ for id, strategy in enumerate(strategies):
num_warmups=num_warmups, num_warmups=num_warmups,
gen_strategy=strategy, gen_strategy=strategy,
) )
triton_results.append((time_ms_triton, gflops, gbps, perc)) baseline_results.append((time_ms_triton, gflops, gbps, perc))
ratios.append(time_ms_triton / time_ms_cuda)
# Calculate speedup ratios (triton baseline / implementation) print(f"Completed: {config_label}")
triton_v2_ratio = time_ms_triton / time_ms_silu_v2 all_cuda_results.append(cuda_results)
ratios.append(triton_v2_ratio) all_baseline_results.append(baseline_results)
print(
f"Completed: {config_label}:"
f" V2: {time_ms_silu_v2:.3f}ms,"
f" Triton: {time_ms_triton:.3f}ms"
)
all_silu_v2_results.append(silu_v2_results)
all_triton_results.append(triton_results)
all_ratios.append(ratios) all_ratios.append(ratios)
# Store results for combined plotting # Store results for combined plotting
@ -543,8 +489,8 @@ for id, strategy in enumerate(strategies):
( (
strategy_descriptions[strategy], strategy_descriptions[strategy],
all_ratios, all_ratios,
all_silu_v2_results, all_cuda_results,
all_triton_results, all_baseline_results,
config_labels, config_labels,
config_x_axis, config_x_axis,
) )
@ -552,18 +498,15 @@ for id, strategy in enumerate(strategies):
# Print summary table for this strategy # Print summary table for this strategy
print(f"\nSummary Table - {strategy_descriptions[strategy]}:") print(f"\nSummary Table - {strategy_descriptions[strategy]}:")
print(f" {'V2 Time(ms)':<12} {'Triton Time(ms)':<14} {'Triton/V2':<10}") print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}")
print("-" * 90) print("-" * 60)
for i, (E, T, H) in enumerate(configs): for i, (E, T, H) in enumerate(configs):
# Get the first result for each config (simplifying for summary) speedup = baseline_results[i][0] / cuda_results[i][0]
v2_time = silu_v2_results[i][0]
triton_time = triton_results[i][0]
triton_v2_speedup = triton_time / v2_time
config_label = f"E={E:3d},T={T:4d},H={H:4d}" config_label = f"E={E:3d},T={T:4d},H={H:4d}"
print( print(
f"{config_label:<20} {v2_time:8.5f} {triton_time:10.5f} " f"{config_label:<20} {cuda_results[i][0]:8.5f} "
f"{triton_v2_speedup:8.2f}x" f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x"
) )
@ -571,14 +514,15 @@ def create_total_tokens_plot(all_results):
num_strategies = len(all_results) num_strategies = len(all_results)
num_configs = len(configs) num_configs = len(configs)
# Create side-by-side subplots: 2 columns for speedup and bandwidth percentage
fig, axs = plt.subplots( fig, axs = plt.subplots(
num_strategies, num_configs * 2, figsize=(32, 8 * num_strategies) num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies)
) )
# Add main title to the entire figure # Add main title to the entire figure
fig.suptitle( fig.suptitle(
"Performance Analysis: Speedup vs Bandwidth Utilization (SiLU V2, and Triton)", "Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)",
fontsize=18, fontsize=16,
fontweight="bold", fontweight="bold",
y=0.98, y=0.98,
) )
@ -595,8 +539,8 @@ def create_total_tokens_plot(all_results):
( (
strategy_name, strategy_name,
all_ratios, all_ratios,
all_silu_v2_results, all_cuda_results,
all_triton_results, all_baseline_results,
config_labels, config_labels,
config_x_axis, config_x_axis,
) = result ) = result
@ -611,54 +555,42 @@ def create_total_tokens_plot(all_results):
ratios = all_ratios[config_idx] ratios = all_ratios[config_idx]
total_tokens_values = config_x_axis[config_idx] total_tokens_values = config_x_axis[config_idx]
# Extract speedup ratios # Extract CUDA and Triton bandwidth percentages
triton_v2_ratios = [ratio for ratio in ratios] cuda_bandwidth_percentages = [
result[3] for result in all_cuda_results[config_idx]
# Extract bandwidth percentages for all implementations
v2_bandwidth_percentages = [
result[3] for result in all_silu_v2_results[config_idx]
] ]
triton_bandwidth_percentages = [ triton_bandwidth_percentages = [
result[3] for result in all_triton_results[config_idx] result[3] for result in all_baseline_results[config_idx]
] ]
# Plot speedup ratios vs total tokens (left plot) # Plot speedup ratios vs total tokens (left plot)
ax_speedup.plot( ax_speedup.plot(
total_tokens_values, total_tokens_values, ratios, "bo-", linewidth=3, markersize=8
triton_v2_ratios,
"go-",
linewidth=3,
markersize=8,
label="Triton/V2 Speedup",
) )
ax_speedup.set_title( ax_speedup.set_title(
f"{strategy_name}\nSpeedup vs Baseline (Triton)\nE={E}, T={T}, H={H}", f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}",
fontsize=12, fontsize=12,
fontweight="bold", fontweight="bold",
) )
ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11) ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11) ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11)
ax_speedup.legend(prop={"weight": "bold"})
ax_speedup.grid(True, alpha=0.3) ax_speedup.grid(True, alpha=0.3)
# Plot bandwidth utilization (right plot)
ax_bandwidth.plot( ax_bandwidth.plot(
total_tokens_values, total_tokens_values,
v2_bandwidth_percentages, cuda_bandwidth_percentages,
"o-", "ro-",
linewidth=3, linewidth=3,
markersize=8, markersize=8,
label="SiLU V2", label="CUDA",
color="blue",
) )
ax_bandwidth.plot( ax_bandwidth.plot(
total_tokens_values, total_tokens_values,
triton_bandwidth_percentages, triton_bandwidth_percentages,
"o-", "go-",
linewidth=3, linewidth=3,
markersize=8, markersize=8,
label="Triton", label="Triton",
color="green",
) )
ax_bandwidth.set_title( ax_bandwidth.set_title(
f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}", f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}",
@ -686,12 +618,38 @@ def create_total_tokens_plot(all_results):
for label in ax.get_xticklabels() + ax.get_yticklabels(): for label in ax.get_xticklabels() + ax.get_yticklabels():
label.set_fontweight("bold") label.set_fontweight("bold")
# Add value labels on Triton/V2 speedup points # Add value labels on speedup points
for x, y in zip(total_tokens_values, triton_v2_ratios): for x, y in zip(total_tokens_values, ratios):
ax_speedup.annotate( ax_speedup.annotate(
f"{y:.2f}x", f"{y:.2f}x",
(x, y), (x, y),
textcoords="offset points", textcoords="offset points",
xytext=(0, 12),
ha="center",
fontsize=10,
fontweight="bold",
bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7),
)
# Add value labels on CUDA bandwidth points
for x, y in zip(total_tokens_values, cuda_bandwidth_percentages):
ax_bandwidth.annotate(
f"{y:.1f}%",
(x, y),
textcoords="offset points",
xytext=(0, 12),
ha="center",
fontsize=9,
fontweight="bold",
bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3),
)
# Add value labels on Triton bandwidth points
for x, y in zip(total_tokens_values, triton_bandwidth_percentages):
ax_bandwidth.annotate(
f"{y:.1f}%",
(x, y),
textcoords="offset points",
xytext=(0, -15), xytext=(0, -15),
ha="center", ha="center",
fontsize=9, fontsize=9,
@ -701,20 +659,17 @@ def create_total_tokens_plot(all_results):
plt.tight_layout() plt.tight_layout()
plt.subplots_adjust(top=0.93) # Make room for main title plt.subplots_adjust(top=0.93) # Make room for main title
filename = "silu_benchmark_total_tokens_3way.png" filename = "silu_benchmark_total_tokens.png"
plt.savefig(filename, dpi=300, bbox_inches="tight") plt.savefig(filename, dpi=300, bbox_inches="tight")
plt.show() plt.show()
return filename return filename
# Create comprehensive 3-way comparison plots # Create combined plot with all strategies
combined_plot_filename = create_combined_plot(all_results) combined_plot_filename = create_total_tokens_plot(all_results)
total_tokens_plot_filename = create_total_tokens_plot(all_results)
print(f"\n{'=' * 80}") print(f"\n{'=' * 60}")
print("3-Way Benchmark Suite Complete!") print("Benchmark Complete!")
print(f"Generated combined comparison plot: {combined_plot_filename}") print(f"Generated combined plot: {combined_plot_filename}")
print(f"Generated total tokens analysis plot: {total_tokens_plot_filename}") print(f"{'=' * 60}")
print("Compared: SiLU V2 (CUDA), and Triton implementations")
print(f"{'=' * 80}")
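Note: the kernels compared above all compute the same quantity: silu(y[..., :H]) * y[..., H:], quantized to FP8 with one scale per group of 128 channels. A minimal eager-mode reference of that math (the FP8 max comes from torch.float8_e4m3fn; everything else is illustrative, not the kernels' actual interface):

import torch

def ref_silu_mul_fp8(y: torch.Tensor, group_size: int = 128, eps: float = 1e-10):
    # y: (E, T, 2*H) -> (E, T, H) fp8 values plus (E, T, H // group_size) scales
    H = y.shape[-1] // 2
    fp8_max = torch.finfo(torch.float8_e4m3fn).max  # 448.0
    x = torch.nn.functional.silu(y[..., :H].float()) * y[..., H:].float()
    g = x.view(*x.shape[:-1], H // group_size, group_size)
    scale = g.abs().amax(dim=-1, keepdim=True).clamp_min(eps) / fp8_max
    q = (g / scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
    return q.view_as(x), scale.squeeze(-1)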

View File

@ -14,7 +14,7 @@ import torch
from tqdm import tqdm from tqdm import tqdm
from vllm.model_executor.layers.quantization.utils.fp8_utils import ( from vllm.model_executor.layers.quantization.utils.fp8_utils import (
_w8a8_triton_block_scaled_mm, _w8a8_block_fp8_matmul,
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.triton_utils import triton from vllm.triton_utils import triton
@ -83,7 +83,7 @@ def w8a8_block_matmul(
) )
if A.dtype == torch.float8_e4m3fn: if A.dtype == torch.float8_e4m3fn:
kernel = _w8a8_triton_block_scaled_mm kernel = _w8a8_block_fp8_matmul
else: else:
raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.") raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.")

View File

@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# fmt: off
# ruff: noqa: E501 # ruff: noqa: E501
import time import time
@ -8,7 +9,7 @@ import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import ( from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8, per_token_group_quant_fp8,
w8a8_triton_block_scaled_mm, w8a8_block_fp8_matmul,
) )
from vllm.triton_utils import triton from vllm.triton_utils import triton
from vllm.utils.deep_gemm import ( from vllm.utils.deep_gemm import (
@ -19,21 +20,19 @@ from vllm.utils.deep_gemm import (
) )
def benchmark_shape( def benchmark_shape(m: int,
m: int, n: int,
n: int, k: int,
k: int, warmup: int = 100,
warmup: int = 100, repeat: int = 10000,
repeat: int = 10000, verbose: bool = False) -> dict:
verbose: bool = False,
) -> dict:
"""Benchmark all implementations for a specific (m, n, k) shape.""" """Benchmark all implementations for a specific (m, n, k) shape."""
if verbose: if verbose:
print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===") print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")
# Create test tensors # Create test tensors
A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16)
B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16) B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16)
# Reference result in BF16 # Reference result in BF16
torch.cuda.synchronize() torch.cuda.synchronize()
@ -50,39 +49,34 @@ def benchmark_shape(
# Pre-quantize A for all implementations # Pre-quantize A for all implementations
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1]) A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm) A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16) C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8( A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
A, block_size[1], column_major_scales=True A, block_size[1], column_major_scales=True)
)
# === DeepGEMM Implementation === # === DeepGEMM Implementation ===
def deepgemm_gemm(): def deepgemm_gemm():
fp8_gemm_nt( fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
(A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm (B_deepgemm, B_scale_deepgemm),
) C_deepgemm)
return C_deepgemm return C_deepgemm
# === vLLM Triton Implementation === # === vLLM Triton Implementation ===
def vllm_triton_gemm(): def vllm_triton_gemm():
return w8a8_triton_block_scaled_mm( return w8a8_block_fp8_matmul(A_vllm,
A_vllm, B_vllm,
B_vllm, A_scale_vllm,
A_scale_vllm, B_scale_vllm,
B_scale_vllm, block_size,
block_size, output_dtype=torch.bfloat16)
output_dtype=torch.bfloat16,
)
# === vLLM CUTLASS Implementation === # === vLLM CUTLASS Implementation ===
def vllm_cutlass_gemm(): def vllm_cutlass_gemm():
return ops.cutlass_scaled_mm( return ops.cutlass_scaled_mm(A_vllm_cutlass,
A_vllm_cutlass, B_vllm.T,
B_vllm.T, scale_a=A_scale_vllm_cutlass,
scale_a=A_scale_vllm_cutlass, scale_b=B_scale_vllm.T,
scale_b=B_scale_vllm.T, out_dtype=torch.bfloat16)
out_dtype=torch.bfloat16,
)
# Run correctness check first # Run correctness check first
if verbose: if verbose:
@ -99,23 +93,26 @@ def benchmark_shape(
print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}") print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}") print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}") print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
print( print("vLLM Triton vs DeepGEMM difference: "
"vLLM Triton vs DeepGEMM difference: " f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}")
f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}" print("vLLM CUTLASS vs DeepGEMM difference: "
) f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}")
print(
"vLLM CUTLASS vs DeepGEMM difference: "
f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}"
)
# Benchmark implementations # Benchmark implementations
implementations = { implementations = {
"DeepGEMM": deepgemm_gemm, "DeepGEMM": deepgemm_gemm,
"vLLM Triton": vllm_triton_gemm, "vLLM Triton": vllm_triton_gemm,
"vLLM CUTLASS": vllm_cutlass_gemm, "vLLM CUTLASS": vllm_cutlass_gemm
} }
benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}} benchmark_results = {
"shape": {
"m": m,
"n": n,
"k": k
},
"implementations": {}
}
for name, func in implementations.items(): for name, func in implementations.items():
# Warmup # Warmup
@ -143,36 +140,38 @@ def benchmark_shape(
"tflops": tflops, "tflops": tflops,
"gb_s": gb_s, "gb_s": gb_s,
"diff": { "diff": {
"DeepGEMM": 0.0 "DeepGEMM":
if name == "DeepGEMM" 0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm),
else calc_diff(func(), C_deepgemm), "Reference":
"Reference": deepgemm_diff deepgemm_diff if name == "DeepGEMM" else
if name == "DeepGEMM" (vllm_triton_diff
else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff), if name == "vLLM Triton" else vllm_cutlass_diff)
}, }
} }
if verbose: if verbose:
print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s") print(
f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s"
)
# Calculate speedups # Calculate speedups
baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"] baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
for name, data in benchmark_results["implementations"].items(): for name, data in benchmark_results["implementations"].items():
if name != "DeepGEMM": if name != "DeepGEMM":
speedup = baseline / data["time_ms"] speedup = baseline / data["time_ms"]
benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup benchmark_results["implementations"][name][
"speedup_vs_deepgemm"] = speedup
if verbose: if verbose:
print( print(f"DeepGEMM is {1/speedup:.2f}x "
f"DeepGEMM is {1 / speedup:.2f}x " f"{'faster' if 1/speedup > 1 else 'slower'} than {name}")
f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}"
)
vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"] vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"] "time_ms"]
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][
"time_ms"]
cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = ( benchmark_results["implementations"]["vLLM CUTLASS"][
cutlass_vs_triton "speedup_vs_triton"] = cutlass_vs_triton
)
if verbose: if verbose:
print( print(
f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x " f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
@ -184,7 +183,8 @@ def benchmark_shape(
def format_table_row(values, widths): def format_table_row(values, widths):
"""Format a row with specified column widths.""" """Format a row with specified column widths."""
return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |" return "| " + " | ".join(f"{val:{w}}"
for val, w in zip(values, widths)) + " |"
def print_table(headers, rows, title=None): def print_table(headers, rows, title=None):
@ -292,50 +292,38 @@ def run_benchmarks(verbose: bool = False):
for result in all_results: for result in all_results:
shape = result["shape"] shape = result["shape"]
impl_data = result["implementations"]["DeepGEMM"] impl_data = result["implementations"]["DeepGEMM"]
deepgemm_rows.append( deepgemm_rows.append([
[ shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
shape["m"], f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}"
shape["n"], ])
shape["k"],
f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}",
f"{impl_data['gb_s']:.1f}",
]
)
print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:") print_table(deepgemm_headers,
deepgemm_rows,
title="DeepGEMM Implementation:")
# Print vLLM Triton table # Print vLLM Triton table
triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"] triton_headers = [
"m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"
]
triton_rows = [] triton_rows = []
for result in all_results: for result in all_results:
shape = result["shape"] shape = result["shape"]
impl_data = result["implementations"]["vLLM Triton"] impl_data = result["implementations"]["vLLM Triton"]
speedup = impl_data.get("speedup_vs_deepgemm", 1.0) speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
triton_rows.append( triton_rows.append([
[ shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
shape["m"], f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
shape["n"], format_speedup(speedup)
shape["k"], ])
f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}",
f"{impl_data['gb_s']:.1f}",
format_speedup(speedup),
]
)
print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:") print_table(triton_headers,
triton_rows,
title="vLLM Triton Implementation:")
# Print vLLM CUTLASS table # Print vLLM CUTLASS table
cutlass_headers = [ cutlass_headers = [
"m", "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM",
"n", "vs Triton"
"k",
"Time (μs)",
"TFLOPS",
"GB/s",
"vs DeepGEMM",
"vs Triton",
] ]
cutlass_rows = [] cutlass_rows = []
for result in all_results: for result in all_results:
@ -343,27 +331,28 @@ def run_benchmarks(verbose: bool = False):
impl_data = result["implementations"]["vLLM CUTLASS"] impl_data = result["implementations"]["vLLM CUTLASS"]
vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0) vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
vs_triton = impl_data.get("speedup_vs_triton", 1.0) vs_triton = impl_data.get("speedup_vs_triton", 1.0)
cutlass_rows.append( cutlass_rows.append([
[ shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
shape["m"], f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
shape["n"], format_speedup(vs_deepgemm),
shape["k"], format_speedup(vs_triton)
f"{impl_data['time_us']:.1f}", ])
f"{impl_data['tflops']:.1f}",
f"{impl_data['gb_s']:.1f}",
format_speedup(vs_deepgemm),
format_speedup(vs_triton),
]
)
print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:") print_table(cutlass_headers,
cutlass_rows,
title="vLLM CUTLASS Implementation:")
# Calculate and print averages # Calculate and print averages
print("\n===== AVERAGE PERFORMANCE =====") print("\n===== AVERAGE PERFORMANCE =====")
implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"] implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
avg_metrics = { avg_metrics = {
impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations impl: {
"tflops": 0,
"gb_s": 0,
"time_ms": 0
}
for impl in implementations
} }
for result in all_results: for result in all_results:
@ -381,9 +370,9 @@ def run_benchmarks(verbose: bool = False):
avg_tflops = avg_metrics[impl]["tflops"] / num_shapes avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
avg_time = avg_metrics[impl]["time_ms"] / num_shapes avg_time = avg_metrics[impl]["time_ms"] / num_shapes
avg_rows.append( avg_rows.append([
[impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"] impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"
) ])
print_table(avg_headers, avg_rows) print_table(avg_headers, avg_rows)
@ -391,19 +380,21 @@ def run_benchmarks(verbose: bool = False):
avg_speedups = { avg_speedups = {
"DeepGEMM vs vLLM Triton": 0, "DeepGEMM vs vLLM Triton": 0,
"DeepGEMM vs vLLM CUTLASS": 0, "DeepGEMM vs vLLM CUTLASS": 0,
"vLLM CUTLASS vs vLLM Triton": 0, "vLLM CUTLASS vs vLLM Triton": 0
} }
for result in all_results: for result in all_results:
deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"] deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"] vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"] vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][
"time_ms"]
avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time avg_speedups[
avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time "DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
avg_speedups["vLLM CUTLASS vs vLLM Triton"] += ( avg_speedups[
vllm_triton_time / vllm_cutlass_time "DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
) avg_speedups[
"vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time
print("\n===== AVERAGE SPEEDUPS =====") print("\n===== AVERAGE SPEEDUPS =====")
speedup_headers = ["Comparison", "Speedup"] speedup_headers = ["Comparison", "Speedup"]
@ -421,7 +412,8 @@ def run_benchmarks(verbose: bool = False):
for result in all_results: for result in all_results:
for impl in implementations: for impl in implementations:
avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"] avg_diff[impl] += result["implementations"][impl]["diff"][
"Reference"]
diff_headers = ["Implementation", "Avg Diff vs Reference"] diff_headers = ["Implementation", "Avg Diff vs Reference"]
diff_rows = [] diff_rows = []
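Note: the speedup bookkeeping in this file divides the DeepGEMM baseline time by each implementation's time, then reports the reciprocal as "DeepGEMM is Nx faster/slower". A tiny worked example with hypothetical timings:

deepgemm_ms, triton_ms = 0.80, 1.20  # hypothetical measurements
speedup_vs_deepgemm = deepgemm_ms / triton_ms  # 0.67, i.e. Triton is slower
print(
    f"DeepGEMM is {1 / speedup_vs_deepgemm:.2f}x "
    f"{'faster' if 1 / speedup_vs_deepgemm > 1 else 'slower'} than vLLM Triton"
)
# -> DeepGEMM is 1.50x faster than vLLM Triton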

View File

@ -13,7 +13,7 @@ from datetime import datetime
from enum import Enum from enum import Enum
from http import HTTPStatus from http import HTTPStatus
from statistics import mean from statistics import mean
from typing import NamedTuple, Union from typing import NamedTuple, Optional, Union
import aiohttp # type: ignore import aiohttp # type: ignore
import numpy as np # type: ignore import numpy as np # type: ignore
@ -46,9 +46,9 @@ class ConversationSampling(str, Enum):
class ClientArgs(NamedTuple): class ClientArgs(NamedTuple):
seed: int seed: int
max_num_requests: int | None max_num_requests: Optional[int]
skip_first_turn: bool skip_first_turn: bool
max_turns: int | None max_turns: Optional[int]
max_active_conversations: int max_active_conversations: int
verbose: bool verbose: bool
print_content: bool print_content: bool
@ -109,9 +109,9 @@ class RequestStats(NamedTuple):
class MetricStats: class MetricStats:
def __init__(self) -> None: def __init__(self) -> None:
self.min: float | None = None self.min: Optional[float] = None
self.max: float | None = None self.max: Optional[float] = None
self.avg: float | None = None self.avg: Optional[float] = None
self.sum = 0.0 self.sum = 0.0
self.count = 0 self.count = 0
@ -143,7 +143,7 @@ class MovingAverage:
self.index = 0 self.index = 0
self.sum = 0.0 self.sum = 0.0
self.count = 0 self.count = 0
self.avg: float | None = None self.avg: Optional[float] = None
def update(self, new_value: float) -> None: def update(self, new_value: float) -> None:
if self.count < self.window_size: if self.count < self.window_size:
@ -198,6 +198,14 @@ class DebugStats:
self.logger.info("-" * 50) self.logger.info("-" * 50)
# Must support Python 3.8, we can't use str.removeprefix(prefix)
# introduced in Python 3.9
def remove_prefix(text: str, prefix: str) -> str:
if text.startswith(prefix):
return text[len(prefix) :]
return text
def nanosec_to_millisec(value: float) -> float: def nanosec_to_millisec(value: float) -> float:
return value / 1000000.0 return value / 1000000.0
@ -212,8 +220,8 @@ async def send_request(
chat_url: str, chat_url: str,
model: str, model: str,
stream: bool = True, stream: bool = True,
min_tokens: int | None = None, min_tokens: Optional[int] = None,
max_tokens: int | None = None, max_tokens: Optional[int] = None,
) -> ServerResponse: ) -> ServerResponse:
payload = { payload = {
"model": model, "model": model,
@ -242,9 +250,9 @@ async def send_request(
timeout = aiohttp.ClientTimeout(total=timeout_sec) timeout = aiohttp.ClientTimeout(total=timeout_sec)
valid_response = True valid_response = True
ttft: float | None = None ttft: Optional[float] = None
chunk_delay: list[int] = [] chunk_delay: list[int] = []
latency: float | None = None latency: Optional[float] = None
first_chunk = "" first_chunk = ""
generated_text = "" generated_text = ""
@ -261,7 +269,7 @@ async def send_request(
if not chunk_bytes: if not chunk_bytes:
continue continue
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
if chunk == "[DONE]": if chunk == "[DONE]":
# End of stream # End of stream
latency = time.perf_counter_ns() - start_time latency = time.perf_counter_ns() - start_time
@ -356,7 +364,7 @@ async def send_turn(
req_args: RequestArgs, req_args: RequestArgs,
verbose: bool, verbose: bool,
verify_output: bool, verify_output: bool,
) -> RequestStats | None: ) -> Optional[RequestStats]:
assert messages_to_use > 0 assert messages_to_use > 0
assert messages_to_use <= len(conversation_messages) assert messages_to_use <= len(conversation_messages)
@ -761,7 +769,7 @@ def get_client_config(
"Number of conversations must be equal or larger than the number of clients" "Number of conversations must be equal or larger than the number of clients"
) )
max_req_per_client: int | None = None max_req_per_client: Optional[int] = None
if args.max_num_requests is not None: if args.max_num_requests is not None:
# Max number of requests per client # Max number of requests per client
req_per_client = args.max_num_requests // args.num_clients req_per_client = args.max_num_requests // args.num_clients
@ -1024,7 +1032,7 @@ def process_statistics(
warmup_percentages: list[float], warmup_percentages: list[float],
test_params: dict, test_params: dict,
verbose: bool, verbose: bool,
gen_conv_args: GenConvArgs | None = None, gen_conv_args: Optional[GenConvArgs] = None,
excel_output: bool = False, excel_output: bool = False,
) -> None: ) -> None:
if len(client_metrics) == 0: if len(client_metrics) == 0:
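Note: the remove_prefix helper reintroduced above exists only because str.removeprefix landed in Python 3.9; it strips the "data: " prefix from each streamed chunk before the [DONE] sentinel check. A tiny usage sketch with a made-up chunk:

def remove_prefix(text: str, prefix: str) -> str:
    if text.startswith(prefix):
        return text[len(prefix):]
    return text

chunk_bytes = b'data: {"choices": [{"delta": {"content": "Hi"}}]}'
chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
assert not chunk.startswith("data: ")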

49
benchmarks/pyproject.toml Normal file
View File

@ -0,0 +1,49 @@
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed
[tool.ruff]
line-length = 88
[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
[tool.ruff.lint]
select = [
# pycodestyle
"E",
# Pyflakes
"F",
# pyupgrade
"UP",
# flake8-bugbear
"B",
# flake8-simplify
"SIM",
# isort
"I",
# flake8-logging-format
"G",
]
ignore = [
# star imports
"F405", "F403",
# lambda expression assignment
"E731",
# Loop control variable not used within loop body
"B007",
# f-string format
"UP032",
# Can remove once 3.10+ is the minimum Python version
"UP007",
]
[tool.ruff.lint.isort]
known-first-party = ["vllm"]
[tool.ruff.format]
docstring-code-format = true
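Editor's note (not part of the config file): the UP007 ignore above is the rule that governs the `Optional[X]` versus `X | None` spellings seen throughout this diff. A small illustration, runnable on Python 3.8+ because the union form is kept as a string annotation:

```python
from typing import Optional

# UP007 would flag the typing.Optional spelling and suggest the PEP 604 form.
# With UP007 ignored, both annotations below are accepted by ruff.
def f(max_tokens: Optional[int] = None) -> int:
    return max_tokens or 0

def g(max_tokens: "int | None" = None) -> int:  # string form keeps 3.8/3.9 happy
    return max_tokens or 0

assert f(None) == g(None) == 0
```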

View File

@ -213,7 +213,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
endif() endif()
set(ONEDNN_AARCH64_USE_ACL "ON") set(ONEDNN_AARCH64_USE_ACL "ON")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
add_compile_definitions(VLLM_USE_ACL)
endif() endif()
set(ONEDNN_LIBRARY_TYPE "STATIC") set(ONEDNN_LIBRARY_TYPE "STATIC")
@ -227,7 +226,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
set(ONEDNN_ENABLE_ITT_TASKS "OFF") set(ONEDNN_ENABLE_ITT_TASKS "OFF")
set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF") set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF") set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
set(ONEDNN_VERBOSE "ON") set(ONEDNN_VERBOSE "OFF")
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
FetchContent_MakeAvailable(oneDNN) FetchContent_MakeAvailable(oneDNN)

View File

@ -1,97 +0,0 @@
include(FetchContent)
set(CUTLASS_INCLUDE_DIR "${CUTLASS_INCLUDE_DIR}" CACHE PATH "Path to CUTLASS include/ directory")
if(DEFINED ENV{QUTLASS_SRC_DIR})
set(QUTLASS_SRC_DIR $ENV{QUTLASS_SRC_DIR})
endif()
if(QUTLASS_SRC_DIR)
FetchContent_Declare(
qutlass
SOURCE_DIR ${QUTLASS_SRC_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
else()
FetchContent_Declare(
qutlass
GIT_REPOSITORY https://github.com/IST-DASLab/qutlass.git
GIT_TAG 830d2c4537c7396e14a02a46fbddd18b5d107c65
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
FetchContent_Populate(qutlass)
set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}")
endif()
if(NOT qutlass_SOURCE_DIR)
message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
endif()
message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}")
cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND QUTLASS_ARCHS)
if(QUTLASS_ARCHS MATCHES "10\\.0a")
set(QUTLASS_TARGET_CC 100)
elseif(QUTLASS_ARCHS MATCHES "12\\.0a")
set(QUTLASS_TARGET_CC 120)
else()
message(FATAL_ERROR "[QUTLASS] internal error parsing CUDA_ARCHS='${QUTLASS_ARCHS}'.")
endif()
set(QUTLASS_SOURCES
${qutlass_SOURCE_DIR}/qutlass/csrc/bindings.cpp
${qutlass_SOURCE_DIR}/qutlass/csrc/gemm.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/gemm_ada.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx_sm100.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv_sm100.cu
)
set(QUTLASS_INCLUDES
${qutlass_SOURCE_DIR}
${qutlass_SOURCE_DIR}/qutlass
${qutlass_SOURCE_DIR}/qutlass/csrc/include
${qutlass_SOURCE_DIR}/qutlass/csrc/include/cutlass_extensions
)
if(CUTLASS_INCLUDE_DIR AND EXISTS "${CUTLASS_INCLUDE_DIR}/cutlass/cutlass.h")
list(APPEND QUTLASS_INCLUDES "${CUTLASS_INCLUDE_DIR}")
elseif(EXISTS "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include/cutlass/cutlass.h")
list(APPEND QUTLASS_INCLUDES "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include")
message(STATUS "[QUTLASS] Using QuTLASS vendored CUTLASS headers (no vLLM CUTLASS detected).")
else()
message(FATAL_ERROR "[QUTLASS] CUTLASS headers not found. "
"Set -DCUTLASS_INCLUDE_DIR=/path/to/cutlass/include")
endif()
set_gencode_flags_for_srcs(
SRCS "${QUTLASS_SOURCES}"
CUDA_ARCHS "${QUTLASS_ARCHS}"
)
target_sources(_C PRIVATE ${QUTLASS_SOURCES})
target_include_directories(_C PRIVATE ${QUTLASS_INCLUDES})
target_compile_definitions(_C PRIVATE
QUTLASS_DISABLE_PYBIND=1
TARGET_CUDA_ARCH=${QUTLASS_TARGET_CC}
)
set_property(SOURCE ${QUTLASS_SOURCES} APPEND PROPERTY COMPILE_OPTIONS
$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr --use_fast_math -O3>
)
else()
if("${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS "12.8")
message(STATUS
"[QUTLASS] Skipping build: CUDA 12.8 or newer is required (found ${CMAKE_CUDA_COMPILER_VERSION}).")
else()
message(STATUS
"[QUTLASS] Skipping build: no supported arch (12.0a / 10.0a) found in "
"CUDA_ARCHS='${CUDA_ARCHS}'.")
endif()
endif()

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare( FetchContent_Declare(
vllm-flash-attn vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 8f468e7da54a8e2f98abfa7c38636aac91c0cba1 GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types # Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@ -16,7 +16,7 @@ import shutil
from torch.utils.hipify.hipify_python import hipify from torch.utils.hipify.hipify_python import hipify
if __name__ == "__main__": if __name__ == '__main__':
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
# Project directory where all the source + include files live. # Project directory where all the source + include files live.
@ -34,14 +34,15 @@ if __name__ == "__main__":
) )
# Source files to convert. # Source files to convert.
parser.add_argument( parser.add_argument("sources",
"sources", help="Source files to hipify.", nargs="*", default=[] help="Source files to hipify.",
) nargs="*",
default=[])
args = parser.parse_args() args = parser.parse_args()
# Limit include scope to project_dir only # Limit include scope to project_dir only
includes = [os.path.join(args.project_dir, "*")] includes = [os.path.join(args.project_dir, '*')]
# Get absolute path for all source files. # Get absolute path for all source files.
extra_files = [os.path.abspath(s) for s in args.sources] extra_files = [os.path.abspath(s) for s in args.sources]
@ -50,31 +51,25 @@ if __name__ == "__main__":
# The directory might already exist to hold object files so we ignore that. # The directory might already exist to hold object files so we ignore that.
shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True) shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
hipify_result = hipify( hipify_result = hipify(project_directory=args.project_dir,
project_directory=args.project_dir, output_directory=args.output_dir,
output_directory=args.output_dir, header_include_dirs=[],
header_include_dirs=[], includes=includes,
includes=includes, extra_files=extra_files,
extra_files=extra_files, show_detailed=True,
show_detailed=True, is_pytorch_extension=True,
is_pytorch_extension=True, hipify_extra_files_only=True)
hipify_extra_files_only=True,
)
hipified_sources = [] hipified_sources = []
for source in args.sources: for source in args.sources:
s_abs = os.path.abspath(source) s_abs = os.path.abspath(source)
hipified_s_abs = ( hipified_s_abs = (hipify_result[s_abs].hipified_path if
hipify_result[s_abs].hipified_path (s_abs in hipify_result
if ( and hipify_result[s_abs].hipified_path is not None)
s_abs in hipify_result else s_abs)
and hipify_result[s_abs].hipified_path is not None
)
else s_abs
)
hipified_sources.append(hipified_s_abs) hipified_sources.append(hipified_s_abs)
assert len(hipified_sources) == len(args.sources) assert (len(hipified_sources) == len(args.sources))
# Print hipified source files. # Print hipified source files.
print("\n".join(hipified_sources)) print("\n".join(hipified_sources))

View File

@ -28,10 +28,10 @@
#ifdef USE_ROCM #ifdef USE_ROCM
#include <hip/hip_bf16.h> #include <hip/hip_bf16.h>
#include "../quantization/w8a8/fp8/amd/quant_utils.cuh" #include "../quantization/fp8/amd/quant_utils.cuh"
typedef __hip_bfloat16 __nv_bfloat16; typedef __hip_bfloat16 __nv_bfloat16;
#else #else
#include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh" #include "../quantization/fp8/nvidia/quant_utils.cuh"
#endif #endif
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))

View File

@ -64,11 +64,3 @@ void indexer_k_quant_and_cache(
torch::Tensor& slot_mapping, // [num_tokens] torch::Tensor& slot_mapping, // [num_tokens]
int64_t quant_block_size, // quantization block size int64_t quant_block_size, // quantization block size
const std::string& scale_fmt); const std::string& scale_fmt);
// Extract function to gather quantized K cache
void cp_gather_indexer_k_quant_cache(
const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
torch::Tensor& dst_k, // [num_tokens, head_dim]
torch::Tensor& dst_scale, // [num_tokens, head_dim / quant_block_size * 4]
const torch::Tensor& block_table, // [batch_size, num_blocks]
const torch::Tensor& cu_seq_lens); // [batch_size + 1]

View File

@ -9,14 +9,16 @@
#include "quantization/vectorization_utils.cuh" #include "quantization/vectorization_utils.cuh"
#ifdef USE_ROCM #ifdef USE_ROCM
#include "quantization/w8a8/fp8/amd/quant_utils.cuh" #include "quantization/fp8/amd/quant_utils.cuh"
#else #else
#include "quantization/w8a8/fp8/nvidia/quant_utils.cuh" #include "quantization/fp8/nvidia/quant_utils.cuh"
#endif #endif
#include <algorithm> #include <algorithm>
#include <cassert> #include <cassert>
#include <cfloat> #include <cfloat> // FLT_MIN
#include <map>
#include <vector>
#ifdef USE_ROCM #ifdef USE_ROCM
#include <hip/hip_bf16.h> #include <hip/hip_bf16.h>
@ -208,20 +210,6 @@ void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
namespace vllm { namespace vllm {
// Used to copy/convert one element
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
struct CopyWithScaleOp {
float scale;
__device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
dst = static_cast<OutT>(src);
} else {
dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
}
}
};
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt> template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void reshape_and_cache_kernel( __global__ void reshape_and_cache_kernel(
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
@ -237,51 +225,59 @@ __global__ void reshape_and_cache_kernel(
const int64_t token_idx = blockIdx.x; const int64_t token_idx = blockIdx.x;
const int64_t slot_idx = slot_mapping[token_idx]; const int64_t slot_idx = slot_mapping[token_idx];
if (slot_idx < 0) { if (slot_idx < 0) {
// Padding token that should be ignored.
return; return;
} }
const int64_t block_idx = slot_idx / block_size; const int64_t block_idx = slot_idx / block_size;
const int64_t block_offset = slot_idx % block_size; const int64_t block_offset = slot_idx % block_size;
const int h_block_count = head_size / x; // head_size//x
const int h_block_idx = threadIdx.x; const int n = num_heads * head_size;
if (h_block_idx >= num_heads * h_block_count) { for (int i = threadIdx.x; i < n; i += blockDim.x) {
return; const int64_t src_key_idx = token_idx * key_stride + i;
} const int64_t src_value_idx = token_idx * value_stride + i;
const int head_idx = h_block_idx / h_block_count; const int head_idx = i / head_size;
const int h_block = h_block_idx % h_block_count; const int head_offset = i % head_size;
const int x_idx = head_offset / x;
const int x_offset = head_offset % x;
const scalar_t* __restrict__ key_src = const int64_t tgt_key_idx =
key + token_idx * key_stride + head_idx * head_size + h_block * x; block_idx * num_heads * (head_size / x) * block_size * x +
const int64_t src_value_start = head_idx * (head_size / x) * block_size * x + x_idx * block_size * x +
token_idx * value_stride + head_idx * head_size + h_block * x; block_offset * x + x_offset;
const int64_t tgt_value_idx =
cache_t* __restrict__ key_dst = block_idx * num_heads * head_size * block_size +
key_cache + block_idx * num_heads * h_block_count * block_size * x + head_idx * head_size * block_size + head_offset * block_size +
head_idx * h_block_count * block_size * x + h_block * block_size * x + block_offset;
block_offset * x; scalar_t tgt_key = key[src_key_idx];
const int64_t tgt_value_start = scalar_t tgt_value = value[src_value_idx];
block_idx * num_heads * h_block_count * x * block_size + if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
head_idx * h_block_count * x * block_size + h_block * x * block_size + key_cache[tgt_key_idx] = tgt_key;
block_offset; value_cache[tgt_value_idx] = tgt_value;
} else {
constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4; key_cache[tgt_key_idx] =
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale; fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val}; value_cache[tgt_value_idx] =
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale; fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val}; }
vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, x, 0, 1, k_op);
const scalar_t* __restrict__ value_src = value + src_value_start;
cache_t* __restrict__ value_dst = value_cache + tgt_value_start;
#pragma unroll
for (int i = 0; i < x; i++) {
v_op(value_dst[i * block_size], value_src[i]);
} }
} }
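Editor's note: to make the paged-KV index arithmetic above easier to follow, here is a toy Python recomputation of the target key index under the `[num_blocks, num_heads, head_size/x, block_size, x]` layout. `key_cache_index` is a made-up helper for illustration only, not a vLLM API:

```python
def key_cache_index(slot_idx, head_idx, head_offset,
                    num_heads, head_size, block_size, x):
    # Mirrors tgt_key_idx in the kernel above.
    block_idx, block_offset = divmod(slot_idx, block_size)
    x_idx, x_offset = divmod(head_offset, x)
    return (block_idx * num_heads * (head_size // x) * block_size * x
            + head_idx * (head_size // x) * block_size * x
            + x_idx * block_size * x
            + block_offset * x
            + x_offset)

# Example: slot 21 with block_size=16 lives in block 1 at in-block offset 5.
print(key_cache_index(21, head_idx=2, head_offset=10,
                      num_heads=8, head_size=128, block_size=16, x=8))
```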
// Used by vectorization_utils to copy/convert one element
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
struct CopyWithScaleOp {
float scale;
__device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
dst = static_cast<OutT>(src);
} else {
dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
}
}
};
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt> template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void reshape_and_cache_flash_kernel( __global__ void reshape_and_cache_flash_kernel(
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
@ -428,81 +424,84 @@ __global__ void concat_and_cache_ds_mla_kernel(
const int64_t dst_idx_start = const int64_t dst_idx_start =
block_idx * block_stride + block_offset * entry_stride; block_idx * block_stride + block_offset * entry_stride;
// For the NoPE part, each tile of 128 elements is handled by half of one warp // Create 4 tile scales in shared memory
// (16 threads). There are 4 total tiles, so 2 warps (64 threads). __shared__ float smem[20];
// Lanes 0 and 16 of each warp write the scale values for that warp's tiles. float* shard_abs_max = smem;
// The RoPE part (last 64 elements) is handled by another 1 warp (32 threads). float* tile_scales = smem + 16;
// So in total, we use 3 warps (96 threads) per block.
// For the NoPE part, each tile of 128 elements is handled by 4 warps
// (128 threads). There are 4 total tiles, so 16 warps (512 threads).
// The first thread of the first warp in each tile writes the scale
// value for the tile. The RoPE part (last 64 elements) is handled
// by another 2 warps (64 threads).
// So in total, we use 18 warps (576 threads) per block.
// Cast kv_cache to 16_bit for RoPE values // Cast kv_cache to 16_bit for RoPE values
scalar_t* kv_cache_16bit = scalar_t* kv_cache_16bit =
reinterpret_cast<scalar_t*>(&kv_cache[dst_idx_start]); reinterpret_cast<scalar_t*>(&kv_cache[dst_idx_start]);
// The last warp handles the RoPE part // The last 64 threads handle the RoPE part
if (threadIdx.x >= 64) { if (threadIdx.x >= kv_lora_rank) {
// Each thread handles two elements of RoPE const int8_t pe_idx = threadIdx.x - kv_lora_rank;
const int8_t pe_idx_start = (threadIdx.x - 64) * 2; const int64_t src_idx = token_idx * k_pe_stride + pe_idx;
const int64_t src_idx = token_idx * k_pe_stride + pe_idx_start;
// Vectorized load of two 16-bit values, performed as one 32-bit load
const int32_t vals = *reinterpret_cast<const int32_t*>(&k_pe[src_idx]);
// RoPE values start after the packed 8-bit NoPE values and the // RoPE values start after the packed 8-bit NoPE values and the
// 32-bit scales // 32-bit scales
const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx_start; const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx;
// Vectorized store of two 16-bit values, performed as one 32-bit store kv_cache_16bit[dst_idx] = k_pe[src_idx];
*reinterpret_cast<int32_t*>(&kv_cache_16bit[dst_idx]) = vals;
return; return;
} }
// The first two warps handle the NoPE part // Determine the scale for each chunk of NoPE
const int8_t warp_idx = threadIdx.x >> 5; const int16_t tile_idx = threadIdx.x >> 7;
const int8_t lane_idx = threadIdx.x & 31; const int16_t warp_idx = (threadIdx.x & 127) >> 5;
const int8_t tile_idx = warp_idx * 2 + (lane_idx >> 4); const int16_t lane_idx = threadIdx.x & 31;
// Each thread handles 8 elements of NoPE // Load the NoPE element for this thread into registers
// Load the NoPE elements for this thread into registers const int64_t src_idx = token_idx * kv_c_stride + threadIdx.x;
const int64_t src_idx_start = token_idx * kv_c_stride + (threadIdx.x * 8); const scalar_t src_val = kv_c[src_idx];
// Vectorized load of eight 16-bit values, performed as an int4 load
const int4 vals_i4 = *reinterpret_cast<const int4*>(&kv_c[src_idx_start]);
const scalar_t* vals = reinterpret_cast<const scalar_t*>(&vals_i4);
// Max absolute value of this thread's elements // Warp-level reduction to find the max absolute value in the warp
float max_abs = fmaxf(fmaxf(fmaxf(fabsf(vals[0]), fabsf(vals[1])), float max_abs = fabsf(src_val);
fmaxf(fabsf(vals[2]), fabsf(vals[3]))),
fmaxf(fmaxf(fabsf(vals[4]), fabsf(vals[5])),
fmaxf(fabsf(vals[6]), fabsf(vals[7]))));
// Warp-level reduction to find the max absolute value in each half-warp
#pragma unroll #pragma unroll
for (int offset = 8; offset > 0; offset /= 2) { for (int offset = 16; offset > 0; offset /= 2) {
max_abs = fmaxf(max_abs, VLLM_SHFL_XOR_SYNC_WIDTH(max_abs, offset, 16)); #ifdef USE_ROCM
max_abs = fmaxf(max_abs, __shfl_down_sync(UINT64_MAX, max_abs, offset));
#else
max_abs = fmaxf(max_abs, __shfl_down_sync(0xFFFFFFFF, max_abs, offset));
#endif
} }
// Compute the scale for the tile // The first lane of each warp in each tile writes the max_abs of this part
float tile_scale = max_abs / 448.f; // of the tile to shared memory
tile_scale = fmaxf(tile_scale, FLT_MIN); if (lane_idx == 0) {
shard_abs_max[tile_idx * 4 + warp_idx] = max_abs;
}
__syncthreads();
// The first lane of each half-warp writes the scale to kv_cache // The first lane of the first warp in each tile computes the scale for the
if ((lane_idx == 0) || (lane_idx == 16)) { // tile and writes it to shared memory and to kv_cache
if (warp_idx == 0 && lane_idx == 0) {
float4 shard_abs_max_vec =
reinterpret_cast<float4*>(shard_abs_max)[tile_idx];
float tile_scale = fmaxf(fmaxf(shard_abs_max_vec.x, shard_abs_max_vec.y),
fmaxf(shard_abs_max_vec.z, shard_abs_max_vec.w)) /
448.f;
// Avoid division by zero in `scaled_convert`
tile_scales[tile_idx] = fmaxf(tile_scale, FLT_MIN);
float* kv_cache_32bit = reinterpret_cast<float*>(&kv_cache[dst_idx_start]); float* kv_cache_32bit = reinterpret_cast<float*>(&kv_cache[dst_idx_start]);
const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx; const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx;
kv_cache_32bit[dst_idx] = tile_scale; kv_cache_32bit[dst_idx] = tile_scales[tile_idx];
} }
// Now all threads in the block scale and write their elements __syncthreads();
// NoPE data is packed in the first kv_lora_rank/2 bytes (first 256 bytes)
const int64_t dst_idx_base = dst_idx_start + (threadIdx.x * 8);
uint8_t result[8]; // Now all threads in the block scale and write their element
#pragma unroll const float scale_val = tile_scales[tile_idx];
for (int i = 0; i < 8; i++) { const int64_t dst_idx = dst_idx_start + threadIdx.x;
result[i] = kv_cache[dst_idx] =
fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>( fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
vals[i], tile_scale); src_val, scale_val);
}
// Store as aligned 64-bit writes
*reinterpret_cast<uint64_t*>(&kv_cache[dst_idx_base]) =
*reinterpret_cast<const uint64_t*>(result);
} }
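Editor's note: a rough NumPy sketch of the per-128-element tile scaling described in the comments above. It assumes 448 is the fp8 e4m3 maximum magnitude; the real kernel converts each element with `fp8::scaled_convert` on device, so the float clipping here is only a stand-in:

```python
import numpy as np

def quantize_nope_tile(tile: np.ndarray):
    # One scale per 128-element tile, clamped to avoid division by zero.
    scale = max(np.abs(tile).max() / 448.0, np.finfo(np.float32).tiny)
    scaled = np.clip(tile / scale, -448.0, 448.0)  # cast to fp8 e4m3 on device
    return scaled.astype(np.float32), np.float32(scale)

tile = np.random.randn(128).astype(np.float32)
q, s = quantize_nope_tile(tile)
assert np.abs(q).max() <= 448.0
```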
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt> template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
@ -572,70 +571,6 @@ __global__ void indexer_k_quant_and_cache_kernel(
} }
} }
template <int BLOCK_Y_SIZE>
__global__ void cp_gather_indexer_k_quant_cache_kernel(
const char* __restrict__ kv_cache, // [num_blocks, block_size,
// cache_stride]
char* __restrict__ dst_k, // [num_tokens, head_dim]
char* __restrict__ dst_scale, // [num_tokens, head_dim / quant_block_size *
// 4]
const int* __restrict__ block_table, // [batch_size, num_blocks]
const int* __restrict__ cu_seq_lens, // [batch_size + 1]
const int batch_size, // batch size
const int64_t token_stride, // stride for each token in dst_k
const int64_t head_dim, // dimension of each head
const int64_t block_stride, // stride for each block in kv_cache
const int64_t cache_token_stride, // stride for each token in kv_cache
const int64_t cache_block_size, // num_tokens for each block in kv_cache
const int num_blocks, // number of blocks
const int num_tokens, // number of tokens
const int quant_block_size // quantization block size
) {
constexpr int VEC_SIZE = sizeof(float4) / sizeof(char);
const int token_idx = blockIdx.x * blockDim.y + threadIdx.y;
const int head_idx = (blockIdx.y * blockDim.x + threadIdx.x) * VEC_SIZE;
// Find batch index within a block
__shared__ int batch_idx[BLOCK_Y_SIZE];
for (int iter = 0; iter < cuda_utils::ceil_div(batch_size, int(blockDim.x));
iter++) {
int tid = iter * blockDim.x + threadIdx.x;
if (tid < batch_size) {
const int seq_start = cu_seq_lens[tid];
const int seq_end = cu_seq_lens[tid + 1];
if (token_idx >= seq_start && token_idx < seq_end) {
batch_idx[threadIdx.y] = tid;
}
}
}
#ifndef USE_ROCM
__syncwarp();
#endif
if (head_idx >= head_dim || token_idx >= num_tokens) {
return;
}
const int inbatch_seq_idx = token_idx - cu_seq_lens[batch_idx[threadIdx.y]];
const int block_idx = block_table[batch_idx[threadIdx.y] * num_blocks +
inbatch_seq_idx / cache_block_size];
const int64_t src_block_offset = block_idx * block_stride;
const int64_t cache_inblock_offset =
(inbatch_seq_idx % cache_block_size) * head_dim + head_idx;
const int64_t src_inblock_offset = src_block_offset + cache_inblock_offset;
const int64_t dst_inblock_offset = token_idx * token_stride + head_idx;
reinterpret_cast<float4*>(dst_k)[dst_inblock_offset / VEC_SIZE] =
reinterpret_cast<const float4*>(kv_cache)[src_inblock_offset / VEC_SIZE];
;
if (threadIdx.x == 0) {
const int64_t src_scale_offset =
src_block_offset + cache_block_size * head_dim +
cache_inblock_offset * 4 / quant_block_size;
reinterpret_cast<float*>(dst_scale)[dst_inblock_offset / quant_block_size] =
reinterpret_cast<const float*>(kv_cache)[src_scale_offset / 4];
}
}
} // namespace vllm } // namespace vllm
// KV_T is the data type of key and value tensors. // KV_T is the data type of key and value tensors.
@ -671,10 +606,9 @@ void reshape_and_cache(
int key_stride = key.stride(0); int key_stride = key.stride(0);
int value_stride = value.stride(0); int value_stride = value.stride(0);
int head_div_x = head_size / x;
dim3 grid(num_tokens); dim3 grid(num_tokens);
dim3 block(std::min(num_heads * head_div_x, 512)); dim3 block(std::min(num_heads * head_size, 512));
const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@ -807,12 +741,13 @@ void concat_and_cache_mla(
if (kv_cache_dtype == "fp8_ds_mla") { if (kv_cache_dtype == "fp8_ds_mla") {
dim3 grid(num_tokens); dim3 grid(num_tokens);
// For the NoPE part, each tile of 128 elements is handled by half of one // For the NoPE part, each tile of 128 elements is handled by 4 warps
// warp (16 threads). There are 4 total tiles, so 2 warps (64 threads). // (128 threads). There are 4 total tiles, so 16 warps (512 threads).
// Lanes 0 and 16 of each warp write the scale values for that warp's tiles. // The first thread of the first warp in each tile writes the scale
// The RoPE part (last 64 elements) is handled by another 1 warp (32 // value for the tile. The RoPE part (last 64 elements) is handled
// threads). So in total, we use 3 warps (96 threads) per block. // by another 2 warps (64 threads).
dim3 block(96); // So in total, we use 18 warps (576 threads) per block.
dim3 block(576);
DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype, DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
CALL_CONCAT_AND_CACHE_DS_MLA); CALL_CONCAT_AND_CACHE_DS_MLA);
} else { } else {
@ -1237,59 +1172,3 @@ void indexer_k_quant_and_cache(
DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3", DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3",
CALL_INDEXER_K_QUANT_AND_CACHE); CALL_INDEXER_K_QUANT_AND_CACHE);
} }
// Macro to dispatch the kernel based on the data amount.
#define CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(BLOCK_Y_SIZE) \
vllm::cp_gather_indexer_k_quant_cache_kernel<BLOCK_Y_SIZE> \
<<<dim3((num_tokens + BLOCK_Y_SIZE - 1) / BLOCK_Y_SIZE, \
(head_dim + 8 * vec_size - 1) / (8 * vec_size)), \
dim3(8, BLOCK_Y_SIZE), 0, stream>>>( \
reinterpret_cast<char*>(kv_cache.data_ptr()), \
reinterpret_cast<char*>(dst_k.data_ptr()), \
reinterpret_cast<char*>(dst_scale.data_ptr()), \
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
batch_size, dst_k.stride(0), dst_k.size(1), kv_cache.stride(0), \
kv_cache.stride(1), kv_cache.size(1), block_table.size(1), \
num_tokens, quant_block_size);
void cp_gather_indexer_k_quant_cache(
const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
torch::Tensor& dst_k, // [num_tokens, head_dim]
torch::Tensor& dst_scale, // [num_tokens, head_dim / quant_block_size * 4]
const torch::Tensor& block_table, // [batch_size, num_blocks]
const torch::Tensor& cu_seq_lens // [batch_size + 1]
) {
int batch_size = block_table.size(0);
int num_tokens = dst_k.size(0);
int head_dim = dst_k.size(1);
int quant_block_size = head_dim * 4 / dst_scale.size(1);
TORCH_CHECK(kv_cache.device() == dst_k.device(),
"kv_cache and dst_k must be on the same device");
TORCH_CHECK(kv_cache.device() == dst_scale.device(),
"kv_cache and dst_scale must be on the same device");
TORCH_CHECK(kv_cache.device() == block_table.device(),
"kv_cache and block_table must be on the same device");
TORCH_CHECK(kv_cache.device() == cu_seq_lens.device(),
"kv_cache and cu_seq_lens must be on the same device");
TORCH_CHECK(head_dim % quant_block_size == 0,
"head_dim must be divisible by quant_block_size");
constexpr int vec_size = 16;
const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_cache));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
if (num_tokens < 32) {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(1);
} else if (num_tokens < 64) {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(2);
} else if (num_tokens < 128) {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(4);
} else if (num_tokens < 256) {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(8);
} else if (num_tokens < 512) {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(16);
} else {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
}
}
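Editor's note: the dispatch ladder above selects the per-block token tile from the total token count. A sketch of the same selection logic; `pick_block_y_size` is a hypothetical helper used only to show the thresholds:

```python
def pick_block_y_size(num_tokens: int) -> int:
    # Mirrors the CALL_CP_GATHER_INDEXER_K_QUANT_CACHE dispatch above.
    for block_y, bound in ((1, 32), (2, 64), (4, 128), (8, 256), (16, 512)):
        if num_tokens < bound:
            return block_y
    return 32

assert pick_block_y_size(10) == 1 and pick_block_y_size(300) == 16
```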

View File

@ -137,8 +137,9 @@ DNNLMatMulPrimitiveHandler::DNNLMatMulPrimitiveHandler(
} }
void DNNLMatMulPrimitiveHandler::prepack_weight( void DNNLMatMulPrimitiveHandler::prepack_weight(
void* original_b_ptr, dnnl::memory::desc original_b_md, void* original_b_ptr, dnnl::memory::desc b_target_mem_desc) {
dnnl::memory::desc b_target_mem_desc) { dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
{b_k_stride_, b_n_stride_});
dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr); dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr);
dnnl::memory packed_weight(b_target_mem_desc, default_engine()); dnnl::memory packed_weight(b_target_mem_desc, default_engine());
{ {
@ -249,9 +250,7 @@ W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
if (a_qs_ == QuantizationStrategy::PER_TOKEN) { if (a_qs_ == QuantizationStrategy::PER_TOKEN) {
assert(!use_azp_); assert(!use_azp_);
}; };
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_, prepack_weight(args.b_ptr,
{b_k_stride_, b_n_stride_});
prepack_weight(args.b_ptr, original_b_md,
create_primitive_desc( create_primitive_desc(
MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL, MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
.use_bias = false, .use_bias = false,
@ -413,25 +412,12 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
assert(ab_type_ == dnnl::memory::data_type::f32 || assert(ab_type_ == dnnl::memory::data_type::f32 ||
ab_type_ == dnnl::memory::data_type::bf16 || ab_type_ == dnnl::memory::data_type::bf16 ||
ab_type_ == dnnl::memory::data_type::f16); ab_type_ == dnnl::memory::data_type::f16);
prepack_weight(args.b_ptr,
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
{b_k_stride_, b_n_stride_});
prepack_weight(args.b_ptr, original_b_md,
create_primitive_desc( create_primitive_desc(
MSizeCacheKey{ MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
#ifdef VLLM_USE_ACL .a_m_stride = DNNL_RUNTIME_DIM_VAL,
// Arm Compute Library (ACL) backend for oneDNN does .use_bias = false,
// not support runtime .bias_type = dnnl::memory::data_type::undef},
// dimensions, so we set M to a default value
.a_m_size = 128,
.a_m_stride = b_k_size_,
#else
.a_m_size = DNNL_RUNTIME_DIM_VAL,
.a_m_stride = DNNL_RUNTIME_DIM_VAL,
#endif
.use_bias = false,
.bias_type = dnnl::memory::data_type::undef},
true) true)
.weights_desc()); .weights_desc());
init_runtime_memory_cache(args); init_runtime_memory_cache(args);
@ -457,30 +443,12 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) {
c_storage->set_data_handle((void*)args.c_ptr); c_storage->set_data_handle((void*)args.c_ptr);
c_mem_desc->dims[0] = args.a_m_size; c_mem_desc->dims[0] = args.a_m_size;
#ifndef VLLM_USE_ACL
  // Bias is not supported in the ACL backend of oneDNN, so we handle bias by:
// 1. copying it into the result tensor
// 2. attaching a fused-sum post-op to the matmul primitive
if (args.use_bias) { if (args.use_bias) {
auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2); auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2);
bias_storage->set_data_handle((void*)args.bias_ptr); bias_storage->set_data_handle((void*)args.bias_ptr);
} }
#endif
dnnl::matmul matmul = get_matmul_cache(args);
// With ACL backend of oneDNN, the required memory format might change when the dnnl::matmul matmul = get_matmul_cache(args);
// source tensor dims change. This does not really happen in practice, so isn't
// a performance hit, but we need to support it because the API allows for it.
#ifdef VLLM_USE_ACL
auto new_expected_wei_desc =
dnnl::matmul::primitive_desc(
const_cast<dnnl_primitive_desc_t>(matmul.get_primitive_desc()))
.weights_desc();
if (new_expected_wei_desc != b_target_mem_desc_) {
prepack_weight(memory_cache_[DNNL_ARG_WEIGHTS].get_data_handle(),
b_target_mem_desc_, new_expected_wei_desc);
}
#endif
auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3); auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
scratchpad_storage->set_data_handle( scratchpad_storage->set_data_handle(
@ -516,13 +484,7 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
} else { } else {
a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_, a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
{key.a_m_stride, 1}); {key.a_m_stride, 1});
#ifdef VLLM_USE_ACL
// ACL's backend of oneDNN always expects the weight format to be "any"
b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_,
dnnl::memory::format_tag::any);
#else
b_md = b_target_mem_desc_; b_md = b_target_mem_desc_;
#endif
} }
dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_, dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
dnnl::memory::format_tag::ab); dnnl::memory::format_tag::ab);
@ -532,18 +494,8 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
if (key.use_bias) { if (key.use_bias) {
dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1}); dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
// Since ACL's matmuls don't support passing a bias_md, we apply the bias
// through a fused-sum post-op
#ifdef VLLM_USE_ACL
dnnl::post_ops post_ops;
post_ops.append_sum();
attr.set_post_ops(post_ops);
return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
attr);
#else
return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md, return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
c_md, attr); c_md, attr);
#endif
} else { } else {
return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md, return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
attr); attr);
@ -559,23 +511,13 @@ void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
default_engine(), nullptr); default_engine(), nullptr);
set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get()); set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());
// ACL matmuls don't support bias_md, so we don't need these
#ifndef VLLM_USE_ACL
memory_cache_[DNNL_ARG_BIAS] = memory_cache_[DNNL_ARG_BIAS] =
dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
default_engine(), nullptr); default_engine(), nullptr);
set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get()); set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get());
#endif
memory_cache_[DNNL_ARG_SCRATCHPAD] = memory_cache_[DNNL_ARG_SCRATCHPAD] =
dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
default_engine(), nullptr); default_engine(), nullptr);
set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get()); set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
} }
bool is_onednn_acl_supported() {
#ifdef VLLM_USE_ACL
return true;
#else
return false;
#endif
}
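Editor's note: the bias workaround used by the removed ACL path (copy the bias into the output, then let a fused-sum post-op accumulate the matmul) is arithmetically the same as adding the bias afterwards. A tiny NumPy sketch of that equivalence, with no oneDNN involved:

```python
import numpy as np

a = np.random.rand(4, 8).astype(np.float32)
b = np.random.rand(8, 3).astype(np.float32)
bias = np.random.rand(3).astype(np.float32)

c = np.broadcast_to(bias, (4, 3)).copy()  # step 1: c = bias
c += a @ b                                # step 2: fused-sum post-op, c += matmul(a, b)

assert np.allclose(c, a @ b + bias)
```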

View File

@ -101,7 +101,7 @@ class DNNLMatMulPrimitiveHandler {
protected: protected:
DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type); DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type);
void prepack_weight(void* original_b_ptr, dnnl::memory::desc original_b_md, void prepack_weight(void* original_b_ptr,
dnnl::memory::desc b_target_mem_desc); dnnl::memory::desc b_target_mem_desc);
void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr); void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr);

View File

@ -527,42 +527,21 @@ void onednn_mm(torch::Tensor& c, // [M, OC], row-major
MatMulPrimitiveHandler* ptr = MatMulPrimitiveHandler* ptr =
reinterpret_cast<MatMulPrimitiveHandler*>(handler); reinterpret_cast<MatMulPrimitiveHandler*>(handler);
// ACL matmuls expect contiguous source tensors
#ifdef VLLM_USE_ACL
torch::Tensor a_contig = a.contiguous();
#endif
MatMulPrimitiveHandler::ExecArgs exec_args; MatMulPrimitiveHandler::ExecArgs exec_args;
#ifdef VLLM_USE_ACL
exec_args.a_m_size = a_contig.size(0);
exec_args.a_m_stride = a_contig.stride(0);
#else
exec_args.a_m_size = a.size(0); exec_args.a_m_size = a.size(0);
exec_args.a_m_stride = a.stride(0); exec_args.a_m_stride = a.stride(0);
#endif
VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] { VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] {
if (bias.has_value()) { if (bias.has_value()) {
exec_args.use_bias = true; exec_args.use_bias = true;
exec_args.bias_type = get_dnnl_type<scalar_t>(); exec_args.bias_type = get_dnnl_type<scalar_t>();
#ifdef VLLM_USE_ACL
// ACL matmuls in oneDNN do not support a bias.
// We handle a matmul with bias by doing: c = bias; c += matmul(a, b)
c.copy_(bias.value());
#else
exec_args.bias_ptr = bias->data_ptr<scalar_t>(); exec_args.bias_ptr = bias->data_ptr<scalar_t>();
#endif
} else { } else {
exec_args.use_bias = false; exec_args.use_bias = false;
exec_args.bias_type = get_dnnl_type<void>(); exec_args.bias_type = get_dnnl_type<void>();
exec_args.bias_ptr = nullptr; exec_args.bias_ptr = nullptr;
} }
#ifdef VLLM_USE_ACL
exec_args.a_ptr = a_contig.data_ptr<scalar_t>();
#else
exec_args.a_ptr = a.data_ptr<scalar_t>(); exec_args.a_ptr = a.data_ptr<scalar_t>();
#endif
exec_args.c_ptr = c.data_ptr<scalar_t>(); exec_args.c_ptr = c.data_ptr<scalar_t>();
ptr->execute(exec_args); ptr->execute(exec_args);

View File

@ -27,8 +27,6 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b,
void onednn_mm(torch::Tensor& c, const torch::Tensor& a, void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
const std::optional<torch::Tensor>& bias, int64_t handler); const std::optional<torch::Tensor>& bias, int64_t handler);
bool is_onednn_acl_supported();
void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query, void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
torch::Tensor& kv_cache, double scale, torch::Tensor& kv_cache, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens); torch::Tensor& block_tables, torch::Tensor& seq_lens);
@ -183,9 +181,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"int handler) -> ()"); "int handler) -> ()");
ops.impl("onednn_mm", torch::kCPU, &onednn_mm); ops.impl("onednn_mm", torch::kCPU, &onednn_mm);
// Check if oneDNN was built with ACL backend
ops.def("is_onednn_acl_supported() -> bool", &is_onednn_acl_supported);
// Create oneDNN W8A8 handler // Create oneDNN W8A8 handler
ops.def( ops.def(
"create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType " "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType "

View File

@ -12,7 +12,6 @@ using CubMaxOp = cub::Max;
#endif // CUB_VERSION #endif // CUB_VERSION
#else #else
#include <hipcub/hipcub.hpp> #include <hipcub/hipcub.hpp>
namespace cub = hipcub; using CubAddOp = cub::Sum;
using CubAddOp = hipcub::Sum; using CubMaxOp = cub::Max;
using CubMaxOp = hipcub::Max;
#endif // USE_ROCM #endif // USE_ROCM

View File

@ -27,7 +27,7 @@ VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
**{ **{
VLLMDataType.u4b8: "u4b8", VLLMDataType.u4b8: "u4b8",
VLLMDataType.u8b128: "u8b128", VLLMDataType.u8b128: "u8b128",
}, }
} }
VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
@ -35,7 +35,7 @@ VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
**{ **{
VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t", VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
VLLMDataType.u8b128: "cutlass::vllm_uint8b128_t", VLLMDataType.u8b128: "cutlass::vllm_uint8b128_t",
}, }
} }
VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = { VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
@ -43,7 +43,7 @@ VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
**{ **{
VLLMDataType.u4b8: 4, VLLMDataType.u4b8: 4,
VLLMDataType.u8b128: 8, VLLMDataType.u8b128: 8,
}, }
} }
VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = { VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
@ -67,13 +67,15 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
DataType.f32: "at::ScalarType::Float", DataType.f32: "at::ScalarType::Float",
} }
VLLMKernelScheduleTag: dict[ VLLMKernelScheduleTag: dict[Union[
Union[MixedInputKernelScheduleType, KernelScheduleType], str MixedInputKernelScheduleType, KernelScheduleType], str] = {
] = { **KernelScheduleTag, # type: ignore
**KernelScheduleTag, # type: ignore **{
**{ MixedInputKernelScheduleType.TmaWarpSpecialized:
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501 "cutlass::gemm::KernelTmaWarpSpecialized",
MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong", # noqa: E501 MixedInputKernelScheduleType.TmaWarpSpecializedPingpong:
MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative", # noqa: E501 "cutlass::gemm::KernelTmaWarpSpecializedPingpong",
}, MixedInputKernelScheduleType.TmaWarpSpecializedCooperative:
} "cutlass::gemm::KernelTmaWarpSpecializedCooperative",
}
}

View File

@ -6,7 +6,7 @@
*/ */
#include "type_convert.cuh" #include "type_convert.cuh"
#include "quantization/w8a8/fp8/common.cuh" #include "quantization/fp8/common.cuh"
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "cub_helpers.h" #include "cub_helpers.h"
#include "core/batch_invariant.hpp" #include "core/batch_invariant.hpp"

View File

@ -17,30 +17,25 @@ FILE_HEAD = """
namespace MARLIN_NAMESPACE_NAME { namespace MARLIN_NAMESPACE_NAME {
""".strip() """.strip()
TEMPLATE = ( TEMPLATE = ("template __global__ void Marlin<"
"template __global__ void Marlin<" "{{scalar_t}}, "
"{{scalar_t}}, " "{{w_type_id}}, "
"{{w_type_id}}, " "{{s_type_id}}, "
"{{s_type_id}}, " "{{threads}}, "
"{{threads}}, " "{{thread_m_blocks}}, "
"{{thread_m_blocks}}, " "{{thread_n_blocks}}, "
"{{thread_n_blocks}}, " "{{thread_k_blocks}}, "
"{{thread_k_blocks}}, " "{{'true' if m_block_size_8 else 'false'}}, "
"{{'true' if m_block_size_8 else 'false'}}, " "{{stages}}, "
"{{stages}}, " "{{group_blocks}}, "
"{{group_blocks}}, " "{{'true' if is_zp_float else 'false'}}>"
"{{'true' if is_zp_float else 'false'}}>" "( MARLIN_KERNEL_PARAMS );")
"( MARLIN_KERNEL_PARAMS );"
)
# int8 with zero point case (vllm::kU8) is also supported, # int8 with zero point case (vllm::kU8) is also supported,
# we don't add it to reduce wheel size. # we don't add it to reduce wheel size.
SCALAR_TYPES = [ SCALAR_TYPES = [
"vllm::kU4", "vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn",
"vllm::kU4B8", "vllm::kFE2M1f"
"vllm::kU8B128",
"vllm::kFE4M3fn",
"vllm::kFE2M1f",
] ]
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)] THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
@ -63,12 +58,11 @@ def generate_new_kernels():
all_template_str_list = [] all_template_str_list = []
for group_blocks, m_blocks, thread_configs in itertools.product( for group_blocks, m_blocks, thread_configs in itertools.product(
GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
):
# act order case only support gptq-int4 and gptq-int8 # act order case only support gptq-int4 and gptq-int8
if group_blocks == 0 and scalar_type not in [ if group_blocks == 0 and scalar_type not in [
"vllm::kU4B8", "vllm::kU4B8", "vllm::kU8B128"
"vllm::kU8B128",
]: ]:
continue continue
if thread_configs[2] == 256: if thread_configs[2] == 256:
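Editor's note: the `{{...}}` placeholders in TEMPLATE suggest a Jinja-style renderer. A hedged sketch of how a single instantiation might expand (jinja2 assumed, parameter values are illustrative only):

```python
from jinja2 import Template  # assumption: Jinja-style templating, as {{...}} suggests

tpl = Template(
    "template __global__ void Marlin<"
    "{{scalar_t}}, {{w_type_id}}, {{threads}}, {{thread_m_blocks}}, "
    "{{'true' if m_block_size_8 else 'false'}}>( MARLIN_KERNEL_PARAMS );"
)
print(tpl.render(scalar_t="half", w_type_id="vllm::kU4B8.id()",
                 threads=256, thread_m_blocks=1, m_block_size_8=False))
```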

View File

@ -100,11 +100,6 @@ void apply_repetition_penalties_(torch::Tensor& logits,
const torch::Tensor& output_mask, const torch::Tensor& output_mask,
const torch::Tensor& repetition_penalties); const torch::Tensor& repetition_penalties);
void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
const torch::Tensor& rowEnds, torch::Tensor& indices,
torch::Tensor& values, int64_t numRows, int64_t stride0,
int64_t stride1);
void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input, void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
torch::Tensor& weight, torch::Tensor& scale, torch::Tensor& weight, torch::Tensor& scale,
double epsilon); double epsilon);
@ -138,12 +133,12 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& out,
torch::Tensor& input, torch::Tensor& input,
torch::Tensor& input_global_scale); torch::Tensor& input_global_scale);
#endif #endif
void persistent_masked_m_silu_mul_quant( void silu_mul_fp8_quant_deep_gemm_cuda(
const at::Tensor& input, // (E, T, 2*H) const at::Tensor& input, // (E, T, 2*H)
const at::Tensor& counts, // (E) const at::Tensor& counts, // (E)
at::Tensor& y_q, // (E, T, H) [OUT] at::Tensor& y_q, // (E, T, H) [OUT]
at::Tensor& y_s, // (E, T, H//group_size) [OUT] at::Tensor& y_s, // (E, T, H//group_size) [OUT]
bool use_ue8m0); int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens);
void mul_and_silu(torch::Tensor& out, torch::Tensor& input); void mul_and_silu(torch::Tensor& out, torch::Tensor& input);

View File

@ -7,7 +7,7 @@
#include "../cuda_compat.h" #include "../cuda_compat.h"
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "quantization/w8a8/fp8/common.cuh" #include "quantization/fp8/common.cuh"
#include <c10/util/Float8_e4m3fn.h> #include <c10/util/Float8_e4m3fn.h>
@ -114,22 +114,13 @@ __global__ void act_and_mul_quant_kernel(
} }
__device__ __forceinline__ float silu(float x) { __device__ __forceinline__ float silu(float x) {
return __fdividef(x, (1.f + expf(-x))); return (__fdividef(x, (1.f + expf(-x))));
} }
__device__ __forceinline__ float2 silu2(float2 x) { __device__ __forceinline__ float2 silu2(float2 x) {
return make_float2(silu(x.x), silu(x.y)); return make_float2(silu(x.x), silu(x.y));
} }
__device__ __forceinline__ __nv_bfloat162 silu2_v2(float2 x) {
#ifndef USE_ROCM
return make_bfloat162(__float2bfloat16_rn(silu(x.x)),
__float2bfloat16_rn(silu(x.y)));
#else
return __float22bfloat162_rn(make_float2(silu(x.x), silu(x.y)));
#endif
}
#ifndef USE_ROCM #ifndef USE_ROCM
__device__ __forceinline__ float warp_max(float v) { __device__ __forceinline__ float warp_max(float v) {
static constexpr unsigned FULL_MASK = 0xffffffffu; static constexpr unsigned FULL_MASK = 0xffffffffu;
@ -232,308 +223,224 @@ constexpr __nv_bfloat16 get_fp8_min() {
return __nv_bfloat16(__nv_bfloat16_raw{.x = 50032}); return __nv_bfloat16(__nv_bfloat16_raw{.x = 50032});
} }
} }
#ifndef USE_ROCM
template <typename Idx_t> template <typename fp8_type, int32_t NUM_WARPS, typename Idx_t,
__device__ __forceinline__ int warp_expert_search( int NUM_PARALLEL_TOKENS, bool USE_UE8M0, int GROUP_SIZE = 128,
int idx, int n, const Idx_t* __restrict__ input, Idx_t val) {
const Idx_t* input_ptr = input + idx;
int base_offset = 0;
for (;;) {
bool move_on = (idx < n && *input_ptr <= val);
unsigned mask = __ballot_sync(0xffffffff, move_on);
if (mask != 0xffffffffu) {
int last_lane = 31 - __clz(mask);
return base_offset + last_lane;
}
input_ptr += 32;
base_offset += 32;
idx += 32;
}
}
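Editor's note: `warp_expert_search` above walks the expert prefix sums 32 entries at a time with a ballot. A scalar Python equivalent of the lookup it performs; `expert_for_token` is a made-up name for illustration:

```python
import bisect

def expert_for_token(expert_offsets, token_id):
    # Largest index e with expert_offsets[e] <= token_id, i.e. the expert
    # whose token range [offsets[e], offsets[e + 1]) contains token_id.
    return bisect.bisect_right(expert_offsets, token_id) - 1

offsets = [0, 3, 3, 7, 10]        # expert 1 received zero tokens
assert expert_for_token(offsets, 4) == 2
assert expert_for_token(offsets, 0) == 0
```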
template <int num_parallel_tokens>
__device__ __forceinline__ void token_bounds(int32_t n_tokens,
int32_t worker_id,
int32_t& n_tokens_lower,
int32_t& n_tokens_upper) {
if (n_tokens < num_parallel_tokens && worker_id < n_tokens) {
if (worker_id >= num_parallel_tokens) return;
n_tokens_lower = worker_id;
n_tokens_upper = worker_id + 1;
} else {
int32_t chunk_size = n_tokens / num_parallel_tokens;
int32_t residual = n_tokens - chunk_size * num_parallel_tokens;
auto calc_id = [&](int32_t id) {
if (id < residual)
return min(n_tokens, id * (chunk_size + 1));
else
return min(n_tokens, id * chunk_size + residual);
};
n_tokens_lower = calc_id(worker_id);
n_tokens_upper = calc_id(worker_id + 1);
}
}
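Editor's note: the chunking strategy ("(n_tokens / NUM_PARALLEL_TOKENS) + residual instead of updiv") is easier to see numerically. A toy sketch of the same split, with `token_bounds` as an illustrative stand-in for the device helper above:

```python
def token_bounds(n_tokens: int, num_workers: int, worker_id: int):
    # First `residual` workers take chunk + 1 tokens, the rest take chunk tokens.
    chunk, residual = divmod(n_tokens, num_workers)
    def start(i):
        if i < residual:
            return min(n_tokens, i * (chunk + 1))
        return min(n_tokens, i * chunk + residual)
    return start(worker_id), start(worker_id + 1)

# 10 tokens over 4 workers -> [(0, 3), (3, 6), (6, 8), (8, 10)]
print([token_bounds(10, 4, i) for i in range(4)])
```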
template <int BLOCK_COUNT, int SMEM_SIZE_BYTES_Y, typename fp8_type,
int THREADS, typename Idx_t, bool USE_UE8M0, int GROUP_SIZE = 128,
int NUM_STAGES = 3> int NUM_STAGES = 3>
__global__ void silu_mul_fp8_quant_deep_gemm_kernel( __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
const __nv_bfloat16* __restrict__ _input, fp8_type* __restrict__ _y_q, const __nv_bfloat16* __restrict__ _input, fp8_type* __restrict__ _y_q,
float* __restrict__ _y_s, const int32_t* __restrict__ tokens_per_expert, float* __restrict__ _y_s, const int32_t* __restrict__ counts,
// sizes // sizes
Idx_t E, Idx_t T, Idx_t H, int H, int G,
// strides (in elements) // strides (in elements)
Idx_t stride_i_e, Idx_t stride_i_t, Idx_t stride_i_h, Idx_t stride_yq_e, Idx_t stride_i_e, Idx_t stride_i_t, Idx_t stride_i_h, Idx_t stride_yq_e,
Idx_t stride_yq_t, Idx_t stride_yq_h, Idx_t stride_ys_e, Idx_t stride_ys_t, Idx_t stride_yq_t, Idx_t stride_yq_h, Idx_t stride_ys_e, Idx_t stride_ys_t,
Idx_t stride_ys_g, Idx_t stride_counts_e) { Idx_t stride_ys_g, Idx_t stride_counts_e) {
#ifndef USE_ROCM
static constexpr int NUM_WARPS = THREADS / WARP_SIZE;
static constexpr int LOAD_STAGE_SIZE = 2 * GROUP_SIZE / 8;
static constexpr int LOAD_STAGE_MOD = NUM_STAGES * LOAD_STAGE_SIZE;
static constexpr int COMPUTE_STAGE_SIZE = 2 * GROUP_SIZE / 4;
static constexpr int COMPUTE_STAGE_MOD = COMPUTE_STAGE_SIZE * NUM_STAGES;
extern __shared__ __align__(16) __int128_t smem_128[];
int* s_expert_offsets =
reinterpret_cast<int*>(smem_128 + (SMEM_SIZE_BYTES_Y / 16));
static constexpr __nv_bfloat16 fp8_min = get_fp8_min<fp8_type>(); static constexpr __nv_bfloat16 fp8_min = get_fp8_min<fp8_type>();
static constexpr __nv_bfloat16 fp8_max = get_fp8_max<fp8_type>(); static constexpr __nv_bfloat16 fp8_max = get_fp8_max<fp8_type>();
// We assign EPS with its 16-bit unsigned counterpart to allow constexpr. // We assign EPS with its 16-bit unsigned counterpart to allow constexpr.
static constexpr __nv_bfloat16 EPS = (__nv_bfloat16_raw{.x = 11996}); static constexpr __nv_bfloat16 EPS = (__nv_bfloat16_raw{.x = 11996});
int tid = threadIdx.x;
int warp_id = tid >> 5;
int lane_id = tid & 0x1f;
int running_sum{}; // We pack 8 16-bit bfloat16 values into a 128-bit __int128_t.
if (!warp_id) { static constexpr int32_t BFLOAT16_PER_GROUP = 8;
for (int i = 0; i < E; i += WARP_SIZE) {
bool valid = (i + threadIdx.x) < E;
int value =
(valid ? tokens_per_expert[i + threadIdx.x * stride_counts_e] : 0) +
(!lane_id ? running_sum : 0);
for (int offset = 1; offset < 32; offset *= 2) { // We split the shared memory in half, corresponding to gate and up matrices:
int n = __shfl_up_sync(0xFFFFFFFFu, value, offset); // [...gate_i, ...up_i] where 0 <= i < stages.
if (lane_id >= offset) value += n; static constexpr int32_t S_NUM_128 =
} 2u * (GROUP_SIZE / BFLOAT16_PER_GROUP) * NUM_WARPS * NUM_STAGES;
static constexpr auto THREAD_COUNT = NUM_WARPS * WARP_SIZE;
static constexpr int HALF_THREAD_COUNT = THREAD_COUNT / 2;
static constexpr int32_t S_NUM_64 = S_NUM_128 * 2;
__shared__ __int128_t __align__(16) s_buff_128[S_NUM_128];
if (valid) { const int32_t tid = threadIdx.x;
s_expert_offsets[i + threadIdx.x + 1] = value; const int32_t warp_id = tid / WARP_SIZE;
} const int32_t lane_id = tid % WARP_SIZE;
running_sum = __shfl_sync(0xFFFFFFFFu, value, WARP_SIZE - 1); auto s_buff_compute_32 = reinterpret_cast<__nv_bfloat162*>(s_buff_128);
}
if (!lane_id) { // block handles one (expert e, group g)
s_expert_offsets[0] = 0; int32_t pid = blockIdx.x;
} int32_t e = pid / G;
int32_t g = pid % G;
const int32_t n_tokens = counts[e * stride_counts_e];
if (!n_tokens) {
return; // Exit ASAP.
} }
__syncthreads(); const Idx_t stride_i_t_128 = stride_i_t / 8u;
int32_t total_tokens = s_expert_offsets[E]; int32_t n_tokens_lower, n_tokens_upper;
const int warp_position_yq = warp_id * (H / NUM_WARPS);
const int warp_position_scales = warp_id * (H / (GROUP_SIZE * NUM_WARPS));
// A single block will handle tokens_per_block tokens.
// Each block i iterates over tokens of a slice of n_tokens = // Each block i iterates over tokens of a slice of n_tokens =
// expert_counts[i], with the size of chunk being // expert_counts[i], with the size of chunk being
// (n_tokens / NUM_PARALLEL_TOKENS) + residual, instead of // (n_tokens / NUM_PARALLEL_TOKENS) + residual, instead of
// updiv(n_tokens, NUM_PARALLEL_TOKENS) for better scheduling. // updiv(n_tokens, NUM_PARALLEL_TOKENS) for better scheduling.
if (n_tokens < NUM_PARALLEL_TOKENS && blockIdx.y < n_tokens) {
// Each warp will get space to store its hidden dim for gate and up. // Specialize this, but it can likely be fused.
__int128_t* s_hidden_load = smem_128 + warp_id * ((2 * 128 / 8) * NUM_STAGES); if (blockIdx.y >= NUM_PARALLEL_TOKENS) {
__int128_t* smem_load_ptr = s_hidden_load + lane_id; return;
}
const __nv_bfloat16 fp8_inv = __hdiv(__float2bfloat16(1.f), fp8_max); n_tokens_lower = blockIdx.y;
n_tokens_upper = blockIdx.y + 1;
int32_t compute_pipeline_offset_64 = 0;
int32_t load_stage_offset{};
const __nv_bfloat16 one_bf16 = __float2bfloat16_rn(1.f);
__int64_t* smem_compute_ptr = reinterpret_cast<__int64_t*>(smem_128) +
warp_id * (2 * (GROUP_SIZE / 4) * NUM_STAGES) +
lane_id;
__int64_t* s_gate64_ptr = smem_compute_ptr;
__int64_t* s_up64_ptr = smem_compute_ptr + GROUP_SIZE / 4;
int tokens_lower, tokens_upper;
token_bounds<BLOCK_COUNT>(total_tokens, blockIdx.x, tokens_lower,
tokens_upper);
Idx_t expert_id{}, expert_offset{}, next_expert_offset{};
int token_id = tokens_lower;
int32_t t_load{};
if (token_id < tokens_upper) {
expert_id = warp_expert_search<int>(lane_id, E, s_expert_offsets, token_id);
expert_offset = s_expert_offsets[expert_id];
next_expert_offset = s_expert_offsets[expert_id + 1];
} else { } else {
// This thread block has no work to do. auto chunk_size = n_tokens / NUM_PARALLEL_TOKENS;
auto residual = n_tokens - chunk_size * NUM_PARALLEL_TOKENS;
auto calc_id = [&](int32_t id) {
if (id < residual) {
return min(n_tokens, id * (chunk_size + 1));
} else {
return min(n_tokens, id * chunk_size + residual);
}
};
n_tokens_lower = calc_id(blockIdx.y);
n_tokens_upper = calc_id(blockIdx.y + 1);
}
if (n_tokens_lower >= n_tokens_upper) {
return; return;
} }
int t_load_bound = H / (GROUP_SIZE * NUM_WARPS); // We do calculations here, using constexpr wherever possible.
const Idx_t base_i = e * stride_i_e + NUM_WARPS * g * GROUP_SIZE * stride_i_h;
const Idx_t base_ys = e * stride_ys_e + NUM_WARPS * g * stride_ys_g;
const Idx_t base_yq =
e * stride_yq_e + NUM_WARPS * g * GROUP_SIZE * stride_yq_h;
Idx_t gate_off_128 = (base_i / static_cast<Idx_t>(8u));
auto input_128_ptr = reinterpret_cast<const __int128_t*>(_input);
auto gate_128_ptr = input_128_ptr + gate_off_128 + (tid % HALF_THREAD_COUNT) +
stride_i_t_128 * n_tokens_lower;
auto up_128_ptr = gate_128_ptr + (H * stride_i_h) / 8u;
auto y_s_ptr =
_y_s + base_ys + warp_id * stride_ys_g + n_tokens_lower * stride_ys_t;
auto y_q_ptr = _y_q + base_yq + warp_id * GROUP_SIZE +
stride_yq_t * n_tokens_lower + 4 * lane_id;
int32_t t_load = n_tokens_lower, load_stage_id = 0;
auto s_buff_gate_load_128 = s_buff_128 + (tid % HALF_THREAD_COUNT);
auto s_buff_up_load_128 = s_buff_gate_load_128 + S_NUM_128 / 2u;
int32_t stage_offset{};
Idx_t base_i = ((expert_id * stride_i_e) / 8) + static constexpr int32_t LOAD_STAGE_SIZE = (NUM_WARPS * WARP_SIZE / 2);
(token_id - expert_offset) * stride_i_t / 8; static constexpr int32_t LOAD_STAGE_MOD =
const Idx_t gate_warp_offset = NUM_STAGES * (NUM_WARPS * WARP_SIZE / 2);
warp_id * ((stride_i_h * H) / (8 * NUM_WARPS)) + (lane_id & 0b1111);
const __int128_t* input_128_ptr =
reinterpret_cast<const __int128_t*>(_input) + gate_warp_offset +
((lane_id < 16) ? 0 : ((H * stride_i_h) / 8));
__int128_t* load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
auto token_offset = token_id - expert_offset;
// Two halves of all threads in a block conduct global loads for gate and up,
// respectively.
auto load_and_advance_y_pred = [&] { auto load_and_advance_y_pred = [&] {
if (t_load < t_load_bound) { if (t_load < n_tokens_upper) {
// Here we are simply continuing to load data auto s_gate_stage_128_staged_ptr = s_buff_gate_load_128 + stage_offset;
// from the current token. auto s_up_stage_128_staged_ptr = s_buff_up_load_128 + stage_offset;
auto smem_load_ptr_staged = smem_load_ptr + load_stage_offset;
// It is very important that LOAD_STAGE_SIZE is constexpr to avoid // It is very important that LOAD_STAGE_SIZE is constexpr to avoid
// unnecessary ALU ops. // unnecessary ALU ops.
load_stage_offset += LOAD_STAGE_SIZE; stage_offset += LOAD_STAGE_SIZE;
load_stage_offset %= LOAD_STAGE_MOD; stage_offset %= LOAD_STAGE_MOD;
cp_async4(smem_load_ptr_staged, load_ptr); if (tid < HALF_THREAD_COUNT) {
load_ptr += GROUP_SIZE / 8; cp_async4(s_gate_stage_128_staged_ptr, gate_128_ptr);
++t_load; gate_128_ptr += stride_i_t_128;
} else if (token_id + 1 < tokens_upper) {
// We loaded everything from the current token, let's move on
// to the next one, and we checked that we have more tokens to load.
++token_id;
t_load = 0;
if (token_id >= next_expert_offset) {
// We need to find the next expert.
do {
// This is a loop because it's possible
// that some experts are assigned 0 tokens.
// NOTE: We are guaranteed that there's at least
// one more token left so we don't have to check for
// expert_id bounds.
++expert_id;
// This skips 1 memory read.
expert_offset = next_expert_offset;
next_expert_offset = s_expert_offsets[expert_id + 1];
} while (next_expert_offset == expert_offset);
base_i = expert_id * (stride_i_e / 8);
token_offset = 0;
load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
} else { } else {
// We remain within the same expert, so just cp_async4(s_up_stage_128_staged_ptr, up_128_ptr);
// move by H/4 __int128_t (2 * H/8). up_128_ptr += stride_i_t_128;
base_i += stride_yq_t / 4;
token_offset++;
} }
load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
auto smem_load_ptr_staged = smem_load_ptr + load_stage_offset;
// It is very important that LOAD_STAGE_SIZE is constexpr to avoid
// unnecessary ALU ops.
load_stage_offset += LOAD_STAGE_SIZE;
load_stage_offset %= LOAD_STAGE_MOD;
cp_async4(smem_load_ptr_staged, load_ptr);
load_ptr += GROUP_SIZE / 8;
++t_load; ++t_load;
++load_stage_id;
} }
// We fence even if there is nothing to load to simplify pipelining. // We fence even if there is nothing to load to simplify pipelining.
cp_async_fence(); cp_async_fence();
}; };
// We need to warm-up the pipeline.
#pragma unroll #pragma unroll
for (int i = 0; i < NUM_STAGES - 1; i++) { for (int i = 0; i < NUM_STAGES - 1; i++) {
load_and_advance_y_pred(); load_and_advance_y_pred();
} }
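  // Hedged sketch (not part of this change): the warm-up above follows the usual
  // cp.async software-pipelining shape -- issue NUM_STAGES - 1 asynchronous copies
  // up front, then in the steady state wait until at most NUM_STAGES - 2 remain in
  // flight, issue the next copy, and compute on the stage that has already landed
  // in shared memory. In outline (issue/wait_all_but/consume stand in for the
  // cp_async4 / cp_async_wait<N> helpers and the compute body used here):
  //
  //   for (int s = 0; s < NUM_STAGES - 1; ++s) issue(s);   // warm-up
  //   for (int t = lower; t < upper; ++t) {
  //     wait_all_but(NUM_STAGES - 2);   // oldest copy is now resident in smem
  //     issue(next_stage());            // keep the pipeline full
  //     consume(oldest_stage());        // compute overlaps the in-flight loads
  //   }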
__nv_fp8x4_e4m3* y_q_base_ptr = __int64_t* s_gate_ptr = reinterpret_cast<__int64_t*>(
reinterpret_cast<__nv_fp8x4_e4m3*>(_y_q) + lane_id; s_buff_compute_32 + warp_id * (GROUP_SIZE / 2)) +
auto y_scale_base_ptr = _y_s + warp_position_scales * stride_ys_g; lane_id;
__int64_t* s_up_ptr = s_gate_ptr + S_NUM_64 / 2;
for (auto j = tokens_lower; j < tokens_upper; j++) { static constexpr int32_t STAGE_SIZE = (GROUP_SIZE * NUM_WARPS) / 4u;
const Idx_t base_ys = expert_id * stride_ys_e; static constexpr int32_t STAGE_MOD = STAGE_SIZE * NUM_STAGES;
auto y_s_ptr = y_scale_base_ptr + base_ys + token_offset * stride_ys_t;
__nv_fp8x4_e4m3* y_q_ptr =
y_q_base_ptr + (expert_id * stride_yq_e + token_offset * stride_yq_t +
warp_position_yq * stride_yq_h) /
4;
const int COMPUTE_LIMIT = H / (GROUP_SIZE * NUM_WARPS);
for (int i = 0; i < COMPUTE_LIMIT; i++) { int32_t compute_pipeline_offset_64 = 0;
cp_async_wait<NUM_STAGES - 2>();
__syncthreads();
load_and_advance_y_pred();
__int64_t* gate64_ptr = s_gate64_ptr + compute_pipeline_offset_64; for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
__int64_t* up64_ptr = s_up64_ptr + compute_pipeline_offset_64; __nv_bfloat162 results_bf162[2];
// COMPUTE_STAGE_SIZE/MOD must also be constexpr! cp_async_wait<NUM_STAGES - 2>();
compute_pipeline_offset_64 += COMPUTE_STAGE_SIZE; __syncthreads();
compute_pipeline_offset_64 %= COMPUTE_STAGE_MOD;
__int64_t gate64 = *gate64_ptr; // We double-buffer pipelined loads so that the next load will
__int64_t up64 = *up64_ptr; // concurrently run with compute without overwrites.
load_and_advance_y_pred();
// Compute auto s_gate_compute_64 = s_gate_ptr + compute_pipeline_offset_64;
__nv_bfloat162 res[2]; auto s_up_compute_64 = s_up_ptr + compute_pipeline_offset_64;
__nv_bfloat162* s_up_comp = reinterpret_cast<__nv_bfloat162*>(&up64);
__nv_bfloat162* s_gate_comp = reinterpret_cast<__nv_bfloat162*>(&gate64); // STAGE_SIZE must also be constexpr!
compute_pipeline_offset_64 += STAGE_SIZE;
compute_pipeline_offset_64 %= STAGE_MOD;
// Each thread loads a 4-wide bfloat16 vector for gate and another for up into registers.
__int64_t gate64 = *s_gate_compute_64;
__nv_bfloat162* s_gate_compute_32 =
reinterpret_cast<__nv_bfloat162*>(&gate64);
__int64_t up64 = *s_up_compute_64;
__nv_bfloat162* s_up_compute_32 = reinterpret_cast<__nv_bfloat162*>(&up64);
#pragma unroll #pragma unroll
for (int32_t k = 0; k < 2; ++k) { for (int i = 0; i < 2; i++) {
__nv_bfloat162 gate = silu2_v2(__bfloat1622float2(s_gate_comp[k])); // For silu, we make sure that div is emitted.
res[k] = __hmul2(gate, s_up_comp[k]); float2 gate = silu2(__bfloat1622float2(s_gate_compute_32[i]));
} results_bf162[i] = __float22bfloat162_rn(gate);
}
auto _y_max2 = __hmax2(__habs2(res[0]), __habs2(res[1]));
_y_max2.x = __hmax(__hmax(_y_max2.x, _y_max2.y), EPS);
__nv_bfloat16 y_s = __hmul(warp_max(_y_max2.x), fp8_inv);
if constexpr (USE_UE8M0) {
y_s = hexp2(hceil(hlog2(y_s)));
}
__nv_bfloat16 inv_y = __hdiv(one_bf16, y_s);
auto y_s2 = make_bfloat162(inv_y, inv_y);
#pragma unroll #pragma unroll
for (int32_t k = 0; k < 2; ++k) { for (int i = 0; i < 2; i++) {
res[k] = clip(__hmul2(res[k], y_s2), __bfloat162bfloat162(fp8_min), results_bf162[i] = __hmul2(results_bf162[i], s_up_compute_32[i]);
__bfloat162bfloat162(fp8_max)); }
}
*y_q_ptr = __nv_fp8x4_e4m3(res[0], res[1]); auto _y_max2 =
y_q_ptr += WARP_SIZE * stride_yq_h; __hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));
if (!lane_id) { __nv_bfloat16 y_max_bf16 = __hmax(EPS, __hmax(_y_max2.x, _y_max2.y));
*y_s_ptr = y_s;
y_s_ptr += stride_ys_g; // An entire group is assigned to a single warp, so a simple warp reduce
} // is used.
__nv_bfloat16 y_s = warp_max(y_max_bf16) / fp8_max;
if constexpr (USE_UE8M0) {
y_s = hexp2(hceil(hlog2(y_s)));
}
auto inv_y = __float2bfloat16_rn(1.f) / y_s;
auto y_s2 = make_bfloat162(inv_y, inv_y);
#pragma unroll
for (int32_t i = 0; i < 2; ++i) {
results_bf162[i] =
clip(__hmul2(results_bf162[i], y_s2), __bfloat162bfloat162(fp8_min),
__bfloat162bfloat162(fp8_max));
}
auto fp8x4 = __nv_fp8x4_e4m3(results_bf162[0], results_bf162[1]);
*reinterpret_cast<__nv_fp8x4_e4m3*>(y_q_ptr) = fp8x4;
y_q_ptr += stride_yq_t;
if (lane_id == 0) {
*y_s_ptr = y_s;
y_s_ptr += stride_ys_t;
} }
} }
#endif
} }
#endif
} // namespace vllm } // namespace vllm
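Both the old and the new kernel body implement the same per-group arithmetic: silu on the gate half, multiply by the up half, one scale per 128-value group derived from the group's absolute maximum (rounded up to a power of two when UE8M0 scales are requested), and a clamp into the FP8 E4M3 range. A scalar reference sketch of that math, using illustrative names and the usual E4M3 constants (an assumption-labelled model, not code from this diff):

#include <algorithm>
#include <cmath>
#include <vector>

// Scalar model of silu_and_mul followed by per-group FP8 quantization.
std::vector<float> silu_mul_quant_group(const std::vector<float>& gate,
                                        const std::vector<float>& up,
                                        float& scale, bool use_ue8m0) {
  const float kFp8Max = 448.f;   // largest finite FP8 E4M3 value
  const float kEps = 1e-10f;     // assumed lower bound, mirroring the kernel's EPS
  std::vector<float> y(gate.size());
  float amax = kEps;
  for (size_t i = 0; i < gate.size(); ++i) {
    float s = gate[i] / (1.f + std::exp(-gate[i]));  // silu(x) = x * sigmoid(x)
    y[i] = s * up[i];
    amax = std::max(amax, std::fabs(y[i]));
  }
  scale = amax / kFp8Max;
  if (use_ue8m0) scale = std::exp2(std::ceil(std::log2(scale)));   // power-of-two scale
  for (float& v : y) v = std::clamp(v / scale, -kFp8Max, kFp8Max); // cast to FP8 afterwards
  return y;
}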
@ -568,14 +475,14 @@ void silu_and_mul_quant(torch::Tensor& out, // [..., d]
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel); LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
} }
void persistent_masked_m_silu_mul_quant( void silu_mul_fp8_quant_deep_gemm_cuda(
const at::Tensor& input, // (E, T, 2*H) const at::Tensor& input, // (E, T, 2*H)
const at::Tensor& tokens_per_expert, // (E) const at::Tensor& counts, // (E)
at::Tensor& y_q, // (E, T, H) [OUT] at::Tensor& y_q, // (E, T, H) [OUT]
at::Tensor& y_s, // (E, T, H//group_size) [OUT] at::Tensor& y_s, // (E, T, H//group_size) [OUT]
bool use_ue8m0) { int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens) {
#ifndef USE_ROCM #ifndef USE_ROCM
// This kernel relies heavily on cp.async and fp8 support.
// This kernel currently only supports H % 128 == 0 and assumes a // This kernel currently only supports H % 128 == 0 and assumes a
// fixed GROUP_SIZE of 128. // fixed GROUP_SIZE of 128.
TORCH_CHECK(input.dtype() == torch::kBFloat16); TORCH_CHECK(input.dtype() == torch::kBFloat16);
@ -584,6 +491,10 @@ void persistent_masked_m_silu_mul_quant(
TORCH_CHECK(y_s.dtype() == torch::kFloat32); TORCH_CHECK(y_s.dtype() == torch::kFloat32);
TORCH_CHECK(input.size(-1) % 256 == 0); TORCH_CHECK(input.size(-1) % 256 == 0);
// Check that num_parallel_tokens is a power of 2 between 1 and 64.
TORCH_CHECK(1 <= num_parallel_tokens && num_parallel_tokens <= 64);
TORCH_CHECK(!(num_parallel_tokens & (num_parallel_tokens - 1)));
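  // Note: (n & (n - 1)) clears the lowest set bit, so for n >= 1 it is zero exactly
  // when n is a power of two; together with the range check this admits only
  // 1, 2, 4, 8, 16, 32 and 64 (e.g. 24 & 23 == 16, so 24 is rejected).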
using Idx_t = int64_t; using Idx_t = int64_t;
Idx_t E = input.size(0); Idx_t E = input.size(0);
@ -599,54 +510,81 @@ void persistent_masked_m_silu_mul_quant(
Idx_t stride_ys_t = y_s.stride(1); Idx_t stride_ys_t = y_s.stride(1);
Idx_t stride_ys_g = y_s.stride(2); Idx_t stride_ys_g = y_s.stride(2);
Idx_t stride_counts_e = tokens_per_expert.stride(0); Idx_t stride_counts_e = counts.stride(0);
static constexpr int GROUP_SIZE = 128; static constexpr int GROUP_SIZE = 128;
#define KERNEL_FN \
if (use_ue8m0) { \
vllm::silu_mul_fp8_quant_deep_gemm_kernel<fp8_t, NUM_WARPS, Idx_t, \
NUM_PARALLEL_TOKENS, true> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<__nv_bfloat16*>(input.data_ptr()), \
(fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(), \
reinterpret_cast<int32_t*>(counts.data_ptr<int>()), H, G, \
stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t, \
stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g, \
stride_counts_e); \
} else { \
vllm::silu_mul_fp8_quant_deep_gemm_kernel<fp8_t, NUM_WARPS, Idx_t, \
NUM_PARALLEL_TOKENS, false> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<__nv_bfloat16*>(input.data_ptr()), \
(fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(), \
reinterpret_cast<int32_t*>(counts.data_ptr<int>()), H, G, \
stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t, \
stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g, \
stride_counts_e); \
}
#define KERNEL_CALL_H \
if (H % (4 * GROUP_SIZE) == 0) { \
static constexpr int NUM_WARPS = 4; \
populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \
KERNEL_FN \
} else { \
static constexpr int NUM_WARPS = 1; \
populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \
KERNEL_FN \
}
#define KERNEL_CALL_TOP_LEVEL \
if (num_parallel_tokens == 1) { \
static constexpr int NUM_PARALLEL_TOKENS = 1; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 2) { \
static constexpr int NUM_PARALLEL_TOKENS = 2; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 4) { \
static constexpr int NUM_PARALLEL_TOKENS = 4; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 8) { \
static constexpr int NUM_PARALLEL_TOKENS = 8; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 16) { \
static constexpr int NUM_PARALLEL_TOKENS = 16; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 32) { \
static constexpr int NUM_PARALLEL_TOKENS = 32; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 64) { \
static constexpr int NUM_PARALLEL_TOKENS = 64; \
KERNEL_CALL_H \
}
Idx_t G;
dim3 block, grid;
auto populate_launch_params = [&](int num_warps, int _num_parallel_tokens) {
G = H / Idx_t(group_size * num_warps);
grid = dim3(E * G, _num_parallel_tokens);
block = dim3(num_warps * WARP_SIZE);
};
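  // Worked example (illustrative values, not from this change): with H = 7168,
  // group_size = 128 and num_warps = 4, G = 7168 / (128 * 4) = 14, so for
  // E = 32 experts and num_parallel_tokens = 16 the launch is
  // grid = (32 * 14, 16) = (448, 16) blocks of 4 * 32 = 128 threads.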
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
#define KERNEL(BLOCK_COUNT, USE_UE8M0, THREAD_COUNT, STAGES) \ VLLM_DISPATCH_FP8_TYPES(y_q.scalar_type(),
static constexpr int NUM_WARPS = THREAD_COUNT / WARP_SIZE; \ "silu_mul_fp8_quant_deep_gemm_kernel",
int sms = SILU_V2_BLOCK_COUNT; \ [&] { KERNEL_CALL_TOP_LEVEL });
static constexpr int max_shared_mem_bytes = \
GROUP_SIZE * 2 * STAGES * NUM_WARPS * 2; \
dim3 grid(sms), block(THREAD_COUNT); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
VLLM_DISPATCH_FP8_TYPES( \
y_q.scalar_type(), "silu_mul_fp8_quant_deep_gemm_kernel", [&] { \
vllm::silu_mul_fp8_quant_deep_gemm_kernel< \
BLOCK_COUNT, max_shared_mem_bytes, fp8_t, THREAD_COUNT, Idx_t, \
USE_UE8M0, GROUP_SIZE, STAGES> \
<<<grid, block, max_shared_mem_bytes + (E + 1) * 16, stream>>>( \
reinterpret_cast<__nv_bfloat16*>(input.data_ptr()), \
(fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(), \
reinterpret_cast<int32_t*>(tokens_per_expert.data_ptr()), E, \
T, H, stride_i_e, stride_i_t, stride_i_h, stride_yq_e, \
stride_yq_t, stride_yq_h, stride_ys_e, stride_ys_t, \
stride_ys_g, stride_counts_e); \
});
static constexpr int SILU_V2_BLOCK_COUNT = 132 * 32;
if (!use_ue8m0) {
if (H >= 4096) {
static constexpr int NUM_STAGES = 4;
static constexpr int THREAD_COUNT = 256;
KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, NUM_STAGES);
} else {
static constexpr int THREAD_COUNT = 32;
KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, 2);
}
} else {
if (H >= 4096) {
static constexpr int NUM_STAGES = 4;
static constexpr int THREAD_COUNT = 256;
KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, NUM_STAGES);
} else {
static constexpr int THREAD_COUNT = 32;
KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, 2);
}
}
#endif #endif
} }

View File

@ -1,11 +1,15 @@
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <torch/all.h> #include <torch/all.h>
#ifndef USE_ROCM
#include "../per_token_group_quant_8bit.h"
#endif
#include <cmath> #include <cmath>
#include "dispatch_utils.h" #include "../../cub_helpers.h"
#include "quantization/vectorization_utils.cuh" #include "../../dispatch_utils.h"
#include "cub_helpers.h" #include "../vectorization_utils.cuh"
static inline __device__ int8_t float_to_int8_rn(float x) { static inline __device__ int8_t float_to_int8_rn(float x) {
#ifdef USE_ROCM #ifdef USE_ROCM
@ -21,6 +25,7 @@ static inline __device__ int8_t float_to_int8_rn(float x) {
float dst = std::nearbyint(x); float dst = std::nearbyint(x);
// saturate // saturate
// See https://github.com/pytorch/pytorch/issues/127666 // See https://github.com/pytorch/pytorch/issues/127666
// See https://github.com/llvm/llvm-project/issues/95183 // See https://github.com/llvm/llvm-project/issues/95183
// hip-clang std::clamp __glibcxx_assert_fail host function when building on // hip-clang std::clamp __glibcxx_assert_fail host function when building on
@ -79,6 +84,7 @@ static inline __device__ int8_t int32_to_int8(int32_t x) {
static_cast<int32_t>(std::numeric_limits<int8_t>::max()); static_cast<int32_t>(std::numeric_limits<int8_t>::max());
// saturate // saturate
// See https://github.com/pytorch/pytorch/issues/127666 // See https://github.com/pytorch/pytorch/issues/127666
// See https://github.com/llvm/llvm-project/issues/95183 // See https://github.com/llvm/llvm-project/issues/95183
// hip-clang std::clamp __glibcxx_assert_fail host function when building on // hip-clang std::clamp __glibcxx_assert_fail host function when building on
@ -170,6 +176,7 @@ __global__ void dynamic_scaled_int8_quant_kernel(
float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax; float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax;
// 2. quantize
vectorize_with_alignment<16>( vectorize_with_alignment<16>(
row_in, row_out, hidden_size, tid, stride, row_in, row_out, hidden_size, tid, stride,
[=] __device__(int8_t& dst, const scalar_t& src) { [=] __device__(int8_t& dst, const scalar_t& src) {
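This hunk is the symmetric per-token int8 path: one scale per row is derived from the row's absolute maximum, and every element is scaled and rounded to the nearest int8 with saturation, matching the float_to_int8_rn helper above. A scalar sketch with illustrative names (not code from this file):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Symmetric per-token int8 quantization: scale = absmax / 127.
void quant_row_symmetric(const std::vector<float>& row,
                         std::vector<int8_t>& out, float& scale) {
  float absmax = 0.f;
  for (float v : row) absmax = std::max(absmax, std::fabs(v));
  scale = absmax / 127.f;
  const float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax;
  out.resize(row.size());
  for (size_t i = 0; i < row.size(); ++i) {
    float q = std::nearbyint(row[i] * inv_s);                    // round to nearest
    out[i] = static_cast<int8_t>(std::clamp(q, -128.f, 127.f));  // saturate
  }
}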
@ -187,6 +194,7 @@ struct MinMax {
__host__ __device__ explicit MinMax(float v) : min(v), max(v) {} __host__ __device__ explicit MinMax(float v) : min(v), max(v) {}
// add a value to the MinMax
__host__ __device__ MinMax& operator+=(float v) { __host__ __device__ MinMax& operator+=(float v) {
min = fminf(min, v); min = fminf(min, v);
max = fmaxf(max, v); max = fmaxf(max, v);
@ -220,6 +228,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
const scalar_t* row_in = input + token_idx * hidden_size; const scalar_t* row_in = input + token_idx * hidden_size;
int8_t* row_out = output + token_idx * hidden_size; int8_t* row_out = output + token_idx * hidden_size;
// 1. calculate min & max
MinMax thread_mm; MinMax thread_mm;
vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride, vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride,
[&] __device__(const scalar_t& src) { [&] __device__(const scalar_t& src) {
@ -252,6 +261,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
const float inv_s = 1.f / scale_sh; const float inv_s = 1.f / scale_sh;
const azp_t azp = azp_sh; const azp_t azp = azp_sh;
// 2. quantize
vectorize_with_alignment<16>( vectorize_with_alignment<16>(
row_in, row_out, hidden_size, tid, stride, row_in, row_out, hidden_size, tid, stride,
[=] __device__(int8_t& dst, const scalar_t& src) { [=] __device__(int8_t& dst, const scalar_t& src) {
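The azp variant is asymmetric: a per-token min/max pair yields both a scale and an integer zero point, and quantization adds the zero point before saturating. One common convention for that math is sketched below as an illustration of the shape of the computation, not a claim about the exact formulas in this file:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Asymmetric per-token int8 quantization with a zero point (azp).
void quant_row_asymmetric(const std::vector<float>& row, std::vector<int8_t>& out,
                          float& scale, int32_t& azp) {
  float mn = row[0], mx = row[0];
  for (float v : row) { mn = std::min(mn, v); mx = std::max(mx, v); }
  scale = std::max((mx - mn) / 255.f, 1e-10f);  // sketch-only guard for constant rows
  azp = static_cast<int32_t>(std::nearbyint(-128.f - mn / scale));
  out.resize(row.size());
  for (size_t i = 0; i < row.size(); ++i) {
    float q = std::nearbyint(row[i] / scale) + static_cast<float>(azp);
    out[i] = static_cast<int8_t>(std::clamp(q, -128.f, 127.f));
  }
}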
@ -322,4 +332,14 @@ void dynamic_scaled_int8_quant(
hidden_size); hidden_size);
} }
}); });
} }
#ifndef USE_ROCM
void per_token_group_quant_int8(const torch::Tensor& input,
torch::Tensor& output_q,
torch::Tensor& output_s, int64_t group_size,
double eps, double int8_min, double int8_max) {
per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
int8_min, int8_max);
}
#endif

View File

@ -254,7 +254,7 @@ void cutlass_moe_mm(
bool per_act_token, bool per_out_ch) { bool per_act_token, bool per_out_ch) {
int32_t version_num = get_sm_version_num(); int32_t version_num = get_sm_version_num();
#if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100 #if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
if (version_num >= 100 && version_num < 110) { if (version_num >= 100) {
cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales, cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
expert_offsets, problem_sizes, a_strides, b_strides, expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, per_act_token, per_out_ch); c_strides, per_act_token, per_out_ch);
@ -262,7 +262,7 @@ void cutlass_moe_mm(
} }
#endif #endif
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90 #if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
if (version_num >= 90 && version_num < 100) { if (version_num >= 90) {
cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales, cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
expert_offsets, problem_sizes, a_strides, b_strides, expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, per_act_token, per_out_ch); c_strides, per_act_token, per_out_ch);
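For context, cutlass_moe_mm picks a kernel by SM version; dropping the upper bounds means newer architectures fall through to the newest compiled path instead of failing the range check. A hedged sketch of the resulting dispatch shape (version_num is assumed to be SM major * 10 + minor, e.g. 90 for Hopper and 100 for Blackwell):

#include <cstdint>

// Illustrative dispatch skeleton; the guards mirror the hunk above.
void dispatch_moe_mm(int32_t version_num) {
#if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
  if (version_num >= 100) { /* run the SM100 kernel */ return; }
#endif
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
  // Without the old "< 100" bound this branch also serves newer GPUs
  // when the SM100 path is not compiled in.
  if (version_num >= 90) { /* run the SM90 kernel */ return; }
#endif
}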

View File

@ -14,8 +14,6 @@
* limitations under the License. * limitations under the License.
*/ */
#include "core/registration.h"
#include <torch/all.h> #include <torch/all.h>
#include <cutlass/arch/arch.h> #include <cutlass/arch/arch.h>
@ -420,7 +418,3 @@ void cutlass_fp4_group_mm(
"12.8 or above."); "12.8 or above.");
#endif #endif
} }
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("cutlass_fp4_group_mm", &cutlass_fp4_group_mm);
}

View File

@ -5,7 +5,7 @@
#include <hip/hip_bf16.h> #include <hip/hip_bf16.h>
#include <hip/hip_bfloat16.h> #include <hip/hip_bfloat16.h>
#include "../../../../attention/attention_dtypes.h" #include "../../../attention/attention_dtypes.h"
namespace vllm { namespace vllm {
#ifdef USE_ROCM #ifdef USE_ROCM

View File

@ -1,7 +1,7 @@
#include "common.cuh" #include "common.cuh"
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "cub_helpers.h" #include "../../cub_helpers.h"
#include "quantization/vectorization_utils.cuh" #include "../vectorization_utils.cuh"
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/Exceptions.h> #include <ATen/cuda/Exceptions.h>

View File

@ -1,6 +1,6 @@
#pragma once #pragma once
#include "../../../../attention/attention_dtypes.h" #include "../../../attention/attention_dtypes.h"
#include <assert.h> #include <assert.h>
#include <float.h> #include <float.h>
#include <stdint.h> #include <stdint.h>

View File

@ -1,6 +1,6 @@
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include "quantization/w8a8/per_token_group_quant_8bit.h" #include "../per_token_group_quant_8bit.h"
#include <cmath> #include <cmath>
@ -8,9 +8,9 @@
#include <torch/all.h> #include <torch/all.h>
#include "quantization/vectorization.cuh" #include "../vectorization.cuh"
#include "quantization/vectorization_utils.cuh" #include "../vectorization_utils.cuh"
#include "dispatch_utils.h" #include "../../dispatch_utils.h"
__device__ __forceinline__ float GroupReduceMax(float val) { __device__ __forceinline__ float GroupReduceMax(float val) {
unsigned mask = threadIdx.x % 32 >= 16 ? 0xffff0000 : 0x0000ffff; unsigned mask = threadIdx.x % 32 >= 16 ? 0xffff0000 : 0x0000ffff;
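GroupReduceMax appears to reduce over a 16-lane half warp: the mask selects the upper or lower half of the warp, and butterfly shuffles fold the maximum across those 16 lanes. The body is truncated in this diff, so the offsets below are an assumption; a hedged sketch of such a reduction:

__device__ __forceinline__ float half_warp_reduce_max(float val) {
  // Lanes 0-15 and 16-31 reduce independently within their own half warp.
  unsigned mask = threadIdx.x % 32 >= 16 ? 0xffff0000u : 0x0000ffffu;
  #pragma unroll
  for (int offset = 8; offset > 0; offset >>= 1) {
    val = fmaxf(val, __shfl_xor_sync(mask, val, offset));
  }
  return val;
}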
@ -212,4 +212,4 @@ void per_token_group_quant_fp8(const torch::Tensor& input,
double fp8_max, bool scale_ue8m0) { double fp8_max, bool scale_ue8m0) {
per_token_group_quant_8bit(input, output_q, output_s, group_size, eps, per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
fp8_min, fp8_max, scale_ue8m0); fp8_min, fp8_max, scale_ue8m0);
} }

View File

@ -6,7 +6,7 @@
#include "quantization/vectorization.cuh" #include "quantization/vectorization.cuh"
// TODO(luka/varun):refactor common.cuh to use this file instead // TODO(luka/varun):refactor common.cuh to use this file instead
#include "quantization/w8a8/fp8/common.cuh" #include "quantization/fp8/common.cuh"
namespace vllm { namespace vllm {

View File

@ -17,32 +17,28 @@ FILE_HEAD = """
namespace MARLIN_NAMESPACE_NAME { namespace MARLIN_NAMESPACE_NAME {
""".strip() """.strip()
TEMPLATE = ( TEMPLATE = ("template __global__ void Marlin<"
"template __global__ void Marlin<" "{{scalar_t}}, "
"{{scalar_t}}, " "{{w_type_id}}, "
"{{w_type_id}}, " "{{s_type_id}}, "
"{{s_type_id}}, " "{{threads}}, "
"{{threads}}, " "{{thread_m_blocks}}, "
"{{thread_m_blocks}}, " "{{thread_n_blocks}}, "
"{{thread_n_blocks}}, " "{{thread_k_blocks}}, "
"{{thread_k_blocks}}, " "{{'true' if m_block_size_8 else 'false'}}, "
"{{'true' if m_block_size_8 else 'false'}}, " "{{stages}}, "
"{{stages}}, " "{{group_blocks}}, "
"{{group_blocks}}, " "{{'true' if is_zp_float else 'false'}}>"
"{{'true' if is_zp_float else 'false'}}>" "( MARLIN_KERNEL_PARAMS );")
"( MARLIN_KERNEL_PARAMS );"
)
# int8 with zero point case (vllm::kU8) is also supported, # int8 with zero point case (vllm::kU8) is also supported,
# we don't add it to reduce wheel size. # we don't add it to reduce wheel size.
SCALAR_TYPES = [ SCALAR_TYPES = [
"vllm::kU4", "vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn",
"vllm::kU4B8", "vllm::kFE2M1f"
"vllm::kU8B128",
"vllm::kFE4M3fn",
"vllm::kFE2M1f",
] ]
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128), (128, 64, 128)] THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128),
(128, 64, 128)]
THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4] THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
# group_blocks: # group_blocks:
@ -63,12 +59,11 @@ def generate_new_kernels():
all_template_str_list = [] all_template_str_list = []
for group_blocks, m_blocks, thread_configs in itertools.product( for group_blocks, m_blocks, thread_configs in itertools.product(
GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
):
# act order case only support gptq-int4 and gptq-int8 # act order case only support gptq-int4 and gptq-int8
if group_blocks == 0 and scalar_type not in [ if group_blocks == 0 and scalar_type not in [
"vllm::kU4B8", "vllm::kU4B8", "vllm::kU8B128"
"vllm::kU8B128",
]: ]:
continue continue
if thread_configs[2] == 256: if thread_configs[2] == 256:
@ -98,7 +93,8 @@ def generate_new_kernels():
c_dtype = "half" if dtype == "fp16" else "nv_bfloat16" c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"
is_zp_float_list = [False] is_zp_float_list = [False]
if dtype == "fp16" and scalar_type == "vllm::kU4" and group_blocks == 4: if dtype == "fp16" and scalar_type == "vllm::kU4" and \
group_blocks == 4:
# HQQ (is_zp_float = true) only supports # HQQ (is_zp_float = true) only supports
# 4bit quantization and fp16 # 4bit quantization and fp16
is_zp_float_list.append(True) is_zp_float_list.append(True)

View File

@ -12,21 +12,20 @@ from functools import reduce
from typing import Optional, Union from typing import Optional, Union
import jinja2 import jinja2
from vllm_cutlass_library_extension import ( # yapf conflicts with isort for this block
DataType, # yapf: disable
EpilogueScheduleTag, from vllm_cutlass_library_extension import (DataType, EpilogueScheduleTag,
EpilogueScheduleType, EpilogueScheduleType,
MixedInputKernelScheduleType, MixedInputKernelScheduleType,
TileSchedulerTag, TileSchedulerTag,
TileSchedulerType, TileSchedulerType, VLLMDataType,
VLLMDataType, VLLMDataTypeNames,
VLLMDataTypeNames, VLLMDataTypeSize, VLLMDataTypeTag,
VLLMDataTypeSize, VLLMDataTypeTorchDataTypeTag,
VLLMDataTypeTag, VLLMDataTypeVLLMScalarTypeTag,
VLLMDataTypeTorchDataTypeTag, VLLMKernelScheduleTag)
VLLMDataTypeVLLMScalarTypeTag,
VLLMKernelScheduleTag, # yapf: enable
)
# #
# Generator templating # Generator templating
@ -287,23 +286,18 @@ def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
tile_shape = ( tile_shape = (
f"{schedule_config.tile_shape_mn[0]}x{schedule_config.tile_shape_mn[1]}" f"{schedule_config.tile_shape_mn[0]}x{schedule_config.tile_shape_mn[1]}"
) )
cluster_shape = ( cluster_shape = (f"{schedule_config.cluster_shape_mnk[0]}" +
f"{schedule_config.cluster_shape_mnk[0]}" f"x{schedule_config.cluster_shape_mnk[1]}" +
+ f"x{schedule_config.cluster_shape_mnk[1]}" f"x{schedule_config.cluster_shape_mnk[2]}")
+ f"x{schedule_config.cluster_shape_mnk[2]}" kernel_schedule = VLLMKernelScheduleTag[schedule_config.kernel_schedule]\
) .split("::")[-1]
kernel_schedule = VLLMKernelScheduleTag[schedule_config.kernel_schedule].split( epilogue_schedule = EpilogueScheduleTag[
"::" schedule_config.epilogue_schedule].split("::")[-1]
)[-1] tile_scheduler = TileSchedulerTag[schedule_config.tile_scheduler]\
epilogue_schedule = EpilogueScheduleTag[schedule_config.epilogue_schedule].split( .split("::")[-1]
"::"
)[-1]
tile_scheduler = TileSchedulerTag[schedule_config.tile_scheduler].split("::")[-1]
return ( return (f"{tile_shape}_{cluster_shape}_{kernel_schedule}" +
f"{tile_shape}_{cluster_shape}_{kernel_schedule}" f"_{epilogue_schedule}_{tile_scheduler}")
+ f"_{epilogue_schedule}_{tile_scheduler}"
)
# mostly unique shorter sch_sig # mostly unique shorter sch_sig
@ -322,24 +316,18 @@ def generate_terse_sch_sig(schedule_config: ScheduleConfig) -> str:
# unique type_name # unique type_name
def generate_type_signature(kernel_types: TypeConfig): def generate_type_signature(kernel_types: TypeConfig):
return str( return str("".join([
"".join( VLLMDataTypeNames[getattr(kernel_types, field.name)]
[ for field in fields(TypeConfig)
VLLMDataTypeNames[getattr(kernel_types, field.name)] ]))
for field in fields(TypeConfig)
]
)
)
def generate_type_option_name(kernel_types: TypeConfig): def generate_type_option_name(kernel_types: TypeConfig):
return ", ".join( return ", ".join([
[ f"{field.name.replace('b_', 'with_')+'_type'}=" +
f"{field.name.replace('b_', 'with_') + '_type'}=" VLLMDataTypeNames[getattr(kernel_types, field.name)]
+ VLLMDataTypeNames[getattr(kernel_types, field.name)] for field in fields(TypeConfig)
for field in fields(TypeConfig) ])
]
)
def is_power_of_two(n): def is_power_of_two(n):
@ -347,6 +335,7 @@ def is_power_of_two(n):
def to_cute_constant(value: list[int]): def to_cute_constant(value: list[int]):
def _to_cute_constant(value: int): def _to_cute_constant(value: int):
if is_power_of_two(value): if is_power_of_two(value):
return f"_{value}" return f"_{value}"
@ -361,11 +350,11 @@ def to_cute_constant(value: list[int]):
def unique_schedules(impl_configs: list[ImplConfig]): def unique_schedules(impl_configs: list[ImplConfig]):
# Use dict over set for deterministic ordering # Use dict over set for deterministic ordering
return list( return list({
{ sch: None
sch: None for impl_config in impl_configs for sch in impl_config.schedules for impl_config in impl_configs
}.keys() for sch in impl_config.schedules
) }.keys())
def unsigned_type_with_bitwidth(num_bits): def unsigned_type_with_bitwidth(num_bits):
@ -391,7 +380,7 @@ template_globals = {
"gen_type_sig": generate_type_signature, "gen_type_sig": generate_type_signature,
"unique_schedules": unique_schedules, "unique_schedules": unique_schedules,
"unsigned_type_with_bitwidth": unsigned_type_with_bitwidth, "unsigned_type_with_bitwidth": unsigned_type_with_bitwidth,
"gen_type_option_name": generate_type_option_name, "gen_type_option_name": generate_type_option_name
} }
@ -409,28 +398,23 @@ prepack_dispatch_template = create_template(PREPACK_TEMPLATE)
def create_sources(impl_configs: list[ImplConfig], num_impl_files=8): def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
sources = [] sources = []
sources.append( sources.append((
( "machete_mm_dispatch",
"machete_mm_dispatch", mm_dispatch_template.render(impl_configs=impl_configs),
mm_dispatch_template.render(impl_configs=impl_configs), ))
)
)
prepack_types = [] prepack_types = []
for impl_config in impl_configs: for impl_config in impl_configs:
convert_type = ( convert_type = impl_config.types.a \
impl_config.types.a if impl_config.types.b_group_scale == DataType.void \
if impl_config.types.b_group_scale == DataType.void else impl_config.types.b_group_scale
else impl_config.types.b_group_scale
)
prepack_types.append( prepack_types.append(
PrepackTypeConfig( PrepackTypeConfig(
a=impl_config.types.a, a=impl_config.types.a,
b_num_bits=VLLMDataTypeSize[impl_config.types.b], b_num_bits=VLLMDataTypeSize[impl_config.types.b],
convert=convert_type, convert=convert_type,
accumulator=impl_config.types.accumulator, accumulator=impl_config.types.accumulator,
) ))
)
def prepacked_type_key(prepack_type: PrepackTypeConfig): def prepacked_type_key(prepack_type: PrepackTypeConfig):
# For now, we can just use the first accumulator type seen since # For now, we can just use the first accumulator type seen since
@ -446,14 +430,10 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
unique_prepack_types.append(prepack_type) unique_prepack_types.append(prepack_type)
prepack_types_seen.add(key) prepack_types_seen.add(key)
sources.append( sources.append((
( "machete_prepack",
"machete_prepack", prepack_dispatch_template.render(types=unique_prepack_types, ),
prepack_dispatch_template.render( ))
types=unique_prepack_types,
),
)
)
# Split up impls across files # Split up impls across files
num_impls = reduce(lambda x, y: x + len(y.schedules), impl_configs, 0) num_impls = reduce(lambda x, y: x + len(y.schedules), impl_configs, 0)
@ -486,12 +466,10 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
curr_impl_in_file += len(files_impls[-1][-1].schedules) curr_impl_in_file += len(files_impls[-1][-1].schedules)
for part, file_impls in enumerate(files_impls): for part, file_impls in enumerate(files_impls):
sources.append( sources.append((
( f"machete_mm_impl_part{part+1}",
f"machete_mm_impl_part{part + 1}", mm_impl_template.render(impl_configs=file_impls),
mm_impl_template.render(impl_configs=file_impls), ))
)
)
return sources return sources
@ -536,7 +514,8 @@ def generate():
# For now we use the same heuristic for all types # For now we use the same heuristic for all types
# Heuristic is currently tuned for H100s # Heuristic is currently tuned for H100s
default_heuristic = [ default_heuristic = [
(cond, ScheduleConfig(*tile_config, **sch_common_params)) # type: ignore (cond, ScheduleConfig(*tile_config,
**sch_common_params)) # type: ignore
for cond, tile_config in default_tile_heuristic_config.items() for cond, tile_config in default_tile_heuristic_config.items()
] ]
@ -562,18 +541,14 @@ def generate():
a_token_scale=DataType.void, a_token_scale=DataType.void,
out=a, out=a,
accumulator=DataType.f32, accumulator=DataType.f32,
) ) for b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
for b in (VLLMDataType.u4b8, VLLMDataType.u8b128) for a in (DataType.f16, DataType.bf16))
for a in (DataType.f16, DataType.bf16)
)
impl_configs += [ impl_configs += [
ImplConfig(x[0], x[1], x[2]) ImplConfig(x[0], x[1], x[2])
for x in zip( for x in zip(GPTQ_kernel_type_configs,
GPTQ_kernel_type_configs, itertools.repeat(get_unique_schedules(default_heuristic)),
itertools.repeat(get_unique_schedules(default_heuristic)), itertools.repeat(default_heuristic))
itertools.repeat(default_heuristic),
)
] ]
AWQ_kernel_type_configs = list( AWQ_kernel_type_configs = list(
@ -586,18 +561,14 @@ def generate():
a_token_scale=DataType.void, a_token_scale=DataType.void,
out=a, out=a,
accumulator=DataType.f32, accumulator=DataType.f32,
) ) for b in (DataType.u4, DataType.u8)
for b in (DataType.u4, DataType.u8) for a in (DataType.f16, DataType.bf16))
for a in (DataType.f16, DataType.bf16)
)
impl_configs += [ impl_configs += [
ImplConfig(x[0], x[1], x[2]) ImplConfig(x[0], x[1], x[2])
for x in zip( for x in zip(AWQ_kernel_type_configs,
AWQ_kernel_type_configs, itertools.repeat(get_unique_schedules(default_heuristic)),
itertools.repeat(get_unique_schedules(default_heuristic)), itertools.repeat(default_heuristic))
itertools.repeat(default_heuristic),
)
] ]
# TODO: Support W4A8 when ready # TODO: Support W4A8 when ready

View File

@ -1,6 +1,7 @@
#pragma once #pragma once
#include <torch/all.h> #include <torch/all.h>
// TODO(wentao): refactor the folder to 8bit, then includes fp8 and int8 folders
// 8-bit per-token-group quantization helper used by both FP8 and INT8 // 8-bit per-token-group quantization helper used by both FP8 and INT8
void per_token_group_quant_8bit(const torch::Tensor& input, void per_token_group_quant_8bit(const torch::Tensor& input,
torch::Tensor& output_q, torch::Tensor& output_q,

View File

@ -1,12 +0,0 @@
#include <ATen/cuda/CUDAContext.h>
#include <torch/all.h>
#include "quantization/w8a8/per_token_group_quant_8bit.h"
void per_token_group_quant_int8(const torch::Tensor& input,
torch::Tensor& output_q,
torch::Tensor& output_s, int64_t group_size,
double eps, double int8_min, double int8_max) {
per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
int8_min, int8_max);
}

Some files were not shown because too many files have changed in this diff.