fix precommit

[Quantization] Modify the logic of BNB double quantization (#19742 )
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-18 21:17:43 -07:00 · 2025-06-19 03:52:09 +00:00 · 2025-06-18 20:25:15 -07:00 · 2025-06-19 09:53:55 +08:00 · 2025-06-18 17:41:11 -07:00 · 2025-06-19 08:23:12 +08:00
1628 changed files with 29256 additions and 7170 deletions
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import os
 import sys
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 import os
--- a/.buildkite/lm-eval-harness/conftest.py
+++ b/.buildkite/lm-eval-harness/conftest.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path

 import pytest
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import json
 import os
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse

--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 import json
--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from lmdeploy.serve.openai.api_client import APIClient

--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import datetime
 import json
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -1,5 +1,6 @@
 steps:
  - label: "Build wheel - CUDA 12.8"
+    id: build-wheel-cuda-12-8
    agents:
      queue: cpu_queue_postmerge
    commands:
@ -11,6 +12,7 @@ steps:
      DOCKER_BUILDKIT: "1"

  - label: "Build wheel - CUDA 12.6"
+    id: build-wheel-cuda-12-6
    agents:
      queue: cpu_queue_postmerge
    commands:
@ -28,6 +30,7 @@ steps:

  - label: "Build wheel - CUDA 11.8"
    # depends_on: block-build-cu118-wheel
+    id: build-wheel-cuda-11-8
    agents:
      queue: cpu_queue_postmerge
    commands:
@ -44,6 +47,7 @@ steps:

  - label: "Build release image"
    depends_on: block-release-image-build
+    id: build-release-image
    agents:
      queue: cpu_queue_postmerge
    commands:
@ -51,6 +55,18 @@ steps:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

+  - label: "Annotate release workflow"
+    depends_on:
+      - build-release-image
+      - build-wheel-cuda-12-8
+      - build-wheel-cuda-12-6
+      - build-wheel-cuda-11-8
+    id: annotate-release-workflow
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/annotate-release.sh"
+
  - label: "Build and publish TPU release image"
    depends_on: ~
    if: build.env("NIGHTLY") == "1"
@ -70,9 +86,10 @@ steps:
      DOCKER_BUILDKIT: "1"

  - input: "Provide Release version here"
+    id: input-release-version
    fields:
      - text: "What is the release version?"
-        key: "release-version"
+        key: release-version

  - block: "Build CPU release image"
    key: block-cpu-release-image-build
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@ -0,0 +1,31 @@
+#!/bin/bash
+
+set -ex
+
+# Get release version and strip leading 'v' if present
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
+
+if [ -z "$RELEASE_VERSION" ]; then
+  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
+  exit 1
+fi
+
+buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
+To download the wheel:
+\`\`\`
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl . 
+\`\`\`
+
+To download and upload the image:
+
+\`\`\`
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
+docker tag vllm/vllm-openai vllm/vllm-openai:latest
+docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
+docker push vllm/vllm-openai:latest
+docker push vllm/vllm-openai:v${RELEASE_VERSION}
+\`\`\`
+EOF 
--- a/.buildkite/scripts/ci-clean-log.sh
+++ b/.buildkite/scripts/ci-clean-log.sh
@ -0,0 +1,17 @@
+#!/bin/bash
+# Usage: ./ci_clean_log.sh ci.log
+# This script strips timestamps and color codes from CI log files.
+
+# Check if argument is given
+if [ $# -lt 1 ]; then
+    echo "Usage: $0 ci.log"
+    exit 1
+fi
+
+INPUT_FILE="$1"
+
+# Strip timestamps
+sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
+
+# Strip colorization
+sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@ -94,6 +94,10 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi

+if [[ $commands == *"pytest -v -s lora"* ]]; then
+  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
+fi
+
 #ignore certain kernels tests
 if [[ $commands == *" kernels/core"* ]]; then
  commands="${commands} \
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@ -7,6 +7,7 @@ set -ex
 # Setup cleanup
 remove_docker_container() {
  if [[ -n "$container_id" ]]; then
+      podman stop --all -t0
      podman rm -f "$container_id" || true
  fi
  podman system prune -f
@ -37,7 +38,7 @@ function cpu_tests() {
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-    pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
+    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
 }

 # All of CPU tests are expected to be finished less than 40 mins.
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -6,75 +6,82 @@ set -ex

 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
+OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}

+export CMAKE_BUILD_PARALLEL_LEVEL=32
+
 # Setup cleanup
 remove_docker_container() { 
    set -e; 
-    docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; 
-    docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; 
+    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; 
 }
 trap remove_docker_container EXIT
 remove_docker_container

 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
- --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
- --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2

 function cpu_tests() {
  set -e
  export NUMA_NODE=$2
-  export BUILDKITE_BUILD_NUMBER=$3
+
+  # list packages
+  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
+    set -e
+    pip list"
+
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pip list"

  # offline inference
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

  # Run basic model test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
-    pytest -v -s tests/kernels/test_cache.py -m cpu_model
-    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
-    pytest -v -s tests/models/decoder_only/language -m cpu_model
-    pytest -v -s tests/models/embedding/language -m cpu_model
-    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
-    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
-    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
+    pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+    pytest -v -s tests/models/language/generation -m cpu_model
+    pytest -v -s tests/models/language/pooling -m cpu_model
+    pytest -v -s tests/models/multimodal/generation \
+                --ignore=tests/models/multimodal/generation/test_mllama.py \
+                --ignore=tests/models/multimodal/generation/test_pixtral.py \
+                -m cpu_model"

  # Run compressed-tensor test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"

  # Run AWQ test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
-    pytest -s -v \
+    VLLM_USE_V1=0 pytest -s -v \
    tests/quantization/test_ipex_quant.py"

  # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v -k cpu_model \
    tests/basic_correctness/test_chunked_prefill.py"  

  # online serving
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
-    export VLLM_CPU_KVCACHE_SPACE=10 
-    export VLLM_CPU_OMP_THREADS_BIND=$1
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & 
    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-    python3 benchmarks/benchmark_serving.py \
+    VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
      --backend vllm \
      --dataset-name random \
      --model facebook/opt-125m \
@ -83,7 +90,7 @@ function cpu_tests() {
      --tokenizer facebook/opt-125m"

  # Run multi-lora tests
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
    tests/lora/test_qwen2vl.py"
@ -91,4 +98,4 @@ function cpu_tests() {

 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
+timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -150,11 +150,15 @@ run_and_track_test 9 "test_multimodal.py" \
 run_and_track_test 10 "test_pallas.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
 run_and_track_test 11 "test_struct_output_generate.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py"
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
 run_and_track_test 12 "test_moe_pallas.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 run_and_track_test 13 "test_lora.py" \
    "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
+run_and_track_test 14 "test_tpu_qkv_linear.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
+run_and_track_test 15 "test_spmd_model_weight_loading.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"

 # After all tests have been attempted, exit with the overall status.
 if [ "$overall_script_exit_code" -ne 0 ]; then
--- a/.buildkite/scripts/rerun-test.sh
+++ b/.buildkite/scripts/rerun-test.sh
@ -0,0 +1,18 @@
+#!/bin/bash
+
+# Usage: ./rerun_test.sh path/to/test.py::test_name
+
+# Check if argument is given
+if [ $# -lt 1 ]; then
+    echo "Usage: $0 path/to/test.py::test_name"
+    echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
+    exit 1
+fi
+
+TEST=$1
+COUNT=1
+
+while pytest -sv "$TEST"; do
+    COUNT=$((COUNT + 1))
+    echo "RUN NUMBER ${COUNT}"
+done
--- a/.buildkite/scripts/tpu/cleanup_docker.sh
+++ b/.buildkite/scripts/tpu/cleanup_docker.sh
@ -0,0 +1,24 @@
+#!/bin/bash
+
+set -euo pipefail
+
+docker_root=$(docker info -f '{{.DockerRootDir}}')
+if [ -z "$docker_root" ]; then
+  echo "Failed to determine Docker root directory."
+  exit 1
+fi
+echo "Docker root directory: $docker_root"
+# Check disk usage of the filesystem where Docker's root directory is located
+disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+# Define the threshold
+threshold=70
+if [ "$disk_usage" -gt "$threshold" ]; then
+  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+  # Remove dangling images (those that are not tagged and not used by any container)
+  docker image prune -f
+  # Remove unused volumes / force the system prune for old images as well.
+  docker volume prune -f && docker system prune --force --filter "until=72h" --all
+  echo "Docker images and volumes cleanup completed."
+else
+  echo "Disk usage is below $threshold%. No cleanup needed."
+fi
--- a/.buildkite/scripts/tpu/config_v6e_1.env
+++ b/.buildkite/scripts/tpu/config_v6e_1.env
@ -0,0 +1,14 @@
+# Environment config
+TEST_NAME=llama8b
+CONTAINER_NAME=vllm-tpu
+
+# vllm config
+MODEL=meta-llama/Llama-3.1-8B-Instruct
+MAX_NUM_SEQS=512
+MAX_NUM_BATCHED_TOKENS=512
+TENSOR_PARALLEL_SIZE=1
+MAX_MODEL_LEN=2048
+DOWNLOAD_DIR=/mnt/disks/persist
+EXPECTED_THROUGHPUT=8.0
+INPUT_LEN=1800
+OUTPUT_LEN=128
--- a/.buildkite/scripts/tpu/docker_run_bm.sh
+++ b/.buildkite/scripts/tpu/docker_run_bm.sh
@ -0,0 +1,102 @@
+#!/bin/bash
+
+if [ ! -f "$1" ]; then
+  echo "Error: The env file '$1' does not exist."
+  exit 1  # Exit the script with a non-zero status to indicate an error
+fi
+
+ENV_FILE=$1
+
+# For testing on local vm, use `set -a` to export all variables
+source /etc/environment
+source $ENV_FILE
+
+remove_docker_container() { 
+    docker rm -f tpu-test || true; 
+    docker rm -f vllm-tpu || true;
+    docker rm -f $CONTAINER_NAME || true;
+}
+
+trap remove_docker_container EXIT
+
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container
+
+# Build docker image.
+# TODO: build the image outside the script and share the image with other
+# tpu test if building time is too long.
+DOCKER_BUILDKIT=1 docker build \
+  --build-arg max_jobs=16 \
+  --build-arg USE_SCCACHE=1 \
+  --build-arg GIT_REPO_CHECK=0 \
+  --tag vllm/vllm-tpu-bm \
+  --progress plain -f docker/Dockerfile.tpu .
+
+LOG_ROOT=$(mktemp -d)
+# If mktemp fails, set -e will cause the script to exit.
+echo "Results will be stored in: $LOG_ROOT"
+
+if [ -z "$HF_TOKEN" ]; then
+  echo "Error: HF_TOKEN is not set or is empty."  
+  exit 1
+fi
+
+# Make sure mounted disk or dir exists
+if [ ! -d "$DOWNLOAD_DIR" ]; then
+    echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
+    exit 1
+fi
+
+echo "Run model $MODEL"
+echo
+
+echo "starting docker...$CONTAINER_NAME"
+echo    
+docker run \
+ -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
+ --env-file $ENV_FILE \
+ -e HF_TOKEN="$HF_TOKEN" \
+ -e TARGET_COMMIT=$BUILDKITE_COMMIT \
+ -e MODEL=$MODEL \
+ -e WORKSPACE=/workspace \
+ --name $CONTAINER_NAME \
+ -d \
+ --privileged \
+ --network host \
+ -v /dev/shm:/dev/shm \
+ vllm/vllm-tpu-bm tail -f /dev/null
+
+echo "run script..."
+echo
+docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"
+
+echo "copy result back..."
+VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
+BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
+docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG" 
+docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
+
+throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
+echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
+
+if [ "$BUILDKITE" = "true" ]; then
+  echo "Running inside Buildkite"
+  buildkite-agent artifact upload "$VLLM_LOG" 
+  buildkite-agent artifact upload "$BM_LOG"
+else
+  echo "Not running inside Buildkite"
+fi
+
+#
+# compare the throughput with EXPECTED_THROUGHPUT 
+# and assert meeting the expectation
+# 
+if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
+  echo "Failed to get the throughput"
+  exit 1
+fi
+
+if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
+  echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
+  exit 1
+fi
--- a/.buildkite/scripts/tpu/run_bm.sh
+++ b/.buildkite/scripts/tpu/run_bm.sh
@ -0,0 +1,94 @@
+#!/bin/bash
+
+set -euo pipefail
+
+VLLM_LOG="$WORKSPACE/vllm_log.txt"
+BM_LOG="$WORKSPACE/bm_log.txt"
+
+if [ -n "$TARGET_COMMIT" ]; then
+  head_hash=$(git rev-parse HEAD)
+  if [ "$TARGET_COMMIT" != "$head_hash" ]; then
+    echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
+    exit 1
+  fi
+fi
+
+echo "model: $MODEL"
+echo
+
+#
+# create a log folder
+#
+mkdir "$WORKSPACE/log"
+
+# TODO: Move to image building.
+pip install pandas
+pip install datasets
+
+#
+# create sonnet_4x
+#
+echo "Create sonnet_4x.txt"
+echo "" > benchmarks/sonnet_4x.txt
+for _ in {1..4}
+ do
+  cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
+done
+
+#
+# start vllm service in backend
+#
+echo "lanching vllm..."
+echo "logging to $VLLM_LOG"
+echo
+
+VLLM_USE_V1=1 vllm serve $MODEL \
+ --seed 42 \
+ --disable-log-requests \
+ --max-num-seqs $MAX_NUM_SEQS \
+ --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
+ --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
+ --no-enable-prefix-caching \
+ --download_dir $DOWNLOAD_DIR \
+ --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
+
+
+echo "wait for 20 minutes.."
+echo
+# sleep 1200
+# wait for 10 minutes...
+for i in {1..120}; do
+    # TODO: detect other type of errors.
+    if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
+        echo "Detected RuntimeError, exiting."
+        exit 1
+    elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
+        echo "Application started"
+        break
+    else
+        echo "wait for 10 seconds..."
+        sleep 10
+    fi
+done
+
+#
+# run test
+#
+echo "run benchmark test..."
+echo "logging to $BM_LOG"
+echo
+python benchmarks/benchmark_serving.py \
+    --backend vllm \
+    --model $MODEL  \
+    --dataset-name sonnet \
+    --dataset-path benchmarks/sonnet_4x.txt \
+    --sonnet-input-len $INPUT_LEN \
+    --sonnet-output-len $OUTPUT_LEN \
+    --ignore-eos > "$BM_LOG"
+
+echo "completed..."
+echo
+
+throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
+echo "throughput: $throughput"
+echo
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -145,6 +145,7 @@ steps:
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/test_async_llm_dp.py
+  - tests/v1/engine/test_engine_core_client.py
  commands:
  # test with tp=2 and external_dp=2
  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
@ -154,6 +155,7 @@ steps:
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
@ -175,6 +177,11 @@ steps:
  - tests/tracing
  commands:
  - pytest -v -s metrics
+  - "pip install \
+      'opentelemetry-sdk>=1.26.0' \
+      'opentelemetry-api>=1.26.0' \
+      'opentelemetry-exporter-otlp>=1.26.0' \
+      'opentelemetry-semantic-conventions-ai>=0.4.1'"
  - pytest -v -s tracing

 ##### fast check tests  #####
@ -287,7 +294,7 @@ steps:
    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py

 - label: LoRA Test %N # 15min each
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
@ -303,6 +310,7 @@ steps:
  commands:
    - pytest -v -s compile/test_pass_manager.py
    - pytest -v -s compile/test_fusion.py
+    - pytest -v -s compile/test_fusion_attn.py
    - pytest -v -s compile/test_silu_mul_quant_fusion.py
    - pytest -v -s compile/test_sequence_parallelism.py
    - pytest -v -s compile/test_async_tp.py
@ -318,6 +326,7 @@ steps:
  # these tests need to be separated, cannot combine
  - pytest -v -s compile/piecewise/test_simple.py
  - pytest -v -s compile/piecewise/test_toy_llama.py
+  - pytest -v -s compile/piecewise/test_full_cudagraph.py

 - label: PyTorch Fullgraph Test # 18min
  mirror_hardwares: [amdexperimental, amdproduction]
@ -421,6 +430,9 @@ steps:
  - vllm/model_executor/layers/quantization
  - tests/quantization
  commands:
+  # temporary install here since we need nightly, will move to requirements/test.in
+  # after torchao 0.12 release
+  - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

 - label: LM Eval Small Models # 53min
@ -663,7 +675,7 @@ steps:
  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins

 - label: Multi-step Tests (4 GPUs) # 36min
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -10,15 +10,17 @@
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
-/vllm/model_executor/guided_decoding @mgoin @russellb
+/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
 /vllm/multimodal @DarkLight1337 @ywang96
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
+/vllm/reasoning @aarnphm
+/vllm/entrypoints @aarnphm
 CMakeLists.txt @tlrmchlsmth

 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb
+/vllm/v1/structured_output @mgoin @russellb @aarnphm

 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
@ -27,8 +29,8 @@ CMakeLists.txt @tlrmchlsmth
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
-/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
+/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
 /tests/kernels @tlrmchlsmth @WoosukKwon
 /tests/model_executor/test_guided_processors.py @mgoin @russellb
 /tests/models @DarkLight1337 @ywang96
@ -38,11 +40,11 @@ CMakeLists.txt @tlrmchlsmth
 /tests/quantization @mgoin @robertgshaw2-redhat
 /tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
-/tests/v1/structured_output @mgoin @russellb
+/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
+/tests/v1/structured_output @mgoin @russellb @aarnphm
 /tests/weight_loading @mgoin @youkaichao
 /tests/lora @jeejeelee

 # Docs
 /docs @hmellor
-mkdocs.yaml @hmellor
+mkdocs.yaml @hmellor
--- a/.github/ISSUE_TEMPLATE/400-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml
@ -8,6 +8,16 @@ body:
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: markdown
+  attributes:
+    value: |
+      ⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
+      - API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
+      - Passwords or authentication credentials
+      - Private URLs or endpoints
+      - Personal or confidential data
+      
+      Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
 - type: textarea
  attributes:
    label: Your current environment
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -1,6 +1,18 @@
-FILL IN THE PR DESCRIPTION HERE
+## Essential Elements of an Effective PR Description Checklist
+- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
+- [ ] The test plan, such as providing test command.
+- [ ] The test results, such as pasting the results comparison before and after, or e2e results
+- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.

-FIX #xxxx (*link existing issues this PR will resolve*)
+PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
+
+## Purpose
+
+## Test Plan
+
+## Test Result
+
+## (Optional) Documentation Update

 <!--- pyml disable-next-line no-emphasis-as-heading -->
 **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -36,6 +36,20 @@ pull_request_rules:
      add:
        - frontend

+- name: label-llama
+  description: Automatically apply llama label
+  conditions:
+    - or:
+      - files~=^examples/.*llama.*\.py
+      - files~=^tests/.*llama.*\.py
+      - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
+      - files~=^vllm/model_executor/models/.*llama.*\.py
+      - files~=^vllm/transformers_utils/configs/.*llama.*\.py
+  actions:
+    label:
+      add:
+        - llama
+
 - name: label-multi-modality
  description: Automatically apply multi-modality label
  conditions:
@ -51,6 +65,41 @@ pull_request_rules:
      add:
        - multi-modality

+- name: label-qwen
+  description: Automatically apply qwen label
+  conditions:
+    - or:
+      - files~=^examples/.*qwen.*\.py
+      - files~=^tests/.*qwen.*\.py
+      - files~=^vllm/model_executor/models/.*qwen.*\.py
+      - files~=^vllm/reasoning/.*qwen.*\.py
+      - title~=(?i)Qwen
+      - body~=(?i)Qwen
+  actions:
+    label:
+      add:
+        - qwen
+
+- name: label-rocm
+  description: Automatically apply rocm label
+  conditions:
+    - or:
+      - files~=^csrc/rocm/
+      - files~=^docker/Dockerfile.rocm
+      - files~=^requirements/rocm.*\.txt
+      - files~=^vllm/attention/backends/rocm.*\.py
+      - files~=^vllm/attention/ops/rocm.*\.py
+      - files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
+      - files~=^vllm/v1/attention/backends/mla/rocm.*\.py
+      - files~=^tests/kernels/.*_rocm.*\.py
+      - files=vllm/platforms/rocm.py
+      - title~=(?i)AMD
+      - title~=(?i)ROCm
+  actions:
+    label:
+      add:
+        - rocm
+
 - name: label-structured-output
  description: Automatically apply structured-output label
  conditions:
--- a/.gitignore
+++ b/.gitignore
@ -200,5 +200,5 @@ benchmarks/**/*.json
 actionlint
 shellcheck*/

-# Ingore moe/marlin_moe gen code
+# Ignore moe/marlin_moe gen code
 csrc/moe/marlin_moe_wna16/kernel_*
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -11,6 +11,8 @@ repos:
  hooks:
  - id: yapf
    args: [--in-place, --verbose]
+    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
+    exclude: '(.buildkite|benchmarks|build|examples)/.*'
 - repo: https://github.com/astral-sh/ruff-pre-commit
  rev: v0.11.7
  hooks:
@ -18,12 +20,10 @@ repos:
    args: [--output-format, github, --fix]
  - id: ruff-format
    files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/codespell-project/codespell
-  rev: v2.4.1
+- repo: https://github.com/crate-ci/typos
+  rev: v1.32.0
  hooks:
-  - id: codespell
-    additional_dependencies: ['tomli']
-    args: ['--toml', 'pyproject.toml']
+  - id: typos
 - repo: https://github.com/PyCQA/isort
  rev: 6.0.1
  hooks:
@ -143,6 +143,13 @@ repos:
    types: [python]
    pass_filenames: false
    additional_dependencies: [regex]
+  - id: check-pickle-imports
+    name: Prevent new pickle/cloudpickle imports
+    entry: python tools/check_pickle_imports.py
+    language: python
+    types: [python]
+    pass_filenames: false
+    additional_dependencies: [pathspec, regex]
  # Keep `suggestion` last
  - id: suggestion
    name: Suggestion
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -182,9 +182,6 @@ include(FetchContent)
 file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
 message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")

-#
-# Set rocm version dev int.
-#
 if(VLLM_GPU_LANG STREQUAL "HIP")
  #
  # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
@ -192,7 +189,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
  set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")

-
  #
  # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
  # a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
@ -246,6 +242,7 @@ set(VLLM_EXT_SRC
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
  "csrc/layernorm_quant_kernels.cu"
+  "csrc/sampler.cu"
  "csrc/cuda_view.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
@ -311,7 +308,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # Keep building Marlin for 9.0 as there are some group sizes and shapes that
  # are not supported by Machete yet.
  # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
  if (MARLIN_ARCHS)

    #
@ -423,9 +420,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()

-  # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
-  # CUDA 12.8 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
+  # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
+  # require CUDA 12.8 or later
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
@ -457,7 +454,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # kernels for the remaining archs that are not already built for 3x.
  # (Build 8.9 for FP8)
  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-    "7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
+    "7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
  # subtract out the archs that are already built for 3x
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
  if (SCALED_MM_2X_ARCHS)
@ -545,10 +542,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  # CUTLASS MoE kernels

-  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
-  # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
-  # to compile MoE kernels that use its output.
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
+  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
+  # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
+  # if it's possible to compile MoE kernels that use its output.
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
             "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
@ -687,7 +684,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
  # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
  if (MARLIN_MOE_ARCHS)

    #
--- a/README.md
+++ b/README.md
@ -58,8 +58,8 @@ vLLM is fast with:
 - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
 - Continuous batching of incoming requests
 - Fast model execution with CUDA/HIP graph
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8.
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
+- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8
+- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer
 - Speculative decoding
 - Chunked prefill

@ -72,14 +72,14 @@ vLLM is flexible and easy to use with:
 - Tensor parallelism and pipeline parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
+- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
 - Prefix caching support
 - Multi-LoRA support

 vLLM seamlessly supports most popular open-source models on HuggingFace, including:
 - Transformer-like LLMs (e.g., Llama)
 - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
- Embedding Models (e.g. E5-Mistral)
+- Embedding Models (e.g., E5-Mistral)
 - Multi-modal LLMs (e.g., LLaVA)

 Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
@ -156,10 +156,10 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs

 - For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
 - For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
+- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
 - For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
 - For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)

 ## Media Kit

- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
+- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit)
--- a/benchmarks/auto_tune.sh
+++ b/benchmarks/auto_tune.sh
@ -10,11 +10,15 @@
 # 3. Set variables (ALL REQUIRED)
 #   BASE: your directory for vllm repo
 #   MODEL: the model served by vllm
+#   TP: ways of tensor parallelism
 #   DOWNLOAD_DIR: directory to download and load model weights.
 #   INPUT_LEN: request input len
 #   OUTPUT_LEN: request output len
 #   MIN_CACHE_HIT_PCT: prefix cache rate
 #   MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
+#   NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
+#   NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
+#   Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
 # 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
 # 5. The final result will be saved in RESULT file. 

@ -30,31 +34,27 @@
 TAG=$(date +"%Y_%m_%d_%H_%M")
 BASE=""
 MODEL="meta-llama/Llama-3.1-8B-Instruct"
+TP=1
 DOWNLOAD_DIR=""
 INPUT_LEN=4000
 OUTPUT_LEN=16
-MIN_CACHE_HIT_PCT_PCT=0
+MIN_CACHE_HIT_PCT=0
 MAX_LATENCY_ALLOWED_MS=100000000000
+NUM_SEQS_LIST="128 256"
+NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"

 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"

-echo "result file$ $RESULT"
+echo "result file: $RESULT"
 echo "model: $MODEL"
-echo

 rm -rf $LOG_FOLDER
 mkdir -p $LOG_FOLDER

 cd "$BASE/vllm"
-# create sonnet-4x.txt so that we can sample 2048 tokens for input
-echo "" > benchmarks/sonnet_4x.txt
-for _ in {1..4}
-do
-cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
-done

-pip install datasets
+pip install -q datasets

 current_hash=$(git rev-parse HEAD)
 echo "hash:$current_hash" >> "$RESULT"
@ -64,53 +64,69 @@ best_throughput=0
 best_max_num_seqs=0
 best_num_batched_tokens=0
 best_goodput=0
+
+start_server() {
+    local gpu_memory_utilization=$1
+    local max_num_seqs=$2
+    local max_num_batched_tokens=$3
+    local vllm_log=$4
+    
+    pkill -f vllm
+
+    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
+        --disable-log-requests \
+        --port 8004 \
+        --gpu-memory-utilization $gpu_memory_utilization \
+        --max-num-seqs $max_num_seqs \
+        --max-num-batched-tokens $max_num_batched_tokens \
+        --tensor-parallel-size $TP \
+        --enable-prefix-caching \
+        --load-format dummy \
+        --download-dir "$DOWNLOAD_DIR" \
+        --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
+
+    # wait for 10 minutes...
+    server_started=0
+    for i in {1..60}; do  
+        RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
+        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) 
+        if [[ "$STATUS_CODE" -eq 200 ]]; then
+            server_started=1
+            break
+        else
+            sleep 10
+        fi
+    done
+    if (( ! server_started )); then
+        echo "server did not start within 10 minutes. Please check server log at $vllm_log".
+        return 1
+    else
+        return 0
+    fi
+}
+
 run_benchmark() {
    local max_num_seqs=$1
    local max_num_batched_tokens=$2
+    local gpu_memory_utilization=$3
    echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
    echo "vllm_log: $vllm_log"
    echo
    rm -f $vllm_log
+    pkill -f vllm

-    # start the server
-    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
-        --disable-log-requests \
-        --port 8004 \
-        --gpu-memory-utilization 0.98 \
-        --max-num-seqs $max_num_seqs \
-        --max-num-batched-tokens $max_num_batched_tokens \
-        --tensor-parallel-size 1 \
-        --enable-prefix-caching \
-        --load-format dummy \
-        --download-dir $DOWNLOAD_DIR \
-        --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
-    echo "wait for 10 minutes.."
-    echo
-    # wait for 10 minutes...
-    server_started=0
-    for i in {1..60}; do        
-        if grep -Fq "Application startup complete" "$vllm_log"; then
-            echo "Application started"
-            server_started=1
-            break
-        else
-            # echo "wait for 10 seconds..."
-            sleep 10
-        fi
-    done
- 
-    if (( ! server_started )); then
-        echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
-        echo "pkill -f vllm"
-        echo
-        pkill vllm
-        sleep 10
-        return 1
+    echo "starting server..."
+    start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log
+    result=$?
+    if [[ "$result" -eq 1 ]]; then
+        echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
+    else
+        echo "server started."
    fi
+    echo
    
    echo "run benchmark test..."
-    echo
    meet_latency_requirement=0
    # get a basic qps by using request-rate inf
    bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
@ -118,29 +134,29 @@ run_benchmark() {
    python benchmarks/benchmark_serving.py \
        --backend vllm \
        --model $MODEL  \
-        --dataset-name sonnet \
-        --dataset-path benchmarks/sonnet_4x.txt \
-        --sonnet-input-len $INPUT_LEN \
-        --sonnet-output-len $OUTPUT_LEN \
+        --dataset-name random \
+        --random-input-len $INPUT_LEN \
+        --random-output-len $OUTPUT_LEN \
        --ignore-eos \
        --disable-tqdm \
        --request-rate inf \
        --percentile-metrics ttft,tpot,itl,e2el \
        --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
-        --num-prompts 100 \
-        --sonnet-prefix-len $prefix_len \
-        --port 8004 > "$bm_log"
-    through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
+        --num-prompts 1000 \
+        --random-prefix-len $prefix_len \
+        --port 8004 &> "$bm_log"
+    throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
    e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
    goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')

    if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
        meet_latency_requirement=1
+        request_rate=inf
    fi

    if (( ! meet_latency_requirement )); then
-    # start from request-rate as int(through_put) + 1
-        request_rate=$((${through_put%.*} + 1))
+    # start from request-rate as int(throughput) + 1
+        request_rate=$((${throughput%.*} + 1))
        while ((request_rate > 0)); do
            # clear prefix cache
            curl -X POST http://0.0.0.0:8004/reset_prefix_cache
@ -149,19 +165,18 @@ run_benchmark() {
            python benchmarks/benchmark_serving.py \
                --backend vllm \
                --model $MODEL  \
-                --dataset-name sonnet \
-                --dataset-path benchmarks/sonnet_4x.txt \
-                --sonnet-input-len $INPUT_LEN \
-                --sonnet-output-len $OUTPUT_LEN \
-                --ignore_eos \
+                --dataset-name random \
+                --random-input-len $INPUT_LEN \
+                --random-output-len $OUTPUT_LEN \
+                --ignore-eos \
                --disable-tqdm \
                --request-rate $request_rate \
                --percentile-metrics ttft,tpot,itl,e2el \
                --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
                --num-prompts 100 \
-                --sonnet-prefix-len $prefix_len \
-                --port 8004 > "$bm_log"
-            through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
+                --random-prefix-len $prefix_len \
+                --port 8004 &> "$bm_log"
+            throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
            e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
            goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
            if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
@ -173,10 +188,10 @@ run_benchmark() {
    fi
    # write the results and update the best result.
    if ((meet_latency_requirement)); then
-        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput"
-        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT"
-        if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
-            best_throughput=$through_put
+        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput"
+        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT"
+        if (( $(echo "$throughput > $best_throughput" | bc -l) )); then
+            best_throughput=$throughput
            best_max_num_seqs=$max_num_seqs
            best_num_batched_tokens=$max_num_batched_tokens
            best_goodput=$goodput
@ -188,22 +203,39 @@ run_benchmark() {

    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"

-    echo "pkill -f vllm"
-    echo
    pkill vllm
    sleep 10
-    rm -f $vllm_log
    printf '=%.0s' $(seq 1 20)
    return 0
 }

+read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
+read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"

-num_seqs_list="128 256"
-num_batched_tokens_list="512 1024 2048 4096"
-for num_seqs in $num_seqs_list; do
-    for num_batched_tokens in $num_batched_tokens_list; do
-        run_benchmark $num_seqs $num_batched_tokens
-        exit 0
+# first find out the max gpu-memory-utilization without HBM OOM.
+gpu_memory_utilization=0.98
+find_gpu_memory_utilization=0
+while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
+    start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
+    result=$?
+    if [[ "$result" -eq 0 ]]; then
+        find_gpu_memory_utilization=1
+        break
+    else
+        gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
+    fi
+done
+
+if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
+    echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
+else
+    echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
+    exit 1
+fi
+
+for num_seqs in "${num_seqs_list[@]}"; do
+    for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
+        run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
    done
 done
 echo "finish permutations"
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import io
 import json
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This module defines a framework for sampling benchmark requests from various
 datasets. Each dataset subclass of BenchmarkDataset must implement sample
@ -864,7 +865,15 @@ class InstructCoderDataset(HuggingFaceDataset):
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
-            prompt = f"{item['instruction']}:\n{item['input']}"
+            prompt = f"{item['input']}\n\n{item['instruction']} Just output \
+            the code, do not include any explanation."
+
+            # apply template
+            prompt = tokenizer.apply_chat_template(
+                [{"role": "user", "content": prompt}],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
            prompt_len = len(tokenizer(prompt).input_ids)
            sampled_requests.append(
                SampleRequest(
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark the latency of processing a single batch of requests."""

 import argparse
@ -122,7 +123,7 @@ def main(args: argparse.Namespace):
        save_to_pytorch_benchmark_format(args, results)


-if __name__ == "__main__":
+def create_argument_parser():
    parser = FlexibleArgumentParser(
        description="Benchmark the latency of processing a single batch of "
        "requests till completion."
@ -170,6 +171,12 @@ if __name__ == "__main__":
    # V1 enables prefix caching by default which skews the latency
    # numbers. We need to disable prefix caching by default.
    parser.set_defaults(enable_prefix_caching=False)
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
    args = parser.parse_args()
    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
        raise OSError(
--- a/benchmarks/benchmark_long_document_qa_throughput.py
+++ b/benchmarks/benchmark_long_document_qa_throughput.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Offline benchmark to test the long document QA throughput.

@ -141,7 +142,7 @@ def main(args):
    )


-if __name__ == "__main__":
+def create_argument_parser():
    parser = FlexibleArgumentParser(
        description="Benchmark the performance with or "
        "without automatic prefix caching."
@ -191,5 +192,11 @@ if __name__ == "__main__":
    )

    parser = EngineArgs.add_cli_args(parser)
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Benchmark the efficiency of prefix caching.

@ -217,7 +218,7 @@ def main(args):
    )


-if __name__ == "__main__":
+def create_argument_parser():
    parser = FlexibleArgumentParser(
        description="Benchmark the performance with or without "
        "automatic prefix caching."
@ -267,5 +268,11 @@ if __name__ == "__main__":
    )

    parser = EngineArgs.add_cli_args(parser)
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark offline prioritization."""

 import argparse
@ -160,7 +161,7 @@ def main(args: argparse.Namespace):
            json.dump(results, f, indent=4)


-if __name__ == "__main__":
+def create_argument_parser():
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument(
        "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm"
@ -203,6 +204,12 @@ if __name__ == "__main__":
    )

    parser = EngineArgs.add_cli_args(parser)
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 r"""Benchmark online serving throughput.

 On the server side, run one of the following commands:
@ -874,7 +875,7 @@ def main(args: argparse.Namespace):
        save_to_pytorch_benchmark_format(args, result_json, file_name)


-if __name__ == "__main__":
+def create_argument_parser():
    parser = FlexibleArgumentParser(
        description="Benchmark the online serving throughput."
    )
@ -1224,6 +1225,10 @@ if __name__ == "__main__":
        "script chooses a LoRA module at random.",
    )

-    args = parser.parse_args()
+    return parser

+
+if __name__ == "__main__":
+    parser = create_argument_parser()
+    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 r"""Benchmark online serving throughput with structured outputs.

 On the server side, run one of the following commands:
@ -11,7 +12,6 @@ On the client side, run:
        --model <your_model> \
        --dataset json \
        --structured-output-ratio 1.0 \
-        --structured-output-backend auto \
        --request-rate 10 \
        --num-prompts 1000

@ -850,7 +850,7 @@ def main(args: argparse.Namespace):
            json.dump(results, outfile, indent=4)


-if __name__ == "__main__":
+def create_argument_parser():
    parser = FlexibleArgumentParser(
        description="Benchmark the online serving throughput."
    )
@ -1034,5 +1034,10 @@ if __name__ == "__main__":
        help="Ratio of Structured Outputs requests",
    )

+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark offline inference throughput."""

 import argparse
@ -594,7 +595,7 @@ def validate_args(args):
        )


-if __name__ == "__main__":
+def create_argument_parser():
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument(
        "--backend",
@ -716,6 +717,12 @@ if __name__ == "__main__":
    )

    parser = AsyncEngineArgs.add_cli_args(parser)
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 import json
@ -65,4 +66,9 @@ class InfEncoder(json.JSONEncoder):

 def write_to_json(filename: str, records: list) -> None:
    with open(filename, "w") as f:
-        json.dump(records, f, cls=InfEncoder)
+        json.dump(
+            records,
+            f,
+            cls=InfEncoder,
+            default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
+        )
--- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 import copy
--- a/benchmarks/cutlass_benchmarks/utils.py
+++ b/benchmarks/cutlass_benchmarks/utils.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 # Cutlass bench utils
 from collections.abc import Iterable
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 import copy
--- a/benchmarks/cutlass_benchmarks/weight_shapes.py
+++ b/benchmarks/cutlass_benchmarks/weight_shapes.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 # Weight Shapes are in the format
 # ([K, N], TP_SPLIT_DIM)
--- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
+++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import os

--- a/benchmarks/disagg_benchmarks/round_robin_proxy.py
+++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import asyncio
 import itertools
--- a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py
+++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import json

--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import pickle as pkl
 import time
--- a/benchmarks/kernels/bench_fp8_gemm.py
+++ b/benchmarks/kernels/bench_fp8_gemm.py
@ -4,11 +4,85 @@ import copy
 import itertools

 import torch
-import triton
 from weight_shapes import WEIGHT_SHAPES

 from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
 from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
+from vllm.triton_utils import triton
+
+PROVIDER_CFGS = {
+    "torch-bf16": dict(enabled=True),
+    "fp8-tensor-w-token-a": dict(
+        w="tensor", a="token", no_a_quant=False, enabled=False
+    ),
+    "fp8-tensor-w-tensor-a": dict(
+        w="tensor", a="tensor", no_a_quant=False, enabled=True
+    ),
+    "fp8-channel-w-token-a": dict(
+        w="channel", a="token", no_a_quant=False, enabled=True
+    ),
+    "fp8-channel-w-tensor-a": dict(
+        w="channel", a="tensor", no_a_quant=False, enabled=False
+    ),
+    "fp8-tensor-w-token-a-noquant": dict(
+        w="tensor", a="token", no_a_quant=True, enabled=False
+    ),
+    "fp8-tensor-w-tensor-a-noquant": dict(
+        w="tensor", a="tensor", no_a_quant=True, enabled=True
+    ),
+    "fp8-channel-w-token-a-noquant": dict(
+        w="channel", a="token", no_a_quant=True, enabled=True
+    ),
+    "fp8-channel-w-tensor-a-noquant": dict(
+        w="channel", a="tensor", no_a_quant=True, enabled=False
+    ),
+}
+
+_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
+
+
+def _quant_weight_fp8(b: torch.Tensor, w_type: str, device: str):
+    if w_type == "tensor":
+        scale_b = torch.ones(1, device=device, dtype=torch.float32)
+        b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
+    else:
+        b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, use_per_token_if_dynamic=True)
+    return b_fp8.t(), scale_b_fp8
+
+
+def build_fp8_runner(cfg, a, b, dtype, device):
+    b_fp8, scale_b_fp8 = _quant_weight_fp8(b, cfg["w"], device)
+
+    scale_a_const = (
+        torch.ones(1, device=device, dtype=torch.float32)
+        if cfg["a"] == "tensor"
+        else None
+    )
+
+    if cfg["no_a_quant"]:
+        if cfg["a"] == "tensor":
+            a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const)
+        else:
+            a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True)
+
+        def run():
+            return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+        return run
+
+    if cfg["a"] == "tensor":
+
+        def run():
+            a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const)
+            return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+    else:
+
+        def run():
+            a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True)
+            return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+    return run


@triton.testing.perf_report(
@ -17,28 +91,8 @@ from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
        x_log=False,
        line_arg="provider",
-        line_vals=[
-            "torch-bf16",
-            # "fp8-tensor-w-token-a",
-            "fp8-tensor-w-tensor-a",
-            "fp8-channel-w-token-a",
-            # "fp8-channel-w-tensor-a",
-            # "fp8-tensor-w-token-a-noquant",
-            "fp8-tensor-w-tensor-a-noquant",
-            "fp8-channel-w-token-a-noquant",
-            # "fp8-channel-w-tensor-a-noquant",
-        ],
-        line_names=[
-            "torch-bf16",
-            # "fp8-tensor-w-token-a",
-            "fp8-tensor-w-tensor-a",
-            "fp8-channel-w-token-a",
-            # "fp8-channel-w-tensor-a",
-            # "fp8-tensor-w-token-a-noquant",
-            "fp8-tensor-w-tensor-a-noquant",
-            "fp8-channel-w-token-a-noquant",
-            # "fp8-channel-w-tensor-a-noquant",
-        ],
+        line_vals=_enabled,
+        line_names=_enabled,
        ylabel="TFLOP/s (larger is better)",
        plot_name="BF16 vs FP8 GEMMs",
        args={},
@ -49,144 +103,34 @@ def benchmark(batch_size, provider, N, K):
    device = "cuda"
    dtype = torch.bfloat16

-    # Create input tensors
    a = torch.randn((M, K), device=device, dtype=dtype)
    b = torch.randn((N, K), device=device, dtype=dtype)

    quantiles = [0.5, 0.2, 0.8]

-    if "torch-bf16" in provider:
+    if provider == "torch-bf16":
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
        )
-
-    elif "fp8" in provider:
-        # Weights are always quantized ahead of time
-        if "noquant" in provider:
-            # For no quantization, we just measure the GEMM
-            if "tensor-w-token-a" in provider:
-                # Dynamic per-token quant for A, per-tensor quant for B
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
-                assert scale_b_fp8.numel() == 1
-                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
-                    a, use_per_token_if_dynamic=True
-                )
-
-                def run_quant():
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-            elif "tensor-w-tensor-a" in provider:
-                # Static per-tensor quantization with fixed scales
-                # for both A and B
-                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
-                scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
-                assert scale_b_fp8.numel() == 1
-                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
-
-                def run_quant():
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-            elif "channel-w-token-a" in provider:
-                # Static per-channel quantization for weights, per-token
-                # quant for A
-                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
-                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
-                assert scale_b_fp8.numel() == N
-                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
-                    a, use_per_token_if_dynamic=True
-                )
-
-                def run_quant():
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-            elif "channel-w-tensor-a" in provider:
-                # Static per-channel quantization for weights, per-tensor
-                # quant for A
-                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
-                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
-                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
-                assert scale_b_fp8.numel() == N
-                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
-
-                def run_quant():
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-        else:
-            # In these cases, we quantize the activations during the GEMM call
-            if "tensor-w-token-a" in provider:
-                # Dynamic per-token quant for A, per-tensor quant for B
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
-                assert scale_b_fp8.numel() == 1
-
-                def run_quant():
-                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
-                        a, use_per_token_if_dynamic=True
-                    )
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-            elif "tensor-w-tensor-a" in provider:
-                # Static per-tensor quantization with fixed scales
-                # for both A and B
-                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
-                scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
-                assert scale_b_fp8.numel() == 1
-
-                def run_quant():
-                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-            elif "channel-w-token-a" in provider:
-                # Static per-channel quantization for weights, per-token
-                # quant for A
-                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
-                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
-                assert scale_b_fp8.numel() == N
-
-                def run_quant():
-                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
-                        a, use_per_token_if_dynamic=True
-                    )
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-            elif "channel-w-tensor-a" in provider:
-                # Static per-channel quantization for weights, per-tensor
-                # quant for A
-                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
-                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
-                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
-                assert scale_b_fp8.numel() == N
-
-                def run_quant():
-                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-        b_fp8 = b_fp8.t()
-
+    else:
+        cfg = PROVIDER_CFGS[provider]
+        run_quant = build_fp8_runner(cfg, a, b, dtype, device)
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: run_quant(), quantiles=quantiles
        )

-    # Calculate TFLOP/s, two flops per multiply-add
-    tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3)
-    return tflops(ms), tflops(max_ms), tflops(min_ms)
+    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)


 def prepare_shapes(args):
-    KN_model_names = []
-    models_tps = list(itertools.product(args.models, args.tp_sizes))
-    for model, tp_size in models_tps:
-        assert model in WEIGHT_SHAPES
-        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
-            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
+    out = []
+    for model, tp_size in itertools.product(args.models, args.tp_sizes):
+        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+            KN[tp_dim] //= tp_size
            KN.append(model)
-            KN_model_names.append(KN)
-    return KN_model_names
+            out.append(KN)
+    return out


 if __name__ == "__main__":
@ -196,21 +140,13 @@ if __name__ == "__main__":
        nargs="+",
        type=str,
        default=["meta-llama/Llama-3.1-8B-Instruct"],
-        choices=[*WEIGHT_SHAPES.keys()],
-        help="List of models to benchmark",
-    )
-    parser.add_argument(
-        "--tp-sizes",
-        nargs="+",
-        type=int,
-        default=[1],
-        help="List of tensor parallel sizes",
+        choices=list(WEIGHT_SHAPES.keys()),
    )
+    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
    args = parser.parse_args()

-    KN_model_names = prepare_shapes(args)
-    for K, N, model_name in KN_model_names:
-        print(f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
+    for K, N, model in prepare_shapes(args):
+        print(f"{model}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
        benchmark.run(
            print_data=True,
            show_plots=True,
--- a/benchmarks/kernels/bench_int8_gemm.py
+++ b/benchmarks/kernels/bench_int8_gemm.py
@ -0,0 +1,169 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import copy
+import itertools
+
+import torch
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
+from vllm._custom_ops import scaled_int8_quant as vllm_scaled_int8_quant
+from vllm.triton_utils import triton
+
+PROVIDER_CFGS = {
+    "torch-bf16": dict(enabled=True),
+    "int8-tensor-w-token-a": dict(
+        w="tensor", a="token", no_a_quant=False, enabled=False
+    ),
+    "int8-tensor-w-tensor-a": dict(
+        w="tensor", a="tensor", no_a_quant=False, enabled=True
+    ),
+    "int8-channel-w-token-a": dict(
+        w="channel", a="token", no_a_quant=False, enabled=True
+    ),
+    "int8-channel-w-tensor-a": dict(
+        w="channel", a="tensor", no_a_quant=False, enabled=False
+    ),
+    "int8-tensor-w-token-a-noquant": dict(
+        w="tensor", a="token", no_a_quant=True, enabled=False
+    ),
+    "int8-tensor-w-tensor-a-noquant": dict(
+        w="tensor", a="tensor", no_a_quant=True, enabled=True
+    ),
+    "int8-channel-w-token-a-noquant": dict(
+        w="channel", a="token", no_a_quant=True, enabled=True
+    ),
+    "int8-channel-w-tensor-a-noquant": dict(
+        w="channel", a="tensor", no_a_quant=True, enabled=False
+    ),
+}
+
+
+def _quant_weight(b, w_type, device):
+    if w_type == "tensor":
+        scale_b = torch.ones(1, device=device, dtype=torch.float32)
+        b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b, scale_b)
+        assert scale_b_int8.numel() == 1
+    else:  # channel
+        b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b)
+        assert scale_b_int8.numel() == b.shape[0]
+    return b_int8.t(), scale_b_int8
+
+
+def build_int8_runner(cfg, a, b, dtype, device):
+    # quant before running the kernel
+    b_int8, scale_b_int8 = _quant_weight(b, cfg["w"], device)
+
+    scale_a_const = None
+    if cfg["a"] == "tensor":
+        scale_a_const = torch.ones(1, device=device, dtype=torch.float32)
+
+    # no quant, create activation ahead
+    if cfg["no_a_quant"]:
+        if cfg["a"] == "tensor":
+            a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const)
+        else:  # token
+            a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
+
+        def run_quant():
+            return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
+
+        return run_quant
+
+    # dynamic quant, create activation inside
+    if cfg["a"] == "tensor":
+
+        def run_quant():
+            a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const)
+            return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
+
+    else:  # token
+
+        def run_quant():
+            a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
+            return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
+
+    return run_quant
+
+
+_enabled = [k for k, v in PROVIDER_CFGS.items() if v.get("enabled")]
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
+        x_log=False,
+        line_arg="provider",
+        line_vals=_enabled,
+        line_names=[k for k in _enabled],
+        ylabel="TFLOP/s (larger is better)",
+        plot_name="BF16 vs INT8 GEMMs",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K):
+    M = batch_size
+    device = "cuda"
+    dtype = torch.bfloat16
+    a = torch.randn((M, K), device=device, dtype=dtype)
+    b = torch.randn((N, K), device=device, dtype=dtype)
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "torch-bf16":
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
+        )
+    else:
+        cfg = PROVIDER_CFGS[provider]
+        run_quant = build_int8_runner(cfg, a, b, dtype, device)
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: run_quant(), quantiles=quantiles
+        )
+
+    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
+
+
+def prepare_shapes(args):
+    KN_model_names = []
+    for model, tp_size in itertools.product(args.models, args.tp_sizes):
+        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+            KN[tp_dim] //= tp_size
+            KN.append(model)
+            KN_model_names.append(KN)
+    return KN_model_names
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["meta-llama/Llama-3.1-8B-Instruct"],
+        choices=list(WEIGHT_SHAPES.keys()),
+        help="List of models to benchmark",
+    )
+    parser.add_argument(
+        "--tp-sizes",
+        nargs="+",
+        type=int,
+        default=[1],
+        help="List of tensor parallel sizes",
+    )
+    args = parser.parse_args()
+
+    for K, N, model in prepare_shapes(args):
+        print(f"{model}, N={N} K={K}, BF16 vs INT8 GEMMs TFLOP/s:")
+        benchmark.run(
+            print_data=True,
+            show_plots=True,
+            save_path=f"bench_int8_res_n{N}_k{K}",
+            N=N,
+            K=K,
+        )
+
+    print("Benchmark finished!")
--- a/benchmarks/kernels/benchmark_aqlm.py
+++ b/benchmarks/kernels/benchmark_aqlm.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import os
 import sys
--- a/benchmarks/kernels/benchmark_bitblas.py
+++ b/benchmarks/kernels/benchmark_bitblas.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.

--- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
+++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe
 kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit
@ -90,7 +91,7 @@ def bench_run(

    score = torch.randn((m, num_experts), device=device, dtype=dtype)

-    topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
+    topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)

    quant_blocksize = 16
    w1_blockscale = torch.empty(
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import torch
 import torch.utils.benchmark as benchmark
@ -6,8 +7,8 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE

 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
 from vllm.model_executor.layers.fused_moe.fused_moe import (
-    cutlass_moe_fp8,
    fused_experts,
    fused_topk,
 )
@ -69,18 +70,9 @@ def bench_run(
    w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
    w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)

-    ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
-    c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
-    ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
-    c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
-
    for expert in range(num_experts):
        w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
        w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
-    w1_q_notransp = w1_q.clone()
-    w2_q_notransp = w2_q.clone()
-    w1_q = w1_q.transpose(1, 2)
-    w2_q = w2_q.transpose(1, 2)

    score = torch.randn((m, num_experts), device="cuda", dtype=dtype)

@ -121,10 +113,6 @@ def bench_run(
        w2_scale: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
-        ab_strides1: torch.Tensor,
-        c_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor,
-        c_strides2: torch.Tensor,
        num_repeats: int,
    ):
        for _ in range(num_repeats):
@ -132,14 +120,10 @@ def bench_run(
                a,
                w1,
                w2,
-                w1_scale,
-                w2_scale,
                topk_weights,
                topk_ids,
-                ab_strides1,
-                c_strides1,
-                ab_strides2,
-                c_strides2,
+                w1_scale,
+                w2_scale,
                a1_scale=a_scale,
            )

@ -152,10 +136,6 @@ def bench_run(
        w2_scale: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
-        ab_strides1: torch.Tensor,
-        c_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor,
-        c_strides2: torch.Tensor,
    ):
        with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
@ -164,14 +144,10 @@ def bench_run(
                a,
                w1_q,
                w2_q,
-                w1_scale,
-                w2_scale,
                topk_weights,
                topk_ids,
-                ab_strides1,
-                c_strides1,
-                ab_strides2,
-                c_strides2,
+                w1_scale,
+                w2_scale,
                a1_scale=a_scale,
            )

@ -217,10 +193,6 @@ def bench_run(
            w2_scale,
            topk_weights,
            topk_ids,
-            ab_strides1,
-            c_strides1,
-            ab_strides2,
-            c_strides2,
        )
    torch.cuda.synchronize()

@ -229,8 +201,8 @@ def bench_run(
    with torch.cuda.graph(triton_graph, stream=triton_stream):
        run_triton_from_graph(
            a,
-            w1_q_notransp,
-            w2_q_notransp,
+            w1_q,
+            w2_q,
            topk_weights,
            topk_ids,
            w1_scale,
@ -249,18 +221,12 @@ def bench_run(
        "w2": w2,
        "score": score,
        "topk": topk,
-        "w1_q_notransp": w1_q_notransp,
-        "w2_q_notransp": w2_q_notransp,
        # Cutlass params
        "a_scale": a_scale,
        "w1_q": w1_q,
        "w2_q": w2_q,
        "w1_scale": w1_scale,
        "w2_scale": w2_scale,
-        "ab_strides1": ab_strides1,
-        "c_strides1": c_strides1,
-        "ab_strides2": ab_strides2,
-        "c_strides2": c_strides2,
        # cuda graph params
        "cutlass_graph": cutlass_graph,
        "triton_graph": triton_graph,
@ -278,8 +244,8 @@ def bench_run(
    # Warmup
    run_triton_moe(
        a,
-        w1_q_notransp,
-        w2_q_notransp,
+        w1_q,
+        w2_q,
        topk_weights,
        topk_ids,
        w1_scale,
@ -290,7 +256,7 @@ def bench_run(

    results.append(
        benchmark.Timer(
-            stmt="run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)",  # noqa: E501
+            stmt="run_triton_moe(a, w1_q, w2_q, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
@ -321,16 +287,12 @@ def bench_run(
        w2_scale,
        topk_weights,
        topk_ids,
-        ab_strides1,
-        c_strides1,
-        ab_strides2,
-        c_strides2,
        num_warmup,
    )

    results.append(
        benchmark.Timer(
-            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)",  # noqa: E501
+            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, num_runs)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import time

--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 import copy
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 import copy
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import torch
 import torch.utils.benchmark as benchmark
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 import json
@ -6,7 +7,6 @@ import time
 from contextlib import nullcontext
 from datetime import datetime
 from itertools import product
-from types import SimpleNamespace
 from typing import Any, TypedDict

 import ray
@ -42,7 +42,7 @@ def benchmark_config(
    use_fp8_w8a8: bool,
    use_int8_w8a16: bool,
    num_iters: int = 100,
-    block_quant_shape: List[int] = None,
+    block_quant_shape: list[int] = None,
    use_deep_gemm: bool = False,
 ) -> float:
    init_dtype = torch.float16 if use_fp8_w8a8 else dtype
@ -399,7 +399,7 @@ class BenchmarkWorker:
        dtype: torch.dtype,
        use_fp8_w8a8: bool,
        use_int8_w8a16: bool,
-        block_quant_shape: List[int] = None,
+        block_quant_shape: list[int] = None,
        use_deep_gemm: bool = False,
    ) -> tuple[dict[str, int], float]:
        current_platform.seed_everything(self.seed)
@ -531,7 +531,7 @@ def save_configs(
    dtype: torch.dtype,
    use_fp8_w8a8: bool,
    use_int8_w8a16: bool,
-    block_quant_shape: List[int],
+    block_quant_shape: list[int],
 ) -> None:
    dtype_str = get_config_dtype_str(
        dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
@ -562,7 +562,6 @@ def main(args: argparse.Namespace):
    config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
    if args.model_prefix:
        config = getattr(config, args.model_prefix)
-    config = SimpleNamespace(**config)

    if config.architectures[0] == "DbrxForCausalLM":
        E = config.ffn_config.moe_num_experts
@ -594,11 +593,7 @@ def main(args: argparse.Namespace):
        shard_intermediate_size = 2 * intermediate_size // args.tp_size

    hidden_size = config.hidden_size
-    dtype = (
-        torch.float16
-        if current_platform.is_rocm()
-        else getattr(torch, config.torch_dtype)
-    )
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
    block_quant_shape = get_weight_block_size_safety(config)
--- a/benchmarks/kernels/benchmark_moe_align_block_size.py
+++ b/benchmarks/kernels/benchmark_moe_align_block_size.py
@ -0,0 +1,159 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import itertools
+
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
+    moe_align_block_size_triton,
+)
+from vllm.triton_utils import triton
+
+
+def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
+    return torch.stack(
+        [
+            torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk]
+            for _ in range(num_tokens)
+        ]
+    )
+
+
+def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
+    """
+    Verifies vllm vs. Triton
+    """
+    topk_ids = get_topk_ids(num_tokens, num_experts, topk)
+
+    # 1. malloc space for triton and vllm
+    # malloc enough space (max_num_tokens_padded) for the sorted ids
+    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+    sorted_ids_triton = torch.empty(
+        (max_num_tokens_padded,), dtype=torch.int32, device="cuda"
+    )
+    sorted_ids_triton.fill_(topk_ids.numel())  # fill with sentinel value
+    expert_ids_triton = torch.zeros(
+        (max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda"
+    )
+    num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda")
+
+    sorted_ids_vllm = torch.empty_like(sorted_ids_triton)
+    sorted_ids_vllm.fill_(topk_ids.numel())
+    expert_ids_vllm = torch.zeros_like(expert_ids_triton)
+    num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton)
+
+    # 2. run implementations
+    moe_align_block_size_triton(
+        topk_ids,
+        num_experts,
+        block_size,
+        sorted_ids_triton,
+        expert_ids_triton,
+        num_tokens_post_pad_triton,
+    )
+
+    ops.moe_align_block_size(
+        topk_ids,
+        num_experts,
+        block_size,
+        sorted_ids_vllm,
+        expert_ids_vllm,
+        num_tokens_post_pad_vllm,
+    )
+    print(f"✅ VLLM implementation works with {num_experts} experts!")
+
+    # 3. compare results
+    if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
+        num_tokens_post_pad_triton, num_tokens_post_pad_vllm
+    ):
+        print("✅ Triton and VLLM implementations match.")
+    else:
+        print("❌ Triton and VLLM implementations DO NOT match.")
+        print("Triton expert_ids:", expert_ids_triton)
+        print("VLLM expert_ids:", expert_ids_vllm)
+        print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
+        print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)
+
+
+# test configurations
+num_tokens_range = [1, 16, 256, 4096]
+num_experts_range = [16, 64, 224, 256, 280, 512]
+topk_range = [1, 2, 8]
+configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["num_tokens", "num_experts", "topk"],
+        x_vals=configs,
+        line_arg="provider",
+        line_vals=["vllm", "triton"],  # "triton"
+        line_names=["VLLM", "Triton"],  # "Triton"
+        plot_name="moe-align-block-size-performance",
+        args={},
+    )
+)
+def benchmark(num_tokens, num_experts, topk, provider):
+    """Benchmark function for Triton."""
+    block_size = 256
+    topk_ids = get_topk_ids(num_tokens, num_experts, topk)
+
+    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
+    sorted_ids.fill_(topk_ids.numel())
+    max_num_m_blocks = max_num_tokens_padded // block_size
+    expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
+    num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "vllm":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: ops.moe_align_block_size(
+                topk_ids,
+                num_experts,
+                block_size,
+                sorted_ids.clone(),
+                expert_ids.clone(),
+                num_tokens_post_pad.clone(),
+            ),
+            quantiles=quantiles,
+        )
+    elif provider == "triton":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: moe_align_block_size_triton(
+                topk_ids,
+                num_experts,
+                block_size,
+                sorted_ids.clone(),
+                expert_ids.clone(),
+                num_tokens_post_pad.clone(),
+            ),
+            quantiles=quantiles,
+        )
+
+    return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--num_experts",
+        type=int,
+        default=64,
+        choices=[8, 16, 32, 64, 128, 256],
+    )
+    parser.add_argument(
+        "--topk",
+        type=int,
+        default=8,
+        choices=[2, 4, 8],
+        help="Top-k value for correctness check.",
+    )
+    args = parser.parse_args()
+
+    print("Running correctness check...")
+    check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
+    benchmark.run(print_data=True, show_plots=True)
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 from typing import Any, TypedDict
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import random
 import time
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import time

--- a/benchmarks/kernels/benchmark_rmsnorm.py
+++ b/benchmarks/kernels/benchmark_rmsnorm.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import itertools
 from typing import Optional, Union
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from itertools import accumulate
 from typing import Optional
--- a/benchmarks/kernels/benchmark_shapes.py
+++ b/benchmarks/kernels/benchmark_shapes.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 WEIGHT_SHAPES = {
    "ideal": [[4 * 256 * 32, 256 * 32]],
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from sglang quantization/tuning_block_wise_kernel.py

 import argparse
--- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # fmt: off
 # ruff: noqa: E501
 import time
--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import math
 import pickle
--- a/benchmarks/kernels/utils.py
+++ b/benchmarks/kernels/utils.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import dataclasses
 from collections.abc import Iterable
--- a/benchmarks/kernels/weight_shapes.py
+++ b/benchmarks/kernels/weight_shapes.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 # Weight Shapes are in the format
 # ([K, N], TP_SPLIT_DIM)
--- a/benchmarks/overheads/benchmark_hashing.py
+++ b/benchmarks/overheads/benchmark_hashing.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import cProfile
 import pstats
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@ -75,6 +75,7 @@ if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
 else()
    find_isa(${CPUINFO} "avx2" AVX2_FOUND)
    find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
+    find_isa(${CPUINFO} "Power11" POWER11_FOUND)
    find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
    find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
    find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
@ -106,13 +107,19 @@ elseif (AVX2_FOUND)
    list(APPEND CXX_COMPILE_FLAGS "-mavx2")
    message(WARNING "vLLM CPU backend using AVX2 ISA")
    
-elseif (POWER9_FOUND OR POWER10_FOUND)
+elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
    message(STATUS "PowerPC detected")
-    # Check for PowerPC VSX support
-    list(APPEND CXX_COMPILE_FLAGS
-        "-mvsx"
-        "-mcpu=native"
-        "-mtune=native")
+    if (POWER9_FOUND)
+        list(APPEND CXX_COMPILE_FLAGS
+            "-mvsx"
+            "-mcpu=power9"
+            "-mtune=power9")
+    elseif (POWER10_FOUND OR POWER11_FOUND)
+        list(APPEND CXX_COMPILE_FLAGS
+            "-mvsx"
+            "-mcpu=power10"
+            "-mtune=power10")
+    endif()

 elseif (ASIMD_FOUND)
    message(STATUS "ARMv8 or later architecture detected")
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 8798f27777fb57f447070301bf33a9f9c607f491
+          GIT_TAG 763ad155a1c826f71ff318f41edb1e4e5e376ddb
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
--- a/cmake/hipify.py
+++ b/cmake/hipify.py
@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 #
 # A command line tool for running pytorch's hipify preprocessor on CUDA
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@ -122,6 +122,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
      "-DENABLE_FP8"
      "-U__HIP_NO_HALF_CONVERSIONS__"
      "-U__HIP_NO_HALF_OPERATORS__"
+      "-Werror=unused-variable"
      "-fno-gpu-rdc")

  endif()
--- a/csrc/attention/mla/cutlass_mla_kernels.cu
+++ b/csrc/attention/mla/cutlass_mla_kernels.cu
@ -119,7 +119,7 @@ typename T::Fmha::Arguments args_from_options(
      {static_cast<ElementOut*>(out.data_ptr()), stride_O,
       static_cast<ElementAcc*>(nullptr), stride_LSE},
      hw_info,
-      -1,       // split_kv
+      1,        // split_kv
      nullptr,  // is_var_split_kv
  };
  // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
--- a/csrc/attention/paged_attention_v1.cu
+++ b/csrc/attention/paged_attention_v1.cu
@ -65,9 +65,6 @@ void paged_attention_v1_launcher(
  int kv_block_stride = key_cache.stride(0);
  int kv_head_stride = key_cache.stride(1);

-  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
-  assert(head_size % thread_group_size == 0);
-
  // NOTE: alibi_slopes is optional.
  const float* alibi_slopes_ptr =
      alibi_slopes
@ -193,4 +190,4 @@ void paged_attention_v1(
 #undef WARP_SIZE
 #undef MAX
 #undef MIN
-#undef DIVIDE_ROUND_UP
+#undef DIVIDE_ROUND_UP
--- a/csrc/attention/paged_attention_v2.cu
+++ b/csrc/attention/paged_attention_v2.cu
@ -66,9 +66,6 @@ void paged_attention_v2_launcher(
  int kv_block_stride = key_cache.stride(0);
  int kv_head_stride = key_cache.stride(1);

-  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
-  assert(head_size % thread_group_size == 0);
-
  // NOTE: alibi_slopes is optional.
  const float* alibi_slopes_ptr =
      alibi_slopes
@ -203,4 +200,4 @@ void paged_attention_v2(
 #undef WARP_SIZE
 #undef MAX
 #undef MIN
-#undef DIVIDE_ROUND_UP
+#undef DIVIDE_ROUND_UP
--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
@ -137,8 +137,8 @@ FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
 }

 template <typename T>
-FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data,
-                                        const int size) {
+FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data,
+                                         const int size) {
  T max = max_data[0];
  for (int i = 1; i < size; ++i) {
    max = max >= max_data[i] ? max : max_data[i];
@ -634,7 +634,7 @@ struct paged_attention_v2_impl {

        if (partition_num == 1) continue;

-        reducePartitonSoftmax(
+        reducePartitionSoftmax(
            max_logits + seq_idx * num_heads * max_num_partitions +
                head_idx * max_num_partitions,
            exp_sums + seq_idx * num_heads * max_num_partitions +
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@ -83,7 +83,7 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
  explicit FP16Vec16(const void* ptr)
      : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}

-  // non-temproal load
+  // non-temporal load
  explicit FP16Vec16(bool, void* ptr)
      : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}

@ -120,7 +120,7 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
  explicit BF16Vec16(const void* ptr)
      : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}

-  // non-temproal load
+  // non-temporal load
  explicit BF16Vec16(bool, void* ptr)
      : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}

@ -327,7 +327,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
  // normal load
  explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}

-  // non-temproal load
+  // non-temporal load
  explicit FP32Vec16(bool, void* ptr)
      : reg((__m512)_mm512_stream_load_si512(ptr)) {}

@ -576,7 +576,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
  // normal load
  explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}

-  // non-temproal load
+  // non-temporal load
  explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}

  void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }
@ -587,7 +587,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
    _mm512_mask_storeu_epi8(ptr, mask, reg);
  }

-  // non-temproal save
+  // non-temporal save
  void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
 };
 #endif
--- a/csrc/cpu/utils.cpp
+++ b/csrc/cpu/utils.cpp
@ -54,8 +54,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
    *(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp);
    int page_num = numa_migrate_pages(pid, src_mask, mask);
    if (page_num == -1) {
-      TORCH_CHECK(false,
-                  "numa_migrate_pages failed. errno: " + std::to_string(errno));
+      TORCH_WARN("numa_migrate_pages failed. errno: " + std::to_string(errno));
    }

    // restrict memory allocation node.
@ -105,4 +104,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {

  return ss.str();
 }
-#endif
+#endif
--- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
+++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import enum
 from typing import Union
--- a/csrc/moe/marlin_moe_wna16/generate_kernels.py
+++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import glob
 import itertools
 import os
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@ -13,232 +13,45 @@
 namespace vllm {
 namespace moe {

-namespace {
-__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
-                                         int32_t col) {
-  // don't worry about overflow because num_experts is relatively small
-  return row * total_col + col;
-}
-}  // namespace
-
-template <typename scalar_t, typename token_cnts_t>
-__global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
-                                            int32_t* sorted_token_ids,
-                                            int32_t* expert_ids,
-                                            int32_t* total_tokens_post_pad,
-                                            int32_t num_experts,
-                                            int32_t block_size, size_t numel) {
-  const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
-  const size_t start_idx = threadIdx.x * tokens_per_thread;
-
-  extern __shared__ int32_t shared_mem[];
-  int32_t* cumsum = shared_mem;  // 1d tensor with shape (num_experts + 1)
-  token_cnts_t* tokens_cnts =
-      (token_cnts_t*)(shared_mem + num_experts +
-                      1);  // 2d tensor with shape (blockDim.x + 1, num_experts)
-
-  for (int i = 0; i < num_experts; ++i) {
-    tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
-  }
-
-  /**
-   * In the first step we compute token_cnts[thread_index + 1][expert_index],
-   * which counts how many tokens in the token shard of thread_index are
-   * assigned to expert expert_index.
-   */
-  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
-    ++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])];
-  }
-
-  __syncthreads();
-
-  // For each expert we accumulate the token counts from the different threads.
-  if (threadIdx.x < num_experts) {
-    tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
-    for (int i = 1; i <= blockDim.x; ++i) {
-      tokens_cnts[index(num_experts, i, threadIdx.x)] +=
-          tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
-    }
-  }
-
-  __syncthreads();
-
-  // We accumulate the token counts of all experts in thread 0.
-  if (threadIdx.x == 0) {
-    cumsum[0] = 0;
-    for (int i = 1; i <= num_experts; ++i) {
-      cumsum[i] = cumsum[i - 1] +
-                  CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
-                          block_size) *
-                      block_size;
-    }
-    *total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
-  }
-
-  __syncthreads();
-
-  /**
-   * For each expert, each thread processes the tokens of the corresponding
-   * blocks and stores the corresponding expert_id for each block.
-   */
-  if (threadIdx.x < num_experts) {
-    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
-         i += block_size) {
-      expert_ids[i / block_size] = threadIdx.x;
-    }
-  }
-
-  /**
-   * Each thread processes a token shard, calculating the index of each token
-   * after sorting by expert number. Given the example topk_ids =
-   * [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *,
-   * *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a
-   * padding value(preset in python).
-   */
-  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
-    int32_t expert_id = topk_ids[i];
-    /** The cumsum[expert_id] stores the starting index of the tokens that the
-     * expert with expert_id needs to process, and
-     * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
-     * processed by the expert with expert_id within the current thread's token
-     * shard.
-     */
-    int32_t rank_post_pad =
-        tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
-        cumsum[expert_id];
-    sorted_token_ids[rank_post_pad] = i;
-    ++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
-  }
-}
-
-// TODO(simon): this is temporarily adapted from
-// https://github.com/sgl-project/sglang/commit/31548116a8dc8c6df7e146e0587335a59fc5b9d7
-// we did this to unblock Deepseek V3 but there should be a better
-// implementation to manage shared memory.
 template <typename scalar_t>
-__global__ void moe_align_block_size_global_mem_kernel(
-    scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
-    int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts,
-    int32_t block_size, size_t numel, int32_t* tokens_cnts, int32_t* cumsum) {
-  const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
-  const size_t start_idx = threadIdx.x * tokens_per_thread;
+__global__ void moe_align_block_size_kernel(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
+    int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
+    int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
+    size_t numel, int32_t* __restrict__ cumsum) {
+  extern __shared__ int32_t shared_counts[];

-  for (int i = 0; i < num_experts; ++i) {
-    tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
-  }
-
-  /**
-   * In the first step we compute token_cnts[thread_index + 1][expert_index],
-   * which counts how many tokens in the token shard of thread_index are
-   * assigned to expert expert_index.
-   */
-  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
-    ++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])];
-  }
-
-  __syncthreads();
-
-  // For each expert we accumulate the token counts from the different threads.
-  if (threadIdx.x < num_experts) {
-    tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
-    for (int i = 1; i <= blockDim.x; ++i) {
-      tokens_cnts[index(num_experts, i, threadIdx.x)] +=
-          tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
-    }
-  }
-
-  __syncthreads();
-
-  // We accumulate the token counts of all experts in thread 0.
-  if (threadIdx.x == 0) {
-    cumsum[0] = 0;
-    for (int i = 1; i <= num_experts; ++i) {
-      cumsum[i] = cumsum[i - 1] +
-                  CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
-                          block_size) *
-                      block_size;
-    }
-    *total_tokens_post_pad = cumsum[num_experts];
-  }
-
-  __syncthreads();
-
-  /**
-   * For each expert, each thread processes the tokens of the corresponding
-   * blocks and stores the corresponding expert_id for each block.
-   */
-  if (threadIdx.x < num_experts) {
-    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
-         i += block_size) {
-      expert_ids[i / block_size] = threadIdx.x;
-    }
-  }
-
-  /**
-   * Each thread processes a token shard, calculating the index of each token
-   * after sorting by expert number. Given the example topk_ids =
-   * [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *,
-   * *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a
-   * padding value(preset in python).
-   */
-  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
-    int32_t expert_id = topk_ids[i];
-    /** The cumsum[expert_id] stores the starting index of the tokens that the
-     * expert with expert_id needs to process, and
-     * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
-     * processed by the expert with expert_id within the current thread's token
-     * shard.
-     */
-    int32_t rank_post_pad =
-        tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
-        cumsum[expert_id];
-    sorted_token_ids[rank_post_pad] = i;
-    ++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
-  }
-}
-
-// taken from
-// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957
-template <typename scalar_t>
-__global__ void sgl_moe_align_block_size_kernel(
-    scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
-    int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts,
-    int32_t block_size, size_t numel, int32_t* cumsum) {
-  __shared__ int32_t shared_counts[32][8];
-
-  const int warp_id = threadIdx.x / 32;
-  const int experts_per_warp = 8;
+  const int warp_id = threadIdx.x / WARP_SIZE;
  const int my_expert_start = warp_id * experts_per_warp;

-  // Initialize shared_counts for this warp's experts
  for (int i = 0; i < experts_per_warp; ++i) {
-    if (my_expert_start + i < num_experts) {
-      shared_counts[warp_id][i] = 0;
+    if (my_expert_start + i < padded_num_experts) {
+      shared_counts[warp_id * experts_per_warp + i] = 0;
    }
  }

  __syncthreads();

-  const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
-  const size_t start_idx = threadIdx.x * tokens_per_thread;
+  const size_t tid = threadIdx.x;
+  const size_t stride = blockDim.x;

-  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
+  for (size_t i = tid; i < numel; i += stride) {
    int expert_id = topk_ids[i];
    int warp_idx = expert_id / experts_per_warp;
    int expert_offset = expert_id % experts_per_warp;
-    atomicAdd(&shared_counts[warp_idx][expert_offset], 1);
+    atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
  }

  __syncthreads();

-  // Single thread computes cumulative sum and total tokens
  if (threadIdx.x == 0) {
    cumsum[0] = 0;
    for (int i = 1; i <= num_experts; ++i) {
      int expert_count = 0;
      int warp_idx = (i - 1) / experts_per_warp;
      int expert_offset = (i - 1) % experts_per_warp;
-      expert_count = shared_counts[warp_idx][expert_offset];
+      expert_count = shared_counts[warp_idx * experts_per_warp + expert_offset];

      cumsum[i] =
          cumsum[i - 1] + CEILDIV(expert_count, block_size) * block_size;
@ -248,7 +61,6 @@ __global__ void sgl_moe_align_block_size_kernel(

  __syncthreads();

-  // Assign expert IDs to blocks
  if (threadIdx.x < num_experts) {
    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
         i += block_size) {
@ -257,13 +69,11 @@ __global__ void sgl_moe_align_block_size_kernel(
  }
 }

-// taken from
-// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957
 template <typename scalar_t>
-__global__ void sgl_moe_token_sort_kernel(scalar_t* __restrict__ topk_ids,
-                                          int32_t* sorted_token_ids,
-                                          int32_t* cumsum_buffer,
-                                          size_t numel) {
+__global__ void count_and_sort_expert_tokens_kernel(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
+    size_t numel) {
  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride = blockDim.x * gridDim.x;

@ -290,132 +100,138 @@ __global__ void moe_sum_kernel(
  }
 }

+template <typename scalar_t>
+__global__ void moe_align_block_size_small_batch_expert_kernel(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
+    int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
+    int32_t block_size, size_t numel) {
+  const size_t tid = threadIdx.x;
+  const size_t stride = blockDim.x;
+
+  extern __shared__ int32_t shared_mem[];
+  int32_t* cumsum = shared_mem;
+  int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);
+
+  for (int i = 0; i < num_experts; ++i) {
+    tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0;
+  }
+
+  for (size_t i = tid; i < numel; i += stride) {
+    ++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i]];
+  }
+
+  __syncthreads();
+
+  if (threadIdx.x < num_experts) {
+    tokens_cnts[threadIdx.x] = 0;
+    for (int i = 1; i <= blockDim.x; ++i) {
+      tokens_cnts[i * num_experts + threadIdx.x] +=
+          tokens_cnts[(i - 1) * num_experts + threadIdx.x];
+    }
+  }
+
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    cumsum[0] = 0;
+    for (int i = 1; i <= num_experts; ++i) {
+      cumsum[i] =
+          cumsum[i - 1] +
+          CEILDIV(tokens_cnts[blockDim.x * num_experts + i - 1], block_size) *
+              block_size;
+    }
+    *total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
+  }
+
+  __syncthreads();
+
+  if (threadIdx.x < num_experts) {
+    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
+         i += block_size) {
+      expert_ids[i / block_size] = threadIdx.x;
+    }
+  }
+
+  for (size_t i = tid; i < numel; i += stride) {
+    int32_t expert_id = topk_ids[i];
+    int32_t rank_post_pad =
+        tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id];
+    sorted_token_ids[rank_post_pad] = i;
+    ++tokens_cnts[threadIdx.x * num_experts + expert_id];
+  }
+}
+
 }  // namespace moe
 }  // namespace vllm

+// taken from
+// https://github.com/sgl-project/sglang/blob/8b5f83ed3b7d2a49ad5c5cd5aa61c5d502f47dbc
 void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                          int64_t block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
                          torch::Tensor num_tokens_post_pad) {
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

-  int device_max_shared_mem;
-  auto dev = topk_ids.get_device();
-  cudaDeviceGetAttribute(&device_max_shared_mem,
-                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
-
-  const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
-  const int32_t shared_mem_i32 =
-      ((num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t);
-  const int32_t shared_mem_i16 =
-      ((num_thread + 1) * num_experts) * sizeof(uint16_t) +
-      (num_experts + 1) * sizeof(int32_t);
-
-  bool use_global_memory = false;
-  bool use_i16 = false;  // Use uint16_t for shared memory token counts
-  if (shared_mem_i32 < device_max_shared_mem) {
-    // Do nothing in this case. We're all set to use int32_t token counts
-  } else if (shared_mem_i16 < device_max_shared_mem &&
-             topk_ids.numel() <= 65535) {
-    // when nelements of topk_ids is smaller than 65535 (max value of uint16),
-    // element value of token_cnts would also smaller than 65535,
-    // so we can use uint16 as dtype of token_cnts
-    use_i16 = true;
-  } else {
-    use_global_memory = true;
-  }
-
-  if (use_global_memory) {
-    VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
-        topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] {
-          // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
-          // tensors
-          const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
-
-          auto options_int = torch::TensorOptions()
-                                 .dtype(torch::kInt)
-                                 .device(topk_ids.device());
-          torch::Tensor token_cnts_buffer =
-              torch::empty({(num_experts + 1) * num_experts}, options_int);
-          torch::Tensor cumsum_buffer =
-              torch::empty({num_experts + 1}, options_int);
-
-          auto kernel =
-              vllm::moe::moe_align_block_size_global_mem_kernel<scalar_t>;
-          kernel<<<1, num_thread, 0, stream>>>(
-              topk_ids.data_ptr<scalar_t>(),
-              sorted_token_ids.data_ptr<int32_t>(),
-              experts_ids.data_ptr<int32_t>(),
-              num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
-              topk_ids.numel(), token_cnts_buffer.data_ptr<int32_t>(),
-              cumsum_buffer.data_ptr<int32_t>());
-        });
-  } else if (use_i16) {
-    VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
-        topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
-          // set dynamic shared mem
-          auto kernel =
-              vllm::moe::moe_align_block_size_kernel<scalar_t, uint16_t>;
-          AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
-              (void*)kernel, shared_mem_i16));
-          kernel<<<1, num_thread, shared_mem_i16, stream>>>(
-              topk_ids.data_ptr<scalar_t>(),
-              sorted_token_ids.data_ptr<int32_t>(),
-              experts_ids.data_ptr<int32_t>(),
-              num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
-              topk_ids.numel());
-        });
-  } else {
-    VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
-        topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
-          auto kernel =
-              vllm::moe::moe_align_block_size_kernel<scalar_t, int32_t>;
-          AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
-              (void*)kernel, shared_mem_i32));
-          kernel<<<1, num_thread, shared_mem_i32, stream>>>(
-              topk_ids.data_ptr<scalar_t>(),
-              sorted_token_ids.data_ptr<int32_t>(),
-              experts_ids.data_ptr<int32_t>(),
-              num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
-              topk_ids.numel());
-        });
-  }
-}
-
-void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
-                              int64_t block_size,
-                              torch::Tensor sorted_token_ids,
-                              torch::Tensor experts_ids,
-                              torch::Tensor num_tokens_post_pad) {
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  TORCH_CHECK(num_experts == 256,
-              "sgl_moe_align_block_size kernel only supports deepseek v3.");
+  int64_t padded_num_experts =
+      ((num_experts + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
+  int experts_per_warp = WARP_SIZE;
+  int threads = 1024;
+  threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;

  VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
-      topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] {
+      topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
        // calc needed amount of shared mem for `cumsum` tensors
        auto options_int =
            torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
        torch::Tensor cumsum_buffer =
            torch::zeros({num_experts + 1}, options_int);
+        bool small_batch_expert_mode =
+            (topk_ids.numel() < 1024) && (num_experts <= 64);

-        auto align_kernel =
-            vllm::moe::sgl_moe_align_block_size_kernel<scalar_t>;
-        align_kernel<<<1, 1024, 0, stream>>>(
-            topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
-            experts_ids.data_ptr<int32_t>(),
-            num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
-            topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>());
+        if (small_batch_expert_mode) {
+          const int32_t threads = max((int32_t)num_experts, WARP_SIZE);
+          const int32_t shared_mem_size =
+              ((threads + 1) * num_experts + (num_experts + 1)) *
+              sizeof(int32_t);

-        const int block_threads = 256;
-        const int num_blocks =
-            (topk_ids.numel() + block_threads - 1) / block_threads;
-        const int max_blocks = 65535;
-        const int actual_blocks = std::min(num_blocks, max_blocks);
-        auto sort_kernel = vllm::moe::sgl_moe_token_sort_kernel<scalar_t>;
-        sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
-            topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
-            cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
+          auto small_batch_expert_kernel =
+              vllm::moe::moe_align_block_size_small_batch_expert_kernel<
+                  scalar_t>;
+          small_batch_expert_kernel<<<1, threads, shared_mem_size, stream>>>(
+              topk_ids.data_ptr<scalar_t>(),
+              sorted_token_ids.data_ptr<int32_t>(),
+              experts_ids.data_ptr<int32_t>(),
+              num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
+              topk_ids.numel());
+        } else {
+          auto align_kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;
+
+          size_t num_warps = CEILDIV(padded_num_experts, experts_per_warp);
+          size_t shared_mem_size =
+              num_warps * experts_per_warp * sizeof(int32_t);
+
+          align_kernel<<<1, threads, shared_mem_size, stream>>>(
+              topk_ids.data_ptr<scalar_t>(),
+              sorted_token_ids.data_ptr<int32_t>(),
+              experts_ids.data_ptr<int32_t>(),
+              num_tokens_post_pad.data_ptr<int32_t>(), num_experts,
+              padded_num_experts, experts_per_warp, block_size,
+              topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>());
+
+          const int block_threads = std::min(256, (int)threads);
+          const int num_blocks =
+              (topk_ids.numel() + block_threads - 1) / block_threads;
+          const int max_blocks = 65535;
+          const int actual_blocks = std::min(num_blocks, max_blocks);
+
+          auto sort_kernel =
+              vllm::moe::count_and_sort_expert_tokens_kernel<scalar_t>;
+          sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
+              topk_ids.data_ptr<scalar_t>(),
+              sorted_token_ids.data_ptr<int32_t>(),
+              cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
+        }
      });
 }

--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@ -12,12 +12,6 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                          int64_t block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
                          torch::Tensor num_tokens_post_pad);
-
-void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
-                              int64_t block_size,
-                              torch::Tensor sorted_token_ids,
-                              torch::Tensor experts_ids,
-                              torch::Tensor num_tokens_post_pad);
 #ifndef USE_ROCM
 torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                             torch::Tensor b_qweight, torch::Tensor b_scales,
@ -30,4 +24,8 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                             int64_t BLOCK_SIZE_K, int64_t bit);
 #endif

-bool moe_permute_unpermute_supported();
+bool moe_permute_unpermute_supported();
+
+void shuffle_rows(const torch::Tensor& input_tensor,
+                  const torch::Tensor& dst2src_map,
+                  torch::Tensor& output_tensor);
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@ -12,7 +12,7 @@ void moe_permute(
    const torch::Tensor& input,                      // [n_token, hidden]
    const torch::Tensor& topk_weights,               //[n_token, topk]
    torch::Tensor& topk_ids,                         // [n_token, topk]
-    const torch::Tensor& token_expert_indicies,      // [n_token, topk]
+    const torch::Tensor& token_expert_indices,       // [n_token, topk]
    const std::optional<torch::Tensor>& expert_map,  // [n_expert]
    int64_t n_expert, int64_t n_local_expert, int64_t topk,
    const std::optional<int64_t>& align_block_size,
@ -27,15 +27,15 @@ void moe_permute(
              "expert_first_token_offset must be int64");
  TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
              "topk_ids must be int32");
-  TORCH_CHECK(token_expert_indicies.scalar_type() == at::ScalarType::Int,
-              "token_expert_indicies must be int32");
+  TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
+              "token_expert_indices must be int32");
  TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
              "src_row_id2dst_row_id_map must be int32");
  TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
              "expert_first_token_offset shape != n_local_expert+1")
  TORCH_CHECK(
-      src_row_id2dst_row_id_map.sizes() == token_expert_indicies.sizes(),
-      "token_expert_indicies shape must be same as src_row_id2dst_row_id_map");
+      src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(),
+      "token_expert_indices shape must be same as src_row_id2dst_row_id_map");
  auto n_token = input.sizes()[0];
  auto n_hidden = input.sizes()[1];
  auto align_block_size_value =
@ -71,7 +71,7 @@ void moe_permute(
                             expert_map_ptr, n_expert, stream);
  }
  // expert sort topk expert id and scan expert id get expert_first_token_offset
-  sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indicies),
+  sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices),
                    get_ptr<int>(permuted_experts_id),
                    get_ptr<int>(dst_row_id2src_row_id_map),
                    get_ptr<int64_t>(expert_first_token_offset), n_token,
@ -130,11 +130,67 @@ void moe_unpermute(
  });
 }

+template <typename T>
+__global__ void shuffleInputRowsKernel(const T* input,
+                                       const int32_t* dst2src_map, T* output,
+                                       int64_t num_src_rows,
+                                       int64_t num_dst_rows, int64_t num_cols) {
+  int64_t dest_row_idx = blockIdx.x;
+  int64_t const source_row_idx = dst2src_map[dest_row_idx];
+
+  if (blockIdx.x < num_dst_rows) {
+    // Load 128-bits per thread
+    constexpr int64_t ELEM_PER_THREAD = 128 / sizeof(T) / 8;
+    using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
+
+    // Duplicate and permute rows
+    auto const* source_row_ptr =
+        reinterpret_cast<DataElem const*>(input + source_row_idx * num_cols);
+    auto* dest_row_ptr =
+        reinterpret_cast<DataElem*>(output + dest_row_idx * num_cols);
+
+    int64_t const start_offset = threadIdx.x;
+    int64_t const stride = blockDim.x;
+    int64_t const num_elems_in_col = num_cols / ELEM_PER_THREAD;
+
+    for (int elem_index = start_offset; elem_index < num_elems_in_col;
+         elem_index += stride) {
+      dest_row_ptr[elem_index] = source_row_ptr[elem_index];
+    }
+  }
+}
+
+void shuffle_rows(const torch::Tensor& input_tensor,
+                  const torch::Tensor& dst2src_map,
+                  torch::Tensor& output_tensor) {
+  TORCH_CHECK(input_tensor.scalar_type() == output_tensor.scalar_type(),
+              "Input and output tensors must have the same data type");
+
+  auto stream = at::cuda::getCurrentCUDAStream().stream();
+  int64_t const blocks = output_tensor.size(0);
+  int64_t const threads = 256;
+  int64_t const num_dest_rows = output_tensor.size(0);
+  int64_t const num_src_rows = input_tensor.size(0);
+  int64_t const num_cols = input_tensor.size(1);
+
+  TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)),
+              "num_cols must be divisible by 128 / "
+              "sizeof(input_tensor.scalar_type()) / 8");
+
+  MOE_DISPATCH(input_tensor.scalar_type(), [&] {
+    shuffleInputRowsKernel<scalar_t><<<blocks, threads, 0, stream>>>(
+        reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
+        dst2src_map.data_ptr<int32_t>(),
+        reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
+        num_dest_rows, num_cols);
+  });
+}
+
 #else

 void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
                 torch::Tensor& topk_ids,
-                 const torch::Tensor& token_expert_indicies,
+                 const torch::Tensor& token_expert_indices,
                 const std::optional<torch::Tensor>& expert_map,
                 int64_t n_expert, int64_t n_local_expert, int64_t topk,
                 const std::optional<int64_t>& align_block_size,
@ -147,7 +203,7 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,

 void moe_unpermute(const torch::Tensor& input,
                   const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
-                   const torch::Tensor& token_expert_indicies,
+                   const torch::Tensor& token_expert_indices,
                   const std::optional<torch::Tensor>& expert_map,
                   int64_t n_expert, int64_t n_local_expert, int64_t topk,
                   const std::optional<int64_t>& align_block_size,
--- a/csrc/moe/permute_unpermute_kernels/dispatch.h
+++ b/csrc/moe/permute_unpermute_kernels/dispatch.h
@ -14,12 +14,13 @@
    __VA_ARGS__();                                         \
    break;                                                 \
  }
-#define MOE_DISPATCH_FLOAT_CASE(...)                          \
-  MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)       \
-  MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)        \
-  MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)    \
-  MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \
-  MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)
+#define MOE_DISPATCH_FLOAT_CASE(...)                            \
+  MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)         \
+  MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)          \
+  MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)      \
+  MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)   \
+  MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \
+  MOE_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)

 #define MOE_DISPATCH(TYPE, ...) \
  MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__))
@ -39,6 +40,11 @@ template <>
 struct ScalarType2CudaType<at::ScalarType::BFloat16> {
  using type = __nv_bfloat16;
 };
+// uint8 for packed fp4
+template <>
+struct ScalarType2CudaType<at::ScalarType::Byte> {
+  using type = uint8_t;
+};

 // #if __CUDA_ARCH__ >= 890
 // fp8
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl
@ -20,7 +20,6 @@ __global__ void expandInputRowsKernel(
  int expert_id = sorted_experts[expanded_dest_row];

  extern __shared__ int64_t smem_expert_first_token_offset[];
-  int64_t align_expanded_row_accumulate = 0;
  if constexpr (ALIGN_BLOCK_SIZE) {
    // load g2s
    for (int idx = threadIdx.x; idx < num_local_experts + 1;
@ -63,7 +62,6 @@ __global__ void expandInputRowsKernel(
    using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;

    // Duplicate and permute rows
-    int64_t const source_k_rank = expanded_source_row / num_rows;
    int64_t const source_row = expanded_source_row % num_rows;

    auto const* source_row_ptr =
@ -160,7 +158,6 @@ __global__ void finalizeMoeRoutingKernel(
       elem_index += stride) {
    ComputeElem thread_output;
    thread_output.fill(0);
-    float row_rescale{0.f};
    for (int k_idx = 0; k_idx < k; ++k_idx) {
      int64_t const expanded_original_row = original_row + k_idx * num_rows;
      int64_t const expanded_permuted_row =
@ -177,8 +174,6 @@ __global__ void finalizeMoeRoutingKernel(
      auto const* expanded_permuted_rows_row_ptr =
          expanded_permuted_rows_v + expanded_permuted_row * num_elems_in_col;

-      int64_t const expert_idx = expert_for_source_row[k_offset];
-
      ComputeElem expert_result = arrayConvert<InputElem, ComputeElem>(
          expanded_permuted_rows_row_ptr[elem_index]);
      thread_output = thread_output + row_scale * (expert_result);
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@ -425,7 +425,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f

 #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB)                       \
    topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB>(         \
-        gating_output, nullptr, topk_weights, topk_indicies,            \
+        gating_output, nullptr, topk_weights, topk_indices,            \
        token_expert_indices, num_tokens, topk, 0, num_experts,         \
        stream);

@ -433,7 +433,7 @@ template <typename IndType>
 void topkGatingSoftmaxKernelLauncher(
    const float* gating_output,
    float* topk_weights,
-    IndType* topk_indicies,
+    IndType* topk_indices,
    int* token_expert_indices,
    float* softmax_workspace,
    const int num_tokens,
@ -476,7 +476,7 @@ void topkGatingSoftmaxKernelLauncher(
            moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
                gating_output, nullptr, softmax_workspace, num_experts);
            moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
-                softmax_workspace, nullptr, topk_weights, topk_indicies, token_expert_indices,
+                softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
                num_experts, topk, 0, num_experts);
        }
    }
@ -516,9 +516,8 @@ void topk_softmax(
            topk,
            stream);
    }
-    else
+    else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
    {
-        assert(topk_indices.scalar_type() == at::ScalarType::UInt32);
        vllm::moe::topkGatingSoftmaxKernelLauncher(
            gating_output.data_ptr<float>(),
            topk_weights.data_ptr<float>(),
@ -530,4 +529,17 @@ void topk_softmax(
            topk,
            stream);
    }
+    else {
+        assert(topk_indices.scalar_type() == at::ScalarType::Int64);
+        vllm::moe::topkGatingSoftmaxKernelLauncher(
+            gating_output.data_ptr<float>(),
+            topk_weights.data_ptr<float>(),
+            topk_indices.data_ptr<int64_t>(),
+            token_expert_indices.data_ptr<int>(),
+            softmax_workspace.data_ptr<float>(),
+            num_tokens,
+            num_experts,
+            topk,
+            stream);
+    }
 }
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@ -22,15 +22,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      "                     Tensor! num_tokens_post_pad) -> ()");
  m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);

-  // temporarily adapted from
-  // https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a
-  m.def(
-      "sgl_moe_align_block_size(Tensor topk_ids, int num_experts,"
-      "                         int block_size, Tensor! sorted_token_ids,"
-      "                         Tensor! experts_ids,"
-      "                         Tensor! num_tokens_post_pad) -> ()");
-  m.impl("sgl_moe_align_block_size", torch::kCUDA, &sgl_moe_align_block_size);
-
 #ifndef USE_ROCM
  m.def(
      "moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "
@ -66,7 +57,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {

  m.def(
      "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
-      "Tensor token_expert_indicies, Tensor? expert_map, int n_expert,"
+      "Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
      "int n_local_expert,"
      "int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
      "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "
@ -81,6 +72,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
  m.def("moe_permute_unpermute_supported() -> bool");
  m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);

+  // Row shuffle for MoE
+  m.def(
+      "shuffle_rows(Tensor input_tensor, Tensor dst2src_map, Tensor! "
+      "output_tensor) -> ()");
+  m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows);
+
 #endif
 }

--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -92,6 +92,11 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                        torch::Tensor& weight, double epsilon);

+void apply_repetition_penalties_(torch::Tensor& logits,
+                                 const torch::Tensor& prompt_mask,
+                                 const torch::Tensor& output_mask,
+                                 const torch::Tensor& repetition_penalties);
+
 void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                               torch::Tensor& weight, torch::Tensor& scale,
                               double epsilon);
@ -231,7 +236,8 @@ void cutlass_moe_mm(
    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides);
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch);

 void cutlass_fp4_group_mm(
    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
@ -243,7 +249,16 @@ void get_cutlass_moe_mm_data(
    const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
    torch::Tensor& input_permutation, torch::Tensor& output_permutation,
-    const int64_t num_experts, const int64_t n, const int64_t k);
+    const int64_t num_experts, const int64_t n, const int64_t k,
+    const std::optional<torch::Tensor>& blockscale_offsets);
+
+void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
+                                  torch::Tensor& problem_sizes1,
+                                  torch::Tensor& problem_sizes2,
+                                  const torch::Tensor& expert_num_tokens,
+                                  const int64_t num_local_experts,
+                                  const int64_t padded_m, const int64_t n,
+                                  const int64_t k);

 void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
                           torch::Tensor const& b,
--- a/csrc/prepare_inputs/advance_step.cu
+++ b/csrc/prepare_inputs/advance_step.cu
@ -274,7 +274,6 @@ void advance_step_flashinfer(
  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
  cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);

-  [[maybe_unused]] int block_tables_stride = block_tables.stride(0);
  TORCH_CHECK((blocks * threads > num_queries),
              "multi-step: not enough threads to map to num_queries = ",
              num_queries, " block_tables.stride(0) = ", block_tables.stride(0),
--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@ -1,15 +1,17 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/all.h>
+
 #include <cmath>

 #include "../../dispatch_utils.h"
+#include "../vectorization_utils.cuh"

 #ifndef USE_ROCM
-  #include <cub/util_type.cuh>
  #include <cub/cub.cuh>
+  #include <cub/util_type.cuh>
 #else
-  #include <hipcub/util_type.hpp>
  #include <hipcub/hipcub.hpp>
+  #include <hipcub/util_type.hpp>
 #endif

 static inline __device__ int8_t float_to_int8_rn(float x) {
@ -103,134 +105,170 @@ static inline __device__ int8_t int32_to_int8(int32_t x) {

 namespace vllm {

-template <typename scalar_t, typename scale_type>
+template <typename scalar_t, typename scale_t>
 __global__ void static_scaled_int8_quant_kernel(
-    scalar_t const* __restrict__ input, int8_t* __restrict__ out,
-    scale_type const* scale_ptr, const int hidden_size) {
-  int const tid = threadIdx.x;
-  int64_t const token_idx = blockIdx.x;
-  scale_type const scale = *scale_ptr;
+    const scalar_t* __restrict__ input, int8_t* __restrict__ output,
+    const scale_t* scale_ptr, const int hidden_size) {
+  const int tid = threadIdx.x;
+  const int stride = blockDim.x;
+  const int64_t token_idx = blockIdx.x;
+  const float scale = *scale_ptr;

  // Must be performed using 64-bit math to avoid integer overflow.
-  out += token_idx * hidden_size;
-  input += token_idx * hidden_size;
+  const scalar_t* row_in = input + token_idx * hidden_size;
+  int8_t* row_out = output + token_idx * hidden_size;

-  for (int i = tid; i < hidden_size; i += blockDim.x) {
-    out[i] = float_to_int8_rn(static_cast<float>(input[i]) / scale);
-  }
+  vectorize_with_alignment<16>(
+      row_in, row_out, hidden_size, tid, stride,
+      [=] __device__(int8_t& dst, const scalar_t& src) {
+        dst = float_to_int8_rn(static_cast<float>(src) / scale);
+      });
 }

-template <typename scalar_t, typename scale_type, typename azp_type>
+template <typename scalar_t, typename scale_t, typename azp_t>
 __global__ void static_scaled_int8_azp_quant_kernel(
-    scalar_t const* __restrict__ input, int8_t* __restrict__ out,
-    scale_type const* scale_ptr, azp_type const* azp_ptr,
-    const int hidden_size) {
-  int const tid = threadIdx.x;
-  int64_t const token_idx = blockIdx.x;
-  scale_type const scale = *scale_ptr;
-  azp_type const azp = *azp_ptr;
+    const scalar_t* __restrict__ input, int8_t* __restrict__ output,
+    const scale_t* scale_ptr, const azp_t* azp_ptr, const int hidden_size) {
+  const int tid = threadIdx.x;
+  const int stride = blockDim.x;
+  const int64_t token_idx = blockIdx.x;
+  const float scale = *scale_ptr;
+  const azp_t azp = *azp_ptr;
+  const float inv_s = 1.0f / scale;

  // Must be performed using 64-bit math to avoid integer overflow.
-  out += token_idx * hidden_size;
-  input += token_idx * hidden_size;
+  const scalar_t* row_in = input + token_idx * hidden_size;
+  int8_t* row_out = output + token_idx * hidden_size;

-  for (int i = tid; i < hidden_size; i += blockDim.x) {
-    auto const val = static_cast<float>(input[i]);
-    auto const quant_val = int32_to_int8(float_to_int32_rn(val / scale) + azp);
-    out[i] = quant_val;
-  }
+  vectorize_with_alignment<16>(
+      row_in, row_out, hidden_size, tid, stride,
+      [=] __device__(int8_t& dst, const scalar_t& src) {
+        const auto v = static_cast<float>(src) * inv_s;
+        dst = int32_to_int8(float_to_int32_rn(v) + azp);
+      });
 }

-template <typename scalar_t, typename scale_type>
+template <typename scalar_t, typename scale_t>
 __global__ void dynamic_scaled_int8_quant_kernel(
-    scalar_t const* __restrict__ input, int8_t* __restrict__ out,
-    scale_type* scale, const int hidden_size) {
-  int const tid = threadIdx.x;
-  int64_t const token_idx = blockIdx.x;
-  float absmax_val = 0.0f;
-  float const zero = 0.0f;
+    const scalar_t* __restrict__ input, int8_t* __restrict__ output,
+    scale_t* scale_out, const int hidden_size) {
+  const int tid = threadIdx.x;
+  const int stride = blockDim.x;
+  const int64_t token_idx = blockIdx.x;

  // Must be performed using 64-bit math to avoid integer overflow.
-  out += token_idx * hidden_size;
-  input += token_idx * hidden_size;
+  const scalar_t* row_in = input + token_idx * hidden_size;
+  int8_t* row_out = output + token_idx * hidden_size;

-  for (int i = tid; i < hidden_size; i += blockDim.x) {
-    float val = static_cast<float>(input[i]);
-    val = val > zero ? val : -val;
-    absmax_val = val > absmax_val ? val : absmax_val;
+  // calculate for absmax
+  float thread_max = 0.f;
+  for (int i = tid; i < hidden_size; i += stride) {
+    const auto v = fabsf(static_cast<float>(row_in[i]));
+    thread_max = fmaxf(thread_max, v);
  }
-
-  using BlockReduce = cub::BlockReduce<float, 1024>;
-  __shared__ typename BlockReduce::TempStorage reduceStorage;
-  float const block_absmax_val_maybe =
-      BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
-  __shared__ float block_absmax_val;
+  using BlockReduce = cub::BlockReduce<float, 256>;
+  __shared__ typename BlockReduce::TempStorage tmp;
+  float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
+  __shared__ float absmax;
  if (tid == 0) {
-    block_absmax_val = block_absmax_val_maybe;
-    scale[token_idx] = block_absmax_val / 127.0f;
+    absmax = block_max;
+    scale_out[blockIdx.x] = absmax / 127.f;
  }
  __syncthreads();

-  float const tmp_scale = 127.0f / block_absmax_val;
-  for (int i = tid; i < hidden_size; i += blockDim.x) {
-    out[i] = float_to_int8_rn(static_cast<float>(input[i]) * tmp_scale);
-  }
+  float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax;
+
+  // 2. quantize
+  vectorize_with_alignment<16>(
+      row_in, row_out, hidden_size, tid, stride,
+      [=] __device__(int8_t& dst, const scalar_t& src) {
+        dst = float_to_int8_rn(static_cast<float>(src) * inv_s);
+      });
 }

-template <typename scalar_t, typename scale_type, typename azp_type>
+// MinMax structure to hold min and max values in one go
+struct MinMax {
+  float min, max;
+
+  __host__ __device__ MinMax()
+      : min(std::numeric_limits<float>::max()),
+        max(std::numeric_limits<float>::lowest()) {}
+
+  __host__ __device__ explicit MinMax(float v) : min(v), max(v) {}
+
+  // add a value to the MinMax
+  __host__ __device__ MinMax& operator+=(float v) {
+    min = fminf(min, v);
+    max = fmaxf(max, v);
+    return *this;
+  }
+
+  // merge two MinMax objects
+  __host__ __device__ MinMax& operator&=(const MinMax& other) {
+    min = fminf(min, other.min);
+    max = fmaxf(max, other.max);
+    return *this;
+  }
+};
+
+__host__ __device__ inline MinMax operator+(MinMax a, float v) {
+  return a += v;
+}
+__host__ __device__ inline MinMax operator&(MinMax a, const MinMax& b) {
+  return a &= b;
+}
+
+template <typename scalar_t, typename scale_t, typename azp_t>
 __global__ void dynamic_scaled_int8_azp_quant_kernel(
-    scalar_t const* __restrict__ input, int8_t* __restrict__ out,
-    scale_type* scale, azp_type* azp, const int hidden_size) {
-  int64_t const token_idx = blockIdx.x;
+    const scalar_t* __restrict__ input, int8_t* __restrict__ output,
+    scale_t* scale_out, azp_t* azp_out, const int hidden_size) {
+  const int tid = threadIdx.x;
+  const int stride = blockDim.x;
+  const int64_t token_idx = blockIdx.x;

  // Must be performed using 64-bit math to avoid integer overflow.
-  out += token_idx * hidden_size;
-  input += token_idx * hidden_size;
+  const scalar_t* row_in = input + token_idx * hidden_size;
+  int8_t* row_out = output + token_idx * hidden_size;

-  // Scan for the min and max value for this token
-  float max_val = std::numeric_limits<float>::min();
-  float min_val = std::numeric_limits<float>::max();
-  for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
-    auto val = static_cast<float>(input[i]);
-    max_val = std::max(max_val, val);
-    min_val = std::min(min_val, val);
+  // 1. calculate min & max
+  MinMax thread_mm;
+  for (int i = tid; i < hidden_size; i += stride) {
+    thread_mm += static_cast<float>(row_in[i]);
  }

-  // Reduce the max and min values across the block
-  using BlockReduce = cub::BlockReduce<float, 1024>;
-  __shared__ typename BlockReduce::TempStorage reduceStorage;
-  max_val = BlockReduce(reduceStorage).Reduce(max_val, cub::Max{}, blockDim.x);
-  __syncthreads();  // Make sure min doesn't mess with max shared memory
-  min_val = BlockReduce(reduceStorage).Reduce(min_val, cub::Min{}, blockDim.x);
+  using BlockReduce = cub::BlockReduce<MinMax, 256>;
+  __shared__ typename BlockReduce::TempStorage tmp;

-  __shared__ scale_type scale_sh;
-  __shared__ azp_type azp_sh;
+  MinMax mm = BlockReduce(tmp).Reduce(
+      thread_mm,
+      [] __device__(MinMax a, const MinMax& b) {
+        a &= b;
+        return a;
+      },
+      blockDim.x);

-  // Compute the scale and zero point and store them, only on the first thread
-  if (threadIdx.x == 0) {
-    float const scale_val = (max_val - min_val) / 255.0f;
-    // Use rounding to even (same as torch.round)
-    auto const azp_float = std::nearbyint(-128.0f - min_val / scale_val);
-    auto const azp_val = static_cast<azp_type>(azp_float);
-
-    // Store the scale and azp into shared and global
-    scale[token_idx] = scale_sh = scale_val;
-    azp[token_idx] = azp_sh = azp_val;
+  __shared__ float scale_sh;
+  __shared__ azp_t azp_sh;
+  if (tid == 0) {
+    float s = (mm.max - mm.min) / 255.f;
+    float zp = nearbyintf(-128.f - mm.min / s);  // round-to-even
+    scale_sh = s;
+    azp_sh = azp_t(zp);
+    scale_out[blockIdx.x] = s;
+    azp_out[blockIdx.x] = azp_sh;
  }
-
-  // Wait for the scale and azp to be computed
  __syncthreads();

-  float const scale_val = scale_sh;
-  azp_type const azp_val = azp_sh;
+  const float inv_s = 1.f / scale_sh;
+  const azp_t azp = azp_sh;

-  // Quantize the values
-  for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
-    auto const val = static_cast<float>(input[i]);
-    auto const quant_val =
-        int32_to_int8(float_to_int32_rn(val / scale_val) + azp_val);
-    out[i] = quant_val;
-  }
+  // 2. quantize
+  vectorize_with_alignment<16>(
+      row_in, row_out, hidden_size, tid, stride,
+      [=] __device__(int8_t& dst, const scalar_t& src) {
+        const auto v = static_cast<float>(src) * inv_s;
+        dst = int32_to_int8(float_to_int32_rn(v) + azp);
+      });
 }

 }  // namespace vllm
@ -247,7 +285,7 @@ void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
  int const hidden_size = input.size(-1);
  int const num_tokens = input.numel() / hidden_size;
  dim3 const grid(num_tokens);
-  dim3 const block(std::min(hidden_size, 1024));
+  dim3 const block(std::min(hidden_size, 256));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "static_scaled_int8_quant_kernel", [&] {
@ -278,7 +316,7 @@ void dynamic_scaled_int8_quant(
  int const hidden_size = input.size(-1);
  int const num_tokens = input.numel() / hidden_size;
  dim3 const grid(num_tokens);
-  dim3 const block(std::min(hidden_size, 1024));
+  dim3 const block(std::min(hidden_size, 256));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] {
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu
@ -9,10 +9,6 @@ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out,
                                           torch::Tensor const& b,
                                           torch::Tensor const& a_scales,
                                           torch::Tensor const& b_scales) {
-  TORCH_CHECK(
-      a.size(0) % 4 == 0,
-      "Input tensor must have a number of rows that is a multiple of 4. ",
-      "but got: ", a.size(0), " rows.");
  if (out.dtype() == torch::kBFloat16) {
    cutlass_gemm_blockwise_sm100_fp8_dispatch<cutlass::bfloat16_t>(
        out, a, b, a_scales, b_scales);
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
@ -1,5 +1,6 @@
 #pragma once

+#include "cuda_utils.h"
 #include "cutlass/cutlass.h"
 #include "cutlass/numeric_types.h"

@ -22,49 +23,49 @@ namespace vllm {

 using namespace cute;

-template <typename OutType, typename MmaTileShape, typename ScalesPerTile,
-          class ClusterShape, typename EpilogueScheduler,
-          typename MainloopScheduler>
+// clang-format off
+template <class OutType, int ScaleGranularityM,
+          int ScaleGranularityN, int ScaleGranularityK,
+          class MmaTileShape, class ClusterShape,
+          class EpilogueScheduler, class MainloopScheduler,
+          bool swap_ab_ = false>
 struct cutlass_3x_gemm_fp8_blockwise {
+  static constexpr bool swap_ab = swap_ab_;
  using ElementAB = cutlass::float_e4m3_t;

  using ElementA = ElementAB;
  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
  static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  using ElementB = ElementAB;
  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
  static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

-  using ElementC = void;
  using ElementD = OutType;
  using LayoutD = cutlass::layout::RowMajor;
+  using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose<LayoutD>::type;
  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

+  using ElementC = void; // TODO: support bias
  using LayoutC = LayoutD;
+  using LayoutC_Transpose = LayoutD_Transpose;
  static constexpr int AlignmentC = AlignmentD;

  using ElementAccumulator = float;
  using ElementCompute = float;
  using ElementBlockScale = float;

-  // MMA and Cluster Tile Shapes
-  // Shape of the tile computed by tcgen05 MMA, could be across 2 SMs if Cluster
-  // Shape %2 == 0 using MmaTileShape_MNK = Shape<_128,_128,_128>;
-  static constexpr int ScaleMsPerTile = size<0>(ScalesPerTile{});
-  static constexpr int ScaleGranularityM =
-      size<0>(MmaTileShape{}) / ScaleMsPerTile;
-  static constexpr int ScaleGranularityN =
-      size<1>(MmaTileShape{}) / size<1>(ScalesPerTile{});
-  static constexpr int ScaleGranularityK =
-      size<2>(MmaTileShape{}) / size<2>(ScalesPerTile{});
+  using ScaleConfig = conditional_t<swap_ab,
+      cutlass::detail::Sm100BlockwiseScaleConfig<
+        ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
+        cute::UMMA::Major::K, cute::UMMA::Major::MN>,
+      cutlass::detail::Sm100BlockwiseScaleConfig<
+        ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
+        cute::UMMA::Major::MN, cute::UMMA::Major::K>>;

-  // Shape of the threadblocks in a cluster
-  using ClusterShape_MNK = ClusterShape;
-
-  using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
-      ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
-      cute::UMMA::Major::MN, cute::UMMA::Major::K>;
+  // layout_SFA and layout_SFB cannot be swapped since they are deduced.
  using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
  using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());

@ -73,7 +74,6 @@ struct cutlass_3x_gemm_fp8_blockwise {

  static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
  using ElementScalar = float;
-  // clang-format off
  using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      ArchTag,
@ -84,33 +84,47 @@ struct cutlass_3x_gemm_fp8_blockwise {
      ElementAccumulator,
      ElementCompute,
      ElementC,
-      LayoutC,
+      conditional_t<swap_ab, LayoutC_Transpose, LayoutC>,
      AlignmentC,
      ElementD,
-      LayoutD,
+      conditional_t<swap_ab, LayoutD_Transpose, LayoutD>,
      AlignmentD,
      EpilogueScheduler,
      DefaultOperation
  >::CollectiveOp;
 
  using StageCountType = cutlass::gemm::collective::StageCountAuto; 
-  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
-      ArchTag,
-      OperatorClass,
-      ElementA,
-      cute::tuple<LayoutA, LayoutSFA>,
-      AlignmentA,
-      ElementB,
-      cute::tuple<LayoutB, LayoutSFB>,
-      AlignmentB,
-      ElementAccumulator,
-      MmaTileShape,
-      ClusterShape,
-
+  using CollectiveMainloop = conditional_t<swap_ab,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag,
+          OperatorClass,
+          ElementB,
+          cute::tuple<LayoutB_Transpose, LayoutSFA>,
+          AlignmentB,
+          ElementA,
+          cute::tuple<LayoutA_Transpose, LayoutSFB>,
+          AlignmentA,
+          ElementAccumulator,
+          MmaTileShape,
+          ClusterShape,
          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
-      MainloopScheduler
-  >::CollectiveOp;
-  // clang-format on
+          MainloopScheduler
+      >::CollectiveOp,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag,
+          OperatorClass,
+          ElementA,
+          cute::tuple<LayoutA, LayoutSFA>,
+          AlignmentA,
+          ElementB,
+          cute::tuple<LayoutB, LayoutSFB>,
+          AlignmentB,
+          ElementAccumulator,
+          MmaTileShape,
+          ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          MainloopScheduler
+      >::CollectiveOp>;

  using KernelType = enable_sm100_only<cutlass::gemm::kernel::GemmUniversal<
      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
@ -123,6 +137,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
                                   torch::Tensor const& b,
                                   torch::Tensor const& a_scales,
                                   torch::Tensor const& b_scales) {
+  static constexpr bool swap_ab = Gemm::swap_ab;
  using GemmKernel = typename Gemm::GemmKernel;
  using StrideA = typename Gemm::GemmKernel::StrideA;
  using StrideB = typename Gemm::GemmKernel::StrideB;
@ -136,7 +151,6 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
  using ElementD = typename Gemm::ElementD;

  int32_t m = a.size(0), n = b.size(1), k = a.size(1);
-  auto prob_shape = cute::make_shape(m, n, k, 1);

  StrideA a_stride;
  StrideB b_stride;
@ -146,11 +160,13 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
  b_stride =
      cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
  c_stride =
-      cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));
+      cutlass::make_cute_packed_stride(StrideC{}, swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));

-  LayoutSFA layout_SFA =
+  LayoutSFA layout_SFA = swap_ab ? 
+      ScaleConfig::tile_atom_to_shape_SFA(make_shape(n, m, k, 1)) :
      ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
-  LayoutSFB layout_SFB =
+  LayoutSFB layout_SFB = swap_ab ?
+      ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) :
      ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));

  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
@ -158,9 +174,22 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
  auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
  auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());

-  typename GemmKernel::MainloopArguments mainloop_args{
-      a_ptr,        a_stride,   b_ptr,        b_stride,
-      a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB};
+  auto mainloop_args = [&](){
+    // layout_SFA and layout_SFB cannot be swapped since they are deduced.
+    if (swap_ab) {
+      return typename GemmKernel::MainloopArguments{
+          b_ptr,        b_stride,   a_ptr,        a_stride,
+          b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB
+      };
+    }
+    else {
+      return typename GemmKernel::MainloopArguments{
+          a_ptr,        a_stride,   b_ptr,        b_stride,
+          a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
+      };
+    }
+  }();
+  auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);

  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
  typename GemmKernel::EpilogueArguments epilogue_args{
@ -175,29 +204,74 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
                                               torch::Tensor const& b,
                                               torch::Tensor const& a_scales,
                                               torch::Tensor const& b_scales) {
-  auto m = a.size(0);
-  auto k = a.size(1);
-  auto n = b.size(1);
-  int sms;
+  int32_t m = a.size(0), n = b.size(1), k = a.size(1), sms;
  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());

-  auto should_use_2sm = [&sms](int m, int n, int tile1SM = 128) {
-    return std::ceil(static_cast<float>(m) / tile1SM) *
-               std::ceil(static_cast<float>(n) / tile1SM) >=
-           sms;
-  };
-  bool use_2sm = should_use_2sm(m, n);
-  if (use_2sm) {
-    cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-        OutType, Shape<_256, _128, _128>, Shape<_256, _1, _1>,
-        Shape<_2, _2, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
-        cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
-        out, a, b, a_scales, b_scales);
+  constexpr int TILE_K = 128;
+  // TODO: better heuristics
+  bool swap_ab = (m < 16) || (m % 4 != 0);
+  bool use_tma_epilogue = (m * n) % 4 == 0;
+  if (!swap_ab) {
+    constexpr int TILE_N = 128;
+    int tile_m = 256;
+    if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 64) <= sms) {
+      tile_m = 64;
+    }
+    else if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 128) <= sms) {
+      tile_m = 128;
+    }
+    if (tile_m == 64) {
+      if (use_tma_epilogue) {
+        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+            OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      } else {
+        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+            OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      }
+    } else if (tile_m == 128) {
+      if (use_tma_epilogue) {
+        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+            OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      } else {
+        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+            OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      }
+    } else { // tile_m == 256
+      if (use_tma_epilogue) {
+          cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+              OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_2, _1, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      } else {
+          cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+              OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      }
+    }
  } else {
+    // TODO: Test more tile N configs
+    constexpr int TILE_M = 128;
+    constexpr int TILE_N = 16;
+    // TMA epilogue isn't compatible with Swap A/B
    cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-        OutType, Shape<_128, _128, _128>, Shape<_128, _1, _1>,
-        Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
-        cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+        OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>,
+        Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
+        cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
        out, a, b, a_scales, b_scales);
  }
 }
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
@ -15,16 +15,59 @@ using c3x::cutlass_gemm_caller;
 template <typename InType, typename OutType,
          template <typename, typename, typename> typename Epilogue>
 struct sm100_fp8_config_default {
+  // M in (256, inf)
  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_256, _128, _64>;
+  using TileShape = Shape<_256, _128, _128>;
  using ClusterShape = Shape<_2, _2, _1>;
  using Cutlass3xGemm =
      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
                            KernelSchedule, EpilogueSchedule>;
 };

+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm100_fp8_config_M256 {
+  // M in (128, 256]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_128, _128, _128>;
+  using ClusterShape = Shape<_2, _2, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
+                            KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm100_fp8_config_M128 {
+  // M in (64, 128]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_128, _128, _256>;
+  using ClusterShape = Shape<_2, _4, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
+                            KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm100_fp8_config_M64 {
+  // M in [1, 64]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_64, _64, _256>;
+  using ClusterShape = Shape<_1, _8, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
+                            KernelSchedule, EpilogueSchedule>;
+};
+
 template <typename InType, typename OutType,
          template <typename, typename, typename> typename Epilogue,
          typename... EpilogueArgs>
@ -39,8 +82,34 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
  using Cutlass3xGemmDefault =
      typename sm100_fp8_config_default<InType, OutType,
                                        Epilogue>::Cutlass3xGemm;
-  return cutlass_gemm_caller<Cutlass3xGemmDefault>(
-      out, a, b, std::forward<EpilogueArgs>(args)...);
+  using Cutlass3xGemmM64 =
+      typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM128 =
+      typename sm100_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM256 =
+      typename sm100_fp8_config_M256<InType, OutType, Epilogue>::Cutlass3xGemm;
+
+  uint32_t const m = a.size(0);
+  uint32_t const mp2 =
+      std::max(static_cast<uint32_t>(64), next_pow_2(m));  // next power of 2
+
+  if (mp2 <= 64) {
+    // m in [1, 64]
+    return cutlass_gemm_caller<Cutlass3xGemmM64>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 128) {
+    // m in (64, 128]
+    return cutlass_gemm_caller<Cutlass3xGemmM128>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 256) {
+    // m in (128, 256]
+    return cutlass_gemm_caller<Cutlass3xGemmM256>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else {
+    // m in (256, inf)
+    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  }
 }

 template <template <typename, typename, typename> typename Epilogue,
--- a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu
@ -84,7 +84,8 @@ void run_cutlass_moe_mm_sm90(
    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
  TORCH_CHECK(a_tensors.size(0) > 0, "No input A tensors provided.");
  TORCH_CHECK(b_tensors.size(0) > 0, "No input B tensors provided.");
  TORCH_CHECK(out_tensors.size(0) > 0, "No output tensors provided.");
@ -113,19 +114,23 @@ void run_cutlass_moe_mm_sm90(
  if (n >= 8192) {
    cutlass_group_gemm_caller<Cutlass3xGemmN8192>(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides);
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
  } else if (k >= 8192) {
    cutlass_group_gemm_caller<Cutlass3xGemmK8192>(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides);
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
  } else if (m <= 16) {
    cutlass_group_gemm_caller<Cutlass3xGemmM16>(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides);
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
  } else {
    cutlass_group_gemm_caller<Cutlass3xGemmDefault>(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides);
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
  }
 }

@ -134,15 +139,18 @@ void dispatch_moe_mm_sm90(
    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
  if (out_tensors.dtype() == torch::kBFloat16) {
    run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::bfloat16_t>(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides);
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
  } else {
    run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::half_t>(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides);
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
  }
 }

@ -153,8 +161,9 @@ void cutlass_moe_mm_sm90(
    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
  dispatch_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                       expert_offsets, problem_sizes, a_strides, b_strides,
-                       c_strides);
+                       c_strides, per_act_token, per_out_ch);
 }
--- a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh
+++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh
@ -76,7 +76,8 @@ void cutlass_group_gemm_caller(
    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
  using ElementAB = typename Gemm::ElementAB;
  using ElementD = typename Gemm::ElementD;

@ -84,9 +85,6 @@ void cutlass_group_gemm_caller(
  int k_size = a_tensors.size(1);
  int n_size = out_tensors.size(1);

-  bool per_act_token = a_scales.numel() != 1;
-  bool per_out_ch = b_scales.numel() != num_experts;
-
  auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());

  auto options_int =
--- a/Show More
+++ b/Show More