revert skip-merge-desc

Signed-off-by: Robert Shaw <robshaw@redhat.com>
updated
2025-07-03 20:30:45 +00:00 · 2025-07-03 20:29:33 +00:00 · 2025-07-03 18:29:58 +00:00
1080 changed files with 46026 additions and 71330 deletions
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
 done

 lm_eval --model vllm \
-  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
  --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
  --batch_size "$BATCH_SIZE"
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -18,14 +18,12 @@ RTOL = 0.08

 def launch_lm_eval(eval_config, tp_size):
    trust_remote_code = eval_config.get("trust_remote_code", False)
-    max_model_len = eval_config.get("max_model_len", 4096)
    model_args = (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
        f"enforce_eager=true,"
        f"add_bos_token=true,"
-        f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len}"
+        f"trust_remote_code={trust_remote_code}"
    )
    results = lm_eval.simple_evaluate(
        model="vllm",
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@ -74,7 +74,7 @@ Here is an example of one test inside `latency-tests.json`:
 In this example:

 - The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute control the command line arguments to be used for `vllm bench latency`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+- The `parameters` attribute controls the command line arguments to be used for `benchmark_latency.py`. Please use underscores (`_`) instead of dashes (`-`) when specifying the command line arguments; `run-performance-benchmarks.sh` will convert the underscores to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`.

 Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.

@ -82,13 +82,13 @@ WARNING: The benchmarking script will save json results by itself, so please do

 ### Throughput test

-The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `vllm bench throughput`.
+The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are passed to `benchmark_throughput.py`.

 The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.

 ### Serving test

-We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
+We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:

 ```json
 [
@ -118,8 +118,8 @@ Inside this example:

 - The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
 - The `server-parameters` includes the command line arguments for vLLM server.
- The `client-parameters` includes the command line arguments for `vllm bench serve`.
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `vllm bench serve`
+- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
+- The `qps_list` controls the list of QPS values to test. It is used to configure the `--request-rate` parameter of `benchmark_serving.py`.

 The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.

--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@ -100,7 +100,7 @@ if __name__ == "__main__":
            raw_result = json.loads(f.read())

        if "serving" in str(test_file):
-            # this result is generated via `vllm bench serve` command
+            # this result is generated via `benchmark_serving.py`

            # attach the benchmarking command to raw_result
            try:
@ -120,7 +120,7 @@ if __name__ == "__main__":
            continue

        elif "latency" in f.name:
-            # this result is generated via `vllm bench latency` command
+            # this result is generated via `benchmark_latency.py`

            # attach the benchmarking command to raw_result
            try:
@ -148,7 +148,7 @@ if __name__ == "__main__":
            continue

        elif "throughput" in f.name:
-            # this result is generated via `vllm bench throughput` command
+            # this result is generated via `benchmark_throughput.py`

            # attach the benchmarking command to raw_result
            try:
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@ -73,7 +73,7 @@ get_current_llm_serving_engine() {
    echo "Container: vllm"
    # move to a completely irrelevant directory, to avoid import vllm from current folder
    export CURRENT_LLM_SERVING_ENGINE=vllm
-
+    
    return
  fi
 }
@ -95,14 +95,12 @@ json2args() {
 }

 kill_gpu_processes() {
-  pkill -f '[p]ython'
-  pkill -f '[p]ython3'
-  pkill -f '[t]ritonserver'
-  pkill -f '[p]t_main_thread'
-  pkill -f '[t]ext-generation'
-  pkill -f '[l]mdeploy'
-  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
-  pkill -f '[V]LLM'
+  pkill -f python
+  pkill -f python3
+  pkill -f tritonserver
+  pkill -f pt_main_thread
+  pkill -f text-generation
+  pkill -f lmdeploy

  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
    sleep 1
@ -127,7 +125,7 @@ ensure_installed() {
 }

 run_serving_tests() {
-  # run serving tests using `vllm bench serve` command
+  # run serving tests using `benchmark_serving.py`
  # $1: a json file specifying serving test cases

  local serving_test_file
@ -227,7 +225,7 @@ run_serving_tests() {

      if [[ "$dataset_name" = "sharegpt" ]]; then

-        client_command="vllm bench serve \
+        client_command="python3 benchmark_serving.py \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
@ -248,7 +246,7 @@ run_serving_tests() {
        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')

-        client_command="vllm bench serve \
+        client_command="python3 benchmark_serving.py \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
@ -267,13 +265,13 @@ run_serving_tests() {
          $client_args"

      else
-
+  
        echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
        exit 1

      fi

-
+        

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"
@ -304,7 +302,7 @@ run_serving_tests() {
 }

 run_genai_perf_tests() {
-  # run genai-perf tests
+  # run genai-perf tests 

  # $1: a json file specifying genai-perf test cases
  local genai_perf_test_file
@ -313,14 +311,14 @@ run_genai_perf_tests() {
  # Iterate over genai-perf tests
  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-
+    test_name=$(echo "$params" | jq -r '.test_name')    
+    
    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi
-
+    
    # prepend the current serving engine to the test name
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}

@ -371,10 +369,10 @@ run_genai_perf_tests() {
        qps=$num_prompts
        echo "now qps is $qps"
      fi
-
+    
      new_test_name=$test_name"_qps_"$qps
      backend=$CURRENT_LLM_SERVING_ENGINE
-
+      
      if [[ "$backend" == *"vllm"* ]]; then
        backend="vllm"
      fi
@ -415,7 +413,7 @@ prepare_dataset() {
  do
    cat sonnet.txt >> sonnet_4x.txt
  done
-
+  
 }

 main() {
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -126,8 +126,7 @@ kill_gpu_processes() {
  ps -aux
  lsof -t -i:8000 | xargs -r kill -9
  pgrep python3 | xargs -r kill -9
-  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
-  pgrep VLLM | xargs -r kill -9
+

  # wait until GPU memory usage smaller than 1GB
  if command -v nvidia-smi; then
@ -165,7 +164,7 @@ upload_to_buildkite() {
 }

 run_latency_tests() {
-  # run latency tests using `vllm bench latency` command
+  # run latency tests using `benchmark_latency.py`
  # $1: a json file specifying latency test cases

  local latency_test_file
@ -206,7 +205,7 @@ run_latency_tests() {
      fi
    fi

-    latency_command=" $latency_envs vllm bench latency \
+    latency_command=" $latency_envs python3 benchmark_latency.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $latency_args"

@ -232,7 +231,7 @@ run_latency_tests() {
 }

 run_throughput_tests() {
-  # run throughput tests using `vllm bench throughput`
+  # run throughput tests using `benchmark_throughput.py`
  # $1: a json file specifying throughput test cases

  local throughput_test_file
@ -273,7 +272,7 @@ run_throughput_tests() {
      fi
    fi

-    throughput_command=" $throughput_envs vllm bench throughput \
+    throughput_command=" $throughput_envs python3 benchmark_throughput.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $throughput_args"

@ -298,7 +297,7 @@ run_throughput_tests() {
 }

 run_serving_tests() {
-  # run serving tests using `vllm bench serve` command
+  # run serving tests using `benchmark_serving.py`
  # $1: a json file specifying serving test cases

  local serving_test_file
@ -394,7 +393,7 @@ run_serving_tests() {

      # pass the tensor parallel size to the client so that it can be displayed
      # on the benchmark dashboard
-      client_command="vllm bench serve \
+      client_command="python3 benchmark_serving.py \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
@ -448,7 +447,7 @@ main() {
  (which jq) || (apt-get update && apt-get -y install jq)
  (which lsof) || (apt-get update && apt-get install -y lsof)

-  # get the current IP address, required by `vllm bench serve` command
+  # get the current IP address, required by benchmark_serving.py
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  # turn of the reporting of the status of each request, to clean up the terminal output
  export VLLM_LOGGING_LEVEL="WARNING"
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -52,7 +52,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

  - label: "Annotate release workflow"
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@ -107,8 +107,10 @@ fi

 if [[ $commands == *" kernels/attention"* ]]; then
  commands="${commands} \
-  --ignore=kernels/attention/test_attention_selector.py \
+  --ignore=kernels/attention/stest_attention_selector.py \
+  --ignore=kernels/attention/test_blocksparse_attention.py \
  --ignore=kernels/attention/test_encoder_decoder_attn.py \
+  --ignore=kernels/attention/test_attention_selector.py \
  --ignore=kernels/attention/test_flash_attn.py \
  --ignore=kernels/attention/test_flashinfer.py \
  --ignore=kernels/attention/test_prefix_prefill.py \
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -6,16 +6,15 @@ set -ex

 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
-# used for TP/PP E2E test
 OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}

 export CMAKE_BUILD_PARALLEL_LEVEL=32

 # Setup cleanup
-remove_docker_container() {
-    set -e;
-    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
+remove_docker_container() { 
+    set -e; 
+    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; 
 }
 trap remove_docker_container EXIT
 remove_docker_container
@ -25,8 +24,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2

 function cpu_tests() {
  set -e
@ -49,16 +48,10 @@ function cpu_tests() {
  # Run basic model test
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
-    # Note: disable until supports V1
-    # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
-
-    # Note: disable Bart until supports V1
-    pytest -v -s tests/models/language/generation -m cpu_model \
-                --ignore=tests/models/language/generation/test_bart.py
-    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
-                --ignore=tests/models/language/generation/test_bart.py
-
+    pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+    pytest -v -s tests/models/language/generation -m cpu_model
+    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model
    pytest -v -s tests/models/language/pooling -m cpu_model
    pytest -v -s tests/models/multimodal/generation \
                --ignore=tests/models/multimodal/generation/test_mllama.py \
@ -69,32 +62,39 @@ function cpu_tests() {
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"

-  # Note: disable it until supports V1
  # Run AWQ test
-  # docker exec cpu-test-"$NUMA_NODE" bash -c "
-  #   set -e
-  #   VLLM_USE_V1=0 pytest -s -v \
-  #   tests/quantization/test_ipex_quant.py"
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    VLLM_USE_V1=0 pytest -s -v \
+    tests/quantization/test_ipex_quant.py"
+
+  # Run chunked-prefill and prefix-cache test
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -s -v -k cpu_model \
+    tests/basic_correctness/test_chunked_prefill.py"  
+
+  # online serving
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & 
+    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
+    VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
+      --backend vllm \
+      --dataset-name random \
+      --model facebook/opt-125m \
+      --num-prompts 20 \
+      --endpoint /v1/completions \
+      --tokenizer facebook/opt-125m"

  # Run multi-lora tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
    tests/lora/test_qwen2vl.py"
-
-  # online serving
-  docker exec cpu-test-"$NUMA_NODE" bash -c '
-    set -e
-    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
-    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
-    vllm bench serve \
-      --backend vllm \
-      --dataset-name random \
-      --model meta-llama/Llama-3.2-3B-Instruct \
-      --num-prompts 20 \
-      --endpoint /v1/completions'
 }

 # All of CPU tests are expected to be finished less than 40 mins.
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@ -6,17 +6,19 @@ set -exuo pipefail

 # Try building the docker image
 cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
-FROM gaudi-base-image:latest
+FROM 1.22-413-pt2.7.1:latest

 COPY ./ /workspace/vllm

 WORKDIR /workspace/vllm

+RUN pip install -v -r requirements/hpu.txt
+RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
+
 ENV no_proxy=localhost,127.0.0.1
 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

-RUN VLLM_TARGET_DEVICE=empty pip install .
-RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
+RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install

 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@ -1,166 +0,0 @@
-#!/bin/bash
-
-set -xu
-
-
-remove_docker_container() { 
-    docker rm -f tpu-test || true; 
-    docker rm -f vllm-tpu || true;
-}
-
-trap remove_docker_container EXIT
-
-# Remove the container that might not be cleaned up in the previous run.
-remove_docker_container
-
-# Build the docker image.
-docker build -f docker/Dockerfile.tpu -t vllm-tpu .
-
-# Set up cleanup.
-cleanup_docker() {
-  # Get Docker's root directory
-  docker_root=$(docker info -f '{{.DockerRootDir}}')
-  if [ -z "$docker_root" ]; then
-    echo "Failed to determine Docker root directory."
-    exit 1
-  fi
-  echo "Docker root directory: $docker_root"
-  # Check disk usage of the filesystem where Docker's root directory is located
-  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
-  # Define the threshold
-  threshold=70
-  if [ "$disk_usage" -gt "$threshold" ]; then
-    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
-    # Remove dangling images (those that are not tagged and not used by any container)
-    docker image prune -f
-    # Remove unused volumes / force the system prune for old images as well.
-    docker volume prune -f && docker system prune --force --filter "until=72h" --all
-    echo "Docker images and volumes cleanup completed."
-  else
-    echo "Disk usage is below $threshold%. No cleanup needed."
-  fi
-}
-cleanup_docker
-
-# For HF_TOKEN.
-source /etc/environment
-
-docker run --privileged --net host --shm-size=16G -it \
-    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
-    vllm-tpu /bin/bash -c '
-set -e # Exit immediately if a command exits with a non-zero status.
-set -u # Treat unset variables as an error.
-
-echo "--- Starting script inside Docker container ---"
-
-# Create results directory
-RESULTS_DIR=$(mktemp -d)
-# If mktemp fails, set -e will cause the script to exit.
-echo "Results will be stored in: $RESULTS_DIR"
-
-# Install dependencies
-echo "--- Installing Python dependencies ---"
-python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
-    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
-    && python3 -m pip install --progress-bar off hf-transfer
-echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
-export VLLM_XLA_CHECK_RECOMPILATION=1
-export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"
-
-echo "--- Hardware Information ---"
-# tpu-info
-echo "--- Starting Tests ---"
-set +e
-overall_script_exit_code=0
-
-# --- Test Definitions ---
-# If a test fails, this function will print logs and will not cause the main script to exit.
-run_test() {
-    local test_num=$1
-    local test_name=$2
-    local test_command=$3
-    local log_file="$RESULTS_DIR/test_${test_num}.log"
-    local actual_exit_code
-
-    echo "--- TEST_$test_num: Running $test_name ---"
-    
-    # Execute the test command.
-    eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
-    actual_exit_code=$?
-
-    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
-    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
-
-    if [ "$actual_exit_code" -ne 0 ]; then
-        echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
-        echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
-        if [ -f "$log_file" ]; then
-            cat "$log_file" >&2
-        else
-            echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
-        fi
-        echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
-        return "$actual_exit_code" # Return the failure code
-    else
-        echo "TEST_$test_num ($test_name) PASSED."
-        return 0 # Return success
-    fi
-}
-
-# Helper function to call run_test and update the overall script exit code
-run_and_track_test() {
-    local test_num_arg="$1"
-    local test_name_arg="$2"
-    local test_command_arg="$3"
-
-    # Run the test
-    run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
-    local test_specific_exit_code=$?
-
-    # If the test failed, set the overall script exit code to 1
-    if [ "$test_specific_exit_code" -ne 0 ]; then
-        # No need for extra echo here, run_test already logged the failure.
-        overall_script_exit_code=1
-    fi
-}
-
-# --- Actual Test Execution ---
-run_and_track_test 1 "test_struct_output_generate.py" \
-    "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
-run_and_track_test 2 "test_moe_pallas.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
-run_and_track_test 3 "test_lora.py" \
-    "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
-run_and_track_test 4 "test_tpu_qkv_linear.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
-run_and_track_test 5 "test_spmd_model_weight_loading.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
-run_and_track_test 6 "test_kv_cache_update_kernel.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
-
-# After all tests have been attempted, exit with the overall status.
-if [ "$overall_script_exit_code" -ne 0 ]; then
-    echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
-else
-    echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
-fi
-exit "$overall_script_exit_code"
-' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
-
-# Capture the exit code of the docker run command
-DOCKER_RUN_EXIT_CODE=$?
-
-# The trap will run for cleanup.
-# Exit the main script with the Docker run command's exit code.
-if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
-    echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
-    exit "$DOCKER_RUN_EXIT_CODE"
-else
-    echo "Docker run command completed successfully."
-    exit 0
-fi
-# TODO: This test fails because it uses RANDOM_SEED sampling
-# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -62,8 +62,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
-    && python3 -m pip install --progress-bar off hf-transfer
+    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
 echo "--- Python dependencies installed ---"
 export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
@ -71,7 +70,7 @@ export VLLM_XLA_CACHE_PATH=
 echo "Using VLLM V1"

 echo "--- Hardware Information ---"
-# tpu-info
+tpu-info
 echo "--- Starting Tests ---"
 set +e
 overall_script_exit_code=0
@ -135,7 +134,7 @@ run_and_track_test 1 "test_compilation.py" \
 run_and_track_test 2 "test_basic.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
 run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
-    "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
+    "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
 run_and_track_test 4 "test_quantization_accuracy.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
 run_and_track_test 5 "examples/offline_inference/tpu.py" \
@ -150,6 +149,18 @@ run_and_track_test 9 "test_multimodal.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
 run_and_track_test 10 "test_pallas.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
+run_and_track_test 11 "test_struct_output_generate.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
+run_and_track_test 12 "test_moe_pallas.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
+run_and_track_test 13 "test_lora.py" \
+    "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
+run_and_track_test 14 "test_tpu_qkv_linear.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
+run_and_track_test 15 "test_spmd_model_weight_loading.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
+run_and_track_test 16 "test_kv_cache_update_kernel.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"

 # After all tests have been attempted, exit with the overall status.
 if [ "$overall_script_exit_code" -ne 0 ]; then
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -11,8 +11,8 @@ container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head
 docker build -t ${image_name} -f docker/Dockerfile.xpu .

 # Setup cleanup
-remove_docker_container() {
-  docker rm -f "${container_name}" || true;
+remove_docker_container() { 
+  docker rm -f "${container_name}" || true; 
  docker image rm -f "${image_name}" || true;
  docker system prune -f || true;
 }
@ -26,18 +26,7 @@ docker run \
    --name "${container_name}" \
    "${image_name}" \
    sh -c '
+    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
-    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-    cd tests
-    pytest -v -s v1/core
-    pytest -v -s v1/engine
-    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
-    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
-    pytest -v -s v1/structured_output
-    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py
-    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py
-    pytest -v -s v1/test_serial_utils.py
-    pytest -v -s v1/test_utils.py
-    pytest -v -s v1/test_metrics_reader.py
 '
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)

 # run python-based benchmarks and upload the result to buildkite
-vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
+python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
 bench_latency_exit_code=$?

-vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
+python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
 bench_throughput_exit_code=$?

 # run server-based benchmarks and upload the result to buildkite
@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r

 # wait for server to start, timeout after 600 seconds
 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-vllm bench serve \
+python3 benchmarks/benchmark_serving.py \
    --backend vllm \
    --dataset-name sharegpt \
    --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
--- a/.buildkite/scripts/tpu/docker_run_bm.sh
+++ b/.buildkite/scripts/tpu/docker_run_bm.sh
@ -22,6 +22,16 @@ trap remove_docker_container EXIT
 # Remove the container that might not be cleaned up in the previous run.
 remove_docker_container

+# Build the docker image.
+# TODO: build the image outside this script and share it with the other
+# TPU tests if the build time becomes too long.
+DOCKER_BUILDKIT=1 docker build \
+  --build-arg max_jobs=16 \
+  --build-arg USE_SCCACHE=1 \
+  --build-arg GIT_REPO_CHECK=0 \
+  --tag vllm/vllm-tpu-bm \
+  --progress plain -f docker/Dockerfile.tpu .
+
 LOG_ROOT=$(mktemp -d)
 # If mktemp fails, set -e will cause the script to exit.
 echo "Results will be stored in: $LOG_ROOT"
--- a/.buildkite/scripts/tpu/run_bm.sh
+++ b/.buildkite/scripts/tpu/run_bm.sh
@ -77,7 +77,7 @@ done
 echo "run benchmark test..."
 echo "logging to $BM_LOG"
 echo
-vllm bench serve \
+python benchmarks/benchmark_serving.py \
    --backend vllm \
    --model $MODEL  \
    --dataset-name sonnet \
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -117,7 +117,7 @@ steps:
  commands:
  - pytest -v -s core

- label: Entrypoints Test (LLM) # 40min
+- label: Entrypoints Test # 40min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  fast_check: true
@ -125,6 +125,8 @@ steps:
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/llm
+  - tests/entrypoints/openai
+  - tests/entrypoints/test_chat_utils
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
@ -133,21 +135,9 @@ steps:
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
  - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
-
- label: Entrypoints Test (API Server) # 40min
-  mirror_hardwares: [amdexperimental]
-  working_dir: "/vllm-workspace/tests"
-  fast_check: true
-  torch_nightly: true
-  source_file_dependencies:
-  - vllm/
-  - tests/entrypoints/openai
-  - tests/entrypoints/test_chat_utils
-  commands:
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
  - pytest -v -s entrypoints/test_chat_utils.py
+  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

 - label: Distributed Tests (4 GPUs) # 10min
  mirror_hardwares: [amdexperimental]
@ -159,14 +149,13 @@ steps:
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
+  - tests/spec_decode/e2e/test_integration_dist_tp4
  - tests/compile/test_basic_correctness
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/test_async_llm_dp.py
  - tests/v1/test_external_lb_dp.py
-  - tests/v1/test_internal_lb_dp.py
-  - tests/v1/test_hybrid_lb_dp.py
  - tests/v1/engine/test_engine_core_client.py
  commands:
  # test with tp=2 and external_dp=2
@ -178,13 +167,12 @@ steps:
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
+  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  - pushd ../examples/offline_inference
@ -229,7 +217,7 @@ steps:
 #####  1 GPU test  #####

 - label: Regression Test # 5min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/test_regression
@ -268,7 +256,6 @@ steps:
    - pytest -v -s v1/structured_output
    - pytest -v -s v1/spec_decode
    - pytest -v -s v1/kv_connector/unit
-    - pytest -v -s v1/metrics
    - pytest -v -s v1/test_serial_utils.py
    - pytest -v -s v1/test_utils.py
    - pytest -v -s v1/test_oracle.py
@ -277,11 +264,11 @@ steps:
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
    - pytest -v -s v1/e2e
    # Integration test for streaming correctness (requires special branch).
-    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
+    - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

 - label: Examples Test # 25min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/examples"
  source_file_dependencies:
  - vllm/entrypoints
@ -295,7 +282,7 @@ steps:
    - python3 offline_inference/llm_engine_example.py
    - python3 offline_inference/audio_language.py --seed 0
    - python3 offline_inference/vision_language.py --seed 0
-    - python3 offline_inference/vision_language_pooling.py --seed 0
+    - python3 offline_inference/vision_language_embedding.py --seed 0
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/encoder_decoder.py
@ -315,7 +302,7 @@ steps:


 - label: Platform Tests (CUDA)
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/cuda
@ -333,9 +320,20 @@ steps:
    - pytest -v -s samplers
    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers

- label: LoRA Test %N # 15min each
+- label: Speculative decoding tests # 40min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
+  - vllm/spec_decode
+  - tests/spec_decode
+  - vllm/model_executor/models/eagle.py
+  commands:
+    - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
+    - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
+    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
+
+- label: LoRA Test %N # 15min each
+  mirror_hardwares: [amdexperimental, amdproduction]
+  source_file_dependencies:
  - vllm/lora
  - tests/lora
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
@ -386,7 +384,7 @@ steps:
    - pytest -v -s kernels/core

 - label: Kernels Attention Test %N
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - csrc/attention/
  - vllm/attention
@ -397,7 +395,7 @@ steps:
  parallelism: 2

 - label: Kernels Quantization Test %N
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/layers/quantization
@ -416,7 +414,7 @@ steps:
    - pytest -v -s kernels/moe

 - label: Kernels Mamba Test
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/mamba/
  - tests/kernels/mamba
@ -424,7 +422,7 @@ steps:
    - pytest -v -s kernels/mamba

 - label: Tensorizer Test # 11min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental]
  soft_fail: true
  source_file_dependencies:
  - vllm/model_executor/model_loader
@ -438,6 +436,7 @@ steps:

 - label: Model Executor Test
  mirror_hardwares: [amdexperimental, amdproduction]
+  soft_fail: true
  source_file_dependencies:
  - vllm/model_executor
  - tests/model_executor
@ -494,7 +493,7 @@ steps:
  - pytest -s entrypoints/openai/correctness/

 - label: Encoder Decoder tests # 5min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/encoder_decoder
@ -502,7 +501,7 @@ steps:
    - pytest -v -s encoder_decoder

 - label: OpenAI-Compatible Tool Use # 20 min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental]
  fast_check: false
  source_file_dependencies:
    - vllm/
@ -614,7 +613,7 @@ steps:
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

 - label: Quantized Models Test
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/model_executor/layers/quantization
  - tests/models/quantization
@ -631,18 +630,6 @@ steps:
    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
    # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

- label: Transformers Nightly Models Test
-  working_dir: "/vllm-workspace/"
-  optional: true
-  commands:
-    - pip install --upgrade git+https://github.com/huggingface/transformers
-    - pytest -v -s tests/models/test_initialization.py
-    - pytest -v -s tests/models/multimodal/processing/
-    - pytest -v -s tests/models/multimodal/test_mapping.py
-    - python3 examples/offline_inference/basic/chat.py
-    - python3 examples/offline_inference/audio_language.py --model-type whisper
-    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
-
 #####  1 GPU test  #####
 #####  multi gpus test  #####

@ -717,10 +704,10 @@ steps:
  - pytest -v -s distributed/test_sequence_parallel.py
  # this test fails consistently.
  # TODO: investigate and fix
+  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
-  - pytest -v -s models/multimodal/generation/test_maverick.py

 - label: Plugin Tests (2 GPUs) # 40min
  mirror_hardwares: [amdexperimental]
--- a/.gemini/config.yaml
+++ b/.gemini/config.yaml
@ -1,6 +0,0 @@
-# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
-have_fun: false  # Just review the code
-code_review:
-  comment_severity_threshold: HIGH  # Reduce quantity of comments
-  pull_request_opened:
-    summary: false  # Don't summarize the PR in a separate comment
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -16,8 +16,7 @@
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm
 /vllm/entrypoints @aarnphm
-/vllm/compilation @zou3519 @youkaichao @ProExpertProg
-CMakeLists.txt @tlrmchlsmth @LucasWilkinson
+CMakeLists.txt @tlrmchlsmth

 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
@ -43,6 +42,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/multimodal @DarkLight1337 @ywang96
 /tests/prefix_caching @comaniac @KuntaiDu
 /tests/quantization @mgoin @robertgshaw2-redhat
+/tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
@ -52,15 +52,3 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 # Docs
 /docs @hmellor
 mkdocs.yaml @hmellor
-
-# CPU
-/vllm/v1/worker/^cpu @bigPYJ1151
-/csrc/cpu @bigPYJ1151
-/vllm/platforms/cpu.py @bigPYJ1151
-/cmake/cpu_extension.cmake @bigPYJ1151
-/docker/Dockerfile.cpu @bigPYJ1151
-
-# Intel GPU
-/vllm/v1/worker/^xpu @jikunshang
-/vllm/platforms/xpu.py @jikunshang
-/docker/Dockerfile.xpu @jikunshang
--- a/.github/ISSUE_TEMPLATE/750-RFC.yml
+++ b/.github/ISSUE_TEMPLATE/750-RFC.yml
@ -46,7 +46,7 @@ body:
 - type: markdown
  attributes:
    value: >
-      Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
+      Thanks for contributing 🎉!
 - type: checkboxes
  id: askllm
  attributes:
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -86,6 +86,8 @@ pull_request_rules:
    - and:
      - files~=^vllm/model_executor/models/
      - files=vllm/model_executor/models/registry.py
+      - files=tests/models/registry.py
+      - files=docs/models/supported_models.md
  actions:
    label:
      add:
@ -164,7 +166,10 @@ pull_request_rules:
  description: Automatically apply speculative-decoding label
  conditions:
    - or:
+      - files~=^vllm/spec_decode/
      - files~=^vllm/v1/spec_decode/
+      - files=vllm/model_executor/layers/spec_decode_base_sampler.py
+      - files~=^tests/spec_decode/
      - files~=^tests/v1/spec_decode/
      - files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py
      - files~=^vllm/model_executor/models/.*eagle.*\.py
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@ -7,7 +7,7 @@ permissions:

 jobs:
  lint-and-deploy:
-    runs-on: ubuntu-24.04-arm
+    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@ -68,7 +68,7 @@ jobs:
          export AWS_ACCESS_KEY_ID=minioadmin
          export AWS_SECRET_ACCESS_KEY=minioadmin
          sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
-          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
+          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"

      - name: curl test
        run: |
--- a/.gitignore
+++ b/.gitignore
@ -146,7 +146,6 @@ venv.bak/

 # mkdocs documentation
 /site
-docs/argparse
 docs/examples

 # mypy
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -21,7 +21,7 @@ repos:
  - id: ruff-format
    files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
-  rev: v1.34.0
+  rev: v1.32.0
  hooks:
  - id: typos
 - repo: https://github.com/PyCQA/isort
@ -166,11 +166,11 @@ repos:
    language: python
    types: [python]
    pass_filenames: true
-    files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
+    files: vllm/config.py|tests/test_config.py
  # Keep `suggestion` last
  - id: suggestion
    name: Suggestion
-    entry: bash -c 'echo "To bypass all the pre-commit hooks, add --no-verify to git commit. To skip a specific hook, prefix the commit command with SKIP=<hook-id>."'
+    entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
    language: system
    verbose: true
    pass_filenames: false
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -45,7 +45,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
 set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")

 #
@ -171,6 +171,7 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()

+
 #
 # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
 # setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
@ -231,6 +232,7 @@ endif()

 set(VLLM_EXT_SRC
  "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
+  "csrc/mamba/causal_conv1d/causal_conv1d.cu"
  "csrc/cache_kernels.cu"
  "csrc/attention/paged_attention_v1.cu"
  "csrc/attention/paged_attention_v2.cu"
@ -257,7 +259,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")

  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@ -296,8 +298,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
    "csrc/cutlass_extensions/common.cpp"
-    "csrc/attention/mla/cutlass_mla_entry.cu"
-    "csrc/quantization/fp8/per_token_group_quant.cu")
+    "csrc/attention/mla/cutlass_mla_entry.cu")

  set_gencode_flags_for_srcs(
    SRCS "${VLLM_EXT_SRC}"
@ -392,7 +393,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
  # CUDA 12.0 or later
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
    set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
@ -408,7 +409,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
    message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
      message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                     "later if you intend on running FP8 quantized models on "
@ -423,7 +424,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
  # CUDA 12.8 or later
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
@ -437,7 +438,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
    message(STATUS "Building scaled_mm_c3x_sm120 for archs: ${SCALED_MM_ARCHS}")
  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
      message(STATUS "Not building scaled_mm_c3x_sm120 as CUDA Compiler version is "
                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
                     "later if you intend on running FP8 quantized models on "
@ -452,7 +453,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
  # require CUDA 12.8 or later
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
@ -467,7 +468,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
    message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
      message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
                     "later if you intend on running FP8 quantized models on "
@ -510,7 +511,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
  # require CUDA 12.2 or later (and only work on Hopper).
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@ -519,7 +520,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
      message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
                     "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
                     "if you intend on running FP8 sparse quantized models on Hopper.")
@ -531,7 +532,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  # FP4 Archs and flags
  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
      "csrc/quantization/fp4/nvfp4_experts_quant.cu"
@ -552,10 +553,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  # CUTLASS MLA Archs and flags
  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
    set(SRCS
-      "csrc/attention/mla/cutlass_mla_kernels.cu"
-      "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
+      "csrc/attention/mla/cutlass_mla_kernels.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${MLA_ARCHS}")
@ -578,7 +578,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # if it's possible to compile MoE kernels that use its output.
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -596,26 +596,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()

-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
-    message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
-  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-      message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
-                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
-                     "if you intend on running FP8 quantized MoE models on Blackwell.")
-    else()
-      message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
-                     "in CUDA target architectures.")
-    endif()
-  endif()
-
  # moe_data.cu is used by all CUTLASS MoE kernels.
  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
@ -636,33 +616,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()

-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
-    message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}")
-  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-      message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is "
-                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
-                     "if you intend on running FP8 quantized MoE models on Blackwell.")
-    else()
-      message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found "
-                     "in CUDA target architectures")
-    endif()
-  endif()
-
  #
  # Machete kernels

  # The machete kernels only work on hopper and require CUDA 12.0 or later.
  # Only build Machete kernels if we are building for something compatible with sm90a
  cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND MACHETE_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
    #
    # For the Machete kernels we automatically generate sources for various
    # preselected input type pairs and schedules.
@ -714,7 +674,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

    message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
        AND MACHETE_ARCHS)
      message(STATUS "Not building Machete kernels as CUDA Compiler version is "
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
@ -768,14 +728,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
 endif()

-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  set(MOE_PERMUTE_SRC
-      "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
-      "csrc/moe/moe_permute_unpermute_op.cu")
-
-  list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
-endif()
-
 set_gencode_flags_for_srcs(
  SRCS "${VLLM_MOE_EXT_SRC}"
  CUDA_ARCHS "${CUDA_ARCHS}")
@ -844,6 +796,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()
 endif()

+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  set(MOE_PERMUTE_SRC
+      "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
+      "csrc/moe/moe_permute_unpermute_op.cu")
+  # Apply gencode flags to the permute sources for the configured CUDA archs.
+  set_gencode_flags_for_srcs(
+    SRCS "${MOE_PERMUTE_SRC}"
+    CUDA_ARCHS "${CUDA_ARCHS}")
+  # Add the permute sources to the MoE extension source list.
+  list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
+endif()
 message(STATUS "Enabling moe extension.")
 define_gpu_extension_target(
  _moe_C
--- a/README.md
+++ b/README.md
@ -63,11 +63,13 @@ vLLM is fast with:
 - Speculative decoding
 - Chunked prefill

+**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is in the [nightly-benchmarks folder](.buildkite/nightly-benchmarks/), and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.
+
 vLLM is flexible and easy to use with:

 - Seamless integration with popular Hugging Face models
 - High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
- Tensor, pipeline, data and expert parallelism support for distributed inference
+- Tensor parallelism and pipeline parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
 - Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
--- a/RELEASE.md
+++ b/RELEASE.md
@ -52,36 +52,3 @@ After branch cut, we approach finalizing the release branch with clear criteria
 * Release branch specific changes (e.g. change version identifiers or CI fixes)

 Please note: **No feature work allowed for cherry picks**. All PRs that are considered for cherry-picks need to be merged on trunk, the only exception are Release branch specific changes.
-
-## Manual validations
-
-### E2E Performance Validation
-
-Before each release, we perform end-to-end performance validation to ensure no regressions are introduced. This validation uses the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) on PyTorch CI.
-
-**Current Coverage:**
-* Models: Llama3, Llama4, and Mixtral
-* Hardware: NVIDIA H100 and AMD MI300x
-* *Note: Coverage may change based on new model releases and hardware availability*
-
-**Performance Validation Process:**
-
-**Step 1: Get Access**
-Request write access to the [pytorch/pytorch-integration-testing](https://github.com/pytorch/pytorch-integration-testing) repository to run the benchmark workflow.
-
-**Step 2: Review Benchmark Setup**
-Familiarize yourself with the benchmark configurations:
-* [CUDA setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/cuda)
-* [ROCm setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/rocm)
-
-**Step 3: Run the Benchmark**
-Navigate to the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) and configure:
-* **vLLM branch**: Set to the release branch (e.g., `releases/v0.9.2`)
-* **vLLM commit**: Set to the RC commit hash
-
-**Step 4: Review Results**
-Once the workflow completes, benchmark results will be available on the [vLLM benchmark dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) under the corresponding branch and commit.
-
-**Step 5: Performance Comparison**
-Compare the current results against the previous release to verify no performance regressions have occurred. Here is an
-example of [v0.9.1 vs v0.9.2](https://hud.pytorch.org/benchmark/llms?startTime=Thu%2C%2017%20Apr%202025%2021%3A43%3A50%20GMT&stopTime=Wed%2C%2016%20Jul%202025%2021%3A43%3A50%20GMT&granularity=week&lBranch=releases/v0.9.1&lCommit=b6553be1bc75f046b00046a4ad7576364d03c835&rBranch=releases/v0.9.2&rCommit=a5dd03c1ebc5e4f56f3c9d3dc0436e9c582c978f&repoName=vllm-project%2Fvllm&benchmarkName=&modelName=All%20Models&backendName=All%20Backends&modeName=All%20Modes&dtypeName=All%20DType&deviceName=All%20Devices&archName=All%20Platforms).
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -98,7 +98,7 @@ Then run the benchmarking script
 ```bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-vllm bench serve \
+python3 vllm/benchmarks/benchmark_serving.py \
  --backend vllm \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --endpoint /v1/completions \
@ -111,25 +111,25 @@ If successful, you will see the following output

 ```
 ============ Serving Benchmark Result ============
-Successful requests:                     10
-Benchmark duration (s):                  5.78
-Total input tokens:                      1369
-Total generated tokens:                  2212
-Request throughput (req/s):              1.73
-Output token throughput (tok/s):         382.89
-Total Token throughput (tok/s):          619.85
+Successful requests:                     10        
+Benchmark duration (s):                  5.78      
+Total input tokens:                      1369      
+Total generated tokens:                  2212      
+Request throughput (req/s):              1.73      
+Output token throughput (tok/s):         382.89    
+Total Token throughput (tok/s):          619.85    
 ---------------Time to First Token----------------
-Mean TTFT (ms):                          71.54
-Median TTFT (ms):                        73.88
-P99 TTFT (ms):                           79.49
+Mean TTFT (ms):                          71.54     
+Median TTFT (ms):                        73.88     
+P99 TTFT (ms):                           79.49     
 -----Time per Output Token (excl. 1st token)------
-Mean TPOT (ms):                          7.91
-Median TPOT (ms):                        7.96
-P99 TPOT (ms):                           8.03
+Mean TPOT (ms):                          7.91      
+Median TPOT (ms):                        7.96      
+P99 TPOT (ms):                           8.03      
 ---------------Inter-token Latency----------------
-Mean ITL (ms):                           7.74
-Median ITL (ms):                         7.70
-P99 ITL (ms):                            8.39
+Mean ITL (ms):                           7.74      
+Median ITL (ms):                         7.70      
+P99 ITL (ms):                            8.39      
 ==================================================
 ```

@ -141,7 +141,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you
 {"prompt": "What is the capital of India?"}
 {"prompt": "What is the capital of Iran?"}
 {"prompt": "What is the capital of China?"}
-```
+``` 

 ```bash
 # start server
@ -150,7 +150,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests

 ```bash
 # run benchmarking script
-vllm bench serve --port 9001 --save-result --save-detailed \
+python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
  --backend vllm \
  --model meta-llama/Llama-3.1-8B-Instruct \
  --endpoint /v1/completions \
@ -174,7 +174,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 ```

 ```bash
-vllm bench serve \
+python3 vllm/benchmarks/benchmark_serving.py \
  --backend openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
@ -194,7 +194,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
 ```

 ``` bash
-vllm bench serve \
+python3 benchmarks/benchmark_serving.py \
    --model meta-llama/Meta-Llama-3-8B-Instruct \
    --dataset-name hf \
    --dataset-path likaixin/InstructCoder \
@ -210,7 +210,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 **`lmms-lab/LLaVA-OneVision-Data`**

 ```bash
-vllm bench serve \
+python3 vllm/benchmarks/benchmark_serving.py \
  --backend openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
@ -224,7 +224,7 @@ vllm bench serve \
 **`Aeala/ShareGPT_Vicuna_unfiltered`**

 ```bash
-vllm bench serve \
+python3 vllm/benchmarks/benchmark_serving.py \
  --backend openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
@ -237,7 +237,7 @@ vllm bench serve \
 **`AI-MO/aimo-validation-aime`**

 ``` bash
-vllm bench serve \
+python3 vllm/benchmarks/benchmark_serving.py \
    --model Qwen/QwQ-32B \
    --dataset-name hf \
    --dataset-path AI-MO/aimo-validation-aime \
@ -248,7 +248,7 @@ vllm bench serve \
 **`philschmid/mt-bench`**

 ``` bash
-vllm bench serve \
+python3 vllm/benchmarks/benchmark_serving.py \
    --model Qwen/QwQ-32B \
    --dataset-name hf \
    --dataset-path philschmid/mt-bench \
@ -261,7 +261,7 @@ When using OpenAI-compatible backends such as `vllm`, optional sampling
 parameters can be specified. Example client command:

 ```bash
-vllm bench serve \
+python3 vllm/benchmarks/benchmark_serving.py \
  --backend vllm \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --endpoint /v1/completions \
@ -296,7 +296,7 @@ The following arguments can be used to control the ramp-up:
 <br/>

 ```bash
-vllm bench throughput \
+python3 vllm/benchmarks/benchmark_throughput.py \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --dataset-name sonnet \
  --dataset-path vllm/benchmarks/sonnet.txt \
@ -314,7 +314,7 @@ Total num output tokens:  1500
 **VisionArena Benchmark for Vision Language Models**

 ``` bash
-vllm bench throughput \
+python3 vllm/benchmarks/benchmark_throughput.py \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --backend vllm-chat \
  --dataset-name hf \
@ -336,7 +336,7 @@ Total num output tokens:  1280
 ``` bash
 VLLM_WORKER_MULTIPROC_METHOD=spawn \
 VLLM_USE_V1=1 \
-vllm bench throughput \
+python3 vllm/benchmarks/benchmark_throughput.py \
    --dataset-name=hf \
    --dataset-path=likaixin/InstructCoder \
    --model=meta-llama/Meta-Llama-3-8B-Instruct \
@ -360,7 +360,7 @@ Total num output tokens:  204800
 **`lmms-lab/LLaVA-OneVision-Data`**

 ```bash
-vllm bench throughput \
+python3 vllm/benchmarks/benchmark_throughput.py \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --backend vllm-chat \
  --dataset-name hf \
@ -373,7 +373,7 @@ vllm bench throughput \
 **`Aeala/ShareGPT_Vicuna_unfiltered`**

 ```bash
-vllm bench throughput \
+python3 vllm/benchmarks/benchmark_throughput.py \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --backend vllm-chat \
  --dataset-name hf \
@ -385,7 +385,7 @@ vllm bench throughput \
 **`AI-MO/aimo-validation-aime`**

 ```bash
-vllm bench throughput \
+python3 benchmarks/benchmark_throughput.py \
  --model Qwen/QwQ-32B \
  --backend vllm \
  --dataset-name hf \
@ -399,7 +399,7 @@ vllm bench throughput \
 ``` bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-vllm bench throughput \
+python3 vllm/benchmarks/benchmark_throughput.py \
  --model meta-llama/Llama-2-7b-hf \
  --backend vllm \
  --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@ -1,18 +1,45 @@
 #!/bin/bash

-# This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
-# See details in README (benchmarks/auto_tune/README.md).
+# This script tunes the server parameter combination that maximizes throughput for a given requirement.
+# The server parameters currently tuned are max_num_seqs and max_num_batched_tokens.
+# It also supports additional requirements: e2e latency and prefix cache hit rate.
+
+# Pre-requisite:
+# 1. Check out your branch and install/update the correct running env. For TPU, activate the conda env and install the corresponding torch and xla versions.
+# 2. If the model is customized, replace the MODEL's config with the customized config.
+# 3. Set variables (ALL REQUIRED)
+#   BASE: your directory for vllm repo
+#   MODEL: the model served by vllm
+#   SYSTEM: the hardware, either TPU or GPU; for other systems, saving the best profile might not be supported.
+#   TP: ways of tensor parallelism
+#   DOWNLOAD_DIR: directory to download and load model weights.
+#   INPUT_LEN: request input len
+#   OUTPUT_LEN: request output len
+#   MIN_CACHE_HIT_PCT: prefix cache rate
+#   MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
+#   NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
+#   NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
+#   Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
+# 4. Run the script. It might take a long time; you can use tmux so that the script keeps running if you get disconnected.
+# 5. The final result will be saved in the RESULT file.
+
+
+# Example use cases 
+# 1. Given input_len=1800, output_len=20, what's the best max_num_seqs and max_num_batched_tokens to get highest throughput?
+# Use INPUT_LEN=1800,  OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000
+# 2. If we have latency requirement to be lower than 500ms, what's the best server parameter?
+# Use INPUT_LEN=1800,  OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500
+# 3. If we want to reach a 60% prefix cache hit rate, what's the best server parameter?
+# Use INPUT_LEN=1800,  OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500

 TAG=$(date +"%Y_%m_%d_%H_%M")
-SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-BASE="$SCRIPT_DIR/../../.."
+BASE=""
 MODEL="meta-llama/Llama-3.1-8B-Instruct"
 SYSTEM="TPU"
 TP=1
 DOWNLOAD_DIR=""
 INPUT_LEN=4000
 OUTPUT_LEN=16
-MAX_MODEL_LEN=4096
 MIN_CACHE_HIT_PCT=0
 MAX_LATENCY_ALLOWED_MS=100000000000
 NUM_SEQS_LIST="128 256"
@ -38,13 +65,6 @@ current_hash=$(git rev-parse HEAD)
 echo "hash:$current_hash" >> "$RESULT"
 echo "current_hash: $current_hash"

-TOTAL_LEN=$((INPUT_LEN + OUTPUT_LEN))
-RED='\033[0;31m'
-if (( TOTAL_LEN > MAX_MODEL_LEN )); then
-    echo -e "${RED}FAILED: INPUT_LEN($INPUT_LEN) + OUTPUT_LEN($OUTPUT_LEN) = $TOTAL_LEN, which is > MAX_MODEL_LEN = $MAX_MODEL_LEN.\033[0m" >&2
-    exit 1
-fi
-
 best_throughput=0
 best_max_num_seqs=0
 best_num_batched_tokens=0
@ -56,7 +76,7 @@ start_server() {
    local max_num_batched_tokens=$3
    local vllm_log=$4
    local profile_dir=$5
-
+    
    pkill -f vllm

    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
@ -69,13 +89,13 @@ start_server() {
        --enable-prefix-caching \
        --load-format dummy \
        --download-dir "$DOWNLOAD_DIR" \
-        --max-model-len $MAX_MODEL_LEN > "$vllm_log" 2>&1 &
+        --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &

    # wait for 10 minutes...
    server_started=0
-    for i in {1..60}; do
+    for i in {1..60}; do  
        RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
-        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
+        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) 
        if [[ "$STATUS_CODE" -eq 200 ]]; then
            server_started=1
            break
@ -98,10 +118,10 @@ update_best_profile() {
    selected_profile_file=
    if [[ "$SYSTEM" == "TPU" ]]; then
        selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
-    fi
+    fi 
    if [[ "$SYSTEM" == "GPU" ]]; then
        selected_profile_file="${sorted_paths[$profile_index]}"
-    fi
+    fi 
    rm -f $PROFILE_PATH/*
    cp $selected_profile_file $PROFILE_PATH
 }
@ -129,18 +149,17 @@ run_benchmark() {
        echo "server started."
    fi
    echo
-
+    
    echo "run benchmark test..."
    meet_latency_requirement=0
    # get a basic qps by using request-rate inf
    bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
    prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
-adjusted_input_len=$(( INPUT_LEN - prefix_len ))
-    vllm bench serve \
+    python benchmarks/benchmark_serving.py \
        --backend vllm \
        --model $MODEL  \
        --dataset-name random \
-        --random-input-len $adjusted_input_len \
+        --random-input-len $INPUT_LEN \
        --random-output-len $OUTPUT_LEN \
        --ignore-eos \
        --disable-tqdm \
@ -169,11 +188,11 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
            curl -X POST http://0.0.0.0:8004/reset_prefix_cache
            sleep 5
            bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
-            vllm bench serve \
+            python benchmarks/benchmark_serving.py \
                --backend vllm \
                --model $MODEL  \
                --dataset-name random \
-                --random-input-len $adjusted_input_len \
+                --random-input-len $INPUT_LEN \
                --random-output-len $OUTPUT_LEN \
                --ignore-eos \
                --disable-tqdm \
@ -254,3 +273,4 @@ done
 echo "finish permutations"
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
+
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@ -1,141 +0,0 @@
-# Automated vLLM Server Parameter Tuning
-
-This script automates the process of finding the optimal server parameter combination (`max-num-seqs` and `max-num-batched-tokens`) to maximize throughput for a vLLM server. It also supports additional constraints such as E2E latency and prefix cache hit rate.
-
-## Table of Contents
- [Prerequisites](#prerequisites)
- [Configuration](#configuration)
- [How to Run](#how-to-run)
- [Example Use Cases](#example-use-cases)
- [Output](#output)
- [How It Works](#how-it-works)
-
-## Prerequisites
-
-Before running the script, please ensure the following steps are completed:
-
-1. **Clone vLLM & Set Up Branch**: Clone the vLLM repository and check out to your desired branch.
-
-```bash
-git clone https://github.com/vllm-project/vllm.git
-cd vllm
-# git checkout <your-branch>
-```
-
-1. **Install Environment**: Install or update the correct running environment. For TPU usage, activate your `conda` environment and install the corresponding `torch` and `torch_xla` versions.
-
-2. **Model Configuration**: If you are using a customized model, ensure its configuration files are correctly placed and accessible.
-
-## Configuration
-
-You must set the following variables at the top of the script before execution.
-
-| Variable | Description | Example Value |
-| --- | --- | --- |
-| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
-| `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` |
-| `SYSTEM`| **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
-| `TP` | **Required.** The tensor-parallelism size. | `1` |
-| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
-| `INPUT_LEN` | **Required.** Request input length. | `4000` |
-| `OUTPUT_LEN` | **Required.** Request output length. | `16` |
-| `MAX_MODEL_LEN` | **Required.** Max model length. | `4096` |
-| `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` |
-| `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. | `500` |
-| `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` |
-| `NUM_BATCHED_TOKENS_LIST` | A space-separated string of `max-num-batched-tokens` values to test. | `"1024 2048 4096"` |
-
-**Note**: The default `NUM_SEQS_LIST` and `NUM_BATCHED_TOKENS_LIST` are set for medium-sized inputs/outputs. For very short contexts (e.g., 20 input, 20 output tokens), you may need to test larger values for `max-num-seqs`.
-
-## How to Run
-
-1. **Configure**: Edit the script and set the variables in the [Configuration](#configuration) section.
-2. **Execute**: Run the script. Since the process can take a long time, it is highly recommended to use a terminal multiplexer like `tmux` or `screen` to prevent the script from stopping if your connection is lost.
-
-```
-cd <FOLDER_OF_THIS_SCRIPT>
-bash auto_tune.sh
-```
-
-    Please note that the `bash auto_tune.sh` command cannot contain full or partial path with keyword `vllm`, otherwise `pkill -f vllm` command will also kill this script itself.
-
-## Example Use Cases
-
-Here are a few examples of how to configure the script for different goals:
-
-### 1. Maximize Throughput (No Latency Constraint)
- **Goal**: Find the best `max-num-seqs` and `max-num-batched-tokens` to get the highest possible throughput for 1800 input tokens and 20 output tokens.
- **Configuration**:
-
-```bash
-INPUT_LEN=1800
-OUTPUT_LEN=20
-MAX_MODEL_LEN=2048
-MIN_CACHE_HIT_PCT=0
-MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
-```
-
-#### 2. Maximize Throughput with a Latency Requirement
- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
- **Configuration**:
-
-```bash
-INPUT_LEN=1800
-OUTPUT_LEN=20
-MAX_MODEL_LEN=2048
-MIN_CACHE_HIT_PCT=0
-MAX_LATENCY_ALLOWED_MS=500
-```
-
-#### 3. Maximize Throughput with Prefix Caching and Latency Requirements
- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
- **Configuration**:
-
-```bash
-INPUT_LEN=1800
-OUTPUT_LEN=20
-MAX_MODEL_LEN=2048
-MIN_CACHE_HIT_PCT=60
-MAX_LATENCY_ALLOWED_MS=500
-```
-
-## Output
-
-After the script finishes, you will find the results in a new, timestamped directory created inside `$BASE/auto-benchmark/`.
-
- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run:
-    - `vllm_log_...txt`: The log output from the vLLM server for each parameter combination.
-    - `bm_log_...txt`: The log output from the `vllm bench serve` command for each benchmark run.
-
- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found.
-
-```
-# Example result.txt content
-hash:a1b2c3d4...
-max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 10.0, e2el: 450.5, throughput: 9.8, goodput: 9.8
-max_num_seqs: 128, max_num_batched_tokens: 4096 does not meet latency requirement 500
-...
-best_max_num_seqs: 256, best_num_batched_tokens: 2048, best_throughput: 12.5, profile saved in: /home/user/vllm/auto-benchmark/2024_08_01_10_30/profile
-```
-
-  If it cannot find the best parameters, the final row will be `best_max_num_seqs: 0, best_num_batched_tokens: 0, best_throughput: 0`. This can be due to either the server not starting properly, or the latency requirement being too strict.
-
- **Profiler Trace**: A directory named `profile` is created inside the log directory. It contains the profiler trace file (e.g., `.xplane.pb` for TPU or a `.json` trace for GPU) from the single best-performing run.
-
-## How It Works
-
-The script follows a systematic process to find the optimal parameters:
-
-1. **Find Max GPU Memory Utilization**: The script first determines the highest safe `gpu-memory-utilization` (starting from 0.98 and decreasing) that does not cause an Out-Of-Memory (OOM) error when launching the server. This ensures the benchmark runs use the maximum available memory without crashing.
-
-2. **Iterate and Benchmark**: It then enters a nested loop, iterating through every combination of `max-num-seqs` and `max-num-batched-tokens` provided in the configuration lists.
-
-3. **Latency-Aware Throughput Search**: For each parameter combination:
-    - The vLLM server is started.
-    - A benchmark is first run with an infinite request rate (`--request-rate inf`).
-    - If the resulting P99 E2E latency is within the `MAX_LATENCY_ALLOWED_MS` limit, this throughput is considered the maximum for this configuration.
-    - If the latency is too high, the script performs a search by iteratively decreasing the request rate until the latency constraint is met. This finds the highest sustainable throughput for the given parameters and latency requirement.
-
-4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.
-
-5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard.
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@ -324,9 +324,6 @@ class RandomDataset(BenchmarkDataset):
        input_low = int(real_input_len * (1 - range_ratio))
        input_high = int(real_input_len * (1 + range_ratio))
        output_low = int(output_len * (1 - range_ratio))
-        # Ensure the lower bound for output length is at least 1 to prevent
-        # sampling 0 tokens, which can cause request failures.
-        output_low = max(output_low, 1)
        output_high = int(output_len * (1 + range_ratio))

        # Add logging for debugging
@ -704,7 +701,6 @@ class HuggingFaceDataset(BenchmarkDataset):
        self,
        dataset_path: str,
        dataset_split: str,
-        no_stream: bool = False,
        dataset_subset: Optional[str] = None,
        **kwargs,
    ) -> None:
@ -712,7 +708,6 @@ class HuggingFaceDataset(BenchmarkDataset):

        self.dataset_split = dataset_split
        self.dataset_subset = dataset_subset
-        self.load_stream = not no_stream
        self.load_data()

    def load_data(self) -> None:
@ -721,7 +716,7 @@ class HuggingFaceDataset(BenchmarkDataset):
            self.dataset_path,
            name=self.dataset_subset,
            split=self.dataset_split,
-            streaming=self.load_stream,
+            streaming=True,
        )
        self.data = self.data.shuffle(seed=self.random_seed)

--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@ -11,7 +11,6 @@ from typing import Any, Optional

 import numpy as np
 from tqdm import tqdm
-from typing_extensions import deprecated

 import vllm.envs as envs
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
@ -35,10 +34,6 @@ def save_to_pytorch_benchmark_format(
        write_to_json(pt_file, pt_records)


-@deprecated(
-    "benchmark_latency.py is deprecated and will be removed in a "
-    "future version. Please use 'vllm bench latency' instead.",
-)
 def main(args: argparse.Namespace):
    print(args)

--- a/benchmarks/benchmark_one_concurrent.py
+++ b/benchmarks/benchmark_one_concurrent.py
@ -0,0 +1,362 @@
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+import asyncio
+import logging
+import random
+import time
+from dataclasses import dataclass
+from typing import Optional
+
+import aiohttp  # Import aiohttp
+import numpy as np
+from tqdm import tqdm
+
+from backend_request_func import RequestFuncInput, RequestFuncOutput
+from benchmark_dataset import RandomDataset, SampleRequest
+
+try:
+    from vllm.transformers_utils.tokenizer import get_tokenizer
+except ImportError:
+    from backend_request_func import get_tokenizer
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class BenchmarkMetrics:
+    """Aggregate results of one benchmark run, as built by calculate_metrics.
+
+    Each ``percentiles_*`` field is a list of ``(percentile, value)`` pairs,
+    one per requested percentile.
+    """
+
+    # Counters accumulated over successful requests only.
+    completed: int
+    total_input: int
+    total_output: int
+    # Time-to-first-token statistics (calculate_metrics multiplies the raw
+    # samples by 1000 before storing them in the *_ms fields).
+    mean_ttft_ms: float
+    median_ttft_ms: float
+    std_ttft_ms: float
+    percentiles_ttft_ms: list[tuple[float, float]]
+    # Inter-token latency statistics.
+    mean_itl_ms: float
+    median_itl_ms: float
+    std_itl_ms: float
+    percentiles_itl_ms: list[tuple[float, float]]
+    # End-to-end request latency statistics.
+    mean_e2el_ms: float
+    median_e2el_ms: float
+    std_e2el_ms: float
+    percentiles_e2el_ms: list[tuple[float, float]]
+
+
+async def reset_cache(reset_url: str):
+    """Send a POST request to ``reset_url`` to reset the prefix cache.
+
+    Best effort: connection failures, HTTP error statuses and any other
+    ``Exception`` are logged and swallowed rather than propagated.
+    """
+    logger.debug("Resetting prefix cache at %s", reset_url)
+    try:
+        async with (
+            aiohttp.ClientSession() as session,
+            session.post(reset_url) as response,
+        ):
+            response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
+            logger.debug("Prefix cache reset successful: %s", response.status)
+    except aiohttp.ClientConnectorError as e:
+        # The format string used to end in a stray "}" that leaked into the log.
+        logger.error("Failed to connect to cache reset endpoint %s: %s", reset_url, e)
+    except aiohttp.ClientResponseError as e:
+        logger.error(
+            "Cache reset request failed with status %s: %s", e.status, e.message
+        )
+    except Exception as e:
+        logger.error("An unexpected error occurred during cache reset: %s", e)
+
+
+async def sequential_benchmark(
+    backend: str,
+    api_url: str,
+    model_id: str,
+    tokenizer,
+    input_requests: list[SampleRequest],
+    request_func,
+    selected_percentiles: list[float],
+    cache_reset_url: Optional[str] = None,
+):
+    """
+    Benchmark that processes requests sequentially, waiting for each to complete
+    before starting the next one.
+
+    When ``cache_reset_url`` is set, the prefix cache is reset (via
+    ``reset_cache``) between consecutive requests; nothing is reset after the
+    last request. The reset calls happen inside the timed region, so they are
+    included in the reported benchmark duration.
+
+    Args:
+        backend: Accepted for interface compatibility; not read here.
+        api_url: URL placed in every ``RequestFuncInput``.
+        model_id: Model name placed in every ``RequestFuncInput``.
+        tokenizer: Forwarded to ``calculate_metrics``.
+        input_requests: Requests to send, in order.
+        request_func: Awaitable called as
+            ``request_func(request_func_input=...)`` for each request.
+        selected_percentiles: Percentiles to report for each latency metric.
+        cache_reset_url: Optional endpoint POSTed to between requests.
+
+    Returns:
+        A dict with the run duration, token counts, per-request series and
+        the mean/median/std/percentile summary for ttft, itl and e2el.
+    """
+    outputs = []
+    num_requests = len(input_requests)
+
+    pbar = tqdm(total=num_requests)
+
+    benchmark_start_time = time.perf_counter()
+
+    # Process requests sequentially
+    for index, request in enumerate(input_requests):
+        prompt, prompt_len, output_len = (
+            request.prompt,
+            request.prompt_len,
+            request.expected_output_len,
+        )
+
+        logger.info("Sending request with len %s", request.prompt_len)
+        logger.debug('Request str: "%s"', request.prompt[:50])
+        request_start_time = time.perf_counter()
+
+        request_func_input = RequestFuncInput(
+            model=model_id,
+            prompt=prompt,
+            api_url=api_url,
+            prompt_len=prompt_len,
+            output_len=output_len,
+        )
+
+        output = await request_func(request_func_input=request_func_input)
+
+        request_end_time = time.perf_counter()
+        elapsed = request_end_time - request_start_time
+        # Add timing information when the request function did not record it
+        if output.success and not hasattr(output, "latency"):
+            output.latency = elapsed
+        # getattr keeps the log line from raising when a failed request
+        # carries no latency attribute.
+        logger.info(
+            "Finished request with latency %.4f s",
+            getattr(output, "latency", elapsed),
+        )
+
+        outputs.append(output)
+        pbar.update(1)
+
+        # Reset the prefix cache between requests (not after the last one).
+        if cache_reset_url and index < num_requests - 1:
+            await reset_cache(cache_reset_url)
+
+    pbar.close()
+
+    benchmark_duration = time.perf_counter() - benchmark_start_time
+
+    # Calculate metrics
+    metrics = calculate_metrics(
+        input_requests=input_requests,
+        outputs=outputs,
+        dur_s=benchmark_duration,
+        tokenizer=tokenizer,
+        selected_percentiles=selected_percentiles,
+    )
+
+    print_results(metrics, benchmark_duration)
+
+    result = {
+        "duration": benchmark_duration,
+        "completed": metrics.completed,
+        "total_input_tokens": metrics.total_input,
+        "total_output_tokens": metrics.total_output,
+        "input_lens": [request.prompt_len for request in input_requests],
+        "output_lens": [
+            output.output_tokens if output.success else 0 for output in outputs
+        ],
+        "ttfts": [output.ttft for output in outputs if output.success],
+        "itls": [output.itl for output in outputs if output.success],
+        "generated_texts": [
+            output.generated_text for output in outputs if output.success
+        ],
+        "errors": [output.error for output in outputs if not output.success],
+    }
+
+    # Add summary statistics
+    for stat_name in ["ttft", "itl", "e2el"]:
+        for metric_name in ["mean", "median", "std"]:
+            result[f"{metric_name}_{stat_name}_ms"] = getattr(
+                metrics, f"{metric_name}_{stat_name}_ms"
+            )
+
+        # Integral percentiles are keyed as e.g. "p99", fractional ones as "p99.9".
+        for p, value in getattr(metrics, f"percentiles_{stat_name}_ms"):
+            p_word = str(int(p)) if int(p) == p else str(p)
+            result[f"p{p_word}_{stat_name}_ms"] = value
+
+    return result
+
+
+def calculate_metrics(
+    input_requests: list[SampleRequest],
+    outputs: list[RequestFuncOutput],
+    dur_s: float,
+    tokenizer,
+    selected_percentiles: list[float],
+) -> BenchmarkMetrics:
+    """Aggregate per-request outputs into a BenchmarkMetrics summary.
+
+    Only successful outputs contribute. ``dur_s`` is accepted but not read.
+    A metric with no samples is summarized as if it had the single sample 0.
+    """
+
+    def summarize_ms(samples):
+        # Empty sample lists fall back to [0] so numpy never sees an empty array.
+        values = samples or [0]
+        mean_ms = np.mean(values) * 1000
+        median_ms = np.median(values) * 1000
+        std_ms = np.std(values) * 1000
+        pct_ms = [(p, np.percentile(values, p) * 1000) for p in selected_percentiles]
+        return mean_ms, median_ms, std_ms, pct_ms
+
+    num_completed = 0
+    input_tokens = 0
+    output_tokens = 0
+    ttft_samples = []
+    itl_samples = []
+    e2el_samples = []
+
+    for idx, out in enumerate(outputs):
+        if not out.success:
+            continue
+
+        n_out = out.output_tokens
+        if not n_out:
+            # Use tokenizer to count output tokens if not provided
+            encoded = tokenizer(out.generated_text, add_special_tokens=False)
+            n_out = len(encoded.input_ids)
+
+        output_tokens += n_out
+        input_tokens += input_requests[idx].prompt_len
+
+        ttft = getattr(out, "ttft", None)
+        if ttft is not None:
+            ttft_samples.append(ttft)
+
+        itl = getattr(out, "itl", None)
+        if itl:
+            # Ensure itl is a list of floats
+            if isinstance(itl, list):
+                itl_samples.extend(itl)
+            else:
+                logger.warning(
+                    "Expected list for ITL but got %s. Appending as is.",
+                    type(itl),
+                )
+                itl_samples.append(itl)
+
+        latency = getattr(out, "latency", None)
+        if latency is not None:
+            e2el_samples.append(latency)
+
+        num_completed += 1
+
+    ttft_mean, ttft_median, ttft_std, ttft_pct = summarize_ms(ttft_samples)
+    itl_mean, itl_median, itl_std, itl_pct = summarize_ms(itl_samples)
+    e2el_mean, e2el_median, e2el_std, e2el_pct = summarize_ms(e2el_samples)
+
+    return BenchmarkMetrics(
+        completed=num_completed,
+        total_input=input_tokens,
+        total_output=output_tokens,
+        mean_ttft_ms=ttft_mean,
+        median_ttft_ms=ttft_median,
+        std_ttft_ms=ttft_std,
+        percentiles_ttft_ms=ttft_pct,
+        mean_itl_ms=itl_mean,
+        median_itl_ms=itl_median,
+        std_itl_ms=itl_std,
+        percentiles_itl_ms=itl_pct,
+        mean_e2el_ms=e2el_mean,
+        median_e2el_ms=e2el_median,
+        std_e2el_ms=e2el_std,
+        percentiles_e2el_ms=e2el_pct,
+    )
+
+
+def print_results(metrics: BenchmarkMetrics, benchmark_duration: float):
+    """Print the benchmark summary and per-metric latency tables to stdout."""
+    width = 60
+
+    def banner(title: str, fill: str) -> None:
+        # Center the title in a fixed-width line padded with ``fill``.
+        print(f"{title:{fill}^{width}}")
+
+    def row(label: str, value, precision: str = "") -> None:
+        # Left-aligned label (40 cols) and value (10 cols), one space apart.
+        print(f"{label:<40} {value:<10{precision}}")
+
+    def section(tag: str, title: str) -> None:
+        # The metrics attributes use the lower-cased tag, e.g. mean_ttft_ms.
+        key = tag.lower()
+        banner(title, "-")
+        row(f"Mean {tag} (ms):", getattr(metrics, f"mean_{key}_ms"), ".2f")
+        row(f"Median {tag} (ms):", getattr(metrics, f"median_{key}_ms"), ".2f")
+        for pct, value in getattr(metrics, f"percentiles_{key}_ms"):
+            pct_label = str(pct) if int(pct) != pct else str(int(pct))
+            row(f"P{pct_label} {tag} (ms):", value, ".2f")
+
+    banner(" Sequential Benchmark Result ", "=")
+    row("Successful requests:", metrics.completed)
+    row("Benchmark duration (s):", benchmark_duration, ".2f")
+    row("Total input tokens:", metrics.total_input)
+    row("Total generated tokens:", metrics.total_output)
+    section("TTFT", "Time to First Token")
+    section("ITL", "Inter-token Latency")
+    section("E2EL", "End-to-end Latency")
+    print("=" * width)
+
+
+async def main_async(args):
+    """Build the sample requests from ``args`` and run the sequential benchmark.
+
+    Both the API URL and the prefix-cache reset URL are derived from the same
+    server address: ``--base-url`` when given, otherwise ``--host``/``--port``.
+    Raises ValueError for a backend missing from ``ASYNC_REQUEST_FUNCS``.
+    """
+    # Import needed functions based on your setup
+    from backend_request_func import ASYNC_REQUEST_FUNCS
+
+    backend = args.backend
+    model_id = args.model
+    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
+
+    # --base-url overrides --host/--port for every URL built here, including
+    # the prefix-cache reset endpoint.
+    if args.base_url is not None:
+        server_url = args.base_url
+    else:
+        server_url = f"http://{args.host}:{args.port}"
+    api_url = f"{server_url}{args.endpoint}"
+    cache_reset_url = f"{server_url}/reset_prefix_cache"
+    logger.info("Prefix cache reset configured at: %s", cache_reset_url)
+
+    tokenizer = get_tokenizer(tokenizer_id, trust_remote_code=args.trust_remote_code)
+
+    if backend not in ASYNC_REQUEST_FUNCS:
+        raise ValueError(f"Unknown backend: {backend}")
+    request_func = ASYNC_REQUEST_FUNCS[backend]
+
+    input_requests = RandomDataset().sample(
+        tokenizer=tokenizer,
+        num_requests=args.num_requests,
+        prefix_len=0,
+        input_len=args.input_len,
+        output_len=args.output_len,
+        range_ratio=0.0,
+    )
+
+    return await sequential_benchmark(
+        backend=backend,
+        api_url=api_url,
+        model_id=model_id,
+        tokenizer=tokenizer,
+        input_requests=input_requests,
+        request_func=request_func,
+        selected_percentiles=[50, 90, 95, 99],
+        cache_reset_url=cache_reset_url,
+    )
+
+
+def main(args):
+    """Echo the parsed arguments, seed the RNGs and run the async benchmark."""
+    print(args)
+    for seed_rng in (random.seed, np.random.seed):
+        seed_rng(args.seed)
+    asyncio.run(main_async(args))
+
+
+if __name__ == "__main__":
+    # CLI entry point: declare the options, parse them and run the benchmark.
+    parser = argparse.ArgumentParser(description="Sequential benchmark for LLM serving")
+    add = parser.add_argument
+
+    # Which server to talk to, and through which backend implementation.
+    add("--backend", type=str, default="vllm", help="Backend to use for requests")
+    add(
+        "--base-url",
+        type=str,
+        default=None,
+        help="Server base URL (overrides --host and --port)",
+    )
+    add("--host", type=str, default="127.0.0.1")
+    add("--port", type=int, default=8000)
+    add("--endpoint", type=str, default="/v1/completions", help="API endpoint")
+
+    # Model and tokenizer selection.
+    add("--model", type=str, required=True, help="Name of the model")
+    tokenizer_help = "Name of the tokenizer (defaults to model name)"
+    add("--tokenizer", type=str, help=tokenizer_help)
+
+    # Size of the generated workload.
+    add("--num-requests", type=int, default=100, help="Number of requests to process")
+    add("--input-len", type=int, default=128, help="Input len for generated prompts")
+    add("--output-len", type=int, default=None, help="Override output len for requests")
+
+    # RNG seed, and the flag forwarded when the tokenizer is loaded.
+    add("--seed", type=int, default=42)
+    add(
+        "--trust-remote-code",
+        action="store_true",
+        help="Trust remote code from HuggingFace",
+    )
+
+    # Parse the command line and hand the namespace over to main().
+    args = parser.parse_args()
+    main(args)
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@ -30,7 +30,7 @@ import os
 import random
 import time
 import warnings
-from collections.abc import Iterable
+from collections.abc import AsyncGenerator, Iterable
 from dataclasses import dataclass
 from datetime import datetime
 from typing import Any, Literal, Optional
@ -38,7 +38,6 @@ from typing import Any, Literal, Optional
 import numpy as np
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
-from typing_extensions import deprecated

 from backend_request_func import (
    ASYNC_REQUEST_FUNCS,
@ -74,7 +73,6 @@ from benchmark_dataset import (
    VisionArenaDataset,
 )
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm.benchmarks.serve import get_request

 MILLISECONDS_TO_SECONDS_CONVERSION = 1000

@ -109,6 +107,101 @@ class BenchmarkMetrics:
    percentiles_e2el_ms: list[tuple[float, float]]


+def _get_current_request_rate(
+    ramp_up_strategy: Optional[Literal["linear", "exponential"]],
+    ramp_up_start_rps: Optional[int],
+    ramp_up_end_rps: Optional[int],
+    request_index: int,
+    total_requests: int,
+    request_rate: float,
+) -> float:
+    """Return the request rate to apply to request number ``request_index``.
+
+    Falls back to ``request_rate`` unless a strategy and both RPS bounds are set.
+    """
+    has_bounds = ramp_up_start_rps is not None and ramp_up_end_rps is not None
+    if not (ramp_up_strategy and has_bounds):
+        return request_rate
+    # Fraction of the run covered so far; max() keeps a single request valid.
+    fraction = request_index / max(total_requests - 1, 1)
+    if ramp_up_strategy == "linear":
+        return ramp_up_start_rps + (ramp_up_end_rps - ramp_up_start_rps) * fraction
+    if ramp_up_strategy == "exponential":
+        ratio = ramp_up_end_rps / ramp_up_start_rps
+        return ramp_up_start_rps * (ratio**fraction)
+    raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}")
+
+
+async def get_request(
+    input_requests: list[SampleRequest],
+    request_rate: float,
+    burstiness: float = 1.0,
+    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
+    ramp_up_start_rps: Optional[int] = None,
+    ramp_up_end_rps: Optional[int] = None,
+) -> AsyncGenerator[tuple[SampleRequest, float], None]:
+    """
+    Yield ``(request, rate)`` pairs, sleeping between them so that requests
+    are released at the desired rate, optionally bursty and/or ramped up.
+
+    Args:
+        input_requests:
+            The requests to release, each represented as a SampleRequest.
+            Non-list iterables are converted to a list first.
+        request_rate:
+            Constant release rate in requests/s, used when no ramp-up
+            applies. An infinite rate releases requests without sleeping.
+        burstiness (optional):
+            Shape of the gamma distribution the sleep intervals are drawn
+            from; must be positive. The default of 1 gives exponentially
+            distributed intervals, i.e. a Poisson process. Values in (0, 1)
+            make arrivals more bursty, values above 1 make them more uniform.
+            Has no effect while the rate is infinite.
+        ramp_up_strategy (optional):
+            "linear" or "exponential"; None keeps the rate constant.
+        ramp_up_start_rps (optional):
+            The request rate at the start of the ramp-up.
+        ramp_up_end_rps (optional):
+            The request rate at the end of the ramp-up.
+
+    Yields:
+        Each request together with the rate in effect when it is released.
+
+    Raises:
+        AssertionError: If ``burstiness`` is not positive.
+    """
+    assert burstiness > 0, (
+        f"A positive burstiness factor is expected, but given {burstiness}."
+    )
+
+    # Materialize non-list iterables so the total count is known up front.
+    if not isinstance(input_requests, list) and isinstance(input_requests, Iterable):
+        input_requests = list(input_requests)
+    num_requests = len(input_requests)
+
+    # The position only matters for ramp-up; constant rates ignore it.
+    for position, sample in enumerate(input_requests):
+        rate_now = _get_current_request_rate(
+            ramp_up_strategy,
+            ramp_up_start_rps,
+            ramp_up_end_rps,
+            position,
+            num_requests,
+            request_rate,
+        )
+
+        yield sample, rate_now
+
+        # An infinite rate means back-to-back release: no sleep at all.
+        if rate_now != float("inf"):
+            # Gamma-distributed interval with mean 1 / rate_now; with
+            # burstiness == 1 this is an exponential distribution.
+            scale = 1.0 / (rate_now * burstiness)
+            interval = np.random.gamma(shape=burstiness, scale=scale)
+            # The next request is released once the interval has elapsed.
+            await asyncio.sleep(interval)
+
+
 def calculate_metrics(
    input_requests: list[SampleRequest],
    outputs: list[RequestFuncOutput],
@ -396,6 +489,20 @@ async def benchmark(
        tasks.append(asyncio.create_task(task))
    outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)

+    if profile:
+        print("Stopping profiler...")
+        profile_input = RequestFuncInput(
+            model=model_id,
+            prompt=test_prompt,
+            api_url=base_url + "/stop_profile",
+            prompt_len=test_prompt_len,
+            output_len=test_output_len,
+            logprobs=logprobs,
+        )
+        profile_output = await request_func(request_func_input=profile_input)
+        if profile_output.success:
+            print("Profiler stopped")
+
    if pbar is not None:
        pbar.close()

@ -504,20 +611,6 @@ async def benchmark(

    print("=" * 50)

-    if profile:
-        print("Stopping profiler...")
-        profile_input = RequestFuncInput(
-            model=model_id,
-            prompt=test_prompt,
-            api_url=base_url + "/stop_profile",
-            prompt_len=test_prompt_len,
-            output_len=test_output_len,
-            logprobs=logprobs,
-        )
-        profile_output = await request_func(request_func_input=profile_input)
-        if profile_output.success:
-            print("Profiler stopped")
-
    return result


@ -594,10 +687,6 @@ def save_to_pytorch_benchmark_format(
        write_to_json(pt_file, pt_records)


-@deprecated(
-    "benchmark_serving.py is deprecated and will be removed in a future "
-    "version. Please use 'vllm bench serve' instead.",
-)
 def main(args: argparse.Namespace):
    print(args)
    random.seed(args.seed)
@ -736,7 +825,6 @@ def main(args: argparse.Namespace):
            dataset_subset=args.hf_subset,
            dataset_split=args.hf_split,
            random_seed=args.seed,
-            no_stream=args.no_stream,
        ).sample(
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
@ -945,11 +1033,6 @@ def create_argument_parser():
        help="Path to the sharegpt/sonnet dataset. "
        "Or the huggingface dataset ID if using HF dataset.",
    )
-    parser.add_argument(
-        "--no-stream",
-        action="store_true",
-        help="Do not load the dataset in streaming mode.",
-    )
    parser.add_argument(
        "--max-concurrency",
        type=int,
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@ -538,6 +538,20 @@ async def benchmark(
        )
    outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)

+    if profile:
+        print("Stopping profiler...")
+        profile_input = RequestFuncInput(
+            model=model_id,
+            prompt=test_request.prompt,
+            api_url=base_url + "/stop_profile",
+            prompt_len=test_request.prompt_len,
+            output_len=test_request.expected_output_len,
+            extra_body={test_request.structure_type: test_request.schema},
+        )
+        profile_output = await request_func(request_func_input=profile_input)
+        if profile_output.success:
+            print("Profiler stopped")
+
    if pbar is not None:
        pbar.close()

@ -652,20 +666,6 @@ async def benchmark(

    print("=" * 50)

-    if profile:
-        print("Stopping profiler...")
-        profile_input = RequestFuncInput(
-            model=model_id,
-            prompt=test_request.prompt,
-            api_url=base_url + "/stop_profile",
-            prompt_len=test_request.prompt_len,
-            output_len=test_request.expected_output_len,
-            extra_body={test_request.structure_type: test_request.schema},
-        )
-        profile_output = await request_func(request_func_input=profile_input)
-        if profile_output.success:
-            print("Profiler stopped")
-
    return result, ret


--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@ -15,7 +15,6 @@ import torch
 import uvloop
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
-from typing_extensions import deprecated

 from benchmark_dataset import (
    AIMODataset,
@ -168,8 +167,7 @@ async def run_vllm_async(
    from vllm import SamplingParams

    async with build_async_engine_client_from_engine_args(
-        engine_args,
-        disable_frontend_multiprocessing=disable_frontend_multiprocessing,
+        engine_args, disable_frontend_multiprocessing
    ) as llm:
        model_config = await llm.get_model_config()
        assert all(
@ -358,7 +356,6 @@ def get_requests(args, tokenizer):
    elif args.dataset_name == "burstgpt":
        dataset_cls = BurstGPTDataset
    elif args.dataset_name == "hf":
-        common_kwargs["no_stream"] = args.no_stream
        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = VisionArenaDataset
            common_kwargs["dataset_subset"] = None
@ -383,10 +380,6 @@ def get_requests(args, tokenizer):
    return dataset_cls(**common_kwargs).sample(**sample_kwargs)


-@deprecated(
-    "benchmark_throughput.py is deprecated and will be removed in a "
-    "future version. Please use 'vllm bench throughput' instead.",
-)
 def main(args: argparse.Namespace):
    if args.seed is None:
        args.seed = 0
@ -617,11 +610,6 @@ def create_argument_parser():
        help="Name of the dataset to benchmark on.",
        default="sharegpt",
    )
-    parser.add_argument(
-        "--no-stream",
-        action="store_true",
-        help="Do not load the dataset in streaming mode.",
-    )
    parser.add_argument(
        "--dataset",
        type=str,
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@ -3,7 +3,7 @@
 # benchmark the overhead of disaggregated prefill.
 # methodology:
 # - send all request to prefill vLLM instance. It will buffer KV cache.
-# - then send all request to decode instance.
+# - then send all request to decode instance. 
 # - The TTFT of decode instance is the overhead.

 set -ex
@ -12,8 +12,6 @@ kill_gpu_processes() {
  # kill all processes on GPU.
  pgrep pt_main_thread | xargs -r kill -9
  pgrep python3 | xargs -r kill -9
-  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
-  pgrep VLLM | xargs -r kill -9
  sleep 10

  # remove vllm config file
@ -63,7 +61,7 @@ benchmark() {
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
-
+    

  CUDA_VISIBLE_DEVICES=1 python3 \
    -m vllm.entrypoints.openai.api_server \
@ -78,38 +76,38 @@ benchmark() {
  wait_for_server 8200

  # let the prefill instance finish prefill
-  vllm bench serve \
-    --backend vllm \
-    --model $model \
-    --dataset-name $dataset_name \
-    --dataset-path $dataset_path \
-    --sonnet-input-len $input_len \
-    --sonnet-output-len "$output_len" \
-    --sonnet-prefix-len $prefix_len \
-    --num-prompts $num_prompts \
-    --port 8100 \
-    --save-result \
-    --result-dir $results_folder \
-    --result-filename disagg_prefill_tp1.json \
-    --request-rate "inf"
+  python3 ../benchmark_serving.py \
+          --backend vllm \
+          --model $model \
+          --dataset-name $dataset_name \
+          --dataset-path $dataset_path \
+          --sonnet-input-len $input_len \
+          --sonnet-output-len "$output_len" \
+          --sonnet-prefix-len $prefix_len \
+          --num-prompts $num_prompts \
+          --port 8100 \
+          --save-result \
+          --result-dir $results_folder \
+          --result-filename disagg_prefill_tp1.json \
+          --request-rate "inf"


  # send the request to decode.
  # The TTFT of this command will be the overhead of disagg prefill impl.
-  vllm bench serve \
-    --backend vllm \
-    --model $model \
-    --dataset-name $dataset_name \
-    --dataset-path $dataset_path \
-    --sonnet-input-len $input_len \
-    --sonnet-output-len "$output_len" \
-    --sonnet-prefix-len $prefix_len \
-    --num-prompts $num_prompts \
-    --port 8200 \
-    --save-result \
-    --result-dir $results_folder \
-    --result-filename disagg_prefill_tp1_overhead.json \
-    --request-rate "$qps"
+  python3 ../benchmark_serving.py \
+          --backend vllm \
+          --model $model \
+          --dataset-name $dataset_name \
+          --dataset-path $dataset_path \
+          --sonnet-input-len $input_len \
+          --sonnet-output-len "$output_len" \
+          --sonnet-prefix-len $prefix_len \
+          --num-prompts $num_prompts \
+          --port 8200 \
+          --save-result \
+          --result-dir $results_folder \
+          --result-filename disagg_prefill_tp1_overhead.json \
+          --request-rate "$qps"
  kill_gpu_processes

 }
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@ -18,8 +18,6 @@ kill_gpu_processes() {
  # kill all processes on GPU.
  pgrep pt_main_thread | xargs -r kill -9
  pgrep python3 | xargs -r kill -9
-  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
-  pgrep VLLM | xargs -r kill -9
  for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
  sleep 1
 }
@ -60,7 +58,7 @@ launch_chunked_prefill() {


 launch_disagg_prefill() {
-  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
+  model="meta-llama/Meta-Llama-3.1-8B-Instruct" 
  # disagg prefill
  CUDA_VISIBLE_DEVICES=0 python3 \
    -m vllm.entrypoints.openai.api_server \
@ -99,20 +97,20 @@ benchmark() {
  output_len=$2
  tag=$3

-  vllm bench serve \
-    --backend vllm \
-    --model $model \
-    --dataset-name $dataset_name \
-    --dataset-path $dataset_path \
-    --sonnet-input-len $input_len \
-    --sonnet-output-len "$output_len" \
-    --sonnet-prefix-len $prefix_len \
-    --num-prompts $num_prompts \
-    --port 8000 \
-    --save-result \
-    --result-dir $results_folder \
-    --result-filename "$tag"-qps-"$qps".json \
-    --request-rate "$qps"
+  python3 ../benchmark_serving.py \
+          --backend vllm \
+          --model $model \
+          --dataset-name $dataset_name \
+          --dataset-path $dataset_path \
+          --sonnet-input-len $input_len \
+          --sonnet-output-len "$output_len" \
+          --sonnet-prefix-len $prefix_len \
+          --num-prompts $num_prompts \
+          --port 8000 \
+          --save-result \
+          --result-dir $results_folder \
+          --result-filename "$tag"-qps-"$qps".json \
+          --request-rate "$qps"

  sleep 2
 }
--- a/benchmarks/kernels/bench_fp8_gemm.py
+++ b/benchmarks/kernels/bench_fp8_gemm.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import copy
 import itertools
--- a/benchmarks/kernels/bench_nvfp4_gemm.py
+++ b/benchmarks/kernels/bench_nvfp4_gemm.py
@ -1,141 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import argparse
-import copy
-import itertools
-
-import torch
-from weight_shapes import WEIGHT_SHAPES
-
-from vllm import _custom_ops as ops
-from vllm.platforms import current_platform
-from vllm.scalar_type import scalar_types
-from vllm.triton_utils import triton
-
-if not current_platform.has_device_capability(100):
-    raise RuntimeError("NVFP4 requires compute capability of 10.0 (Blackwell)")
-
-
-FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
-FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
-
-PROVIDER_CFGS = {
-    "torch-bf16": dict(enabled=True),
-    "nvfp4": dict(no_a_quant=False, enabled=True),
-    "nvfp4-noquant": dict(no_a_quant=True, enabled=True),
-}
-
-_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
-
-
-def _quant_weight_nvfp4(b: torch.Tensor, device: str):
-    # Compute global scale for weight
-    b_amax = torch.abs(b).max().to(torch.float32)
-    b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
-    b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
-    return b_fp4, scale_b_fp4, b_global_scale
-
-
-def build_nvfp4_runner(cfg, a, b, dtype, device):
-    b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device)
-
-    # Compute global scale for activation
-    # NOTE: This is generally provided ahead-of-time by the model checkpoint.
-    a_amax = torch.abs(a).max().to(torch.float32)
-    a_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / a_amax
-
-    # Alpha for the GEMM operation
-    alpha = 1.0 / (a_global_scale * b_global_scale)
-
-    if cfg["no_a_quant"]:
-        # Pre-quantize activation
-        a_fp4, scale_a_fp4 = ops.scaled_fp4_quant(a, a_global_scale)
-
-        def run():
-            return ops.cutlass_scaled_fp4_mm(
-                a_fp4, b_fp4, scale_a_fp4, scale_b_fp4, alpha, dtype
-            )
-
-        return run
-
-    # Quantize activation on-the-fly
-    def run():
-        a_fp4, scale_a_fp4 = ops.scaled_fp4_quant(a, a_global_scale)
-        return ops.cutlass_scaled_fp4_mm(
-            a_fp4, b_fp4, scale_a_fp4, scale_b_fp4, alpha, dtype
-        )
-
-    return run
-
-
-@triton.testing.perf_report(
-    triton.testing.Benchmark(
-        x_names=["batch_size"],
-        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
-        x_log=False,
-        line_arg="provider",
-        line_vals=_enabled,
-        line_names=_enabled,
-        ylabel="TFLOP/s (larger is better)",
-        plot_name="BF16 vs NVFP4 GEMMs",
-        args={},
-    )
-)
-def benchmark(batch_size, provider, N, K):
-    M = batch_size
-    device = "cuda"
-    dtype = torch.bfloat16
-
-    a = torch.randn((M, K), device=device, dtype=dtype)
-    b = torch.randn((N, K), device=device, dtype=dtype)
-
-    quantiles = [0.5, 0.2, 0.8]
-
-    if provider == "torch-bf16":
-        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-            lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
-        )
-    else:
-        cfg = PROVIDER_CFGS[provider]
-        run_quant = build_nvfp4_runner(cfg, a, b, dtype, device)
-        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-            lambda: run_quant(), quantiles=quantiles
-        )
-
-    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
-    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
-
-
-def prepare_shapes(args):
-    out = []
-    for model, tp_size in itertools.product(args.models, args.tp_sizes):
-        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
-            KN[tp_dim] //= tp_size
-            KN.append(model)
-            out.append(KN)
-    return out
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--models",
-        nargs="+",
-        type=str,
-        default=["meta-llama/Llama-3.1-8B-Instruct"],
-        choices=list(WEIGHT_SHAPES.keys()),
-    )
-    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
-    args = parser.parse_args()
-
-    for K, N, model in prepare_shapes(args):
-        print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
-        benchmark.run(
-            print_data=True,
-            show_plots=True,
-            save_path=f"bench_nvfp4_res_n{N}_k{K}",
-            N=N,
-            K=K,
-        )
-
-    print("Benchmark finished!")
--- a/benchmarks/kernels/bench_per_token_quant_fp8.py
+++ b/benchmarks/kernels/bench_per_token_quant_fp8.py
@ -1,98 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import itertools
-from typing import Callable
-
-import torch
-
-from vllm import _custom_ops as ops
-from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
-from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
-from vllm.triton_utils import triton
-
-
-# TODO(luka): use standalone_compile utility
-def with_dyn_arg(fn: Callable, arg_index: int, dim_index: int):
-    def inner(*args):
-        torch._dynamo.mark_dynamic(args[arg_index], dim_index)
-        return fn(*args)
-
-    return inner
-
-
-torch._dynamo.config.recompile_limit = 8888
-compilation_config = CompilationConfig(custom_ops=["none"])
-with set_current_vllm_config(VllmConfig(compilation_config=compilation_config)):
-    torch_per_token_quant_fp8 = torch.compile(
-        QuantFP8(False, GroupShape.PER_TOKEN),
-        fullgraph=True,
-        dynamic=False,  # recompile for different shapes
-    )
-
-    # First dim is explicitly dynamic to simulate vLLM usage
-    torch_per_token_quant_fp8 = with_dyn_arg(torch_per_token_quant_fp8, 0, 0)
-
-
-def cuda_per_token_quant_fp8(
-    input: torch.Tensor,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    return ops.scaled_fp8_quant(input)
-
-
-def calculate_diff(batch_size: int, seq_len: int):
-    """Calculate difference between Triton and CUDA implementations."""
-    device = torch.device("cuda")
-    x = torch.rand((batch_size * seq_len, 4096), dtype=torch.float16, device=device)
-
-    torch_out, torch_scale = torch_per_token_quant_fp8(x)
-    cuda_out, cuda_scale = cuda_per_token_quant_fp8(x)
-
-    if torch.allclose(
-        cuda_out.to(torch.float32), torch_out.to(torch.float32), rtol=1e-3, atol=1e-5
-    ) and torch.allclose(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5):
-        print("✅ All implementations match")
-    else:
-        print("❌ Implementations differ")
-
-
-batch_size_range = [1, 16, 32, 64, 128]
-seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
-
-configs = list(itertools.product(batch_size_range, seq_len_range))
-
-
-@triton.testing.perf_report(
-    triton.testing.Benchmark(
-        x_names=["batch_size", "seq_len"],
-        x_vals=configs,
-        line_arg="provider",
-        line_vals=["torch", "cuda"],
-        line_names=["Torch", "CUDA"],
-        styles=[("blue", "-"), ("green", "-")],
-        ylabel="us",
-        plot_name="per-token-dynamic-quant-fp8-performance",
-        args={},
-    )
-)
-def benchmark_quantization(batch_size, seq_len, provider):
-    dtype = torch.float16
-    device = torch.device("cuda")
-
-    x = torch.randn(batch_size * seq_len, 4096, device=device, dtype=dtype)
-
-    quantiles = [0.5, 0.2, 0.8]
-
-    if provider == "torch":
-        fn = lambda: torch_per_token_quant_fp8(x.clone())
-    elif provider == "cuda":
-        fn = lambda: cuda_per_token_quant_fp8(x.clone())
-
-    ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles)
-
-    return 1000 * ms, 1000 * max_ms, 1000 * min_ms
-
-
-if __name__ == "__main__":
-    calculate_diff(batch_size=4, seq_len=4096)
-    benchmark_quantization.run(print_data=True)
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -86,9 +86,6 @@ def benchmark_config(
            (num_experts, 2 * shard_intermediate_size), dtype=torch.float32
        )
        w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32)
-    if use_deep_gemm:
-        # we use the default block shape for deepgemm
-        block_quant_shape = [128, 128]
    if use_fp8_w8a8:
        if block_quant_shape:
            block_n, block_k = block_quant_shape[0], block_quant_shape[1]
@ -576,11 +573,7 @@ def main(args: argparse.Namespace):
        topk = config.num_experts_per_tok
        intermediate_size = config.intermediate_size
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    elif config.architectures[0] in (
-        "DeepseekV3ForCausalLM",
-        "DeepseekV2ForCausalLM",
-        "Glm4MoeForCausalLM",
-    ):
+    elif config.architectures[0] in ("DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"):
        E = config.n_routed_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
@ -590,11 +583,6 @@ def main(args: argparse.Namespace):
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
-        E = config.num_experts
-        topk = config.moe_topk[0]
-        intermediate_size = config.moe_intermediate_size[0]
-        shard_intermediate_size = 2 * intermediate_size // args.tp_size
    else:
        # Support for llama4
        config = config.get_text_config()
@ -632,7 +620,7 @@ def main(args: argparse.Namespace):
            4096,
        ]
    else:
-        batch_sizes = args.batch_size
+        batch_sizes = [args.batch_size]

    use_deep_gemm = bool(args.use_deep_gemm)

@ -740,7 +728,7 @@ if __name__ == "__main__":
    )
    parser.add_argument("--use-deep-gemm", action="store_true")
    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--batch-size", type=int, nargs="+", required=False)
+    parser.add_argument("--batch-size", type=int, required=False)
    parser.add_argument("--tune", action="store_true")
    parser.add_argument("--trust-remote-code", action="store_true")
    parser.add_argument("--model-prefix", type=str, required=False)
--- a/benchmarks/kernels/benchmark_moe_align_block_size.py
+++ b/benchmarks/kernels/benchmark_moe_align_block_size.py
@ -5,8 +5,9 @@ import itertools

 import torch

+from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
-    moe_align_block_size,
+    moe_align_block_size_triton,
 )
 from vllm.triton_utils import triton

@ -20,6 +21,62 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
    )


+def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
+    """
+    Verifies vllm vs. Triton
+    """
+    topk_ids = get_topk_ids(num_tokens, num_experts, topk)
+
+    # 1. malloc space for triton and vllm
+    # malloc enough space (max_num_tokens_padded) for the sorted ids
+    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+    sorted_ids_triton = torch.empty(
+        (max_num_tokens_padded,), dtype=torch.int32, device="cuda"
+    )
+    sorted_ids_triton.fill_(topk_ids.numel())  # fill with sentinel value
+    expert_ids_triton = torch.zeros(
+        (max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda"
+    )
+    num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda")
+
+    sorted_ids_vllm = torch.empty_like(sorted_ids_triton)
+    sorted_ids_vllm.fill_(topk_ids.numel())
+    expert_ids_vllm = torch.zeros_like(expert_ids_triton)
+    num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton)
+
+    # 2. run implementations
+    moe_align_block_size_triton(
+        topk_ids,
+        num_experts,
+        block_size,
+        sorted_ids_triton,
+        expert_ids_triton,
+        num_tokens_post_pad_triton,
+    )
+
+    ops.moe_align_block_size(
+        topk_ids,
+        num_experts,
+        block_size,
+        sorted_ids_vllm,
+        expert_ids_vllm,
+        num_tokens_post_pad_vllm,
+    )
+    print(f"✅ VLLM implementation works with {num_experts} experts!")
+
+    # 3. compare results
+    if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
+        num_tokens_post_pad_triton, num_tokens_post_pad_vllm
+    ):
+        print("✅ Triton and VLLM implementations match.")
+    else:
+        print("❌ Triton and VLLM implementations DO NOT match.")
+        print("Triton expert_ids:", expert_ids_triton)
+        print("VLLM expert_ids:", expert_ids_vllm)
+        print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
+        print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)
+
+
 # test configurations
 num_tokens_range = [1, 16, 256, 4096]
 num_experts_range = [16, 64, 224, 256, 280, 512]
@ -32,8 +89,8 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range
        x_names=["num_tokens", "num_experts", "topk"],
        x_vals=configs,
        line_arg="provider",
-        line_vals=["vllm"],
-        line_names=["vLLM"],
+        line_vals=["vllm", "triton"],  # "triton"
+        line_names=["VLLM", "Triton"],  # "Triton"
        plot_name="moe-align-block-size-performance",
        args={},
    )
@ -43,11 +100,37 @@ def benchmark(num_tokens, num_experts, topk, provider):
    block_size = 256
    topk_ids = get_topk_ids(num_tokens, num_experts, topk)

+    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
+    sorted_ids.fill_(topk_ids.numel())
+    max_num_m_blocks = max_num_tokens_padded // block_size
+    expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
+    num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")
+
    quantiles = [0.5, 0.2, 0.8]

    if provider == "vllm":
        ms, min_ms, max_ms = triton.testing.do_bench(
-            lambda: moe_align_block_size(topk_ids, block_size, num_experts),
+            lambda: ops.moe_align_block_size(
+                topk_ids,
+                num_experts,
+                block_size,
+                sorted_ids.clone(),
+                expert_ids.clone(),
+                num_tokens_post_pad.clone(),
+            ),
+            quantiles=quantiles,
+        )
+    elif provider == "triton":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: moe_align_block_size_triton(
+                topk_ids,
+                num_experts,
+                block_size,
+                sorted_ids.clone(),
+                expert_ids.clone(),
+                num_tokens_post_pad.clone(),
+            ),
            quantiles=quantiles,
        )

@ -71,4 +154,6 @@ if __name__ == "__main__":
    )
    args = parser.parse_args()

+    print("Running correctness check...")
+    check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
    benchmark.run(print_data=True, show_plots=True)
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@ -8,13 +8,12 @@ import ray
 import torch
 from transformers import AutoConfig

-from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
+from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
    _moe_permute,
    _moe_unpermute_and_reduce,
-    moe_permute,
-    moe_unpermute,
 )
+from vllm.model_executor.layers.fused_moe.fused_moe import *
+from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import *
 from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
 from vllm.platforms import current_platform
 from vllm.utils import FlexibleArgumentParser
@ -64,19 +63,18 @@ def benchmark_permute(

    def run():
        if use_customized_permute:
-            (
-                permuted_hidden_states,
-                a1q_scale,
-                first_token_off,
-                inv_perm_idx,
-                m_indices,
-            ) = moe_permute(
-                qhidden_states,
-                a1q_scale=None,
-                topk_ids=topk_ids,
-                n_expert=num_experts,
-                expert_map=None,
-                align_block_size=align_block_size,
+            (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = (
+                moe_permute(
+                    qhidden_states,
+                    topk_weights=topk_weights,
+                    topk_ids=topk_ids,
+                    token_expert_indices=token_expert_indices,
+                    topk=topk,
+                    n_expert=num_experts,
+                    n_local_expert=num_experts,
+                    expert_map=None,
+                    align_block_size=align_block_size,
+                )
            )
        else:
            (
@ -152,19 +150,18 @@ def benchmark_unpermute(

    def prepare():
        if use_customized_permute:
-            (
-                permuted_hidden_states,
-                a1q_scale,
-                first_token_off,
-                inv_perm_idx,
-                m_indices,
-            ) = moe_permute(
-                qhidden_states,
-                a1q_scale=None,
-                topk_ids=topk_ids,
-                n_expert=num_experts,
-                expert_map=None,
-                align_block_size=align_block_size,
+            (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = (
+                moe_permute(
+                    qhidden_states,
+                    topk_weights=topk_weights,
+                    topk_ids=topk_ids,
+                    token_expert_indices=token_expert_indices,
+                    topk=topk,
+                    n_expert=num_experts,
+                    n_local_expert=num_experts,
+                    expert_map=None,
+                    align_block_size=align_block_size,
+                )
            )
            # convert to fp16/bf16 as gemm output
            return (
@ -194,19 +191,16 @@ def benchmark_unpermute(

    def run(input: tuple):
        if use_customized_permute:
-            (
-                permuted_hidden_states,
-                first_token_off,
-                inv_perm_idx,
-                m_indices,
-            ) = input
-            output = torch.empty_like(hidden_states)
+            (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = input
            moe_unpermute(
-                output,
                permuted_hidden_states,
                topk_weights,
+                topk_ids,
                inv_perm_idx,
                first_token_off,
+                topk,
+                num_experts,
+                num_experts,
            )
        else:
            (
@ -217,11 +211,7 @@ def benchmark_unpermute(
                inv_perm,
            ) = input
            _moe_unpermute_and_reduce(
-                output_hidden_states,
-                permuted_hidden_states,
-                inv_perm,
-                topk_weights,
-                True,
+                output_hidden_states, permuted_hidden_states, inv_perm, topk_weights
            )

    # JIT compilation & warmup
@ -328,7 +318,6 @@ def main(args: argparse.Namespace):
    elif (
        config.architectures[0] == "DeepseekV3ForCausalLM"
        or config.architectures[0] == "DeepseekV2ForCausalLM"
-        or config.architectures[0] == "Glm4MoeForCausalLM"
    ):
        E = config.n_routed_experts
        topk = config.num_experts_per_tok
--- a/benchmarks/kernels/benchmark_trtllm_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_attention.py
@ -1,240 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import csv
-import os
-import random
-from datetime import datetime
-
-import flashinfer
-import torch
-
-FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
-
-# KV Cache Layout for TRT-LLM
-# kv_cache_shape = (num_blocks, 2, num_kv_heads, page_size, head_dim)
-
-
-def to_float8(x, dtype=torch.float8_e4m3fn):
-    finfo = torch.finfo(dtype)
-    min_val, max_val = x.aminmax()
-    amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
-    scale = finfo.max / amax * 0.1
-    x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max)
-    return x_scl_sat.to(dtype), scale.float().reciprocal()
-
-
-@torch.no_grad()
-def benchmark_decode(
-    num_seqs,
-    max_seq_len,
-    page_size=16,
-    dtype=torch.bfloat16,
-    kv_layout="HND",
-    num_kv_heads=8,
-    kv_cache_dtype="auto",
-    head_dim=128,
-    warmup=10,
-    trials=20,
-):
-    torch.set_default_device("cuda")
-    device = "cuda"
-    torch.manual_seed(0)
-
-    # Currently only HEAD_GRP_SIZE == 8 is supported
-    HEAD_GRP_SIZE = 8
-    MAX_SEQ_LEN = max_seq_len
-
-    # large number to reduce kv_cache reuse
-    NUM_BLOCKS = int(256000 / page_size)
-
-    workspace_buffer = torch.empty(1024 * 1024 * 1024, dtype=torch.int8, device=device)
-
-    # For decode, batch_size is num_decode_token
-    num_qo_heads = num_kv_heads * HEAD_GRP_SIZE
-    sm_scale = float(1.0 / (head_dim**0.5))
-    q = torch.randn(num_seqs, num_qo_heads, head_dim, device=device, dtype=dtype)
-    kv_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
-
-    max_kv_len = max(kv_lens)
-    kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int, device=device)
-    max_num_blocks_per_seq = (max_kv_len + page_size - 1) // page_size
-
-    block_tables = torch.randint(
-        0, NUM_BLOCKS, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
-    )
-
-    kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, page_size, head_dim)
-    kv_cache = torch.randn(size=kv_cache_shape, device=device, dtype=dtype)
-    k_scale = v_scale = 1.0
-
-    if kv_cache_dtype.startswith("fp8"):
-        kv_cache, _ = to_float8(kv_cache)
-
-    # Benchmark TRT decode
-    def trt_decode():
-        return flashinfer.decode.trtllm_batch_decode_with_kv_cache(
-            q,
-            kv_cache,
-            workspace_buffer,
-            num_qo_heads,
-            num_kv_heads,
-            sm_scale,
-            block_tables,
-            kv_lens_tensor,
-            page_size,
-            max_kv_len,
-            kv_cache_dtype,
-            k_scale,
-            v_scale,
-        )
-
-    def time_fn(fn, warmup=10, trials=20):
-        torch.cuda.synchronize()
-        start = torch.cuda.Event(enable_timing=True)
-        end = torch.cuda.Event(enable_timing=True)
-        times = []
-        for i in range(warmup):
-            fn()
-        for i in range(trials):
-            start.record()
-            fn()
-            end.record()
-            torch.cuda.synchronize()
-            times.append(start.elapsed_time(end))  # ms
-        return sum(times) / len(times), torch.std(torch.tensor(times))
-
-    # TRT Decode
-    trt_mean, trt_std = time_fn(trt_decode)
-
-    kv_indptr = [0]
-    kv_indices = []
-    kv_last_page_lens = []
-    for i in range(num_seqs):
-        seq_len = kv_lens[i]
-        assert seq_len > 0
-        num_blocks = (seq_len + page_size - 1) // page_size
-        kv_indices.extend(block_tables[i, :num_blocks])
-        kv_indptr.append(kv_indptr[-1] + num_blocks)
-        kv_last_page_len = seq_len % page_size
-        if kv_last_page_len == 0:
-            kv_last_page_len = page_size
-        kv_last_page_lens.append(kv_last_page_len)
-
-    kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
-    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
-    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
-
-    wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
-        workspace_buffer,
-        kv_layout,
-        use_tensor_cores=((num_qo_heads // num_kv_heads) > 4),
-    )
-
-    wrapper.plan(
-        kv_indptr,
-        kv_indices,
-        kv_last_page_lens,
-        num_qo_heads,
-        num_kv_heads,
-        head_dim,
-        page_size,
-        "NONE",
-        q_data_type=dtype,
-        kv_data_type=torch.float8_e4m3fn if kv_cache_dtype.startswith("fp8") else dtype,
-    )
-
-    def baseline_decode():
-        return wrapper.run(q, kv_cache, sm_scale, k_scale, v_scale)
-
-    baseline_mean, baseline_std = time_fn(baseline_decode)
-
-    # Calculate percentage speedup (positive means TRT is faster)
-    speedup_percent = (baseline_mean - trt_mean) / baseline_mean
-
-    print(
-        f"\t{num_seqs}\t{max_seq_len}\t{trt_mean:.3f}\t{trt_std.item():.3f}"
-        f"\t{baseline_mean:.3f}\t{baseline_std.item():.3f}\t{speedup_percent:.3f}"
-    )
-
-    # Return results for CSV writing
-    return {
-        "num_seqs": num_seqs,
-        "trt_mean": trt_mean,
-        "trt_std": trt_std.item(),
-        "baseline_mean": baseline_mean,
-        "baseline_std": baseline_std.item(),
-        "speedup_percent": speedup_percent,
-        "q_dtype": str(dtype),
-        "kv_cache_dtype": kv_cache_dtype,
-        "page_size": page_size,
-        "num_kv_heads": num_kv_heads,
-        "head_dim": head_dim,
-        "max_seq_len": max_seq_len,
-    }
-
-
-def write_results_to_csv(results, filename=None):
-    """Write benchmark results to CSV file."""
-    if filename is None:
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        filename = f"flashinfer_trtllm_benchmark_{timestamp}.csv"
-
-    fieldnames = [
-        "num_seqs",
-        "trt_mean",
-        "trt_std",
-        "baseline_mean",
-        "baseline_std",
-        "speedup_percent",
-        "q_dtype",
-        "kv_cache_dtype",
-        "page_size",
-        "num_kv_heads",
-        "head_dim",
-        "max_seq_len",
-    ]
-
-    file_exists = os.path.exists(filename)
-
-    with open(filename, "a", newline="") as csvfile:
-        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
-
-        if not file_exists:
-            writer.writeheader()
-
-        for result in results:
-            writer.writerow(result)
-
-    print(f"Results written to {filename}")
-
-
-if __name__ == "__main__":
-    num_seqs = [1, 4, 8, 16, 32, 64, 128, 256]
-    max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
-    all_results = []
-
-    print("Running benchmark for kv_cache_dtype: bfloat16")
-    print(
-        "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\tbaseline_std\tspeedup_percent"
-    )
-    for max_seq_len in max_seq_lens:
-        for bs in num_seqs:
-            result = benchmark_decode(
-                bs, max_seq_len, dtype=torch.bfloat16, kv_cache_dtype="auto"
-            )
-            all_results.append(result)
-
-    print("Running benchmark for q_dtype = bfloat16, kv_cache_dtype: fp8")
-    print(
-        "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\tbaseline_std\tspeedup_percent"
-    )
-    for max_seq_len in max_seq_lens:
-        for bs in num_seqs:
-            result = benchmark_decode(
-                bs, max_seq_len, dtype=torch.bfloat16, kv_cache_dtype="fp8"
-            )
-            all_results.append(result)
-
-    # Write all results to CSV
-    write_results_to_csv(all_results)
--- a/benchmarks/kv_cache/benchmark_block_pool.py
+++ b/benchmarks/kv_cache/benchmark_block_pool.py
@ -1,108 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import gc
-import time
-from typing import Optional
-
-from tabulate import tabulate
-
-from vllm.utils import FlexibleArgumentParser
-from vllm.v1.core.block_pool import BlockPool
-
-
-class Metric:
-    def __init__(self) -> None:
-        self.cnt: int = 0
-        self.sum_v: int = 0
-        self.max_v: Optional[int] = None
-
-    def update(self, v: int) -> None:
-        self.cnt += 1
-        self.sum_v += v
-        if self.max_v is None:
-            self.max_v = v
-        else:
-            self.max_v = max(self.max_v, v)
-
-    def avg_v(self) -> float:
-        return self.sum_v * 1.0 / self.cnt
-
-
-def main(args):
-    rows = []
-    for allocate_block in args.allocate_blocks:
-        # Enforce a GC collect ahead to minimize the impact among runs
-        gc.collect()
-        block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
-
-        get_blocks_metric: Metric = Metric()
-        free_blocks_metric: Metric = Metric()
-        for _ in range(args.num_iteration):
-            t1 = time.monotonic_ns()
-            blocks = block_pool.get_new_blocks(allocate_block)
-            t2 = time.monotonic_ns()
-            block_pool.free_blocks(blocks)
-            t3 = time.monotonic_ns()
-            get_blocks_metric.update(t2 - t1)
-            free_blocks_metric.update(t3 - t2)
-
-        if get_blocks_metric.max_v is not None and free_blocks_metric.max_v is not None:
-            rows.append(
-                [
-                    get_blocks_metric.cnt,
-                    args.num_gpu_blocks,
-                    allocate_block,
-                    get_blocks_metric.avg_v() / 1000000,
-                    get_blocks_metric.max_v / 1000000.0,
-                    free_blocks_metric.avg_v() / 1000000,
-                    free_blocks_metric.max_v / 1000000.0,
-                ]
-            )
-        else:
-            print(
-                "No valid metrics found."
-                f" {get_blocks_metric.max_v=} {free_blocks_metric.max_v=}"
-            )
-
-    print(
-        tabulate(
-            rows,
-            headers=[
-                "Iterations",
-                "Total\nBlocks",
-                "Allocated\nBlocks",
-                "Get Blocks\nAvg (ms)",
-                "Get Blocks\nMax (ms)",
-                "Free Blocks\nAvg (ms)",
-                "Free Blocks\nMax (ms)",
-            ],
-            tablefmt="grid",
-            floatfmt=".6f",
-        )
-    )
-
-
-def invoke_main() -> None:
-    parser = FlexibleArgumentParser(
-        description="Benchmark the performance of BlockPool for KV Cache."
-    )
-    parser.add_argument("--num-gpu-blocks", type=int, default=100000)
-    parser.add_argument(
-        "--num-iteration",
-        type=int,
-        default=1000,
-        help="Number of iterations to run to stablize final data readings",
-    )
-    parser.add_argument(
-        "--allocate-blocks",
-        type=int,
-        nargs="*",
-        default=[10, 50, 100, 500, 1000],
-        help="Number of blocks to allocate",
-    )
-    args = parser.parse_args()
-    main(args)
-
-
-if __name__ == "__main__":
-    invoke_main()  # pragma: no cover
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@ -58,22 +58,6 @@ function (find_isa CPUINFO TARGET OUT)
    endif()
 endfunction()

-
-function(check_sysctl TARGET OUT)
-    execute_process(COMMAND sysctl -n "${TARGET}"
-                    RESULT_VARIABLE SYSCTL_RET
-                    OUTPUT_VARIABLE SYSCTL_INFO
-                    ERROR_QUIET
-                    OUTPUT_STRIP_TRAILING_WHITESPACE)
-    if(SYSCTL_RET EQUAL 0 AND
-      (SYSCTL_INFO STREQUAL "1" OR SYSCTL_INFO GREATER 0))
-        set(${OUT} ON PARENT_SCOPE)
-    else()
-        set(${OUT} OFF PARENT_SCOPE)
-    endif()
-endfunction()
-
-
 function (is_avx512_disabled OUT)
    set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512})
    if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true")
@ -86,10 +70,7 @@ endfunction()
 is_avx512_disabled(AVX512_DISABLED)

 if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
-    message(STATUS "Apple Silicon Detected")
-    set(ENABLE_NUMA OFF)
-    check_sysctl(hw.optional.neon ASIMD_FOUND)
-    check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND)
+    set(APPLE_SILICON_FOUND TRUE)
 else()
    find_isa(${CPUINFO} "avx2" AVX2_FOUND)
    find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
@ -101,6 +82,7 @@ else()
    find_isa(${CPUINFO} "S390" S390_FOUND)
 endif()

+
 if (AVX512_FOUND AND NOT AVX512_DISABLED)
    list(APPEND CXX_COMPILE_FLAGS
        "-mavx512f"
@ -167,6 +149,9 @@ elseif (ASIMD_FOUND)
        set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")  
    endif()
    list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})     
+elseif(APPLE_SILICON_FOUND)
+    message(STATUS "Apple Silicon Detected")
+    set(ENABLE_NUMA OFF)
 elseif (S390_FOUND)
    message(STATUS "S390 detected")
    # Check for S390 VXE support
@ -180,32 +165,17 @@ else()
 endif()

 #
-# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms)
-# Flag to enable ACL kernels for AARCH64 platforms
-if ( VLLM_BUILD_ACL STREQUAL "ON")
-    set(USE_ACL ON)
-else()
-    set(USE_ACL OFF)
-endif()
-
-if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND)
+# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 platforms)
+#
+if (AVX512_FOUND AND NOT AVX512_DISABLED)
    FetchContent_Declare(
        oneDNN
        GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-        GIT_TAG  v3.8.1
+        GIT_TAG  v3.7.1
        GIT_PROGRESS TRUE
        GIT_SHALLOW TRUE
    )

-    if(USE_ACL)
-        find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
-        if(NOT ARM_COMPUTE_LIBRARY)
-            message(FATAL_ERROR "Could not find ARM Compute Library: please set ACL_ROOT_DIR")
-        endif()
-        set(ONEDNN_AARCH64_USE_ACL "ON")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
-        endif()
-
    set(ONEDNN_LIBRARY_TYPE "STATIC")
    set(ONEDNN_BUILD_DOC "OFF")
    set(ONEDNN_BUILD_EXAMPLES "OFF")
@ -294,11 +264,6 @@ elseif(POWER10_FOUND)
        "csrc/cpu/quant.cpp"
        ${VLLM_EXT_SRC})
 endif()
-if (ASIMD_FOUND)
-    set(VLLM_EXT_SRC
-        "csrc/cpu/quant.cpp"
-        ${VLLM_EXT_SRC})
-endif()

 message(STATUS "CPU extension source files: ${VLLM_EXT_SRC}")

--- a/csrc/attention/attention_kernels.cuh
+++ b/csrc/attention/attention_kernels.cuh
@ -24,7 +24,6 @@

 #include "attention_dtypes.h"
 #include "attention_utils.cuh"
-#include "../cuda_compat.h"

 #ifdef USE_ROCM
  #include <hip/hip_bf16.h>
@ -34,6 +33,12 @@ typedef __hip_bfloat16 __nv_bfloat16;
  #include "../quantization/fp8/nvidia/quant_utils.cuh"
 #endif

+#ifndef USE_ROCM
+  #define WARP_SIZE 32
+#else
+  #define WARP_SIZE warpSize
+#endif
+
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
@ -665,6 +670,7 @@ __global__ void paged_attention_v2_reduce_kernel(

 }  // namespace vllm

+#undef WARP_SIZE
 #undef MAX
 #undef MIN
 #undef DIVIDE_ROUND_UP
--- a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
@ -1,372 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- *this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*
- * Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929
- * by Alcanderian JieXin Liang
- */
-
-/*!
- \file
- \brief An universal device layer for cutlass 3.x-style kernels.
-*/
-
-// clang-format off
-#pragma once
-
-// common
-#include "cutlass/cutlass.h"
-#include "cutlass/device_kernel.h"
-
-#if !defined(__CUDACC_RTC__)
-#include "cutlass/cluster_launch.hpp"
-#include "cutlass/trace.h"
-#endif // !defined(__CUDACC_RTC__)
-
-#include "../kernel/sm100_fmha_mla_tma_warpspecialized.hpp"
-#include "../kernel/sm100_fmha_mla_reduction.hpp"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::fmha::device {
-
-using namespace cute;
-using namespace cutlass::fmha::kernel;
-
-
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////// CUTLASS 3.x API /////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-
-template<
-    class Kernel_
->
-class MLA {
-public:
-
-  using Kernel = Kernel_;
-
-  using ReductionKernel = cutlass::fmha::kernel::Sm100FmhaMlaReductionKernel<
-      typename Kernel::ElementOut,
-      typename Kernel::ElementAcc,
-      typename Kernel::ElementAcc,
-      Kernel::TileShapeH::value,
-      Kernel::TileShapeL::value,
-      256 /*Max split*/
-  >;
-
-  /// Argument structure: User API
-  using KernelArguments = typename Kernel::Arguments;
-  using ReductionArguments = typename ReductionKernel::Arguments;
-
-  using Arguments = KernelArguments;
-
-  /// Argument structure: Kernel API
-  using KernelParams = typename Kernel::Params;
-  using ReductionParams = typename ReductionKernel::Params;
-  struct Params {
-    KernelParams fmha_params;
-    ReductionParams reduction_params;
-  };
-
-private:
-
-  /// Kernel API parameters object
-  Params params_;
-
-  bool is_initialized(bool set = false) {
-    static bool initialized = false;
-    if (set) initialized = true;
-    return initialized;
-  }
-
-  static ReductionArguments to_reduction_args(Arguments const& args) {
-    auto [H, K, D, B] = args.problem_shape;
-    return ReductionArguments{
-      nullptr, args.epilogue.ptr_o, nullptr, args.epilogue.ptr_lse,
-      args.mainloop.softmax_scale, B, args.split_kv, K, args.mainloop.ptr_seq,
-      args.ptr_split_kv, Kernel::TileShapeS::value
-    };
-  }
-
-public:
-
-  /// Access the Params structure
-  Params const& params() const {
-    return params_;
-  }
-
-  static void set_split_kv (KernelArguments& args) {
-    // printf("set_split_kv start");
-    if (args.split_kv >= 1) return;
-    auto [H, K, D, B] = args.problem_shape;
-    // std::cout << H << " " << K << " " << D << " " << B << "\n";      
-    int sm_count = args.hw_info.sm_count;
-    // printf("    sm_count = %d\n", sm_count);
-    int max_splits = ceil_div(K, 128);
-    max_splits = min(16, max_splits);
-    // printf("    max_splits = %d\n", max_splits);
-    int sms_per_batch = max(1, sm_count / B);
-    // printf("    sms_per_batch = %d\n", sms_per_batch);
-    int split_heur = min(max_splits, sms_per_batch);
-    int waves = ceil_div(B * split_heur, sm_count);
-    int k_waves = ceil_div(max_splits, split_heur);
-    int split_wave_aware = ceil_div(max_splits, k_waves);
-    args.split_kv = split_wave_aware;
-    // printf("    args.split_kv = %d\n", args.split_kv);
-
-  }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status
-  can_implement(Arguments const& args) {
-    if (! Kernel::can_implement(args)) {
-      return Status::kInvalid;
-    }
-    if (! ReductionKernel::can_implement(to_reduction_args(args))) {
-      return Status::kInvalid;
-    }
-    return Status::kSuccess;
-  }
-
-  /// Gets the workspace size
-  static size_t
-  get_workspace_size(Arguments const& args) {
-    size_t workspace_bytes = 0;
-    workspace_bytes += Kernel::get_workspace_size(args);
-    workspace_bytes += ReductionKernel::get_workspace_size(to_reduction_args(args));
-    return workspace_bytes;
-  }
-
-  /// Computes the maximum number of active blocks per multiprocessor
-  static int maximum_active_blocks(int /* smem_capacity */ = -1) {
-    CUTLASS_TRACE_HOST("MLA::maximum_active_blocks()");
-    int max_active_blocks = -1;
-    int smem_size = Kernel::SharedStorageSize;
-
-    // first, account for dynamic smem capacity if needed
-    cudaError_t result;
-    if (smem_size >= (48 << 10)) {
-      CUTLASS_TRACE_HOST("  Setting smem size to " << smem_size);
-      result = cudaFuncSetAttribute(
-          device_kernel<Kernel>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize,
-          smem_size);
-      if (cudaSuccess != result) {
-        result = cudaGetLastError(); // to clear the error bit
-        CUTLASS_TRACE_HOST(
-          "  cudaFuncSetAttribute() returned error: "
-          << cudaGetErrorString(result));
-        return -1;
-      }
-    }
-
-    // query occupancy after setting smem size
-    result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &max_active_blocks,
-        device_kernel<Kernel>,
-        Kernel::MaxThreadsPerBlock,
-        smem_size);
-
-    if (cudaSuccess != result) {
-      result = cudaGetLastError(); // to clear the error bit
-      CUTLASS_TRACE_HOST(
-        "  cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error: "
-        << cudaGetErrorString(result));
-      return -1;
-    }
-
-    CUTLASS_TRACE_HOST("  max_active_blocks: " << max_active_blocks);
-    return max_active_blocks;
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status
-  initialize(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) {
-    CUTLASS_TRACE_HOST("MLA::initialize() - workspace "
-      << workspace << ", stream: " << (stream ? "non-null" : "null"));
-
-    // Initialize the workspace
-    Status status = Kernel::initialize_workspace(args, workspace, stream);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-    status = ReductionKernel::initialize_workspace(to_reduction_args(args), workspace, stream);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-    KernelParams kernel_params = Kernel::to_underlying_arguments(args, workspace);
-
-    ReductionArguments reduction_args = to_reduction_args(args);
-    if (reduction_args.split_kv > 1) {
-      reduction_args.ptr_oaccum   = kernel_params.epilogue.ptr_o_acc;
-      reduction_args.ptr_lseaccum = kernel_params.epilogue.ptr_lse_acc;
-    }
-    ReductionParams reduction_params = ReductionKernel::to_underlying_arguments(reduction_args, workspace);
-    // Initialize the Params structure
-    params_ = Params {kernel_params, reduction_params};
-
-    if (is_initialized()) return Status::kSuccess;
-
-    // account for dynamic smem capacity if needed
-    // no dynamic smem is needed for reduction kernel
-    int smem_size = Kernel::SharedStorageSize;
-    if (smem_size >= (48 << 10)) {
-      CUTLASS_TRACE_HOST("  Setting smem size to " << smem_size);
-      cudaError_t result = cudaFuncSetAttribute(
-          device_kernel<Kernel>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize,
-          smem_size);
-      if (cudaSuccess != result) {
-        result = cudaGetLastError(); // to clear the error bit
-        CUTLASS_TRACE_HOST("  cudaFuncSetAttribute() returned error: " << cudaGetErrorString(result));
-        return Status::kErrorInternal;
-      }
-    }
-
-    is_initialized(true);
-
-    return Status::kSuccess;
-  }
-
-  /// Update API is preserved in 3.0, but does not guarantee a lightweight update of params.
-  Status
-  update(Arguments const& args, void* workspace = nullptr) {
-    CUTLASS_TRACE_HOST("MLA()::update() - workspace: " << workspace);
-
-    size_t workspace_bytes = get_workspace_size(args);
-    if (workspace_bytes > 0 && nullptr == workspace) {
-      return Status::kErrorWorkspaceNull;
-    }
-
-    auto fmha_params = Kernel::to_underlying_arguments(args, workspace);
-
-    ReductionArguments reduction_args = to_reduction_args(args);
-    if (reduction_args.split_kv > 1) {
-      reduction_args.ptr_oaccum   = fmha_params.epilogue.ptr_o_acc;
-      reduction_args.ptr_lseaccum = fmha_params.epilogue.ptr_lse_acc;
-    }
-    ReductionParams reduction_params = ReductionKernel::to_underlying_arguments(reduction_args, workspace);
-    // Initialize the Params structure
-    params_ = Params {fmha_params, reduction_params};
-
-    return Status::kSuccess;
-  }
-
-  /// Primary run() entry point API that is static allowing users to create and manage their own params.
-  /// Supplied params struct must be construct by calling Kernel::to_underling_arguments()
-  static Status
-  run(Params& params, cudaStream_t stream = nullptr) {
-    CUTLASS_TRACE_HOST("MLA::run()");
-    dim3 const block = Kernel::get_block_shape();
-    dim3 const grid = Kernel::get_grid_shape(params.fmha_params);
-
-    // configure smem size and carveout
-    int smem_size = Kernel::SharedStorageSize;
-
-    Status launch_result;
-    // Use extended launch API only for mainloops that use it
-    if constexpr(Kernel::ArchTag::kMinComputeCapability >= 90) {
-      dim3 cluster(cute::size<0>(typename Kernel::ClusterShape{}),
-                   cute::size<1>(typename Kernel::ClusterShape{}),
-                   cute::size<2>(typename Kernel::ClusterShape{}));
-      void const* kernel = (void const*) device_kernel<Kernel>;
-      void* kernel_params[] = {&params.fmha_params};
-      launch_result = ClusterLauncher::launch(grid, cluster, block, smem_size, stream, kernel, kernel_params);
-    }
-    else {
-      launch_result = Status::kSuccess;
-      device_kernel<Kernel><<<grid, block, smem_size, stream>>>(params.fmha_params);
-    }
-
-    cudaError_t result = cudaGetLastError();
-    if (cudaSuccess != result or Status::kSuccess != launch_result) {
-      //return Status::kSuccess;
-      CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << result);
-      return Status::kErrorInternal;
-    }
-    if (params.reduction_params.split_kv > 1) {
-      // launch reduction kernel
-      dim3 const block = ReductionKernel::get_block_shape();
-      dim3 const grid  = ReductionKernel::get_grid_shape(params.reduction_params);
-      device_kernel<ReductionKernel><<<grid, block, 0, stream>>>(params.reduction_params);
-      cudaError_t result = cudaGetLastError();
-      if (cudaSuccess == result) {
-        return Status::kSuccess;
-      }
-      else {
-        CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << result);
-        return Status::kErrorInternal;
-      }
-    }
-    else {
-      return Status::kSuccess;
-    }
-  }
-
-  //
-  // Non-static launch overloads that first create and set the internal params struct of this kernel handle.
-  //
-
-  /// Launches the kernel after first constructing Params internal state from supplied arguments.
-  Status
-  run(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) {
-    Status status = initialize(args, workspace, stream);
-    if (Status::kSuccess == status) {
-      status = run(params_, stream);
-    }
-    return status;
-  }
-
-  /// Launches the kernel after first constructing Params internal state from supplied arguments.
-  Status
-  operator()(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) {
-    return run(args, workspace, stream);
-  }
-
-  /// Overload that allows a user to re-launch the same kernel without updating internal params struct.
-  Status
-  run(cudaStream_t stream = nullptr) {
-    return run(params_, stream);
-  }
-
-  /// Overload that allows a user to re-launch the same kernel without updating internal params struct.
-  Status
-  operator()(cudaStream_t stream = nullptr) {
-    return run(params_, stream);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::fmha::device
-
-////////////////////////////////////////////////////////////////////////////////
--- a/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_reduction.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_reduction.hpp
@ -1,203 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights
- *reserved. SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- *this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*
- * Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929
- * by Alcanderian JieXin Liang
- */
-
-// clang-format off
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/arch/arch.h"
-#include "cute/tensor.hpp"
-
-namespace cutlass::fmha::kernel {
-
-using namespace cute;
-template<
-    class ElementOut,
-    class ElementAcc,
-    class ElementScale,
-    size_t kNumHeads,
-    size_t kHeadDimLatent,
-    int kMaxSplits
->
-struct Sm100FmhaMlaReductionKernel {
-
-  static const int SharedStorageSize = 0;
-  static const int MaxThreadsPerBlock = 128;
-  static const int MinBlocksPerMultiprocessor = 1;
-
-  using ArchTag = cutlass::arch::Sm100;
-
-  static_assert(kHeadDimLatent % MaxThreadsPerBlock == 0);
-  struct Arguments {
-    ElementAcc* ptr_oaccum = nullptr;
-    ElementOut* ptr_o = nullptr;
-    ElementAcc* ptr_lseaccum = nullptr;
-    ElementAcc* ptr_lse = nullptr;
-    ElementScale scale = 1.f;
-    int num_batches = 0;
-    int split_kv = -1;
-    int dim_k = -1;
-    int* ptr_seq = nullptr;
-    int* ptr_split_kv = nullptr;
-    int tile_shape_s = 128;
-  };
-  using Params = Arguments;
-
-  static Params to_underlying_arguments(Arguments const& args, void* workspace) {
-    return {args.ptr_oaccum, args.ptr_o, args.ptr_lseaccum, args.ptr_lse,
-	    args.scale, args.num_batches, args.split_kv, args.dim_k, args.ptr_seq,
-	    args.ptr_split_kv, args.tile_shape_s};
-  }
-
-  static size_t get_workspace_size(Arguments const& /*args*/) {
-    return 0;
-  }
-
-  static Status initialize_workspace(
-      Arguments const& /*args*/, void* /*ws*/, cudaStream_t /*stream*/) {
-    return Status::kSuccess;
-  }
-
-  static dim3 get_grid_shape(Params const& params) {
-    return dim3(kNumHeads, 1, params.num_batches);
-  }
-
-  static dim3 get_block_shape() {
-    return dim3(MaxThreadsPerBlock, 1, 1);
-  }
-
-  static bool can_implement(Arguments const& args) {
-    if (args.num_batches <= 0) return false;
-    if (args.split_kv <= 0) return false;
-    return true;
-  }
-
-  CUTLASS_DEVICE void operator() (Params const& params, char* smem_raw) {
-    if (params.split_kv <= 1) return;
-    auto blk_coord = make_coord(blockIdx.x, _0{}, blockIdx.z);
-
-    __shared__ ElementAcc sLseScale[kMaxSplits];
-    const size_t offset_lseaccum = get<0>(blk_coord) + kNumHeads * params.split_kv * get<2>(blk_coord);
-    const size_t offset_lse = get<0>(blk_coord) + kNumHeads * get<2>(blk_coord);
-
-    Tensor gLSEaccum = make_tensor(make_gmem_ptr(params.ptr_lseaccum + offset_lseaccum),
-                                   make_shape(params.split_kv), Stride<Int<kNumHeads>>{});
-
-    Tensor gLSE = make_tensor(make_gmem_ptr(params.ptr_lse + offset_lse),
-                              Shape<_1>{}, Stride<_1>{});
-
-    auto dim_k = params.ptr_seq == nullptr ?  params.dim_k : params.ptr_seq[get<2>(blk_coord)];
-    auto local_split_kv = params.ptr_split_kv == nullptr ? params.split_kv : params.ptr_split_kv[get<2>(blk_coord)];
-    auto k_tile_total = ceil_div(dim_k, params.tile_shape_s);
-    auto k_tile_per_cta = ceil_div(k_tile_total, local_split_kv);
-    local_split_kv = ceil_div(k_tile_total, k_tile_per_cta);
-
-    int warp_idx = cutlass::canonical_warp_idx_sync();
-    if (warp_idx == 0) {
-      constexpr int kNLsePerThread = cute::ceil_div(kMaxSplits, 32);
-
-      ElementAcc local_lse[kNLsePerThread];
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kNLsePerThread; ++i) {
-        const int split = i * 32 + threadIdx.x;
-        local_lse[i] = split < local_split_kv ? gLSEaccum(split) : -std::numeric_limits<ElementAcc>::infinity();
-      }
-
-      ElementAcc lse_max = -std::numeric_limits<ElementAcc>::infinity();
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kNLsePerThread; ++i) {
-        lse_max = max(lse_max, local_lse[i]);
-      }
-      CUTLASS_PRAGMA_UNROLL
-      for (int offset = 16; offset >= 1; offset /= 2) {
-        lse_max = max(lse_max, __shfl_xor_sync(0xffffffff, lse_max, offset));
-      }
-      lse_max = lse_max == -std::numeric_limits<ElementAcc>::infinity() ? 0.0f : lse_max;  // In case all local LSEs are -inf
-      lse_max = __shfl_sync(0xffffffff, lse_max, 0);
-
-      ElementAcc sum_lse = 0;
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kNLsePerThread; ++i) {
-        sum_lse = sum_lse + expf(local_lse[i] - lse_max);
-      }
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int offset = 16; offset >= 1; offset /= 2) {
-        sum_lse = sum_lse + __shfl_xor_sync(0xffffffff, sum_lse, offset);
-      }
-
-      sum_lse = __shfl_sync(0xffffffff, sum_lse, 0);
-
-      ElementAcc global_lse = (sum_lse == 0.f || sum_lse != sum_lse) ? std::numeric_limits<ElementAcc>::infinity() : logf(sum_lse) + lse_max;
-      if (threadIdx.x == 0 and params.ptr_lse != nullptr) {
-        gLSE(0) = global_lse;
-      }
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kNLsePerThread; ++i) {
-        const int split = i * 32 + threadIdx.x;
-        if (split < local_split_kv) {
-          sLseScale[split] = expf(local_lse[i] - global_lse);
-        }
-      }
-    }
-    __syncthreads();
-
-    constexpr int Elements = kHeadDimLatent / MaxThreadsPerBlock;
-    const size_t offset_oaccum = kHeadDimLatent * params.split_kv * (get<0>(blk_coord) + kNumHeads * get<2>(blk_coord));
-    Tensor gOaccum = make_tensor(make_gmem_ptr(params.ptr_oaccum + offset_oaccum),
-                               Shape<Int<kHeadDimLatent>>{}, Stride<_1>{});
-    ElementAcc local_val[Elements] = {0};
-    for (int split = 0; split < local_split_kv; ++split) {
-      ElementAcc lse_scale = sLseScale[split];
-      CUTLASS_PRAGMA_UNROLL
-      for(int i = 0; i < Elements; ++i) {
-        local_val[i] += lse_scale * gOaccum(threadIdx.x + MaxThreadsPerBlock * i);
-      }
-      gOaccum.data() = gOaccum.data() + kHeadDimLatent;
-    }
-    auto ptr_o_local = params.ptr_o + (get<0>(blk_coord) + get<2>(blk_coord) * kNumHeads) * kHeadDimLatent;
-    Tensor gO = make_tensor(make_gmem_ptr(ptr_o_local), Shape<Int<kHeadDimLatent>>{}, Stride<_1>{});
-
-    CUTLASS_PRAGMA_UNROLL
-    for(int i = 0; i < Elements; ++i) {
-      gO(threadIdx.x + MaxThreadsPerBlock * i) = static_cast<ElementOut>(local_val[i]);
-    }
-  }
-};
-
-}  // namespace cutlass::fmha::kernel
--- a/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
--- a/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_mla_tile_scheduler.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_mla_tile_scheduler.hpp
@ -1,165 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights
- *reserved. SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- *this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*
- * Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929
- * by Alcanderian JieXin Liang
- */
-
-// clang-format off
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/kernel_hardware_info.h"
-
-namespace cutlass::fmha::kernel {
-
-////////////////////////////////////////////////////////////////////////////////
-
-struct Sm100MlaIndividualTileScheduler {
-
-  struct Params {
-    dim3 grid;
-  };
-
-  bool valid_ = true;
-
-  CUTLASS_DEVICE
-  Sm100MlaIndividualTileScheduler(Params const&) {}
-
-  template<class ProblemShape, class ClusterShape>
-  static Params to_underlying_arguments(
-      ProblemShape const& problem_shape, KernelHardwareInfo hw_info,
-      ClusterShape const& cluster_shape, int const& split_kv) {
-    using namespace cute;
-    dim3 grid(get<0>(cluster_shape), get<3>(problem_shape) /* Batch */, split_kv /*Maximum Split KV*/);
-    return Params{ grid };
-  }
-
-  static dim3 get_grid_shape(Params const& params) {
-    return params.grid;
-  }
-
-  CUTLASS_DEVICE
-  bool is_valid() {
-    return valid_;
-  }
-
-  CUTLASS_DEVICE
-  auto get_block_coord() {
-    using namespace cute;
-    return make_coord(blockIdx.x, _0{}, blockIdx.y, blockIdx.z);
-  }
-
-  CUTLASS_DEVICE
-  Sm100MlaIndividualTileScheduler& operator++() {
-    valid_ = false;
-    return *this;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-struct Sm100MlaPersistentTileScheduler {
-
-  struct Params {
-    int num_blocks;
-    FastDivmod divmod_m_block;
-    FastDivmod divmod_b;
-    FastDivmod divmod_split_kv;
-    KernelHardwareInfo hw_info;
-  };
-
-  int block_idx = 0;
-  Params params;
-
-  CUTLASS_DEVICE
-  Sm100MlaPersistentTileScheduler(Params const& params) : block_idx(blockIdx.x), params(params) {}
-
-  template<class ProblemShape, class ClusterShape>
-  static Params to_underlying_arguments(
-      ProblemShape const& problem_shape, KernelHardwareInfo hw_info,
-      ClusterShape const& cluster_shape, int const& split_kv) {
-    using namespace cute;
-    // Get SM count if needed, otherwise use user supplied SM count
-    int sm_count = hw_info.sm_count;
-    if (sm_count <= 1 || sm_count % size<0>(cluster_shape) != 0) {
-      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
-          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
-      sm_count = KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
-    }
-
-    CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count);
-    hw_info.sm_count = sm_count;
-
-    int num_m_blocks = size<0>(cluster_shape);
-    int num_blocks = num_m_blocks * get<3>(problem_shape)  /* Batch */;
-    num_blocks *= split_kv; /* Maximum Split KV*/
-
-    return Params {
-      num_blocks,
-      { num_m_blocks}, { get<3>(problem_shape) }, {split_kv},
-      hw_info
-    };
-  }
-
-  static dim3 get_grid_shape(Params const& params) {
-    dim3 grid(std::min(params.num_blocks, params.hw_info.sm_count), 1, 1);
-    return grid;
-  }
-
-  CUTLASS_DEVICE
-  bool is_valid() {
-    return block_idx < params.num_blocks;
-  }
-
-  CUTLASS_DEVICE
-  auto get_block_coord() {
-    using namespace cute;
-    int block_decode = block_idx;
-    int m_block, bidb, n_split_kv;
-    params.divmod_m_block(block_decode, m_block, block_decode);
-    params.divmod_b(block_decode, bidb, block_decode);
-    params.divmod_split_kv(block_decode, n_split_kv, block_decode);
-    return make_coord(m_block, _0{}, bidb, n_split_kv);
-  }
-
-  CUTLASS_DEVICE
-  Sm100MlaPersistentTileScheduler& operator++() {
-    block_idx += gridDim.x;
-    return *this;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::fmha::kernel
--- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
+++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
@ -1,283 +0,0 @@
-/*
-Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-Copyright 2025 SGLang Team. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-/*
- * Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929
- * by Alcanderian JieXin Liang
- */
-#include "core/registration.h"
-
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <cutlass/cutlass.h>
-#include <cutlass/kernel_hardware_info.h>
-#include <torch/all.h>
-
-#include <cute/tensor.hpp>
-#include <iostream>
-
-#include "cutlass_sm100_mla/device/sm100_mla.hpp"
-#include "cutlass_sm100_mla/kernel/sm100_mla_tile_scheduler.hpp"
-
-// clang-format off
-#if !defined(CUDA_VERSION) || CUDA_VERSION < 12040
-void sm100_cutlass_mla_decode(
-    torch::Tensor const& out,
-    torch::Tensor const& q_nope,
-    torch::Tensor const& q_pe,
-    torch::Tensor const& kv_c_and_k_pe_cache,
-    torch::Tensor const& seq_lens,
-    torch::Tensor const& page_table,
-    torch::Tensor const& workspace,
-    int64_t num_kv_splits) {
-  TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_decode");
-}
-int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) {
-  TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_get_workspace_size");
-}
-#else
-
-#define CUTLASS_CHECK(status)                                                       \
-  {                                                                                 \
-    cutlass::Status error = status;                                                 \
-    TORCH_CHECK(error == cutlass::Status::kSuccess, cutlassGetStatusString(error)); \
-  }
-
-using namespace cute;
-using namespace cutlass::fmha::kernel;
-
-template <bool v>
-struct IsPersistent {
-  static const bool value = v;
-};
-
-template <typename T, bool IsPaged128, typename PersistenceOption = IsPersistent<true>>
-struct MlaSm100 {
-  using Element = T;
-  using ElementAcc = float;
-  using ElementOut = T;
-
-  using TileShape = Shape<_128, _128, Shape<_512, _64>>;
-  using TileShapeH = cute::tuple_element_t<0, TileShape>;
-  using TileShapeD = cute::tuple_element_t<2, TileShape>;
-
-  // H K (D_latent D_rope) B
-  using ProblemShape = cute::tuple<TileShapeH, int, TileShapeD, int>;
-
-  using StrideQ = cute::tuple<int64_t, _1, int64_t>;  // H D B
-  using StrideK = cute::tuple<int64_t, _1, int64_t>;  // K D B
-  using StrideO = StrideK;                            // H D B
-  using StrideLSE = cute::tuple<_1, int>;             // H B
-
-  using TileScheduler =
-      std::conditional_t<PersistenceOption::value, Sm100MlaPersistentTileScheduler, Sm100MlaIndividualTileScheduler>;
-
-  using FmhaKernel = cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized<
-      TileShape,
-      Element,
-      ElementAcc,
-      ElementOut,
-      ElementAcc,
-      TileScheduler,
-      /*kIsCpAsync=*/!IsPaged128>;
-  using Fmha = cutlass::fmha::device::MLA<FmhaKernel>;
-};
-
-template <typename T>
-typename T::Fmha::Arguments args_from_options(
-    at::Tensor const& out,
-    at::Tensor const& q_nope,
-    at::Tensor const& q_pe,
-    at::Tensor const& kv_c_and_k_pe_cache,
-    at::Tensor const& seq_lens,
-    at::Tensor const& page_table,
-    double sm_scale,
-    int64_t num_kv_splits) {
-  cutlass::KernelHardwareInfo hw_info;
-  hw_info.device_id = q_nope.device().index();
-  hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
-
-  int batches = q_nope.sizes()[0];
-  int page_count_per_seq = page_table.sizes()[1];
-  int page_count_total = kv_c_and_k_pe_cache.sizes()[0];
-  int page_size = kv_c_and_k_pe_cache.sizes()[1];
-  int max_seq_len = page_size * page_count_per_seq;
-  using TileShapeH = typename T::TileShapeH;
-  using TileShapeD = typename T::TileShapeD;
-  auto problem_shape = cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches);
-
-  auto [H, K, D, B] = problem_shape;
-  auto [D_latent, D_rope] = D;
-
-  float scale = float(sm_scale);
-
-  using StrideQ = typename T::StrideQ;
-  using StrideK = typename T::StrideK;
-  using StrideO = typename T::StrideO;
-  using StrideLSE = typename T::StrideLSE;
-
-  StrideQ stride_Q_nope = cute::make_tuple(
-      static_cast<int64_t>(q_nope.stride(1)), _1{}, static_cast<int64_t>(q_nope.stride(0)));
-  StrideQ stride_Q_pe = cute::make_tuple(
-      static_cast<int64_t>(q_pe.stride(1)), _1{}, static_cast<int64_t>(q_pe.stride(0)));
-
-  StrideK stride_C = cute::make_tuple(
-      static_cast<int64_t>(0 + D_latent + D_rope), _1{}, static_cast<int64_t>(page_size * (D_latent + D_rope)));
-  StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq);
-  StrideLSE stride_LSE = cute::make_tuple(_1{}, 0 + H);
-  StrideO stride_O = cute::make_tuple(static_cast<int64_t>(0 + D_latent), _1{}, static_cast<int64_t>(0 + H * D_latent));
-
-  using Element = typename T::Element;
-  using ElementOut = typename T::ElementOut;
-  using ElementAcc = typename T::ElementAcc;
-  auto Q_nope_ptr = static_cast<Element*>(q_nope.data_ptr());
-  auto Q_pe_ptr = static_cast<Element*>(q_pe.data_ptr());
-  auto C_ptr = static_cast<Element*>(kv_c_and_k_pe_cache.data_ptr());
-  typename T::Fmha::Arguments arguments{
-      problem_shape,
-      {scale,
-       Q_nope_ptr,
-       stride_Q_nope,
-       Q_pe_ptr,
-       stride_Q_pe,
-       C_ptr,
-       stride_C,
-       C_ptr + D_latent,
-       stride_C,
-       static_cast<int*>(seq_lens.data_ptr()),
-       static_cast<int*>(page_table.data_ptr()),
-       stride_PT,
-       page_count_total,
-       page_size},
-      {static_cast<ElementOut*>(out.data_ptr()), stride_O, static_cast<ElementAcc*>(nullptr), stride_LSE},
-      hw_info,
-      // TODO(trevor-m): Change split_kv back to -1 when
-      // https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will
-      // perform worse with larger context length and smaller batch sizes.
-      num_kv_splits, // split_kv
-      nullptr,       // is_var_split_kv
-  };
-  // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
-  // split_kv automatically based on batch size and sequence length to balance
-  // workload across available SMs. Consider using var_split_kv for manual
-  // control if needed.
-  T::Fmha::set_split_kv(arguments);
-  return arguments;
-}
-
-template <typename Element, bool IsPaged128, typename PersistenceOption>
-void runMla(
-    at::Tensor const& out,
-    at::Tensor const& q_nope,
-    at::Tensor const& q_pe,
-    at::Tensor const& kv_c_and_k_pe_cache,
-    at::Tensor const& seq_lens,
-    at::Tensor const& page_table,
-    at::Tensor const& workspace,
-    double sm_scale,
-    int64_t num_kv_splits,
-    cudaStream_t stream) {
-  using MlaSm100Type = MlaSm100<Element, IsPaged128, PersistenceOption>;
-  typename MlaSm100Type::Fmha fmha;
-  auto arguments = args_from_options<MlaSm100Type>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);
-
-  CUTLASS_CHECK(fmha.can_implement(arguments));
-
-  CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream));
-
-  CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream));
-}
-
-#define DISPATCH_BOOL(expr, const_expr, ...) \
-  [&]() -> bool {                            \
-    if (expr) {                              \
-      constexpr bool const_expr = true;      \
-      return __VA_ARGS__();                  \
-    } else {                                 \
-      constexpr bool const_expr = false;     \
-      return __VA_ARGS__();                  \
-    }                                        \
-  }()
-
-void sm100_cutlass_mla_decode(
-    torch::Tensor const& out,
-    torch::Tensor const& q_nope,
-    torch::Tensor const& q_pe,
-    torch::Tensor const& kv_c_and_k_pe_cache,
-    torch::Tensor const& seq_lens,
-    torch::Tensor const& page_table,
-    torch::Tensor const& workspace,
-    double sm_scale,
-    int64_t num_kv_splits) {
-  auto in_dtype = q_nope.dtype();
-  at::cuda::CUDAGuard device_guard{(char)q_nope.get_device()};
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(q_nope.get_device());
-  const int page_size = kv_c_and_k_pe_cache.sizes()[1];
-  
-  // NOTE(alcanderian): IsPersistent has bug with manual split_kv.
-  // Kernel will hang if batch is too large with large num_kv_splits. (for example bs=8, num_kv_splits=8)
-  // Maybe per batch split kv will fix this.
-  DISPATCH_BOOL(page_size == 128, IsPaged128, [&] {
-    DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] {
-      if (in_dtype == at::ScalarType::Half) {
-        runMla<cutlass::half_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
-          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
-      } else if (in_dtype == at::ScalarType::BFloat16) {
-        runMla<cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
-          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
-      } else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
-        runMla<cutlass::float_e4m3_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
-          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
-      } else {
-        TORCH_CHECK(false, "Unsupported input data type of MLA");
-      }
-      return true;
-    });
-    return true;
-  });
-}
-
-int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) {
-  // Workspace size depends on ElementAcc and ElementLSE (same as ElementAcc)
-  // which are float, so Element type here doesn't matter.
-  using MlaSm100Type = MlaSm100<cutlass::half_t, true>;
-
-  // Get split kv. Requires problem shape and sm_count only.
-  typename MlaSm100Type::Fmha::Arguments arguments;
-  using TileShapeH = typename MlaSm100Type::TileShapeH;
-  using TileShapeD = typename MlaSm100Type::TileShapeD;
-  arguments.problem_shape =
-      cute::make_tuple(TileShapeH{}, static_cast<int>(max_seq_len), TileShapeD{}, static_cast<int>(num_batches));
-  // Assumes device 0 when getting sm_count.
-  arguments.hw_info.sm_count =
-      sm_count <= 0 ? cutlass::KernelHardwareInfo::query_device_multiprocessor_count(/*device_id=*/0) : sm_count;
-  arguments.split_kv = num_kv_splits;
-  MlaSm100Type::Fmha::set_split_kv(arguments);
-
-  return MlaSm100Type::Fmha::get_workspace_size(arguments);
-}
-
-#endif
-
-TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
-  m.impl("sm100_cutlass_mla_decode", &sm100_cutlass_mla_decode);
-}
-
-TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CatchAll, m) {
-  m.impl("sm100_cutlass_mla_get_workspace_size", &sm100_cutlass_mla_get_workspace_size);
-}
-
-// clang-format on
--- a/csrc/attention/paged_attention_v1.cu
+++ b/csrc/attention/paged_attention_v1.cu
@ -16,8 +16,14 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+
 #include "attention_kernels.cuh"
-#include "../cuda_compat.h"
+
+#ifndef USE_ROCM
+  #define WARP_SIZE 32
+#else
+  #define WARP_SIZE warpSize
+#endif

 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -74,7 +80,7 @@ void paged_attention_v1_launcher(
  const float* k_scale_ptr = reinterpret_cast<const float*>(k_scale.data_ptr());
  const float* v_scale_ptr = reinterpret_cast<const float*>(v_scale.data_ptr());

-  const int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
  int padded_max_seq_len =
      DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
  int logits_size = padded_max_seq_len * sizeof(float);
@ -181,6 +187,7 @@ void paged_attention_v1(
                             CALL_V1_LAUNCHER_BLOCK_SIZE)
 }

+#undef WARP_SIZE
 #undef MAX
 #undef MIN
 #undef DIVIDE_ROUND_UP
--- a/csrc/attention/paged_attention_v2.cu
+++ b/csrc/attention/paged_attention_v2.cu
@ -16,8 +16,14 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+
 #include "attention_kernels.cuh"
-#include "../cuda_compat.h"
+
+#ifndef USE_ROCM
+  #define WARP_SIZE 32
+#else
+  #define WARP_SIZE warpSize
+#endif

 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -78,7 +84,7 @@ void paged_attention_v2_launcher(
  const float* k_scale_ptr = reinterpret_cast<const float*>(k_scale.data_ptr());
  const float* v_scale_ptr = reinterpret_cast<const float*>(v_scale.data_ptr());

-  const int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
  int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
  int logits_size = PARTITION_SIZE * sizeof(float);
  int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
@ -191,6 +197,7 @@ void paged_attention_v2(
                             CALL_V2_LAUNCHER_BLOCK_SIZE)
 }

+#undef WARP_SIZE
 #undef MAX
 #undef MIN
 #undef DIVIDE_ROUND_UP
--- a/csrc/cpu/cpu_types_arm.hpp
+++ b/csrc/cpu/cpu_types_arm.hpp
@ -33,8 +33,6 @@ namespace vec_op {
 #endif

 #define FORCE_INLINE __attribute__((always_inline)) inline
-// Number of elements in single ASIMD vector of given Datatype
-#define NUM_ELEMENTS_REG(vec) (sizeof(vec) / sizeof(vec[0]))

 namespace {
 template <typename T, T... indexes, typename F>
@ -88,8 +86,8 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
  }

  void save(void* ptr, const int elem_num) const {
-    int full_blocks = elem_num / NUM_ELEMENTS_REG(reg.val[0]);
-    int remainder = elem_num % NUM_ELEMENTS_REG(reg.val[0]);
+    int full_blocks = elem_num / 8;
+    int remainder = elem_num % 8;

    if (full_blocks > 0) {
      vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]);
@ -199,25 +197,6 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
             vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3])}) {};

  void save(void* ptr) const { *reinterpret_cast<bfloat16x8x2_t*>(ptr) = reg; };
-  void save(void* ptr, const int elem_num) const {
-    int full_blocks = elem_num / NUM_ELEMENTS_REG(reg.val[0]);
-    int remainder = elem_num % NUM_ELEMENTS_REG(reg.val[0]);
-    for (int i = 0; i < full_blocks; i++)
-      vst1q_bf16(
-          reinterpret_cast<__bf16*>(ptr) + NUM_ELEMENTS_REG(reg.val[0]) * i,
-          reg.val[i]);
-    if (remainder > 0) {
-      bfloat16x8_t temp = reg.val[full_blocks];
-      bfloat16_t* base = reinterpret_cast<bfloat16_t*>(ptr) + full_blocks * 8;
-      if (remainder > 0) base[0] = vgetq_lane_bf16(temp, 0);
-      if (remainder > 1) base[1] = vgetq_lane_bf16(temp, 1);
-      if (remainder > 2) base[2] = vgetq_lane_bf16(temp, 2);
-      if (remainder > 3) base[3] = vgetq_lane_bf16(temp, 3);
-      if (remainder > 4) base[4] = vgetq_lane_bf16(temp, 4);
-      if (remainder > 5) base[5] = vgetq_lane_bf16(temp, 5);
-      if (remainder > 6) base[6] = vgetq_lane_bf16(temp, 6);
-    }
-  };
 };

 struct BF16Vec32 : public Vec<BF16Vec32> {
@ -234,25 +213,6 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
      : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {};

  void save(void* ptr) const { *reinterpret_cast<bfloat16x8x4_t*>(ptr) = reg; };
-  void save(void* ptr, const int elem_num) const {
-    int full_blocks = elem_num / NUM_ELEMENTS_REG(reg.val[0]);
-    int remainder = elem_num % NUM_ELEMENTS_REG(reg.val[0]);
-    for (int i = 0; i < full_blocks; i++)
-      vst1q_bf16(
-          reinterpret_cast<__bf16*>(ptr) + NUM_ELEMENTS_REG(reg.val[0]) * i,
-          reg.val[i]);
-    if (remainder > 0) {
-      bfloat16x8_t temp = reg.val[full_blocks];
-      bfloat16_t* base = reinterpret_cast<bfloat16_t*>(ptr) + full_blocks * 8;
-      base[0] = vgetq_lane_bf16(temp, 0);
-      if (remainder > 1) base[1] = vgetq_lane_bf16(temp, 1);
-      if (remainder > 2) base[2] = vgetq_lane_bf16(temp, 2);
-      if (remainder > 3) base[3] = vgetq_lane_bf16(temp, 3);
-      if (remainder > 4) base[4] = vgetq_lane_bf16(temp, 4);
-      if (remainder > 5) base[5] = vgetq_lane_bf16(temp, 5);
-      if (remainder > 6) base[6] = vgetq_lane_bf16(temp, 6);
-    }
-  };
 };
 #endif

@ -412,48 +372,6 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
  }
 };

-struct INT32Vec16 : public Vec<INT32Vec16> {
-  constexpr static int VEC_ELEM_NUM = 16;
-  union AliasReg {
-    int32x4x4_t reg;
-    int32_t values[VEC_ELEM_NUM];
-  };
-  int32x4x4_t reg;
-
-  explicit INT32Vec16(const void* ptr) {
-    reg.val[0] = vld1q_s32(reinterpret_cast<const int32_t*>(ptr));
-    reg.val[1] = vld1q_s32(reinterpret_cast<const int32_t*>(ptr) + 4);
-    reg.val[2] = vld1q_s32(reinterpret_cast<const int32_t*>(ptr) + 8);
-    reg.val[3] = vld1q_s32(reinterpret_cast<const int32_t*>(ptr) + 12);
-  }
-
-  void save(int32_t* ptr) const {
-    vst1q_s32(ptr, reg.val[0]);
-    vst1q_s32(ptr + 4, reg.val[1]);
-    vst1q_s32(ptr + 8, reg.val[2]);
-    vst1q_s32(ptr + 12, reg.val[3]);
-  };
-
-  void save(int32_t* ptr, const int elem_num) const {
-    int full_blocks = elem_num / NUM_ELEMENTS_REG(reg.val[0]);
-    int remainder = elem_num % NUM_ELEMENTS_REG(reg.val[0]);
-
-    for (int i = 0; i < full_blocks; i++)
-      vst1q_s32(
-          reinterpret_cast<__int32_t*>(ptr) + NUM_ELEMENTS_REG(reg.val[0]) * i,
-          reg.val[i]);
-
-    if (remainder > 0) {
-      int32x4_t temp = reg.val[full_blocks];
-      int32_t* base = reinterpret_cast<int32_t*>(ptr) + full_blocks * 4;
-      if (remainder > 0) base[0] = vgetq_lane_s32(temp, 0);
-      if (remainder > 1) base[1] = vgetq_lane_s32(temp, 1);
-      if (remainder > 2) base[2] = vgetq_lane_s32(temp, 2);
-      if (remainder > 3) base[3] = vgetq_lane_s32(temp, 3);
-    }
-  }
-};
-
 struct FP32Vec16 : public Vec<FP32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  union AliasReg {
@ -516,12 +434,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
    reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1]));
    reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1]));
  };
-  explicit FP32Vec16(const INT32Vec16& v) {
-    reg.val[0] = vcvtq_f32_s32(v.reg.val[0]);
-    reg.val[1] = vcvtq_f32_s32(v.reg.val[1]);
-    reg.val[2] = vcvtq_f32_s32(v.reg.val[2]);
-    reg.val[3] = vcvtq_f32_s32(v.reg.val[3]);
-  };
+
  FP32Vec16 operator+(const FP32Vec16& b) const {
    return FP32Vec16(float32x4x4_t({vaddq_f32(reg.val[0], b.reg.val[0]),
                                    vaddq_f32(reg.val[1], b.reg.val[1]),
@ -550,85 +463,6 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
                                    vdivq_f32(reg.val[3], b.reg.val[3])}));
  };

-  FP32Vec16 clamp(const FP32Vec16& min, const FP32Vec16& max) const {
-    return FP32Vec16(float32x4x4_t(
-        {vminq_f32(max.reg.val[0], vmaxq_f32(min.reg.val[0], reg.val[0])),
-         vminq_f32(max.reg.val[1], vmaxq_f32(min.reg.val[1], reg.val[1])),
-         vminq_f32(max.reg.val[2], vmaxq_f32(min.reg.val[2], reg.val[2])),
-         vminq_f32(max.reg.val[3], vmaxq_f32(min.reg.val[3], reg.val[3]))}));
-  };
-
-  FP32Vec16 max(const FP32Vec16& b) const {
-    return FP32Vec16(float32x4x4_t({vmaxq_f32(b.reg.val[0], reg.val[0]),
-                                    vmaxq_f32(b.reg.val[1], reg.val[1]),
-                                    vmaxq_f32(b.reg.val[2], reg.val[2]),
-                                    vmaxq_f32(b.reg.val[3], reg.val[3])}));
-  };
-
-  FP32Vec16 max(const FP32Vec16& b, const int elem_num) const {
-    int full_blocks = elem_num / NUM_ELEMENTS_REG(reg.val[0]);
-    int remainder = elem_num % NUM_ELEMENTS_REG(reg.val[0]);
-    float32x4x4_t temp;
-
-    for (int i = 0; i < full_blocks; i++)
-      temp.val[i] = vmaxq_f32(b.reg.val[i], reg.val[i]);
-
-    if (remainder > 0) {
-      float max_v = std::max(vgetq_lane_f32(reg.val[full_blocks], 0),
-                             vgetq_lane_f32(b.reg.val[full_blocks], 0));
-      temp.val[full_blocks] = vsetq_lane_f32(max_v, temp.val[full_blocks], 0);
-    }
-    if (remainder > 1) {
-      float max_v = std::max(vgetq_lane_f32(reg.val[full_blocks], 1),
-                             vgetq_lane_f32(b.reg.val[full_blocks], 1));
-      temp.val[full_blocks] = vsetq_lane_f32(max_v, temp.val[full_blocks], 1);
-    }
-    if (remainder > 2) {
-      float max_v = std::max(vgetq_lane_f32(reg.val[full_blocks], 2),
-                             vgetq_lane_f32(b.reg.val[full_blocks], 2));
-      temp.val[full_blocks] = vsetq_lane_f32(max_v, temp.val[full_blocks], 2);
-    }
-    return FP32Vec16(temp);
-  };
-
-  FP32Vec16 min(const FP32Vec16& b) const {
-    return FP32Vec16(float32x4x4_t({
-        vminq_f32(b.reg.val[0], reg.val[0]),
-        vminq_f32(b.reg.val[1], reg.val[1]),
-        vminq_f32(b.reg.val[2], reg.val[2]),
-        vminq_f32(b.reg.val[3], reg.val[3]),
-    }));
-  };
-  FP32Vec16 min(const FP32Vec16& b, const int elem_num) const {
-    int full_blocks = elem_num / NUM_ELEMENTS_REG(reg.val[0]);
-    const int remainder = elem_num % NUM_ELEMENTS_REG(reg.val[0]);
-    float32x4x4_t temp;
-    for (int i = 0; i < full_blocks; i++)
-      temp.val[i] = vminq_f32(b.reg.val[i], reg.val[i]);
-
-    if (remainder > 0) {
-      float min_v = std::min(vgetq_lane_f32(reg.val[full_blocks], 0),
-                             vgetq_lane_f32(b.reg.val[full_blocks], 0));
-      temp.val[full_blocks] = vsetq_lane_f32(min_v, temp.val[full_blocks], 0);
-    }
-    if (remainder > 1) {
-      float min_v = std::min(vgetq_lane_f32(reg.val[full_blocks], 1),
-                             vgetq_lane_f32(b.reg.val[full_blocks], 1));
-      temp.val[full_blocks] = vsetq_lane_f32(min_v, temp.val[full_blocks], 1);
-    }
-    if (remainder > 2) {
-      float min_v = std::min(vgetq_lane_f32(reg.val[full_blocks], 2),
-                             vgetq_lane_f32(b.reg.val[full_blocks], 2));
-      temp.val[full_blocks] = vsetq_lane_f32(min_v, temp.val[full_blocks], 2);
-    }
-
-    return FP32Vec16(temp);
-  };
-  FP32Vec16 abs() const {
-    return FP32Vec16(
-        float32x4x4_t({vabsq_f32(reg.val[0]), vabsq_f32(reg.val[1]),
-                       vabsq_f32(reg.val[2]), vabsq_f32(reg.val[3])}));
-  }
  float reduce_sum() const {
    AliasReg ar;
    ar.reg = reg;
@ -639,24 +473,6 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
    return answer;
  };

-  float reduce_max() const {
-    AliasReg ar;
-    ar.reg = reg;
-    float max_v = std::numeric_limits<float>::lowest();
-    unroll_loop<int, VEC_ELEM_NUM>(
-        [&max_v, &ar](int i) { max_v = std::max(max_v, ar.values[i]); });
-    return max_v;
-  }
-
-  float reduce_min() const {
-    AliasReg ar;
-    ar.reg = reg;
-    float min_v = std::numeric_limits<float>::max();
-    unroll_loop<int, VEC_ELEM_NUM>(
-        [&min_v, &ar](int i) { min_v = std::min(min_v, ar.values[i]); });
-    return min_v;
-  }
-
  template <int group_size>
  float reduce_sub_sum(int idx) {
    static_assert(VEC_ELEM_NUM % group_size == 0);
@ -677,83 +493,6 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
    vst1q_f32(ptr + 8, reg.val[2]);
    vst1q_f32(ptr + 12, reg.val[3]);
  };
-
-  void save(float* ptr, const int elem_num) const {
-    int full_blocks = elem_num / NUM_ELEMENTS_REG(reg.val[0]);
-    int remainder = elem_num % NUM_ELEMENTS_REG(reg.val[0]);
-
-    for (int i = 0; i < full_blocks; i++)
-      vst1q_f32(
-          reinterpret_cast<float32_t*>(ptr) + NUM_ELEMENTS_REG(reg.val[0]) * i,
-          reg.val[i]);
-
-    if (remainder > 0) {
-      float32x4_t temp = reg.val[full_blocks];
-      float* base = reinterpret_cast<float32_t*>(ptr) +
-                    full_blocks * NUM_ELEMENTS_REG(reg.val[0]);
-      if (remainder > 0) base[0] = vgetq_lane_f32(temp, 0);
-      if (remainder > 1) base[1] = vgetq_lane_f32(temp, 1);
-      if (remainder > 2) base[2] = vgetq_lane_f32(temp, 2);
-    }
-  }
-};
-
-struct INT8Vec16 : public Vec<INT8Vec16> {
-  constexpr static int VEC_ELEM_NUM = 16;
-  union AliasReg {
-    int8x16_t reg;
-    int8_t values[VEC_ELEM_NUM];
-  };
-  int8x16_t reg;
-
-  explicit INT8Vec16(const FP32Vec16& vec) {
-    // Convert each 128-bit float32 vector to int32
-    int32x4_t part0 =
-        vcvtq_s32_f32(vec.reg.val[0]);  // Convert first 128-bit block
-    int32x4_t part1 =
-        vcvtq_s32_f32(vec.reg.val[1]);  // Convert second 128-bit block
-    int32x4_t part2 =
-        vcvtq_s32_f32(vec.reg.val[2]);  // Convert third 128-bit block
-    int32x4_t part3 =
-        vcvtq_s32_f32(vec.reg.val[3]);  // Convert fourth 128-bit block
-
-    // Narrow each 32-bit vector to 8 bits and combine
-    int8x8_t lower =
-        vqmovn_s16(vcombine_s16(vqmovn_s32(part0), vqmovn_s32(part1)));
-    int8x8_t upper =
-        vqmovn_s16(vcombine_s16(vqmovn_s32(part2), vqmovn_s32(part3)));
-    reg = vcombine_s8(lower, upper);  // Combine to form a single 128-bit vector
-  }
-
-  void save(int8_t* ptr) const { vst1q_s8(ptr, reg); };
-
-  void save(int8_t* ptr, const int elem_num) const {
-    int full_blocks = elem_num / NUM_ELEMENTS_REG(reg);
-    int remainder = elem_num % NUM_ELEMENTS_REG(reg);
-
-    for (int i = 0; i < full_blocks; i++)
-      vst1q_s8(reinterpret_cast<int8_t*>(ptr) + NUM_ELEMENTS_REG(reg) * i, reg);
-    if (remainder > 0) {
-      int8x16_t temp = reg;
-      int8_t* base =
-          reinterpret_cast<int8_t*>(ptr) + full_blocks * NUM_ELEMENTS_REG(reg);
-      if (remainder > 0) base[0] = vgetq_lane_s8(temp, 0);
-      if (remainder > 1) base[1] = vgetq_lane_s8(temp, 1);
-      if (remainder > 2) base[2] = vgetq_lane_s8(temp, 2);
-      if (remainder > 3) base[3] = vgetq_lane_s8(temp, 3);
-      if (remainder > 4) base[4] = vgetq_lane_s8(temp, 4);
-      if (remainder > 5) base[5] = vgetq_lane_s8(temp, 5);
-      if (remainder > 6) base[6] = vgetq_lane_s8(temp, 6);
-      if (remainder > 7) base[7] = vgetq_lane_s8(temp, 7);
-      if (remainder > 8) base[8] = vgetq_lane_s8(temp, 8);
-      if (remainder > 9) base[9] = vgetq_lane_s8(temp, 9);
-      if (remainder > 10) base[10] = vgetq_lane_s8(temp, 10);
-      if (remainder > 11) base[11] = vgetq_lane_s8(temp, 11);
-      if (remainder > 12) base[12] = vgetq_lane_s8(temp, 12);
-      if (remainder > 13) base[13] = vgetq_lane_s8(temp, 13);
-      if (remainder > 14) base[14] = vgetq_lane_s8(temp, 14);
-    }
-  };
 };

 template <typename T>
--- a/csrc/cpu/dnnl_helper.hpp
+++ b/csrc/cpu/dnnl_helper.hpp
@ -57,7 +57,6 @@ class DNNLPrimitiveHelper {
  // Note: Due to the limitation of oneDNN
  // (https://github.com/oneapi-src/oneDNN/issues/1636), the quantized bias is
  // not supported.
-
  template <typename OutputT, typename BiasT>
  static void gemm_s8s8_jit(const int8_t* a, const int8_t* b, OutputT* c,
                            const BiasT* bias, dnnl_dim_t M, dnnl_dim_t N,
@ -91,27 +90,6 @@ class DNNLPrimitiveHelper {
    }

    dnnl::matmul::primitive_desc matmul_pd;
-// Create memory descriptors with format_tag::any for the primitive. This
-// enables the matmul primitive to choose memory layouts for an
-// optimized primitive implementation, and these layouts may differ from the
-// ones provided by the user.
-#ifdef __aarch64__
-    auto mat_src_md = dnnl::memory::desc({M, K}, dnnl::memory::data_type::s8,
-                                         dnnl::memory::format_tag::any);
-    auto mat_weights_md = dnnl::memory::desc(
-        {K, N}, dnnl::memory::data_type::s8, dnnl::memory::format_tag::any);
-    auto mat_dst_md =
-        dnnl::memory::desc({M, N}, OutputType, dnnl::memory::format_tag::any);
-    if (bias) {
-      dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1});
-      matmul_pd = dnnl::matmul::primitive_desc(default_engine(), mat_src_md,
-                                               mat_weights_md, bias_md,
-                                               mat_dst_md, attr);
-    } else {
-      matmul_pd = dnnl::matmul::primitive_desc(
-          default_engine(), mat_src_md, mat_weights_md, mat_dst_md, attr);
-    }
-#else
    if (bias) {
      dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1});
      matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
@ -120,7 +98,6 @@ class DNNLPrimitiveHelper {
      matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
                                               c_md, attr);
    }
-#endif
    dnnl::matmul matmul(matmul_pd);

    auto& engine = default_engine();
@ -134,34 +111,24 @@ class DNNLPrimitiveHelper {
                            (void*)b_scales);

    auto& stream = default_stream();
-
-    auto mat_src_mem = a_m;
-    auto mat_weights_mem = b_m;
-    auto mat_dst_mem = c_m;
-#ifdef __aarch64__
-    if (matmul_pd.weights_desc() != b_m.get_desc()) {
-      mat_weights_mem = dnnl::memory(matmul_pd.weights_desc(), engine);
-      dnnl::reorder(b_m, mat_weights_mem).execute(stream, b_m, mat_weights_mem);
-    }
-#endif
    if constexpr (InputNoScale) {
      if (bias) {
        dnnl::memory::desc bias_md({N}, BiasType, {1});
        dnnl::memory bias_m(bias_md, engine, (void*)bias);
        matmul.execute(
            stream, {
-                        {DNNL_ARG_SRC, mat_src_mem},
-                        {DNNL_ARG_WEIGHTS, mat_weights_mem},
+                        {DNNL_ARG_SRC, a_m},
+                        {DNNL_ARG_WEIGHTS, b_m},
                        {DNNL_ARG_BIAS, bias_m},
-                        {DNNL_ARG_DST, mat_dst_mem},
+                        {DNNL_ARG_DST, c_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
                    });
      } else {
        matmul.execute(
            stream, {
-                        {DNNL_ARG_SRC, mat_src_mem},
-                        {DNNL_ARG_WEIGHTS, mat_weights_mem},
-                        {DNNL_ARG_DST, mat_dst_mem},
+                        {DNNL_ARG_SRC, a_m},
+                        {DNNL_ARG_WEIGHTS, b_m},
+                        {DNNL_ARG_DST, c_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
                    });
      }
@ -171,19 +138,19 @@ class DNNLPrimitiveHelper {
        dnnl::memory bias_m(bias_md, engine, (void*)bias);
        matmul.execute(
            stream, {
-                        {DNNL_ARG_SRC, mat_src_mem},
-                        {DNNL_ARG_WEIGHTS, mat_weights_mem},
+                        {DNNL_ARG_SRC, a_m},
+                        {DNNL_ARG_WEIGHTS, b_m},
                        {DNNL_ARG_BIAS, bias_m},
-                        {DNNL_ARG_DST, mat_dst_mem},
+                        {DNNL_ARG_DST, c_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
                    });
      } else {
        matmul.execute(
            stream, {
-                        {DNNL_ARG_SRC, mat_src_mem},
-                        {DNNL_ARG_WEIGHTS, mat_weights_mem},
-                        {DNNL_ARG_DST, mat_dst_mem},
+                        {DNNL_ARG_SRC, a_m},
+                        {DNNL_ARG_WEIGHTS, b_m},
+                        {DNNL_ARG_DST, c_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
                    });
@ -203,4 +170,5 @@ class DNNLPrimitiveHelper {
    return stream;
  }
 };
+
 #endif
--- a/csrc/cpu/quant.cpp
+++ b/csrc/cpu/quant.cpp
@ -36,7 +36,7 @@ struct KernelVecType<c10::Half> {
  using cvt_vec_type = vec_op::FP32Vec16;
 };

-#if defined(__AVX512F__) || defined(__aarch64__)
+#ifdef __AVX512F__
 template <bool AZP, typename scalar_t>
 void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
                                   const float* scale, const int32_t* azp,
@ -598,9 +598,8 @@ void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
                                   const float* scale, const int32_t* azp,
                                   const int num_tokens,
                                   const int hidden_size) {
-  TORCH_CHECK(false,
-              "static_scaled_int8_quant_impl requires AVX512/powerpc64/AArch64 "
-              "support.")
+  TORCH_CHECK(
+      false, "static_scaled_int8_quant_impl requires AVX512/powerpc64 support.")
 }

 template <typename scalar_t>
@ -608,9 +607,9 @@ void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
                                    float* scale, int32_t* azp,
                                    const int num_tokens,
                                    const int hidden_size) {
-  TORCH_CHECK(false,
-              "dynamic_scaled_int8_quant_impl requires "
-              "AVX512/powerpc64/AArch64 support.")
+  TORCH_CHECK(
+      false,
+      "dynamic_scaled_int8_quant_impl requires AVX512/powerpc64 support.")
 }

 template <bool PerChannel, typename scalar_t>
@ -618,8 +617,7 @@ void static_quant_epilogue(const float* input, scalar_t* output,
                           const float a_scale, const float* b_scale,
                           const int32_t* azp_with_adj, const int num_tokens,
                           const int hidden_size) {
-  TORCH_CHECK(
-      false, "static_quant_epilogue requires AVX512/powerpc64/AArch64 support.")
+  TORCH_CHECK(false, "static_quant_epilogue requires AVX512/powerpc64 support.")
 }

 template <typename scalar_t>
@ -628,9 +626,8 @@ void dynamic_quant_epilogue(const float* input, scalar_t* output,
                            const int32_t* azp, const int32_t* azp_with_adj,
                            const scalar_t* bias, const int num_tokens,
                            const int hidden_size) {
-  TORCH_CHECK(
-      false,
-      "dynamic_quant_epilogue requires AVX512/powerpc64/AArch64 support.")
+  TORCH_CHECK(false,
+              "dynamic_quant_epilogue requires AVX512/powerpc64 support.")
 }
 #endif
 }  // namespace
--- a/csrc/cpu/sgl-kernels/common.h
+++ b/csrc/cpu/sgl-kernels/common.h
@ -58,7 +58,7 @@ namespace {

 #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
 #define CHECK_LAST_DIM_CONTIGUOUS(x) \
-  TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimension")
+  TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimention")

 #define CHECK_INPUT(x) \
  CHECK_CPU(x);        \
--- a/csrc/cpu/sgl-kernels/gemm.h
+++ b/csrc/cpu/sgl-kernels/gemm.h
@ -126,7 +126,7 @@ void fused_experts_int4_w4a16_kernel_impl(
    int64_t topk,
    int64_t num_tokens_post_pad);

-// shared expert implementation for int8 w8a8
+// shared expert implementation for int8 w8a8
 template <typename scalar_t>
 void shared_expert_int8_kernel_impl(
    scalar_t* __restrict__ output,
--- a/csrc/cpu/sgl-kernels/gemm_int8.cpp
+++ b/csrc/cpu/sgl-kernels/gemm_int8.cpp
@ -41,7 +41,7 @@ struct tinygemm_kernel_nn<at::BFloat16, has_bias, BLOCK_M, BLOCK_N> {
    __m512  vd0;
    __m512  vd1[COLS];

-    // oops! 4x4 spills but luckily we use 4x2
+    // oops! 4x4 spills but luckily we use 4x2
    __m512 vbias[COLS];

    // [NOTE]: s8s8 igemm compensation in avx512-vnni
--- a/csrc/cpu/sgl-kernels/vec.h
+++ b/csrc/cpu/sgl-kernels/vec.h
@ -37,7 +37,7 @@ inline Vectorized<at::BFloat16> convert_from_float_ext<at::BFloat16>(const Vecto
 #define CVT_FP16_TO_FP32(a) \
    _mm512_cvtps_ph(a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))

-// this doesn't handle NaN.
+// this doesn't handle NaN.
 inline __m512bh cvt_e4m3_bf16_intrinsic_no_nan(__m256i fp8_vec) {
  const __m512i x = _mm512_cvtepu8_epi16(fp8_vec);

--- a/csrc/cpu/shm.cpp
+++ b/csrc/cpu/shm.cpp
@ -7,7 +7,7 @@

 namespace {
 #define MAX_SHM_RANK_NUM 8
-#define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024)
+#define PER_THREAD_SHM_BUFFER_BYTES (2 * 1024 * 1024)
 static_assert(PER_THREAD_SHM_BUFFER_BYTES % 2 == 0);
 #define PER_THREAD_SHM_BUFFER_OFFSET (PER_THREAD_SHM_BUFFER_BYTES >> 1)
 #define MIN_THREAD_PROCESS_SIZE (256)
@ -34,10 +34,9 @@ struct KernelVecType<c10::Half> {
 };

 struct ThreadSHMContext {
-  volatile char _curr_thread_stamp[2];
-  volatile char _ready_thread_stamp[2];
-  int local_stamp_buffer_idx;
-  int remote_stamp_buffer_idx;
+  volatile char _curr_thread_stamp;
+  volatile char _ready_thread_stamp;
+  char _padding1[6];
  int thread_id;
  int thread_num;
  int rank;
@ -46,28 +45,23 @@ struct ThreadSHMContext {
  int swizzled_ranks[MAX_SHM_RANK_NUM];
  void* thread_shm_ptrs[MAX_SHM_RANK_NUM];
  ThreadSHMContext* shm_contexts[MAX_SHM_RANK_NUM];
-  size_t _thread_buffer_mask[2];
-  char _padding2[40];
+  size_t _thread_buffer_mask;
+  char _padding2[56];

  ThreadSHMContext(const int thread_id, const int thread_num, const int rank,
                   const int group_size, void* thread_shm_ptr)
-      : local_stamp_buffer_idx(0),
-        remote_stamp_buffer_idx(0),
+      : _curr_thread_stamp(1),
+        _ready_thread_stamp(0),
        thread_id(thread_id),
        thread_num(thread_num),
        rank(rank),
        group_size(group_size),
-        _spinning_count(0) {
+        _spinning_count(0),
+        _thread_buffer_mask(0) {
    static_assert(sizeof(ThreadSHMContext) % 64 == 0);
    TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM);
    TORCH_CHECK((size_t)this % 64 == 0);
    TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0);
-    _curr_thread_stamp[0] = 1;
-    _curr_thread_stamp[1] = 1;
-    _ready_thread_stamp[0] = 0;
-    _ready_thread_stamp[1] = 0;
-    _thread_buffer_mask[0] = 0;
-    _thread_buffer_mask[1] = 0;
    for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
      shm_contexts[i] = nullptr;
      thread_shm_ptrs[i] = nullptr;
@ -76,11 +70,6 @@ struct ThreadSHMContext {
    set_context(rank, this, thread_shm_ptr);
  }

-  void set_stamp_buffer_idx(int local, int remote) {
-    local_stamp_buffer_idx = local;
-    remote_stamp_buffer_idx = remote;
-  }
-
  void set_context(int rank, ThreadSHMContext* ptr, void* thread_shm_ptr) {
    TORCH_CHECK(rank < MAX_SHM_RANK_NUM);
    TORCH_CHECK(ptr);
@ -95,27 +84,23 @@ struct ThreadSHMContext {
  T* get_thread_shm_ptr(int rank) {
    return reinterpret_cast<T*>(
        reinterpret_cast<int8_t*>(thread_shm_ptrs[rank]) +
-        (PER_THREAD_SHM_BUFFER_OFFSET &
-         _thread_buffer_mask[local_stamp_buffer_idx]));
+        (PER_THREAD_SHM_BUFFER_OFFSET & _thread_buffer_mask));
  }

-  void next_buffer() {
-    _thread_buffer_mask[local_stamp_buffer_idx] ^= 0xFFFFFFFFFFFFFFFF;
-  }
+  void next_buffer() { _thread_buffer_mask ^= 0xFFFFFFFFFFFFFFFF; }

-  char get_curr_stamp(int idx) const { return _curr_thread_stamp[idx]; }
+  char get_curr_stamp() const { return _curr_thread_stamp; }

-  char get_ready_stamp(int idx) const { return _ready_thread_stamp[idx]; }
+  char get_ready_stamp() const { return _ready_thread_stamp; }

  void next_stamp() {
    _mm_mfence();
-    _curr_thread_stamp[local_stamp_buffer_idx] += 1;
+    _curr_thread_stamp += 1;
  }

  void commit_ready_stamp() {
    _mm_mfence();
-    _ready_thread_stamp[local_stamp_buffer_idx] =
-        _curr_thread_stamp[local_stamp_buffer_idx];
+    _ready_thread_stamp = _curr_thread_stamp;
  }

  int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; }
@ -132,11 +117,10 @@ struct ThreadSHMContext {
  void wait_for_one(int rank, Cond&& cond) {
    ThreadSHMContext* rank_ctx = shm_contexts[rank];
    for (;;) {
-      char local_curr_stamp = get_curr_stamp(local_stamp_buffer_idx);
-      char local_ready_stamp = get_ready_stamp(local_stamp_buffer_idx);
-      char rank_curr_stamp = rank_ctx->get_curr_stamp(remote_stamp_buffer_idx);
-      char rank_ready_stamp =
-          rank_ctx->get_ready_stamp(remote_stamp_buffer_idx);
+      char local_curr_stamp = get_curr_stamp();
+      char local_ready_stamp = get_ready_stamp();
+      char rank_curr_stamp = rank_ctx->get_curr_stamp();
+      char rank_ready_stamp = rank_ctx->get_ready_stamp();
      if (cond(local_curr_stamp, local_ready_stamp, rank_curr_stamp,
               rank_ready_stamp)) {
        break;
@ -377,15 +361,6 @@ void shm_cc_loop(ThreadSHMContext* ctx, int64_t elem_num, F&& inner_func) {
    }
  }
 }
-
-void reset_threads_stamp_buffer_idx(ThreadSHMContext* ctx, int local,
-                                    int remote) {
-  int thread_num = ctx->thread_num;
-  for (int i = 0; i < thread_num; ++i) {
-    ThreadSHMContext* thread_ctx = ctx + i;
-    thread_ctx->set_stamp_buffer_idx(local, remote);
-  }
-}
 };  // namespace shm_cc_ops

 namespace shm_cc_ops {
@ -657,7 +632,6 @@ void shm_send_tensor_list_impl(ThreadSHMContext* ctx, int64_t dst,
  TensorListMeta* metadata = new (metadata_tensor.data_ptr()) TensorListMeta();
  metadata->bind_tensor_list(tensor_list_with_metadata);

-  shm_cc_ops::reset_threads_stamp_buffer_idx(ctx, 0, 1);
  shm_cc_ops::shm_cc_loop<int8_t>(
      ctx, metadata->total_bytes,
      [&](ThreadSHMContext* thread_ctx, int64_t data_offset,
@ -685,7 +659,6 @@ std::vector<torch::Tensor> shm_recv_tensor_list_impl(ThreadSHMContext* ctx,
  torch::Tensor metadata_tensor =
      torch::empty({sizeof(TensorListMeta)}, options);

-  shm_cc_ops::reset_threads_stamp_buffer_idx(ctx, 1, 0);
  ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready);
  shm_cc_ops::memcpy(metadata_tensor.data_ptr(),
                     ctx->get_thread_shm_ptr<void>(src),
@ -704,7 +677,7 @@ std::vector<torch::Tensor> shm_recv_tensor_list_impl(ThreadSHMContext* ctx,
      ctx, metadata.total_bytes,
      [&](ThreadSHMContext* thread_ctx, int64_t data_offset,
          int64_t data_elem_num, bool fast_mode) {
-        thread_ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready);
+        ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready);
        int64_t curr_shm_offset = 0;
        while (curr_shm_offset < data_elem_num) {
          MemPiece frag = metadata.get_data(data_offset + curr_shm_offset);
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@ -151,9 +151,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);

  // Quantization
-#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
+#ifdef __AVX512F__
  at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
-
  // Compute int8 quantized tensor for given scaling factor.
  ops.def(
      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
--- a/csrc/cuda_compat.h
+++ b/csrc/cuda_compat.h
@ -4,37 +4,10 @@
  #include <hip/hip_runtime.h>
 #endif

-#ifdef USE_ROCM
-struct Utils {
-  static __host__ int get_warp_size() {
-    static bool is_cached = false;
-    static int result;
-
-    if (!is_cached) {
-      int device_id;
-      cudaDeviceProp deviceProp;
-      cudaGetDevice(&device_id);
-      cudaGetDeviceProperties(&deviceProp, device_id);
-
-      result = deviceProp.warpSize;
-      is_cached = true;
-    }
-
-    return result;
-  }
-
-  static __device__ constexpr int get_warp_size() {
-  #ifdef __GFX9__
-    return 64;
-  #else
-    return 32;
-  #endif
-  }
-};
-
-  #define WARP_SIZE Utils::get_warp_size()
-#else
+#ifndef USE_ROCM
  #define WARP_SIZE 32
+#else
+  #define WARP_SIZE warpSize
 #endif

 #ifndef USE_ROCM
--- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp
+++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp
@ -153,7 +153,7 @@ struct ScaledEpilogueBias
      cutlass::epilogue::threadblock::Sm80EVT<Compute0, ScaleB, Accum>;

  using Compute1 = cutlass::epilogue::threadblock::VisitorCompute<
-      cutlass::homogeneous_multiply_add, ElementD, float,
+      cutlass::multiply_add, ElementD, float,
      cutlass::FloatRoundStyle::round_to_nearest>;

 public:
@ -210,7 +210,7 @@ struct ScaledEpilogueBiasAzp
                                              EVTComputeAzp>;

  using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute<
-      cutlass::homogeneous_multiply_add, ElementD, float,
+      cutlass::multiply_add, ElementD, float,
      cutlass::FloatRoundStyle::round_to_nearest>;

 public:
@ -288,7 +288,7 @@ struct ScaledEpilogueBiasAzpToken
                                              EVTComputeAcc>;

  using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute<
-      cutlass::homogeneous_multiply_add, ElementD, float,
+      cutlass::multiply_add, ElementD, float,
      cutlass::FloatRoundStyle::round_to_nearest>;

 public:
--- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp
+++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp
@ -195,7 +195,7 @@ struct ScaledEpilogueBias
      cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;

  using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::homogeneous_multiply_add, ElementD, float,
+      cutlass::multiply_add, ElementD, float,
      cutlass::FloatRoundStyle::round_to_nearest>;

 public:
@ -238,7 +238,7 @@ struct ScaledEpilogueColumnBias
      cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;

  using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::homogeneous_multiply_add, ElementD, float,
+      cutlass::multiply_add, ElementD, float,
      cutlass::FloatRoundStyle::round_to_nearest>;

 public:
@ -295,7 +295,7 @@ struct ScaledEpilogueBiasAzp
      cutlass::epilogue::fusion::Sm90EVT<ComputeScaleB, ScaleB, EVTComputeAzp>;

  using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::homogeneous_multiply_add, ElementD, float,
+      cutlass::multiply_add, ElementD, float,
      cutlass::FloatRoundStyle::round_to_nearest>;

 public:
@ -371,7 +371,7 @@ struct ScaledEpilogueBiasAzpToken
      cutlass::epilogue::fusion::Sm90EVT<ComputeScaleB, ScaleB, EVTComputeAcc>;

  using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::homogeneous_multiply_add, ElementD, float,
+      cutlass::multiply_add, ElementD, float,
      cutlass::FloatRoundStyle::round_to_nearest>;

 public:
--- a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
+++ b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
@ -45,6 +45,7 @@
 #include "cute/algorithm/functional.hpp"
 #include "cute/atom/mma_atom.hpp"
 #include "cute/algorithm/gemm.hpp"
+#include "cute/tensor_predicate.hpp"
 #include "cute/numeric/arithmetic_tuple.hpp"

 #include "cutlass_extensions/gemm/dispatch_policy.hpp"
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@ -15,16 +15,15 @@ namespace vllm {
 // TODO(woosuk): Further optimize this kernel.
 template <typename scalar_t>
 __global__ void rms_norm_kernel(
-    scalar_t* __restrict__ out,          // [..., hidden_size]
-    const scalar_t* __restrict__ input,  // [..., hidden_size]
-    const int64_t input_stride,
+    scalar_t* __restrict__ out,           // [..., hidden_size]
+    const scalar_t* __restrict__ input,   // [..., hidden_size]
    const scalar_t* __restrict__ weight,  // [hidden_size]
    const float epsilon, const int num_tokens, const int hidden_size) {
  __shared__ float s_variance;
  float variance = 0.0f;

  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    const float x = (float)input[blockIdx.x * input_stride + idx];
+    const float x = (float)input[blockIdx.x * hidden_size + idx];
    variance += x * x;
  }

@ -38,7 +37,7 @@ __global__ void rms_norm_kernel(
  __syncthreads();

  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    float x = (float)input[blockIdx.x * input_stride + idx];
+    float x = (float)input[blockIdx.x * hidden_size + idx];
    out[blockIdx.x * hidden_size + idx] =
        ((scalar_t)(x * s_variance)) * weight[idx];
  }
@ -51,8 +50,7 @@ __global__ void rms_norm_kernel(
 template <typename scalar_t, int width>
 __global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
 fused_add_rms_norm_kernel(
-    scalar_t* __restrict__ input,  // [..., hidden_size]
-    const int64_t input_stride,
+    scalar_t* __restrict__ input,         // [..., hidden_size]
    scalar_t* __restrict__ residual,      // [..., hidden_size]
    const scalar_t* __restrict__ weight,  // [hidden_size]
    const float epsilon, const int num_tokens, const int hidden_size) {
@ -61,7 +59,6 @@ fused_add_rms_norm_kernel(
  static_assert(sizeof(_f16Vec<scalar_t, width>) == sizeof(scalar_t) * width);

  const int vec_hidden_size = hidden_size / width;
-  const int64_t vec_input_stride = input_stride / width;
  __shared__ float s_variance;
  float variance = 0.0f;
  /* These and the argument pointers are all declared `restrict` as they are
@ -76,8 +73,7 @@ fused_add_rms_norm_kernel(

  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
    int id = blockIdx.x * vec_hidden_size + idx;
-    int64_t strided_id = blockIdx.x * vec_input_stride + idx;
-    _f16Vec<scalar_t, width> temp = input_v[strided_id];
+    _f16Vec<scalar_t, width> temp = input_v[id];
    temp += residual_v[id];
    variance += temp.sum_squares();
    residual_v[id] = temp;
@ -94,11 +90,10 @@ fused_add_rms_norm_kernel(

  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
    int id = blockIdx.x * vec_hidden_size + idx;
-    int64_t strided_id = blockIdx.x * vec_input_stride + idx;
    _f16Vec<scalar_t, width> temp = residual_v[id];
    temp *= s_variance;
    temp *= weight_v[idx];
-    input_v[strided_id] = temp;
+    input_v[id] = temp;
  }
 }

@ -108,8 +103,7 @@ fused_add_rms_norm_kernel(
 template <typename scalar_t, int width>
 __global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
 fused_add_rms_norm_kernel(
-    scalar_t* __restrict__ input,  // [..., hidden_size]
-    const int64_t input_stride,
+    scalar_t* __restrict__ input,         // [..., hidden_size]
    scalar_t* __restrict__ residual,      // [..., hidden_size]
    const scalar_t* __restrict__ weight,  // [hidden_size]
    const float epsilon, const int num_tokens, const int hidden_size) {
@ -117,7 +111,7 @@ fused_add_rms_norm_kernel(
  float variance = 0.0f;

  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    scalar_t z = input[blockIdx.x * input_stride + idx];
+    scalar_t z = input[blockIdx.x * hidden_size + idx];
    z += residual[blockIdx.x * hidden_size + idx];
    float x = (float)z;
    variance += x * x;
@ -135,7 +129,7 @@ fused_add_rms_norm_kernel(

  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
    float x = (float)residual[blockIdx.x * hidden_size + idx];
-    input[blockIdx.x * input_stride + idx] =
+    input[blockIdx.x * hidden_size + idx] =
        ((scalar_t)(x * s_variance)) * weight[idx];
  }
 }
@ -147,12 +141,11 @@ void rms_norm(torch::Tensor& out,     // [..., hidden_size]
              torch::Tensor& weight,  // [hidden_size]
              double epsilon) {
  TORCH_CHECK(out.is_contiguous());
-  TORCH_CHECK(input.stride(-1) == 1);
+  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(weight.is_contiguous());

  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;
-  int64_t input_stride = input.stride(-2);

  dim3 grid(num_tokens);
  dim3 block(std::min(hidden_size, 1024));
@ -160,29 +153,26 @@ void rms_norm(torch::Tensor& out,     // [..., hidden_size]
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
    vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
-        out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), input_stride,
+        out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
        weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
  });
 }

-#define LAUNCH_FUSED_ADD_RMS_NORM(width)                                    \
-  VLLM_DISPATCH_FLOATING_TYPES(                                             \
-      input.scalar_type(), "fused_add_rms_norm_kernel", [&] {               \
-        vllm::fused_add_rms_norm_kernel<scalar_t, width>                    \
-            <<<grid, block, 0, stream>>>(                                   \
-                input.data_ptr<scalar_t>(), input_stride,                   \
-                residual.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(), \
-                epsilon, num_tokens, hidden_size);                          \
+#define LAUNCH_FUSED_ADD_RMS_NORM(width)                                       \
+  VLLM_DISPATCH_FLOATING_TYPES(                                                \
+      input.scalar_type(), "fused_add_rms_norm_kernel", [&] {                  \
+        vllm::fused_add_rms_norm_kernel<scalar_t, width>                       \
+            <<<grid, block, 0, stream>>>(input.data_ptr<scalar_t>(),           \
+                                         residual.data_ptr<scalar_t>(),        \
+                                         weight.data_ptr<scalar_t>(), epsilon, \
+                                         num_tokens, hidden_size);             \
      });

 void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
                        torch::Tensor& residual,  // [..., hidden_size]
                        torch::Tensor& weight,    // [hidden_size]
                        double epsilon) {
-  TORCH_CHECK(residual.is_contiguous());
-  TORCH_CHECK(weight.is_contiguous());
  int hidden_size = input.size(-1);
-  int64_t input_stride = input.stride(-2);
  int num_tokens = input.numel() / hidden_size;

  dim3 grid(num_tokens);
@ -204,16 +194,9 @@ void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
  auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
  auto res_ptr = reinterpret_cast<std::uintptr_t>(residual.data_ptr());
  auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
-  constexpr int vector_width = 8;
-  constexpr int req_alignment_bytes =
-      vector_width * 2;  // vector_width * sizeof(bfloat16 or float16) (float32
-                         // falls back to non-vectorized version anyway)
-  bool ptrs_are_aligned = inp_ptr % req_alignment_bytes == 0 &&
-                          res_ptr % req_alignment_bytes == 0 &&
-                          wt_ptr % req_alignment_bytes == 0;
-  bool offsets_are_multiple_of_vector_width =
-      hidden_size % vector_width == 0 && input_stride % vector_width == 0;
-  if (ptrs_are_aligned && offsets_are_multiple_of_vector_width) {
+  bool ptrs_are_aligned =
+      inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
+  if (ptrs_are_aligned && hidden_size % 8 == 0) {
    LAUNCH_FUSED_ADD_RMS_NORM(8);
  } else {
    LAUNCH_FUSED_ADD_RMS_NORM(0);
--- a/csrc/layernorm_quant_kernels.cu
+++ b/csrc/layernorm_quant_kernels.cu
@ -23,9 +23,8 @@ namespace vllm {
 // TODO(woosuk): Further optimize this kernel.
 template <typename scalar_t, typename fp8_type>
 __global__ void rms_norm_static_fp8_quant_kernel(
-    fp8_type* __restrict__ out,          // [..., hidden_size]
-    const scalar_t* __restrict__ input,  // [..., hidden_size]
-    const int input_stride,
+    fp8_type* __restrict__ out,           // [..., hidden_size]
+    const scalar_t* __restrict__ input,   // [..., hidden_size]
    const scalar_t* __restrict__ weight,  // [hidden_size]
    const float* __restrict__ scale,      // [1]
    const float epsilon, const int num_tokens, const int hidden_size) {
@ -33,7 +32,7 @@ __global__ void rms_norm_static_fp8_quant_kernel(
  float variance = 0.0f;

  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    const float x = (float)input[blockIdx.x * input_stride + idx];
+    const float x = (float)input[blockIdx.x * hidden_size + idx];
    variance += x * x;
  }

@ -50,7 +49,7 @@ __global__ void rms_norm_static_fp8_quant_kernel(
  float const scale_inv = 1.0f / *scale;

  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    float x = (float)input[blockIdx.x * input_stride + idx];
+    float x = (float)input[blockIdx.x * hidden_size + idx];
    float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx];
    out[blockIdx.x * hidden_size + idx] =
        scaled_fp8_conversion<true, fp8_type>(out_norm, scale_inv);
@ -64,9 +63,8 @@ __global__ void rms_norm_static_fp8_quant_kernel(
 template <typename scalar_t, int width, typename fp8_type>
 __global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
 fused_add_rms_norm_static_fp8_quant_kernel(
-    fp8_type* __restrict__ out,    // [..., hidden_size]
-    scalar_t* __restrict__ input,  // [..., hidden_size]
-    const int input_stride,
+    fp8_type* __restrict__ out,           // [..., hidden_size]
+    scalar_t* __restrict__ input,         // [..., hidden_size]
    scalar_t* __restrict__ residual,      // [..., hidden_size]
    const scalar_t* __restrict__ weight,  // [hidden_size]
    const float* __restrict__ scale,      // [1]
@ -76,7 +74,6 @@ fused_add_rms_norm_static_fp8_quant_kernel(
  static_assert(sizeof(_f16Vec<scalar_t, width>) == sizeof(scalar_t) * width);

  const int vec_hidden_size = hidden_size / width;
-  const int vec_input_stride = input_stride / width;
  __shared__ float s_variance;
  float variance = 0.0f;
  /* These and the argument pointers are all declared `restrict` as they are
@ -90,9 +87,8 @@ fused_add_rms_norm_static_fp8_quant_kernel(
      reinterpret_cast<const _f16Vec<scalar_t, width>*>(weight);

  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
-    int stride_id = blockIdx.x * vec_input_stride + idx;
    int id = blockIdx.x * vec_hidden_size + idx;
-    _f16Vec<scalar_t, width> temp = input_v[stride_id];
+    _f16Vec<scalar_t, width> temp = input_v[id];
    temp += residual_v[id];
    variance += temp.sum_squares();
    residual_v[id] = temp;
@ -129,9 +125,8 @@ fused_add_rms_norm_static_fp8_quant_kernel(
 template <typename scalar_t, int width, typename fp8_type>
 __global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
 fused_add_rms_norm_static_fp8_quant_kernel(
-    fp8_type* __restrict__ out,    // [..., hidden_size]
-    scalar_t* __restrict__ input,  // [..., hidden_size]
-    const int input_stride,
+    fp8_type* __restrict__ out,           // [..., hidden_size]
+    scalar_t* __restrict__ input,         // [..., hidden_size]
    scalar_t* __restrict__ residual,      // [..., hidden_size]
    const scalar_t* __restrict__ weight,  // [hidden_size]
    const float* __restrict__ scale,      // [1]
@ -140,7 +135,7 @@ fused_add_rms_norm_static_fp8_quant_kernel(
  float variance = 0.0f;

  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    scalar_t z = input[blockIdx.x * input_stride + idx];
+    scalar_t z = input[blockIdx.x * hidden_size + idx];
    z += residual[blockIdx.x * hidden_size + idx];
    float x = (float)z;
    variance += x * x;
@ -174,9 +169,7 @@ void rms_norm_static_fp8_quant(torch::Tensor& out,     // [..., hidden_size]
                               torch::Tensor& weight,  // [hidden_size]
                               torch::Tensor& scale,   // [1]
                               double epsilon) {
-  TORCH_CHECK(out.is_contiguous());
  int hidden_size = input.size(-1);
-  int input_stride = input.stride(-2);
  int num_tokens = input.numel() / hidden_size;

  dim3 grid(num_tokens);
@ -190,9 +183,8 @@ void rms_norm_static_fp8_quant(torch::Tensor& out,     // [..., hidden_size]
              vllm::rms_norm_static_fp8_quant_kernel<scalar_t, fp8_t>
                  <<<grid, block, 0, stream>>>(
                      out.data_ptr<fp8_t>(), input.data_ptr<scalar_t>(),
-                      input_stride, weight.data_ptr<scalar_t>(),
-                      scale.data_ptr<float>(), epsilon, num_tokens,
-                      hidden_size);
+                      weight.data_ptr<scalar_t>(), scale.data_ptr<float>(),
+                      epsilon, num_tokens, hidden_size);
            });
      });
 }
@ -206,7 +198,7 @@ void rms_norm_static_fp8_quant(torch::Tensor& out,     // [..., hidden_size]
                                                               width, fp8_t> \
                  <<<grid, block, 0, stream>>>(                              \
                      out.data_ptr<fp8_t>(), input.data_ptr<scalar_t>(),     \
-                      input_stride, residual.data_ptr<scalar_t>(),           \
+                      residual.data_ptr<scalar_t>(),                         \
                      weight.data_ptr<scalar_t>(), scale.data_ptr<float>(),  \
                      epsilon, num_tokens, hidden_size);                     \
            });                                                              \
@ -218,10 +210,7 @@ void fused_add_rms_norm_static_fp8_quant(
    torch::Tensor& weight,    // [hidden_size]
    torch::Tensor& scale,     // [1]
    double epsilon) {
-  TORCH_CHECK(out.is_contiguous());
-  TORCH_CHECK(residual.is_contiguous());
  int hidden_size = input.size(-1);
-  int input_stride = input.stride(-2);
  int num_tokens = input.numel() / hidden_size;

  dim3 grid(num_tokens);
@ -245,7 +234,7 @@ void fused_add_rms_norm_static_fp8_quant(
  auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
  bool ptrs_are_aligned =
      inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
-  if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0) {
+  if (ptrs_are_aligned && hidden_size % 8 == 0) {
    LAUNCH_FUSED_ADD_RMS_NORM(8);
  } else {
    LAUNCH_FUSED_ADD_RMS_NORM(0);
--- a/csrc/mamba/causal_conv1d/causal_conv1d.cu
+++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu
@ -0,0 +1,656 @@
+// clang-format off
+// adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/csrc/causal_conv1d_fwd.cu 
+// and https://github.com/Dao-AILab/causal-conv1d/blob/main/csrc/causal_conv1d_update.cu
+#include <torch/all.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "causal_conv1d.h"
+#include <c10/util/BFloat16.h>
+#include <c10/util/Half.h>
+#include <c10/cuda/CUDAException.h>  // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK
+
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+
+#ifdef USE_ROCM
+    namespace cub = hipcub;
+#endif
+
+#include "static_switch.h"
+
+
+
+// Asserts via TORCH_CHECK that tensor `x` has exactly the sizes listed in the
+// variadic arguments. The failure message contains the stringified tensor
+// expression and the stringified expected dimensions.
+#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
+
+// Runtime-to-compile-time dtype dispatch.
+//
+// Compares the scalar type ITYPE against Half, BFloat16 and Float and, in the
+// matching branch, declares the aliases `input_t` and `weight_t` (both bound to
+// the same C++ type) before invoking the callable passed as the trailing
+// argument. Any other scalar type is reported through AT_ERROR, using the
+// stringified NAME argument in the message.
+//
+// NOTE(review): the expansion is a bare if/else chain rather than a
+// do { ... } while (0) wrapper, so it should only be used as a standalone
+// statement.
+#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, NAME, ...)              \
+    if (ITYPE == at::ScalarType::Half) {                                            \
+        using input_t = at::Half;                                                   \
+        using weight_t = at::Half;                                                  \
+        __VA_ARGS__();                                                              \
+    } else if (ITYPE == at::ScalarType::BFloat16) {                                 \
+        using input_t = at::BFloat16;                                               \
+        using weight_t = at::BFloat16;                                              \
+        __VA_ARGS__();                                                              \
+    } else if (ITYPE == at::ScalarType::Float)  {                                   \
+        using input_t = float;                                                      \
+        using weight_t = float;                                                     \
+        __VA_ARGS__();                                                              \
+    } else {                                                                        \
+        AT_ERROR(#NAME, " not implemented for input type '", toString(ITYPE), "'"); \
+    }
+
+
+// Forward declaration of the typed forward-pass implementation. It is called
+// from causal_conv1d_fwd() with the populated parameter struct and the current
+// CUDA stream once the element type has been dispatched; the definition is not
+// part of this section of the file.
+template<typename input_t, typename weight_t>
+void causal_conv1d_fwd_cuda(ConvParamsBase &params, cudaStream_t stream);
+
+// Forward declaration of the typed state-update implementation. It is called
+// from causal_conv1d_update() with the populated parameter struct and the
+// current CUDA stream once the element type has been dispatched; the
+// definition is not part of this section of the file.
+template<typename input_t, typename weight_t>
+void causal_conv1d_update_cuda(ConvParamsBase &params, cudaStream_t stream);
+
+// Populates `params` for a causal conv1d launch.
+//
+// The struct is zeroed first and then filled with the problem sizes, the
+// activation flag, the padding slot id, the raw data pointers and the strides
+// of `x`, `weight` and `out`. Optional tensors that are absent are stored as
+// nullptr.
+//
+// When a query_start_loc pointer is present the call is treated as
+// variable-length: the channel stride is taken from axis 0 and both the batch
+// and the length stride from axis 1. Otherwise the batch, channel and length
+// strides come from axes 0, 1 and -1 respectively.
+void set_conv_params_fwd(ConvParamsBase &params,
+                         // sizes
+                         const size_t batch,
+                         const size_t dim,
+                         const size_t seqlen,
+                         const size_t width,
+                         // device pointers
+                         const at::Tensor x,
+                         const at::Tensor weight,
+                         const at::Tensor out,
+                         const std::optional<at::Tensor>& bias,
+                         bool silu_activation,
+                         int64_t pad_slot_id,
+                         const std::optional<at::Tensor>& query_start_loc = std::nullopt,
+                         const std::optional<at::Tensor>& cache_indices = std::nullopt,
+                         const std::optional<at::Tensor>& has_initial_state = std::nullopt) {
+    // Maps an optional tensor to its data pointer, or nullptr when it is absent.
+    const auto data_or_null = [](const std::optional<at::Tensor>& maybe_tensor) -> void* {
+        if (!maybe_tensor.has_value()) {
+            return nullptr;
+        }
+        return maybe_tensor.value().data_ptr();
+    };
+
+    // Start from an all-zero struct so every field not assigned below is zero/null.
+    memset(&params, 0, sizeof(params));
+
+    // Problem sizes and scalar options.
+    params.batch = batch;
+    params.dim = dim;
+    params.seqlen = seqlen;
+    params.width = width;
+    params.silu_activation = silu_activation;
+    params.pad_slot_id = pad_slot_id;
+
+    // Mandatory tensors.
+    params.x_ptr = x.data_ptr();
+    params.weight_ptr = weight.data_ptr();
+    params.out_ptr = out.data_ptr();
+
+    // Optional tensors.
+    params.bias_ptr = data_or_null(bias);
+    params.query_start_loc_ptr = data_or_null(query_start_loc);
+    params.cache_indices_ptr = data_or_null(cache_indices);
+    params.has_initial_state_ptr = data_or_null(has_initial_state);
+
+    // Pick the axes that supply each stride; the layout differs in
+    // variable-length mode. All strides are in elements, not bytes.
+    const bool is_varlen = params.query_start_loc_ptr != nullptr;
+    const int64_t batch_axis = is_varlen ? 1 : 0;
+    const int64_t channel_axis = is_varlen ? 0 : 1;
+    const int64_t length_axis = is_varlen ? 1 : -1;
+
+    params.x_batch_stride = x.stride(batch_axis);
+    params.x_c_stride = x.stride(channel_axis);
+    params.x_l_stride = x.stride(length_axis);
+
+    params.out_batch_stride = out.stride(batch_axis);
+    params.out_c_stride = out.stride(channel_axis);
+    params.out_l_stride = out.stride(length_axis);
+
+    params.weight_c_stride = weight.stride(0);
+    params.weight_width_stride = weight.stride(1);
+}
+
+
+// Host-side entry point for the causal conv1d forward pass. The result is
+// written in place: the output tensor handed to the kernel aliases `x`.
+//
+// x:                 (batch, dim, seqlen), or (dim, seqlen) when
+//                    query_start_loc is given (variable-length mode).
+//                    Float, Half or BFloat16, on CUDA.
+// weight:            (dim, width), on CUDA, same dtype as x.
+// bias_:             optional (dim) CUDA tensor, same dtype as weight, with
+//                    unit stride on its last axis.
+// conv_states:       optional CUDA tensor with the same dtype as x; its data
+//                    pointer and the strides of axes 0..2 are forwarded.
+// query_start_loc:   optional int32 CUDA tensor; its length minus one is used
+//                    as the batch size.
+// cache_indices:     optional int32 CUDA tensor of shape (batch).
+// has_initial_state: optional bool CUDA tensor of shape (batch).
+// silu_activation:   forwarded to the kernel parameters.
+// pad_slot_id:       used to identify padding entries if cache_indices is
+//                    provided; in case of padding, the kernel will return early.
+//
+// Raises (via TORCH_CHECK) when any of the above constraints is violated.
+void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
+                  const std::optional<at::Tensor> &bias_,
+                  const std::optional<at::Tensor> &conv_states,
+                  const std::optional<at::Tensor> &query_start_loc,
+                  const std::optional<at::Tensor> &cache_indices,
+                  const std::optional<at::Tensor> &has_initial_state,
+                  bool silu_activation,
+                 // used to identify padding entries if cache_indices provided
+                 // in case of padding, the kernel will return early
+                  int64_t pad_slot_id) {
+    auto input_type = x.scalar_type();
+    auto weight_type = weight.scalar_type();
+    TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
+    TORCH_CHECK(weight_type == at::ScalarType::Float || weight_type == at::ScalarType::Half || weight_type == at::ScalarType::BFloat16);
+    // The dispatch at the bottom instantiates the kernel with weight_t bound to
+    // the input type, so a weight tensor of a different dtype would have its
+    // storage reinterpreted. Reject the mismatch up front.
+    TORCH_CHECK(weight_type == input_type, "weight type must equal to input type, other variations are disabled due to binary size limitations");
+
+    TORCH_CHECK(x.is_cuda());
+    TORCH_CHECK(weight.is_cuda());
+
+    const bool varlen = query_start_loc.has_value();
+    // Validate the rank before indexing sizes(): indexing past the tensor's
+    // rank is not guaranteed to be diagnosed.
+    TORCH_CHECK(x.dim() == (varlen ? 2 : 3),
+                "x must be 2D (dim, seqlen) when query_start_loc is given, otherwise 3D (batch, dim, seqlen)");
+    const auto sizes = x.sizes();
+    // size(0) validates the dimension index, unlike sizes()[0].
+    const int batch_size = varlen ? query_start_loc.value().size(0) - 1 : sizes[0];
+    const int dim = varlen ? sizes[0] : sizes[1];
+    const int seqlen = varlen ? sizes[1] : sizes[2];
+    const int width = weight.size(-1);
+    if (varlen){
+        CHECK_SHAPE(x, dim, seqlen);
+    }
+    else {
+        CHECK_SHAPE(x, batch_size, dim, seqlen);
+    }
+    CHECK_SHAPE(weight, dim, width);
+
+    if (bias_.has_value()) {
+        auto bias = bias_.value();
+        TORCH_CHECK(bias.scalar_type() == weight_type);
+        TORCH_CHECK(bias.is_cuda());
+        TORCH_CHECK(bias.stride(-1) == 1);
+        CHECK_SHAPE(bias, dim);
+    }
+
+    if (has_initial_state.has_value()) {
+        auto has_initial_state_ = has_initial_state.value();
+        TORCH_CHECK(has_initial_state_.scalar_type() == at::ScalarType::Bool);
+        TORCH_CHECK(has_initial_state_.is_cuda());
+        CHECK_SHAPE(has_initial_state_, batch_size);
+    }
+
+    if (query_start_loc.has_value()) {
+        auto query_start_loc_ = query_start_loc.value();
+        TORCH_CHECK(query_start_loc_.scalar_type() == at::ScalarType::Int);
+        TORCH_CHECK(query_start_loc_.is_cuda());
+    }
+
+    if (cache_indices.has_value()) {
+        auto cache_indices_ = cache_indices.value();
+        TORCH_CHECK(cache_indices_.scalar_type() == at::ScalarType::Int);
+        TORCH_CHECK(cache_indices_.is_cuda());
+        CHECK_SHAPE(cache_indices_, batch_size);
+    }
+
+    // The convolution runs in place: the output handle shares storage with x.
+    at::Tensor out = x;
+
+    ConvParamsBase params;
+    set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out,
+                        bias_,
+                        silu_activation,
+                        pad_slot_id,
+                        query_start_loc,
+                        cache_indices,
+                        has_initial_state
+                        );
+
+    if (conv_states.has_value()) {
+        auto conv_states_ = conv_states.value();
+        TORCH_CHECK(conv_states_.scalar_type() == input_type);
+        TORCH_CHECK(conv_states_.is_cuda());
+        params.conv_states_ptr = conv_states_.data_ptr();
+        // All strides are in elements, not bytes.
+        params.conv_states_batch_stride = conv_states_.stride(0);
+        params.conv_states_c_stride = conv_states_.stride(1);
+        params.conv_states_l_stride = conv_states_.stride(2);
+    } else {
+        params.conv_states_ptr = nullptr;
+    }
+
+    // Make x's device current for the duration of the launch.
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
+    auto stream = at::cuda::getCurrentCUDAStream().stream();
+    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_fwd", [&] {
+            causal_conv1d_fwd_cuda<input_t, weight_t>(params, stream);
+    });
+}
+
+
+// Host-side entry point for the causal conv1d state update. The result is
+// written in place: the output tensor handed to the kernel aliases `x`.
+//
+// x:                   (batch, dim, seqlen). Float, Half or BFloat16, on CUDA.
+// conv_state:          CUDA tensor with the same dtype as x and a last-axis
+//                      length of at least width - 1. Shape is
+//                      (batch, dim, state_len) without conv_state_indices_,
+//                      or (entries, dim, state_len) with it.
+// weight:              (dim, width) on CUDA, same dtype as x, 2 <= width <= 4.
+// bias_:               optional (dim) CUDA tensor, same dtype as weight, with
+//                      unit stride on its last axis.
+// silu_activation:     forwarded to the kernel parameters.
+// cache_seqlens_:      optional int32 CUDA tensor of shape (batch) with unit
+//                      stride.
+// conv_state_indices_: optional int32 CUDA tensor of shape (batch) with unit
+//                      stride.
+// pad_slot_id:         used to identify padding entries if cache_indices is
+//                      provided; in case of padding, the kernel will return
+//                      early.
+//
+// Raises (via TORCH_CHECK) when any of the above constraints is violated.
+void causal_conv1d_update(const at::Tensor &x,
+                     const at::Tensor &conv_state,
+                     const at::Tensor &weight,
+                     const std::optional<at::Tensor> &bias_,
+                     bool silu_activation,
+                     const std::optional<at::Tensor> &cache_seqlens_,
+                     const std::optional<at::Tensor> &conv_state_indices_,
+                     // used to identify padding entries if cache_indices provided
+                     // in case of padding, the kernel will return early
+                     int64_t pad_slot_id) {
+    auto input_type = x.scalar_type();
+    auto weight_type = weight.scalar_type();
+    TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
+    TORCH_CHECK(weight_type == at::ScalarType::Float || weight_type == at::ScalarType::Half || weight_type == at::ScalarType::BFloat16);
+    TORCH_CHECK(weight_type == input_type, "weight type must equal to input type, other variations are disabled due to binary size limitations");
+    TORCH_CHECK(conv_state.scalar_type() == input_type);
+
+    TORCH_CHECK(x.is_cuda());
+    TORCH_CHECK(conv_state.is_cuda());
+    TORCH_CHECK(weight.is_cuda());
+
+    // Validate the rank before indexing sizes(): indexing past the tensor's
+    // rank is not guaranteed to be diagnosed.
+    TORCH_CHECK(x.dim() == 3, "x must be 3D (batch, dim, seqlen)");
+    const auto sizes = x.sizes();
+    const int batch_size = sizes[0];
+    const int dim = sizes[1];
+    const int seqlen = sizes[2];
+    const int width = weight.size(-1);
+    const int conv_state_len = conv_state.size(2);
+    TORCH_CHECK(conv_state_len >= width - 1);
+
+    CHECK_SHAPE(x, batch_size, dim, seqlen);
+    CHECK_SHAPE(weight, dim, width);
+
+    TORCH_CHECK(width >= 2 && width <= 4, "causal_conv1d only supports width between 2 and 4");
+
+    if (bias_.has_value()) {
+        auto bias = bias_.value();
+        TORCH_CHECK(bias.scalar_type() == weight_type);
+        TORCH_CHECK(bias.is_cuda());
+        TORCH_CHECK(bias.stride(-1) == 1);
+        CHECK_SHAPE(bias, dim);
+    }
+
+    // The update runs in place: the output handle shares storage with x.
+    at::Tensor out = x;
+
+    ConvParamsBase params;
+    set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out,
+                        bias_,
+                        silu_activation,
+                        pad_slot_id);
+    params.conv_state_ptr = conv_state.data_ptr();
+    params.conv_state_len = conv_state_len;
+    // All strides are in elements, not bytes.
+    params.conv_state_batch_stride = conv_state.stride(0);
+    params.conv_state_c_stride = conv_state.stride(1);
+    params.conv_state_l_stride = conv_state.stride(2);
+
+    if (cache_seqlens_.has_value()) {
+        auto cache_seqlens = cache_seqlens_.value();
+        TORCH_CHECK(cache_seqlens.scalar_type() == torch::kInt32);
+        TORCH_CHECK(cache_seqlens.is_cuda());
+        TORCH_CHECK(cache_seqlens.stride(-1) == 1);
+        CHECK_SHAPE(cache_seqlens, batch_size);
+        params.cache_seqlens = cache_seqlens.data_ptr<int32_t>();
+    } else {
+        params.cache_seqlens = nullptr;
+    }
+
+    if (conv_state_indices_.has_value()) {
+        auto conv_state_indices = conv_state_indices_.value();
+        TORCH_CHECK(conv_state_indices.scalar_type() == torch::kInt32);
+        TORCH_CHECK(conv_state_indices.is_cuda());
+        TORCH_CHECK(conv_state_indices.stride(0) == 1);
+        CHECK_SHAPE(conv_state_indices, batch_size);
+
+        // With an index tensor the leading axis of conv_state is a pool of
+        // entries, so only the trailing two axes are constrained.
+        int conv_state_entries = conv_state.size(0);
+        CHECK_SHAPE(conv_state, conv_state_entries, dim, conv_state_len);
+
+        params.conv_state_indices_ptr = conv_state_indices.data_ptr<int32_t>();
+    } else {
+        CHECK_SHAPE(conv_state, batch_size, dim, conv_state_len);
+        params.conv_state_indices_ptr = nullptr;
+    }
+
+    // Make x's device current for the duration of the launch.
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
+    auto stream = at::cuda::getCurrentCUDAStream().stream();
+    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_update", [&] {
+            causal_conv1d_update_cuda<input_t, weight_t>(params, stream);
+    });
+}
+
+// Compile-time configuration for causal_conv1d_fwd_kernel.
+//
+// Template parameters:
+//   kNThreads_  - threads per block (also used as the kernel's launch bound).
+//   kWidth_     - convolution width; must not exceed kNElts.
+//   kIsVecLoad_ - selects the vectorized load/store path; when set, no shared
+//                 memory is reserved for the cub load/store temp storage.
+//   input_t_    - element type of the input/output (2 or 4 bytes wide).
+//   weight_t_   - element type of the weights.
+template<int kNThreads_, int kWidth_, bool kIsVecLoad_, typename input_t_, typename weight_t_>
+struct Causal_conv1d_fwd_kernel_traits {
+    using input_t = input_t_;
+    using weight_t = weight_t_;
+    static constexpr int kNThreads = kNThreads_;
+    static constexpr int kWidth = kWidth_;
+    static constexpr int kNBytes = sizeof(input_t);
+    static_assert(kNBytes == 2 || kNBytes == 4);
+    // Elements handled per thread: 4 for 4-byte types, 8 for 2-byte types, i.e.
+    // kNBytes * kNElts is 16 bytes in both cases.
+    static constexpr int kNElts = kNBytes == 4 ? 4 : 8;
+    static_assert(kWidth <= kNElts);
+    static constexpr bool kIsVecLoad = kIsVecLoad_;
+    // Type selected by the project helper BytesToType for kNBytes * kNElts bytes
+    // (presumably a 16-byte POD used for vectorized access -- confirm against
+    // its definition).
+    using vec_t = typename BytesToType<kNBytes * kNElts>::Type;
+    // cub block-wide load/store helpers: element-wise variants that move kNElts
+    // items per thread, and vector variants that move one vec_t per thread.
+    using BlockLoadT = cub::BlockLoad<input_t, kNThreads, kNElts, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
+    using BlockLoadVecT = cub::BlockLoad<vec_t, kNThreads, 1, cub::BLOCK_LOAD_DIRECT>;
+    using BlockStoreT = cub::BlockStore<input_t, kNThreads, kNElts, cub::BLOCK_STORE_WARP_TRANSPOSE>;
+    using BlockStoreVecT = cub::BlockStore<vec_t, kNThreads, 1, cub::BLOCK_STORE_DIRECT>;
+    // Bytes reserved for load/store temp storage: zero on the vectorized path,
+    // otherwise the larger of the element-wise load and store TempStorage sizes.
+    static constexpr int kSmemIOSize = kIsVecLoad
+        ? 0
+        : custom_max({sizeof(typename BlockLoadT::TempStorage), sizeof(typename BlockStoreT::TempStorage)});
+    // Bytes for the exchange area: kNBytes * kNElts bytes for each of the
+    // kNThreads threads.
+    static constexpr int kSmemExchangeSize = kNThreads * kNBytes * kNElts;
+    // Total shared memory: I/O temp storage followed by the exchange area.
+    static constexpr int kSmemSize = kSmemIOSize + kSmemExchangeSize;
+};
+
+// Causal depthwise 1D convolution, forward pass.
+//
+// Indexing (see the reads of blockIdx below): blockIdx.x selects the sequence
+// (batch entry) and blockIdx.y the channel. The block walks its sequence in
+// chunks of kNThreads * kNElts elements; each thread produces kNElts outputs
+// per chunk.
+//
+// Each output is bias + sum_w weight[w] * x[t - (kWidth - 1 - w)], optionally
+// followed by SiLU when params.silu_activation is set. Inputs that precede the
+// sequence come from conv_states when has_initial_state is true for this
+// sequence, and are zero otherwise.
+//
+// When params.conv_states_ptr is non-null, the last kWidth - 1 inputs of the
+// sequence are written back to conv_states as the final convolution state.
+template<typename Ktraits>
+__global__ __launch_bounds__(Ktraits::kNThreads)
+void causal_conv1d_fwd_kernel(ConvParamsBase params) {
+    constexpr int kWidth = Ktraits::kWidth;
+    constexpr int kNThreads = Ktraits::kNThreads;
+    constexpr int kNElts = Ktraits::kNElts;
+    constexpr bool kIsVecLoad = Ktraits::kIsVecLoad;
+    using input_t = typename Ktraits::input_t;
+    using vec_t = typename Ktraits::vec_t;
+    using weight_t = typename Ktraits::weight_t;
+
+    // Shared memory.
+    // The four CUB TempStorage references alias the start of the same buffer;
+    // the per-thread exchange slots follow after kSmemIOSize bytes.
+    extern __shared__ char smem_[];
+    auto& smem_load = reinterpret_cast<typename Ktraits::BlockLoadT::TempStorage&>(smem_);
+    auto& smem_load_vec = reinterpret_cast<typename Ktraits::BlockLoadVecT::TempStorage&>(smem_);
+    auto& smem_store = reinterpret_cast<typename Ktraits::BlockStoreT::TempStorage&>(smem_);
+    auto& smem_store_vec = reinterpret_cast<typename Ktraits::BlockStoreVecT::TempStorage&>(smem_);
+    vec_t *smem_exchange = reinterpret_cast<vec_t *>(smem_ + Ktraits::kSmemIOSize);
+
+    // Variable-length mode: query_start_loc[b] .. query_start_loc[b + 1] delimit
+    // sequence b; otherwise every sequence has params.seqlen elements.
+    const bool kVarlen = params.query_start_loc_ptr != nullptr;
+    const int tidx = threadIdx.x;
+    const int batch_id = blockIdx.x;
+    const int channel_id = blockIdx.y;
+    const int *query_start_loc = kVarlen ? reinterpret_cast<int *>(params.query_start_loc_ptr) : nullptr;
+    const int sequence_start_index = kVarlen ? query_start_loc[batch_id] : batch_id;
+    const int seqlen = kVarlen ? query_start_loc[batch_id + 1] - sequence_start_index : params.seqlen;
+
+    input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + sequence_start_index * params.x_batch_stride
+        + channel_id * params.x_c_stride;
+    weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr) + channel_id * params.weight_c_stride;
+    input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + sequence_start_index * params.out_batch_stride
+        + channel_id * params.out_c_stride;
+    float bias_val = params.bias_ptr == nullptr ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[channel_id]);
+
+    bool has_initial_state = params.has_initial_state_ptr == nullptr ? false
+        : reinterpret_cast<bool *>(params.has_initial_state_ptr)[batch_id];
+
+    // Without cache_indices the state slot of a sequence is simply its batch id.
+    int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr
+        : reinterpret_cast<int *>(params.cache_indices_ptr);
+    int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id];
+    // cache_index == params.pad_slot_id is defined as padding, so we exit early
+    if (cache_index == params.pad_slot_id){
+        return;
+    }
+    input_t *conv_states = params.conv_states_ptr == nullptr ? nullptr
+        : reinterpret_cast<input_t *>(params.conv_states_ptr) + cache_index * params.conv_states_batch_stride + channel_id * params.conv_states_c_stride;
+
+    // Thread 0 will load the last elements of the previous chunk, so we initialize those to 0.
+    // With an initial state, its kWidth - 1 values are placed at the tail of the slot instead.
+    if (tidx == 0) {
+        input_t initial_state[kNElts] = {0};
+        if (has_initial_state) {
+            #pragma unroll
+            for (int w = 0; w < kWidth - 1; ++w){ initial_state[kNElts - 1 - (kWidth - 2) + w ] = conv_states[w]; }
+        }
+        smem_exchange[kNThreads - 1] = reinterpret_cast<vec_t *>(initial_state)[0];
+    }
+
+    float weight_vals[kWidth];
+    #pragma unroll
+    for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); }
+
+    constexpr int kChunkSize = kNThreads * kNElts;
+    const int n_chunks = (seqlen + kChunkSize - 1) / kChunkSize;
+    for (int chunk = 0; chunk < n_chunks; ++chunk) {
+        // Layout: [0, kNElts) receives the preceding thread's elements,
+        // [kNElts, 2 * kNElts) holds this thread's own elements of the chunk.
+        input_t x_vals_load[2 * kNElts] = {0};
+        if constexpr(kIsVecLoad) {
+            typename Ktraits::BlockLoadVecT(smem_load_vec).Load(reinterpret_cast<vec_t*>(x), *reinterpret_cast<vec_t (*)[1]>(&x_vals_load[kNElts]), (seqlen - chunk * kChunkSize) / kNElts);
+        } else {
+            __syncthreads();
+            typename Ktraits::BlockLoadT(smem_load).Load(x, *reinterpret_cast<input_t (*)[kNElts]>(&x_vals_load[kNElts]), seqlen - chunk * kChunkSize);
+        }
+        x += kChunkSize;
+        __syncthreads();
+        // Thread kNThreads - 1 doesn't write yet, so that thread 0 can read
+        // the last elements of the previous chunk.
+        if (tidx < kNThreads - 1) { smem_exchange[tidx] = reinterpret_cast<vec_t *>(x_vals_load)[1]; }
+        __syncthreads();
+        reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[tidx > 0 ? tidx - 1 : kNThreads - 1];
+        __syncthreads();
+        // Now thread kNThreads - 1 can write the last elements of the current chunk.
+        if (tidx == kNThreads - 1) { smem_exchange[tidx] = reinterpret_cast<vec_t *>(x_vals_load)[1]; }
+
+        // Accumulate in float regardless of the storage type.
+        float x_vals[2 * kNElts];
+        #pragma unroll
+        for (int i = 0; i < 2 * kNElts; ++i) { x_vals[i] = float(x_vals_load[i]); }
+
+        float out_vals[kNElts];
+        #pragma unroll
+        for (int i = 0; i < kNElts; ++i) {
+            out_vals[i] = bias_val;
+            #pragma unroll
+            for (int w = 0; w < kWidth; ++w) {
+                out_vals[i] += weight_vals[w] * x_vals[kNElts + i - (kWidth - w - 1)];
+            }
+        }
+
+        // SiLU: v * sigmoid(v).
+        if (params.silu_activation) {
+            #pragma unroll
+            for (int i = 0; i < kNElts; ++i) {
+                out_vals[i] = out_vals[i] / (1 + expf(-out_vals[i]));
+            }
+        }
+
+        input_t out_vals_store[kNElts];
+        #pragma unroll
+        for (int i = 0; i < kNElts; ++i) { out_vals_store[i] = out_vals[i]; }
+        if constexpr(kIsVecLoad) {
+            typename Ktraits::BlockStoreVecT(smem_store_vec).Store(reinterpret_cast<vec_t*>(out), reinterpret_cast<vec_t (&)[1]>(out_vals_store), (seqlen - chunk * kChunkSize) / kNElts);
+        } else {
+            typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, seqlen - chunk * kChunkSize);
+        }
+        out += kChunkSize;
+
+        // Position of the first final-state element, relative to the start of the last chunk.
+        int final_state_position =  ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize);
+        // In case the final state is split between the last "smem_exchange" and
+        // the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
+        // (which occurs when `final_state_position` is a negative index)
+        // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it
+        if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){
+            input_t vals_load[kNElts] = {0};
+            if ((chunk == n_chunks - 2) && (tidx == kNThreads - 1)){
+                // chunk = n_chunks - 2, a segment of the final state sits in the last index
+                reinterpret_cast<vec_t *>(vals_load)[0] = smem_exchange[kNThreads - 1];
+                #pragma unroll
+                for (int w = 0; w < -final_state_position; ++w){
+                    conv_states[w] = vals_load[kNElts + final_state_position + w];
+                }
+            }
+            if ((chunk == n_chunks - 1) && tidx == 0){
+                // chunk = n_chunks - 1, the second segment of the final state first positions
+                reinterpret_cast<vec_t *>(vals_load)[0] = smem_exchange[0];
+                for (int w = -final_state_position; w < kWidth - 1; ++w){
+                    conv_states[w] = vals_load[w + final_state_position];
+                }
+                // Thread 0 has finished the state write-back; only this thread exits here.
+                return;
+            }
+        }
+    }
+    // Final state is stored in the smem_exchange last token slot,
+    // in case seqlen < kWidth, we would need to take the final state from the
+    // initial state which is stored in conv_states
+    // in case seqlen > kWidth, we would need to load the last kWidth - 1 data
+    // and load it into conv_state accordingly
+    // last_thread: index of the thread whose exchange slot holds the first final-state element.
+    int last_thread =  ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize) / kNElts;
+    if (conv_states != nullptr && tidx == last_thread) { 
+        input_t x_vals_load[kNElts * 2] = {0};
+        // in case we are on the first kWidth tokens
+        if (last_thread == 0 && seqlen < kWidth){
+            // Need to take the initial state
+            reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[0];
+            const int offset = seqlen - (kWidth - 1);
+            #pragma unroll
+            for (int w = 0; w < kWidth - 1; ++w){
+                // pad the existing state
+                // (shift the old state left by seqlen; zero-fill when there was no initial state)
+                if ((w - seqlen) >= 0 && has_initial_state) { conv_states[w - seqlen] = conv_states[w]; }
+                else if ((w - seqlen) >= 0 && !has_initial_state) { conv_states[w - seqlen] = input_t(0.0f); }
+            }
+            #pragma unroll
+            for (int w = 0; w < kWidth - 1; ++w){
+                // offset is negative here, so only the trailing entries receive new inputs.
+                if (offset + w >= 0) 
+                    conv_states[w] = x_vals_load[offset + w ];
+            }
+        }
+        else {
+            // in case the final state is in between the threads data
+            const int offset = ((seqlen - (kWidth - 1)) % (kNElts));
+            if ((offset + kWidth - 2) >= kNElts && (last_thread + 1 < kNThreads)){
+                // In case last_thread == kNThreads - 1, accessing last_thread + 1 will result in an
+                // illegal access error on H100.
+                // Therefore, we access last_thread + 1, only if the final state data sits there
+                reinterpret_cast<vec_t *>(x_vals_load)[1] = smem_exchange[last_thread + 1];
+            }
+            reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[last_thread];
+            #pragma unroll
+            for (int w = 0; w < kWidth - 1; ++w){
+                conv_states[w] = x_vals_load[offset + w ];
+            }
+        }
+        
+    }
+}
+
+
+// Prints the ROCm-specific notice about cudaFuncSetAttribute; does nothing on
+// non-ROCm builds, where the message would be misleading.
+// Kept as a separate function so that no preprocessor directive has to appear
+// inside the BOOL_SWITCH macro argument below.
+static inline void causal_conv1d_fwd_warn_smem_attribute_on_rocm() {
+#ifdef USE_ROCM
+    std::cerr << "Warning (causal_conv1d fwd launch): attempting to set maxDynamicSharedMemorySize on an AMD GPU which is currently a non-op (in ROCm versions <= 6.1). This might lead to undefined behavior. \n" << std::endl;
+#endif
+}
+
+// Launches causal_conv1d_fwd_kernel on `stream` with a grid of
+// (params.batch, params.dim) blocks of kNThreads threads.
+//
+// The vectorized load/store variant (kIsVecLoad) is selected only when
+// params.seqlen is a multiple of the per-thread element count and the call is
+// not in variable-length mode (params.query_start_loc_ptr == nullptr).
+//
+// If the kernel needs 48 KiB or more of dynamic shared memory, the limit is
+// raised with cudaFuncSetAttribute before the launch. Launch errors are
+// surfaced through C10_CUDA_KERNEL_LAUNCH_CHECK.
+template<int kNThreads, int kWidth, typename input_t, typename weight_t>
+void causal_conv1d_fwd_launch(ConvParamsBase &params, cudaStream_t stream) {
+    static constexpr int kNElts = sizeof(input_t) == 4 ? 4 : 8;
+    const bool kVarlen = params.query_start_loc_ptr != nullptr;
+    BOOL_SWITCH(params.seqlen % kNElts == 0 && !kVarlen, kIsVecLoad, [&] {
+        using Ktraits = Causal_conv1d_fwd_kernel_traits<kNThreads, kWidth, kIsVecLoad, input_t, weight_t>;
+        constexpr int kSmemSize = Ktraits::kSmemSize;
+        dim3 grid(params.batch, params.dim);
+
+        auto kernel = &causal_conv1d_fwd_kernel<Ktraits>;
+
+        if (kSmemSize >= 48 * 1024) {
+            C10_CUDA_CHECK(cudaFuncSetAttribute(
+                (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
+            // Only AMD builds are affected by the attribute being a no-op.
+            causal_conv1d_fwd_warn_smem_attribute_on_rocm();
+        }
+        kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
+
+        C10_CUDA_KERNEL_LAUNCH_CHECK();
+    });
+}
+
+// Maps the runtime filter width onto the matching compile-time instantiation
+// of causal_conv1d_fwd_launch (128 threads per block).
+// Only widths 2, 3 and 4 are handled; any other width launches nothing.
+template<typename input_t, typename weight_t>
+void causal_conv1d_fwd_cuda(ConvParamsBase &params, cudaStream_t stream) {
+    constexpr int kBlockThreads = 128;
+    switch (params.width) {
+        case 2:
+            causal_conv1d_fwd_launch<kBlockThreads, 2, input_t, weight_t>(params, stream);
+            break;
+        case 3:
+            causal_conv1d_fwd_launch<kBlockThreads, 3, input_t, weight_t>(params, stream);
+            break;
+        case 4:
+            causal_conv1d_fwd_launch<kBlockThreads, 4, input_t, weight_t>(params, stream);
+            break;
+        default:
+            break;
+    }
+}
+
+
+// Explicit instantiations of the forward entry point, one per supported
+// element type (input and weight types are always the same here).
+template void causal_conv1d_fwd_cuda<float, float>(ConvParamsBase &params, cudaStream_t stream);
+template void causal_conv1d_fwd_cuda<at::Half, at::Half>(ConvParamsBase &params, cudaStream_t stream);
+template void causal_conv1d_fwd_cuda<at::BFloat16, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);
+
+
+
+
+// Compile-time configuration for causal_conv1d_update_kernel.
+//
+// Template parameters:
+//   kNThreads_ - threads per block.
+//   kWidth_    - convolution filter width.
+//   input_t_   - element type of the input, output and state tensors (2 or 4 bytes).
+//   weight_t_  - element type of the weights and bias.
+template<int kNThreads_, int kWidth_, typename input_t_, typename weight_t_>
+struct Causal_conv1d_update_kernel_traits {
+    using input_t = input_t_;
+    using weight_t = weight_t_;
+    static constexpr int kNThreads = kNThreads_;
+    static constexpr int kWidth = kWidth_;
+    static constexpr int kNBytes = sizeof(input_t);
+    // Only 16-bit and 32-bit element types are accepted.
+    static_assert(kNBytes == 2 || kNBytes == 4);
+};
+
+// Incremental causal conv1d step: consumes params.seqlen new inputs per
+// sequence, updates the stored convolution state in place and writes one
+// output per new input.
+//
+// Indexing: blockIdx.x selects the batch entry; each thread handles one
+// channel (channel_id = blockIdx.y * kNThreads + threadIdx.x).
+//
+// kIsCircularBuffer == false: the state is shifted left by params.seqlen and
+//   the new inputs are appended at its end.
+// kIsCircularBuffer == true: the state is used as a ring buffer; the write
+//   position is derived from params.cache_seqlens[batch_id] modulo the state
+//   length.
+template<typename Ktraits, bool kIsCircularBuffer>
+__global__ __launch_bounds__(Ktraits::kNThreads)
+void causal_conv1d_update_kernel(ConvParamsBase params) {
+    constexpr int kWidth = Ktraits::kWidth;
+    constexpr int kNThreads = Ktraits::kNThreads;
+    using input_t = typename Ktraits::input_t;
+    using weight_t = typename Ktraits::weight_t;
+
+    const int tidx = threadIdx.x;
+    const int batch_id = blockIdx.x;
+    const int channel_id = blockIdx.y * kNThreads + tidx;
+    // The last block along y may extend past the channel count.
+    if (channel_id >= params.dim) return;
+
+    input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + batch_id * params.x_batch_stride
+        + channel_id * params.x_c_stride;
+
+    // If params.conv_state_indices_ptr is set, then the conv state is gathered from the conv state tensor
+    // along the batch axis. Otherwise, the conv state coordinate is the same as the batch id.
+    const int conv_state_batch_coord = params.conv_state_indices_ptr == nullptr
+        ? batch_id
+        : params.conv_state_indices_ptr[batch_id];
+    // conv_state_batch_coord == params.pad_slot_id is defined as padding so we exit early
+    if (conv_state_batch_coord == params.pad_slot_id){
+        return;
+    }
+    input_t *conv_state = reinterpret_cast<input_t *>(params.conv_state_ptr) 
+        + conv_state_batch_coord * params.conv_state_batch_stride
+        + channel_id * params.conv_state_c_stride;
+
+    weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr) + channel_id * params.weight_c_stride;
+    input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
+        + channel_id * params.out_c_stride;
+    float bias_val = params.bias_ptr == nullptr ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[channel_id]);
+
+    int state_len = params.conv_state_len;
+    int advance_len = params.seqlen;
+    int cache_seqlen = kIsCircularBuffer ? params.cache_seqlens[batch_id] % state_len : 0;
+    // Ring-buffer index of the oldest of the kWidth - 1 history entries, wrapped into [0, state_len).
+    int update_idx = cache_seqlen - (kWidth - 1);
+    update_idx = update_idx < 0 ? update_idx + state_len : update_idx;
+
+    float weight_vals[kWidth] = {0};
+    #pragma unroll
+    for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); }
+
+    // Sliding window of inputs: x_vals[0 .. kWidth - 2] is history, x_vals[kWidth - 1] the current input.
+    float x_vals[kWidth] = {0};
+    if constexpr (!kIsCircularBuffer) {
+        // Shift the part of the state that precedes the last kWidth - 1 entries left by advance_len.
+        #pragma unroll 2
+        for (int i = 0; i < state_len - advance_len - (kWidth - 1); ++i) {
+            conv_state[i * params.conv_state_l_stride] = conv_state[(i + advance_len) * params.conv_state_l_stride];
+        }
+        // Read the last kWidth - 1 entries as the initial window, and move them
+        // left by advance_len when the destination index is still inside the state.
+        #pragma unroll
+        for (int i = 0; i < kWidth - 1; ++i) {
+            input_t state_val = conv_state[(state_len - (kWidth - 1) + i) * params.conv_state_l_stride];
+            if (i < advance_len + (kWidth - 1) && state_len - advance_len - (kWidth - 1) + i >= 0) {
+                conv_state[(state_len - advance_len - (kWidth - 1) + i) * params.conv_state_l_stride] = state_val;
+            }
+            x_vals[i] = float(state_val);
+        }
+    } else {
+        // Read the kWidth - 1 history entries, advancing update_idx with wrap-around.
+        #pragma unroll
+        for (int i = 0; i < kWidth - 1; ++i, update_idx = update_idx + 1 >= state_len ? update_idx + 1 - state_len : update_idx + 1) {
+            input_t state_val = conv_state[update_idx * params.conv_state_l_stride];
+            x_vals[i] = float(state_val);
+        }
+    }
+    #pragma unroll 2
+    for (int i = 0; i < params.seqlen; ++i) {
+        input_t x_val = x[i * params.x_l_stride];
+        if constexpr (!kIsCircularBuffer) {
+            // Append the new input at the tail of the state (skipped if it would fall before index 0).
+            if (i < advance_len && state_len - advance_len + i >= 0) {
+                conv_state[(state_len - advance_len + i) * params.conv_state_l_stride] = x_val;
+            }
+        } else {
+            // Store the new input at the ring position and advance with wrap-around.
+            conv_state[update_idx * params.conv_state_l_stride] = x_val;
+            ++update_idx;
+            update_idx = update_idx >= state_len ? update_idx - state_len : update_idx;
+        }
+        x_vals[kWidth - 1] = float(x_val);
+        float out_val = bias_val;
+        #pragma unroll
+        for (int j = 0; j < kWidth; ++j) { out_val += weight_vals[j] * x_vals[j]; }
+        // SiLU: v * sigmoid(v).
+        if (params.silu_activation) { out_val = out_val / (1 + expf(-out_val)); }
+        out[i * params.out_l_stride] = input_t(out_val);
+        // Shift the input buffer by 1
+        // (the loop variable below shadows the outer i; the outer index is unaffected)
+        #pragma unroll
+        for (int i = 0; i < kWidth - 1; ++i) { x_vals[i] = x_vals[i + 1]; }
+    }
+}
+
+// Launches causal_conv1d_update_kernel on `stream`.
+// Grid: params.batch blocks along x, and enough blocks of kNThreads threads
+// along y to cover params.dim channels. The circular-buffer variant of the
+// kernel is used exactly when params.cache_seqlens is provided.
+template<int kNThreads, int kWidth, typename input_t, typename weight_t>
+void causal_conv1d_update_launch(ConvParamsBase &params, cudaStream_t stream) {
+    using Ktraits = Causal_conv1d_update_kernel_traits<kNThreads, kWidth, input_t, weight_t>;
+    const int channel_blocks = (params.dim + kNThreads - 1) / kNThreads;
+    const dim3 grid(params.batch, channel_blocks);
+    const bool use_circular_buffer = params.cache_seqlens != nullptr;
+    if (use_circular_buffer) {
+        causal_conv1d_update_kernel<Ktraits, true><<<grid, Ktraits::kNThreads, 0, stream>>>(params);
+    } else {
+        causal_conv1d_update_kernel<Ktraits, false><<<grid, Ktraits::kNThreads, 0, stream>>>(params);
+    }
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+// Maps the runtime filter width onto the matching compile-time instantiation
+// of causal_conv1d_update_launch (64 threads per block).
+// Only widths 2, 3 and 4 are handled; any other width launches nothing.
+template<typename input_t, typename weight_t>
+void causal_conv1d_update_cuda(ConvParamsBase &params, cudaStream_t stream) {
+    constexpr int kBlockThreads = 64;
+    switch (params.width) {
+        case 2:
+            causal_conv1d_update_launch<kBlockThreads, 2, input_t, weight_t>(params, stream);
+            break;
+        case 3:
+            causal_conv1d_update_launch<kBlockThreads, 3, input_t, weight_t>(params, stream);
+            break;
+        case 4:
+            causal_conv1d_update_launch<kBlockThreads, 4, input_t, weight_t>(params, stream);
+            break;
+        default:
+            break;
+    }
+}
+
+// Explicit instantiations of the update entry point, one per supported
+// element type (input and weight types are always the same here).
+template void causal_conv1d_update_cuda<float, float>(ConvParamsBase &params, cudaStream_t stream);
+template void causal_conv1d_update_cuda<at::Half, at::Half>(ConvParamsBase &params, cudaStream_t stream);
+template void causal_conv1d_update_cuda<at::BFloat16, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);
--- a/csrc/mamba/causal_conv1d/causal_conv1d.h
+++ b/csrc/mamba/causal_conv1d/causal_conv1d.h
@ -0,0 +1,159 @@
+/******************************************************************************
+ * Copyright (c) 2024, Tri Dao.
+ ******************************************************************************/
+// clang-format off
+// adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/csrc/causal_conv1d.h
+#pragma once
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Parameter bundle passed by value to the causal conv1d kernels.
+// All *_stride members are expressed in elements: the kernels add them to
+// typed pointers, not to byte addresses.
+struct ConvParamsBase {
+    using index_t = uint32_t;
+
+    // batch and dim size the launch grid; seqlen is the per-sequence length
+    // (or the number of new tokens in the update kernel); width is the filter
+    // width used to pick the kernel instantiation.
+    int batch, dim, seqlen, width;
+    // A cache/state index equal to this value marks a padded entry; the kernels
+    // return early for it.
+    int64_t pad_slot_id;
+    // When true, SiLU is applied to every output value.
+    bool silu_activation;
+
+    index_t x_batch_stride;
+    index_t x_c_stride;
+    index_t x_l_stride;
+    index_t weight_c_stride;
+    index_t weight_width_stride;
+    index_t out_batch_stride;
+    index_t out_c_stride;
+    index_t out_l_stride;
+
+    // Length and strides of the state tensor addressed through conv_state_ptr
+    // (used by the update kernel).
+    int conv_state_len;
+    index_t conv_state_batch_stride;
+    index_t conv_state_c_stride;
+    index_t conv_state_l_stride;
+
+    // Common data pointers.
+    void *__restrict__ x_ptr;
+    void *__restrict__ weight_ptr;
+    // Optional; nullptr means no bias is added.
+    void *__restrict__ bias_ptr;
+    void *__restrict__ out_ptr;
+
+    // State tensor updated in place by the update kernel.
+    void *__restrict__ conv_state_ptr;
+    // Optional; read as int*. When set, the forward kernel runs in
+    // variable-length mode and entries [b], [b + 1] delimit sequence b.
+    void *__restrict__ query_start_loc_ptr;
+    // Optional; read as bool*, one flag per batch entry.
+    void *__restrict__ has_initial_state_ptr;
+    // Optional; read as int*. Maps a batch entry to its slot in conv_states_ptr
+    // in the forward kernel; without it the batch id is used.
+    void *__restrict__ cache_indices_ptr;
+    // Optional; when set, the update kernel treats the state as a circular
+    // buffer positioned by cache_seqlens[batch] modulo conv_state_len.
+    int32_t *__restrict__ cache_seqlens;
+
+    // For the continuous batching case. Makes it so that the mamba state for
+    // the current batch doesn't need to be a contiguous tensor.
+    int32_t *__restrict__ conv_state_indices_ptr;
+
+    // NOTE(review): no use of seq_idx_ptr is visible in the accompanying
+    // kernels - confirm whether it is still required.
+    void *__restrict__ seq_idx_ptr;
+
+    // No __restrict__ since initial_states could be the same as final_states.
+    void * initial_states_ptr;
+    index_t initial_states_batch_stride;
+    index_t initial_states_l_stride;
+    index_t initial_states_c_stride;
+
+    void * final_states_ptr;
+    index_t final_states_batch_stride;
+    index_t final_states_l_stride;
+    index_t final_states_c_stride;
+
+    // State tensor used by the forward kernel: read as the initial state when
+    // has_initial_state is set, and overwritten with the final state.
+    void *  conv_states_ptr;
+    index_t conv_states_batch_stride;
+    index_t conv_states_l_stride;
+    index_t conv_states_c_stride;
+};
+
+
+// Toolchain-specific helpers, selected by USE_ROCM:
+//   shuffle_xor   - exchanges `val` with the lane whose id differs by `offset` (XOR).
+//   custom_max    - constexpr maximum of a list of sizes.
+//   constexpr_min - constexpr minimum of two values.
+// Both branches expose the same three names; only the implementation differs.
+#ifndef USE_ROCM
+    #include <cuda_bf16.h>
+
+    // CUDA: synchronizing shuffle with a full lane mask.
+    template<typename T>
+    __device__ inline T shuffle_xor(T val, int offset) {
+        return __shfl_xor_sync(uint32_t(-1), val, offset);
+    }
+
+    constexpr size_t custom_max(std::initializer_list<size_t> ilist) 
+    {
+        return std::max(ilist);
+    }
+
+    template<typename T>
+    constexpr T constexpr_min(T a, T b) {
+        return std::min(a, b);
+    }
+
+#else
+    #include <hip/hip_bf16.h>
+
+    // ROCm: uses the mask-less __shfl_xor.
+    template<typename T>
+    __device__ inline T shuffle_xor(T val, int offset) {
+        return __shfl_xor(val, offset);
+    }
+    // ROCm: implemented via std::max_element instead of std::max(initializer_list).
+    constexpr size_t custom_max(std::initializer_list<size_t> ilist) 
+    {
+        return *std::max_element(ilist.begin(), ilist.end());
+    }
+
+    // ROCm: hand-written comparison instead of std::min.
+    template<typename T>
+    constexpr T constexpr_min(T a, T b) {
+        return a < b ? a : b;
+    }
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Maps a byte count to an integer (or integer-vector) type of exactly that
+// size, exposed as BytesToType<N>::Type. Only 1, 2, 4, 8 and 16 bytes are
+// specialized; the primary template is empty, so any other size fails to
+// compile when ::Type is requested.
+template<int BYTES> struct BytesToType {};
+
+// 16 bytes -> uint4.
+template<> struct BytesToType<16> {
+    using Type = uint4;
+    static_assert(sizeof(Type) == 16);
+};
+
+// 8 bytes -> uint64_t.
+template<> struct BytesToType<8> {
+    using Type = uint64_t;
+    static_assert(sizeof(Type) == 8);
+};
+
+// 4 bytes -> uint32_t.
+template<> struct BytesToType<4> {
+    using Type = uint32_t;
+    static_assert(sizeof(Type) == 4);
+};
+
+// 2 bytes -> uint16_t.
+template<> struct BytesToType<2> {
+    using Type = uint16_t;
+    static_assert(sizeof(Type) == 2);
+};
+
+// 1 byte -> uint8_t.
+template<> struct BytesToType<1> {
+    using Type = uint8_t;
+    static_assert(sizeof(Type) == 1);
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Device-side binary functor returning the sum of its two arguments.
+template<typename T>
+struct SumOp {
+    __device__ inline T operator()(T const &lhs, T const &rhs) {
+        return lhs + rhs;
+    }
+};
+
+// Reduction across groups of THREADS lanes using XOR shuffles.
+//
+// run(x, op) combines each lane's value with the value of the lane whose id
+// differs by THREADS / 2, then recurses with half the offset down to 1. For an
+// associative and commutative `op`, every lane of the group ends up holding
+// the reduction of the whole group.
+//
+// The exchange goes through shuffle_xor (defined earlier in this header) so
+// that the same code builds for both the CUDA and the ROCm branch instead of
+// hard-coding the CUDA-only __shfl_xor_sync call with a 32-bit mask.
+template<int THREADS>
+struct Allreduce {
+    static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4);
+    template<typename T, typename Operator>
+    static __device__ inline T run(T x, Operator &op) {
+        constexpr int OFFSET = THREADS / 2;
+        x = op(x, shuffle_xor(x, OFFSET));
+        return Allreduce<OFFSET>::run(x, op);
+    }
+};
+
+// Recursion base case: a single exchange between neighbouring lanes.
+template<>
+struct Allreduce<2> {
+    template<typename T, typename Operator>
+    static __device__ inline T run(T x, Operator &op) {
+        x = op(x, shuffle_xor(x, 1));
+        return x;
+    }
+};
--- a/csrc/mamba/causal_conv1d/static_switch.h
+++ b/csrc/mamba/causal_conv1d/static_switch.h
@ -0,0 +1,28 @@
+// Inspired by
+// https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h
+// and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h
+// clang-format off
+// adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/csrc/static_switch.h
+
+#pragma once
+
+/// Turns a runtime boolean into a compile-time constant for the wrapped code.
+///
+/// Expands to an immediately-invoked lambda that evaluates COND and, in each
+/// branch, declares `static constexpr bool CONST_NAME` (true or false) before
+/// invoking the callable. The callable text is pasted into both branches, so
+/// it is compiled once for each value of CONST_NAME.
+///
+/// @param COND       - a boolean expression to switch by
+/// @param CONST_NAME - a name given for the constexpr bool variable.
+/// @param ...       - a callable (typically a lambda) invoked with no
+///                     arguments in the taken branch; its result is returned
+///
+/// Usage:
+/// ```
+/// BOOL_SWITCH(flag, BoolConst, [&] {
+///     some_function<BoolConst>(...);
+/// });
+/// ```
+#define BOOL_SWITCH(COND, CONST_NAME, ...)                                           \
+    [&] {                                                                            \
+        if (COND) {                                                                  \
+            static constexpr bool CONST_NAME = true;                                 \
+            return __VA_ARGS__();                                                    \
+        } else {                                                                     \
+            static constexpr bool CONST_NAME = false;                                \
+            return __VA_ARGS__();                                                    \
+        }                                                                            \
+    }()
--- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
+++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
@ -7,11 +7,7 @@

 #include <c10/util/BFloat16.h>
 #include <c10/util/Half.h>
-#ifdef USE_ROCM
-    #include <c10/hip/HIPException.h>  // For C10_HIP_CHECK and C10_HIP_KERNEL_LAUNCH_CHECK
-#else
-    #include <c10/cuda/CUDAException.h>  // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK
-#endif
+#include <c10/cuda/CUDAException.h>  // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK

 #ifndef USE_ROCM
    #include <cub/block/block_load.cuh>
@ -316,25 +312,19 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
    // kIsVariableB, kIsVariableC and kHasZ are all set to True to reduce binary size
    constexpr bool kIsVariableB = true;
    constexpr bool kIsVariableC = true;
+    constexpr bool kHasZ = true;
    BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] {
-        BOOL_SWITCH(params.z_ptr != nullptr , kHasZ, [&] {
-            BOOL_SWITCH(params.query_start_loc_ptr != nullptr , kVarlen, [&] {
-                using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ,  kVarlen, input_t, weight_t>;
-                constexpr int kSmemSize = Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(typename Ktraits::scan_t);
-                dim3 grid(params.batch, params.dim / kNRows);
-                auto kernel = &selective_scan_fwd_kernel<Ktraits>;
-                if (kSmemSize >= 48 * 1024) {
-#ifdef USE_ROCM
-                    C10_HIP_CHECK(hipFuncSetAttribute(
-                        reinterpret_cast<const void*>(kernel), hipFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
-#else
-                    C10_CUDA_CHECK(cudaFuncSetAttribute(
-                        kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
-#endif
-                }
-                kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
-                C10_CUDA_KERNEL_LAUNCH_CHECK();
-            });
+        BOOL_SWITCH(params.query_start_loc_ptr != nullptr , kVarlen, [&] {
+            using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ,  kVarlen, input_t, weight_t>;
+            constexpr int kSmemSize = Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(typename Ktraits::scan_t);
+            dim3 grid(params.batch, params.dim / kNRows);
+            auto kernel = &selective_scan_fwd_kernel<Ktraits>;
+            if (kSmemSize >= 48 * 1024) {
+                C10_CUDA_CHECK(cudaFuncSetAttribute(
+                    (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
+            }
+            kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
+            C10_CUDA_KERNEL_LAUNCH_CHECK();
        });
    });
 }
@ -622,20 +612,19 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,

    at::Tensor z, out_z;
    const bool has_z = z_.has_value();
-    if (has_z) {
-        z = z_.value();
-        TORCH_CHECK(z.scalar_type() == input_type);
-        TORCH_CHECK(z.is_cuda());
-        TORCH_CHECK(z.stride(-1) == 1 || z.size(-1) == 1);
-        if (varlen){
-            CHECK_SHAPE(z, dim, seqlen);
-        } else {
-            CHECK_SHAPE(z, batch_size, dim, seqlen);
-        }
-        
-        out_z = z;
+    TORCH_CHECK(has_z, "has_z = False is disabled in favor of reduced binary size")
+    z = z_.value();
+    TORCH_CHECK(z.scalar_type() == input_type);
+    TORCH_CHECK(z.is_cuda());
+    TORCH_CHECK(z.stride(-1) == 1 || z.size(-1) == 1);
+    if (varlen){
+        CHECK_SHAPE(z, dim, seqlen);
+    } else {
+        CHECK_SHAPE(z, batch_size, dim, seqlen);
    }

+    out_z = z;
+
    // Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout
    at::Tensor out = delta;
    TORCH_CHECK(ssm_states.scalar_type() == input_type);
@ -664,3 +653,4 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
        selective_scan_fwd_cuda<input_t, weight_t>(params, stream);
    });
 }
+
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@ -1,7 +1,6 @@
 #include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
-#include <cub/cub.cuh>

 #include <ATen/ATen.h>
 #include <ATen/cuda/Atomic.cuh>
@ -20,14 +19,9 @@ __global__ void moe_align_block_size_kernel(
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
    int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
    int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
-    size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded) {
+    size_t numel, int32_t* __restrict__ cumsum) {
  extern __shared__ int32_t shared_counts[];

-  // Initialize sorted_token_ids with numel
-  for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
-    sorted_token_ids[it] = numel;
-  }
-
  const int warp_id = threadIdx.x / WARP_SIZE;
  const int my_expert_start = warp_id * experts_per_warp;

@ -51,27 +45,18 @@ __global__ void moe_align_block_size_kernel(

  __syncthreads();

-  // Compute prefix sum over token counts per expert
-  using BlockScan = cub::BlockScan<int32_t, 1024>;
-  __shared__ typename BlockScan::TempStorage temp_storage;
+  if (threadIdx.x == 0) {
+    cumsum[0] = 0;
+    for (int i = 1; i <= num_experts; ++i) {
+      int expert_count = 0;
+      int warp_idx = (i - 1) / experts_per_warp;
+      int expert_offset = (i - 1) % experts_per_warp;
+      expert_count = shared_counts[warp_idx * experts_per_warp + expert_offset];

-  int expert_count = 0;
-  int expert_id = threadIdx.x;
-  if (expert_id < num_experts) {
-    int warp_idx = expert_id / experts_per_warp;
-    int expert_offset = expert_id % experts_per_warp;
-    expert_count = shared_counts[warp_idx * experts_per_warp + expert_offset];
-    expert_count = CEILDIV(expert_count, block_size) * block_size;
-  }
-
-  int cumsum_val;
-  BlockScan(temp_storage).ExclusiveSum(expert_count, cumsum_val);
-  if (expert_id <= num_experts) {
-    cumsum[expert_id] = cumsum_val;
-  }
-
-  if (expert_id == num_experts) {
-    *total_tokens_post_pad = cumsum_val;
+      cumsum[i] =
+          cumsum[i - 1] + CEILDIV(expert_count, block_size) * block_size;
+    }
+    *total_tokens_post_pad = cumsum[num_experts];
  }

  __syncthreads();
@ -82,13 +67,6 @@ __global__ void moe_align_block_size_kernel(
      expert_ids[i / block_size] = threadIdx.x;
    }
  }
-
-  // Fill remaining expert_ids with 0
-  const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x;
-  const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
-  for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) {
-    expert_ids[i] = 0;
-  }
 }

 template <typename scalar_t>
@ -127,12 +105,7 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
    const scalar_t* __restrict__ topk_ids,
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
    int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
-    int32_t block_size, size_t numel, int32_t max_num_tokens_padded) {
-  // Initialize sorted_token_ids with numel
-  for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
-    sorted_token_ids[it] = numel;
-  }
-
+    int32_t block_size, size_t numel) {
  const size_t tid = threadIdx.x;
  const size_t stride = blockDim.x;

@ -180,13 +153,6 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
    }
  }

-  // Fill remaining expert_ids with 0
-  const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x;
-  const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
-  for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) {
-    expert_ids[i] = 0;
-  }
-
  for (size_t i = tid; i < numel; i += stride) {
    int32_t expert_id = topk_ids[i];
    int32_t rank_post_pad =
@ -213,17 +179,13 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
  int threads = 1024;
  threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;

-  // BlockScan uses 1024 threads and assigns one thread per expert.
-  TORCH_CHECK(padded_num_experts < 1024,
-              "padded_num_experts must be less than 1024");
-
  VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
      topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
        // calc needed amount of shared mem for `cumsum` tensors
        auto options_int =
            torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
        torch::Tensor cumsum_buffer =
-            torch::empty({num_experts + 1}, options_int);
+            torch::zeros({num_experts + 1}, options_int);
        bool small_batch_expert_mode =
            (topk_ids.numel() < 1024) && (num_experts <= 64);

@ -241,7 +203,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
              sorted_token_ids.data_ptr<int32_t>(),
              experts_ids.data_ptr<int32_t>(),
              num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
-              topk_ids.numel(), sorted_token_ids.size(0));
+              topk_ids.numel());
        } else {
          auto align_kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;

@ -255,8 +217,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
              experts_ids.data_ptr<int32_t>(),
              num_tokens_post_pad.data_ptr<int32_t>(), num_experts,
              padded_num_experts, experts_per_warp, block_size,
-              topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>(),
-              sorted_token_ids.size(0));
+              topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>());

          const int block_threads = std::min(256, (int)threads);
          const int num_blocks =
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@ -10,28 +10,32 @@

 void moe_permute(
    const torch::Tensor& input,                      // [n_token, hidden]
-    const torch::Tensor& topk_ids,                   // [n_token, topk]
+    const torch::Tensor& topk_weights,               //[n_token, topk]
+    torch::Tensor& topk_ids,                         // [n_token, topk]
    const torch::Tensor& token_expert_indices,       // [n_token, topk]
    const std::optional<torch::Tensor>& expert_map,  // [n_expert]
    int64_t n_expert, int64_t n_local_expert, int64_t topk,
    const std::optional<int64_t>& align_block_size,
-    torch::Tensor& permuted_input,             // [permuted_size, hidden]
+    torch::Tensor&
+        permuted_input,  // [topk * n_token/align_block_size_m, hidden]
    torch::Tensor& expert_first_token_offset,  // [n_local_expert + 1]
-    torch::Tensor& inv_permuted_idx,           // [n_token, topk]
-    torch::Tensor& permuted_idx,               // [permute_size]
+    torch::Tensor& src_row_id2dst_row_id_map,  // [n_token, topk]
    torch::Tensor& m_indices) {                // [align_expand_m]
+  TORCH_CHECK(topk_weights.scalar_type() == at::ScalarType::Float,
+              "topk_weights must be float32");
  TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long,
              "expert_first_token_offset must be int64");
  TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
              "topk_ids must be int32");
  TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
              "token_expert_indices must be int32");
-  TORCH_CHECK(inv_permuted_idx.scalar_type() == at::ScalarType::Int,
-              "inv_permuted_idx must be int32");
+  TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
+              "src_row_id2dst_row_id_map must be int32");
  TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
              "expert_first_token_offset shape != n_local_expert+1")
-  TORCH_CHECK(inv_permuted_idx.sizes() == token_expert_indices.sizes(),
-              "token_expert_indices shape must be same as inv_permuted_idx");
+  TORCH_CHECK(
+      src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(),
+      "token_expert_indices shape must be same as src_row_id2dst_row_id_map");
  auto n_token = input.sizes()[0];
  auto n_hidden = input.sizes()[1];
  auto align_block_size_value =
@ -42,9 +46,8 @@ void moe_permute(
  auto sort_workspace = torch::empty(
      {sorter_size},
      torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
-  auto copy_topk_ids = topk_ids.clone();  // copy topk_ids for preprocess
  auto permuted_experts_id = torch::empty_like(topk_ids);
-  auto sorted_row_idx = torch::empty_like(inv_permuted_idx);
+  auto dst_row_id2src_row_id_map = torch::empty_like(src_row_id2dst_row_id_map);
  auto align_expert_first_token_offset =
      torch::zeros_like(expert_first_token_offset);

@ -64,22 +67,24 @@ void moe_permute(
    const int* expert_map_ptr = get_ptr<int>(expert_map.value());
    valid_num_ptr =
        get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
-    preprocessTopkIdLauncher(get_ptr<int>(copy_topk_ids), n_token * topk,
+    preprocessTopkIdLauncher(get_ptr<int>(topk_ids), n_token * topk,
                             expert_map_ptr, n_expert, stream);
  }
  // expert sort topk expert id and scan expert id get expert_first_token_offset
-  sortAndScanExpert(
-      get_ptr<int>(copy_topk_ids), get_ptr<int>(token_expert_indices),
-      get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
-      get_ptr<int64_t>(expert_first_token_offset), n_token, n_expert,
-      n_local_expert, topk, sorter, get_ptr<int>(sort_workspace), stream);
+  sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices),
+                    get_ptr<int>(permuted_experts_id),
+                    get_ptr<int>(dst_row_id2src_row_id_map),
+                    get_ptr<int64_t>(expert_first_token_offset), n_token,
+                    n_expert, n_local_expert, topk, sorter,
+                    get_ptr<int>(sort_workspace), stream);

  // dispatch expandInputRowsKernelLauncher
  MOE_DISPATCH(input.scalar_type(), [&] {
    expandInputRowsKernelLauncher<scalar_t>(
        get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
-        get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
-        get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
+        get_ptr<float>(topk_weights), get_ptr<int>(permuted_experts_id),
+        get_ptr<int>(dst_row_id2src_row_id_map),
+        get_ptr<int>(src_row_id2dst_row_id_map),
        get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
        n_hidden, topk, n_local_expert, align_block_size_value, stream);
  });
@ -96,34 +101,32 @@ void moe_permute(
 }

 void moe_unpermute(
-    const torch::Tensor& permuted_hidden_states,  // [n_token * topk, hidden]
-    const torch::Tensor& topk_weights,            // [n_token, topk]
-    const torch::Tensor& inv_permuted_idx,        // [n_token, topk]
-    const std::optional<torch::Tensor>&
-        expert_first_token_offset,  // [n_local_expert+1]
-    int64_t topk,
+    const torch::Tensor& permuted_hidden_states,     // [n_token * topk, hidden]
+    const torch::Tensor& topk_weights,               // [n_token, topk]
+    const torch::Tensor& topk_ids,                   // [n_token, topk]
+    const torch::Tensor& src_row_id2dst_row_id_map,  // [n_token, topk]
+    const torch::Tensor& expert_first_token_offset,  // [n_local_expert+1]
+    int64_t n_expert, int64_t n_local_expert, int64_t topk,
    torch::Tensor& hidden_states  // [n_token, hidden]
 ) {
+  TORCH_CHECK(src_row_id2dst_row_id_map.sizes() == topk_ids.sizes(),
+              "topk_ids shape must be same as src_row_id2dst_row_id_map");
+  TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
+              "topk_ids must be int32");
  TORCH_CHECK(
      permuted_hidden_states.scalar_type() == hidden_states.scalar_type(),
-      "permuted_hidden_states dtype must be same as hidden_states");
+      "permuted_hidden_states dtype must be same as hidden_states");
  auto n_token = hidden_states.size(0);
  auto n_hidden = hidden_states.size(1);
  auto stream = at::cuda::getCurrentCUDAStream().stream();
-
-  int64_t const* valid_ptr = nullptr;
-  if (expert_first_token_offset.has_value()) {
-    int n_local_expert = expert_first_token_offset.value().size(0) - 1;
-    valid_ptr =
-        get_ptr<int64_t>(expert_first_token_offset.value()) + n_local_expert;
-  }
-
+  const int64_t* valid_ptr =
+      get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
  MOE_DISPATCH(hidden_states.scalar_type(), [&] {
    finalizeMoeRoutingKernelLauncher<scalar_t, scalar_t>(
        get_ptr<scalar_t>(permuted_hidden_states),
        get_ptr<scalar_t>(hidden_states), get_ptr<float>(topk_weights),
-        get_ptr<int>(inv_permuted_idx), n_token, n_hidden, topk, valid_ptr,
-        stream);
+        get_ptr<int>(src_row_id2dst_row_id_map), get_ptr<int>(topk_ids),
+        n_token, n_hidden, topk, valid_ptr, stream);
  });
 }

--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
@ -177,7 +177,7 @@ __global__ void getMIndicesKernel(int64_t* expert_first_token_offset,
  int tidx = threadIdx.x;
  extern __shared__ int64_t smem_expert_first_token_offset[];
  for (int i = tidx; i <= num_local_expert; i += blockDim.x) {
-    smem_expert_first_token_offset[i] = __ldg(expert_first_token_offset + i);
+    smem_expert_first_token_offset[i] = __ldg(expert_first_token_offset + i);
  }
  __syncthreads();
  auto last_token_offset = smem_expert_first_token_offset[eidx + 1];
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
@ -57,19 +57,31 @@ void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,

 template <typename T>
 void expandInputRowsKernelLauncher(
-    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
+    T const* unpermuted_input, T* permuted_output,
+    const float* unpermuted_scales, int* sorted_experts,
    int const* expanded_dest_row_to_expanded_source_row,
-    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
+    int* expanded_source_row_to_expanded_dest_row,
    int64_t* expert_first_token_offset, int64_t const num_rows,
    int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
    int num_local_experts, const int& align_block_size, cudaStream_t stream);

+// Final kernel to unpermute and scale
+// This kernel unpermutes the original data, does the k-way reduction and
+// performs the final skip connection.
+template <typename T, typename OutputType, bool CHECK_SKIPPED>
+__global__ void finalizeMoeRoutingKernel(
+    T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
+    float const* scales, int const* expanded_source_row_to_expanded_dest_row,
+    int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
+    int64_t const* num_valid_ptr);
+
 template <class T, class OutputType>
 void finalizeMoeRoutingKernelLauncher(
    T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
    float const* scales, int const* expanded_source_row_to_expanded_dest_row,
-    int64_t const num_rows, int64_t const cols, int64_t const k,
-    int64_t const* num_valid_ptr, cudaStream_t stream);
+    int const* expert_for_source_row, int64_t const num_rows,
+    int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
+    cudaStream_t stream);

 void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
                              const int* expert_map_ptr, int num_experts,
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl
@ -2,9 +2,10 @@

 template <typename T, bool CHECK_SKIPPED, bool ALIGN_BLOCK_SIZE>
 __global__ void expandInputRowsKernel(
-    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
+    T const* unpermuted_input, T* permuted_output,
+    const float* unpermuted_scales, int* sorted_experts,
    int const* expanded_dest_row_to_expanded_source_row,
-    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
+    int* expanded_source_row_to_expanded_dest_row,
    int64_t* expert_first_token_offset, int64_t const num_rows,
    int64_t const* num_dest_rows, int64_t const cols, int64_t k,
    int num_local_experts, int align_block_size) {
@ -53,10 +54,6 @@ __global__ void expandInputRowsKernel(
    assert(expanded_dest_row <= INT32_MAX);
    expanded_source_row_to_expanded_dest_row[expanded_source_row] =
        static_cast<int>(expanded_dest_row);
-    // skip non local expert token
-    if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
-      permuted_idx[expanded_dest_row] = expanded_source_row;
-    }
  }

  if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
@ -65,7 +62,7 @@ __global__ void expandInputRowsKernel(
    using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;

    // Duplicate and permute rows
-    int64_t const source_row = expanded_source_row / k;
+    int64_t const source_row = expanded_source_row % num_rows;

    auto const* source_row_ptr =
        reinterpret_cast<DataElem const*>(unpermuted_input + source_row * cols);
@ -85,9 +82,10 @@ __global__ void expandInputRowsKernel(

 template <typename T>
 void expandInputRowsKernelLauncher(
-    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
+    T const* unpermuted_input, T* permuted_output,
+    const float* unpermuted_scales, int* sorted_experts,
    int const* expanded_dest_row_to_expanded_source_row,
-    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
+    int* expanded_source_row_to_expanded_dest_row,
    int64_t* expert_first_token_offset, int64_t const num_rows,
    int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
    int num_local_experts, const int& align_block_size, cudaStream_t stream) {
@ -107,11 +105,11 @@ void expandInputRowsKernelLauncher(
  int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1);

  func<<<blocks, threads, smem_size, stream>>>(
-      unpermuted_input, permuted_output, sorted_experts,
+      unpermuted_input, permuted_output, unpermuted_scales, sorted_experts,
      expanded_dest_row_to_expanded_source_row,
-      expanded_source_row_to_expanded_dest_row, permuted_idx,
-      expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
-      num_local_experts, align_block_size);
+      expanded_source_row_to_expanded_dest_row, expert_first_token_offset,
+      num_rows, num_valid_tokens_ptr, cols, k, num_local_experts,
+      align_block_size);
 }

 template <class T, class U>
@ -130,9 +128,11 @@ template <typename T, typename OutputType, bool CHECK_SKIPPED>
 __global__ void finalizeMoeRoutingKernel(
    T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
    float const* scales, int const* expanded_source_row_to_expanded_dest_row,
-    int64_t const orig_cols, int64_t const k, int64_t const* num_valid_ptr) {
+    int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
+    int64_t const* num_valid_ptr) {
  assert(orig_cols % 4 == 0);
  int64_t const original_row = blockIdx.x;
+  int64_t const num_rows = gridDim.x;
  auto const offset = original_row * orig_cols;
  OutputType* reduced_row_ptr = reduced_unpermuted_output + offset;
  int64_t const num_valid = *num_valid_ptr;
@ -159,13 +159,14 @@ __global__ void finalizeMoeRoutingKernel(
    ComputeElem thread_output;
    thread_output.fill(0);
    for (int k_idx = 0; k_idx < k; ++k_idx) {
-      int64_t const expanded_original_row = original_row * k + k_idx;
+      int64_t const expanded_original_row = original_row + k_idx * num_rows;
      int64_t const expanded_permuted_row =
          expanded_source_row_to_expanded_dest_row[expanded_original_row];

      int64_t const k_offset = original_row * k + k_idx;
      float const row_scale = scales[k_offset];

+      // Check after row_rescale has accumulated
      if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) {
        continue;
      }
@ -188,8 +189,9 @@ template <class T, class OutputType>
 void finalizeMoeRoutingKernelLauncher(
    T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
    float const* scales, int const* expanded_source_row_to_expanded_dest_row,
-    int64_t const num_rows, int64_t const cols, int64_t const k,
-    int64_t const* num_valid_ptr, cudaStream_t stream) {
+    int const* expert_for_source_row, int64_t const num_rows,
+    int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
+    cudaStream_t stream) {
  int64_t const blocks = num_rows;
  int64_t const threads = 256;
  bool const check_finished = num_valid_ptr != nullptr;
@ -199,5 +201,6 @@ void finalizeMoeRoutingKernelLauncher(
  auto* const kernel = func_map[check_finished];
  kernel<<<blocks, threads, 0, stream>>>(
      expanded_permuted_rows, reduced_unpermuted_output, scales,
-      expanded_source_row_to_expanded_dest_row, cols, k, num_valid_ptr);
+      expanded_source_row_to_expanded_dest_row, expert_for_source_row, cols, k,
+      num_valid_ptr);
 }
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@ -190,8 +190,8 @@ __launch_bounds__(TPB) __global__ void moeTopK(
  2) This implementation assumes k is small, but will work for any k.
 */

-template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType>
-__launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
+template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, typename IndType>
+__launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
    void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices,
        int* source_rows, const int k, const int start_expert, const int end_expert)
 {
@ -209,12 +209,12 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__

    // Restrictions based on previous section.
    static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg");
-    static_assert(WARP_SIZE_PARAM % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
+    static_assert(WARP_SIZE % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
    static_assert(THREADS_PER_ROW == (THREADS_PER_ROW & -THREADS_PER_ROW), "THREADS_PER_ROW must be power of 2");
-    static_assert(THREADS_PER_ROW <= WARP_SIZE_PARAM, "THREADS_PER_ROW can be at most warp size");
+    static_assert(THREADS_PER_ROW <= WARP_SIZE, "THREADS_PER_ROW can be at most warp size");

    // We have NUM_EXPERTS elements per row. We specialize for small #experts
-    static constexpr int ELTS_PER_WARP = WARP_SIZE_PARAM * VPT;
+    static constexpr int ELTS_PER_WARP = WARP_SIZE * VPT;
    static constexpr int ROWS_PER_WARP = ELTS_PER_WARP / ELTS_PER_ROW;
    static constexpr int ROWS_PER_CTA = WARPS_PER_CTA * ROWS_PER_WARP;

@ -393,51 +393,41 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
 namespace detail
 {
 // Constructs some constants needed to partition the work across threads at compile time.
-template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM>
+template <int EXPERTS, int BYTES_PER_LDG>
 struct TopkConstants
 {
    static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
-    static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0, "");
-    static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM));
+    static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0, "");
+    static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE));
    static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
    static constexpr int THREADS_PER_ROW = EXPERTS / VPT;
-    static const int ROWS_PER_WARP = WARP_SIZE_PARAM / THREADS_PER_ROW;
+    static constexpr int ROWS_PER_WARP = WARP_SIZE / THREADS_PER_ROW;
 };
 } // namespace detail

-template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, typename IndType>
+template <int EXPERTS, int WARPS_PER_TB, typename IndType>
 void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
    int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
 {
    static constexpr std::size_t MAX_BYTES_PER_LDG = 16;

    static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
-    using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
+    using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG>;
    static constexpr int VPT = Constants::VPT;
    static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
    const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
    const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;

-    dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
-    topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM><<<num_blocks, block_dim, 0, stream>>>(
+    dim3 block_dim(WARP_SIZE, WARPS_PER_TB);
+    topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG><<<num_blocks, block_dim, 0, stream>>>(
        input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
 }

-#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB)                                \
-    switch (warpSize) {                                                          \
-        case 32:                                                                 \
-            topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32>(      \
-                gating_output, nullptr, topk_weights, topk_indices,              \
-                token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
-            break;                                                               \
-        case 64:                                                                 \
-            topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64>(      \
-                gating_output, nullptr, topk_weights, topk_indices,              \
-                token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
-            break;                                                               \
-        default:                                                                 \
-            TORCH_CHECK(false, "Unsupported warp size: ", warpSize);             \
-    }
+#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB)                       \
+    topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB>(         \
+        gating_output, nullptr, topk_weights, topk_indices,            \
+        token_expert_indices, num_tokens, topk, 0, num_experts,         \
+        stream);

 template <typename IndType>
 void topkGatingSoftmaxKernelLauncher(
@ -451,7 +441,6 @@ void topkGatingSoftmaxKernelLauncher(
    const int topk,
    cudaStream_t stream) {
    static constexpr int WARPS_PER_TB = 4;
-    auto warpSize = WARP_SIZE;
    switch (num_experts) {
        case 1:
            LAUNCH_SOFTMAX(1, WARPS_PER_TB);
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@ -56,17 +56,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      " -> Tensor");

  m.def(
-      "moe_permute(Tensor input, Tensor topk_ids,"
+      "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
      "Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
      "int n_local_expert,"
      "int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
-      "expert_first_token_offset, Tensor! inv_permuted_idx, Tensor! "
-      "permuted_idx, Tensor! m_indices)->()");
+      "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "
+      "m_indices)->()");

  m.def(
      "moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights,"
-      "Tensor inv_permuted_idx, Tensor? expert_first_token_offset, "
-      "int topk, Tensor! hidden_states)->()");
+      "Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor "
+      "expert_first_token_offset, int n_expert, int n_local_expert,int "
+      "topk, Tensor! hidden_states)->()");

  m.def("moe_permute_unpermute_supported() -> bool");
  m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -287,16 +287,6 @@ void scaled_fp4_experts_quant(
    torch::Tensor const& input, torch::Tensor const& input_global_scale,
    torch::Tensor const& input_offset_by_experts,
    torch::Tensor const& output_scale_offset_by_experts);
-
-void per_token_group_quant_fp8(const torch::Tensor& input,
-                               torch::Tensor& output_q, torch::Tensor& output_s,
-                               int64_t group_size, double eps, double fp8_min,
-                               double fp8_max, bool scale_ue8m0);
-
-void per_token_group_quant_int8(const torch::Tensor& input,
-                                torch::Tensor& output_q,
-                                torch::Tensor& output_s, int64_t group_size,
-                                double eps, double int8_min, double int8_max);
 #endif

 void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
@ -336,6 +326,22 @@ void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
                        const std::optional<torch::Tensor>& has_initial_state,
                        const torch::Tensor& ssm_states, int64_t pad_slot_id);

+void causal_conv1d_update(const at::Tensor& x, const at::Tensor& conv_state,
+                          const at::Tensor& weight,
+                          const std::optional<at::Tensor>& bias_,
+                          bool silu_activation,
+                          const std::optional<at::Tensor>& cache_seqlens_,
+                          const std::optional<at::Tensor>& conv_state_indices_,
+                          int64_t pad_slot_id);
+
+void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
+                       const std::optional<at::Tensor>& bias_,
+                       const std::optional<at::Tensor>& conv_states,
+                       const std::optional<at::Tensor>& query_start_loc,
+                       const std::optional<at::Tensor>& cache_indices,
+                       const std::optional<at::Tensor>& has_initial_state,
+                       bool silu_activation, int64_t pad_slot_id);
+
 using fptr_t = int64_t;
 fptr_t init_custom_ar(const std::vector<int64_t>& fake_ipc_ptrs,
                      torch::Tensor& rank_data, int64_t rank,
--- a/csrc/quantization/activation_kernels.cu
+++ b/csrc/quantization/activation_kernels.cu
@ -4,7 +4,7 @@

 #include <cmath>
 #include "core/math.hpp"
-#include "../cuda_compat.h"
+#include "cuda_compat.h"
 #include "dispatch_utils.h"

 #include "quantization/fp8/common.cuh"
--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@ -1,8 +1,6 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/all.h>

-#include "../per_token_group_quant_8bit.h"
-
 #include <cmath>

 #include "../../dispatch_utils.h"
@ -164,11 +162,10 @@ __global__ void dynamic_scaled_int8_quant_kernel(

  // calculate for absmax
  float thread_max = 0.f;
-  vectorize_read_with_alignment<16>(
-      row_in, hidden_size, tid, stride, [&] __device__(const scalar_t& src) {
-        const float v = fabsf(static_cast<float>(src));
-        thread_max = fmaxf(thread_max, v);
-      });
+  for (int i = tid; i < hidden_size; i += stride) {
+    const auto v = fabsf(static_cast<float>(row_in[i]));
+    thread_max = fmaxf(thread_max, v);
+  }
  using BlockReduce = cub::BlockReduce<float, 256>;
  __shared__ typename BlockReduce::TempStorage tmp;
  float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
@ -235,10 +232,9 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(

  // 1. calculate min & max
  MinMax thread_mm;
-  vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride,
-                                    [&] __device__(const scalar_t& src) {
-                                      thread_mm += static_cast<float>(src);
-                                    });
+  for (int i = tid; i < hidden_size; i += stride) {
+    thread_mm += static_cast<float>(row_in[i]);
+  }

  using BlockReduce = cub::BlockReduce<MinMax, 256>;
  __shared__ typename BlockReduce::TempStorage tmp;
@ -338,11 +334,3 @@ void dynamic_scaled_int8_quant(
        }
      });
 }
-
-void per_token_group_quant_int8(const torch::Tensor& input,
-                                torch::Tensor& output_q,
-                                torch::Tensor& output_s, int64_t group_size,
-                                double eps, double int8_min, double int8_max) {
-  per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
-                             int8_min, int8_max);
-}
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh
@ -51,8 +51,7 @@ struct cutlass_3x_gemm {
  // These are the minimum alignments needed for the kernels to compile
  static constexpr int AlignmentAB =
      128 / cutlass::sizeof_bits<ElementAB>::value;
-  static constexpr int AlignmentCD =
-      128 / cutlass::sizeof_bits<ElementD>::value;
+  static constexpr int AlignmentCD = 4;

  using CollectiveEpilogue =
      typename cutlass::epilogue::collective::CollectiveBuilder<
--- a/csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu
@ -1,373 +0,0 @@
-#include "core/registration.h"
-
-#include <torch/all.h>
-#include <cutlass/arch/arch.h>
-
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <c10/cuda/CUDAStream.h>
-
-#include "cute/tensor.hpp"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/epilogue/collective/default_epilogue.hpp"
-#include "cutlass/epilogue/thread/linear_combination.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/gemm/group_array_problem_shape.hpp"
-#include "cutlass/gemm/collective/collective_builder.hpp"
-#include "cutlass/epilogue/collective/collective_builder.hpp"
-#include "cutlass/gemm/device/gemm_universal_adapter.h"
-#include "cutlass/gemm/kernel/gemm_universal.hpp"
-
-#include "cutlass/util/command_line.h"
-#include "cutlass/util/distribution.h"
-#include "cutlass/util/host_tensor.h"
-#include "cutlass/util/packed_stride.hpp"
-#include "cutlass/util/tensor_view_io.h"
-#include "cutlass/util/reference/device/gemm.h"
-#include "cutlass/util/reference/device/tensor_compare.h"
-#include "cutlass/util/reference/host/tensor_fill.h"
-#include "cutlass/util/reference/host/gett.hpp"
-#include "cutlass/util/reference/host/tensor_norm.h"
-#include "cutlass/util/reference/host/tensor_compare.h"
-#include <cassert>
-
-using namespace cute;
-
-template <typename ElementAB, typename ElementC, typename ElementAccumulator,
-          typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
-__global__ void get_ggemm_starts(
-    int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets,
-    ElementC** out_offsets, ElementAccumulator** a_scale_offsets,
-    ElementAccumulator** b_scale_offsets, ElementAB* a_base_as_int,
-    ElementAB* b_base_as_int, ElementC* out_base_as_int,
-    ElementAccumulator* a_scale_base_as_int,
-    ElementAccumulator* b_scale_base_as_int, LayoutSFA* layout_sfa_base_as_int,
-    LayoutSFB* layout_sfb_base_as_int, int* problem_sizes) {
-  int expert_id = threadIdx.x;
-
-  if (expert_id >= gridDim.x * blockDim.x) {
-    return;
-  }
-
-  int m = problem_sizes[expert_id * 3];
-  int n = problem_sizes[expert_id * 3 + 1];
-  int k = problem_sizes[expert_id * 3 + 2];
-
-  int32_t expert_offset = expert_offsets[expert_id];
-  int a_stride = expert_offset * k;
-  int b_stride = expert_id * k * n;
-  int a_scale_stride = expert_offset * k / 128;
-  int b_scale_stride = expert_id * k * n / 128 / 128;
-
-  a_offsets[expert_id] = a_base_as_int + a_stride;
-  b_offsets[expert_id] = b_base_as_int + b_stride;
-  out_offsets[expert_id] = out_base_as_int + expert_offset * n;
-  a_scale_offsets[expert_id] = a_scale_base_as_int + a_scale_stride;
-  b_scale_offsets[expert_id] = b_scale_base_as_int + b_scale_stride;
-
-  LayoutSFA* layout_sfa_ptr = layout_sfa_base_as_int + expert_id;
-  LayoutSFB* layout_sfb_ptr = layout_sfb_base_as_int + expert_id;
-
-  *layout_sfa_ptr =
-      ScaleConfig::tile_atom_to_shape_SFA(cute::make_shape(m, n, k, 1));
-  *layout_sfb_ptr =
-      ScaleConfig::tile_atom_to_shape_SFB(cute::make_shape(m, n, k, 1));
-}
-
-#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE, LayoutSFA, LayoutSFB, \
-                                 ScaleConfig)                                 \
-  else if (out_tensors.dtype() == TENSOR_C_TYPE) {                            \
-    get_ggemm_starts<cutlass::float_e4m3_t, C_TYPE, float, LayoutSFA,         \
-                     LayoutSFB, ScaleConfig><<<1, num_experts, 0, stream>>>(  \
-        static_cast<int32_t*>(expert_offsets.data_ptr()),                     \
-        static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()),              \
-        static_cast<cutlass::float_e4m3_t**>(b_ptrs.data_ptr()),              \
-        static_cast<C_TYPE**>(out_ptrs.data_ptr()),                           \
-        static_cast<float**>(a_scales_ptrs.data_ptr()),                       \
-        static_cast<float**>(b_scales_ptrs.data_ptr()),                       \
-        static_cast<cutlass::float_e4m3_t*>(a_tensors.data_ptr()),            \
-        static_cast<cutlass::float_e4m3_t*>(b_tensors.data_ptr()),            \
-        static_cast<C_TYPE*>(out_tensors.data_ptr()),                         \
-        static_cast<float*>(a_scales.data_ptr()),                             \
-        static_cast<float*>(b_scales.data_ptr()),                             \
-        reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),                  \
-        reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr()),                  \
-        static_cast<int*>(problem_sizes.data_ptr()));                         \
-  }
-
-template <typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
-void run_get_ggemm_starts(
-    torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs,
-    torch::Tensor& b_ptrs, torch::Tensor& out_ptrs,
-    torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs,
-    torch::Tensor const& a_tensors, torch::Tensor const& b_tensors,
-    torch::Tensor out_tensors, torch::Tensor const& a_scales,
-    torch::Tensor const& b_scales, torch::Tensor const& layout_sfa,
-    torch::Tensor const& layout_sfb, torch::Tensor const& problem_sizes) {
-  TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
-  TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn);
-  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
-  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
-  TORCH_CHECK(out_tensors.size(1) % 128 == 0 or out_tensors.size(0) % 128 == 0);
-  TORCH_CHECK(a_tensors.size(1) % 128 == 0 or a_tensors.size(0) % 128 == 0);
-
-  int num_experts = (int)expert_offsets.size(0);
-  auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
-
-  if (false) {
-  }
-  __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t, LayoutSFA,
-                           LayoutSFB, ScaleConfig)
-  __CALL_GET_STARTS_KERNEL(torch::kFloat16, cutlass::half_t, LayoutSFA,
-                           LayoutSFB, ScaleConfig)
-  else {
-    TORCH_CHECK(false, "Unsupported output tensor type");
-  }
-}
-
-template <typename OutType, typename ScheduleConfig, typename LayoutD>
-void run_blockwise_scaled_group_mm(
-    torch::Tensor& out_ptrs, const torch::Tensor& a_ptrs,
-    const torch::Tensor& b_ptrs, const torch::Tensor& a_scales_ptrs,
-    const torch::Tensor& b_scales_ptrs, const torch::Tensor& stride_a,
-    const torch::Tensor& stride_b, const torch::Tensor& stride_c,
-    const torch::Tensor& layout_sfa, const torch::Tensor& layout_sfb,
-    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
-  using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int, int, int>>;
-
-  // Types
-  using ElementA = cutlass::float_e4m3_t;
-  using ElementB = cutlass::float_e4m3_t;
-  using ElementC = OutType;
-  using ElementD = ElementC;
-  using ElementAccumulator = float;
-  using LayoutA = cutlass::layout::RowMajor;
-  using LayoutB = cutlass::layout::ColumnMajor;
-  using LayoutC = LayoutD;
-
-  // Alignments
-  static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
-  static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
-  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
-
-  using ArchTag = cutlass::arch::Sm100;
-  using OperatorClass = cutlass::arch::OpClassTensorOp;
-
-  using CollectiveEpilogue =
-      typename cutlass::epilogue::collective::CollectiveBuilder<
-          ArchTag, OperatorClass, typename ScheduleConfig::MmaTileShape,
-          typename ScheduleConfig::ClusterShape,
-          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
-          ElementAccumulator, void, LayoutC*, AlignmentC, ElementD, LayoutC*,
-          AlignmentC, typename ScheduleConfig::EpilogueSchedule>::CollectiveOp;
-
-  using CollectiveMainloop =
-      typename cutlass::gemm::collective::CollectiveBuilder<
-          ArchTag, OperatorClass, ElementA,
-          cute::tuple<LayoutA*, typename ScheduleConfig::LayoutSFA*>,
-          AlignmentA, ElementB,
-          cute::tuple<LayoutB*, typename ScheduleConfig::LayoutSFB*>,
-          AlignmentB, ElementAccumulator, typename ScheduleConfig::MmaTileShape,
-          typename ScheduleConfig::ClusterShape,
-          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
-              sizeof(typename CollectiveEpilogue::SharedStorage))>,
-          typename ScheduleConfig::KernelSchedule>::CollectiveOp;
-
-  using GemmKernel =
-      cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
-                                           CollectiveEpilogue, void>;
-
-  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
-  using StrideA = typename Gemm::GemmKernel::InternalStrideA;
-  using StrideB = typename Gemm::GemmKernel::InternalStrideB;
-  using StrideC = typename Gemm::GemmKernel::InternalStrideC;
-  using StrideD = typename Gemm::GemmKernel::InternalStrideD;
-
-  using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
-  int num_experts = (int)expert_offsets.size(0);
-
-  Gemm gemm_op;
-
-  // Mainloop Arguments
-  typename GemmKernel::MainloopArguments mainloop_args{
-      static_cast<const ElementA**>(a_ptrs.data_ptr()),
-      static_cast<StrideA*>(stride_a.data_ptr()),
-      static_cast<const ElementB**>(b_ptrs.data_ptr()),
-      static_cast<StrideB*>(stride_b.data_ptr()),
-      static_cast<const ElementAccumulator**>(a_scales_ptrs.data_ptr()),
-      reinterpret_cast<typename ScheduleConfig::LayoutSFA*>(
-          layout_sfa.data_ptr()),
-      static_cast<const ElementAccumulator**>(b_scales_ptrs.data_ptr()),
-      reinterpret_cast<typename ScheduleConfig::LayoutSFB*>(
-          layout_sfb.data_ptr())};
-
-  int device_id = a_ptrs.device().index();
-  static const cutlass::KernelHardwareInfo hw_info{
-      device_id, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
-                     device_id)};
-
-  // Epilogue Arguments
-  typename GemmKernel::EpilogueArguments epilogue_args{
-      {},  // epilogue.thread
-      nullptr,
-      static_cast<StrideC*>(stride_c.data_ptr()),
-      static_cast<ElementD**>(out_ptrs.data_ptr()),
-      static_cast<StrideC*>(stride_c.data_ptr())};
-
-  UnderlyingProblemShape* problem_sizes_as_shapes =
-      static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
-
-  // Gemm Arguments
-  typename GemmKernel::Arguments args{
-      cutlass::gemm::GemmUniversalMode::kGrouped,
-      {num_experts, problem_sizes_as_shapes, nullptr},
-      mainloop_args,
-      epilogue_args,
-      hw_info};
-
-  at::cuda::CUDAGuard device_guard{(char)a_ptrs.device().index()};
-  const cudaStream_t stream =
-      at::cuda::getCurrentCUDAStream(a_ptrs.get_device());
-
-  auto can_implement_status = gemm_op.can_implement(args);
-  TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
-              "Failed to implement GEMM");
-
-  size_t workspace_size = gemm_op.get_workspace_size(args);
-  auto const workspace_options =
-      torch::TensorOptions().dtype(torch::kUInt8).device(a_ptrs.device());
-  auto workspace = torch::empty(workspace_size, workspace_options);
-
-  auto status = gemm_op.initialize(args, workspace.data_ptr(), stream);
-  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");
-
-  status = gemm_op.run(stream);
-  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
-}
-
-template <typename OutType>
-void blockwise_scaled_group_mm_dispatch_shape(
-    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
-    const torch::Tensor& scales_a, const torch::Tensor& scales_b,
-    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
-  struct MmaConfig {
-    using ElementA = cutlass::float_e4m3_t;
-    using KernelSchedule =
-        cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise1SmSm100;
-    using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
-    using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
-        1, 128, 128, cute::UMMA::Major::K, cute::UMMA::Major::K>;
-    using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
-    using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
-    using LayoutC = cutlass::layout::RowMajor;
-    using MmaTileShape = Shape<_128, _128, _128>;
-    using ClusterShape = Shape<_1, _1, _1>;
-  };
-
-  int num_experts = (int)expert_offsets.size(0);
-
-  auto a_ptrs = torch::empty(
-      {num_experts},
-      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
-  auto b_ptrs = torch::empty(
-      {num_experts},
-      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
-  auto out_ptrs = torch::empty(
-      {num_experts},
-      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
-  auto a_scales_ptrs = torch::empty(
-      {num_experts},
-      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
-  auto b_scales_ptrs = torch::empty(
-      {num_experts},
-      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
-
-  auto layout_sfa = torch::empty(
-      {num_experts, 5},
-      torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
-  auto layout_sfb = torch::empty(
-      {num_experts, 5},
-      torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
-
-  auto stride_a = torch::full(
-      {num_experts}, a.size(1),
-      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
-  auto stride_b = torch::full(
-      {num_experts}, a.size(1),
-      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
-  auto stride_c = torch::full(
-      {num_experts}, output.size(1),
-      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
-
-  torch::TensorOptions options_int =
-      torch::TensorOptions().dtype(torch::kInt64).device(a.device());
-
-  run_get_ggemm_starts<typename MmaConfig::LayoutSFA,
-                       typename MmaConfig::LayoutSFB,
-                       typename MmaConfig::ScaleConfig>(
-      expert_offsets, a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, a,
-      b, output, scales_a, scales_b, layout_sfa, layout_sfb, problem_sizes);
-
-  run_blockwise_scaled_group_mm<OutType, MmaConfig,
-                                typename MmaConfig::LayoutC>(
-      out_ptrs, a_ptrs, b_ptrs, a_scales_ptrs, b_scales_ptrs, stride_a,
-      stride_b, stride_c, layout_sfa, layout_sfb, problem_sizes,
-      expert_offsets);
-}
-
-void cutlass_blockwise_scaled_grouped_mm(
-    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
-    const torch::Tensor& scales_a, const torch::Tensor& scales_b,
-    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
-  TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
-  TORCH_CHECK(problem_sizes.size(1) == 3,
-              "problem_sizes must have shape (num_experts, 3)");
-  TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
-              "Number of experts in problem_sizes must match expert_offsets");
-  TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
-              "problem_sizes must be int32");
-  TORCH_CHECK(a.scalar_type() == torch::kFloat8_e4m3fn,
-              "a must be kFloat8_e4m3fn");
-  TORCH_CHECK(b.scalar_type() == torch::kFloat8_e4m3fn,
-              "b must be kFloat8_e4m3fn");
-  TORCH_CHECK(output.scalar_type() == torch::kBFloat16 ||
-                  output.scalar_type() == torch::kHalf,
-              "output must be bfloat16 or half");
-  TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32,
-              "scales_a must be float32");
-  TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32,
-              "scales_b must be float32");
-  TORCH_CHECK(expert_offsets.scalar_type() == torch::kInt32,
-              "expert_offsets must be int32");
-
-  TORCH_CHECK(output.dim() == 2, "output must be 2D tensor");
-  TORCH_CHECK(a.dim() == 2, "a must be 2D tensor");
-  TORCH_CHECK(b.dim() == 3, "b must be 3D tensor");
-  TORCH_CHECK(scales_a.dim() == 2, "scales_a must be 2D tensor");
-  TORCH_CHECK(scales_b.dim() == 3, "scales_b must be 3D tensor");
-  TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
-  TORCH_CHECK(problem_sizes.size(1) == 3,
-              "problem_sizes must have shape (num_experts, 3)");
-  TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
-              "Number of experts in problem_sizes must match expert_offsets");
-  TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
-              "problem_sizes must be int32");
-  TORCH_CHECK(expert_offsets.dim() == 1, "expert_offsets must be 1D tensor");
-
-#if defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100
-  if (output.scalar_type() == torch::kBFloat16) {
-    blockwise_scaled_group_mm_dispatch_shape<cutlass::bfloat16_t>(
-        output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
-  } else if (output.scalar_type() == torch::kFloat16) {
-    blockwise_scaled_group_mm_dispatch_shape<cutlass::half_t>(
-        output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
-  } else {
-    TORCH_CHECK(false, "Unsupported output tensor type");
-  }
-#endif
-}
-
-TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
-  m.impl("cutlass_blockwise_scaled_grouped_mm",
-         &cutlass_blockwise_scaled_grouped_mm);
-}
--- a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu
@ -21,49 +21,27 @@ struct sm90_fp8_config_default {
      cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
  using TileShape = cute::Shape<cute::_64, cute::_256, cute::_128>;
  using ClusterShape = cute::Shape<cute::_1, cute::_2, cute::_1>;
-  using ArchTag = cutlass::arch::Sm90;

  using Cutlass3xGemm =
-      cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
-                            ClusterShape, KernelSchedule, EpilogueSchedule>;
+      cutlass_3x_group_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                            KernelSchedule, EpilogueSchedule>;
 };

 template <typename InType, typename OutType,
          template <typename, typename, typename> typename Epilogue>
-struct sm90_fp8_config_M4 {
-  // M in [1, 4]
+struct sm90_fp8_config_M16 {
+  // M in [1, 16]
  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
  using KernelSchedule =
      cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum;
  using EpilogueSchedule =
      cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
-  using TileShape = cute::Shape<cute::_128, cute::_16, cute::_128>;
-  using ClusterShape = cute::Shape<cute::_1, cute::_1, cute::_1>;
-  using ArchTag = cutlass::arch::Sm90;
+  using TileShape = cute::Shape<cute::_64, cute::_64, cute::_128>;
+  using ClusterShape = cute::Shape<cute::_1, cute::_4, cute::_1>;

  using Cutlass3xGemm =
-      cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
-                            ClusterShape, KernelSchedule, EpilogueSchedule,
-                            true>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm90_fp8_config_M64 {
-  // M in (4, 64]
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule =
-      cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum;
-  using EpilogueSchedule =
-      cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
-  using TileShape = cute::Shape<cute::_128, cute::_16, cute::_256>;
-  using ClusterShape = cute::Shape<cute::_2, cute::_1, cute::_1>;
-  using ArchTag = cutlass::arch::Sm90;
-
-  using Cutlass3xGemm =
-      cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
-                            ClusterShape, KernelSchedule, EpilogueSchedule,
-                            true>;
+      cutlass_3x_group_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                            KernelSchedule, EpilogueSchedule>;
 };

 template <typename InType, typename OutType,
@ -77,11 +55,10 @@ struct sm90_fp8_config_K8192 {
      cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
  using TileShape = cute::Shape<cute::_128, cute::_128, cute::_128>;
  using ClusterShape = cute::Shape<cute::_1, cute::_8, cute::_1>;
-  using ArchTag = cutlass::arch::Sm90;

  using Cutlass3xGemm =
-      cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
-                            ClusterShape, KernelSchedule, EpilogueSchedule>;
+      cutlass_3x_group_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                            KernelSchedule, EpilogueSchedule>;
 };

 template <typename InType, typename OutType,
@ -95,11 +72,10 @@ struct sm90_fp8_config_N8192 {
      cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
  using TileShape = cute::Shape<cute::_64, cute::_128, cute::_256>;
  using ClusterShape = cute::Shape<cute::_1, cute::_8, cute::_1>;
-  using ArchTag = cutlass::arch::Sm90;

  using Cutlass3xGemm =
-      cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
-                            ClusterShape, KernelSchedule, EpilogueSchedule>;
+      cutlass_3x_group_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                            KernelSchedule, EpilogueSchedule>;
 };

 template <typename InType, typename OutType>
@ -119,13 +95,14 @@ void run_cutlass_moe_mm_sm90(
  TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn,
              "B tensors must be of type float8_e4m3fn.");

+  TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn);
+
  using Cutlass3xGemmN8192 = typename sm90_fp8_config_N8192<
      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
  using Cutlass3xGemmK8192 = typename sm90_fp8_config_K8192<
      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
-  using Cutlass3xGemmM4 = typename sm90_fp8_config_M4<
-      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
-  using Cutlass3xGemmM64 = typename sm90_fp8_config_M64<
+  using Cutlass3xGemmM16 = typename sm90_fp8_config_M16<
      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
  using Cutlass3xGemmDefault = typename sm90_fp8_config_default<
      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
@ -134,18 +111,7 @@ void run_cutlass_moe_mm_sm90(
  uint32_t const n = out_tensors.size(1);
  uint32_t const k = a_tensors.size(1);

-  // Use swap_ab for M <= 64 by default to reduce padding
-  if (m <= 4) {
-    cutlass_group_gemm_caller<Cutlass3xGemmM4>(
-        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
-  } else if (m <= 64) {
-    cutlass_group_gemm_caller<Cutlass3xGemmM64>(
-        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
-  } else if (n >= 8192) {
+  if (n >= 8192) {
    cutlass_group_gemm_caller<Cutlass3xGemmN8192>(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
@ -155,6 +121,11 @@ void run_cutlass_moe_mm_sm90(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
        per_out_ch);
+  } else if (m <= 16) {
+    cutlass_group_gemm_caller<Cutlass3xGemmM16>(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
  } else {
    cutlass_group_gemm_caller<Cutlass3xGemmDefault>(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
--- a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh
+++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh
@ -18,34 +18,28 @@ using ProblemShape =
    cutlass::gemm::GroupProblemShape<cute::Shape<int, int, int>>;

 using ElementAccumulator = float;
+using ArchTag = cutlass::arch::Sm90;
 using OperatorClass = cutlass::arch::OpClassTensorOp;

 using LayoutA = cutlass::layout::RowMajor;
-using LayoutA_Transpose =
-    typename cutlass::layout::LayoutTranspose<LayoutA>::type;
 using LayoutB = cutlass::layout::ColumnMajor;
-using LayoutB_Transpose =
-    typename cutlass::layout::LayoutTranspose<LayoutB>::type;
-using LayoutD = cutlass::layout::RowMajor;
-using LayoutD_Transpose =
-    typename cutlass::layout::LayoutTranspose<LayoutD>::type;
-using LayoutC = LayoutD;
-using LayoutC_Transpose = LayoutD_Transpose;
+using LayoutC = cutlass::layout::RowMajor;

-template <typename ElementAB_, typename ElementC_, typename ArchTag_,
+template <typename ElementAB_, typename ElementC_,
          template <typename, typename, typename> typename Epilogue_,
          typename TileShape, typename ClusterShape, typename KernelSchedule,
-          typename EpilogueSchedule, bool swap_ab_ = false>
+          typename EpilogueSchedule>
 struct cutlass_3x_group_gemm {
-  static constexpr bool swap_ab = swap_ab_;
  using ElementAB = ElementAB_;
  using ElementC = void;
  using ElementD = ElementC_;
  using ElementAccumulator = float;
-  using ArchTag = ArchTag_;

  using Epilogue = Epilogue_<ElementAccumulator, ElementD, TileShape>;

+  using StrideC =
+      cute::remove_pointer_t<cute::Stride<int64_t, cute::Int<1>, cute::Int<0>>>;
+
  static constexpr int AlignmentAB =
      128 / cutlass::sizeof_bits<ElementAB>::value;
  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementD>::value;
@ -56,28 +50,21 @@ struct cutlass_3x_group_gemm {
      typename cutlass::epilogue::collective::CollectiveBuilder<
          ArchTag, OperatorClass, TileShape, ClusterShape,
          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
-          ElementAccumulator, ElementC,
-          conditional_t<swap_ab, LayoutC_Transpose*, LayoutC*>, AlignmentC,
-          ElementD, conditional_t<swap_ab, LayoutD_Transpose*, LayoutD*>,
-          AlignmentC, EpilogueSchedule, EVTCompute>::CollectiveOp;
+          ElementAccumulator, ElementC, LayoutC*, AlignmentC, ElementD,
+          LayoutC*, AlignmentC, EpilogueSchedule, EVTCompute>::CollectiveOp;

  static constexpr size_t CEStorageSize =
      sizeof(typename CollectiveEpilogue::SharedStorage);
  using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout<
      static_cast<int>(CEStorageSize)>;

-  using CollectiveMainloop = conditional_t<
-      swap_ab,
-      typename cutlass::gemm::collective::CollectiveBuilder<
-          ArchTag, OperatorClass, ElementAB, LayoutB_Transpose*, AlignmentAB,
-          ElementAB, LayoutA_Transpose*, AlignmentAB, ElementAccumulator,
-          TileShape, ClusterShape, Stages, KernelSchedule>::CollectiveOp,
+  using CollectiveMainloop =
      typename cutlass::gemm::collective::CollectiveBuilder<
          ArchTag, OperatorClass, ElementAB, LayoutA*, AlignmentAB, ElementAB,
          LayoutB*, AlignmentAB, ElementAccumulator, TileShape, ClusterShape,
-          Stages, KernelSchedule>::CollectiveOp>;
+          Stages, KernelSchedule>::CollectiveOp;

-  using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
+  using KernelType = enable_sm90_only<cutlass::gemm::kernel::GemmUniversal<
      ProblemShape, CollectiveMainloop, CollectiveEpilogue>>;

  struct GemmKernel : public KernelType {};
@ -91,12 +78,12 @@ void cutlass_group_gemm_caller(
    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
    bool per_act_token, bool per_out_ch) {
-  static constexpr bool swap_ab = Gemm::swap_ab;
-
  using ElementAB = typename Gemm::ElementAB;
  using ElementD = typename Gemm::ElementD;

  int num_experts = static_cast<int>(expert_offsets.size(0));
+  int k_size = a_tensors.size(1);
+  int n_size = out_tensors.size(1);

  auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());

@ -123,47 +110,26 @@ void cutlass_group_gemm_caller(
          problem_sizes.data_ptr());
  ProblemShape prob_shape{num_experts, problem_sizes_as_shapes, nullptr};

-  typename GemmKernel::MainloopArguments mainloop_args;
-  if constexpr (swap_ab) {
-    mainloop_args = typename GemmKernel::MainloopArguments{
-        static_cast<const ElementAB**>(b_ptrs.data_ptr()),
-        static_cast<StrideB*>(b_strides.data_ptr()),
-        static_cast<const ElementAB**>(a_ptrs.data_ptr()),
-        static_cast<StrideA*>(a_strides.data_ptr())};
-  } else {
-    mainloop_args = typename GemmKernel::MainloopArguments{
-        static_cast<const ElementAB**>(a_ptrs.data_ptr()),
-        static_cast<StrideA*>(a_strides.data_ptr()),
-        static_cast<const ElementAB**>(b_ptrs.data_ptr()),
-        static_cast<StrideB*>(b_strides.data_ptr())};
-  }
+  typename GemmKernel::MainloopArguments mainloop_args{
+      static_cast<const ElementAB**>(a_ptrs.data_ptr()),
+      static_cast<StrideA*>(a_strides.data_ptr()),
+      static_cast<const ElementAB**>(b_ptrs.data_ptr()),
+      static_cast<StrideB*>(b_strides.data_ptr())};

  // Currently, we are only able to do broadcast on either all or none a_scales
  // and on either all or none b_scales
  typename GemmKernel::EpilogueArguments epilogue_args{
      Gemm::Epilogue::prepare_args(
-          swap_ab ? static_cast<const ElementAccumulator**>(
-                        b_scales_ptrs.data_ptr())
-                  : static_cast<const ElementAccumulator**>(
-                        a_scales_ptrs.data_ptr()),
-          swap_ab ? static_cast<const ElementAccumulator**>(
-                        a_scales_ptrs.data_ptr())
-                  : static_cast<const ElementAccumulator**>(
-                        b_scales_ptrs.data_ptr()),
-          swap_ab ? per_out_ch : per_act_token,
-          swap_ab ? per_act_token : per_out_ch),
+          static_cast<const ElementAccumulator**>(a_scales_ptrs.data_ptr()),
+          static_cast<const ElementAccumulator**>(b_scales_ptrs.data_ptr()),
+          per_act_token, per_out_ch),
      nullptr, static_cast<StrideC*>(c_strides.data_ptr()),
      static_cast<ElementD**>(out_ptrs.data_ptr()),
      static_cast<StrideC*>(c_strides.data_ptr())};

-  int device_id = a_tensors.device().index();
-  static const cutlass::KernelHardwareInfo hw_info{
-      device_id, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
-                     device_id)};
-
  typename GemmKernel::Arguments args{
      cutlass::gemm::GemmUniversalMode::kGrouped, prob_shape, mainloop_args,
-      epilogue_args, hw_info};
+      epilogue_args};

  using GemmOp = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  GemmOp gemm_op;
--- a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu
@ -1,140 +0,0 @@
-#include <cudaTypedefs.h>
-
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/all.h>
-
-#include "cutlass/cutlass.h"
-#include "grouped_mm_c3x.cuh"
-
-using namespace cute;
-
-namespace {
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm100_fp8_config_default {  // used by run_cutlass_moe_mm_sm100() when m > 64 and n < 8192
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());  // e4m3 input only
-  using KernelSchedule =
-      cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
-  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
-  using TileShape = cute::Shape<cute::_128, cute::_256, cute::_128>;
-  using ClusterShape = cute::Shape<cute::_1, cute::_1, cute::_1>;
-  using ArchTag = cutlass::arch::Sm100;
-  // Grouped GEMM type assembled from the schedule, tile and cluster choices above.
-  using Cutlass3xGemm =
-      cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
-                            ClusterShape, KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm100_fp8_config_M64 {
-  // M in [1,64]: selected by run_cutlass_moe_mm_sm100() when m <= 64.
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());  // e4m3 input only
-  using KernelSchedule =
-      cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
-  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
-  using TileShape = cute::Shape<cute::_128, cute::_16, cute::_128>;
-  using ClusterShape = cute::Shape<cute::_1, cute::_1, cute::_1>;
-  using ArchTag = cutlass::arch::Sm100;
-  // Same builder as the other configs, but with one extra trailing template argument.
-  using Cutlass3xGemm =
-      cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
-                            ClusterShape, KernelSchedule, EpilogueSchedule,
-                            true>;  // NOTE(review): meaning is defined by cutlass_3x_group_gemm (not visible here); presumably swap_ab - confirm
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm100_fp8_config_N8192 {
-  // N in [8192, inf): selected by run_cutlass_moe_mm_sm100() when m > 64 and n >= 8192.
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());  // e4m3 input only
-  using KernelSchedule =
-      cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
-  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
-  using TileShape = cute::Shape<cute::_128, cute::_256, cute::_128>;
-  using ClusterShape = cute::Shape<cute::_2, cute::_1, cute::_1>;  // the only config here with a 2x1x1 cluster and the 2Sm schedules
-  using ArchTag = cutlass::arch::Sm100;
-  // Grouped GEMM type assembled from the schedule, tile and cluster choices above.
-  using Cutlass3xGemm =
-      cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
-                            ClusterShape, KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType>
-void run_cutlass_moe_mm_sm100(  // validates the inputs, then picks one of three tile configs from m and n
-    torch::Tensor& out_tensors, torch::Tensor const& a_tensors,
-    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
-    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
-    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch) {
-  TORCH_CHECK(a_tensors.size(0) > 0, "No input A tensors provided.");
-  TORCH_CHECK(b_tensors.size(0) > 0, "No input B tensors provided.");
-  TORCH_CHECK(out_tensors.size(0) > 0, "No output tensors provided.");
-  // A and B must both be fp8 e4m3; any other dtype is rejected before dispatch.
-  TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn,
-              "A tensors must be of type float8_e4m3fn.");
-  TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn,
-              "B tensors must be of type float8_e4m3fn.");
-  // The three candidate kernels, all instantiated with vllm::c3x::ScaledEpilogueArray.
-  using Cutlass3xGemmDefault = typename sm100_fp8_config_default<
-      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
-  using Cutlass3xGemmN8192 = typename sm100_fp8_config_N8192<
-      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
-  using Cutlass3xGemmM64 = typename sm100_fp8_config_M64<
-      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
-  // Dispatch keys. NOTE(review): size() values are narrowed into uint32_t here - confirm sizes stay in range.
-  uint32_t const m = a_tensors.size(0);
-  uint32_t const n = out_tensors.size(1);
-  // Priority order: small m first, then large n, otherwise the default config.
-  if (m <= 64) {
-    cutlass_group_gemm_caller<Cutlass3xGemmM64>(
-        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
-  } else if (n >= 8192) {
-    cutlass_group_gemm_caller<Cutlass3xGemmN8192>(
-        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
-  } else {
-    cutlass_group_gemm_caller<Cutlass3xGemmDefault>(
-        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
-  }
-}
-}  // namespace
-
-void dispatch_moe_mm_sm100(  // selects the output element type from out_tensors.dtype()
-    torch::Tensor& out_tensors, torch::Tensor const& a_tensors,
-    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
-    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
-    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch) {
-  if (out_tensors.dtype() == torch::kBFloat16) {
-    run_cutlass_moe_mm_sm100<cutlass::float_e4m3_t, cutlass::bfloat16_t>(
-        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
-  } else {  // NOTE(review): every non-bf16 output dtype takes the half_t path; no explicit fp16 check is visible here
-    run_cutlass_moe_mm_sm100<cutlass::float_e4m3_t, cutlass::half_t>(
-        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
-  }
-}
-
-void cutlass_moe_mm_sm100(  // entry point declared in scaled_mm_entry.cu; forwards every argument unchanged
-    torch::Tensor& out_tensors, torch::Tensor const& a_tensors,
-    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
-    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
-    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch) {
-  dispatch_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
-                        expert_offsets, problem_sizes, a_strides, b_strides,
-                        c_strides, per_act_token, per_out_ch);
-}
--- a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
@ -6,11 +6,8 @@
 #include <iostream>

 constexpr uint64_t THREADS_PER_EXPERT = 512;
-// threshold must match the dispatch logic in run_cutlass_moe_mm_sm90()
-constexpr int SWAP_AB_THRESHOLD = 64;

-template <bool SWAP_AB>
-__global__ void compute_problem_sizes(const int32_t* __restrict__ topk_ids,
+__global__ void compute_problem_sizes(const uint32_t* __restrict__ topk_ids,
                                      int32_t* problem_sizes1,
                                      int32_t* problem_sizes2,
                                      int32_t* atomic_buffer,
@ -27,56 +24,45 @@ __global__ void compute_problem_sizes(const int32_t* __restrict__ topk_ids,

  if (threadIdx.x == 0) {
    int final_occurrences = atomic_buffer[expert_id];
-    if constexpr (!SWAP_AB) {
-      problem_sizes1[expert_id * 3] = final_occurrences;
-      problem_sizes1[expert_id * 3 + 1] = 2 * n;
-      problem_sizes1[expert_id * 3 + 2] = k;
-      problem_sizes2[expert_id * 3] = final_occurrences;
-      problem_sizes2[expert_id * 3 + 1] = k;
-      problem_sizes2[expert_id * 3 + 2] = n;
-    } else {
-      problem_sizes1[expert_id * 3] = 2 * n;
-      problem_sizes1[expert_id * 3 + 1] = final_occurrences;
-      problem_sizes1[expert_id * 3 + 2] = k;
-      problem_sizes2[expert_id * 3] = k;
-      problem_sizes2[expert_id * 3 + 1] = final_occurrences;
-      problem_sizes2[expert_id * 3 + 2] = n;
-    }
+    problem_sizes1[expert_id * 3] = final_occurrences;
+    problem_sizes1[expert_id * 3 + 1] = 2 * n;
+    problem_sizes1[expert_id * 3 + 2] = k;
+    problem_sizes2[expert_id * 3] = final_occurrences;
+    problem_sizes2[expert_id * 3 + 1] = k;
+    problem_sizes2[expert_id * 3 + 2] = n;
  }
 }

 __global__ void compute_expert_offsets(  // serial running sum; launched <<<1, 1>>> by get_cutlass_moe_mm_data_caller
    const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets,
-    int32_t* atomic_buffer, const int num_experts, const bool swap_ab) {
+    int32_t* atomic_buffer, const int num_experts) {
  int32_t tot_offset = 0;
  expert_offsets[0] = 0;  // expert_offsets receives num_experts + 1 entries in total
  for (int i = 0; i < num_experts; ++i) {
    atomic_buffer[i] = tot_offset;  // each expert's start offset is also stored in atomic_buffer
-    tot_offset += swap_ab ? problem_sizes1[i * 3 + 1] : problem_sizes1[i * 3];
+    tot_offset += problem_sizes1[i * 3];  // element 0 of the triple is the per-expert count written by compute_problem_sizes
    expert_offsets[i + 1] = tot_offset;
  }
 }

 __global__ void compute_expert_blockscale_offsets(  // serial running sums; launched <<<1, 1>>> when blockscale_offsets is provided
    const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets,
-    int32_t* blockscale_offsets, int32_t* atomic_buffer, const int num_experts,
-    const bool swap_ab) {
+    int32_t* blockscale_offsets, int32_t* atomic_buffer,
+    const int num_experts) {
  int32_t tot_offset = 0;
  int32_t tot_offset_round = 0;  // running sum of the counts after rounding each up to a multiple of 128
  expert_offsets[0] = 0;
  blockscale_offsets[0] = 0;
  for (int i = 0; i < num_experts; ++i) {
-    int32_t cur_offset =
-        swap_ab ? problem_sizes1[i * 3 + 1] : problem_sizes1[i * 3];
    atomic_buffer[i] = tot_offset;  // each expert's unpadded start offset is also stored in atomic_buffer
-    tot_offset += cur_offset;
+    tot_offset += problem_sizes1[i * 3];  // element 0 of the triple is the per-expert count
    expert_offsets[i + 1] = tot_offset;
-    tot_offset_round += (cur_offset + (128 - 1)) / 128 * 128;
+    tot_offset_round += (problem_sizes1[i * 3] + (128 - 1)) / 128 * 128;  // ceil to the next multiple of 128
    blockscale_offsets[i + 1] = tot_offset_round;
  }
 }

-__global__ void compute_arg_sorts(const int32_t* __restrict__ topk_ids,
+__global__ void compute_arg_sorts(const uint32_t* __restrict__ topk_ids,
                                  const int32_t* __restrict__ expert_offsets,
                                  int32_t* input_permutation,
                                  int32_t* output_permutation,
@ -116,44 +102,25 @@ void get_cutlass_moe_mm_data_caller(
  torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32);

  int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
-
-  // Swap-AB should be disabled for FP4 path
-  bool may_swap_ab = (!blockscale_offsets.has_value()) &&
-                     (topk_ids.numel() <= SWAP_AB_THRESHOLD);
-
-  if (may_swap_ab) {
-    compute_problem_sizes<true><<<num_experts, num_threads, 0, stream>>>(
-        static_cast<const int32_t*>(topk_ids.data_ptr()),
-        static_cast<int32_t*>(problem_sizes1.data_ptr()),
-        static_cast<int32_t*>(problem_sizes2.data_ptr()),
-        static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n,
-        k);
-  } else {
-    compute_problem_sizes<false><<<num_experts, num_threads, 0, stream>>>(
-        static_cast<const int32_t*>(topk_ids.data_ptr()),
-        static_cast<int32_t*>(problem_sizes1.data_ptr()),
-        static_cast<int32_t*>(problem_sizes2.data_ptr()),
-        static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n,
-        k);
-  }
-
+  compute_problem_sizes<<<num_experts, num_threads, 0, stream>>>(
+      static_cast<const uint32_t*>(topk_ids.data_ptr()),
+      static_cast<int32_t*>(problem_sizes1.data_ptr()),
+      static_cast<int32_t*>(problem_sizes2.data_ptr()),
+      static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n, k);
  if (blockscale_offsets.has_value()) {
-    // fp4 path
    compute_expert_blockscale_offsets<<<1, 1, 0, stream>>>(
        static_cast<const int32_t*>(problem_sizes1.data_ptr()),
        static_cast<int32_t*>(expert_offsets.data_ptr()),
        static_cast<int32_t*>(blockscale_offsets.value().data_ptr()),
-        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts,
-        may_swap_ab);
+        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
  } else {
    compute_expert_offsets<<<1, 1, 0, stream>>>(
        static_cast<const int32_t*>(problem_sizes1.data_ptr()),
        static_cast<int32_t*>(expert_offsets.data_ptr()),
-        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts,
-        may_swap_ab);
+        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
  }
  compute_arg_sorts<<<num_experts, num_threads, 0, stream>>>(
-      static_cast<const int32_t*>(topk_ids.data_ptr()),
+      static_cast<const uint32_t*>(topk_ids.data_ptr()),
      static_cast<const int32_t*>(expert_offsets.data_ptr()),
      static_cast<int32_t*>(input_permutation.data_ptr()),
      static_cast<int32_t*>(output_permutation.data_ptr()),
@ -193,4 +160,4 @@ void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
      static_cast<int32_t*>(problem_sizes2.data_ptr()),
      static_cast<const int32_t*>(expert_num_tokens.data_ptr()), padded_m, n,
      k);
-}
+}
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@ -41,16 +41,6 @@ void cutlass_moe_mm_sm90(

 #endif

-#if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
-void cutlass_moe_mm_sm100(
-    torch::Tensor& out_tensors, torch::Tensor const& a_tensors,
-    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
-    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
-    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch);
-#endif
-
 #if defined ENABLE_SCALED_MM_SM120 && ENABLE_SCALED_MM_SM120
 void cutlass_scaled_mm_sm120(torch::Tensor& c, torch::Tensor const& a,
                             torch::Tensor const& b,
@ -140,10 +130,10 @@ bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) {
  // and at least SM90 (Hopper)

 #if defined CUDA_VERSION
-  if (cuda_device_capability >= 100) {
-    return CUDA_VERSION >= 12080;
-  } else if (cuda_device_capability >= 90) {
+  if (cuda_device_capability >= 90 && cuda_device_capability < 100) {
    return CUDA_VERSION >= 12000;
+  } else if (cuda_device_capability >= 100) {
+    return CUDA_VERSION >= 12080;
  }
 #endif

@ -151,14 +141,11 @@ bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) {
 }

 bool cutlass_group_gemm_supported(int64_t cuda_device_capability) {
-  // CUTLASS grouped FP8 kernels need at least CUDA 12.3 and SM90 (Hopper)
-  // or CUDA 12.8 and SM100 (Blackwell)
+  // CUTLASS grouped FP8 kernels need at least CUDA 12.3
+  // and SM90 (Hopper)

 #if defined CUDA_VERSION
-  if (cuda_device_capability >= 100) {
-    return CUDA_VERSION >= 12080;
-  }
-  if (cuda_device_capability >= 90) {
+  if (cuda_device_capability == 90) {
    return CUDA_VERSION >= 12030;
  }
 #endif
@ -247,26 +234,16 @@ void cutlass_moe_mm(
    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
    bool per_act_token, bool per_out_ch) {
  int32_t version_num = get_sm_version_num();
-#if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
-  if (version_num >= 100) {
-    cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
-                         expert_offsets, problem_sizes, a_strides, b_strides,
-                         c_strides, per_act_token, per_out_ch);
-    return;
-  }
-#endif
 #if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
-  if (version_num >= 90) {
-    cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
-                        expert_offsets, problem_sizes, a_strides, b_strides,
-                        c_strides, per_act_token, per_out_ch);
-    return;
-  }
+  cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
+                      expert_offsets, problem_sizes, a_strides, b_strides,
+                      c_strides, per_act_token, per_out_ch);
+  return;
 #endif
  TORCH_CHECK_NOT_IMPLEMENTED(
      false,
      "No compiled cutlass_scaled_mm for CUDA device capability: ", version_num,
-      ". Required capability: 90 or 100");
+      ". Required capability: 90");
 }

 void get_cutlass_moe_mm_data(
--- a/csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu
@ -30,40 +30,35 @@

 #include "cutlass/util/packed_stride.hpp"

-#include "core/math.hpp"
-
 using namespace cute;

 #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
+// Kernel Perf config: per-output-type tile and cluster shapes read by Fp4GemmSm100<T>.
+template <typename T>
+struct KernelTraits;  // declared only; just the float, half_t and bfloat16_t specializations are defined

-// Configuration for M in (256, inf)
-struct sm100_fp4_config_default {
-  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
-  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_256, _256, _256>;
-  using ClusterShape = Shape<_2, _1, _1>;
-  using PerSmTileShape_MNK = Shape<_128, _256, _256>;
-};
-
-// Configuration for M in (16, 256]
-struct sm100_fp4_config_M256 {
-  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
-  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_256, _128, _256>;
-  using ClusterShape = Shape<_2, _1, _1>;
-  using PerSmTileShape_MNK = Shape<_128, _128, _256>;
-};
-
-// Configuration for M in [1, 16]
-struct sm100_fp4_config_M16 {
-  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
-  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_128, _128, _256>;
+template <>
+struct KernelTraits<float> {  // float output: same tile and cluster shapes as the removed M16 config
+  using MmaTileShape = Shape<_128, _128, _256>;
  using ClusterShape = Shape<_1, _1, _1>;
  using PerSmTileShape_MNK = Shape<_128, _128, _256>;
 };

-template <typename Config, typename OutType>
+template <>
+struct KernelTraits<cutlass::half_t> {  // half output: one fixed shape set, no longer chosen by M
+  using MmaTileShape = Shape<_256, _256, _256>;
+  using ClusterShape = Shape<_4, _4, _1>;  // NOTE(review): the removed per-M configs used 2x1x1 or 1x1x1 clusters
+  using PerSmTileShape_MNK = Shape<_128, _256, _256>;
+};
+
+template <>
+struct KernelTraits<cutlass::bfloat16_t> {  // bfloat16 output: identical shapes to the half_t specialization
+  using MmaTileShape = Shape<_256, _256, _256>;
+  using ClusterShape = Shape<_4, _4, _1>;
+  using PerSmTileShape_MNK = Shape<_128, _256, _256>;
+};
+
+template <typename T>
 struct Fp4GemmSm100 {
  // A matrix configuration
  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
@ -76,22 +71,21 @@ struct Fp4GemmSm100 {
  static constexpr int AlignmentB = 32;

  // C/D matrix configuration
-  using ElementD = OutType;
-  using ElementC = OutType;
+  using ElementD = T;
+  using ElementC = T;
  using LayoutCTag = cutlass::layout::RowMajor;
  using LayoutDTag = cutlass::layout::RowMajor;
  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
-
  // Kernel functional config
  using ElementAccumulator = float;
  using ArchTag = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;

-  // Use config's tile shapes
-  using MmaTileShape = typename Config::TileShape;
-  using ClusterShape = typename Config::ClusterShape;
-  using PerSmTileShape_MNK = typename Config::PerSmTileShape_MNK;
+  // Kernel Perf config
+  using MmaTileShape = typename KernelTraits<T>::MmaTileShape;
+  using ClusterShape = typename KernelTraits<T>::ClusterShape;
+  using PerSmTileShape_MNK = typename KernelTraits<T>::PerSmTileShape_MNK;

  using CollectiveEpilogue =
      typename cutlass::epilogue::collective::CollectiveBuilder<
@ -125,22 +119,22 @@ struct Fp4GemmSm100 {
  using LayoutD = decltype(cute::make_layout(make_shape(0, 0, 0), StrideD{}));
 };

-template <typename Config>
-typename Config::Gemm::Arguments args_from_options(
+template <typename T>
+typename T::Gemm::Arguments args_from_options(
    at::Tensor& D, at::Tensor const& A, at::Tensor const& B,
    at::Tensor const& A_sf, at::Tensor const& B_sf, at::Tensor const& alpha,
    int64_t M, int64_t N, int64_t K) {
-  using ElementA = typename Config::Gemm::ElementA;
-  using ElementB = typename Config::Gemm::ElementB;
+  using ElementA = typename T::Gemm::ElementA;
+  using ElementB = typename T::Gemm::ElementB;
  using ElementSFA = cutlass::float_ue4m3_t;
  using ElementSFB = cutlass::float_ue4m3_t;
-  using ElementD = typename Config::Gemm::ElementD;
+  using ElementD = typename T::Gemm::ElementD;
  using ElementCompute = float;
-  using StrideA = typename Config::StrideA;
-  using StrideB = typename Config::StrideB;
-  using StrideD = typename Config::StrideD;
-  using Sm100BlkScaledConfig = typename Config::Gemm::GemmKernel::
-      CollectiveMainloop::Sm1xxBlkScaledConfig;
+  using StrideA = typename T::StrideA;
+  using StrideB = typename T::StrideB;
+  using StrideD = typename T::StrideD;
+  using Sm100BlkScaledConfig =
+      typename T::Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;

  int m = static_cast<int>(M);
  int n = static_cast<int>(N);
@ -154,7 +148,7 @@ typename Config::Gemm::Arguments args_from_options(
  auto layout_SFB = Sm100BlkScaledConfig::tile_atom_to_shape_SFB(
      cute::make_shape(m, n, k, 1));

-  typename Config::Gemm::Arguments arguments{
+  typename T::Gemm::Arguments arguments{
      cutlass::gemm::GemmUniversalMode::kGemm,
      {m, n, k, 1},
      {// Mainloop arguments
@ -173,17 +167,17 @@ typename Config::Gemm::Arguments args_from_options(
  return arguments;
 }

-template <typename Config>
+template <typename T>
 void runGemm(at::Tensor& D, at::Tensor const& A, at::Tensor const& B,
             at::Tensor const& A_sf, at::Tensor const& B_sf,
             at::Tensor const& alpha, int64_t m, int64_t n, int64_t k,
             cudaStream_t stream) {
-  typename Config::Gemm gemm;
+  typename Fp4GemmSm100<T>::Gemm gemm;

  auto arguments =
-      args_from_options<Config>(D, A, B, A_sf, B_sf, alpha, m, n, k);
+      args_from_options<Fp4GemmSm100<T>>(D, A, B, A_sf, B_sf, alpha, m, n, k);

-  size_t workspace_size = Config::Gemm::get_workspace_size(arguments);
+  size_t workspace_size = Fp4GemmSm100<T>::Gemm::get_workspace_size(arguments);
  auto const workspace_options =
      torch::TensorOptions().dtype(torch::kUInt8).device(A.device());
  auto workspace = torch::empty(workspace_size, workspace_options);
@ -194,40 +188,12 @@ void runGemm(at::Tensor& D, at::Tensor const& A, at::Tensor const& B,

  CUTLASS_CHECK(gemm.run(arguments, workspace.data_ptr(), stream));
 }
-
-// Dispatch function to select appropriate config based on M
-template <typename OutType>
-void cutlass_fp4_gemm_dispatch(torch::Tensor& D, torch::Tensor const& A,
-                               torch::Tensor const& B,
-                               torch::Tensor const& A_sf,
-                               torch::Tensor const& B_sf,
-                               torch::Tensor const& alpha, int64_t m, int64_t n,
-                               int64_t k, cudaStream_t stream) {
-  uint32_t const mp2 = std::max(static_cast<uint32_t>(16), next_pow_2(m));
-
-  if (mp2 <= 16) {
-    // m in [1, 16]
-    runGemm<Fp4GemmSm100<sm100_fp4_config_M16, OutType>>(
-        D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
-  } else if (mp2 <= 256) {
-    // m in (16, 256]
-    runGemm<Fp4GemmSm100<sm100_fp4_config_M256, OutType>>(
-        D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
-  } else {
-    // m in (256, inf)
-    runGemm<Fp4GemmSm100<sm100_fp4_config_default, OutType>>(
-        D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
-  }
-}
-
 #else
-template <typename OutType>
-void cutlass_fp4_gemm_dispatch(torch::Tensor& D, torch::Tensor const& A,
-                               torch::Tensor const& B,
-                               torch::Tensor const& A_sf,
-                               torch::Tensor const& B_sf,
-                               torch::Tensor const& alpha, int64_t m, int64_t n,
-                               int64_t k, cudaStream_t stream) {
+template <typename T>
+void runGemm(at::Tensor& D, at::Tensor const& A, at::Tensor const& B,
+             at::Tensor const& A_sf, at::Tensor const& B_sf,
+             at::Tensor const& alpha, int64_t m, int64_t n, int64_t k,
+             cudaStream_t stream) {
  TORCH_CHECK(false,
              "Unsupported CUTLASS version. Set VLLM_CUTLASS_SRC_DIR to "
              "a CUTLASS 3.8 source directory to enable support.");
@ -305,13 +271,12 @@ void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A,
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(A.get_device());

  if (out_dtype == at::ScalarType::Half) {
-    cutlass_fp4_gemm_dispatch<cutlass::half_t>(D, A, B, A_sf, B_sf, alpha, m, n,
-                                               k, stream);
+    runGemm<cutlass::half_t>(D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
  } else if (out_dtype == at::ScalarType::BFloat16) {
-    cutlass_fp4_gemm_dispatch<cutlass::bfloat16_t>(D, A, B, A_sf, B_sf, alpha,
-                                                   m, n, k, stream);
+    runGemm<cutlass::bfloat16_t>(D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  } else if (out_dtype == at::ScalarType::Float) {
+    runGemm<float>(D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
  } else {
-    TORCH_CHECK(false, "Unsupported output data type of nvfp4 mm (", out_dtype,
-                ")");
+    TORCH_CHECK(false, "Unsupported output data type of nvfp4 mm");
  }
 }
--- a/csrc/quantization/fp8/common.cu
+++ b/csrc/quantization/fp8/common.cu
@ -88,8 +88,6 @@ void static_scaled_fp8_quant(torch::Tensor& out,          // [..., d]
                             torch::Tensor const& input,  // [..., d]
                             torch::Tensor const& scale)  // [1]
 {
-  TORCH_CHECK(input.is_contiguous());
-  TORCH_CHECK(out.is_contiguous());
  int const block_size = 256;
  int const num_tokens = input.numel() / input.size(-1);
  int const num_elems = input.numel();
@ -113,8 +111,6 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out,          // [..., d]
                              torch::Tensor const& input,  // [..., d]
                              torch::Tensor& scale)        // [1]
 {
-  TORCH_CHECK(input.is_contiguous());
-  TORCH_CHECK(out.is_contiguous());
  int const block_size = 256;
  int const num_tokens = input.numel() / input.size(-1);
  int const num_elems = input.numel();
--- a/csrc/quantization/fp8/per_token_group_quant.cu
+++ b/csrc/quantization/fp8/per_token_group_quant.cu
@ -1,217 +0,0 @@
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/util/Float8_e4m3fn.h>
-
-#include "../per_token_group_quant_8bit.h"
-
-#include <cmath>
-
-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
-
-#include <torch/all.h>
-
-#include "../vectorization.cuh"
-#include "../vectorization_utils.cuh"
-#include "../../dispatch_utils.h"
-
-__device__ __forceinline__ float GroupReduceMax(float val, const int tid) {  // max over XOR-shuffle partners; tid is unused
-  unsigned mask = 0xffff;  // NOTE(review): names lanes 0-15 only, but the kernel's 16-thread groups also sit in lanes 16-31; the CUDA guide requires each calling lane's bit to be set - confirm
-  // Four exchanges with XOR offsets 8, 4, 2, 1 combine the values of an aligned 16-lane segment.
-  val = fmaxf(val, __shfl_xor_sync(mask, val, 8));
-  val = fmaxf(val, __shfl_xor_sync(mask, val, 4));
-  val = fmaxf(val, __shfl_xor_sync(mask, val, 2));
-  val = fmaxf(val, __shfl_xor_sync(mask, val, 1));
-  return val;
-}
-
-template <typename T, typename DST_DTYPE, bool IS_COLUMN_MAJOR = false,
-          bool SCALE_UE8M0 = false, typename scale_packed_t = float>
-__global__ void per_token_group_quant_8bit_kernel(  // per group: find absmax, write one scale, write quantized values
-    const T* __restrict__ input, void* __restrict__ output_q,
-    scale_packed_t* __restrict__ output_s, const int group_size,
-    const int num_groups, const int groups_per_block, const float eps,  // num_groups is not read in this body
-    const float min_8bit, const float max_8bit, const int scale_num_rows = 0,
-    const int scale_stride = 0) {
-  const int threads_per_group = 16;  // 16 consecutive threads cooperate on one group
-  const int64_t local_group_id = threadIdx.x / threads_per_group;
-  const int lane_id = threadIdx.x % threads_per_group;
-
-  const int64_t block_group_id = blockIdx.x * groups_per_block;
-  const int64_t global_group_id = block_group_id + local_group_id;  // no bounds check against num_groups in this kernel
-  const int64_t block_group_offset = global_group_id * group_size;
-
-  float local_absmax = eps;  // seeded with eps, so the resulting absmax is never below eps
-
-  using scale_element_t = float;
-  static_assert(sizeof(scale_packed_t) % sizeof(scale_element_t) == 0);
-
-  const T* group_input = input + block_group_offset;
-  DST_DTYPE* group_output =
-      static_cast<DST_DTYPE*>(output_q) + block_group_offset;
-  scale_element_t* scale_output;
-
-  if constexpr (IS_COLUMN_MAJOR) {
-    const int num_elems_per_pack =
-        static_cast<int>(sizeof(scale_packed_t) / sizeof(scale_element_t));
-    const int scale_num_rows_element = scale_num_rows * num_elems_per_pack;
-    const int row_idx = global_group_id / scale_num_rows_element;
-    const int col_idx_raw = global_group_id % scale_num_rows_element;
-    const int col_idx = col_idx_raw / num_elems_per_pack;
-    const int pack_idx = col_idx_raw % num_elems_per_pack;
-    scale_output = reinterpret_cast<scale_element_t*>(output_s) +  // strided slot derived from (row_idx, col_idx, pack_idx)
-                   (col_idx * scale_stride * num_elems_per_pack +
-                    row_idx * num_elems_per_pack + pack_idx);
-  } else {
-    scale_output = output_s + global_group_id;  // NOTE(review): assigns scale_packed_t* to float*; looks well-formed only when scale_packed_t is float - confirm
-  }
-
-  // shared memory to cache each group's data to avoid double DRAM reads.
-  extern __shared__ __align__(16) char smem_raw[];
-  T* smem = reinterpret_cast<T*>(smem_raw);
-  T* smem_group = smem + local_group_id * group_size;  // each group owns a group_size-element slice
-
-  constexpr int vec_size = 16 / sizeof(T);
-  using vec_t = vllm::vec_n_t<T, vec_size>;
-
-  // copy global -> shared & compute absmax
-  auto scalar_op_cache = [&] __device__(T & dst, const T& src) {
-    float abs_v = fabsf(static_cast<float>(src));
-    local_absmax = fmaxf(local_absmax, abs_v);
-    dst = src;
-  };
-
-  vllm::vectorize_with_alignment<vec_size>(
-      group_input,        // in
-      smem_group,         // out (shared)
-      group_size,         // elements per group
-      lane_id,            // thread id
-      threads_per_group,  // stride in group
-      scalar_op_cache);   // scalar handler
-
-  local_absmax = GroupReduceMax(local_absmax, lane_id);  // combine the 16 per-thread maxima
-
-  float y_s = local_absmax / max_8bit;
-  if constexpr (SCALE_UE8M0) {
-    y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f))));  // round the scale up to a power of two
-  }
-
-  scale_element_t y_s_quant = y_s;
-
-  if (lane_id == 0) {  // one write per group
-    *scale_output = y_s_quant;
-  }
-
-  __syncthreads();  // block-wide barrier between the caching pass and the quantization pass
-
-  // quantize shared -> global 8-bit
-  auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) {
-    float q = fminf(fmaxf(static_cast<float>(src) / y_s, min_8bit), max_8bit);  // scale, then clamp to [min_8bit, max_8bit]
-    dst = DST_DTYPE(q);
-  };
-
-  vllm::vectorize_with_alignment<vec_size>(
-      smem_group,         // in (shared)
-      group_output,       // out (global quant tensor)
-      group_size,         // elements
-      lane_id,            // tid
-      threads_per_group,  // stride
-      scalar_op_quant);   // scalar handler
-}
-
-void per_token_group_quant_8bit(const torch::Tensor& input,  // host launcher for per_token_group_quant_8bit_kernel
-                                torch::Tensor& output_q,
-                                torch::Tensor& output_s, int64_t group_size,
-                                double eps, double min_8bit, double max_8bit,
-                                bool scale_ue8m0) {
-  TORCH_CHECK(input.is_contiguous());
-  TORCH_CHECK(output_q.is_contiguous());
-
-  const int num_groups = input.numel() / group_size;  // NOTE(review): no group_size > 0 check is visible before this division - confirm callers guarantee it
-
-  TORCH_CHECK(input.numel() % group_size == 0);
-  TORCH_CHECK(output_s.dim() == 2);
-
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
-  constexpr int THREADS_PER_GROUP = 16;  // matches threads_per_group inside the kernel
-  // Pick the largest of 16, 8, 4, 2, 1 that divides num_groups, so every block is full.
-  int groups_per_block = 1;
-
-  if (num_groups % 16 == 0) {
-    groups_per_block = 16;
-  } else if (num_groups % 8 == 0) {
-    groups_per_block = 8;
-  } else if (num_groups % 4 == 0) {
-    groups_per_block = 4;
-  } else if (num_groups % 2 == 0) {
-    groups_per_block = 2;
-  }
-
-  auto dst_type = output_q.scalar_type();
-  const int num_blocks = num_groups / groups_per_block;  // exact, because groups_per_block divides num_groups
-  const int num_threads = groups_per_block * THREADS_PER_GROUP;
-
-  const bool is_column_major = output_s.stride(0) < output_s.stride(1);  // layout inferred from the scale tensor's strides
-  const int scale_num_rows = output_s.size(1);
-  const int scale_stride = output_s.stride(1);
-  // LAUNCH_KERNEL picks one of four instantiations from (is_column_major, scale_ue8m0); scales are passed as float*.
-#define LAUNCH_KERNEL(T, DST_DTYPE)                                        \
-  do {                                                                     \
-    dim3 grid(num_blocks);                                                 \
-    dim3 block(num_threads);                                               \
-    size_t smem_bytes =                                                    \
-        static_cast<size_t>(groups_per_block) * group_size * sizeof(T);    \
-    if (is_column_major) {                                                 \
-      if (scale_ue8m0) {                                                   \
-        per_token_group_quant_8bit_kernel<T, DST_DTYPE, true, true>        \
-            <<<grid, block, smem_bytes, stream>>>(                         \
-                static_cast<T*>(input.data_ptr()), output_q.data_ptr(),    \
-                static_cast<float*>(output_s.data_ptr()), group_size,      \
-                num_groups, groups_per_block, (float)eps, (float)min_8bit, \
-                (float)max_8bit, scale_num_rows, scale_stride);            \
-      } else {                                                             \
-        per_token_group_quant_8bit_kernel<T, DST_DTYPE, true, false>       \
-            <<<grid, block, smem_bytes, stream>>>(                         \
-                static_cast<T*>(input.data_ptr()), output_q.data_ptr(),    \
-                static_cast<float*>(output_s.data_ptr()), group_size,      \
-                num_groups, groups_per_block, (float)eps, (float)min_8bit, \
-                (float)max_8bit, scale_num_rows, scale_stride);            \
-      }                                                                    \
-    } else {                                                               \
-      if (scale_ue8m0) {                                                   \
-        per_token_group_quant_8bit_kernel<T, DST_DTYPE, false, true>       \
-            <<<grid, block, smem_bytes, stream>>>(                         \
-                static_cast<T*>(input.data_ptr()), output_q.data_ptr(),    \
-                static_cast<float*>(output_s.data_ptr()), group_size,      \
-                num_groups, groups_per_block, (float)eps, (float)min_8bit, \
-                (float)max_8bit);                                          \
-      } else {                                                             \
-        per_token_group_quant_8bit_kernel<T, DST_DTYPE, false, false>      \
-            <<<grid, block, smem_bytes, stream>>>(                         \
-                static_cast<T*>(input.data_ptr()), output_q.data_ptr(),    \
-                static_cast<float*>(output_s.data_ptr()), group_size,      \
-                num_groups, groups_per_block, (float)eps, (float)min_8bit, \
-                (float)max_8bit);                                          \
-      }                                                                    \
-    }                                                                      \
-  } while (0)
-  // NOTE(review): a destination dtype other than Float8_e4m3fn or Char launches nothing and raises no error.
-  VLLM_DISPATCH_FLOATING_TYPES(
-      input.scalar_type(), "per_token_group_quant_8bit", ([&] {
-        if (dst_type == at::ScalarType::Float8_e4m3fn) {
-          LAUNCH_KERNEL(scalar_t, c10::Float8_e4m3fn);
-        } else if (dst_type == at::ScalarType::Char) {
-          LAUNCH_KERNEL(scalar_t, int8_t);
-        }
-      }));
-
-#undef LAUNCH_KERNEL
-}
-
-void per_token_group_quant_fp8(const torch::Tensor& input,  // FP8-named wrapper around the shared 8-bit implementation
-                               torch::Tensor& output_q, torch::Tensor& output_s,
-                               int64_t group_size, double eps, double fp8_min,
-                               double fp8_max, bool scale_ue8m0) {
-  per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,  // fp8_min / fp8_max are passed as min_8bit / max_8bit
-                             fp8_min, fp8_max, scale_ue8m0);
-}
--- a/csrc/quantization/gguf/gguf_kernel.cu
+++ b/csrc/quantization/gguf/gguf_kernel.cu
@ -4,7 +4,7 @@
 #include <torch/all.h>
 #include <c10/cuda/CUDAGuard.h>

-#include "../../cuda_compat.h"
+#include "cuda_compat.h"
 #include "dispatch_utils.h"

 #include "ggml-common.h"
--- a/csrc/quantization/machete/machete_mainloop.cuh
+++ b/csrc/quantization/machete/machete_mainloop.cuh
@ -38,6 +38,7 @@
 #include "cute/atom/mma_atom.hpp"
 #include "cute/atom/copy_traits_sm90_tma.hpp"
 #include "cute/algorithm/gemm.hpp"
+#include "cute/tensor_predicate.hpp"
 #include "cute/numeric/arithmetic_tuple.hpp"
 #include "cutlass/pipeline/pipeline.hpp"
 #include "cutlass/transform/collective/sm90_wgmma_transpose.hpp"
--- a/csrc/quantization/machete/machete_prepacked_layout.cuh
+++ b/csrc/quantization/machete/machete_prepacked_layout.cuh
@ -187,12 +187,8 @@ struct PrepackedLayoutBTemplate {
  CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy(
      Shape_NKL shape_mkl) {
    auto layout = TVbNbKL_to_offset(shape_mkl);
-    // for 4-bit elements, having >= 64 values per column
-    // allows TMA to load full 32-byte sectors
-    auto inner_layout =
-        make_layout(make_shape(_256{}, size<0>(layout) / _256{}));
-
-    return make_layout(inner_layout, get<1>(layout), get<2>(layout));
+    return make_layout(coalesce(get<0>(layout)), get<1>(layout),
+                       get<2>(layout));
  }

  // ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx)
--- a/csrc/quantization/per_token_group_quant_8bit.h
+++ b/csrc/quantization/per_token_group_quant_8bit.h
@ -1,10 +0,0 @@
-#pragma once
-#include <torch/all.h>
-
-// TODO(wentao): refactor the folder to 8bit, then includes fp8 and int8 folders
-// 8-bit per-token-group quantization helper used by both FP8 and INT8
-void per_token_group_quant_8bit(const torch::Tensor& input,
-                                torch::Tensor& output_q,
-                                torch::Tensor& output_s, int64_t group_size,
-                                double eps, double min_8bit, double max_8bit,
-                                bool scale_ue8m0 = false);
--- a/csrc/quantization/vectorization_utils.cuh
+++ b/csrc/quantization/vectorization_utils.cuh
@ -27,26 +27,6 @@ __device__ inline void vectorize_with_alignment(
  constexpr int WIDTH = VEC_SIZE * sizeof(InT);  // eg: 64 B
  uintptr_t addr = reinterpret_cast<uintptr_t>(in);

-  // fast path when the whole region is already aligned
-  // Note: currently the output is guaranteed to be same as the input, so we
-  // don't check it here, comments here just for future reference.
-  bool can_vec = ((addr & (WIDTH - 1)) == 0) && ((len & (VEC_SIZE - 1)) == 0);
-  if (can_vec) {
-    int num_vec = len / VEC_SIZE;
-
-    using vin_t = vec_n_t<InT, VEC_SIZE>;
-    using vout_t = vec_n_t<OutT, VEC_SIZE>;
-    auto* v_in = reinterpret_cast<const vin_t*>(in);
-    auto* v_out = reinterpret_cast<vout_t*>(out);
-
-    for (int i = tid; i < num_vec; i += stride) {
-      vout_t tmp;
-      vec_op(tmp, v_in[i]);
-      v_out[i] = tmp;
-    }
-    return;
-  }
-
  int misalignment_offset = addr & (WIDTH - 1);       // addr % 64
  int alignment_bytes = WIDTH - misalignment_offset;  // 64 - (addr % 64)
  int prefix_elems = alignment_bytes & (WIDTH - 1);   // handle 64
@ -92,81 +72,4 @@ __device__ __forceinline__ void vectorize_with_alignment(const InT* in,
                                     std::forward<ScaOp>(scalar_op));
 }

-template <int VEC_SIZE, typename InT, typename ScaOp>
-struct DefaultReadVecOp {
-  ScaOp scalar_op;
-
-  __device__ __forceinline__ void operator()(
-      const vec_n_t<InT, VEC_SIZE>& src) const {
-#pragma unroll
-    for (int i = 0; i < VEC_SIZE; ++i) {
-      scalar_op(src.val[i]);
-    }
-  }
-};
-
-// read-only version: iterate over the input with alignment guarantees
-template <int VEC_SIZE, typename InT, typename VecOp, typename ScaOp>
-__device__ inline void vectorize_read_with_alignment(const InT* in, int len,
-                                                     int tid, int stride,
-                                                     VecOp&& vec_op,
-                                                     ScaOp&& scalar_op) {
-  static_assert(VEC_SIZE > 0 && (VEC_SIZE & (VEC_SIZE - 1)) == 0,
-                "VEC_SIZE must be a positive power-of-two");
-  constexpr int WIDTH = VEC_SIZE * sizeof(InT);
-  uintptr_t addr = reinterpret_cast<uintptr_t>(in);
-
-  // fast path when the whole region is already aligned
-  bool can_vec = ((addr & (WIDTH - 1)) == 0) && ((len & (VEC_SIZE - 1)) == 0);
-  if (can_vec) {
-    int num_vec = len / VEC_SIZE;
-
-    using vin_t = vec_n_t<InT, VEC_SIZE>;
-    auto* v_in = reinterpret_cast<const vin_t*>(in);
-
-    for (int i = tid; i < num_vec; i += stride) {
-      vec_op(v_in[i]);
-    }
-    return;
-  }
-
-  int misalignment_offset = addr & (WIDTH - 1);
-  int alignment_bytes = WIDTH - misalignment_offset;
-  int prefix_elems = alignment_bytes & (WIDTH - 1);
-  prefix_elems /= sizeof(InT);
-  prefix_elems = min(prefix_elems, len);
-
-  // 1. handle the possibly unaligned prefix with scalar access.
-  for (int i = tid; i < prefix_elems; i += stride) {
-    scalar_op(in[i]);
-  }
-
-  in += prefix_elems;
-  len -= prefix_elems;
-
-  int num_vec = len / VEC_SIZE;
-  using vin_t = vec_n_t<InT, VEC_SIZE>;
-  auto* v_in = reinterpret_cast<const vin_t*>(in);
-
-  // 2. vectorized traversal of the main aligned region.
-  for (int i = tid; i < num_vec; i += stride) {
-    vec_op(v_in[i]);
-  }
-
-  // 3. handle remaining tail elements.
-  int tail_start = num_vec * VEC_SIZE;
-  for (int i = tid + tail_start; i < len; i += stride) {
-    scalar_op(in[i]);
-  }
-}
-
-// overload that requires only a scalar_op
-template <int VEC_SIZE, typename InT, typename ScaOp>
-__device__ __forceinline__ void vectorize_read_with_alignment(
-    const InT* in, int len, int tid, int stride, ScaOp&& scalar_op) {
-  using Vec = DefaultReadVecOp<VEC_SIZE, InT, std::decay_t<ScaOp>>;
-  vectorize_read_with_alignment<VEC_SIZE>(in, len, tid, stride, Vec{scalar_op},
-                                          std::forward<ScaOp>(scalar_op));
-}
-
 }  // namespace vllm
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@ -19,7 +19,7 @@
 #include <c10/cuda/CUDAGuard.h>
 #include <hip/hip_fp8.h>
 #include <hip/hip_bf16.h>
-#include "../cuda_compat.h"
+#include "cuda_compat.h"

 #include <algorithm>
 #include "../attention/dtype_fp8.cuh"
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Robert Shaw	7d092fc32c	revert skip-merge-desc Signed-off-by: Robert Shaw <robshaw@redhat.com>	2025-07-03 20:30:45 +00:00
Robert Shaw	1a6c27f271	updated Signed-off-by: Robert Shaw <robshaw@redhat.com>	2025-07-03 20:29:33 +00:00
Robert Shaw	3c6fd286b4	updated Signed-off-by: Robert Shaw <robshaw@redhat.com>	2025-07-03 18:29:58 +00:00