Merge remote-tracking branch 'origin/main' into fp8_ep_dp

hacks
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-06-02 14:53:27 -04:00 · 2025-05-30 02:33:58 +00:00 · 2025-05-30 02:08:21 +00:00 · 2025-05-30 00:12:54 +00:00 · 2025-05-29 21:25:33 +00:00 · 2025-05-29 18:50:37 +00:00
1546 changed files with 3940 additions and 16242 deletions
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import os
 import sys
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 import os
--- a/.buildkite/lm-eval-harness/conftest.py
+++ b/.buildkite/lm-eval-harness/conftest.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path

 import pytest
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import json
 import os
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse

--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 import json
--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from lmdeploy.serve.openai.api_client import APIClient

--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import datetime
 import json
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -1,6 +1,5 @@
 steps:
  - label: "Build wheel - CUDA 12.8"
-    id: build-wheel-cuda-12-8
    agents:
      queue: cpu_queue_postmerge
    commands:
@ -12,7 +11,6 @@ steps:
      DOCKER_BUILDKIT: "1"

  - label: "Build wheel - CUDA 12.6"
-    id: build-wheel-cuda-12-6
    agents:
      queue: cpu_queue_postmerge
    commands:
@ -30,7 +28,6 @@ steps:

  - label: "Build wheel - CUDA 11.8"
    # depends_on: block-build-cu118-wheel
-    id: build-wheel-cuda-11-8
    agents:
      queue: cpu_queue_postmerge
    commands:
@ -47,7 +44,6 @@ steps:

  - label: "Build release image"
    depends_on: block-release-image-build
-    id: build-release-image
    agents:
      queue: cpu_queue_postmerge
    commands:
@ -55,18 +51,6 @@ steps:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

-  - label: "Annotate release workflow"
-    depends_on:
-      - build-release-image
-      - build-wheel-cuda-12-8
-      - build-wheel-cuda-12-6
-      - build-wheel-cuda-11-8
-    id: annotate-release-workflow
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "bash .buildkite/scripts/annotate-release.sh"
-
  - label: "Build and publish TPU release image"
    depends_on: ~
    if: build.env("NIGHTLY") == "1"
@ -86,10 +70,9 @@ steps:
      DOCKER_BUILDKIT: "1"

  - input: "Provide Release version here"
-    id: input-release-version
    fields:
      - text: "What is the release version?"
-        key: release-version
+        key: "release-version"

  - block: "Build CPU release image"
    key: block-cpu-release-image-build
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@ -1,31 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-# Get release version and strip leading 'v' if present
-RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
-
-if [ -z "$RELEASE_VERSION" ]; then
-  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
-  exit 1
-fi
-
-buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
-To download the wheel:
-\`\`\`
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl . 
-\`\`\`
-
-To download and upload the image:
-
-\`\`\`
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
-docker tag vllm/vllm-openai vllm/vllm-openai:latest
-docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
-docker push vllm/vllm-openai:latest
-docker push vllm/vllm-openai:v${RELEASE_VERSION}
-\`\`\`
-EOF 
--- a/.buildkite/scripts/ci-clean-log.sh
+++ b/.buildkite/scripts/ci-clean-log.sh
@ -1,17 +0,0 @@
-#!/bin/bash
-# Usage: ./ci_clean_log.sh ci.log
-# This script strips timestamps and color codes from CI log files.
-
-# Check if argument is given
-if [ $# -lt 1 ]; then
-    echo "Usage: $0 ci.log"
-    exit 1
-fi
-
-INPUT_FILE="$1"
-
-# Strip timestamps
-sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
-
-# Strip colorization
-sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@ -94,10 +94,6 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi

-if [[ $commands == *"pytest -v -s lora"* ]]; then
-  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
-fi
-
 #ignore certain kernels tests
 if [[ $commands == *" kernels/core"* ]]; then
  commands="${commands} \
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@ -7,7 +7,6 @@ set -ex
 # Setup cleanup
 remove_docker_container() {
  if [[ -n "$container_id" ]]; then
-      podman stop --all -t0
      podman rm -f "$container_id" || true
  fi
  podman system prune -f
@ -38,7 +37,7 @@ function cpu_tests() {
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
+    pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
 }

 # All of CPU tests are expected to be finished less than 40 mins.
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -6,70 +6,72 @@ set -ex

 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
-OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}

-export CMAKE_BUILD_PARALLEL_LEVEL=32
-
 # Setup cleanup
 remove_docker_container() { 
    set -e; 
-    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; 
+    docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; 
+    docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; 
 }
 trap remove_docker_container EXIT
 remove_docker_container

 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
+ --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
+ --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2

 function cpu_tests() {
  set -e
  export NUMA_NODE=$2
+  export BUILDKITE_BUILD_NUMBER=$3

  # offline inference
-  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

  # Run basic model test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
-    pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
-    pytest -v -s tests/models/language/generation -m cpu_model
-    pytest -v -s tests/models/language/pooling -m cpu_model
-    pytest -v -s tests/models/multimodal/generation \
-                --ignore=tests/models/multimodal/generation/test_mllama.py \
-                --ignore=tests/models/multimodal/generation/test_pixtral.py \
-                -m cpu_model"
+    pytest -v -s tests/kernels/test_cache.py -m cpu_model
+    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
+    pytest -v -s tests/models/decoder_only/language -m cpu_model
+    pytest -v -s tests/models/embedding/language -m cpu_model
+    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
+    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

  # Run compressed-tensor test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"

  # Run AWQ test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
-    VLLM_USE_V1=0 pytest -s -v \
+    pytest -s -v \
    tests/quantization/test_ipex_quant.py"

  # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v -k cpu_model \
    tests/basic_correctness/test_chunked_prefill.py"  

  # online serving
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
+    export VLLM_CPU_KVCACHE_SPACE=10 
+    export VLLM_CPU_OMP_THREADS_BIND=$1
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & 
    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
    python3 benchmarks/benchmark_serving.py \
@ -81,7 +83,7 @@ function cpu_tests() {
      --tokenizer facebook/opt-125m"

  # Run multi-lora tests
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
    tests/lora/test_qwen2vl.py"
@ -89,4 +91,4 @@ function cpu_tests() {

 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -150,15 +150,11 @@ run_and_track_test 9 "test_multimodal.py" \
 run_and_track_test 10 "test_pallas.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
 run_and_track_test 11 "test_struct_output_generate.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py"
 run_and_track_test 12 "test_moe_pallas.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 run_and_track_test 13 "test_lora.py" \
    "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
-run_and_track_test 14 "test_tpu_qkv_linear.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
-run_and_track_test 15 "test_spmd_model_weight_loading.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"

 # After all tests have been attempted, exit with the overall status.
 if [ "$overall_script_exit_code" -ne 0 ]; then
--- a/.buildkite/scripts/rerun-test.sh
+++ b/.buildkite/scripts/rerun-test.sh
@ -1,18 +0,0 @@
-#!/bin/bash
-
-# Usage: ./rerun_test.sh path/to/test.py::test_name
-
-# Check if argument is given
-if [ $# -lt 1 ]; then
-    echo "Usage: $0 path/to/test.py::test_name"
-    echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
-    exit 1
-fi
-
-TEST=$1
-COUNT=1
-
-while pytest -sv "$TEST"; do
-    COUNT=$((COUNT + 1))
-    echo "RUN NUMBER ${COUNT}"
-done
--- a/.buildkite/scripts/tpu/cleanup_docker.sh
+++ b/.buildkite/scripts/tpu/cleanup_docker.sh
@ -1,24 +0,0 @@
-#!/bin/bash
-
-set -euo pipefail
-
-docker_root=$(docker info -f '{{.DockerRootDir}}')
-if [ -z "$docker_root" ]; then
-  echo "Failed to determine Docker root directory."
-  exit 1
-fi
-echo "Docker root directory: $docker_root"
-# Check disk usage of the filesystem where Docker's root directory is located
-disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
-# Define the threshold
-threshold=70
-if [ "$disk_usage" -gt "$threshold" ]; then
-  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
-  # Remove dangling images (those that are not tagged and not used by any container)
-  docker image prune -f
-  # Remove unused volumes / force the system prune for old images as well.
-  docker volume prune -f && docker system prune --force --filter "until=72h" --all
-  echo "Docker images and volumes cleanup completed."
-else
-  echo "Disk usage is below $threshold%. No cleanup needed."
-fi
--- a/.buildkite/scripts/tpu/config_v6e_1.env
+++ b/.buildkite/scripts/tpu/config_v6e_1.env
@ -1,14 +0,0 @@
-# Environment config
-TEST_NAME=llama8b
-CONTAINER_NAME=vllm-tpu
-
-# vllm config
-MODEL=meta-llama/Llama-3.1-8B-Instruct
-MAX_NUM_SEQS=512
-MAX_NUM_BATCHED_TOKENS=512
-TENSOR_PARALLEL_SIZE=1
-MAX_MODEL_LEN=2048
-DOWNLOAD_DIR=/mnt/disks/persist
-EXPECTED_THROUGHPUT=8.0
-INPUT_LEN=1800
-OUTPUT_LEN=128
--- a/.buildkite/scripts/tpu/docker_run_bm.sh
+++ b/.buildkite/scripts/tpu/docker_run_bm.sh
@ -1,102 +0,0 @@
-#!/bin/bash
-
-if [ ! -f "$1" ]; then
-  echo "Error: The env file '$1' does not exist."
-  exit 1  # Exit the script with a non-zero status to indicate an error
-fi
-
-ENV_FILE=$1
-
-# For testing on local vm, use `set -a` to export all variables
-source /etc/environment
-source $ENV_FILE
-
-remove_docker_container() { 
-    docker rm -f tpu-test || true; 
-    docker rm -f vllm-tpu || true;
-    docker rm -f $CONTAINER_NAME || true;
-}
-
-trap remove_docker_container EXIT
-
-# Remove the container that might not be cleaned up in the previous run.
-remove_docker_container
-
-# Build docker image.
-# TODO: build the image outside the script and share the image with other
-# tpu test if building time is too long.
-DOCKER_BUILDKIT=1 docker build \
-  --build-arg max_jobs=16 \
-  --build-arg USE_SCCACHE=1 \
-  --build-arg GIT_REPO_CHECK=0 \
-  --tag vllm/vllm-tpu-bm \
-  --progress plain -f docker/Dockerfile.tpu .
-
-LOG_ROOT=$(mktemp -d)
-# If mktemp fails, set -e will cause the script to exit.
-echo "Results will be stored in: $LOG_ROOT"
-
-if [ -z "$HF_TOKEN" ]; then
-  echo "Error: HF_TOKEN is not set or is empty."  
-  exit 1
-fi
-
-# Make sure mounted disk or dir exists
-if [ ! -d "$DOWNLOAD_DIR" ]; then
-    echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
-    exit 1
-fi
-
-echo "Run model $MODEL"
-echo
-
-echo "starting docker...$CONTAINER_NAME"
-echo    
-docker run \
- -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
- --env-file $ENV_FILE \
- -e HF_TOKEN="$HF_TOKEN" \
- -e TARGET_COMMIT=$BUILDKITE_COMMIT \
- -e MODEL=$MODEL \
- -e WORKSPACE=/workspace \
- --name $CONTAINER_NAME \
- -d \
- --privileged \
- --network host \
- -v /dev/shm:/dev/shm \
- vllm/vllm-tpu-bm tail -f /dev/null
-
-echo "run script..."
-echo
-docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"
-
-echo "copy result back..."
-VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
-BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
-docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG" 
-docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
-
-throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
-echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
-
-if [ "$BUILDKITE" = "true" ]; then
-  echo "Running inside Buildkite"
-  buildkite-agent artifact upload "$VLLM_LOG" 
-  buildkite-agent artifact upload "$BM_LOG"
-else
-  echo "Not running inside Buildkite"
-fi
-
-#
-# compare the throughput with EXPECTED_THROUGHPUT 
-# and assert meeting the expectation
-# 
-if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
-  echo "Failed to get the throughput"
-  exit 1
-fi
-
-if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
-  echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
-  exit 1
-fi
--- a/.buildkite/scripts/tpu/run_bm.sh
+++ b/.buildkite/scripts/tpu/run_bm.sh
@ -1,94 +0,0 @@
-#!/bin/bash
-
-set -euo pipefail
-
-VLLM_LOG="$WORKSPACE/vllm_log.txt"
-BM_LOG="$WORKSPACE/bm_log.txt"
-
-if [ -n "$TARGET_COMMIT" ]; then
-  head_hash=$(git rev-parse HEAD)
-  if [ "$TARGET_COMMIT" != "$head_hash" ]; then
-    echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
-    exit 1
-  fi
-fi
-
-echo "model: $MODEL"
-echo
-
-#
-# create a log folder
-#
-mkdir "$WORKSPACE/log"
-
-# TODO: Move to image building.
-pip install pandas
-pip install datasets
-
-#
-# create sonnet_4x
-#
-echo "Create sonnet_4x.txt"
-echo "" > benchmarks/sonnet_4x.txt
-for _ in {1..4}
- do
-  cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
-done
-
-#
-# start vllm service in backend
-#
-echo "lanching vllm..."
-echo "logging to $VLLM_LOG"
-echo
-
-VLLM_USE_V1=1 vllm serve $MODEL \
- --seed 42 \
- --disable-log-requests \
- --max-num-seqs $MAX_NUM_SEQS \
- --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
- --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
- --no-enable-prefix-caching \
- --download_dir $DOWNLOAD_DIR \
- --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
-
-
-echo "wait for 20 minutes.."
-echo
-# sleep 1200
-# wait for 10 minutes...
-for i in {1..120}; do
-    # TODO: detect other type of errors.
-    if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
-        echo "Detected RuntimeError, exiting."
-        exit 1
-    elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
-        echo "Application started"
-        break
-    else
-        echo "wait for 10 seconds..."
-        sleep 10
-    fi
-done
-
-#
-# run test
-#
-echo "run benchmark test..."
-echo "logging to $BM_LOG"
-echo
-python benchmarks/benchmark_serving.py \
-    --backend vllm \
-    --model $MODEL  \
-    --dataset-name sonnet \
-    --dataset-path benchmarks/sonnet_4x.txt \
-    --sonnet-input-len $INPUT_LEN \
-    --sonnet-output-len $OUTPUT_LEN \
-    --ignore-eos > "$BM_LOG"
-
-echo "completed..."
-echo
-
-throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
-echo "throughput: $throughput"
-echo
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -145,7 +145,6 @@ steps:
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/test_async_llm_dp.py
-  - tests/v1/engine/test_engine_core_client.py
  commands:
  # test with tp=2 and external_dp=2
  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
@ -155,7 +154,6 @@ steps:
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
@ -289,7 +287,7 @@ steps:
    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py

 - label: LoRA Test %N # 15min each
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
@ -320,7 +318,6 @@ steps:
  # these tests need to be separated, cannot combine
  - pytest -v -s compile/piecewise/test_simple.py
  - pytest -v -s compile/piecewise/test_toy_llama.py
-  - pytest -v -s compile/piecewise/test_full_cudagraph.py

 - label: PyTorch Fullgraph Test # 18min
  mirror_hardwares: [amdexperimental, amdproduction]
@ -424,9 +421,6 @@ steps:
  - vllm/model_executor/layers/quantization
  - tests/quantization
  commands:
-  # temporary install here since we need nightly, will move to requirements/test.in
-  # after torchao 0.12 release
-  - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

 - label: LM Eval Small Models # 53min
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -10,17 +10,15 @@
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
-/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
+/vllm/model_executor/guided_decoding @mgoin @russellb
 /vllm/multimodal @DarkLight1337 @ywang96
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
-/vllm/reasoning @aarnphm
-/vllm/entrypoints @aarnphm
 CMakeLists.txt @tlrmchlsmth

 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb @aarnphm
+/vllm/v1/structured_output @mgoin @russellb

 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
@ -29,8 +27,8 @@ CMakeLists.txt @tlrmchlsmth
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
-/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
+/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
 /tests/kernels @tlrmchlsmth @WoosukKwon
 /tests/model_executor/test_guided_processors.py @mgoin @russellb
 /tests/models @DarkLight1337 @ywang96
@ -40,11 +38,11 @@ CMakeLists.txt @tlrmchlsmth
 /tests/quantization @mgoin @robertgshaw2-redhat
 /tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
-/tests/v1/structured_output @mgoin @russellb @aarnphm
+/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
+/tests/v1/structured_output @mgoin @russellb
 /tests/weight_loading @mgoin @youkaichao
 /tests/lora @jeejeelee

 # Docs
 /docs @hmellor
-mkdocs.yaml @hmellor
+mkdocs.yaml @hmellor
--- a/.github/ISSUE_TEMPLATE/400-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml
@ -8,16 +8,6 @@ body:
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: markdown
-  attributes:
-    value: |
-      ⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
-      - API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
-      - Passwords or authentication credentials
-      - Private URLs or endpoints
-      - Personal or confidential data
-      
-      Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
 - type: textarea
  attributes:
    label: Your current environment
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -1,18 +1,6 @@
-## Essential Elements of an Effective PR Description Checklist
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
- [ ] The test plan, such as providing test command.
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
+FILL IN THE PR DESCRIPTION HERE

-PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
-
-## Purpose
-
-## Test Plan
-
-## Test Result
-
-## (Optional) Documentation Update
+FIX #xxxx (*link existing issues this PR will resolve*)

 <!--- pyml disable-next-line no-emphasis-as-heading -->
 **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -36,20 +36,6 @@ pull_request_rules:
      add:
        - frontend

- name: label-llama
-  description: Automatically apply llama label
-  conditions:
-    - or:
-      - files~=^examples/.*llama.*\.py
-      - files~=^tests/.*llama.*\.py
-      - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
-      - files~=^vllm/model_executor/models/.*llama.*\.py
-      - files~=^vllm/transformers_utils/configs/.*llama.*\.py
-  actions:
-    label:
-      add:
-        - llama
-
 - name: label-multi-modality
  description: Automatically apply multi-modality label
  conditions:
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -11,8 +11,6 @@ repos:
  hooks:
  - id: yapf
    args: [--in-place, --verbose]
-    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
-    exclude: '(.buildkite|benchmarks|build|examples)/.*'
 - repo: https://github.com/astral-sh/ruff-pre-commit
  rev: v0.11.7
  hooks:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -182,6 +182,9 @@ include(FetchContent)
 file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
 message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")

+#
+# Set rocm version dev int.
+#
 if(VLLM_GPU_LANG STREQUAL "HIP")
  #
  # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
@ -189,6 +192,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
  set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")

+
  #
  # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
  # a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
@ -242,7 +246,6 @@ set(VLLM_EXT_SRC
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
  "csrc/layernorm_quant_kernels.cu"
-  "csrc/sampler.cu"
  "csrc/cuda_view.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
@ -308,7 +311,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # Keep building Marlin for 9.0 as there are some group sizes and shapes that
  # are not supported by Machete yet.
  # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
  if (MARLIN_ARCHS)

    #
@ -454,7 +457,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # kernels for the remaining archs that are not already built for 3x.
  # (Build 8.9 for FP8)
  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-    "7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
+    "7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
  # subtract out the archs that are already built for 3x
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
  if (SCALED_MM_2X_ARCHS)
@ -543,8 +546,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # CUTLASS MoE kernels

  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
-  # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
-  # if it's possible to compile MoE kernels that use its output.
+  # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
+  # to compile MoE kernels that use its output.
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
@ -684,7 +687,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
  # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
  if (MARLIN_MOE_ARCHS)

    #
--- a/README.md
+++ b/README.md
@ -58,8 +58,8 @@ vLLM is fast with:
 - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
 - Continuous batching of incoming requests
 - Fast model execution with CUDA/HIP graph
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer
+- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8.
+- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
 - Speculative decoding
 - Chunked prefill

@ -72,14 +72,14 @@ vLLM is flexible and easy to use with:
 - Tensor parallelism and pipeline parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
+- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
 - Prefix caching support
 - Multi-LoRA support

 vLLM seamlessly supports most popular open-source models on HuggingFace, including:
 - Transformer-like LLMs (e.g., Llama)
 - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
- Embedding Models (e.g., E5-Mistral)
+- Embedding Models (e.g. E5-Mistral)
 - Multi-modal LLMs (e.g., LLaVA)

 Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
@ -162,4 +162,4 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs

 ## Media Kit

- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit)
+- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
--- a/benchmarks/auto_tune.sh
+++ b/benchmarks/auto_tune.sh
@ -10,15 +10,11 @@
 # 3. Set variables (ALL REQUIRED)
 #   BASE: your directory for vllm repo
 #   MODEL: the model served by vllm
-#   TP: ways of tensor parallelism
 #   DOWNLOAD_DIR: directory to download and load model weights.
 #   INPUT_LEN: request input len
 #   OUTPUT_LEN: request output len
 #   MIN_CACHE_HIT_PCT: prefix cache rate
 #   MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
-#   NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
-#   NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
-#   Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
 # 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
 # 5. The final result will be saved in RESULT file. 

@ -34,27 +30,31 @@
 TAG=$(date +"%Y_%m_%d_%H_%M")
 BASE=""
 MODEL="meta-llama/Llama-3.1-8B-Instruct"
-TP=1
 DOWNLOAD_DIR=""
 INPUT_LEN=4000
 OUTPUT_LEN=16
-MIN_CACHE_HIT_PCT=0
+MIN_CACHE_HIT_PCT_PCT=0
 MAX_LATENCY_ALLOWED_MS=100000000000
-NUM_SEQS_LIST="128 256"
-NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"

 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"

-echo "result file: $RESULT"
+echo "result file$ $RESULT"
 echo "model: $MODEL"
+echo

 rm -rf $LOG_FOLDER
 mkdir -p $LOG_FOLDER

 cd "$BASE/vllm"
+# create sonnet-4x.txt so that we can sample 2048 tokens for input
+echo "" > benchmarks/sonnet_4x.txt
+for _ in {1..4}
+do
+cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
+done

-pip install -q datasets
+pip install datasets

 current_hash=$(git rev-parse HEAD)
 echo "hash:$current_hash" >> "$RESULT"
@ -64,69 +64,53 @@ best_throughput=0
 best_max_num_seqs=0
 best_num_batched_tokens=0
 best_goodput=0
-
-start_server() {
-    local gpu_memory_utilization=$1
-    local max_num_seqs=$2
-    local max_num_batched_tokens=$3
-    local vllm_log=$4
-    
-    pkill -f vllm
-
-    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
-        --disable-log-requests \
-        --port 8004 \
-        --gpu-memory-utilization $gpu_memory_utilization \
-        --max-num-seqs $max_num_seqs \
-        --max-num-batched-tokens $max_num_batched_tokens \
-        --tensor-parallel-size $TP \
-        --enable-prefix-caching \
-        --load-format dummy \
-        --download-dir "$DOWNLOAD_DIR" \
-        --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
-
-    # wait for 10 minutes...
-    server_started=0
-    for i in {1..60}; do  
-        RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
-        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) 
-        if [[ "$STATUS_CODE" -eq 200 ]]; then
-            server_started=1
-            break
-        else
-            sleep 10
-        fi
-    done
-    if (( ! server_started )); then
-        echo "server did not start within 10 minutes. Please check server log at $vllm_log".
-        return 1
-    else
-        return 0
-    fi
-}
-
 run_benchmark() {
    local max_num_seqs=$1
    local max_num_batched_tokens=$2
-    local gpu_memory_utilization=$3
    echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
    echo "vllm_log: $vllm_log"
    echo
    rm -f $vllm_log
-    pkill -f vllm

-    echo "starting server..."
-    start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log
-    result=$?
-    if [[ "$result" -eq 1 ]]; then
-        echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
-    else
-        echo "server started."
-    fi
+    # start the server
+    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
+        --disable-log-requests \
+        --port 8004 \
+        --gpu-memory-utilization 0.98 \
+        --max-num-seqs $max_num_seqs \
+        --max-num-batched-tokens $max_num_batched_tokens \
+        --tensor-parallel-size 1 \
+        --enable-prefix-caching \
+        --load-format dummy \
+        --download-dir $DOWNLOAD_DIR \
+        --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
+    echo "wait for 10 minutes.."
    echo
+    # wait for 10 minutes...
+    server_started=0
+    for i in {1..60}; do        
+        if grep -Fq "Application startup complete" "$vllm_log"; then
+            echo "Application started"
+            server_started=1
+            break
+        else
+            # echo "wait for 10 seconds..."
+            sleep 10
+        fi
+    done
+ 
+    if (( ! server_started )); then
+        echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
+        echo "pkill -f vllm"
+        echo
+        pkill vllm
+        sleep 10
+        return 1
+    fi
    
    echo "run benchmark test..."
+    echo
    meet_latency_requirement=0
    # get a basic qps by using request-rate inf
    bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
@ -134,29 +118,29 @@ run_benchmark() {
    python benchmarks/benchmark_serving.py \
        --backend vllm \
        --model $MODEL  \
-        --dataset-name random \
-        --random-input-len $INPUT_LEN \
-        --random-output-len $OUTPUT_LEN \
+        --dataset-name sonnet \
+        --dataset-path benchmarks/sonnet_4x.txt \
+        --sonnet-input-len $INPUT_LEN \
+        --sonnet-output-len $OUTPUT_LEN \
        --ignore-eos \
        --disable-tqdm \
        --request-rate inf \
        --percentile-metrics ttft,tpot,itl,e2el \
        --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
-        --num-prompts 1000 \
-        --random-prefix-len $prefix_len \
-        --port 8004 &> "$bm_log"
-    throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
+        --num-prompts 100 \
+        --sonnet-prefix-len $prefix_len \
+        --port 8004 > "$bm_log"
+    through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
    e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
    goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')

    if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
        meet_latency_requirement=1
-        request_rate=inf
    fi

    if (( ! meet_latency_requirement )); then
-    # start from request-rate as int(throughput) + 1
-        request_rate=$((${throughput%.*} + 1))
+    # start from request-rate as int(through_put) + 1
+        request_rate=$((${through_put%.*} + 1))
        while ((request_rate > 0)); do
            # clear prefix cache
            curl -X POST http://0.0.0.0:8004/reset_prefix_cache
@ -165,18 +149,19 @@ run_benchmark() {
            python benchmarks/benchmark_serving.py \
                --backend vllm \
                --model $MODEL  \
-                --dataset-name random \
-                --random-input-len $INPUT_LEN \
-                --random-output-len $OUTPUT_LEN \
-                --ignore-eos \
+                --dataset-name sonnet \
+                --dataset-path benchmarks/sonnet_4x.txt \
+                --sonnet-input-len $INPUT_LEN \
+                --sonnet-output-len $OUTPUT_LEN \
+                --ignore_eos \
                --disable-tqdm \
                --request-rate $request_rate \
                --percentile-metrics ttft,tpot,itl,e2el \
                --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
                --num-prompts 100 \
-                --random-prefix-len $prefix_len \
-                --port 8004 &> "$bm_log"
-            throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
+                --sonnet-prefix-len $prefix_len \
+                --port 8004 > "$bm_log"
+            through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
            e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
            goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
            if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
@ -188,10 +173,10 @@ run_benchmark() {
    fi
    # write the results and update the best result.
    if ((meet_latency_requirement)); then
-        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput"
-        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT"
-        if (( $(echo "$throughput > $best_throughput" | bc -l) )); then
-            best_throughput=$throughput
+        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput"
+        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT"
+        if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
+            best_throughput=$through_put
            best_max_num_seqs=$max_num_seqs
            best_num_batched_tokens=$max_num_batched_tokens
            best_goodput=$goodput
@ -203,39 +188,22 @@ run_benchmark() {

    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"

+    echo "pkill -f vllm"
+    echo
    pkill vllm
    sleep 10
+    rm -f $vllm_log
    printf '=%.0s' $(seq 1 20)
    return 0
 }

-read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
-read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"

-# first find out the max gpu-memory-utilization without HBM OOM.
-gpu_memory_utilization=0.98
-find_gpu_memory_utilization=0
-while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
-    start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
-    result=$?
-    if [[ "$result" -eq 0 ]]; then
-        find_gpu_memory_utilization=1
-        break
-    else
-        gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
-    fi
-done
-
-if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
-    echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
-else
-    echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
-    exit 1
-fi
-
-for num_seqs in "${num_seqs_list[@]}"; do
-    for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
-        run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
+num_seqs_list="128 256"
+num_batched_tokens_list="512 1024 2048 4096"
+for num_seqs in $num_seqs_list; do
+    for num_batched_tokens in $num_batched_tokens_list; do
+        run_benchmark $num_seqs $num_batched_tokens
+        exit 0
    done
 done
 echo "finish permutations"
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import io
 import json
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This module defines a framework for sampling benchmark requests from various
 datasets. Each dataset subclass of BenchmarkDataset must implement sample
@ -865,15 +864,7 @@ class InstructCoderDataset(HuggingFaceDataset):
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
-            prompt = f"{item['input']}\n\n{item['instruction']} Just output \
-            the code, do not include any explanation."
-
-            # apply template
-            prompt = tokenizer.apply_chat_template(
-                [{"role": "user", "content": prompt}],
-                add_generation_prompt=True,
-                tokenize=False,
-            )
+            prompt = f"{item['instruction']}:\n{item['input']}"
            prompt_len = len(tokenizer(prompt).input_ids)
            sampled_requests.append(
                SampleRequest(
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark the latency of processing a single batch of requests."""

 import argparse
--- a/benchmarks/benchmark_long_document_qa_throughput.py
+++ b/benchmarks/benchmark_long_document_qa_throughput.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Offline benchmark to test the long document QA throughput.

--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Benchmark the efficiency of prefix caching.

--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark offline prioritization."""

 import argparse
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 r"""Benchmark online serving throughput.

 On the server side, run one of the following commands:
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 r"""Benchmark online serving throughput with structured outputs.

 On the server side, run one of the following commands:
@ -12,6 +11,7 @@ On the client side, run:
        --model <your_model> \
        --dataset json \
        --structured-output-ratio 1.0 \
+        --structured-output-backend auto \
        --request-rate 10 \
        --num-prompts 1000

--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark offline inference throughput."""

 import argparse
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 import json
@ -66,9 +65,4 @@ class InfEncoder(json.JSONEncoder):

 def write_to_json(filename: str, records: list) -> None:
    with open(filename, "w") as f:
-        json.dump(
-            records,
-            f,
-            cls=InfEncoder,
-            default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
-        )
+        json.dump(records, f, cls=InfEncoder)
--- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 import copy
--- a/benchmarks/cutlass_benchmarks/utils.py
+++ b/benchmarks/cutlass_benchmarks/utils.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 # Cutlass bench utils
 from collections.abc import Iterable
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 import copy
--- a/benchmarks/cutlass_benchmarks/weight_shapes.py
+++ b/benchmarks/cutlass_benchmarks/weight_shapes.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 # Weight Shapes are in the format
 # ([K, N], TP_SPLIT_DIM)
--- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
+++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import os

--- a/benchmarks/disagg_benchmarks/round_robin_proxy.py
+++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import asyncio
 import itertools
--- a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py
+++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import json

--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import pickle as pkl
 import time
--- a/benchmarks/kernels/bench_fp8_gemm.py
+++ b/benchmarks/kernels/bench_fp8_gemm.py
@ -1,15 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import copy
 import itertools

 import torch
+import triton
 from weight_shapes import WEIGHT_SHAPES

 from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
 from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
-from vllm.triton_utils import triton


@triton.testing.perf_report(
--- a/benchmarks/kernels/benchmark_aqlm.py
+++ b/benchmarks/kernels/benchmark_aqlm.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import os
 import sys
--- a/benchmarks/kernels/benchmark_bitblas.py
+++ b/benchmarks/kernels/benchmark_bitblas.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.

--- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
+++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe
 kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit
@ -91,7 +90,7 @@ def bench_run(

    score = torch.randn((m, num_experts), device=device, dtype=dtype)

-    topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
+    topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)

    quant_blocksize = 16
    w1_blockscale = torch.empty(
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import torch
 import torch.utils.benchmark as benchmark
@ -7,8 +6,8 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE

 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
 from vllm.model_executor.layers.fused_moe.fused_moe import (
+    cutlass_moe_fp8,
    fused_experts,
    fused_topk,
 )
@ -70,9 +69,18 @@ def bench_run(
    w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
    w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)

+    ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
+    c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
+    ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
+    c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
+
    for expert in range(num_experts):
        w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
        w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
+    w1_q_notransp = w1_q.clone()
+    w2_q_notransp = w2_q.clone()
+    w1_q = w1_q.transpose(1, 2)
+    w2_q = w2_q.transpose(1, 2)

    score = torch.randn((m, num_experts), device="cuda", dtype=dtype)

@ -113,6 +121,10 @@ def bench_run(
        w2_scale: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
+        ab_strides1: torch.Tensor,
+        c_strides1: torch.Tensor,
+        ab_strides2: torch.Tensor,
+        c_strides2: torch.Tensor,
        num_repeats: int,
    ):
        for _ in range(num_repeats):
@ -120,10 +132,14 @@ def bench_run(
                a,
                w1,
                w2,
-                topk_weights,
-                topk_ids,
                w1_scale,
                w2_scale,
+                topk_weights,
+                topk_ids,
+                ab_strides1,
+                c_strides1,
+                ab_strides2,
+                c_strides2,
                a1_scale=a_scale,
            )

@ -136,6 +152,10 @@ def bench_run(
        w2_scale: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
+        ab_strides1: torch.Tensor,
+        c_strides1: torch.Tensor,
+        ab_strides2: torch.Tensor,
+        c_strides2: torch.Tensor,
    ):
        with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
@ -144,10 +164,14 @@ def bench_run(
                a,
                w1_q,
                w2_q,
-                topk_weights,
-                topk_ids,
                w1_scale,
                w2_scale,
+                topk_weights,
+                topk_ids,
+                ab_strides1,
+                c_strides1,
+                ab_strides2,
+                c_strides2,
                a1_scale=a_scale,
            )

@ -193,6 +217,10 @@ def bench_run(
            w2_scale,
            topk_weights,
            topk_ids,
+            ab_strides1,
+            c_strides1,
+            ab_strides2,
+            c_strides2,
        )
    torch.cuda.synchronize()

@ -201,8 +229,8 @@ def bench_run(
    with torch.cuda.graph(triton_graph, stream=triton_stream):
        run_triton_from_graph(
            a,
-            w1_q,
-            w2_q,
+            w1_q_notransp,
+            w2_q_notransp,
            topk_weights,
            topk_ids,
            w1_scale,
@ -221,12 +249,18 @@ def bench_run(
        "w2": w2,
        "score": score,
        "topk": topk,
+        "w1_q_notransp": w1_q_notransp,
+        "w2_q_notransp": w2_q_notransp,
        # Cutlass params
        "a_scale": a_scale,
        "w1_q": w1_q,
        "w2_q": w2_q,
        "w1_scale": w1_scale,
        "w2_scale": w2_scale,
+        "ab_strides1": ab_strides1,
+        "c_strides1": c_strides1,
+        "ab_strides2": ab_strides2,
+        "c_strides2": c_strides2,
        # cuda graph params
        "cutlass_graph": cutlass_graph,
        "triton_graph": triton_graph,
@ -244,8 +278,8 @@ def bench_run(
    # Warmup
    run_triton_moe(
        a,
-        w1_q,
-        w2_q,
+        w1_q_notransp,
+        w2_q_notransp,
        topk_weights,
        topk_ids,
        w1_scale,
@ -256,7 +290,7 @@ def bench_run(

    results.append(
        benchmark.Timer(
-            stmt="run_triton_moe(a, w1_q, w2_q, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)",  # noqa: E501
+            stmt="run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
@ -287,12 +321,16 @@ def bench_run(
        w2_scale,
        topk_weights,
        topk_ids,
+        ab_strides1,
+        c_strides1,
+        ab_strides2,
+        c_strides2,
        num_warmup,
    )

    results.append(
        benchmark.Timer(
-            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, num_runs)",  # noqa: E501
+            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import time

--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 import copy
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 import copy
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import torch
 import torch.utils.benchmark as benchmark
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 import json
@ -7,6 +6,7 @@ import time
 from contextlib import nullcontext
 from datetime import datetime
 from itertools import product
+from types import SimpleNamespace
 from typing import Any, TypedDict

 import ray
@ -42,7 +42,7 @@ def benchmark_config(
    use_fp8_w8a8: bool,
    use_int8_w8a16: bool,
    num_iters: int = 100,
-    block_quant_shape: list[int] = None,
+    block_quant_shape: List[int] = None,
    use_deep_gemm: bool = False,
 ) -> float:
    init_dtype = torch.float16 if use_fp8_w8a8 else dtype
@ -399,7 +399,7 @@ class BenchmarkWorker:
        dtype: torch.dtype,
        use_fp8_w8a8: bool,
        use_int8_w8a16: bool,
-        block_quant_shape: list[int] = None,
+        block_quant_shape: List[int] = None,
        use_deep_gemm: bool = False,
    ) -> tuple[dict[str, int], float]:
        current_platform.seed_everything(self.seed)
@ -531,7 +531,7 @@ def save_configs(
    dtype: torch.dtype,
    use_fp8_w8a8: bool,
    use_int8_w8a16: bool,
-    block_quant_shape: list[int],
+    block_quant_shape: List[int],
 ) -> None:
    dtype_str = get_config_dtype_str(
        dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
@ -562,6 +562,7 @@ def main(args: argparse.Namespace):
    config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
    if args.model_prefix:
        config = getattr(config, args.model_prefix)
+    config = SimpleNamespace(**config)

    if config.architectures[0] == "DbrxForCausalLM":
        E = config.ffn_config.moe_num_experts
@ -593,7 +594,11 @@ def main(args: argparse.Namespace):
        shard_intermediate_size = 2 * intermediate_size // args.tp_size

    hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = (
+        torch.float16
+        if current_platform.is_rocm()
+        else getattr(torch, config.torch_dtype)
+    )
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
    block_quant_shape = get_weight_block_size_safety(config)
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 from typing import Any, TypedDict
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import random
 import time
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import time

--- a/benchmarks/kernels/benchmark_rmsnorm.py
+++ b/benchmarks/kernels/benchmark_rmsnorm.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import itertools
 from typing import Optional, Union
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from itertools import accumulate
 from typing import Optional
--- a/benchmarks/kernels/benchmark_shapes.py
+++ b/benchmarks/kernels/benchmark_shapes.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 WEIGHT_SHAPES = {
    "ideal": [[4 * 256 * 32, 256 * 32]],
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from sglang quantization/tuning_block_wise_kernel.py

 import argparse
--- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # fmt: off
 # ruff: noqa: E501
 import time
--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import math
 import pickle
--- a/benchmarks/kernels/utils.py
+++ b/benchmarks/kernels/utils.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import dataclasses
 from collections.abc import Iterable
--- a/benchmarks/kernels/weight_shapes.py
+++ b/benchmarks/kernels/weight_shapes.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 # Weight Shapes are in the format
 # ([K, N], TP_SPLIT_DIM)
--- a/benchmarks/overheads/benchmark_hashing.py
+++ b/benchmarks/overheads/benchmark_hashing.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import cProfile
 import pstats
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@ -75,7 +75,6 @@ if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
 else()
    find_isa(${CPUINFO} "avx2" AVX2_FOUND)
    find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
-    find_isa(${CPUINFO} "Power11" POWER11_FOUND)
    find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
    find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
    find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
@ -107,19 +106,13 @@ elseif (AVX2_FOUND)
    list(APPEND CXX_COMPILE_FLAGS "-mavx2")
    message(WARNING "vLLM CPU backend using AVX2 ISA")
    
-elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
+elseif (POWER9_FOUND OR POWER10_FOUND)
    message(STATUS "PowerPC detected")
-    if (POWER9_FOUND)
-        list(APPEND CXX_COMPILE_FLAGS
-            "-mvsx"
-            "-mcpu=power9"
-            "-mtune=power9")
-    elseif (POWER10_FOUND OR POWER11_FOUND)
-        list(APPEND CXX_COMPILE_FLAGS
-            "-mvsx"
-            "-mcpu=power10"
-            "-mtune=power10")
-    endif()
+    # Check for PowerPC VSX support
+    list(APPEND CXX_COMPILE_FLAGS
+        "-mvsx"
+        "-mcpu=native"
+        "-mtune=native")

 elseif (ASIMD_FOUND)
    message(STATUS "ARMv8 or later architecture detected")
--- a/cmake/hipify.py
+++ b/cmake/hipify.py
@ -1,6 +1,5 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 #
 # A command line tool for running pytorch's hipify preprocessor on CUDA
--- a/csrc/attention/mla/cutlass_mla_kernels.cu
+++ b/csrc/attention/mla/cutlass_mla_kernels.cu
@ -119,7 +119,7 @@ typename T::Fmha::Arguments args_from_options(
      {static_cast<ElementOut*>(out.data_ptr()), stride_O,
       static_cast<ElementAcc*>(nullptr), stride_LSE},
      hw_info,
-      1,        // split_kv
+      -1,       // split_kv
      nullptr,  // is_var_split_kv
  };
  // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
--- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
+++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import enum
 from typing import Union
--- a/csrc/moe/marlin_moe_wna16/generate_kernels.py
+++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import glob
 import itertools
 import os
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@ -30,8 +30,4 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                             int64_t BLOCK_SIZE_K, int64_t bit);
 #endif

-bool moe_permute_unpermute_supported();
-
-void shuffle_rows(const torch::Tensor& input_tensor,
-                  const torch::Tensor& dst2src_map,
-                  torch::Tensor& output_tensor);
+bool moe_permute_unpermute_supported();
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@ -130,62 +130,6 @@ void moe_unpermute(
  });
 }

-template <typename T>
-__global__ void shuffleInputRowsKernel(const T* input,
-                                       const int32_t* dst2src_map, T* output,
-                                       int64_t num_src_rows,
-                                       int64_t num_dst_rows, int64_t num_cols) {
-  int64_t dest_row_idx = blockIdx.x;
-  int64_t const source_row_idx = dst2src_map[dest_row_idx];
-
-  if (blockIdx.x < num_dst_rows) {
-    // Load 128-bits per thread
-    constexpr int64_t ELEM_PER_THREAD = 128 / sizeof(T) / 8;
-    using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
-
-    // Duplicate and permute rows
-    auto const* source_row_ptr =
-        reinterpret_cast<DataElem const*>(input + source_row_idx * num_cols);
-    auto* dest_row_ptr =
-        reinterpret_cast<DataElem*>(output + dest_row_idx * num_cols);
-
-    int64_t const start_offset = threadIdx.x;
-    int64_t const stride = blockDim.x;
-    int64_t const num_elems_in_col = num_cols / ELEM_PER_THREAD;
-
-    for (int elem_index = start_offset; elem_index < num_elems_in_col;
-         elem_index += stride) {
-      dest_row_ptr[elem_index] = source_row_ptr[elem_index];
-    }
-  }
-}
-
-void shuffle_rows(const torch::Tensor& input_tensor,
-                  const torch::Tensor& dst2src_map,
-                  torch::Tensor& output_tensor) {
-  TORCH_CHECK(input_tensor.scalar_type() == output_tensor.scalar_type(),
-              "Input and output tensors must have the same data type");
-
-  auto stream = at::cuda::getCurrentCUDAStream().stream();
-  int64_t const blocks = output_tensor.size(0);
-  int64_t const threads = 256;
-  int64_t const num_dest_rows = output_tensor.size(0);
-  int64_t const num_src_rows = input_tensor.size(0);
-  int64_t const num_cols = input_tensor.size(1);
-
-  TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)),
-              "num_cols must be divisible by 128 / "
-              "sizeof(input_tensor.scalar_type()) / 8");
-
-  MOE_DISPATCH(input_tensor.scalar_type(), [&] {
-    shuffleInputRowsKernel<scalar_t><<<blocks, threads, 0, stream>>>(
-        reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
-        dst2src_map.data_ptr<int32_t>(),
-        reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
-        num_dest_rows, num_cols);
-  });
-}
-
 #else

 void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
--- a/csrc/moe/permute_unpermute_kernels/dispatch.h
+++ b/csrc/moe/permute_unpermute_kernels/dispatch.h
@ -14,13 +14,12 @@
    __VA_ARGS__();                                         \
    break;                                                 \
  }
-#define MOE_DISPATCH_FLOAT_CASE(...)                            \
-  MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)         \
-  MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)          \
-  MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)      \
-  MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)   \
-  MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \
-  MOE_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)
+#define MOE_DISPATCH_FLOAT_CASE(...)                          \
+  MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)       \
+  MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)        \
+  MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)    \
+  MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \
+  MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)

 #define MOE_DISPATCH(TYPE, ...) \
  MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__))
@ -40,11 +39,6 @@ template <>
 struct ScalarType2CudaType<at::ScalarType::BFloat16> {
  using type = __nv_bfloat16;
 };
-// uint8 for packed fp4
-template <>
-struct ScalarType2CudaType<at::ScalarType::Byte> {
-  using type = uint8_t;
-};

 // #if __CUDA_ARCH__ >= 890
 // fp8
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@ -516,8 +516,9 @@ void topk_softmax(
            topk,
            stream);
    }
-    else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
+    else
    {
+        assert(topk_indices.scalar_type() == at::ScalarType::UInt32);
        vllm::moe::topkGatingSoftmaxKernelLauncher(
            gating_output.data_ptr<float>(),
            topk_weights.data_ptr<float>(),
@ -529,17 +530,4 @@ void topk_softmax(
            topk,
            stream);
    }
-    else {
-        assert(topk_indices.scalar_type() == at::ScalarType::Int64);
-        vllm::moe::topkGatingSoftmaxKernelLauncher(
-            gating_output.data_ptr<float>(),
-            topk_weights.data_ptr<float>(),
-            topk_indices.data_ptr<int64_t>(),
-            token_expert_indices.data_ptr<int>(),
-            softmax_workspace.data_ptr<float>(),
-            num_tokens,
-            num_experts,
-            topk,
-            stream);
-    }
 }
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@ -81,12 +81,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
  m.def("moe_permute_unpermute_supported() -> bool");
  m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);

-  // Row shuffle for MoE
-  m.def(
-      "shuffle_rows(Tensor input_tensor, Tensor dst2src_map, Tensor! "
-      "output_tensor) -> ()");
-  m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows);
-
 #endif
 }

--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -92,11 +92,6 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                        torch::Tensor& weight, double epsilon);

-void apply_repetition_penalties_(torch::Tensor& logits,
-                                 const torch::Tensor& prompt_mask,
-                                 const torch::Tensor& output_mask,
-                                 const torch::Tensor& repetition_penalties);
-
 void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                               torch::Tensor& weight, torch::Tensor& scale,
                               double epsilon);
@ -236,8 +231,7 @@ void cutlass_moe_mm(
    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch);
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides);

 void cutlass_fp4_group_mm(
    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
@ -249,16 +243,7 @@ void get_cutlass_moe_mm_data(
    const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
    torch::Tensor& input_permutation, torch::Tensor& output_permutation,
-    const int64_t num_experts, const int64_t n, const int64_t k,
-    const std::optional<torch::Tensor>& blockscale_offsets);
-
-void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
-                                  torch::Tensor& problem_sizes1,
-                                  torch::Tensor& problem_sizes2,
-                                  const torch::Tensor& expert_num_tokens,
-                                  const int64_t num_local_experts,
-                                  const int64_t padded_m, const int64_t n,
-                                  const int64_t k);
+    const int64_t num_experts, const int64_t n, const int64_t k);

 void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
                           torch::Tensor const& b,
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu
@ -9,6 +9,10 @@ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out,
                                           torch::Tensor const& b,
                                           torch::Tensor const& a_scales,
                                           torch::Tensor const& b_scales) {
+  TORCH_CHECK(
+      a.size(0) % 4 == 0,
+      "Input tensor must have a number of rows that is a multiple of 4. ",
+      "but got: ", a.size(0), " rows.");
  if (out.dtype() == torch::kBFloat16) {
    cutlass_gemm_blockwise_sm100_fp8_dispatch<cutlass::bfloat16_t>(
        out, a, b, a_scales, b_scales);
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
@ -1,6 +1,5 @@
 #pragma once

-#include "cuda_utils.h"
 #include "cutlass/cutlass.h"
 #include "cutlass/numeric_types.h"

@ -23,49 +22,49 @@ namespace vllm {

 using namespace cute;

-// clang-format off
-template <class OutType, int ScaleGranularityM,
-          int ScaleGranularityN, int ScaleGranularityK,
-          class MmaTileShape, class ClusterShape,
-          class EpilogueScheduler, class MainloopScheduler,
-          bool swap_ab_ = false>
+template <typename OutType, typename MmaTileShape, typename ScalesPerTile,
+          class ClusterShape, typename EpilogueScheduler,
+          typename MainloopScheduler>
 struct cutlass_3x_gemm_fp8_blockwise {
-  static constexpr bool swap_ab = swap_ab_;
  using ElementAB = cutlass::float_e4m3_t;

  using ElementA = ElementAB;
  using LayoutA = cutlass::layout::RowMajor;
-  using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
  static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

  using ElementB = ElementAB;
  using LayoutB = cutlass::layout::ColumnMajor;
-  using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
  static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

+  using ElementC = void;
  using ElementD = OutType;
  using LayoutD = cutlass::layout::RowMajor;
-  using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose<LayoutD>::type;
  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

-  using ElementC = void; // TODO: support bias
  using LayoutC = LayoutD;
-  using LayoutC_Transpose = LayoutD_Transpose;
  static constexpr int AlignmentC = AlignmentD;

  using ElementAccumulator = float;
  using ElementCompute = float;
  using ElementBlockScale = float;

-  using ScaleConfig = conditional_t<swap_ab,
-      cutlass::detail::Sm100BlockwiseScaleConfig<
-        ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
-        cute::UMMA::Major::K, cute::UMMA::Major::MN>,
-      cutlass::detail::Sm100BlockwiseScaleConfig<
-        ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
-        cute::UMMA::Major::MN, cute::UMMA::Major::K>>;
+  // MMA and Cluster Tile Shapes
+  // Shape of the tile computed by tcgen05 MMA, could be across 2 SMs if Cluster
+  // Shape %2 == 0 using MmaTileShape_MNK = Shape<_128,_128,_128>;
+  static constexpr int ScaleMsPerTile = size<0>(ScalesPerTile{});
+  static constexpr int ScaleGranularityM =
+      size<0>(MmaTileShape{}) / ScaleMsPerTile;
+  static constexpr int ScaleGranularityN =
+      size<1>(MmaTileShape{}) / size<1>(ScalesPerTile{});
+  static constexpr int ScaleGranularityK =
+      size<2>(MmaTileShape{}) / size<2>(ScalesPerTile{});

-  // layout_SFA and layout_SFB cannot be swapped since they are deduced.
+  // Shape of the threadblocks in a cluster
+  using ClusterShape_MNK = ClusterShape;
+
+  using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
+      ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
+      cute::UMMA::Major::MN, cute::UMMA::Major::K>;
  using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
  using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());

@ -74,6 +73,7 @@ struct cutlass_3x_gemm_fp8_blockwise {

  static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
  using ElementScalar = float;
+  // clang-format off
  using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      ArchTag,
@ -84,47 +84,33 @@ struct cutlass_3x_gemm_fp8_blockwise {
      ElementAccumulator,
      ElementCompute,
      ElementC,
-      conditional_t<swap_ab, LayoutC_Transpose, LayoutC>,
+      LayoutC,
      AlignmentC,
      ElementD,
-      conditional_t<swap_ab, LayoutD_Transpose, LayoutD>,
+      LayoutD,
      AlignmentD,
      EpilogueScheduler,
      DefaultOperation
  >::CollectiveOp;
 
  using StageCountType = cutlass::gemm::collective::StageCountAuto; 
-  using CollectiveMainloop = conditional_t<swap_ab,
-      typename cutlass::gemm::collective::CollectiveBuilder<
-          ArchTag,
-          OperatorClass,
-          ElementB,
-          cute::tuple<LayoutB_Transpose, LayoutSFA>,
-          AlignmentB,
-          ElementA,
-          cute::tuple<LayoutA_Transpose, LayoutSFB>,
-          AlignmentA,
-          ElementAccumulator,
-          MmaTileShape,
-          ClusterShape,
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      ElementA,
+      cute::tuple<LayoutA, LayoutSFA>,
+      AlignmentA,
+      ElementB,
+      cute::tuple<LayoutB, LayoutSFB>,
+      AlignmentB,
+      ElementAccumulator,
+      MmaTileShape,
+      ClusterShape,
+
          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
-          MainloopScheduler
-      >::CollectiveOp,
-      typename cutlass::gemm::collective::CollectiveBuilder<
-          ArchTag,
-          OperatorClass,
-          ElementA,
-          cute::tuple<LayoutA, LayoutSFA>,
-          AlignmentA,
-          ElementB,
-          cute::tuple<LayoutB, LayoutSFB>,
-          AlignmentB,
-          ElementAccumulator,
-          MmaTileShape,
-          ClusterShape,
-          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
-          MainloopScheduler
-      >::CollectiveOp>;
+      MainloopScheduler
+  >::CollectiveOp;
+  // clang-format on

  using KernelType = enable_sm100_only<cutlass::gemm::kernel::GemmUniversal<
      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
@ -137,7 +123,6 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
                                   torch::Tensor const& b,
                                   torch::Tensor const& a_scales,
                                   torch::Tensor const& b_scales) {
-  static constexpr bool swap_ab = Gemm::swap_ab;
  using GemmKernel = typename Gemm::GemmKernel;
  using StrideA = typename Gemm::GemmKernel::StrideA;
  using StrideB = typename Gemm::GemmKernel::StrideB;
@ -151,6 +136,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
  using ElementD = typename Gemm::ElementD;

  int32_t m = a.size(0), n = b.size(1), k = a.size(1);
+  auto prob_shape = cute::make_shape(m, n, k, 1);

  StrideA a_stride;
  StrideB b_stride;
@ -160,13 +146,11 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
  b_stride =
      cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
  c_stride =
-      cutlass::make_cute_packed_stride(StrideC{}, swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));
+      cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));

-  LayoutSFA layout_SFA = swap_ab ? 
-      ScaleConfig::tile_atom_to_shape_SFA(make_shape(n, m, k, 1)) :
+  LayoutSFA layout_SFA =
      ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
-  LayoutSFB layout_SFB = swap_ab ?
-      ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) :
+  LayoutSFB layout_SFB =
      ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));

  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
@ -174,22 +158,9 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
  auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
  auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());

-  auto mainloop_args = [&](){
-    // layout_SFA and layout_SFB cannot be swapped since they are deduced.
-    if (swap_ab) {
-      return typename GemmKernel::MainloopArguments{
-          b_ptr,        b_stride,   a_ptr,        a_stride,
-          b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB
-      };
-    }
-    else {
-      return typename GemmKernel::MainloopArguments{
-          a_ptr,        a_stride,   b_ptr,        b_stride,
-          a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
-      };
-    }
-  }();
-  auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
+  typename GemmKernel::MainloopArguments mainloop_args{
+      a_ptr,        a_stride,   b_ptr,        b_stride,
+      a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB};

  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
  typename GemmKernel::EpilogueArguments epilogue_args{
@ -204,74 +175,29 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
                                               torch::Tensor const& b,
                                               torch::Tensor const& a_scales,
                                               torch::Tensor const& b_scales) {
-  int32_t m = a.size(0), n = b.size(1), k = a.size(1), sms;
+  auto m = a.size(0);
+  auto k = a.size(1);
+  auto n = b.size(1);
+  int sms;
  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());

-  constexpr int TILE_K = 128;
-  // TODO: better heuristics
-  bool swap_ab = (m < 16) || (m % 4 != 0);
-  bool use_tma_epilogue = (m * n) % 4 == 0;
-  if (!swap_ab) {
-    constexpr int TILE_N = 128;
-    int tile_m = 256;
-    if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 64) <= sms) {
-      tile_m = 64;
-    }
-    else if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 128) <= sms) {
-      tile_m = 128;
-    }
-    if (tile_m == 64) {
-      if (use_tma_epilogue) {
-        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-            OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
-            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
-            out, a, b, a_scales, b_scales);
-      } else {
-        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-            OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
-            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
-            out, a, b, a_scales, b_scales);
-      }
-    } else if (tile_m == 128) {
-      if (use_tma_epilogue) {
-        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-            OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
-            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
-            out, a, b, a_scales, b_scales);
-      } else {
-        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-            OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
-            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
-            out, a, b, a_scales, b_scales);
-      }
-    } else { // tile_m == 256
-      if (use_tma_epilogue) {
-          cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-              OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_2, _1, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
-            cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
-            out, a, b, a_scales, b_scales);
-      } else {
-          cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-              OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm,
-            cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
-            out, a, b, a_scales, b_scales);
-      }
-    }
-  } else {
-    // TODO: Test more tile N configs
-    constexpr int TILE_M = 128;
-    constexpr int TILE_N = 16;
-    // TMA epilogue isn't compatible with Swap A/B
+  auto should_use_2sm = [&sms](int m, int n, int tile1SM = 128) {
+    return std::ceil(static_cast<float>(m) / tile1SM) *
+               std::ceil(static_cast<float>(n) / tile1SM) >=
+           sms;
+  };
+  bool use_2sm = should_use_2sm(m, n);
+  if (use_2sm) {
    cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-        OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>,
-        Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
-        cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
+        OutType, Shape<_256, _128, _128>, Shape<_256, _1, _1>,
+        Shape<_2, _2, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
+        cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
+        out, a, b, a_scales, b_scales);
+  } else {
+    cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+        OutType, Shape<_128, _128, _128>, Shape<_128, _1, _1>,
+        Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
+        cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
        out, a, b, a_scales, b_scales);
  }
 }
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
@ -15,7 +15,6 @@ using c3x::cutlass_gemm_caller;
 template <typename InType, typename OutType,
          template <typename, typename, typename> typename Epilogue>
 struct sm100_fp8_config_default {
-  // M in (128, inf)
  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
@ -26,34 +25,6 @@ struct sm100_fp8_config_default {
                            KernelSchedule, EpilogueSchedule>;
 };

-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm100_fp8_config_M128 {
-  // M in (64, 128]
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
-  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_128, _128, _64>;
-  using ClusterShape = Shape<_2, _2, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm100_fp8_config_M64 {
-  // M in [1, 64]
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
-  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_64, _64, _256>;
-  using ClusterShape = Shape<_1, _8, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
-};
-
 template <typename InType, typename OutType,
          template <typename, typename, typename> typename Epilogue,
          typename... EpilogueArgs>
@ -68,28 +39,8 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
  using Cutlass3xGemmDefault =
      typename sm100_fp8_config_default<InType, OutType,
                                        Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM64 =
-      typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM128 =
-      typename sm100_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
-
-  uint32_t const m = a.size(0);
-  uint32_t const mp2 =
-      std::max(static_cast<uint32_t>(64), next_pow_2(m));  // next power of 2
-
-  if (mp2 <= 64) {
-    // m in [1, 64]
-    return cutlass_gemm_caller<Cutlass3xGemmM64>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 128) {
-    // m in (64, 128]
-    return cutlass_gemm_caller<Cutlass3xGemmM128>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else {
-    // m in (128, inf)
-    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  }
+  return cutlass_gemm_caller<Cutlass3xGemmDefault>(
+      out, a, b, std::forward<EpilogueArgs>(args)...);
 }

 template <template <typename, typename, typename> typename Epilogue,
--- a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu
@ -84,8 +84,7 @@ void run_cutlass_moe_mm_sm90(
    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
  TORCH_CHECK(a_tensors.size(0) > 0, "No input A tensors provided.");
  TORCH_CHECK(b_tensors.size(0) > 0, "No input B tensors provided.");
  TORCH_CHECK(out_tensors.size(0) > 0, "No output tensors provided.");
@ -114,23 +113,19 @@ void run_cutlass_moe_mm_sm90(
  if (n >= 8192) {
    cutlass_group_gemm_caller<Cutlass3xGemmN8192>(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
+        problem_sizes, a_strides, b_strides, c_strides);
  } else if (k >= 8192) {
    cutlass_group_gemm_caller<Cutlass3xGemmK8192>(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
+        problem_sizes, a_strides, b_strides, c_strides);
  } else if (m <= 16) {
    cutlass_group_gemm_caller<Cutlass3xGemmM16>(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
+        problem_sizes, a_strides, b_strides, c_strides);
  } else {
    cutlass_group_gemm_caller<Cutlass3xGemmDefault>(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
+        problem_sizes, a_strides, b_strides, c_strides);
  }
 }

@ -139,18 +134,15 @@ void dispatch_moe_mm_sm90(
    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
  if (out_tensors.dtype() == torch::kBFloat16) {
    run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::bfloat16_t>(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
+        problem_sizes, a_strides, b_strides, c_strides);
  } else {
    run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::half_t>(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
+        problem_sizes, a_strides, b_strides, c_strides);
  }
 }

@ -161,9 +153,8 @@ void cutlass_moe_mm_sm90(
    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
  dispatch_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                       expert_offsets, problem_sizes, a_strides, b_strides,
-                       c_strides, per_act_token, per_out_ch);
+                       c_strides);
 }
--- a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh
+++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh
@ -76,8 +76,7 @@ void cutlass_group_gemm_caller(
    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
  using ElementAB = typename Gemm::ElementAB;
  using ElementD = typename Gemm::ElementD;

@ -85,6 +84,9 @@ void cutlass_group_gemm_caller(
  int k_size = a_tensors.size(1);
  int n_size = out_tensors.size(1);

+  bool per_act_token = a_scales.numel() != 1;
+  bool per_out_ch = b_scales.numel() != num_experts;
+
  auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());

  auto options_int =
--- a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
@ -7,7 +7,7 @@

 constexpr uint64_t THREADS_PER_EXPERT = 512;

-__global__ void compute_problem_sizes(const uint32_t* __restrict__ topk_ids,
+__global__ void compute_problem_sizes(const int* __restrict__ topk_ids,
                                      int32_t* problem_sizes1,
                                      int32_t* problem_sizes2,
                                      int32_t* atomic_buffer,
@ -45,24 +45,7 @@ __global__ void compute_expert_offsets(
  }
 }

-__global__ void compute_expert_blockscale_offsets(
-    const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets,
-    int32_t* blockscale_offsets, int32_t* atomic_buffer,
-    const int num_experts) {
-  int32_t tot_offset = 0;
-  int32_t tot_offset_round = 0;
-  expert_offsets[0] = 0;
-  blockscale_offsets[0] = 0;
-  for (int i = 0; i < num_experts; ++i) {
-    atomic_buffer[i] = tot_offset;
-    tot_offset += problem_sizes1[i * 3];
-    expert_offsets[i + 1] = tot_offset;
-    tot_offset_round += (problem_sizes1[i * 3] + (128 - 1)) / 128 * 128;
-    blockscale_offsets[i + 1] = tot_offset_round;
-  }
-}
-
-__global__ void compute_arg_sorts(const uint32_t* __restrict__ topk_ids,
+__global__ void compute_arg_sorts(const int* __restrict__ topk_ids,
                                  const int32_t* __restrict__ expert_offsets,
                                  int32_t* input_permutation,
                                  int32_t* output_permutation,
@ -94,8 +77,7 @@ void get_cutlass_moe_mm_data_caller(
    const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
    torch::Tensor& input_permutation, torch::Tensor& output_permutation,
-    const int64_t num_experts, const int64_t n, const int64_t k,
-    const std::optional<torch::Tensor>& blockscale_offsets) {
+    const int64_t num_experts, const int64_t n, const int64_t k) {
  auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
  auto options_int32 =
      torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
@ -103,61 +85,19 @@ void get_cutlass_moe_mm_data_caller(

  int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
  compute_problem_sizes<<<num_experts, num_threads, 0, stream>>>(
-      static_cast<const uint32_t*>(topk_ids.data_ptr()),
+      static_cast<const int32_t*>(topk_ids.data_ptr()),
      static_cast<int32_t*>(problem_sizes1.data_ptr()),
      static_cast<int32_t*>(problem_sizes2.data_ptr()),
      static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n, k);
-  if (blockscale_offsets.has_value()) {
-    compute_expert_blockscale_offsets<<<1, 1, 0, stream>>>(
-        static_cast<const int32_t*>(problem_sizes1.data_ptr()),
-        static_cast<int32_t*>(expert_offsets.data_ptr()),
-        static_cast<int32_t*>(blockscale_offsets.value().data_ptr()),
-        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
-  } else {
-    compute_expert_offsets<<<1, 1, 0, stream>>>(
-        static_cast<const int32_t*>(problem_sizes1.data_ptr()),
-        static_cast<int32_t*>(expert_offsets.data_ptr()),
-        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
-  }
+  compute_expert_offsets<<<1, 1, 0, stream>>>(
+      static_cast<const int32_t*>(problem_sizes1.data_ptr()),
+      static_cast<int32_t*>(expert_offsets.data_ptr()),
+      static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
  compute_arg_sorts<<<num_experts, num_threads, 0, stream>>>(
-      static_cast<const uint32_t*>(topk_ids.data_ptr()),
+      static_cast<const int32_t*>(topk_ids.data_ptr()),
      static_cast<const int32_t*>(expert_offsets.data_ptr()),
      static_cast<int32_t*>(input_permutation.data_ptr()),
      static_cast<int32_t*>(output_permutation.data_ptr()),
      static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(),
      topk_ids.size(1));
 }
-
-__global__ void compute_pplx_data(int32_t* expert_offsets,
-                                  int32_t* problem_sizes1,
-                                  int32_t* problem_sizes2,
-                                  const int32_t* __restrict__ expert_num_tokens,
-                                  const int padded_m, const int n,
-                                  const int k) {
-  int expert_idx = threadIdx.x;
-
-  expert_offsets[expert_idx] = expert_idx * padded_m;
-  problem_sizes1[expert_idx * 3] = expert_num_tokens[expert_idx];
-  problem_sizes1[expert_idx * 3 + 1] = 2 * n;
-  problem_sizes1[expert_idx * 3 + 2] = k;
-  problem_sizes2[expert_idx * 3] = expert_num_tokens[expert_idx];
-  problem_sizes2[expert_idx * 3 + 1] = k;
-  problem_sizes2[expert_idx * 3 + 2] = n;
-}
-
-void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
-                                         torch::Tensor& problem_sizes1,
-                                         torch::Tensor& problem_sizes2,
-                                         const torch::Tensor& expert_num_tokens,
-                                         const int64_t num_local_experts,
-                                         const int64_t padded_m,
-                                         const int64_t n, const int64_t k) {
-  auto stream = at::cuda::getCurrentCUDAStream(expert_offsets.device().index());
-
-  compute_pplx_data<<<1, num_local_experts, 0, stream>>>(
-      static_cast<int32_t*>(expert_offsets.data_ptr()),
-      static_cast<int32_t*>(problem_sizes1.data_ptr()),
-      static_cast<int32_t*>(problem_sizes2.data_ptr()),
-      static_cast<const int32_t*>(expert_num_tokens.data_ptr()), padded_m, n,
-      k);
-}
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@ -36,8 +36,7 @@ void cutlass_moe_mm_sm90(
    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch);
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides);

 #endif

@ -55,16 +54,7 @@ void get_cutlass_moe_mm_data_caller(
    const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
    torch::Tensor& input_permutation, torch::Tensor& output_permutation,
-    const int64_t num_experts, const int64_t n, const int64_t k,
-    const std::optional<torch::Tensor>& blockscale_offsets);
-
-void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
-                                         torch::Tensor& problem_sizes1,
-                                         torch::Tensor& problem_sizes2,
-                                         const torch::Tensor& expert_num_tokens,
-                                         const int64_t num_local_experts,
-                                         const int64_t padded_m,
-                                         const int64_t n, const int64_t k);
+    const int64_t num_experts, const int64_t n, const int64_t k);
 #endif

 void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
@ -216,13 +206,12 @@ void cutlass_moe_mm(
    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
  int32_t version_num = get_sm_version_num();
 #if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
  cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                      expert_offsets, problem_sizes, a_strides, b_strides,
-                      c_strides, per_act_token, per_out_ch);
+                      c_strides);
  return;
 #endif
  TORCH_CHECK_NOT_IMPLEMENTED(
@ -235,8 +224,7 @@ void get_cutlass_moe_mm_data(
    const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
    torch::Tensor& input_permutation, torch::Tensor& output_permutation,
-    const int64_t num_experts, const int64_t n, const int64_t k,
-    const std::optional<torch::Tensor>& blockscale_offsets) {
+    const int64_t num_experts, const int64_t n, const int64_t k) {
  // This function currently gets compiled only if we have a valid cutlass moe
  // mm to run it for.
  int32_t version_num = get_sm_version_num();
@ -244,8 +232,7 @@ void get_cutlass_moe_mm_data(
    (defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM90)
  get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
                                 problem_sizes2, input_permutation,
-                                 output_permutation, num_experts, n, k,
-                                 blockscale_offsets);
+                                 output_permutation, num_experts, n, k);
  return;
 #endif
  TORCH_CHECK_NOT_IMPLEMENTED(
@ -255,29 +242,6 @@ void get_cutlass_moe_mm_data(
      version_num, ". Required capability: 90");
 }

-void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
-                                  torch::Tensor& problem_sizes1,
-                                  torch::Tensor& problem_sizes2,
-                                  const torch::Tensor& expert_num_tokens,
-                                  const int64_t num_local_experts,
-                                  const int64_t padded_m, const int64_t n,
-                                  const int64_t k) {
-  // This function currently gets compiled only if we have a valid cutlass moe
-  // mm to run it for.
-  int32_t version_num = get_sm_version_num();
-#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
-  get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
-                                      problem_sizes2, expert_num_tokens,
-                                      num_local_experts, padded_m, n, k);
-  return;
-#endif
-  TORCH_CHECK_NOT_IMPLEMENTED(
-      false,
-      "No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
-      "for CUDA device capability: ",
-      version_num, ". Required capability: 90");
-}
-
 void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
                           torch::Tensor const& b,
                           torch::Tensor const& a_scales,
--- a/csrc/quantization/fp8/common.cu
+++ b/csrc/quantization/fp8/common.cu
@ -39,8 +39,8 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
  fp8_type* __restrict__ token_output = &out[offset];

  // For vectorization, token_input and token_output pointers need to be
-  // aligned at 32-byte and 16-byte addresses respectively.
-  bool const can_vectorize = hidden_size % 16 == 0;
+  // aligned at 8-byte and 4-byte addresses respectively.
+  bool const can_vectorize = hidden_size % 4 == 0;

  float absmax_val = 0.0f;
  if (can_vectorize) {
@ -48,24 +48,24 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
  } else {
    for (int i = tid; i < hidden_size; i += blockDim.x) {
      float const x = static_cast<float>(token_input[i]);
-      absmax_val = fmaxf(absmax_val, fabsf(x));
+      absmax_val = max(absmax_val, fabs(x));
    }
  }

-  using BlockReduce = cub::BlockReduce<float, 256>;
+  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStorage;
  float const block_absmax_val_maybe =
      BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
  __shared__ float token_scale;
  if (tid == 0) {
    if (scale_ub) {
-      token_scale = fminf(block_absmax_val_maybe, *scale_ub);
+      token_scale = min(block_absmax_val_maybe, *scale_ub);
    } else {
      token_scale = block_absmax_val_maybe;
    }
    // token scale computation
-    token_scale = fmaxf(token_scale / quant_type_max_v<fp8_type>,
-                        min_scaling_factor<fp8_type>::val());
+    token_scale = max(token_scale / quant_type_max_v<fp8_type>,
+                      min_scaling_factor<fp8_type>::val());
    scale[token_idx] = token_scale;
  }
  __syncthreads();
@ -88,11 +88,10 @@ void static_scaled_fp8_quant(torch::Tensor& out,          // [..., d]
                             torch::Tensor const& input,  // [..., d]
                             torch::Tensor const& scale)  // [1]
 {
-  int const block_size = 256;
-  int const num_tokens = input.numel() / input.size(-1);
-  int const num_elems = input.numel();
-  dim3 const grid(num_tokens);
-  dim3 const block(block_size);
+  int64_t num_tokens = input.numel() / input.size(-1);
+  int64_t num_elems = input.numel();
+  dim3 grid(num_tokens);
+  dim3 block(1024);
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
@ -111,11 +110,10 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out,          // [..., d]
                              torch::Tensor const& input,  // [..., d]
                              torch::Tensor& scale)        // [1]
 {
-  int const block_size = 256;
-  int const num_tokens = input.numel() / input.size(-1);
-  int const num_elems = input.numel();
-  dim3 const grid(num_tokens);
-  dim3 const block(block_size);
+  int64_t num_tokens = input.numel() / input.size(-1);
+  int64_t num_elems = input.numel();
+  dim3 grid(num_tokens);
+  dim3 block(1024);
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
@ -143,9 +141,8 @@ void dynamic_per_token_scaled_fp8_quant(

  int const hidden_size = input.size(-1);
  int const num_tokens = input.numel() / hidden_size;
-  int const block_size = 256;
  dim3 const grid(num_tokens);
-  dim3 const block(std::min(hidden_size, block_size));
+  dim3 const block(std::min(hidden_size, 1024));

  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
--- a/csrc/quantization/fp8/common.cuh
+++ b/csrc/quantization/fp8/common.cuh
@ -46,7 +46,7 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val,
  }

  float r =
-      fmaxf(-quant_type_max_v<fp8_type>, fminf(x, quant_type_max_v<fp8_type>));
+      fmax(-quant_type_max_v<fp8_type>, fmin(x, quant_type_max_v<fp8_type>));
 #ifndef USE_ROCM
  return static_cast<fp8_type>(r);
 #else
@ -65,7 +65,7 @@ template <typename scalar_t, typename fp8_type>
 __global__ void segmented_max_reduction(float* __restrict__ scale,
                                        const scalar_t* __restrict__ input,
                                        int64_t num_elems) {
-  __shared__ float cache[256];
+  __shared__ float cache[1024];
  int64_t i = blockDim.x * blockIdx.x + threadIdx.x;

  // First store maximum for all values processes by
@ -73,7 +73,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale,
  scalar_t tmp = 0.0;
  while (i < num_elems) {
    float x = static_cast<float>(input[i]);
-    tmp = fmaxf(tmp, fabsf(x));
+    tmp = max(tmp, fabs(x));
    i += blockDim.x * gridDim.x;
  }
  cache[threadIdx.x] = tmp;
@ -100,27 +100,25 @@ template <typename scalar_t>
 __device__ float thread_max_vec(scalar_t const* __restrict__ input,
                                int64_t const num_elems, int const tid,
                                int const step) {
-  constexpr size_t VEC_SIZE = 16;
-  using scalarxN_t = vec_n_t<scalar_t, VEC_SIZE>;
  // Vectorized input/output to better utilize memory bandwidth.
-  auto const* vectorized_in = reinterpret_cast<scalarxN_t const*>(input);
+  vec4_t<scalar_t> const* vectorized_in =
+      reinterpret_cast<vec4_t<scalar_t> const*>(input);

-  // num_elems / VEC_SIZE (which is 16)
-  int64_t const num_vec_elems = num_elems >> 4;
+  int64_t const num_vec_elems = num_elems >> 2;
  float absmax_val = 0.0f;

-#pragma unroll
+#pragma unroll 4
  for (int64_t i = tid; i < num_vec_elems; i += step) {
-    scalarxN_t in_vec = vectorized_in[i];
-#pragma unroll
-    for (int j = 0; j < VEC_SIZE; ++j) {
-      absmax_val = fmaxf(absmax_val, fabsf(in_vec.val[j]));
-    }
+    vec4_t<scalar_t> in_vec = vectorized_in[i];
+    absmax_val = max(absmax_val, fabs(in_vec.x));
+    absmax_val = max(absmax_val, fabs(in_vec.y));
+    absmax_val = max(absmax_val, fabs(in_vec.z));
+    absmax_val = max(absmax_val, fabs(in_vec.w));
  }

-  // Handle the remaining elements if num_elems is not divisible by VEC_SIZE
-  for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) {
-    absmax_val = fmaxf(absmax_val, fabsf(input[i]));
+  // Handle the remaining elements if num_elems is not divisible by 4
+  for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) {
+    absmax_val = max(absmax_val, fabs(input[i]));
  }

  return absmax_val;
@ -132,31 +130,31 @@ __device__ void scaled_fp8_conversion_vec(fp8_type* __restrict__ out,
                                          float const scale,
                                          int64_t const num_elems,
                                          int const tid, int const step) {
-  constexpr size_t VEC_SIZE = 16;
-  using scalarxN_t = vec_n_t<scalar_t, VEC_SIZE>;
-  using float8xN_t = q8_n_t<fp8_type, VEC_SIZE>;
+  using float8x4_t = q8x4_t<fp8_type>;
  // Vectorized input/output to better utilize memory bandwidth.
-  auto const* vectorized_in = reinterpret_cast<scalarxN_t const*>(input);
-  auto* vectorized_out = reinterpret_cast<float8xN_t*>(out);
+  auto const* vectorized_in = reinterpret_cast<vec4_t<scalar_t> const*>(input);
+  auto* vectorized_out = reinterpret_cast<float8x4_t*>(out);

-  // num_elems / VEC_SIZE (which is 16)
-  int64_t const num_vec_elems = num_elems >> 4;
+  int64_t const num_vec_elems = num_elems >> 2;

-#pragma unroll
+#pragma unroll 4
  for (int64_t i = tid; i < num_vec_elems; i += step) {
-    scalarxN_t in_vec = vectorized_in[i];
-    float8xN_t out_vec;
+    vec4_t<scalar_t> in_vec = vectorized_in[i];
+    float8x4_t out_vec;

-#pragma unroll
-    for (int j = 0; j < VEC_SIZE; ++j) {
-      out_vec.val[j] = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
-          static_cast<float>(in_vec.val[j]), scale);
-    }
+    out_vec.x = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
+        static_cast<float>(in_vec.x), scale);
+    out_vec.y = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
+        static_cast<float>(in_vec.y), scale);
+    out_vec.z = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
+        static_cast<float>(in_vec.z), scale);
+    out_vec.w = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
+        static_cast<float>(in_vec.w), scale);
    vectorized_out[i] = out_vec;
  }

-  // Handle the remaining elements if num_elems is not divisible by VEC_SIZE
-  for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) {
+  // Handle the remaining elements if num_elems is not divisible by 4
+  for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) {
    out[i] = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
        static_cast<float>(input[i]), scale);
  }
--- a/csrc/quantization/fused_kernels/layernorm_utils.cuh
+++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh
@ -140,7 +140,6 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
  // sum of squares
  float ss = 0.0f;

-  const int VEC_SIZE = 4;
  int32_t const num_vec_elems = hidden_size >> 2;

 #pragma unroll 4
@ -148,23 +147,22 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
    vec4_t<scalar_t> in = vec_input[i];

    vec4_t<float> x;
-#pragma unroll
-    for (int j = 0; j < VEC_SIZE; ++j) {
-      x.val[j] = static_cast<float>(in.val[j]);
-    }
-
+    x.x = static_cast<float>(in.x);
+    x.y = static_cast<float>(in.y);
+    x.z = static_cast<float>(in.z);
+    x.w = static_cast<float>(in.w);
    if constexpr (has_residual) {
      vec4_t<scalar_t> r = vec_residual[i];
-#pragma unroll
-      for (int j = 0; j < VEC_SIZE; ++j) {
-        x.val[j] += static_cast<float>(r.val[j]);
-      }
+      x.x += static_cast<float>(r.x);
+      x.y += static_cast<float>(r.y);
+      x.z += static_cast<float>(r.z);
+      x.w += static_cast<float>(r.w);
    }

-#pragma unroll
-    for (int j = 0; j < VEC_SIZE; ++j) {
-      ss += x.val[j] * x.val[j];
-    }
+    ss += x.x * x.x;
+    ss += x.y * x.y;
+    ss += x.z * x.z;
+    ss += x.w * x.w;
  }

  using BlockReduce = cub::BlockReduce<float, 1024>;
@ -205,7 +203,6 @@ __device__ void compute_dynamic_per_token_scales(

  constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};

-  const int VEC_SIZE = 4;
  int32_t const num_vec_elems = hidden_size >> 2;
  float block_absmax_val_maybe = 0.0f;

@ -215,25 +212,26 @@ __device__ void compute_dynamic_per_token_scales(
    vec4_t<scalar_t> const w = vec_weight[i];

    vec4_t<float> x;
-#pragma unroll
-    for (int j = 0; j < VEC_SIZE; ++j) {
-      x.val[j] = static_cast<float>(in.val[j]);
-    }
-
+    x.x = static_cast<float>(in.x);
+    x.y = static_cast<float>(in.y);
+    x.z = static_cast<float>(in.z);
+    x.w = static_cast<float>(in.w);
    if constexpr (has_residual) {
      vec4_t<scalar_t> r = vec_residual[i];
-#pragma unroll
-      for (int j = 0; j < VEC_SIZE; ++j) {
-        x.val[j] += static_cast<float>(r.val[j]);
-      }
+      x.x += static_cast<float>(r.x);
+      x.y += static_cast<float>(r.y);
+      x.z += static_cast<float>(r.z);
+      x.w += static_cast<float>(r.w);
    }

-#pragma unroll
-    for (int j = 0; j < VEC_SIZE; ++j) {
-      block_absmax_val_maybe =
-          fmaxf(block_absmax_val_maybe,
-                fabs(static_cast<scalar_t>(x.val[j] * rms) * w.val[j]));
-    }
+    block_absmax_val_maybe = fmaxf(
+        block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.x * rms) * w.x));
+    block_absmax_val_maybe = fmaxf(
+        block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.y * rms) * w.y));
+    block_absmax_val_maybe = fmaxf(
+        block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.z * rms) * w.z));
+    block_absmax_val_maybe = fmaxf(
+        block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.w * rms) * w.w));
  }

  using BlockReduce = cub::BlockReduce<float, 1024>;
@ -284,7 +282,6 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
    vec_residual = reinterpret_cast<vec4_t<scalar_t>*>(&residual[token_offset]);
  }

-  const int VEC_SIZE = 4;
  int32_t const num_vec_elems = hidden_size >> 2;

 // TODO(luka/varun) extract into type-agnostic vectorized quant function to
@ -295,31 +292,33 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
    vec4_t<scalar_t> const w = vec_weight[i];

    vec4_t<float> x;
-#pragma unroll
-    for (int j = 0; j < VEC_SIZE; ++j) {
-      x.val[j] = static_cast<float>(in.val[j]);
-    }
-
+    x.x = static_cast<float>(in.x);
+    x.y = static_cast<float>(in.y);
+    x.z = static_cast<float>(in.z);
+    x.w = static_cast<float>(in.w);
    if constexpr (has_residual) {
      vec4_t<scalar_t> r = vec_residual[i];
-#pragma unroll
-      for (int j = 0; j < VEC_SIZE; ++j) {
-        x.val[j] += static_cast<float>(r.val[j]);
-      }
-// Update residual
-#pragma unroll
-      for (int j = 0; j < VEC_SIZE; ++j) {
-        r.val[j] = static_cast<scalar_t>(x.val[j]);
-      }
+      x.x += static_cast<float>(r.x);
+      x.y += static_cast<float>(r.y);
+      x.z += static_cast<float>(r.z);
+      x.w += static_cast<float>(r.w);
+      // Update residual
+      r.x = static_cast<scalar_t>(x.x);
+      r.y = static_cast<scalar_t>(x.y);
+      r.z = static_cast<scalar_t>(x.z);
+      r.w = static_cast<scalar_t>(x.w);
      vec_residual[i] = r;
    }

    q8x4_t<scalar_out_t> out;
-#pragma unroll
-    for (int j = 0; j < VEC_SIZE; ++j) {
-      out.val[j] = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
-          static_cast<scalar_t>(x.val[j] * rms) * w.val[j], scale);
-    }
+    out.x = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
+        static_cast<scalar_t>(x.x * rms) * w.x, scale);
+    out.y = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
+        static_cast<scalar_t>(x.y * rms) * w.y, scale);
+    out.z = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
+        static_cast<scalar_t>(x.z * rms) * w.z, scale);
+    out.w = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
+        static_cast<scalar_t>(x.w * rms) * w.w, scale);
    vec_output[i] = out;
  }
 }
--- a/csrc/quantization/gptq_marlin/generate_kernels.py
+++ b/csrc/quantization/gptq_marlin/generate_kernels.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import glob
 import itertools
 import os
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import itertools
 import math
--- a/csrc/quantization/vectorization.cuh
+++ b/csrc/quantization/vectorization.cuh
@ -10,22 +10,23 @@
 namespace vllm {

 // Vectorization containers
-template <typename scalar_t, size_t vec_size>
-struct __align__(vec_size * sizeof(scalar_t)) vec_n_t {
-  scalar_t val[vec_size];
+template <typename scalar_t>
+struct __align__(8) vec4_t {
+  scalar_t x;
+  scalar_t y;
+  scalar_t z;
+  scalar_t w;
 };

-template <typename quant_type_t, size_t vec_size>
-struct __align__(vec_size * sizeof(quant_type_t)) q8_n_t {
+template <typename quant_type_t>
+struct __align__(4) q8x4_t {
  static_assert(std::is_same_v<quant_type_t, int8_t> ||
                std::is_same_v<quant_type_t, c10::Float8_e4m3fn> ||
                std::is_same_v<quant_type_t, c10::Float8_e4m3fnuz>);
-  quant_type_t val[vec_size];
+  quant_type_t x;
+  quant_type_t y;
+  quant_type_t z;
+  quant_type_t w;
 };

-template <typename scalar_t>
-using vec4_t = vec_n_t<scalar_t, 4>;
-template <typename quant_type_t>
-using q8x4_t = q8_n_t<quant_type_t, 4>;
-
 }  // namespace vllm
--- a/csrc/sampler.cu
+++ b/csrc/sampler.cu
@ -1,86 +0,0 @@
-#include "dispatch_utils.h"
-
-#include <torch/cuda.h>
-#include <c10/cuda/CUDAGuard.h>
-
-#ifndef USE_ROCM
-  #include <cub/cub.cuh>
-#else
-  #include <hipcub/hipcub.hpp>
-#endif
-
-namespace vllm {
-
-template <typename scalar_t>
-__global__ void apply_repetition_penalties_kernel(
-    scalar_t* __restrict__ logits,         // [num_seqs, vocab_size]
-    const bool* __restrict__ prompt_mask,  // [num_seqs, vocab_size]
-    const bool* __restrict__ output_mask,  // [num_seqs, vocab_size]
-    const scalar_t* __restrict__ repetition_penalties,  // [num_seqs]
-    const int num_seqs, const int vocab_size, const int tile_size) {
-  // Each block handles one sequence and a tile of vocab
-  const int seq_idx = blockIdx.x;
-  if (seq_idx >= num_seqs) return;
-
-  const int tile_start = blockIdx.y * tile_size;
-  const int tile_end = min(tile_start + tile_size, vocab_size);
-
-  // Load repetition penalty for this sequence
-  const scalar_t penalty = repetition_penalties[seq_idx];
-
-  // Each thread processes multiple vocab items within the tile
-  for (int vocab_idx = tile_start + threadIdx.x; vocab_idx < tile_end;
-       vocab_idx += blockDim.x) {
-    const int64_t idx = static_cast<int64_t>(seq_idx) * vocab_size + vocab_idx;
-    const bool is_repeated = prompt_mask[idx] || output_mask[idx];
-    if (is_repeated) {
-      scalar_t logit = logits[idx];
-      if (logit > 0) {
-        logits[idx] = logit / penalty;
-      } else {
-        logits[idx] = logit * penalty;
-      }
-    }
-  }
-}
-
-}  // namespace vllm
-
-void apply_repetition_penalties_(
-    torch::Tensor& logits,             // [num_seqs, vocab_size], in-place
-    const torch::Tensor& prompt_mask,  // [num_seqs, vocab_size]
-    const torch::Tensor& output_mask,  // [num_seqs, vocab_size]
-    const torch::Tensor& repetition_penalties) {  // [num_seqs]
-  TORCH_CHECK(logits.is_contiguous());
-  TORCH_CHECK(prompt_mask.is_contiguous());
-  TORCH_CHECK(output_mask.is_contiguous());
-  TORCH_CHECK(repetition_penalties.is_contiguous());
-
-  int vocab_size = logits.size(-1);
-  int num_seqs = logits.size(0);
-
-  // Get number of SMs on the current device
-  int sms = 0;
-  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount,
-                         logits.get_device());
-
-  // Compute tile_num and tile_size
-  int tile_num =
-      std::min(vocab_size, std::max(1, (sms + num_seqs - 1) / num_seqs));
-  int tile_size = (vocab_size + tile_num - 1) / tile_num;
-
-  // Each block handles one sequence and a tile of vocab
-  dim3 grid(num_seqs, tile_num);
-  dim3 block(std::min(tile_size, 1024));
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(logits));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_TYPES(
-      logits.scalar_type(), "apply_repetition_penalties_kernel", [&] {
-        vllm::apply_repetition_penalties_kernel<scalar_t>
-            <<<grid, block, 0, stream>>>(
-                logits.data_ptr<scalar_t>(), prompt_mask.data_ptr<bool>(),
-                output_mask.data_ptr<bool>(),
-                repetition_penalties.data_ptr<scalar_t>(), num_seqs, vocab_size,
-                tile_size);
-      });
-}
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -170,13 +170,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "float epsilon) -> ()");
  ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm);

-  // Apply repetition penalties to logits in-place
-  ops.def(
-      "apply_repetition_penalties_(Tensor! logits, Tensor prompt_mask, "
-      "Tensor output_mask, Tensor repetition_penalties) -> ()");
-  ops.impl("apply_repetition_penalties_", torch::kCUDA,
-           &apply_repetition_penalties_);
-
  // Layernorm-quant
  // Apply Root Mean Square (RMS) Normalization to the input tensor.
  ops.def(
@ -435,8 +428,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "cutlass_moe_mm(Tensor! out_tensors, Tensor a_tensors, Tensor b_tensors, "
      "               Tensor a_scales, Tensor b_scales, Tensor expert_offsets, "
      "               Tensor problem_sizes, Tensor a_strides, "
-      "               Tensor b_strides, Tensor c_strides, bool per_act_token, "
-      "               bool per_out_ch) -> ()",
+      "               Tensor b_strides, Tensor c_strides) -> ()",
      {stride_tag});
  ops.impl("cutlass_moe_mm", torch::kCUDA, &cutlass_moe_mm);

@ -451,26 +443,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "                        Tensor! problem_sizes1, Tensor! problem_sizes2, "
      "                        Tensor! input_permutation, "
      "                        Tensor! output_permutation, int num_experts, "
-      "                        int n, int k, Tensor? blockscale_offsets) -> ()",
+      "                        int n, int k) -> ()",
      {stride_tag});
  ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data);

-  // A function that computes data required to run fused MoE with w8a8 grouped
-  // GEMM and PPLX. It takes expert_num_tokens and non_zero_expert_idxs
-  // as an input, and computes expert_offsets (token start indices of each
-  // expert). In addition to this, it computes problem sizes for each expert's
-  // multiplication used by the two mms called from fused MoE operation.
-  ops.def(
-      "get_cutlass_pplx_moe_mm_data(Tensor! expert_offsets, "
-      "                             Tensor! problem_sizes1, "
-      "                             Tensor! problem_sizes2, "
-      "                             Tensor expert_num_tokens, "
-      "                             int num_local_experts, int padded_m, "
-      "                             int n, int k) -> ()",
-      {stride_tag});
-  ops.impl("get_cutlass_pplx_moe_mm_data", torch::kCUDA,
-           &get_cutlass_pplx_moe_mm_data);
-
  // Check if cutlass scaled_mm supports block quantization (used by DeepSeekV3)
  ops.def(
      "cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> "
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@ -75,7 +75,6 @@ RUN --mount=type=bind,source=.git,target=.git \

 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
    --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel 

@ -86,7 +85,7 @@ WORKDIR /workspace/vllm

 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
-    apt-get install -y --no-install-recommends vim numactl xz-utils
+    apt-get install -y --no-install-recommends vim numactl

 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
@ -109,11 +108,8 @@ FROM base AS vllm-test
 WORKDIR /workspace/

 RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
-    cp requirements/test.in requirements/test-cpu.in && \
-    sed -i '/mamba_ssm/d' requirements/test-cpu.in && \
-    uv pip compile requirements/test-cpu.in -o requirements/cpu-test.txt && \
-    uv pip install -r requirements/cpu-test.txt
+    --mount=type=bind,src=requirements/test.txt,target=requirements/test.txt \
+    uv pip install -r requirements/test.txt

 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
--- a/docker/Dockerfile.neuron
+++ b/docker/Dockerfile.neuron
@ -34,7 +34,7 @@ RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

 RUN python3 -m pip install -U \
-        'cmake>=3.26.1' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
+        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
        -r requirements/neuron.txt

 ENV VLLM_TARGET_DEVICE neuron
--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@ -312,7 +312,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Logging to confirm the torch versions
 RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'

-# Logging to confirm all the packages are installed
-RUN pip freeze
-
 #################### UNITTEST IMAGE #############################
--- a/docker/Dockerfile.ppc64le
+++ b/docker/Dockerfile.ppc64le
@ -1,41 +1,10 @@
 ARG BASE_UBI_IMAGE_TAG=9.5-1741850109

-###############################################################
-# Stage to build openblas
-###############################################################
-
-FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS openblas-builder
-
-ARG MAX_JOBS
-ARG OPENBLAS_VERSION=0.3.29
-RUN microdnf install -y dnf && dnf install -y gcc-toolset-13 make wget unzip \
-    && source /opt/rh/gcc-toolset-13/enable \
-    && wget https://github.com/OpenMathLib/OpenBLAS/releases/download/v$OPENBLAS_VERSION/OpenBLAS-$OPENBLAS_VERSION.zip \
-    && unzip OpenBLAS-$OPENBLAS_VERSION.zip \
-    && cd OpenBLAS-$OPENBLAS_VERSION \
-    &&  make -j${MAX_JOBS} TARGET=POWER9 BINARY=64 USE_OPENMP=1 USE_THREAD=1 NUM_THREADS=120 DYNAMIC_ARCH=1 INTERFACE64=0 \
-    && cd /tmp && touch control
-
-
-###############################################################
-# base stage with dependencies coming from centos mirrors
-###############################################################
-FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS centos-deps-builder
-RUN  microdnf install -y dnf && \ 
-     dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-24.el9.noarch.rpm \
-        https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-24.el9.noarch.rpm \
-        https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
-        dnf config-manager --set-enabled crb
-
-RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel && \
-    dnf remove -y centos-gpg-keys-9.0-24.el9.noarch centos-stream-repos-9.0-24.el9.noarch 
-
-
 ###############################################################
 # base stage with basic dependencies
 ###############################################################

-FROM centos-deps-builder AS base-builder
+FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base-builder

 ARG PYTHON_VERSION=3.12
 ARG OPENBLAS_VERSION=0.3.29
@ -51,27 +20,25 @@ ENV UV_LINK_MODE=copy
 # Note: A symlink for libatomic.so is created for gcc-13 (linker fails to find libatomic otherwise - reqd. for sentencepiece)
 # Note: A dummy file 'control' is created in /tmp/ to artificially create dependencies between stages when building stages in parallel
 #       when `--jobs=<N>` is passed with podman build command
-
-COPY --from=openblas-builder /tmp/control /dev/null
-
-RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
-    dnf install -y openssl-devel \
+RUN microdnf install -y openssl-devel dnf \
+    && dnf install -y  https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
+    && dnf config-manager --set-enabled codeready-builder-for-rhel-9-ppc64le-rpms \
    && dnf install -y \
-       git tar gcc-toolset-13 automake libtool \
+       git tar gcc-toolset-13 automake libtool numactl-devel lapack-devel \
       pkgconfig xsimd zeromq-devel kmod findutils protobuf* \
-       libtiff-devel libjpeg-devel zlib-devel freetype-devel libwebp-devel \
-       harfbuzz-devel libraqm-devel libimagequant-devel libxcb-devel \
+       libtiff-devel libjpeg-devel openjpeg2-devel zlib-devel \
+       freetype-devel lcms2-devel libwebp-devel tcl-devel tk-devel \
+       harfbuzz-devel fribidi-devel libraqm-devel libimagequant-devel libxcb-devel \
       python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
    && dnf clean all \
-    && PREFIX=/usr/local make -C /openblas install \
    && ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
    && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
    && python -m pip install -U pip uv \
    && uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python 'cmake<4' ninja cython scikit_build_core scikit_build \
+    && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
    && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
    && cd /tmp && touch control

-
 ###############################################################
 # Stage to build torch family
 ###############################################################
@ -81,8 +48,6 @@ FROM base-builder AS torch-builder
 ARG MAX_JOBS
 ARG TORCH_VERSION=2.6.0
 ARG _GLIBCXX_USE_CXX11_ABI=1
-ARG OPENBLAS_VERSION=0.3.29
-
 RUN --mount=type=cache,target=/root/.cache/uv \
    source /opt/rh/gcc-toolset-13/enable &&  \
    git clone --recursive https://github.com/pytorch/pytorch.git -b v${TORCH_VERSION} && \
@ -144,8 +109,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
        .. && \
    make install -j ${MAX_JOBS:-$(nproc)} && \
    cd ../../python/ && \
-    uv pip install -v -r requirements-build.txt && uv pip install numpy==2.1.3 && \
-    pip show numpy && ls -lrt /opt/vllm/lib/python3.12/site-packages/numpy && \
+    uv pip install -v -r requirements-wheel-build.txt && \
    PYARROW_PARALLEL=${PYARROW_PARALLEL:-$(nproc)} \
    python setup.py build_ext \
    --build-type=release --bundle-arrow-cpp \
@ -168,25 +132,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    cd opencv-python && \
    sed -i -E -e 's/"setuptools.+",/"setuptools",/g' pyproject.toml && \
    cd opencv && git cherry-pick --no-commit $OPENCV_PATCH && cd .. && \
-    uv pip install scikit-build && \    
    python -m build --wheel --installer=uv --outdir /opencvwheels/

-###############################################################
-# Stage to build numactl
-###############################################################
-
-FROM base-builder AS numa-builder
-
-# Note: Building numactl with gcc-11. Compiling with gcc-13 in this builder stage will
-# trigger recompilation with gcc-11 (and require libtool) in the final stage where we do not have gcc-13
-ARG MAX_JOBS
-ARG NUMACTL_VERSION=2.0.19
-RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_VERSION} \
-    && cd numactl \
-    && autoreconf -i && ./configure \
-    && make -j ${MAX_JOBS:-$(nproc)}
-
-
 ###############################################################
 # Stage to build vllm - this stage builds and installs
 # vllm, tensorizer and vllm-tgis-adapter and builds uv cache
@ -198,7 +145,6 @@ FROM base-builder AS vllmcache-builder
 COPY --from=torch-builder /tmp/control /dev/null
 COPY --from=arrow-builder /tmp/control /dev/null
 COPY --from=cv-builder /tmp/control /dev/null
-COPY --from=numa-builder /tmp/control /dev/null

 ARG VLLM_TARGET_DEVICE=cpu
 ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
@ -214,13 +160,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
    --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
    --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
-    --mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
    --mount=type=bind,src=.,dst=/src/,rw \
    source /opt/rh/gcc-toolset-13/enable && \
    uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \
    sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
    uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \
-    make -C /numactl install && \
    # sentencepiece.pc is in some pkgconfig inside uv cache
    export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
    uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
@ -229,6 +173,21 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install /vllmwheel/*.whl


+###############################################################
+# Stage to build numactl
+###############################################################
+
+FROM base-builder AS numa-builder
+
+# Note: Building numactl with gcc-11. Compiling with gcc-13 in this builder stage will
+# trigger recompilation with gcc-11 (and require libtool) in the final stage where we do not have gcc-13
+ARG MAX_JOBS
+ARG NUMACTL_VERSION=2.0.19
+RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_VERSION} \
+    && cd numactl \
+    && autoreconf -i && ./configure \
+    && make -j ${MAX_JOBS:-$(nproc)}
+
 ###############################################################
 # Stage to build lapack
 ###############################################################
@ -258,7 +217,6 @@ ENV PATH=${VIRTUAL_ENV}/bin:$PATH
 ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig/
 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64:/usr/local/lib:/usr/lib64:/usr/lib
 ENV UV_LINK_MODE=copy
-ENV OMP_NUM_THREADS=16

 # create artificial dependencies between stages for independent stages to build in parallel
 COPY --from=torch-builder /tmp/control /dev/null
@ -267,13 +225,11 @@ COPY --from=cv-builder /tmp/control /dev/null
 COPY --from=vllmcache-builder /tmp/control /dev/null
 COPY --from=numa-builder /tmp/control /dev/null
 COPY --from=lapack-builder /tmp/control /dev/null
-COPY --from=openblas-builder /tmp/control /dev/null

 # install gcc-11, python, openblas, numactl, lapack
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
    --mount=type=bind,from=lapack-builder,source=/lapack/,target=/lapack/,rw \
-    --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
    rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
    microdnf install --nodocs -y \
    tar findutils openssl \
@ -285,8 +241,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    && microdnf clean all \
    && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
    && python -m pip install -U pip uv --no-cache \
+    && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
    && make -C /numactl install \
-    && PREFIX=/usr/local make -C /openblas install \
    && uv pip install 'cmake<4' \
    && cmake --install /lapack/build \
    && uv pip uninstall cmake
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Tyler Michael Smith	1236aebf0e	Merge remote-tracking branch 'origin/main' into fp8_ep_dp	2025-06-02 14:53:27 -04:00
Bill Nell	95c40f9b09	hacks Signed-off-by: Bill Nell <bnell@redhat.com>	2025-05-30 02:33:58 +00:00
Bill Nell	a0efd3106c	hack fix MoEConfig.quant_dtype Signed-off-by: Bill Nell <bnell@redhat.com>	2025-05-30 02:08:21 +00:00
Bill Nell	e69879996f	re-enable cudagraph+torch.compile Signed-off-by: Bill Nell <bnell@redhat.com>	2025-05-30 00:12:54 +00:00
Bill Nell	922165cba3	fp8 + pplx tests + fixes Signed-off-by: Bill Nell <bnell@redhat.com>	2025-05-29 21:25:33 +00:00
Bill Nell	12ea698498	pplx + fp8 test Signed-off-by: Bill Nell <bnell@redhat.com>	2025-05-29 18:50:37 +00:00
Bill Nell	caca0b718a	fixes Signed-off-by: Bill Nell <bnell@redhat.com>	2025-05-29 02:08:22 +00:00
Bill Nell	d86e3f0172	lint Signed-off-by: Bill Nell <bnell@redhat.com>	2025-05-28 23:40:56 +00:00
Bill Nell	3ca8322b74	lint Signed-off-by: Bill Nell <bnell@redhat.com>	2025-05-28 23:40:56 +00:00
Bill Nell	03b41b6cad	fix merge Signed-off-by: Bill Nell <bnell@redhat.com>	2025-05-28 23:40:56 +00:00
Bill Nell	cad6447664	fix Signed-off-by: Bill Nell <bnell@redhat.com>	2025-05-28 23:40:56 +00:00
Bill Nell	c169b05541	merge Signed-off-by: Bill Nell <bnell@redhat.com>	2025-05-28 23:40:56 +00:00
Bill Nell	468d16654a	cleanup quantization Signed-off-by: Bill Nell <bnell@redhat.com>	2025-05-28 23:40:53 +00:00
Bill Nell	909f234faa	stuff Signed-off-by: Bill Nell <bnell@redhat.com>	2025-05-28 23:40:27 +00:00
Bill Nell	f8510587c2	tests + fix Signed-off-by: Bill Nell <bnell@redhat.com>	2025-05-28 23:40:27 +00:00
Bill Nell	9cfebf51ba	basic working test Signed-off-by: Bill Nell <bnell@redhat.com>	2025-05-28 23:40:27 +00:00
Bill Nell	77f95b99a6	test Signed-off-by: Bill Nell <bnell@redhat.com>	2025-05-28 23:40:27 +00:00
Bill Nell	bbe888d033	wip Signed-off-by: Bill Nell <bnell@redhat.com>	2025-05-28 23:40:27 +00:00
Bill Nell	25ed6738d4	wip Signed-off-by: Bill Nell <bnell@redhat.com>	2025-05-28 23:40:27 +00:00
Bill Nell	e568e401da	fp8 support Signed-off-by: Bill Nell <bnell@redhat.com>	2025-05-28 23:40:27 +00:00