Compare commits
4 Commits
codex/remo
...
split_kv_c
| Author | SHA1 | Date |
|---|---|---|
| | 6e1e31a66a | |
| | 50e80db4ef | |
| | d3d6afb355 | |
| | 808fa43d76 | |
@@ -368,7 +368,7 @@ if __name__ == "__main__":
     # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
     # we want to turn it into "8xGPUTYPE"
     df["GPU"] = df["GPU"].apply(
-        lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
+        lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
     )

     # get markdown tables
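A quick aside, not part of the diff: a newline-separated GPU column of the kind the comment above describes is what nvidia-smi prints when queried per device (whether this script's data actually comes from nvidia-smi is an assumption):

    # Hedged illustration only: one GPU name per line, e.g. eight identical lines
    # on an 8-GPU node, which the apply() above collapses into "8xGPUTYPE".
    nvidia-smi --query-gpu=name --format=csv,noheader

Note that the two sides of the hunk are not strictly equivalent: on a value with a trailing newline, split('\n') yields one extra empty element compared with splitlines(), so the reported GPU count can differ by one.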
@@ -181,14 +181,18 @@ launch_vllm_server() {
   if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
     echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
     model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-    server_command="vllm serve $model \
+    server_command="python3 \
+      -m vllm.entrypoints.openai.api_server \
       -tp $tp \
+      --model $model \
       --port $port \
       $server_args"
   else
     echo "Key 'fp8' does not exist in common params."
-    server_command="vllm serve $model \
+    server_command="python3 \
+      -m vllm.entrypoints.openai.api_server \
       -tp $tp \
+      --model $model \
       --port $port \
       $server_args"
   fi
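For reference (an aside, not part of the changes): jq -e sets its exit status from the last value it outputs, which is what lets has("fp8") act as the if-condition above. A minimal sketch with a hypothetical common_params payload:

    common_params='{"fp8": true, "tp": 4}'   # hypothetical example payload
    if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
        echo "fp8 model will be used"        # exit status 0: key present
    else
        echo "plain model will be used"      # non-zero exit status: key absent
    fi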
@@ -365,7 +365,8 @@ run_serving_tests() {
       continue
     fi

-    server_command="$server_envs vllm serve \
+    server_command="$server_envs python3 \
+      -m vllm.entrypoints.openai.api_server \
       $server_args"

     # run the server
.buildkite/pyproject.toml (new file, 46 lines)
@@ -0,0 +1,46 @@
+# This local pyproject file is part of the migration from yapf to ruff format.
+# It uses the same core rules as the main pyproject.toml file, but with the
+# following differences:
+# - ruff line length is overridden to 88
+# - deprecated typing ignores (UP006, UP035) have been removed
+
+[tool.ruff]
+line-length = 88
+
+[tool.ruff.lint.per-file-ignores]
+"vllm/third_party/**" = ["ALL"]
+"vllm/version.py" = ["F401"]
+"vllm/_version.py" = ["ALL"]
+
+[tool.ruff.lint]
+select = [
+    # pycodestyle
+    "E",
+    # Pyflakes
+    "F",
+    # pyupgrade
+    "UP",
+    # flake8-bugbear
+    "B",
+    # flake8-simplify
+    "SIM",
+    # isort
+    "I",
+    # flake8-logging-format
+    "G",
+]
+ignore = [
+    # star imports
+    "F405", "F403",
+    # lambda expression assignment
+    "E731",
+    # Loop control variable not used within loop body
+    "B007",
+    # f-string format
+    "UP032",
+    # Can remove once 3.10+ is the minimum Python version
+    "UP007",
+]
+
+[tool.ruff.format]
+docstring-code-format = true
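How this local config is wired into CI is not shown here; ruff normally picks up the nearest pyproject.toml on its own, and it can also be pointed at one explicitly. A minimal sketch, assuming the .buildkite tree is checked against this file:

    # Lint and format only the .buildkite tree using the local 88-column configuration.
    ruff check --config .buildkite/pyproject.toml .buildkite/
    ruff format --config .buildkite/pyproject.toml .buildkite/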
@@ -76,7 +76,7 @@ steps:
      queue: arm64_cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-     - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+     - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"

    # Add job to create multi-arch manifest
@@ -150,16 +150,11 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-     - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
-     - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
-     - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
-     - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
-     - "docker push vllm/vllm-openai:nightly-x86_64"
-     - "docker push vllm/vllm-openai:nightly-aarch64"
-     - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
-     - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
-     - "docker manifest push vllm/vllm-openai:nightly"
-     - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+     - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+     - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
+     - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+     - "docker push vllm/vllm-openai:nightly"
+     - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
      # Clean up old nightly builds (keep only last 14)
      - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
    plugins:
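A short aside, not part of the diff: the multi-arch variant removed above pushes one image per architecture and then stitches them into a single tag with docker manifest. One way such a tag can be sanity-checked, assuming the images exist:

    # List the architectures behind the combined nightly tag; both amd64 and arm64
    # should appear if the manifest was assembled from the -x86_64 and -aarch64 images.
    docker manifest inspect vllm/vllm-openai:nightly | jq -r '.manifests[].platform.architecture'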
@@ -168,4 +163,3 @@ steps:
          password-env: DOCKERHUB_TOKEN
  env:
    DOCKER_BUILDKIT: "1"
-   DOCKERHUB_USERNAME: "vllmbot"
@@ -8,41 +8,20 @@ set -ex
 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"

-# Get DockerHub credentials from environment
+# Get DockerHub token from environment
 if [ -z "$DOCKERHUB_TOKEN" ]; then
     echo "Error: DOCKERHUB_TOKEN environment variable is not set"
     exit 1
 fi

-if [ -z "$DOCKERHUB_USERNAME" ]; then
-    echo "Error: DOCKERHUB_USERNAME environment variable is not set"
-    exit 1
-fi
-
-# Get DockerHub bearer token
-echo "Getting DockerHub bearer token..."
-set +x
-BEARER_TOKEN=$(curl -s -X POST \
-    -H "Content-Type: application/json" \
-    -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
-    "https://hub.docker.com/v2/users/login" | jq -r '.token')
-set -x
-
-if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
-    echo "Error: Failed to get DockerHub bearer token"
-    exit 1
-fi
-
 # Function to get all tags from DockerHub
 get_all_tags() {
     local page=1
     local all_tags=""

     while true; do
-        set +x
-        local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
+        local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
            "$REPO_API_URL?page=$page&page_size=100")
-        set -x

        # Get both last_updated timestamp and tag name, separated by |
        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
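The continuation and exit conditions of this loop fall outside the hunk; a minimal sketch of one way such pagination could terminate (an illustration under assumptions, not the script's actual code):

    # Hypothetical continuation: stop once a page returns no matching tags,
    # otherwise accumulate them and request the next page.
    if [ -z "$tags" ]; then
        break
    fi
    all_tags="$all_tags"$'\n'"$tags"
    page=$((page + 1))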
@@ -64,9 +43,7 @@ delete_tag() {
     echo "Deleting tag: $tag_name"

     local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
-    set +x
-    local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
-    set -x
+    local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")

     if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
         echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
@@ -86,6 +86,10 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
   commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi

+if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
+  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
+fi
+
 if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
   commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
@@ -163,6 +167,12 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
   --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi

+#Obsolete currently
+##ignore certain Entrypoints/llm tests
+#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
+#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
+#fi
+
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
@@ -58,8 +58,11 @@ function cpu_tests() {
   # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
   # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model

-  pytest -x -v -s tests/models/language/generation -m cpu_model
-  VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
+  # Note: disable Bart until supports V1
+  pytest -x -v -s tests/models/language/generation -m cpu_model \
+    --ignore=tests/models/language/generation/test_bart.py
+  VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
+    --ignore=tests/models/language/generation/test_bart.py

   pytest -x -v -s tests/models/language/pooling -m cpu_model
   pytest -x -v -s tests/models/multimodal/generation \
@@ -1,191 +0,0 @@ (deleted file)
-#!/bin/bash
-
-# This script build the Ascend NPU docker image and run the offline inference inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -ex
-
-# Base ubuntu image with basic ascend development libraries and python installed
-VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
-CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
-TEST_RUN_CONFIG_FILE="vllm_test.cfg"
-VLLM_ASCEND_TMP_DIR=
-# Get the test run configuration file from the vllm-ascend repository
-fetch_vllm_test_cfg() {
-    VLLM_ASCEND_TMP_DIR=$(mktemp -d)
-    # Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
-    cleanup() {
-        rm -rf "${VLLM_ASCEND_TMP_DIR}"
-    }
-    trap cleanup EXIT
-
-    GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
-    if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
-        echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
-        exit 1
-    fi
-
-    # If the file already exists locally, just overwrite it
-    cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
-    echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
-
-    # Since the trap will be overwritten later, and when it is executed here, the task of cleaning up resources
-    # when the trap is abnormal has been completed, so the temporary resources are manually deleted here.
-    rm -rf "${VLLM_ASCEND_TMP_DIR}"
-    trap - EXIT
-}
-
-# Downloads test run configuration file from a remote URL.
-# Loads the configuration into the current script environment.
-get_config() {
-    if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
-        echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
-        exit 1
-    fi
-    source "${TEST_RUN_CONFIG_FILE}"
-    echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
-    return 0
-}
-
-# get test running configuration.
-fetch_vllm_test_cfg
-get_config
-# Check if the function call was successful. If not, exit the script.
-if [ $? -ne 0 ]; then
-    exit 1
-fi
-
-image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
-container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-
-# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
-agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
-echo "agent_idx: ${agent_idx}"
-builder_name="cachebuilder${agent_idx}"
-builder_cache_dir="/mnt/docker-cache${agent_idx}"
-mkdir -p ${builder_cache_dir}
-
-# Try building the docker image
-cat <<EOF | DOCKER_BUILDKIT=1 docker build \
-    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
-    --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
-    --cache-to type=local,dest=${builder_cache_dir},mode=max \
-    --progress=plain --load -t ${image_name} -f - .
-FROM ${BASE_IMAGE_NAME}
-
-# Define environments
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
-    pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
-    apt-get update -y && \
-    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
-    rm -rf /var/cache/apt/* && \
-    rm -rf /var/lib/apt/lists/*
-
-# Install for pytest to make the docker build cache layer always valid
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install pytest>=6.0 modelscope
-
-WORKDIR /workspace/vllm
-
-# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
-COPY requirements/common.txt /workspace/vllm/requirements/common.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements/common.txt
-
-COPY . .
-
-# Install vLLM
-RUN --mount=type=cache,target=/root/.cache/pip \
-    VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
-    python3 -m pip uninstall -y triton
-
-# Install vllm-ascend
-WORKDIR /workspace
-ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
-ARG VLLM_ASCEND_TAG=main
-RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
-    git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
-
-# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r /workspace/vllm-ascend/requirements.txt
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
-    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
-    source /usr/local/Ascend/nnal/atb/set_env.sh && \
-    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
-    python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
-
-ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
-ENV VLLM_USE_MODELSCOPE=True
-
-WORKDIR /workspace/vllm-ascend
-
-CMD ["/bin/bash"]
-
-EOF
-
-# Setup cleanup
-remove_docker_container() {
-    docker rm -f "${container_name}" || true;
-    docker image rm -f "${image_name}" || true;
-    docker system prune -f || true;
-}
-trap remove_docker_container EXIT
-
-# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
-# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
-# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
-# returns --device /dev/davinci0 --device /dev/davinci1
-parse_and_gen_devices() {
-    local input="$1"
-    local index cards_num
-    if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
-        index="${BASH_REMATCH[1]}"
-        cards_num="${BASH_REMATCH[2]}"
-    else
-        echo "parse error" >&2
-        return 1
-    fi
-
-    local devices=""
-    local i=0
-    while (( i < cards_num )); do
-        local dev_idx=$(((index - 1)*cards_num + i ))
-        devices="$devices --device /dev/davinci${dev_idx}"
-        ((i++))
-    done
-
-    # trim leading space
-    devices="${devices#"${devices%%[![:space:]]*}"}"
-    # Output devices: assigned to the caller variable
-    printf '%s' "$devices"
-}
-
-devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
-
-# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
-# This test checks whether the OOT platform interface is functioning properly in conjunction with
-# the hardware plugin vllm-ascend.
-model_cache_dir=/mnt/modelscope${agent_idx}
-mkdir -p ${model_cache_dir}
-docker run \
-    ${devices} \
-    --device /dev/davinci_manager \
-    --device /dev/devmm_svm \
-    --device /dev/hisi_hdc \
-    -v /usr/local/dcmi:/usr/local/dcmi \
-    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-    -v /etc/ascend_install.info:/etc/ascend_install.info \
-    -v ${model_cache_dir}:/root/.cache/modelscope \
-    --entrypoint="" \
-    --name "${container_name}" \
-    "${image_name}" \
-    bash -c '
-    set -e
-    pytest -v -s tests/e2e/vllm_interface/
-'
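For readability, a small usage illustration of the device parsing removed above (the expected output is taken from the script's own comments; the sample agent name is the one given there):

    # Illustration only: agent names follow {hostname}-{agent_idx}-{npu_card_num}cards.
    parse_and_gen_devices "atlas-a2-001-1-2cards"
    # -> --device /dev/davinci0 --device /dev/davinci1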
@@ -62,7 +62,7 @@ echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
     && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
-    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
+    && python3 -m pip install --progress-bar off hf-transfer
 echo "--- Python dependencies installed ---"
 export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
@@ -62,7 +62,7 @@ echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
     && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
-    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
+    && python3 -m pip install --progress-bar off hf-transfer
 echo "--- Python dependencies installed ---"
 export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
@@ -35,15 +35,16 @@ docker run \
  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
- VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+ VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
  cd tests
  pytest -v -s v1/core
  pytest -v -s v1/engine
  pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
  pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
  pytest -v -s v1/structured_output
- pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
+ pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
  pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
- pytest -v -s v1/test_metrics
  pytest -v -s v1/test_serial_utils.py
+ pytest -v -s v1/test_utils.py
+ pytest -v -s v1/test_metrics_reader.py
 '
@@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
 bench_throughput_exit_code=$?

 # run server-based benchmarks and upload the result to buildkite
-vllm serve meta-llama/Llama-2-7b-chat-hf &
+python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
 server_pid=$!
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

@@ -1,59 +0,0 @@ (deleted file)
-#!/bin/bash
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# Setup script for Prime-RL integration tests
-# This script prepares the environment for running Prime-RL tests with nightly vLLM
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
-PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
-PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
-
-echo "Setting up Prime-RL integration test environment..."
-
-# Clean up any existing Prime-RL directory
-if [ -d "${PRIME_RL_DIR}" ]; then
-    echo "Removing existing Prime-RL directory..."
-    rm -rf "${PRIME_RL_DIR}"
-fi
-
-# Install UV if not available
-if ! command -v uv &> /dev/null; then
-    echo "Installing UV package manager..."
-    curl -LsSf https://astral.sh/uv/install.sh | sh
-    source $HOME/.local/bin/env
-fi
-
-# Clone Prime-RL repository at specific branch for reproducible tests
-PRIME_RL_BRANCH="integ-vllm-main"
-echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
-git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
-cd "${PRIME_RL_DIR}"
-
-echo "Setting up UV project environment..."
-export UV_PROJECT_ENVIRONMENT=/usr/local
-ln -s /usr/bin/python3 /usr/local/bin/python
-
-# Remove vllm pin from pyproject.toml
-echo "Removing vllm pin from pyproject.toml..."
-sed -i '/vllm==/d' pyproject.toml
-
-# Sync Prime-RL dependencies
-echo "Installing Prime-RL dependencies..."
-uv sync --inexact && uv sync --inexact --all-extras
-
-# Verify installation
-echo "Verifying installations..."
-uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
-uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
-
-echo "Prime-RL integration test environment setup complete!"
-
-echo "Running Prime-RL integration tests..."
-export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
-uv run pytest -vs tests/integration/test_rl.py -m gpu
-
-echo "Prime-RL integration tests completed!"
@@ -6,28 +6,24 @@
 # to generate the final pipeline yaml file.

 # Documentation
-# label(str): the name of the test. emojis allowed.
-# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
-# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
-# fast_check_only(bool): run this test on the fastcheck pipeline only
-# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
-# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
+# label(str): the name of the test. emoji allowed.
+# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
+# fast_check_only(bool): run this test on fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
 # command(str): the single command to run for tests. incompatible with commands.
-# commands(list): the list of commands to run for the test. incompatible with command.
-# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
-# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
-# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
-# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
-# in this case, commands must be specified. the first command runs on the first host, the second
+# commands(list): the list of commands to run for test. incompatbile with command.
+# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
+# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
+# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
+# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
+# in this case, commands must be specified. the first command runs on first host, the second
 # command runs on the second host.
-# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
-# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
-# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
-# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
-# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
+# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
+# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.

 # When adding a test
-# - If the test belongs to an existing group, add it there
+# - If the test belong to an existing group, add it there
 # - If the test is short, add to any existing step
 # - If the test takes more than 10min, then it is okay to create a new step.
 # Note that all steps execute in parallel.
@@ -50,28 +46,23 @@ steps:
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
-  - tests/multimodal
-  - tests/utils_
-  commands:
-  - pytest -v -s -m 'not cpu_test' multimodal
-  - pytest -v -s utils_
-
-- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
-  timeout_in_minutes: 10
-  source_file_dependencies:
-  - vllm/
+  - tests/async_engine
   - tests/test_inputs.py
   - tests/test_outputs.py
   - tests/multimodal
+  - tests/utils_
+  - tests/worker
   - tests/standalone_tests/lazy_imports.py
   - tests/transformers_utils
-  no_gpu: true
   commands:
   - python3 standalone_tests/lazy_imports.py
+  - pytest -v -s async_engine # AsyncLLMEngine
   - pytest -v -s test_inputs.py
   - pytest -v -s test_outputs.py
-  - pytest -v -s -m 'cpu_test' multimodal
-  - pytest -v -s transformers_utils
+  - pytest -v -s multimodal
+  - pytest -v -s utils_ # Utils
+  - pytest -v -s worker # Worker
+  - pytest -v -s transformers_utils # transformers_utils

 - label: Python-only Installation Test # 10min
   timeout_in_minutes: 20
@@ -91,12 +82,14 @@ steps:
   - vllm/
   - tests/basic_correctness/test_basic_correctness
   - tests/basic_correctness/test_cpu_offload
+  - tests/basic_correctness/test_preemption
   - tests/basic_correctness/test_cumem.py
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s basic_correctness/test_cumem.py
   - pytest -v -s basic_correctness/test_basic_correctness.py
   - pytest -v -s basic_correctness/test_cpu_offload.py
+  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

 - label: Entrypoints Unit Tests # 5min
   timeout_in_minutes: 10
@@ -121,9 +114,10 @@ steps:
   - tests/entrypoints/offline_mode
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

 - label: Entrypoints Integration Test (API Server) # 100min
   timeout_in_minutes: 130
@@ -161,6 +155,7 @@ steps:
   num_gpus: 4
   source_file_dependencies:
   - vllm/distributed/
+  - vllm/core/
   - tests/distributed/test_utils
   - tests/distributed/test_pynccl
   - tests/distributed/test_events
@@ -168,34 +163,28 @@ steps:
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
-  - tests/v1/distributed
+  - tests/v1/test_async_llm_dp.py
+  - tests/v1/test_external_lb_dp.py
+  - tests/v1/test_internal_lb_dp.py
+  - tests/v1/test_hybrid_lb_dp.py
   - tests/v1/engine/test_engine_core_client.py
-  - tests/distributed/test_symm_mem_allreduce.py
   commands:
-  # test with torchrun tp=2 and external_dp=2
+  # test with tp=2 and external_dp=2
+  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
-  # test with torchrun tp=2 and pp=2
+  # test with tp=2 and pp=2
   - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
-  # test with torchrun tp=4 and dp=1
-  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
-  # test with torchrun tp=2, pp=2 and dp=1
-  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
-  # test with torchrun tp=1 and dp=4 with ep
-  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
-  # test with torchrun tp=2 and dp=2 with ep
-  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
   - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s distributed/test_events.py
-  - pytest -v -s distributed/test_symm_mem_allreduce.py
   # TODO: create a dedicated test section for multi-GPU example tests
   # when we have multiple distributed example tests
   - pushd ../examples/offline_inference
@@ -296,34 +285,23 @@ steps:
   - tests/v1
   commands:
   # split the test to avoid interference
+  - pytest -v -s v1/core
   - pytest -v -s v1/executor
-  - pytest -v -s v1/kv_offload
   - pytest -v -s v1/sample
   - pytest -v -s v1/logits_processors
   - pytest -v -s v1/worker
+  - pytest -v -s v1/structured_output
   - pytest -v -s v1/spec_decode
-  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
-  - pytest -v -s -m 'not cpu_test' v1/metrics
+  - pytest -v -s v1/kv_connector/unit
+  - pytest -v -s v1/metrics
+  - pytest -v -s v1/test_serial_utils.py
+  - pytest -v -s v1/test_utils.py
   - pytest -v -s v1/test_oracle.py
-  - pytest -v -s v1/test_request.py
+  - pytest -v -s v1/test_metrics_reader.py
   # Integration test for streaming correctness (requires special branch).
   - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

-- label: V1 Test others (CPU) # 5 mins
-  source_file_dependencies:
-  - vllm/
-  - tests/v1
-  no_gpu: true
-  commands:
-  # split the test to avoid interference
-  - pytest -v -s v1/core
-  - pytest -v -s v1/structured_output
-  - pytest -v -s v1/test_serial_utils.py
-  - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
-  - pytest -v -s -m 'cpu_test' v1/metrics
-
-
 - label: Examples Test # 30min
   timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
@@ -342,13 +320,12 @@ steps:
   - python3 offline_inference/vision_language.py --seed 0
   - python3 offline_inference/vision_language_pooling.py --seed 0
   - python3 offline_inference/vision_language_multi_image.py --seed 0
-  - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+  - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
   - python3 offline_inference/basic/classify.py
   - python3 offline_inference/basic/embed.py
   - python3 offline_inference/basic/score.py
-  - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+  - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

 - label: Platform Tests (CUDA) # 4min
   timeout_in_minutes: 15
@@ -397,7 +374,6 @@ steps:
   - pytest -v -s compile/test_pass_manager.py
   - pytest -v -s compile/test_fusion.py
   - pytest -v -s compile/test_fusion_attn.py
-  - pytest -v -s compile/test_functionalization.py
   - pytest -v -s compile/test_silu_mul_quant_fusion.py
   - pytest -v -s compile/test_sequence_parallelism.py
   - pytest -v -s compile/test_async_tp.py
|
|||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/mamba/
|
- csrc/mamba/
|
||||||
- tests/kernels/mamba
|
- tests/kernels/mamba
|
||||||
- vllm/model_executor/layers/mamba/ops
|
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/mamba
|
- pytest -v -s kernels/mamba
|
||||||
|
|
||||||
- label: Model Executor Test # 23min
|
- label: Tensorizer Test # 14min
|
||||||
timeout_in_minutes: 35
|
timeout_in_minutes: 25
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor
|
- vllm/model_executor/model_loader
|
||||||
- tests/model_executor
|
- tests/tensorizer_loader
|
||||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||||
commands:
|
commands:
|
||||||
- apt-get update && apt-get install -y curl libsodium23
|
- apt-get update && apt-get install -y curl libsodium23
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s model_executor
|
- pytest -v -s tensorizer_loader
|
||||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||||
|
|
||||||
|
- label: Model Executor Test # 7min
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/model_executor
|
||||||
|
- tests/model_executor
|
||||||
|
commands:
|
||||||
|
- apt-get update && apt-get install -y curl libsodium23
|
||||||
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
|
- pytest -v -s model_executor
|
||||||
|
|
||||||
- label: Benchmarks # 11min
|
- label: Benchmarks # 11min
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
@ -527,7 +513,7 @@ steps:
|
|||||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||||
# we can only upgrade after this is resolved
|
# we can only upgrade after this is resolved
|
||||||
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
|
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
|
||||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
|
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||||
|
|
||||||
- label: LM Eval Small Models # 53min
|
- label: LM Eval Small Models # 53min
|
||||||
timeout_in_minutes: 75
|
timeout_in_minutes: 75
|
||||||
@@ -555,17 +541,10 @@ steps:
   source_file_dependencies:
   - vllm/
   - tests/tool_use
+  - tests/mistral_tool_use
   commands:
-  - pytest -v -s -m 'not cpu_test' tool_use
-
-- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
-  timeout_in_minutes: 10
-  source_file_dependencies:
-  - vllm/
-  - tests/tool_use
-  no_gpu: true
-  commands:
-  - pytest -v -s -m 'cpu_test' tool_use
+  - pytest -v -s tool_use
+  - pytest -v -s mistral_tool_use

 ##### models test #####

@@ -605,19 +584,13 @@ steps:
   - vllm/
   - tests/models/test_transformers.py
   - tests/models/test_registry.py
-  commands:
-  - pytest -v -s models/test_transformers.py models/test_registry.py
-
-- label: Basic Models Test (Other CPU) # 5min
-  timeout_in_minutes: 10
-  torch_nightly: true
-  source_file_dependencies:
-  - vllm/
   - tests/models/test_utils.py
   - tests/models/test_vision.py
-  no_gpu: true
   commands:
-  - pytest -v -s models/test_utils.py models/test_vision.py
+  - pytest -v -s models/test_transformers.py \
+      models/test_registry.py \
+      models/test_utils.py \
+      models/test_vision.py

 - label: Language Models Tests (Standard)
   timeout_in_minutes: 25
@@ -787,13 +760,11 @@ steps:
   commands:
   - pip install --upgrade git+https://github.com/huggingface/transformers
   - pytest -v -s tests/models/test_initialization.py
-  - pytest -v -s tests/models/test_transformers.py
   - pytest -v -s tests/models/multimodal/processing/
   - pytest -v -s tests/models/multimodal/test_mapping.py
   - python3 examples/offline_inference/basic/chat.py
+  - python3 examples/offline_inference/audio_language.py --model-type whisper
   - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
-  # Whisper needs spawn method to avoid deadlock
-  - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

 - label: Blackwell Test # 38 min
   timeout_in_minutes: 60
@@ -835,11 +806,11 @@ steps:
   - pytest -v -s tests/kernels/moe/test_flashinfer.py
   - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py

-- label: Blackwell GPT-OSS Eval
+- label: GPT-OSS Eval (Blackwell)
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/"
   gpu: b200
-  optional: true # run on nightlies
+  optional: true # disable while debugging
   source_file_dependencies:
   - tests/evals/gpt_oss
   - vllm/model_executor/models/gpt_oss.py
@@ -847,34 +818,7 @@ steps:
   - vllm/v1/attention/backends/flashinfer.py
   commands:
   - uv pip install --system 'gpt-oss[eval]==0.0.5'
-  - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+  - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'

-- label: Blackwell Quantized MoE Test
-  timeout_in_minutes: 60
-  working_dir: "/vllm-workspace/"
-  gpu: b200
-  source_file_dependencies:
-  - tests/quantization/test_blackwell_moe.py
-  - vllm/model_executor/models/deepseek_v2.py
-  - vllm/model_executor/models/gpt_oss.py
-  - vllm/model_executor/models/llama4.py
-  - vllm/model_executor/layers/fused_moe
-  - vllm/model_executor/layers/quantization/compressed_tensors
-  - vllm/model_executor/layers/quantization/modelopt.py
-  - vllm/model_executor/layers/quantization/mxfp4.py
-  - vllm/v1/attention/backends/flashinfer.py
-  commands:
-  - pytest -s -v tests/quantization/test_blackwell_moe.py
-
-- label: Blackwell LM Eval Small Models
-  timeout_in_minutes: 75
-  gpu: b200
-  optional: true # run on nightlies
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
-
 ##### 1 GPU test #####
 ##### multi gpus test #####
@@ -918,58 +862,47 @@ steps:
     - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
     - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
 
-- label: Distributed Tests (2 GPUs) # 68min
-  timeout_in_minutes: 90
+- label: Distributed Tests (2 GPUs) # 110min
+  timeout_in_minutes: 150
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
-  - vllm/compilation/
   - vllm/distributed/
   - vllm/engine/
   - vllm/executor/
-  - vllm/worker/worker_base.py
+  - vllm/model_executor/models/
-  - vllm/v1/engine/
-  - vllm/v1/worker/
-  - tests/compile/test_basic_correctness.py
-  - tests/compile/test_wrapper.py
   - tests/distributed/
-  - tests/entrypoints/llm/test_collective_rpc.py
+  - vllm/compilation
-  - tests/v1/distributed
+  - vllm/worker/worker_base.py
+  - vllm/worker/worker.py
+  - vllm/worker/model_runner.py
+  - entrypoints/llm/test_collective_rpc.py
+  - tests/v1/test_async_llm_dp.py
+  - tests/v1/test_external_lb_dp.py
   - tests/v1/entrypoints/openai/test_multi_api_servers.py
-  - tests/v1/shutdown
+  - vllm/v1/engine/
-  - tests/v1/worker/test_worker_memory_snapshot.py
   commands:
-    - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-    - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+    - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+    - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
     - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
     - pytest -v -s entrypoints/llm/test_collective_rpc.py
     - pytest -v -s ./compile/test_basic_correctness.py
     - pytest -v -s ./compile/test_wrapper.py
     - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
-    - pytest -v -s distributed/test_sequence_parallel.py
-    - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
-    - pytest -v -s v1/worker/test_worker_memory_snapshot.py
-
-- label: Distributed Model Tests (2 GPUs) # 37min
-  timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-  source_file_dependencies:
-  - vllm/model_executor/model_loader/sharded_state_loader.py
-  - vllm/model_executor/models/
-  - tests/basic_correctness/
-  - tests/model_executor/model_loader/test_sharded_state_loader.py
-  - tests/models/
-  commands:
     - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
-    - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
     # Avoid importing model tests that cause CUDA reinitialization error
     - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
     - pytest models/language -v -s -m 'distributed(num_gpus=2)'
     - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
     - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
+    # test sequence parallel
+    - pytest -v -s distributed/test_sequence_parallel.py
+    # this test fails consistently.
+    # TODO: investigate and fix
+    - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+    - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+    - pytest -v -s models/multimodal/generation/test_maverick.py
 
 - label: Plugin Tests (2 GPUs) # 40min
   timeout_in_minutes: 60
@@ -1103,16 +1036,3 @@ steps:
   num_gpus: 2
   commands:
     - pytest -v -s tests/distributed/test_context_parallel.py
-    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
-
-##### RL Integration Tests #####
-- label: Prime-RL Integration Test # 15min
-  timeout_in_minutes: 30
-  optional: true
-  num_gpus: 2
-  working_dir: "/vllm-workspace"
-  source_file_dependencies:
-  - vllm/
-  - .buildkite/scripts/run-prime-rl-test.sh
-  commands:
-    - bash .buildkite/scripts/run-prime-rl-test.sh
.github/CODEOWNERS (vendored, 36 lines changed)
@@ -4,14 +4,19 @@
 # This lists cover the "core" components of vLLM that require careful review
 /vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/fused_moe @mgoin
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
+/vllm/v1/attention @LucasWilkinson
+/vllm/v1/sample @22quinn @houseroad
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang
@@ -23,22 +28,20 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
 /vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
-/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
 
 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/attention @LucasWilkinson
+/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
+/vllm/v1/spec_decode @benchislett @luccafong
 /vllm/v1/attention/backends/flashinfer.py @mgoin
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
 /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
-/vllm/v1/sample @22quinn @houseroad @njhill
-/vllm/v1/spec_decode @benchislett @luccafong
-/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/kv_cache_interface.py @heheda12345
+/vllm/v1/worker/kv_cache_initializer_mixin.py @heheda12345
 /vllm/v1/offloading @ApostaC
 
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
+/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
@@ -47,6 +50,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
+/tests/prefix_caching @comaniac @KuntaiDu
 /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
@@ -55,35 +59,23 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
 /tests/v1/kv_connector/nixl_integration @NickLucche
 /tests/v1/kv_connector @ApostaC
 /tests/v1/offloading @ApostaC
-
-# Transformers backend
-/vllm/model_executor/models/transformers.py @hmellor
-/tests/models/test_transformers.py @hmellor
 
 # Docs
-/docs/mkdocs @hmellor
+/docs @hmellor
-/docs/**/*.yml @hmellor
-/requirements/docs.txt @hmellor
-.readthedocs.yaml @hmellor
 mkdocs.yaml @hmellor
-
-# Linting
-.markdownlint.yaml @hmellor
-.pre-commit-config.yaml @hmellor
-/tools/pre_commit @hmellor
 
 # CPU
-/vllm/v1/worker/cpu* @bigPYJ1151
+/vllm/v1/worker/^cpu @bigPYJ1151
 /csrc/cpu @bigPYJ1151
 /vllm/platforms/cpu.py @bigPYJ1151
 /cmake/cpu_extension.cmake @bigPYJ1151
 /docker/Dockerfile.cpu @bigPYJ1151
 
 # Intel GPU
-/vllm/v1/worker/xpu* @jikunshang
+/vllm/v1/worker/^xpu @jikunshang
 /vllm/platforms/xpu.py @jikunshang
 /docker/Dockerfile.xpu @jikunshang
.github/ISSUE_TEMPLATE/750-RFC.yml (vendored, 4 lines changed)
@@ -43,6 +43,10 @@ body:
         Any other things you would like to mention.
     validations:
       required: false
+  - type: markdown
+    attributes:
+      value: >
+        Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
   - type: checkboxes
     id: askllm
     attributes:
.github/mergify.yml (vendored, 52 lines changed)
@@ -2,7 +2,6 @@ pull_request_rules:
 - name: label-documentation
   description: Automatically apply documentation label
   conditions:
-    - label != stale
     - or:
       - files~=^[^/]+\.md$
       - files~=^docs/
@@ -15,7 +14,6 @@ pull_request_rules:
 - name: label-ci-build
   description: Automatically apply ci/build label
   conditions:
-    - label != stale
     - or:
       - files~=^\.github/
       - files~=\.buildkite/
@@ -32,7 +30,6 @@ pull_request_rules:
 - name: label-deepseek
   description: Automatically apply deepseek label
   conditions:
-    - label != stale
     - or:
       - files~=^examples/.*deepseek.*\.py
       - files~=^tests/.*deepseek.*\.py
@@ -49,7 +46,6 @@ pull_request_rules:
 - name: label-frontend
   description: Automatically apply frontend label
   conditions:
-    - label != stale
     - files~=^vllm/entrypoints/
   actions:
     label:
@@ -59,7 +55,6 @@ pull_request_rules:
 - name: label-llama
   description: Automatically apply llama label
   conditions:
-    - label != stale
     - or:
       - files~=^examples/.*llama.*\.py
       - files~=^tests/.*llama.*\.py
@@ -75,7 +70,6 @@ pull_request_rules:
 - name: label-multi-modality
   description: Automatically apply multi-modality label
   conditions:
-    - label != stale
     - or:
       - files~=^vllm/multimodal/
       - files~=^tests/multimodal/
@@ -89,7 +83,6 @@ pull_request_rules:
 - name: label-new-model
   description: Automatically apply new-model label
   conditions:
-    - label != stale
     - and:
       - files~=^vllm/model_executor/models/
       - files=vllm/model_executor/models/registry.py
@@ -101,7 +94,6 @@ pull_request_rules:
 - name: label-performance
   description: Automatically apply performance label
   conditions:
-    - label != stale
     - or:
       - files~=^benchmarks/
       - files~=^vllm/benchmarks/
@@ -115,7 +107,6 @@ pull_request_rules:
 - name: label-qwen
   description: Automatically apply qwen label
   conditions:
-    - label != stale
     - or:
       - files~=^examples/.*qwen.*\.py
       - files~=^tests/.*qwen.*\.py
@@ -130,7 +121,6 @@ pull_request_rules:
 - name: label-gpt-oss
   description: Automatically apply gpt-oss label
   conditions:
-    - label != stale
     - or:
       - files~=^examples/.*gpt[-_]?oss.*\.py
       - files~=^tests/.*gpt[-_]?oss.*\.py
@@ -152,7 +142,6 @@ pull_request_rules:
 - name: label-rocm
   description: Automatically apply rocm label
   conditions:
-    - label != stale
     - or:
       - files~=^csrc/rocm/
       - files~=^docker/Dockerfile.rocm
@@ -173,7 +162,6 @@ pull_request_rules:
 - name: label-structured-output
   description: Automatically apply structured-output label
   conditions:
-    - label != stale
     - or:
       - files~=^benchmarks/structured_schemas/
       - files=benchmarks/benchmark_serving_structured_output.py
@@ -183,7 +171,7 @@ pull_request_rules:
       - files=examples/online_serving/openai_chat_completion_structured_outputs.py
       - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
       - files~=^tests/v1/structured_output/
-      - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
+      - files=tests/v1/entrypoints/llm/test_guided_generate.py
       - files~=^vllm/v1/structured_output/
   actions:
     label:
@@ -193,7 +181,6 @@ pull_request_rules:
 - name: label-speculative-decoding
   description: Automatically apply speculative-decoding label
   conditions:
-    - label != stale
     - or:
       - files~=^vllm/v1/spec_decode/
       - files~=^tests/v1/spec_decode/
@@ -209,7 +196,6 @@ pull_request_rules:
 - name: label-v1
   description: Automatically apply v1 label
   conditions:
-    - label != stale
     - or:
       - files~=^vllm/v1/
       - files~=^tests/v1/
@@ -222,7 +208,6 @@ pull_request_rules:
   description: Automatically apply tpu label
   # Keep this list in sync with `label-tpu-remove` conditions
   conditions:
-    - label != stale
     - or:
       - files~=tpu.py
       - files~=_tpu
@@ -238,7 +223,6 @@ pull_request_rules:
   description: Automatically remove tpu label
   # Keep this list in sync with `label-tpu` conditions
   conditions:
-    - label != stale
     - and:
       - -files~=tpu.py
       - -files~=_tpu
@@ -253,9 +237,9 @@ pull_request_rules:
 - name: label-tool-calling
   description: Automatically add tool-calling label
   conditions:
-    - label != stale
     - or:
       - files~=^tests/tool_use/
+      - files~=^tests/mistral_tool_use/
       - files~=^tests/entrypoints/openai/tool_parsers/
       - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
       - files~=^vllm/entrypoints/openai/tool_parsers/
@@ -272,9 +256,8 @@ pull_request_rules:
 
 - name: ping author on conflicts and add 'needs-rebase' label
   conditions:
-    - label != stale
     - conflict
     - -closed
   actions:
     label:
       add:
@@ -288,12 +271,10 @@ pull_request_rules:
 
 - name: assign reviewer for tensorizer changes
   conditions:
-    - label != stale
-    - or:
       - files~=^vllm/model_executor/model_loader/tensorizer.py
       - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
       - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
-      - files~=^tests/model_executor/model_loader/tensorizer_loader/
+      - files~=^tests/tensorizer_loader/
   actions:
     assign:
       users:
@@ -301,7 +282,6 @@ pull_request_rules:
 
 - name: assign reviewer for modelopt changes
   conditions:
-    - label != stale
     - or:
       - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
       - files~=^vllm/model_executor/layers/quantization/__init__\.py$
@@ -316,27 +296,9 @@ pull_request_rules:
 
 - name: remove 'needs-rebase' label when conflict is resolved
   conditions:
     - -conflict
     - -closed
   actions:
     label:
       remove:
         - needs-rebase
-
-- name: label-kv-connector
-  description: Automatically apply kv-connector label
-  conditions:
-    - label != stale
-    - or:
-      - files~=^examples/online_serving/disaggregated[^/]*/.*
-      - files~=^examples/offline_inference/disaggregated[^/]*/.*
-      - files~=^examples/others/lmcache/
-      - files~=^tests/v1/kv_connector/
-      - files~=^vllm/distributed/kv_transfer/
-      - title~=(?i)\bP/?D\b
-      - title~=(?i)NIXL
-      - title~=(?i)LMCache
-  actions:
-    label:
-      add:
-        - kv-connector
.github/workflows/stale.yml (vendored, 2 lines changed)
@@ -13,7 +13,7 @@ jobs:
       actions: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
+      - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
         with:
           # Increasing this value ensures that changes to this workflow
           # propagate to all issues and PRs in days rather than months
@@ -6,16 +6,28 @@ default_stages:
   - manual # Run in CI
 exclude: 'vllm/third_party/.*'
 repos:
-- repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.13.3
+- repo: https://github.com/google/yapf
+  rev: v0.43.0
   hooks:
-  - id: ruff-check
+  - id: yapf
+    args: [--in-place, --verbose]
+    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
+    exclude: '(.buildkite|benchmarks|build|examples)/.*'
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  rev: v0.11.7
+  hooks:
+  - id: ruff
     args: [--output-format, github, --fix]
   - id: ruff-format
+    files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
   rev: v1.35.5
   hooks:
   - id: typos
+- repo: https://github.com/PyCQA/isort
+  rev: 6.0.1
+  hooks:
+  - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
   rev: v20.1.3
   hooks:
@@ -37,7 +49,7 @@ repos:
   rev: 0.6.17
   hooks:
   - id: pip-compile
-    args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
+    args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
     files: ^requirements/test\.(in|txt)$
 - repo: local
   hooks:
@@ -48,32 +60,38 @@ repos:
     files: ^requirements/test\.(in|txt)$
   - id: mypy-local
     name: Run mypy for local Python installation
-    entry: python tools/pre_commit/mypy.py 0 "local"
+    entry: tools/mypy.sh 0 "local"
+    language: python
+    types: [python]
+    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
     stages: [pre-commit] # Don't run in CI
-    <<: &mypy_common
-      language: python
-      types_or: [python, pyi]
-      require_serial: true
-      additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
   - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.9
-    entry: python tools/pre_commit/mypy.py 1 "3.9"
+    entry: tools/mypy.sh 1 "3.9"
-    <<: *mypy_common
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
     stages: [manual] # Only run in CI
   - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.10
-    entry: python tools/pre_commit/mypy.py 1 "3.10"
+    entry: tools/mypy.sh 1 "3.10"
-    <<: *mypy_common
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
     stages: [manual] # Only run in CI
   - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.11
-    entry: python tools/pre_commit/mypy.py 1 "3.11"
+    entry: tools/mypy.sh 1 "3.11"
-    <<: *mypy_common
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
     stages: [manual] # Only run in CI
   - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.12
-    entry: python tools/pre_commit/mypy.py 1 "3.12"
+    entry: tools/mypy.sh 1 "3.12"
-    <<: *mypy_common
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
     stages: [manual] # Only run in CI
   - id: shellcheck
     name: Lint shell scripts
@@ -137,15 +155,18 @@ repos:
     additional_dependencies: [regex]
   - id: check-pickle-imports
     name: Prevent new pickle/cloudpickle imports
-    entry: python tools/pre_commit/check_pickle_imports.py
+    entry: python tools/check_pickle_imports.py
     language: python
     types: [python]
-    additional_dependencies: [regex]
+    pass_filenames: false
+    additional_dependencies: [pathspec, regex]
   - id: validate-config
     name: Validate configuration has default values and that each field has a docstring
     entry: python tools/validate_config.py
     language: python
-    additional_dependencies: [regex]
+    types: [python]
+    pass_filenames: true
+    files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
   # Keep `suggestion` last
   - id: suggestion
     name: Suggestion
@@ -13,7 +13,6 @@ build:
 
 mkdocs:
   configuration: mkdocs.yaml
-  fail_on_warning: true
 
 # Optionally declare the Python requirements required to build your docs
 python:
@@ -37,7 +37,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
 
 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
 
 #
 # Supported/expected torch versions for CUDA/ROCm.
@@ -86,9 +86,6 @@ find_package(Torch REQUIRED)
 # Supported NVIDIA architectures.
 # This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
 if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
-    CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
-  set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
-elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
     CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
   set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 else()
@@ -178,15 +175,6 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()
 
-#
-# Set compression mode for CUDA >=13.x.
-#
-if(VLLM_GPU_LANG STREQUAL "CUDA" AND
-    DEFINED CMAKE_CUDA_COMPILER_VERSION AND
-    CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
-  list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
-endif()
-
 #
 # Set CUDA include flags for CXX compiler.
 #
@@ -282,7 +270,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
 
 # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
+set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
 
 # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
 if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -317,6 +305,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
       "csrc/quantization/fp4/nvfp4_quant_entry.cu"
       "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
+      "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
      "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
       "csrc/cutlass_extensions/common.cpp"
       "csrc/quantization/fp8/per_token_group_quant.cu")
@@ -451,11 +440,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
   # CUDA 12.8 or later
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
@@ -485,11 +470,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
   # require CUDA 12.8 or later
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
@@ -569,11 +550,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
   # CUDA 12.8 or later
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
       "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@@ -592,11 +569,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   # FP4 Archs and flags
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
       "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@@ -618,11 +591,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   # CUTLASS MLA Archs and flags
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
     set(SRCS
       "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
@@ -666,11 +635,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()
 
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
     set_gencode_flags_for_srcs(
@@ -691,11 +656,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   # moe_data.cu is used by all CUTLASS MoE kernels.
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
     set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
     set_gencode_flags_for_srcs(
@@ -714,11 +675,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()
 
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
     set_gencode_flags_for_srcs(
@@ -21,7 +21,6 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
 
 *Latest News* 🔥
 
-- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
 - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
 - [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
 - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
@@ -103,15 +103,10 @@ start_server() {
       VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
       vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
   fi
-  local server_pid=$!
 
   # wait for 10 minutes...
   server_started=0
   for i in {1..60}; do
-    # This line checks whether the server is still alive or not,
-    # since that we should always have permission to send signal to the server process.
-    kill -0 $server_pid 2> /dev/null || break
-
     RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
     STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
     if [[ "$STATUS_CODE" -eq 200 ]]; then
@@ -123,7 +118,7 @@ start_server() {
   done
 
   if (( ! server_started )); then
-    echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log".
+    echo "server did not start within 10 minutes. Please check server log at $vllm_log".
     return 1
   else
     return 0
@@ -2,9 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
 
-from benchmark_utils import TimeCollector
 from tabulate import tabulate
 
+from benchmark_utils import TimeCollector
 from vllm.utils import FlexibleArgumentParser
 from vllm.v1.core.block_pool import BlockPool
 
@@ -1,31 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
-import time
-from unittest import mock
 
 import numpy as np
-from benchmark_utils import TimeCollector
 from tabulate import tabulate
 
-from vllm.config import (
-    CacheConfig,
-    DeviceConfig,
-    LoadConfig,
-    ModelConfig,
-    ParallelConfig,
-    SchedulerConfig,
-    SpeculativeConfig,
-    VllmConfig,
-)
-from vllm.platforms import current_platform
+from benchmark_utils import TimeCollector
+from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
 from vllm.utils import FlexibleArgumentParser
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
-from vllm.v1.worker.gpu_input_batch import InputBatch
-from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 
 
-def benchmark_propose(args):
+def main(args):
     rows = []
     for max_ngram in args.max_ngram:
         collector = TimeCollector(TimeCollector.US)
@@ -83,88 +69,10 @@ def benchmark_propose(args):
     )
 
 
-def benchmark_batched_propose(args):
-    NUM_SPECULATIVE_TOKENS_NGRAM = 10
-    PROMPT_LOOKUP_MIN = 5
-    PROMPT_LOOKUP_MAX = 15
-    MAX_MODEL_LEN = int(1e7)
-    DEVICE = current_platform.device_type
-
-    model_config = ModelConfig(model="facebook/opt-125m", runner="generate")
-
-    speculative_config = SpeculativeConfig(
-        target_model_config=model_config,
-        target_parallel_config=ParallelConfig(),
-        method="ngram",
-        num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM,
-        prompt_lookup_max=PROMPT_LOOKUP_MAX,
-        prompt_lookup_min=PROMPT_LOOKUP_MIN,
-    )
-
-    vllm_config = VllmConfig(
-        model_config=model_config,
-        cache_config=CacheConfig(),
-        speculative_config=speculative_config,
-        device_config=DeviceConfig(device=current_platform.device_type),
-        parallel_config=ParallelConfig(),
-        load_config=LoadConfig(),
-        scheduler_config=SchedulerConfig(),
-    )
-
-    # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
-    mock_pp_group = mock.MagicMock()
-    mock_pp_group.world_size = 1
-    with mock.patch(
-        "vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group
-    ):
-        runner = GPUModelRunner(vllm_config, DEVICE)
-
-    # hack max model len
-    runner.max_model_len = MAX_MODEL_LEN
-    runner.drafter.max_model_len = MAX_MODEL_LEN
-
-    dummy_input_batch = InputBatch(
-        max_num_reqs=args.num_req,
-        max_model_len=MAX_MODEL_LEN,
-        max_num_batched_tokens=args.num_req * args.num_token,
-        device=DEVICE,
-        pin_memory=False,
-        vocab_size=256000,
-        block_sizes=[16],
-    )
-    dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
-    dummy_input_batch.spec_decode_unsupported_reqs = ()
-    dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
-    dummy_input_batch.token_ids_cpu = np.random.randint(
-        0, 20, (args.num_req, args.num_token)
-    )
-
-    runner.input_batch = dummy_input_batch
-
-    sampled_token_ids = [[0]] * args.num_req
-
-    print("Starting benchmark")
-    # first run is warmup so ignore it
-    for _ in range(args.num_iteration):
-        start = time.time()
-        runner.drafter.propose(
-            sampled_token_ids,
-            dummy_input_batch.req_ids,
-            dummy_input_batch.num_tokens_no_spec,
-            dummy_input_batch.token_ids_cpu,
-            dummy_input_batch.spec_decode_unsupported_reqs,
-        )
-        end = time.time()
-        print(f"Iteration time (s): {end - start}")
-
-
 def invoke_main() -> None:
     parser = FlexibleArgumentParser(
         description="Benchmark the performance of N-gram speculative decode drafting"
     )
-    parser.add_argument(
-        "--batched", action="store_true", help="consider time to prepare batch"
-    )
     parser.add_argument(
         "--num-iteration",
         type=int,
@ -197,17 +105,8 @@ def invoke_main() -> None:
|
|||||||
help="Number of speculative tokens to generate",
|
help="Number of speculative tokens to generate",
|
||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
main(args)
|
||||||
if not args.batched:
|
|
||||||
benchmark_propose(args)
|
|
||||||
else:
|
|
||||||
benchmark_batched_propose(args)
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
# Example command lines:
|
|
||||||
# time python3 benchmarks/benchmark_ngram_proposer.py
|
|
||||||
# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128
|
|
||||||
""" # noqa: E501
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
invoke_main() # pragma: no cover
|
invoke_main() # pragma: no cover
|
||||||
|
|||||||
@@ -37,13 +37,14 @@ from typing import Optional
 import datasets
 import numpy as np
 import pandas as pd
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase

 from backend_request_func import (
     ASYNC_REQUEST_FUNCS,
     RequestFuncInput,
     RequestFuncOutput,
 )
-from tqdm.asyncio import tqdm
-from transformers import PreTrainedTokenizerBase

 try:
     from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -448,8 +449,7 @@ async def benchmark(
     def prepare_extra_body(request) -> dict:
         extra_body = {}
         # Add the schema to the extra_body
-        extra_body["structured_outputs"] = {}
-        extra_body["structured_outputs"][request.structure_type] = request.schema
+        extra_body[request.structure_type] = request.schema
         return extra_body

     print("Starting initial single prompt test run...")
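The hunk above flattens the request payload: instead of nesting the schema under a "structured_outputs" key, the structure type itself becomes the top-level key of extra_body. A minimal sketch of the two shapes, assuming a hypothetical JSON schema dict and the structure-type strings used elsewhere in this diff:

    schema = {"type": "object", "properties": {"name": {"type": "string"}}}

    # One side of the hunk: schema nested under "structured_outputs".
    extra_body_nested = {"structured_outputs": {"json": schema}}

    # Other side of the hunk: the structure type is the top-level key.
    extra_body_flat = {"guided_json": schema}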
@@ -696,11 +696,11 @@ def evaluate(ret, args):
         return re.match(args.regex, actual) is not None

     def _eval_correctness(expected, actual):
-        if args.structure_type == "json":
+        if args.structure_type == "guided_json":
             return _eval_correctness_json(expected, actual)
-        elif args.structure_type == "regex":
+        elif args.structure_type == "guided_regex":
             return _eval_correctness_regex(expected, actual)
-        elif args.structure_type == "choice":
+        elif args.structure_type == "guided_choice":
             return _eval_correctness_choice(expected, actual)
         else:
             return None
@@ -780,18 +780,18 @@ def main(args: argparse.Namespace):
     )

     if args.dataset == "grammar":
-        args.structure_type = "grammar"
+        args.structure_type = "guided_grammar"
     elif args.dataset == "regex":
-        args.structure_type = "regex"
+        args.structure_type = "guided_regex"
     elif args.dataset == "choice":
-        args.structure_type = "choice"
+        args.structure_type = "guided_choice"
     else:
-        args.structure_type = "json"
+        args.structure_type = "guided_json"

     if args.no_structured_output:
         args.structured_output_ratio = 0
     if args.save_results:
-        result_file_name = f"{args.structured_output_ratio}so"
+        result_file_name = f"{args.structured_output_ratio}guided"
         result_file_name += f"_{backend}"
         result_file_name += f"_{args.request_rate}qps"
         result_file_name += f"_{args.model.split('/')[-1]}"
@@ -909,13 +909,13 @@ def create_argument_parser():
     parser.add_argument(
         "--tokenizer",
         type=str,
-        help="Name or path of the tokenizer, if not using the default tokenizer.",
+        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
     )
     parser.add_argument(
         "--tokenizer-mode",
         type=str,
         default="auto",
-        help="Name or path of the tokenizer, if not using the default tokenizer.",
+        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
     )
     parser.add_argument(
         "--num-prompts",
@@ -17,7 +17,7 @@ from weight_shapes import WEIGHT_SHAPES

 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    w8a8_triton_block_scaled_mm,
+    w8a8_block_fp8_matmul,
 )
 from vllm.utils import FlexibleArgumentParser, cdiv

@@ -158,7 +158,7 @@ def bench_fp8(
         "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
             a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
         ),
-        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
+        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
             a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
         ),
         "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
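The only change in these two hunks is the helper's name; the arguments (row-major A, transposed B, per-block scales, a (128, 128) block size) are untouched. A small compatibility shim, an assumption rather than part of either branch, lets a local script run against whichever revision is installed:

    try:
        from vllm.model_executor.layers.quantization.utils.fp8_utils import (
            w8a8_triton_block_scaled_mm as w8a8_blockwise_mm,
        )
    except ImportError:
        from vllm.model_executor.layers.quantization.utils.fp8_utils import (
            w8a8_block_fp8_matmul as w8a8_blockwise_mm,
        )

    # Then call it exactly as the benchmark does:
    # out = w8a8_blockwise_mm(a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128))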
@@ -55,7 +55,9 @@ benchmark() {
    output_len=$2


-    CUDA_VISIBLE_DEVICES=0 vllm serve $model \
+    CUDA_VISIBLE_DEVICES=0 python3 \
+        -m vllm.entrypoints.openai.api_server \
+        --model $model \
        --port 8100 \
        --max-model-len 10000 \
        --gpu-memory-utilization 0.6 \
@@ -63,7 +65,9 @@ benchmark() {
        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &


-    CUDA_VISIBLE_DEVICES=1 vllm serve $model \
+    CUDA_VISIBLE_DEVICES=1 python3 \
+        -m vllm.entrypoints.openai.api_server \
+        --model $model \
        --port 8200 \
        --max-model-len 10000 \
        --gpu-memory-utilization 0.6 \
@@ -38,12 +38,16 @@ wait_for_server() {
 launch_chunked_prefill() {
    model="meta-llama/Meta-Llama-3.1-8B-Instruct"
    # disagg prefill
-    CUDA_VISIBLE_DEVICES=0 vllm serve $model \
+    CUDA_VISIBLE_DEVICES=0 python3 \
+        -m vllm.entrypoints.openai.api_server \
+        --model $model \
        --port 8100 \
        --max-model-len 10000 \
        --enable-chunked-prefill \
        --gpu-memory-utilization 0.6 &
-    CUDA_VISIBLE_DEVICES=1 vllm serve $model \
+    CUDA_VISIBLE_DEVICES=1 python3 \
+        -m vllm.entrypoints.openai.api_server \
+        --model $model \
        --port 8200 \
        --max-model-len 10000 \
        --enable-chunked-prefill \
@@ -58,14 +62,18 @@ launch_chunked_prefill() {
 launch_disagg_prefill() {
    model="meta-llama/Meta-Llama-3.1-8B-Instruct"
    # disagg prefill
-    CUDA_VISIBLE_DEVICES=0 vllm serve $model \
+    CUDA_VISIBLE_DEVICES=0 python3 \
+        -m vllm.entrypoints.openai.api_server \
+        --model $model \
        --port 8100 \
        --max-model-len 10000 \
        --gpu-memory-utilization 0.6 \
        --kv-transfer-config \
        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

-    CUDA_VISIBLE_DEVICES=1 vllm serve $model \
+    CUDA_VISIBLE_DEVICES=1 python3 \
+        -m vllm.entrypoints.openai.api_server \
+        --model $model \
        --port 8200 \
        --max-model-len 10000 \
        --gpu-memory-utilization 0.6 \
@@ -3,7 +3,6 @@
 import argparse
 import copy
 import itertools
-import os

 import torch
 from weight_shapes import WEIGHT_SHAPES
@@ -24,45 +23,21 @@ PROVIDER_CFGS = {
     "torch-bf16": dict(enabled=True),
     "nvfp4": dict(no_a_quant=False, enabled=True),
     "nvfp4-noquant": dict(no_a_quant=True, enabled=True),
-    "fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True),
-    "fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True),
 }

-_needs_fbgemm = any(
-    v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False)
-)
-if _needs_fbgemm:
-    try:
-        from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
-            triton_scale_nvfp4_quant,
-        )
-    except ImportError:
-        print(
-            "WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. "
-            "These providers will be skipped. Please install fbgemm_gpu with: "
-            "'pip install fbgemm-gpu-genai' to run them."
-        )
-        # Disable FBGEMM providers so the benchmark can run.
-        for cfg in PROVIDER_CFGS.values():
-            if cfg.get("fbgemm"):
-                cfg["enabled"] = False
-
 _enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]


-def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg):
+def _quant_weight_nvfp4(b: torch.Tensor, device: str):
     # Compute global scale for weight
     b_amax = torch.abs(b).max().to(torch.float32)
     b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
-    if "fbgemm" in cfg and cfg["fbgemm"]:
-        b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale)
-    else:
-        b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
+    b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
     return b_fp4, scale_b_fp4, b_global_scale


 def build_nvfp4_runner(cfg, a, b, dtype, device):
-    b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg)
+    b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device)

     # Compute global scale for activation
     # NOTE: This is generally provided ahead-of-time by the model checkpoint.
@@ -71,35 +46,6 @@ def build_nvfp4_runner(cfg, a, b, dtype, device):

     # Alpha for the GEMM operation
     alpha = 1.0 / (a_global_scale * b_global_scale)
-    if "fbgemm" in cfg and cfg["fbgemm"]:
-        if cfg["no_a_quant"]:
-            a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
-
-            def run():
-                return torch.ops.fbgemm.f4f4bf16(
-                    a_fp4,
-                    b_fp4,
-                    scale_a_fp4,
-                    scale_b_fp4,
-                    global_scale=alpha,
-                    use_mx=False,
-                )
-
-            return run
-        else:
-
-            def run():
-                a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
-                return torch.ops.fbgemm.f4f4bf16(
-                    a_fp4,
-                    b_fp4,
-                    scale_a_fp4,
-                    scale_b_fp4,
-                    global_scale=alpha,
-                    use_mx=False,
-                )
-
-            return run
-
     if cfg["no_a_quant"]:
         # Pre-quantize activation
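For reference, the scale arithmetic both sides of these hunks share: each operand gets a global scale derived from its absolute maximum, and the GEMM applies alpha to undo both scales. A standalone sketch with assumed constants (the real FLOAT8_E4M3_MAX / FLOAT4_E2M1_MAX come from vLLM):

    import torch

    FLOAT8_E4M3_MAX = 448.0  # assumed max magnitude of float8_e4m3
    FLOAT4_E2M1_MAX = 6.0    # assumed max magnitude of the e2m1 FP4 format

    def global_scale(t: torch.Tensor) -> torch.Tensor:
        # Scale chosen so amax maps to the largest representable FP4 * FP8 product.
        amax = torch.abs(t).max().to(torch.float32)
        return FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / amax

    a, b = torch.randn(128, 256), torch.randn(512, 256)
    a_gs, b_gs = global_scale(a), global_scale(b)
    alpha = 1.0 / (a_gs * b_gs)  # applied once to the low-precision GEMM result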
@@ -184,13 +130,10 @@ if __name__ == "__main__":

     for K, N, model in prepare_shapes(args):
         print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
-        save_dir = f"bench_nvfp4_res_n{N}_k{K}"
-        os.makedirs(save_dir, exist_ok=True)

         benchmark.run(
             print_data=True,
             show_plots=True,
-            save_path=save_dir,
+            save_path=f"bench_nvfp4_res_n{N}_k{K}",
             N=N,
             K=K,
         )
@@ -51,7 +51,7 @@ def calculate_diff(
 ):
     """Calculate the difference between Inductor and CUDA implementations."""
     device = torch.device("cuda")
-    x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device)
+    x = torch.rand((batch_size * hidden_size, 4096), dtype=dtype, device=device)

     quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)

@@ -59,25 +59,23 @@ def calculate_diff(
     torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
     cuda_out, cuda_scale = quant_fp8.forward_cuda(x)

-    try:
-        torch.testing.assert_close(
-            cuda_out.to(torch.float32),
-            torch_out.to(torch.float32),
-            rtol=1e-3,
-            atol=1e-5,
-        )
-        torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5)
-        torch.testing.assert_close(
-            cuda_out.to(torch.float32),
-            torch_eager_out.to(torch.float32),
-            rtol=1e-3,
-            atol=1e-5,
-        )
-        torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5)
+    out_allclose = lambda o1, o2: torch.allclose(
+        o1.to(torch.float32),
+        o2.to(torch.float32),
+        rtol=1e-3,
+        atol=1e-5,
+    )
+    scale_allclose = lambda s1, s2: torch.allclose(s1, s2, rtol=1e-3, atol=1e-5)
+
+    if (
+        out_allclose(cuda_out, torch_out)
+        and scale_allclose(cuda_scale, torch_scale)
+        and out_allclose(cuda_out, torch_eager_out)
+        and scale_allclose(cuda_scale, torch_eager_scale)
+    ):
         print("✅ All implementations match")
-    except AssertionError as e:
+    else:
         print("❌ Implementations differ")
-        print(e)


 configs = []
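The behavioral difference this hunk trades on: torch.testing.assert_close raises on mismatch (hence the try/except on one side), while torch.allclose only returns a boolean. A minimal sketch:

    import torch

    a = torch.tensor([1.0, 2.0])
    b = torch.tensor([1.0, 2.1])

    print(torch.allclose(a, b, rtol=1e-3, atol=1e-5))  # False, no exception raised

    try:
        torch.testing.assert_close(a, b, rtol=1e-3, atol=1e-5)
    except AssertionError as e:
        print("mismatch:", e)  # raises with a per-element report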
@@ -93,7 +91,7 @@ def benchmark_quantization(
 ):
     device = torch.device("cuda")

-    x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype)
+    x = torch.randn(batch_size * hidden_size, 4096, device=device, dtype=dtype)

     quantiles = [0.5, 0.2, 0.8]
     quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)
@@ -159,21 +157,21 @@ if __name__ == "__main__":
     )
     parser.add_argument("-c", "--check", action="store_true")
     parser.add_argument(
-        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
+        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half"
     )
     parser.add_argument(
         "--hidden-sizes",
         type=int,
         nargs="+",
-        default=[896, 1024, 2048, 4096, 7168],
-        help="Hidden sizes to benchmark",
+        default=None,
+        help="Hidden sizes to benchmark (default: 1,16,64,128,256,512,1024,2048,4096)",
     )
     parser.add_argument(
         "--batch-sizes",
         type=int,
         nargs="+",
-        default=[1, 16, 128, 512, 1024],
-        help="Batch sizes to benchmark",
+        default=None,
+        help="Batch sizes to benchmark (default: 1,16,32,64,128)",
     )
     parser.add_argument(
         "--group-sizes",
@@ -194,8 +192,8 @@ if __name__ == "__main__":

     dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]

-    hidden_sizes = args.hidden_sizes
-    batch_sizes = args.batch_sizes
+    hidden_sizes = args.hidden_sizes or [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
+    batch_sizes = args.batch_sizes or [1, 16, 32, 64, 128]

     if args.group_sizes is not None:
         group_shapes = []
@ -1,406 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
"""
|
|
||||||
Benchmark the performance of the cutlass_moe_fp8 kernel vs the triton_moe
|
|
||||||
kernel. Both kernels take in fp8 quantized weights and 16-bit activations,
|
|
||||||
but use different quantization strategies and backends.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import nvtx
|
|
||||||
import torch
|
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
|
||||||
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
|
||||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
|
|
||||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
|
|
||||||
from vllm.platforms import current_platform
|
|
||||||
from vllm.utils import FlexibleArgumentParser
|
|
||||||
|
|
||||||
# Weight shapes for different models: [num_experts, topk, hidden_size,
|
|
||||||
# intermediate_size]
|
|
||||||
WEIGHT_SHAPES_MOE = {
|
|
||||||
"mixtral-8x7b": [
|
|
||||||
[8, 2, 4096, 14336],
|
|
||||||
],
|
|
||||||
"deepseek-v2": [
|
|
||||||
[160, 6, 5120, 12288],
|
|
||||||
],
|
|
||||||
"custom-small": [
|
|
||||||
[8, 2, 2048, 7168],
|
|
||||||
],
|
|
||||||
"glm45-fp8": [
|
|
||||||
[128, 8, 4096, 1408],
|
|
||||||
],
|
|
||||||
"Llama-4-Maverick-17B-128E-Instruct-FP8": [
|
|
||||||
[128, 1, 5120, 8192],
|
|
||||||
],
|
|
||||||
}
|
|
||||||
|
|
||||||
DEFAULT_MODELS = [
|
|
||||||
"mixtral-8x7b",
|
|
||||||
]
|
|
||||||
|
|
||||||
DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
|
|
||||||
DEFAULT_TP_SIZES = [1]
|
|
||||||
|
|
||||||
PER_ACT_TOKEN_OPTS = [False, True]
|
|
||||||
PER_OUT_CH_OPTS = [False, True]
|
|
||||||
|
|
||||||
FP8_DTYPE = current_platform.fp8_dtype()
|
|
||||||
|
|
||||||
|
|
||||||
def bench_run(
|
|
||||||
results: list,
|
|
||||||
model: str,
|
|
||||||
num_experts: int,
|
|
||||||
topk: int,
|
|
||||||
per_act_token: bool,
|
|
||||||
per_out_ch: bool,
|
|
||||||
mkn: tuple[int, int, int],
|
|
||||||
):
|
|
||||||
(m, k, n) = mkn
|
|
||||||
|
|
||||||
dtype = torch.half
|
|
||||||
device = "cuda"
|
|
||||||
|
|
||||||
# Create input activations
|
|
||||||
a = torch.randn((m, k), device=device, dtype=dtype) / 10
|
|
||||||
|
|
||||||
# Create weights
|
|
||||||
w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10
|
|
||||||
w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10
|
|
||||||
|
|
||||||
# Create FP8 quantized weights and scales for both kernels
|
|
||||||
w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=FP8_DTYPE)
|
|
||||||
w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=FP8_DTYPE)
|
|
||||||
|
|
||||||
# Create scales based on quantization strategy
|
|
||||||
if per_out_ch:
|
|
||||||
# Per-channel quantization
|
|
||||||
w1_scale = torch.empty(
|
|
||||||
(num_experts, 2 * n, 1), device=device, dtype=torch.float32
|
|
||||||
)
|
|
||||||
w2_scale = torch.empty((num_experts, k, 1), device=device, dtype=torch.float32)
|
|
||||||
else:
|
|
||||||
# Per-tensor quantization
|
|
||||||
w1_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
|
|
||||||
w2_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
|
|
||||||
|
|
||||||
# Quantize weights
|
|
||||||
for expert in range(num_experts):
|
|
||||||
if per_out_ch:
|
|
||||||
# Per-channel quantization - not yet implemented properly
|
|
||||||
# For now, fall back to per-tensor quantization
|
|
||||||
w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
|
|
||||||
w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
|
|
||||||
# Expand scalar scales to the expected per-channel shape
|
|
||||||
w1_scale[expert] = w1_scale_temp.expand(2 * n, 1)
|
|
||||||
w2_scale[expert] = w2_scale_temp.expand(k, 1)
|
|
||||||
else:
|
|
||||||
# Per-tensor quantization
|
|
||||||
w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
|
|
||||||
w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
|
|
||||||
# Store scalar scales in [1, 1] tensors
|
|
||||||
w1_scale[expert, 0, 0] = w1_scale_temp
|
|
||||||
w2_scale[expert, 0, 0] = w2_scale_temp
|
|
||||||
|
|
||||||
# Prepare weights for CUTLASS (no transpose needed)
|
|
||||||
w1_fp8q_cutlass = w1_fp8q # Keep original [E, 2N, K]
|
|
||||||
w2_fp8q_cutlass = w2_fp8q # Keep original [E, K, N]
|
|
||||||
|
|
||||||
# Create router scores and get topk
|
|
||||||
score = torch.randn((m, num_experts), device=device, dtype=dtype)
|
|
||||||
topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
|
|
||||||
|
|
||||||
# WORKAROUND: CUTLASS MoE FP8 has issues with per-token quantization
|
|
||||||
# Force per-tensor quantization for all cases to match working e2e setup
|
|
||||||
a1_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
|
|
||||||
a2_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
|
|
||||||
|
|
||||||
# Force per-tensor quantization for all cases
|
|
||||||
per_act_token = False
|
|
||||||
|
|
||||||
# Create stride tensors for CUTLASS
|
|
||||||
ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
|
|
||||||
ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device)
|
|
||||||
c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device)
|
|
||||||
c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
|
|
||||||
|
|
||||||
def run_triton_moe(
|
|
||||||
a: torch.Tensor,
|
|
||||||
w1: torch.Tensor,
|
|
||||||
w2: torch.Tensor,
|
|
||||||
topk_weights: torch.Tensor,
|
|
||||||
topk_ids: torch.Tensor,
|
|
||||||
w1_scale: torch.Tensor,
|
|
||||||
w2_scale: torch.Tensor,
|
|
||||||
a1_scale: torch.Tensor,
|
|
||||||
a2_scale: torch.Tensor,
|
|
||||||
num_repeats: int,
|
|
||||||
):
|
|
||||||
quant_config = fp8_w8a8_moe_quant_config(
|
|
||||||
w1_scale=w1_scale,
|
|
||||||
w2_scale=w2_scale,
|
|
||||||
a1_scale=a1_scale,
|
|
||||||
a2_scale=a2_scale,
|
|
||||||
per_act_token_quant=per_act_token,
|
|
||||||
per_out_ch_quant=per_out_ch,
|
|
||||||
)
|
|
||||||
|
|
||||||
for _ in range(num_repeats):
|
|
||||||
fused_experts(
|
|
||||||
a,
|
|
||||||
w1,
|
|
||||||
w2,
|
|
||||||
topk_weights,
|
|
||||||
topk_ids,
|
|
||||||
quant_config=quant_config,
|
|
||||||
)
|
|
||||||
|
|
||||||
def run_cutlass_moe_fp8(
|
|
||||||
a: torch.Tensor,
|
|
||||||
w1: torch.Tensor,
|
|
||||||
w2: torch.Tensor,
|
|
||||||
topk_weights: torch.Tensor,
|
|
||||||
topk_ids: torch.Tensor,
|
|
||||||
ab_strides1: torch.Tensor,
|
|
||||||
ab_strides2: torch.Tensor,
|
|
||||||
c_strides1: torch.Tensor,
|
|
||||||
c_strides2: torch.Tensor,
|
|
||||||
w1_scale: torch.Tensor,
|
|
||||||
w2_scale: torch.Tensor,
|
|
||||||
a1_scale: torch.Tensor,
|
|
||||||
a2_scale: torch.Tensor,
|
|
||||||
num_repeats: int,
|
|
||||||
):
|
|
||||||
quant_config = fp8_w8a8_moe_quant_config(
|
|
||||||
w1_scale=w1_scale,
|
|
||||||
w2_scale=w2_scale,
|
|
||||||
a1_scale=a1_scale,
|
|
||||||
a2_scale=a2_scale,
|
|
||||||
per_act_token_quant=per_act_token,
|
|
||||||
per_out_ch_quant=per_out_ch,
|
|
||||||
)
|
|
||||||
|
|
||||||
for _ in range(num_repeats):
|
|
||||||
with nvtx.annotate("cutlass_moe_fp8", color="blue"):
|
|
||||||
cutlass_moe_fp8(
|
|
||||||
a=a,
|
|
||||||
w1_q=w1,
|
|
||||||
w2_q=w2,
|
|
||||||
topk_weights=topk_weights,
|
|
||||||
topk_ids=topk_ids,
|
|
||||||
ab_strides1=ab_strides1,
|
|
||||||
ab_strides2=ab_strides2,
|
|
||||||
c_strides1=c_strides1,
|
|
||||||
c_strides2=c_strides2,
|
|
||||||
quant_config=quant_config,
|
|
||||||
activation="silu",
|
|
||||||
global_num_experts=num_experts,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Pre-create quantization config to avoid creating it inside CUDA graph
|
|
||||||
quant_config = fp8_w8a8_moe_quant_config(
|
|
||||||
w1_scale=w1_scale,
|
|
||||||
w2_scale=w2_scale,
|
|
||||||
a1_scale=a1_scale,
|
|
||||||
a2_scale=a2_scale,
|
|
||||||
per_act_token_quant=per_act_token,
|
|
||||||
per_out_ch_quant=per_out_ch,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly)
|
|
||||||
cutlass_stream = torch.cuda.Stream()
|
|
||||||
cutlass_graph = torch.cuda.CUDAGraph()
|
|
||||||
with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
|
|
||||||
# Capture 10 invocations like benchmark_moe.py
|
|
||||||
for _ in range(10):
|
|
||||||
cutlass_moe_fp8(
|
|
||||||
a=a,
|
|
||||||
w1_q=w1_fp8q_cutlass,
|
|
||||||
w2_q=w2_fp8q_cutlass,
|
|
||||||
topk_weights=topk_weights,
|
|
||||||
topk_ids=topk_ids,
|
|
||||||
ab_strides1=ab_strides1,
|
|
||||||
ab_strides2=ab_strides2,
|
|
||||||
c_strides1=c_strides1,
|
|
||||||
c_strides2=c_strides2,
|
|
||||||
quant_config=quant_config,
|
|
||||||
activation="silu",
|
|
||||||
global_num_experts=num_experts,
|
|
||||||
)
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
# Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
|
|
||||||
triton_stream = torch.cuda.Stream()
|
|
||||||
triton_graph = torch.cuda.CUDAGraph()
|
|
||||||
with torch.cuda.graph(triton_graph, stream=triton_stream):
|
|
||||||
# Capture 10 invocations like benchmark_moe.py
|
|
||||||
for _ in range(10):
|
|
||||||
fused_experts(
|
|
||||||
a,
|
|
||||||
w1_fp8q,
|
|
||||||
w2_fp8q,
|
|
||||||
topk_weights,
|
|
||||||
topk_ids,
|
|
||||||
quant_config=quant_config,
|
|
||||||
)
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
|
|
||||||
"""Benchmark CUDA graph using events like benchmark_moe.py"""
|
|
||||||
# Warmup
|
|
||||||
for _ in range(num_warmup):
|
|
||||||
graph.replay()
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
# Timing
|
|
||||||
start_event = torch.cuda.Event(enable_timing=True)
|
|
||||||
end_event = torch.cuda.Event(enable_timing=True)
|
|
||||||
|
|
||||||
latencies = []
|
|
||||||
for _ in range(num_iters):
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
start_event.record()
|
|
||||||
graph.replay()
|
|
||||||
end_event.record()
|
|
||||||
end_event.synchronize()
|
|
||||||
latencies.append(start_event.elapsed_time(end_event))
|
|
||||||
|
|
||||||
# Divide by 10 since graph contains 10 calls
|
|
||||||
return sum(latencies) / (num_iters * 10)
|
|
||||||
|
|
||||||
# Benchmark parameters
|
|
||||||
num_warmup = 5
|
|
||||||
num_iters = 100
|
|
||||||
|
|
||||||
# Benchmark only CUDA graphs (more reliable and faster)
|
|
||||||
# Benchmark Triton MoE with CUDA graphs
|
|
||||||
triton_graph_time = bench_cuda_graph(
|
|
||||||
triton_graph, num_warmup=num_warmup, num_iters=num_iters
|
|
||||||
)
|
|
||||||
|
|
||||||
# Benchmark CUTLASS MoE with CUDA graphs
|
|
||||||
cutlass_graph_time = bench_cuda_graph(
|
|
||||||
cutlass_graph, num_warmup=num_warmup, num_iters=num_iters
|
|
||||||
)
|
|
||||||
|
|
||||||
# Convert ms to us and return results
|
|
||||||
triton_time_us = triton_graph_time * 1000
|
|
||||||
cutlass_time_us = cutlass_graph_time * 1000
|
|
||||||
|
|
||||||
return {
|
|
||||||
"batch_size": m,
|
|
||||||
"triton_time_us": triton_time_us,
|
|
||||||
"cutlass_time_us": cutlass_time_us,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
|
||||||
print("Benchmarking models:")
|
|
||||||
for i, model in enumerate(args.models):
|
|
||||||
print(f"[{i}] {model}")
|
|
||||||
|
|
||||||
all_results = []
|
|
||||||
|
|
||||||
for model in args.models:
|
|
||||||
for tp in args.tp_sizes:
|
|
||||||
for layer in WEIGHT_SHAPES_MOE[model]:
|
|
||||||
num_experts = layer[0]
|
|
||||||
topk = layer[1]
|
|
||||||
size_k = layer[2]
|
|
||||||
size_n = layer[3] // tp
|
|
||||||
|
|
||||||
if len(args.limit_k) > 0 and size_k not in args.limit_k:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if len(args.limit_n) > 0 and size_n not in args.limit_n:
|
|
||||||
continue
|
|
||||||
|
|
||||||
for per_act_token in args.per_act_token_opts:
|
|
||||||
for per_out_ch in args.per_out_ch_opts:
|
|
||||||
print(
|
|
||||||
f"\n=== {model}, experts={num_experts}, topk={topk},"
|
|
||||||
f"per_act={per_act_token}, per_out_ch={per_out_ch} ==="
|
|
||||||
)
|
|
||||||
|
|
||||||
config_results = []
|
|
||||||
for size_m in args.batch_sizes:
|
|
||||||
mkn = (size_m, size_k, size_n)
|
|
||||||
result = bench_run(
|
|
||||||
[], # Not used anymore
|
|
||||||
model,
|
|
||||||
num_experts,
|
|
||||||
topk,
|
|
||||||
per_act_token,
|
|
||||||
per_out_ch,
|
|
||||||
mkn,
|
|
||||||
)
|
|
||||||
if result:
|
|
||||||
config_results.append(result)
|
|
||||||
|
|
||||||
# Print results table for this configuration
|
|
||||||
if config_results:
|
|
||||||
print(
|
|
||||||
f"\n{'Batch Size':<12}"
|
|
||||||
f"{'Triton (us)':<15}"
|
|
||||||
f"{'CUTLASS (us)':<15}"
|
|
||||||
)
|
|
||||||
print("-" * 45)
|
|
||||||
for result in config_results:
|
|
||||||
print(
|
|
||||||
f"{result['batch_size']:<12}"
|
|
||||||
f"{result['triton_time_us']:<15.2f}"
|
|
||||||
f"{result['cutlass_time_us']:<15.2f}"
|
|
||||||
)
|
|
||||||
|
|
||||||
all_results.extend(config_results)
|
|
||||||
|
|
||||||
print(f"\nTotal benchmarks completed: {len(all_results)}")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = FlexibleArgumentParser(
|
|
||||||
description="""Benchmark CUTLASS FP8 MOE vs Triton FP8 FUSED MOE
|
|
||||||
across specified models/shapes/batches
|
|
||||||
|
|
||||||
Example usage:
|
|
||||||
python benchmark_cutlass_moe_fp8.py \
|
|
||||||
--model "Llama-4-Maverick-17B-128E-Instruct-FP8" \
|
|
||||||
--tp-sizes 8 \
|
|
||||||
--batch-size 2 4 8 \
|
|
||||||
--per-act-token-opts false \
|
|
||||||
--per-out-ch-opts false
|
|
||||||
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--models",
|
|
||||||
nargs="+",
|
|
||||||
type=str,
|
|
||||||
default=DEFAULT_MODELS,
|
|
||||||
choices=WEIGHT_SHAPES_MOE.keys(),
|
|
||||||
)
|
|
||||||
parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
|
|
||||||
parser.add_argument(
|
|
||||||
"--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
|
|
||||||
)
|
|
||||||
parser.add_argument("--limit-k", nargs="+", type=int, default=[])
|
|
||||||
parser.add_argument("--limit-n", nargs="+", type=int, default=[])
|
|
||||||
parser.add_argument(
|
|
||||||
"--per-act-token-opts",
|
|
||||||
nargs="+",
|
|
||||||
type=lambda x: x.lower() == "true",
|
|
||||||
default=[False, True],
|
|
||||||
help="Per-activation token quantization options (true/false)",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--per-out-ch-opts",
|
|
||||||
nargs="+",
|
|
||||||
type=lambda x: x.lower() == "true",
|
|
||||||
default=[False, True],
|
|
||||||
help="Per-output channel quantization options (true/false)",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
main(args)
|
|
||||||
@@ -7,10 +7,6 @@ Benchmark script for device communicators:
 CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
 and SymmMemCommunicator (multimem, two-shot).
-
-for NCCL symmetric memory you need to set the environment variables
-NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1, otherwise NCCL does
-not use fast NVLS implementation for all reduce.

 Usage:
     torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options]

@@ -30,13 +26,7 @@ import torch.distributed as dist
 from torch.distributed import ProcessGroup

 from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
-from vllm.distributed.device_communicators.pynccl import (
-    PyNcclCommunicator,
-    register_nccl_symmetric_ops,
-)
-from vllm.distributed.device_communicators.pynccl_allocator import (
-    set_graph_pool_id,
-)
+from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
 from vllm.logger import init_logger
 from vllm.utils import FlexibleArgumentParser
@@ -108,7 +98,6 @@ class CommunicatorBenchmark:
             )
             if not self.pynccl_comm.disabled:
                 logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
-                register_nccl_symmetric_ops(self.pynccl_comm)
             else:
                 logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
                 self.pynccl_comm = None
@@ -205,15 +194,6 @@ class CommunicatorBenchmark:
                     None,  # no env variable needed
                 )
             )
-            communicators.append(
-                (
-                    "pynccl-symm",
-                    lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
-                    lambda t: True,  # Always available if initialized
-                    nullcontext(),
-                    None,  # no env variable needed
-                )
-            )

         if self.symm_mem_comm_multimem is not None:
             comm = self.symm_mem_comm_multimem
@@ -291,9 +271,7 @@ class CommunicatorBenchmark:
         # Capture the graph using context manager
         with context:
             graph = torch.cuda.CUDAGraph()
-            graph_pool = torch.cuda.graph_pool_handle()
-            set_graph_pool_id(graph_pool)
-            with torch.cuda.graph(graph, pool=graph_pool):
+            with torch.cuda.graph(graph):
                 for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
                     allreduce_fn(graph_input)
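The last hunk above drops the explicit memory-pool plumbing around graph capture. Stock PyTorch can already share a pool between captures without vLLM's set_graph_pool_id helper; a minimal sketch of that pattern, assuming a CUDA device is available:

    import torch

    graph_input = torch.randn(1024, device="cuda")
    pool = torch.cuda.graph_pool_handle()  # one pool shared by both captures

    g1, g2 = torch.cuda.CUDAGraph(), torch.cuda.CUDAGraph()
    with torch.cuda.graph(g1, pool=pool):
        y1 = graph_input * 2
    with torch.cuda.graph(g2, pool=pool):
        y2 = graph_input + 1

    g1.replay()
    g2.replay()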
@@ -79,9 +79,9 @@ def make_rand_lora_weight_tensor(


 def make_rand_tensors(
-    a_shape: tuple[int, ...],
-    b_shape: tuple[int, ...],
-    c_shape: tuple[int, ...],
+    a_shape: tuple[int],
+    b_shape: tuple[int],
+    c_shape: tuple[int],
     a_dtype: torch.dtype,
     b_dtype: torch.dtype,
     c_dtype: torch.dtype,
@@ -243,7 +243,7 @@ class OpType(Enum):
         lora_rank: int,
         num_loras: int,
         num_slices: int,
-    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
+    ) -> tuple[tuple[int], tuple[int], tuple[int]]:
         """
         Given num_slices, return the shapes of the A, B, and C matrices
         in A x B = C, for the op_type
@@ -584,9 +584,8 @@ def main(args: argparse.Namespace):
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
     elif config.architectures[0] in (
-        "DeepseekV2ForCausalLM",
         "DeepseekV3ForCausalLM",
-        "DeepseekV32ForCausalLM",
+        "DeepseekV2ForCausalLM",
         "Glm4MoeForCausalLM",
     ):
         E = config.n_routed_experts
@ -1,174 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import random
|
|
||||||
import time
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from tabulate import tabulate
|
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
|
||||||
from vllm.logger import init_logger
|
|
||||||
from vllm.platforms import current_platform
|
|
||||||
from vllm.utils import (
|
|
||||||
STR_DTYPE_TO_TORCH_DTYPE,
|
|
||||||
FlexibleArgumentParser,
|
|
||||||
create_kv_caches_with_random,
|
|
||||||
)
|
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
@torch.inference_mode()
|
|
||||||
def run_benchmark(
|
|
||||||
num_tokens: int,
|
|
||||||
num_heads: int,
|
|
||||||
head_size: int,
|
|
||||||
block_size: int,
|
|
||||||
num_blocks: int,
|
|
||||||
dtype: torch.dtype,
|
|
||||||
kv_cache_dtype: str,
|
|
||||||
num_iters: int,
|
|
||||||
benchmark_mode: str,
|
|
||||||
device: str = "cuda",
|
|
||||||
) -> float:
|
|
||||||
"""Return latency (seconds) for given num_tokens."""
|
|
||||||
|
|
||||||
if kv_cache_dtype == "fp8" and head_size % 16:
|
|
||||||
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
|
|
||||||
|
|
||||||
current_platform.seed_everything(42)
|
|
||||||
torch.set_default_device(device)
|
|
||||||
|
|
||||||
# create random key / value tensors [T, H, D].
|
|
||||||
key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
|
|
||||||
value = torch.randn_like(key)
|
|
||||||
|
|
||||||
# prepare the slot mapping.
|
|
||||||
# each token is assigned a unique slot in the KV-cache.
|
|
||||||
num_slots = block_size * num_blocks
|
|
||||||
if num_tokens > num_slots:
|
|
||||||
raise ValueError("num_tokens cannot exceed the total number of cache slots")
|
|
||||||
slot_mapping_lst = random.sample(range(num_slots), num_tokens)
|
|
||||||
slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
|
|
||||||
|
|
||||||
key_caches, value_caches = create_kv_caches_with_random(
|
|
||||||
num_blocks,
|
|
||||||
block_size,
|
|
||||||
1, # num_layers
|
|
||||||
num_heads,
|
|
||||||
head_size,
|
|
||||||
kv_cache_dtype,
|
|
||||||
dtype,
|
|
||||||
device=device,
|
|
||||||
)
|
|
||||||
key_cache, value_cache = key_caches[0], value_caches[0]
|
|
||||||
# to free unused memory
|
|
||||||
del key_caches, value_caches
|
|
||||||
|
|
||||||
# compute per-kernel scaling factors for fp8 conversion (if used).
|
|
||||||
k_scale = (key.amax() / 64.0).to(torch.float32)
|
|
||||||
v_scale = (value.amax() / 64.0).to(torch.float32)
|
|
||||||
|
|
||||||
function_under_test = lambda: ops.reshape_and_cache(
|
|
||||||
key, # noqa: F821
|
|
||||||
value, # noqa: F821
|
|
||||||
key_cache, # noqa: F821
|
|
||||||
value_cache, # noqa: F821
|
|
||||||
slot_mapping, # noqa: F821
|
|
||||||
kv_cache_dtype,
|
|
||||||
k_scale,
|
|
||||||
v_scale,
|
|
||||||
)
|
|
||||||
|
|
||||||
if benchmark_mode == "cudagraph":
|
|
||||||
g = torch.cuda.CUDAGraph()
|
|
||||||
with torch.cuda.graph(g):
|
|
||||||
function_under_test()
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
function_under_test = lambda: g.replay()
|
|
||||||
|
|
||||||
def run_cuda_benchmark(n_iters: int) -> float:
|
|
||||||
nonlocal key, value, key_cache, value_cache, slot_mapping
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
start = time.perf_counter()
|
|
||||||
for _ in range(n_iters):
|
|
||||||
function_under_test()
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
end = time.perf_counter()
|
|
||||||
return (end - start) / n_iters
|
|
||||||
|
|
||||||
# warm-up
|
|
||||||
run_cuda_benchmark(3)
|
|
||||||
|
|
||||||
lat = run_cuda_benchmark(num_iters)
|
|
||||||
|
|
||||||
# free tensors to mitigate OOM when sweeping
|
|
||||||
del key, value, key_cache, value_cache, slot_mapping
|
|
||||||
torch.cuda.empty_cache()
|
|
||||||
|
|
||||||
return lat
|
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
|
||||||
rows = []
|
|
||||||
for exp in range(1, 17):
|
|
||||||
n_tok = 2**exp
|
|
||||||
lat = run_benchmark(
|
|
||||||
num_tokens=n_tok,
|
|
||||||
num_heads=args.num_heads,
|
|
||||||
head_size=args.head_size,
|
|
||||||
block_size=args.block_size,
|
|
||||||
num_blocks=args.num_blocks,
|
|
||||||
dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
|
|
||||||
kv_cache_dtype=args.kv_cache_dtype,
|
|
||||||
num_iters=args.iters,
|
|
||||||
benchmark_mode=args.mode,
|
|
||||||
device="cuda",
|
|
||||||
)
|
|
||||||
rows.append([n_tok, lat * 1e6]) # convert to microseconds
|
|
||||||
|
|
||||||
print(f"Benchmark results for implementation cuda (measuring with {args.mode}):")
|
|
||||||
print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f"))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = FlexibleArgumentParser()
|
|
||||||
|
|
||||||
parser.add_argument("--num-heads", type=int, default=128)
|
|
||||||
parser.add_argument(
|
|
||||||
"--head-size",
|
|
||||||
type=int,
|
|
||||||
choices=[64, 80, 96, 112, 120, 128, 192, 256],
|
|
||||||
default=128,
|
|
||||||
)
|
|
||||||
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
|
|
||||||
parser.add_argument("--num-blocks", type=int, default=128 * 128)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--dtype",
|
|
||||||
type=str,
|
|
||||||
choices=["half", "bfloat16", "float"],
|
|
||||||
default="bfloat16",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--kv-cache-dtype",
|
|
||||||
type=str,
|
|
||||||
choices=["auto", "fp8"],
|
|
||||||
default="auto",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("--iters", type=int, default=200)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--mode",
|
|
||||||
type=str,
|
|
||||||
choices=["cudagraph", "no_graph"],
|
|
||||||
default="cudagraph",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
main(args)
|
|
||||||
@@ -9,9 +9,6 @@ import torch
 from tabulate import tabulate

 from vllm import _custom_ops as ops
-from vllm.attention.ops.triton_reshape_and_cache_flash import (
-    triton_reshape_and_cache_flash,
-)
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils import (
@@ -34,8 +31,6 @@ def run_benchmark(
     kv_cache_dtype: str,
     kv_cache_layout: str,
     num_iters: int,
-    implementation: str,
-    benchmark_mode: str,
     device: str = "cuda",
 ) -> float:
     """Return latency (seconds) for given num_tokens."""
@@ -43,14 +38,6 @@ def run_benchmark(
     if kv_cache_dtype == "fp8" and head_size % 16:
         raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")

-    if implementation not in ("cuda", "triton"):
-        raise ValueError(
-            f"Unsupported implementation: {implementation}. "
-            "Only 'cuda' and 'triton' are supported."
-        )
-    if implementation == "triton" and kv_cache_layout == "HND":
-        return float("nan")  # Triton does not support HND layout yet.
-
     current_platform.seed_everything(42)
     torch.set_default_device(device)

@@ -78,49 +65,27 @@ def run_benchmark(
         cache_layout=kv_cache_layout,
     )
     key_cache, value_cache = key_caches[0], value_caches[0]
-    # to free unused memory
-    del key_caches, value_caches

     # compute per-kernel scaling factors for fp8 conversion (if used).
     k_scale = (key.amax() / 64.0).to(torch.float32)
     v_scale = (value.amax() / 64.0).to(torch.float32)

-    if implementation == "cuda":
-        function_under_test = lambda: ops.reshape_and_cache_flash(
-            key,  # noqa: F821
-            value,  # noqa: F821
-            key_cache,  # noqa: F821
-            value_cache,  # noqa: F821
-            slot_mapping,  # noqa: F821
-            kv_cache_dtype,
-            k_scale,
-            v_scale,
-        )
-    else:
-        function_under_test = lambda: triton_reshape_and_cache_flash(
-            key,  # noqa: F821
-            value,  # noqa: F821
-            key_cache,  # noqa: F821
-            value_cache,  # noqa: F821
-            slot_mapping,  # noqa: F821
-            kv_cache_dtype,
-            k_scale,
-            v_scale,
-        )
-    if benchmark_mode == "cudagraph":
-        g = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(g):
-            function_under_test()
-        torch.cuda.synchronize()
-        function_under_test = lambda: g.replay()

     def run_cuda_benchmark(n_iters: int) -> float:
         nonlocal key, value, key_cache, value_cache, slot_mapping
         torch.cuda.synchronize()
         start = time.perf_counter()
         for _ in range(n_iters):
-            function_under_test()
+            ops.reshape_and_cache_flash(
+                key,
+                value,
+                key_cache,
+                value_cache,
+                slot_mapping,
+                kv_cache_dtype,
+                k_scale,
+                v_scale,
+            )
         torch.cuda.synchronize()
         end = time.perf_counter()
         return (end - start) / n_iters
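The dropped "cudagraph" mode captured the kernel launch once and then timed only graph replays, which removes Python and launch overhead from the measurement. A generic sketch of that pattern (not tied to reshape_and_cache_flash; fn is any callable that issues CUDA work):

    import time
    import torch

    def time_with_cuda_graph(fn, iters: int = 100) -> float:
        g = torch.cuda.CUDAGraph()
        with torch.cuda.graph(g):  # capture one invocation
            fn()
        torch.cuda.synchronize()

        start = time.perf_counter()
        for _ in range(iters):
            g.replay()  # replay the captured work
        torch.cuda.synchronize()
        return (time.perf_counter() - start) / iters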
@@ -151,16 +116,10 @@ def main(args):
                 kv_cache_dtype=args.kv_cache_dtype,
                 kv_cache_layout=layout,
                 num_iters=args.iters,
-                implementation=args.implementation,
-                benchmark_mode=args.mode,
                 device="cuda",
             )
             rows.append([n_tok, layout, f"{lat * 1e6:.3f}"])

-    print(
-        f"Benchmark results for implementation {args.implementation}"
-        f" (measuring with {args.mode}):"
-    )
     print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"]))


@@ -192,21 +151,6 @@ if __name__ == "__main__":
     )

     parser.add_argument("--iters", type=int, default=100)
-
-    parser.add_argument(
-        "--implementation",
-        type=str,
-        choices=["cuda", "triton"],
-        default="cuda",
-    )
-
-    parser.add_argument(
-        "--mode",
-        type=str,
-        choices=["cudagraph", "no_graph"],
-        default="cudagraph",
-    )

     args = parser.parse_args()

     main(args)
@@ -11,13 +11,13 @@ from datetime import datetime
 from typing import Any

 import torch
+import triton
 from tqdm import tqdm

 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     _w8a8_block_fp8_matmul,
 )
 from vllm.platforms import current_platform
-from vllm.triton_utils import triton
 from vllm.utils import FlexibleArgumentParser

 mp.set_start_method("spawn", force=True)
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# fmt: off
 # ruff: noqa: E501
 import time

@@ -7,33 +8,27 @@ import torch

 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    get_col_major_tma_aligned_tensor,
     per_token_group_quant_fp8,
-    w8a8_triton_block_scaled_mm,
+    w8a8_block_fp8_matmul,
 )
 from vllm.triton_utils import triton
-from vllm.utils.deep_gemm import (
-    calc_diff,
-    fp8_gemm_nt,
-    get_col_major_tma_aligned_tensor,
-    per_block_cast_to_fp8,
-)
+from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8


-def benchmark_shape(
-    m: int,
-    n: int,
-    k: int,
-    warmup: int = 100,
-    repeat: int = 10000,
-    verbose: bool = False,
-) -> dict:
+def benchmark_shape(m: int,
+                    n: int,
+                    k: int,
+                    warmup: int = 100,
+                    repeat: int = 10000,
+                    verbose: bool = False) -> dict:
     """Benchmark all implementations for a specific (m, n, k) shape."""
     if verbose:
         print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")

     # Create test tensors
-    A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
-    B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
+    A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16)
+    B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16)

     # Reference result in BF16
     torch.cuda.synchronize()
@@ -50,39 +45,34 @@ def benchmark_shape(
     # Pre-quantize A for all implementations
     A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
     A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
-    C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
+    C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
     A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
     A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
-        A, block_size[1], column_major_scales=True
-    )
+        A, block_size[1], column_major_scales=True)

     # === DeepGEMM Implementation ===
     def deepgemm_gemm():
-        fp8_gemm_nt(
-            (A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm
-        )
+        fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
+                    (B_deepgemm, B_scale_deepgemm),
+                    C_deepgemm)
         return C_deepgemm

     # === vLLM Triton Implementation ===
     def vllm_triton_gemm():
-        return w8a8_triton_block_scaled_mm(
-            A_vllm,
-            B_vllm,
-            A_scale_vllm,
-            B_scale_vllm,
-            block_size,
-            output_dtype=torch.bfloat16,
-        )
+        return w8a8_block_fp8_matmul(A_vllm,
+                                     B_vllm,
+                                     A_scale_vllm,
+                                     B_scale_vllm,
+                                     block_size,
+                                     output_dtype=torch.bfloat16)

     # === vLLM CUTLASS Implementation ===
     def vllm_cutlass_gemm():
-        return ops.cutlass_scaled_mm(
-            A_vllm_cutlass,
-            B_vllm.T,
-            scale_a=A_scale_vllm_cutlass,
-            scale_b=B_scale_vllm.T,
-            out_dtype=torch.bfloat16,
-        )
+        return ops.cutlass_scaled_mm(A_vllm_cutlass,
+                                     B_vllm.T,
+                                     scale_a=A_scale_vllm_cutlass,
+                                     scale_b=B_scale_vllm.T,
+                                     out_dtype=torch.bfloat16)

     # Run correctness check first
     if verbose:
@@ -99,23 +89,26 @@ def benchmark_shape(
         print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
         print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
         print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
-        print(
-            "vLLM Triton vs DeepGEMM difference: "
-            f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}"
-        )
-        print(
-            "vLLM CUTLASS vs DeepGEMM difference: "
-            f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}"
-        )
+        print("vLLM Triton vs DeepGEMM difference: "
+              f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}")
+        print("vLLM CUTLASS vs DeepGEMM difference: "
+              f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}")

     # Benchmark implementations
     implementations = {
         "DeepGEMM": deepgemm_gemm,
         "vLLM Triton": vllm_triton_gemm,
-        "vLLM CUTLASS": vllm_cutlass_gemm,
+        "vLLM CUTLASS": vllm_cutlass_gemm
     }

-    benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}}
+    benchmark_results = {
+        "shape": {
+            "m": m,
+            "n": n,
+            "k": k
+        },
+        "implementations": {}
+    }

     for name, func in implementations.items():
         # Warmup
@@ -143,36 +136,38 @@ def benchmark_shape(
             "tflops": tflops,
             "gb_s": gb_s,
             "diff": {
-                "DeepGEMM": 0.0
-                if name == "DeepGEMM"
-                else calc_diff(func(), C_deepgemm),
-                "Reference": deepgemm_diff
-                if name == "DeepGEMM"
-                else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff),
-            },
+                "DeepGEMM":
+                0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm),
+                "Reference":
+                deepgemm_diff if name == "DeepGEMM" else
+                (vllm_triton_diff
+                 if name == "vLLM Triton" else vllm_cutlass_diff)
+            }
         }

         if verbose:
-            print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s")
+            print(
+                f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s"
+            )

     # Calculate speedups
     baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
     for name, data in benchmark_results["implementations"].items():
|
||||||
if name != "DeepGEMM":
|
if name != "DeepGEMM":
|
||||||
speedup = baseline / data["time_ms"]
|
speedup = baseline / data["time_ms"]
|
||||||
benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup
|
benchmark_results["implementations"][name][
|
||||||
|
"speedup_vs_deepgemm"] = speedup
|
||||||
if verbose:
|
if verbose:
|
||||||
print(
|
print(f"DeepGEMM is {1/speedup:.2f}x "
|
||||||
f"DeepGEMM is {1 / speedup:.2f}x "
|
f"{'faster' if 1/speedup > 1 else 'slower'} than {name}")
|
||||||
f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}"
|
|
||||||
)
|
|
||||||
|
|
||||||
vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"]
|
vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][
|
||||||
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"]
|
"time_ms"]
|
||||||
|
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][
|
||||||
|
"time_ms"]
|
||||||
cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
|
cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
|
||||||
benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = (
|
benchmark_results["implementations"]["vLLM CUTLASS"][
|
||||||
cutlass_vs_triton
|
"speedup_vs_triton"] = cutlass_vs_triton
|
||||||
)
|
|
||||||
if verbose:
|
if verbose:
|
||||||
print(
|
print(
|
||||||
f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
|
f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
|
||||||
@ -184,7 +179,8 @@ def benchmark_shape(
|
|||||||
|
|
||||||
def format_table_row(values, widths):
|
def format_table_row(values, widths):
|
||||||
"""Format a row with specified column widths."""
|
"""Format a row with specified column widths."""
|
||||||
return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |"
|
return "| " + " | ".join(f"{val:{w}}"
|
||||||
|
for val, w in zip(values, widths)) + " |"
|
||||||
|
|
||||||
|
|
||||||
def print_table(headers, rows, title=None):
|
def print_table(headers, rows, title=None):
|
||||||
@ -292,50 +288,38 @@ def run_benchmarks(verbose: bool = False):
|
|||||||
for result in all_results:
|
for result in all_results:
|
||||||
shape = result["shape"]
|
shape = result["shape"]
|
||||||
impl_data = result["implementations"]["DeepGEMM"]
|
impl_data = result["implementations"]["DeepGEMM"]
|
||||||
deepgemm_rows.append(
|
deepgemm_rows.append([
|
||||||
[
|
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
|
||||||
shape["m"],
|
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}"
|
||||||
shape["n"],
|
])
|
||||||
shape["k"],
|
|
||||||
f"{impl_data['time_us']:.1f}",
|
|
||||||
f"{impl_data['tflops']:.1f}",
|
|
||||||
f"{impl_data['gb_s']:.1f}",
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:")
|
print_table(deepgemm_headers,
|
||||||
|
deepgemm_rows,
|
||||||
|
title="DeepGEMM Implementation:")
|
||||||
|
|
||||||
# Print vLLM Triton table
|
# Print vLLM Triton table
|
||||||
triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"]
|
triton_headers = [
|
||||||
|
"m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"
|
||||||
|
]
|
||||||
triton_rows = []
|
triton_rows = []
|
||||||
for result in all_results:
|
for result in all_results:
|
||||||
shape = result["shape"]
|
shape = result["shape"]
|
||||||
impl_data = result["implementations"]["vLLM Triton"]
|
impl_data = result["implementations"]["vLLM Triton"]
|
||||||
speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
|
speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
|
||||||
triton_rows.append(
|
triton_rows.append([
|
||||||
[
|
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
|
||||||
shape["m"],
|
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
|
||||||
shape["n"],
|
format_speedup(speedup)
|
||||||
shape["k"],
|
])
|
||||||
f"{impl_data['time_us']:.1f}",
|
|
||||||
f"{impl_data['tflops']:.1f}",
|
|
||||||
f"{impl_data['gb_s']:.1f}",
|
|
||||||
format_speedup(speedup),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:")
|
print_table(triton_headers,
|
||||||
|
triton_rows,
|
||||||
|
title="vLLM Triton Implementation:")
|
||||||
|
|
||||||
# Print vLLM CUTLASS table
|
# Print vLLM CUTLASS table
|
||||||
cutlass_headers = [
|
cutlass_headers = [
|
||||||
"m",
|
"m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM",
|
||||||
"n",
|
"vs Triton"
|
||||||
"k",
|
|
||||||
"Time (μs)",
|
|
||||||
"TFLOPS",
|
|
||||||
"GB/s",
|
|
||||||
"vs DeepGEMM",
|
|
||||||
"vs Triton",
|
|
||||||
]
|
]
|
||||||
cutlass_rows = []
|
cutlass_rows = []
|
||||||
for result in all_results:
|
for result in all_results:
|
||||||
@ -343,27 +327,28 @@ def run_benchmarks(verbose: bool = False):
|
|||||||
impl_data = result["implementations"]["vLLM CUTLASS"]
|
impl_data = result["implementations"]["vLLM CUTLASS"]
|
||||||
vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
|
vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
|
||||||
vs_triton = impl_data.get("speedup_vs_triton", 1.0)
|
vs_triton = impl_data.get("speedup_vs_triton", 1.0)
|
||||||
cutlass_rows.append(
|
cutlass_rows.append([
|
||||||
[
|
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
|
||||||
shape["m"],
|
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
|
||||||
shape["n"],
|
format_speedup(vs_deepgemm),
|
||||||
shape["k"],
|
format_speedup(vs_triton)
|
||||||
f"{impl_data['time_us']:.1f}",
|
])
|
||||||
f"{impl_data['tflops']:.1f}",
|
|
||||||
f"{impl_data['gb_s']:.1f}",
|
|
||||||
format_speedup(vs_deepgemm),
|
|
||||||
format_speedup(vs_triton),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:")
|
print_table(cutlass_headers,
|
||||||
|
cutlass_rows,
|
||||||
|
title="vLLM CUTLASS Implementation:")
|
||||||
|
|
||||||
# Calculate and print averages
|
# Calculate and print averages
|
||||||
print("\n===== AVERAGE PERFORMANCE =====")
|
print("\n===== AVERAGE PERFORMANCE =====")
|
||||||
|
|
||||||
implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
|
implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
|
||||||
avg_metrics = {
|
avg_metrics = {
|
||||||
impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations
|
impl: {
|
||||||
|
"tflops": 0,
|
||||||
|
"gb_s": 0,
|
||||||
|
"time_ms": 0
|
||||||
|
}
|
||||||
|
for impl in implementations
|
||||||
}
|
}
|
||||||
|
|
||||||
for result in all_results:
|
for result in all_results:
|
||||||
@ -381,9 +366,9 @@ def run_benchmarks(verbose: bool = False):
|
|||||||
avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
|
avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
|
||||||
avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
|
avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
|
||||||
avg_time = avg_metrics[impl]["time_ms"] / num_shapes
|
avg_time = avg_metrics[impl]["time_ms"] / num_shapes
|
||||||
avg_rows.append(
|
avg_rows.append([
|
||||||
[impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"]
|
impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"
|
||||||
)
|
])
|
||||||
|
|
||||||
print_table(avg_headers, avg_rows)
|
print_table(avg_headers, avg_rows)
|
||||||
|
|
||||||
@ -391,19 +376,21 @@ def run_benchmarks(verbose: bool = False):
|
|||||||
avg_speedups = {
|
avg_speedups = {
|
||||||
"DeepGEMM vs vLLM Triton": 0,
|
"DeepGEMM vs vLLM Triton": 0,
|
||||||
"DeepGEMM vs vLLM CUTLASS": 0,
|
"DeepGEMM vs vLLM CUTLASS": 0,
|
||||||
"vLLM CUTLASS vs vLLM Triton": 0,
|
"vLLM CUTLASS vs vLLM Triton": 0
|
||||||
}
|
}
|
||||||
|
|
||||||
for result in all_results:
|
for result in all_results:
|
||||||
deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
|
deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
|
||||||
vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
|
vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
|
||||||
vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"]
|
vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][
|
||||||
|
"time_ms"]
|
||||||
|
|
||||||
avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
|
avg_speedups[
|
||||||
avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
|
"DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
|
||||||
avg_speedups["vLLM CUTLASS vs vLLM Triton"] += (
|
avg_speedups[
|
||||||
vllm_triton_time / vllm_cutlass_time
|
"DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
|
||||||
)
|
avg_speedups[
|
||||||
|
"vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time
|
||||||
|
|
||||||
print("\n===== AVERAGE SPEEDUPS =====")
|
print("\n===== AVERAGE SPEEDUPS =====")
|
||||||
speedup_headers = ["Comparison", "Speedup"]
|
speedup_headers = ["Comparison", "Speedup"]
|
||||||
@ -421,7 +408,8 @@ def run_benchmarks(verbose: bool = False):
|
|||||||
|
|
||||||
for result in all_results:
|
for result in all_results:
|
||||||
for impl in implementations:
|
for impl in implementations:
|
||||||
avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"]
|
avg_diff[impl] += result["implementations"][impl]["diff"][
|
||||||
|
"Reference"]
|
||||||
|
|
||||||
diff_headers = ["Implementation", "Avg Diff vs Reference"]
|
diff_headers = ["Implementation", "Avg Diff vs Reference"]
|
||||||
diff_rows = []
|
diff_rows = []
|
||||||
|
|||||||
49
benchmarks/pyproject.toml
Normal file
49
benchmarks/pyproject.toml
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
# This local pyproject file is part of the migration from yapf to ruff format.
|
||||||
|
# It uses the same core rules as the main pyproject.toml file, but with the
|
||||||
|
# following differences:
|
||||||
|
# - ruff line length is overridden to 88
|
||||||
|
# - deprecated typing ignores (UP006, UP035) have been removed
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
line-length = 88
|
||||||
|
|
||||||
|
[tool.ruff.lint.per-file-ignores]
|
||||||
|
"vllm/third_party/**" = ["ALL"]
|
||||||
|
"vllm/version.py" = ["F401"]
|
||||||
|
"vllm/_version.py" = ["ALL"]
|
||||||
|
|
||||||
|
[tool.ruff.lint]
|
||||||
|
select = [
|
||||||
|
# pycodestyle
|
||||||
|
"E",
|
||||||
|
# Pyflakes
|
||||||
|
"F",
|
||||||
|
# pyupgrade
|
||||||
|
"UP",
|
||||||
|
# flake8-bugbear
|
||||||
|
"B",
|
||||||
|
# flake8-simplify
|
||||||
|
"SIM",
|
||||||
|
# isort
|
||||||
|
"I",
|
||||||
|
# flake8-logging-format
|
||||||
|
"G",
|
||||||
|
]
|
||||||
|
ignore = [
|
||||||
|
# star imports
|
||||||
|
"F405", "F403",
|
||||||
|
# lambda expression assignment
|
||||||
|
"E731",
|
||||||
|
# Loop control variable not used within loop body
|
||||||
|
"B007",
|
||||||
|
# f-string format
|
||||||
|
"UP032",
|
||||||
|
# Can remove once 3.10+ is the minimum Python version
|
||||||
|
"UP007",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.ruff.lint.isort]
|
||||||
|
known-first-party = ["vllm"]
|
||||||
|
|
||||||
|
[tool.ruff.format]
|
||||||
|
docstring-code-format = true
|
||||||
@ -101,7 +101,6 @@ else()
|
|||||||
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
|
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
|
||||||
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
|
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
|
||||||
find_isa(${CPUINFO} "S390" S390_FOUND)
|
find_isa(${CPUINFO} "S390" S390_FOUND)
|
||||||
find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||||
@ -178,14 +177,8 @@ elseif (S390_FOUND)
|
|||||||
"-mzvector"
|
"-mzvector"
|
||||||
"-march=native"
|
"-march=native"
|
||||||
"-mtune=native")
|
"-mtune=native")
|
||||||
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
|
|
||||||
if(RVV_FOUND)
|
|
||||||
message(FAIL_ERROR "Can't support rvv now.")
|
|
||||||
else()
|
|
||||||
list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
|
|
||||||
endif()
|
|
||||||
else()
|
else()
|
||||||
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
|
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
#
|
#
|
||||||
@ -213,7 +206,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
|
|||||||
endif()
|
endif()
|
||||||
set(ONEDNN_AARCH64_USE_ACL "ON")
|
set(ONEDNN_AARCH64_USE_ACL "ON")
|
||||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
|
||||||
add_compile_definitions(VLLM_USE_ACL)
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(ONEDNN_LIBRARY_TYPE "STATIC")
|
set(ONEDNN_LIBRARY_TYPE "STATIC")
|
||||||
@ -227,7 +219,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
|
|||||||
set(ONEDNN_ENABLE_ITT_TASKS "OFF")
|
set(ONEDNN_ENABLE_ITT_TASKS "OFF")
|
||||||
set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
|
set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
|
||||||
set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
|
set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
|
||||||
set(ONEDNN_VERBOSE "ON")
|
set(ONEDNN_VERBOSE "OFF")
|
||||||
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
|
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
|
||||||
|
|
||||||
FetchContent_MakeAvailable(oneDNN)
|
FetchContent_MakeAvailable(oneDNN)
|
||||||
@ -266,8 +258,7 @@ set(VLLM_EXT_SRC
|
|||||||
"csrc/cpu/layernorm.cpp"
|
"csrc/cpu/layernorm.cpp"
|
||||||
"csrc/cpu/mla_decode.cpp"
|
"csrc/cpu/mla_decode.cpp"
|
||||||
"csrc/cpu/pos_encoding.cpp"
|
"csrc/cpu/pos_encoding.cpp"
|
||||||
"csrc/cpu/torch_bindings.cpp"
|
"csrc/cpu/torch_bindings.cpp")
|
||||||
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
|
|
||||||
|
|
||||||
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||||
set(VLLM_EXT_SRC
|
set(VLLM_EXT_SRC
|
||||||
|
|||||||
@ -18,8 +18,8 @@ if(FLASH_MLA_SRC_DIR)
|
|||||||
else()
|
else()
|
||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
flashmla
|
flashmla
|
||||||
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
|
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
|
||||||
GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
|
GIT_TAG a757314c04eedd166e329e846c820eb1bdd702de
|
||||||
GIT_PROGRESS TRUE
|
GIT_PROGRESS TRUE
|
||||||
CONFIGURE_COMMAND ""
|
CONFIGURE_COMMAND ""
|
||||||
BUILD_COMMAND ""
|
BUILD_COMMAND ""
|
||||||
@ -33,64 +33,23 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
|
|||||||
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
|
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
|
||||||
# Only build FlashMLA kernels if we are building for something compatible with
|
# Only build FlashMLA kernels if we are building for something compatible with
|
||||||
# sm90a
|
# sm90a
|
||||||
|
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||||
set(SUPPORT_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
|
|
||||||
list(APPEND SUPPORT_ARCHS 9.0a)
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
|
|
||||||
list(APPEND SUPPORT_ARCHS 10.0a)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
|
|
||||||
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
|
|
||||||
if(FLASH_MLA_ARCHS)
|
|
||||||
set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
|
|
||||||
list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
|
|
||||||
|
|
||||||
set(FlashMLA_SOURCES
|
set(FlashMLA_SOURCES
|
||||||
${flashmla_SOURCE_DIR}/csrc/torch_api.cpp
|
${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
|
||||||
${flashmla_SOURCE_DIR}/csrc/pybind.cpp
|
${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu
|
||||||
${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu
|
${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu
|
||||||
${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu
|
${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu
|
||||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu
|
${flashmla_SOURCE_DIR}/csrc/kernels_fp8/flash_fwd_mla_fp8_sm90.cu)
|
||||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu
|
|
||||||
)
|
|
||||||
|
|
||||||
set(FlashMLA_Extension_SOURCES
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
|
|
||||||
)
|
|
||||||
|
|
||||||
set(FlashMLA_INCLUDES
|
set(FlashMLA_INCLUDES
|
||||||
${flashmla_SOURCE_DIR}/csrc
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/sm90
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/cutlass/include
|
${flashmla_SOURCE_DIR}/csrc/cutlass/include
|
||||||
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
|
${flashmla_SOURCE_DIR}/csrc)
|
||||||
)
|
|
||||||
|
|
||||||
set(FlashMLA_Extension_INCLUDES
|
|
||||||
${flashmla_SOURCE_DIR}/csrc
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/sm90
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/cutlass/include
|
|
||||||
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
|
|
||||||
)
|
|
||||||
|
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${FlashMLA_SOURCES}"
|
SRCS "${FlashMLA_SOURCES}"
|
||||||
CUDA_ARCHS "${FLASH_MLA_ARCHS}")
|
CUDA_ARCHS "${FLASH_MLA_ARCHS}")
|
||||||
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${FlashMLA_Extension_SOURCES}"
|
|
||||||
CUDA_ARCHS "${FLASH_MLA_ARCHS}")
|
|
||||||
|
|
||||||
define_gpu_extension_target(
|
define_gpu_extension_target(
|
||||||
_flashmla_C
|
_flashmla_C
|
||||||
DESTINATION vllm
|
DESTINATION vllm
|
||||||
@ -101,32 +60,8 @@ if(FLASH_MLA_ARCHS)
|
|||||||
INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
|
INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
|
||||||
USE_SABI 3
|
USE_SABI 3
|
||||||
WITH_SOABI)
|
WITH_SOABI)
|
||||||
|
|
||||||
# Keep Stable ABI for the module, but *not* for CUDA/C++ files.
|
|
||||||
# This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
|
|
||||||
target_compile_options(_flashmla_C PRIVATE
|
|
||||||
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
|
|
||||||
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
|
|
||||||
|
|
||||||
define_gpu_extension_target(
|
|
||||||
_flashmla_extension_C
|
|
||||||
DESTINATION vllm
|
|
||||||
LANGUAGE ${VLLM_GPU_LANG}
|
|
||||||
SOURCES ${FlashMLA_Extension_SOURCES}
|
|
||||||
COMPILE_FLAGS ${VLLM_FLASHMLA_GPU_FLAGS}
|
|
||||||
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
|
||||||
INCLUDE_DIRECTORIES ${FlashMLA_Extension_INCLUDES}
|
|
||||||
USE_SABI 3
|
|
||||||
WITH_SOABI)
|
|
||||||
|
|
||||||
# Keep Stable ABI for the module, but *not* for CUDA/C++ files.
|
|
||||||
# This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
|
|
||||||
target_compile_options(_flashmla_extension_C PRIVATE
|
|
||||||
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
|
|
||||||
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
|
|
||||||
else()
|
else()
|
||||||
# Create empty targets for setup.py when not targeting sm90a systems
|
# Create an empty target for setup.py when not targeting sm90a systems
|
||||||
add_custom_target(_flashmla_C)
|
add_custom_target(_flashmla_C)
|
||||||
add_custom_target(_flashmla_extension_C)
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|||||||
@ -38,7 +38,7 @@ else()
|
|||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
vllm-flash-attn
|
vllm-flash-attn
|
||||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||||
GIT_TAG 4695e6bed5366c41e28c06cd86170166e4f43d00
|
GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
|
||||||
GIT_PROGRESS TRUE
|
GIT_PROGRESS TRUE
|
||||||
# Don't share the vllm-flash-attn build between build types
|
# Don't share the vllm-flash-attn build between build types
|
||||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||||
|
|||||||
@ -16,7 +16,7 @@ import shutil
|
|||||||
|
|
||||||
from torch.utils.hipify.hipify_python import hipify
|
from torch.utils.hipify.hipify_python import hipify
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
|
|
||||||
# Project directory where all the source + include files live.
|
# Project directory where all the source + include files live.
|
||||||
@ -34,14 +34,15 @@ if __name__ == "__main__":
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Source files to convert.
|
# Source files to convert.
|
||||||
parser.add_argument(
|
parser.add_argument("sources",
|
||||||
"sources", help="Source files to hipify.", nargs="*", default=[]
|
help="Source files to hipify.",
|
||||||
)
|
nargs="*",
|
||||||
|
default=[])
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Limit include scope to project_dir only
|
# Limit include scope to project_dir only
|
||||||
includes = [os.path.join(args.project_dir, "*")]
|
includes = [os.path.join(args.project_dir, '*')]
|
||||||
|
|
||||||
# Get absolute path for all source files.
|
# Get absolute path for all source files.
|
||||||
extra_files = [os.path.abspath(s) for s in args.sources]
|
extra_files = [os.path.abspath(s) for s in args.sources]
|
||||||
@ -50,31 +51,25 @@ if __name__ == "__main__":
|
|||||||
# The directory might already exist to hold object files so we ignore that.
|
# The directory might already exist to hold object files so we ignore that.
|
||||||
shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
|
shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
|
||||||
|
|
||||||
hipify_result = hipify(
|
hipify_result = hipify(project_directory=args.project_dir,
|
||||||
project_directory=args.project_dir,
|
output_directory=args.output_dir,
|
||||||
output_directory=args.output_dir,
|
header_include_dirs=[],
|
||||||
header_include_dirs=[],
|
includes=includes,
|
||||||
includes=includes,
|
extra_files=extra_files,
|
||||||
extra_files=extra_files,
|
show_detailed=True,
|
||||||
show_detailed=True,
|
is_pytorch_extension=True,
|
||||||
is_pytorch_extension=True,
|
hipify_extra_files_only=True)
|
||||||
hipify_extra_files_only=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
hipified_sources = []
|
hipified_sources = []
|
||||||
for source in args.sources:
|
for source in args.sources:
|
||||||
s_abs = os.path.abspath(source)
|
s_abs = os.path.abspath(source)
|
||||||
hipified_s_abs = (
|
hipified_s_abs = (hipify_result[s_abs].hipified_path if
|
||||||
hipify_result[s_abs].hipified_path
|
(s_abs in hipify_result
|
||||||
if (
|
and hipify_result[s_abs].hipified_path is not None)
|
||||||
s_abs in hipify_result
|
else s_abs)
|
||||||
and hipify_result[s_abs].hipified_path is not None
|
|
||||||
)
|
|
||||||
else s_abs
|
|
||||||
)
|
|
||||||
hipified_sources.append(hipified_s_abs)
|
hipified_sources.append(hipified_s_abs)
|
||||||
|
|
||||||
assert len(hipified_sources) == len(args.sources)
|
assert (len(hipified_sources) == len(args.sources))
|
||||||
|
|
||||||
# Print hipified source files.
|
# Print hipified source files.
|
||||||
print("\n".join(hipified_sources))
|
print("\n".join(hipified_sources))
|
||||||
|
|||||||
@ -310,13 +310,13 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
|
|||||||
list(REMOVE_DUPLICATES _PTX_ARCHS)
|
list(REMOVE_DUPLICATES _PTX_ARCHS)
|
||||||
list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)
|
list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)
|
||||||
|
|
||||||
# If x.0a or x.0f is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
|
# if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
|
||||||
# remove x.0a or x.0f from SRC_CUDA_ARCHS and add x.0a or x.0f to _CUDA_ARCHS
|
# remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
|
||||||
set(_CUDA_ARCHS)
|
set(_CUDA_ARCHS)
|
||||||
foreach(_arch ${_SRC_CUDA_ARCHS})
|
foreach(_arch ${_SRC_CUDA_ARCHS})
|
||||||
if(_arch MATCHES "[af]$")
|
if(_arch MATCHES "\\a$")
|
||||||
list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
|
list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
|
||||||
string(REGEX REPLACE "[af]$" "" _base "${_arch}")
|
string(REPLACE "a" "" _base "${_arch}")
|
||||||
if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
|
if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
|
||||||
list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
|
list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
|
||||||
list(APPEND _CUDA_ARCHS "${_arch}")
|
list(APPEND _CUDA_ARCHS "${_arch}")
|
||||||
|
|||||||
@ -135,10 +135,10 @@ public:
|
|||||||
max_splits = min(16, max_splits);
|
max_splits = min(16, max_splits);
|
||||||
|
|
||||||
// TODO: This avoids a hang when the batch size larger than 1 and
|
// TODO: This avoids a hang when the batch size larger than 1 and
|
||||||
// there is more than 1 kv_splits.
|
// there is more than 4 kv_splits.
|
||||||
// Discuss with NVIDIA how this can be fixed.
|
// Discuss with NVIDIA how this can be fixed.
|
||||||
if (B > 1) {
|
if (B > 1) {
|
||||||
max_splits = min(1, max_splits);
|
max_splits = min(2, max_splits);
|
||||||
}
|
}
|
||||||
|
|
||||||
// printf(" max_splits = %d\n", max_splits);
|
// printf(" max_splits = %d\n", max_splits);
|
||||||
|
|||||||
@ -580,22 +580,22 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
||||||
auto blk_coord = tile_scheduler.get_block_coord();
|
auto blk_coord = tile_scheduler.get_block_coord();
|
||||||
auto problem_shape = params.problem_shape;
|
auto problem_shape = params.problem_shape;
|
||||||
auto local_split_kv = params.split_kv;
|
auto local_split_kv = params.split_kv;
|
||||||
if (params.mainloop.ptr_seq != nullptr) {
|
if (params.mainloop.ptr_seq != nullptr) {
|
||||||
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
||||||
if (params.ptr_split_kv != nullptr) {
|
if (params.ptr_split_kv != nullptr) {
|
||||||
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (local_split_kv <= get<3>(blk_coord))
|
if (local_split_kv <= get<3>(blk_coord))
|
||||||
continue;
|
continue;
|
||||||
load_page_table(
|
load_page_table(
|
||||||
blk_coord,
|
blk_coord,
|
||||||
problem_shape,
|
problem_shape,
|
||||||
params.mainloop,
|
params.mainloop,
|
||||||
shared_storage.tensors,
|
shared_storage.tensors,
|
||||||
pipeline_page_table, pipeline_pt_producer_state,
|
pipeline_page_table, pipeline_pt_producer_state,
|
||||||
local_split_kv
|
local_split_kv
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -604,15 +604,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
CUTLASS_PRAGMA_NO_UNROLL
|
CUTLASS_PRAGMA_NO_UNROLL
|
||||||
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
||||||
auto blk_coord = tile_scheduler.get_block_coord();
|
auto blk_coord = tile_scheduler.get_block_coord();
|
||||||
auto problem_shape = params.problem_shape;
|
auto problem_shape = params.problem_shape;
|
||||||
auto local_split_kv = params.split_kv;
|
auto local_split_kv = params.split_kv;
|
||||||
if (params.mainloop.ptr_seq != nullptr) {
|
if (params.mainloop.ptr_seq != nullptr) {
|
||||||
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
||||||
if (params.ptr_split_kv != nullptr) {
|
if (params.ptr_split_kv != nullptr) {
|
||||||
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (local_split_kv <= get<3>(blk_coord))
|
if (local_split_kv <= get<3>(blk_coord))
|
||||||
continue;
|
continue;
|
||||||
load_cpasync(
|
load_cpasync(
|
||||||
blk_coord,
|
blk_coord,
|
||||||
@ -621,7 +621,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
params.mainloop_params,
|
params.mainloop_params,
|
||||||
shared_storage.tensors,
|
shared_storage.tensors,
|
||||||
pipeline_load_qk, pipeline_load_qk_producer_state,
|
pipeline_load_qk, pipeline_load_qk_producer_state,
|
||||||
local_split_kv,
|
local_split_kv,
|
||||||
/* must be shared pipe */
|
/* must be shared pipe */
|
||||||
pipeline_page_table, pipeline_pt_consumer_state
|
pipeline_page_table, pipeline_pt_consumer_state
|
||||||
);
|
);
|
||||||
@ -633,15 +633,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
CUTLASS_PRAGMA_NO_UNROLL
|
CUTLASS_PRAGMA_NO_UNROLL
|
||||||
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
||||||
auto blk_coord = tile_scheduler.get_block_coord();
|
auto blk_coord = tile_scheduler.get_block_coord();
|
||||||
auto problem_shape = params.problem_shape;
|
auto problem_shape = params.problem_shape;
|
||||||
auto local_split_kv = params.split_kv;
|
auto local_split_kv = params.split_kv;
|
||||||
if (params.mainloop.ptr_seq != nullptr) {
|
if (params.mainloop.ptr_seq != nullptr) {
|
||||||
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
||||||
if (params.ptr_split_kv != nullptr) {
|
if (params.ptr_split_kv != nullptr) {
|
||||||
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (local_split_kv <= get<3>(blk_coord))
|
if (local_split_kv <= get<3>(blk_coord))
|
||||||
continue;
|
continue;
|
||||||
load_tma</* paged= */ true>(
|
load_tma</* paged= */ true>(
|
||||||
blk_coord,
|
blk_coord,
|
||||||
@ -651,7 +651,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
shared_storage.tensors,
|
shared_storage.tensors,
|
||||||
pipeline_load_qk, pipeline_load_qk_producer_state,
|
pipeline_load_qk, pipeline_load_qk_producer_state,
|
||||||
pipeline_load_qk, pipeline_load_qk_producer_state,
|
pipeline_load_qk, pipeline_load_qk_producer_state,
|
||||||
local_split_kv
|
local_split_kv
|
||||||
);
|
);
|
||||||
cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
|
cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
|
||||||
}
|
}
|
||||||
@ -660,15 +660,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
CUTLASS_PRAGMA_NO_UNROLL
|
CUTLASS_PRAGMA_NO_UNROLL
|
||||||
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
||||||
auto blk_coord = tile_scheduler.get_block_coord();
|
auto blk_coord = tile_scheduler.get_block_coord();
|
||||||
auto problem_shape = params.problem_shape;
|
auto problem_shape = params.problem_shape;
|
||||||
auto local_split_kv = params.split_kv;
|
auto local_split_kv = params.split_kv;
|
||||||
if (params.mainloop.ptr_seq != nullptr) {
|
if (params.mainloop.ptr_seq != nullptr) {
|
||||||
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
||||||
if (params.ptr_split_kv != nullptr) {
|
if (params.ptr_split_kv != nullptr) {
|
||||||
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (local_split_kv <= get<3>(blk_coord))
|
if (local_split_kv <= get<3>(blk_coord))
|
||||||
continue;
|
continue;
|
||||||
load_tma<false>(
|
load_tma<false>(
|
||||||
blk_coord,
|
blk_coord,
|
||||||
@ -678,7 +678,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
shared_storage.tensors,
|
shared_storage.tensors,
|
||||||
pipeline_load_qk, pipeline_load_qk_producer_state,
|
pipeline_load_qk, pipeline_load_qk_producer_state,
|
||||||
pipeline_load_qk, pipeline_load_qk_producer_state,
|
pipeline_load_qk, pipeline_load_qk_producer_state,
|
||||||
local_split_kv
|
local_split_kv
|
||||||
);
|
);
|
||||||
cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
|
cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
|
||||||
}
|
}
|
||||||
@ -694,14 +694,14 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
||||||
auto blk_coord = tile_scheduler.get_block_coord();
|
auto blk_coord = tile_scheduler.get_block_coord();
|
||||||
auto problem_shape = params.problem_shape;
|
auto problem_shape = params.problem_shape;
|
||||||
auto local_split_kv = params.split_kv;
|
auto local_split_kv = params.split_kv;
|
||||||
if (params.mainloop.ptr_seq != nullptr) {
|
if (params.mainloop.ptr_seq != nullptr) {
|
||||||
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
||||||
if (params.ptr_split_kv != nullptr) {
|
if (params.ptr_split_kv != nullptr) {
|
||||||
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (local_split_kv <= get<3>(blk_coord))
|
if (local_split_kv <= get<3>(blk_coord))
|
||||||
continue;
|
continue;
|
||||||
mma(blk_coord,
|
mma(blk_coord,
|
||||||
problem_shape,
|
problem_shape,
|
||||||
@ -711,7 +711,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
pipeline_mma_s, pipeline_mma_s_producer_state,
|
pipeline_mma_s, pipeline_mma_s_producer_state,
|
||||||
pipeline_p_mma, pipeline_p_mma_consumer_state,
|
pipeline_p_mma, pipeline_p_mma_consumer_state,
|
||||||
pipeline_mma_o, pipeline_mma_o_producer_state,
|
pipeline_mma_o, pipeline_mma_o_producer_state,
|
||||||
local_split_kv
|
local_split_kv
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -726,15 +726,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
||||||
auto blk_coord = tile_scheduler.get_block_coord();
|
auto blk_coord = tile_scheduler.get_block_coord();
|
||||||
auto problem_shape = params.problem_shape;
|
auto problem_shape = params.problem_shape;
|
||||||
auto split_kv = params.split_kv;
|
auto split_kv = params.split_kv;
|
||||||
auto local_split_kv = split_kv;
|
auto local_split_kv = split_kv;
|
||||||
if (params.mainloop.ptr_seq != nullptr) {
|
if (params.mainloop.ptr_seq != nullptr) {
|
||||||
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
||||||
if (params.ptr_split_kv != nullptr) {
|
if (params.ptr_split_kv != nullptr) {
|
||||||
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (local_split_kv <= get<3>(blk_coord))
|
if (local_split_kv <= get<3>(blk_coord))
|
||||||
continue;
|
continue;
|
||||||
compute(
|
compute(
|
||||||
blk_coord,
|
blk_coord,
|
||||||
@ -745,7 +745,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
pipeline_mma_s, pipeline_mma_s_consumer_state,
|
pipeline_mma_s, pipeline_mma_s_consumer_state,
|
||||||
pipeline_p_mma, pipeline_p_mma_producer_state,
|
pipeline_p_mma, pipeline_p_mma_producer_state,
|
||||||
pipeline_mma_o, pipeline_mma_o_consumer_state,
|
pipeline_mma_o, pipeline_mma_o_consumer_state,
|
||||||
local_split_kv
|
local_split_kv
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1900,7 +1900,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
|||||||
cutlass::arch::NamedBarrier(
|
cutlass::arch::NamedBarrier(
|
||||||
(kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp,
|
(kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp,
|
||||||
kNamedBarrierEpilogue
|
kNamedBarrierEpilogue
|
||||||
).arrive_and_wait();
|
).arrive();
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -56,11 +56,3 @@ void cp_gather_cache(
|
|||||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||||
int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
|
int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
|
||||||
|
|
||||||
// Indexer K quantization and cache function
|
|
||||||
void indexer_k_quant_and_cache(
|
|
||||||
torch::Tensor& k, // [num_tokens, head_dim]
|
|
||||||
torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
|
|
||||||
torch::Tensor& slot_mapping, // [num_tokens]
|
|
||||||
int64_t quant_block_size, // quantization block size
|
|
||||||
const std::string& scale_fmt);
|
|
||||||
|
|||||||
@ -16,7 +16,8 @@
|
|||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cfloat>
|
#include <map>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#ifdef USE_ROCM
|
#ifdef USE_ROCM
|
||||||
#include <hip/hip_bf16.h>
|
#include <hip/hip_bf16.h>
|
||||||
@ -208,20 +209,6 @@ void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
|
|||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// Used to copy/convert one element
|
|
||||||
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
|
|
||||||
struct CopyWithScaleOp {
|
|
||||||
float scale;
|
|
||||||
|
|
||||||
__device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
|
|
||||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
|
||||||
dst = static_cast<OutT>(src);
|
|
||||||
} else {
|
|
||||||
dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||||
__global__ void reshape_and_cache_kernel(
|
__global__ void reshape_and_cache_kernel(
|
||||||
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
|
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
|
||||||
@ -237,51 +224,59 @@ __global__ void reshape_and_cache_kernel(
|
|||||||
const int64_t token_idx = blockIdx.x;
|
const int64_t token_idx = blockIdx.x;
|
||||||
const int64_t slot_idx = slot_mapping[token_idx];
|
const int64_t slot_idx = slot_mapping[token_idx];
|
||||||
if (slot_idx < 0) {
|
if (slot_idx < 0) {
|
||||||
|
// Padding token that should be ignored.
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t block_idx = slot_idx / block_size;
|
const int64_t block_idx = slot_idx / block_size;
|
||||||
const int64_t block_offset = slot_idx % block_size;
|
const int64_t block_offset = slot_idx % block_size;
|
||||||
const int h_block_count = head_size / x; // head_size//x
|
|
||||||
|
|
||||||
const int h_block_idx = threadIdx.x;
|
const int n = num_heads * head_size;
|
||||||
if (h_block_idx >= num_heads * h_block_count) {
|
for (int i = threadIdx.x; i < n; i += blockDim.x) {
|
||||||
return;
|
const int64_t src_key_idx = token_idx * key_stride + i;
|
||||||
}
|
const int64_t src_value_idx = token_idx * value_stride + i;
|
||||||
|
|
||||||
const int head_idx = h_block_idx / h_block_count;
|
const int head_idx = i / head_size;
|
||||||
const int h_block = h_block_idx % h_block_count;
|
const int head_offset = i % head_size;
|
||||||
|
const int x_idx = head_offset / x;
|
||||||
|
const int x_offset = head_offset % x;
|
||||||
|
|
||||||
const scalar_t* __restrict__ key_src =
|
const int64_t tgt_key_idx =
|
||||||
key + token_idx * key_stride + head_idx * head_size + h_block * x;
|
block_idx * num_heads * (head_size / x) * block_size * x +
|
||||||
const int64_t src_value_start =
|
head_idx * (head_size / x) * block_size * x + x_idx * block_size * x +
|
||||||
token_idx * value_stride + head_idx * head_size + h_block * x;
|
block_offset * x + x_offset;
|
||||||
|
const int64_t tgt_value_idx =
|
||||||
cache_t* __restrict__ key_dst =
|
block_idx * num_heads * head_size * block_size +
|
||||||
key_cache + block_idx * num_heads * h_block_count * block_size * x +
|
head_idx * head_size * block_size + head_offset * block_size +
|
||||||
head_idx * h_block_count * block_size * x + h_block * block_size * x +
|
block_offset;
|
||||||
block_offset * x;
|
scalar_t tgt_key = key[src_key_idx];
|
||||||
const int64_t tgt_value_start =
|
scalar_t tgt_value = value[src_value_idx];
|
||||||
block_idx * num_heads * h_block_count * x * block_size +
|
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
||||||
head_idx * h_block_count * x * block_size + h_block * x * block_size +
|
key_cache[tgt_key_idx] = tgt_key;
|
||||||
block_offset;
|
value_cache[tgt_value_idx] = tgt_value;
|
||||||
|
} else {
|
||||||
constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
|
key_cache[tgt_key_idx] =
|
||||||
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
|
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
|
||||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
|
value_cache[tgt_value_idx] =
|
||||||
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
|
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
|
||||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
|
}
|
||||||
|
|
||||||
vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, x, 0, 1, k_op);
|
|
||||||
|
|
||||||
const scalar_t* __restrict__ value_src = value + src_value_start;
|
|
||||||
cache_t* __restrict__ value_dst = value_cache + tgt_value_start;
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < x; i++) {
|
|
||||||
v_op(value_dst[i * block_size], value_src[i]);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Used by vectorization_utils to copy/convert one element
|
||||||
|
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
|
||||||
|
struct CopyWithScaleOp {
|
||||||
|
float scale;
|
||||||
|
|
||||||
|
__device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
|
||||||
|
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
||||||
|
dst = static_cast<OutT>(src);
|
||||||
|
} else {
|
||||||
|
dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||||
__global__ void reshape_and_cache_flash_kernel(
|
__global__ void reshape_and_cache_flash_kernel(
|
||||||
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
|
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
|
||||||
@ -401,177 +396,6 @@ __global__ void concat_and_cache_mla_kernel(
|
|||||||
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
|
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
|
||||||
__global__ void concat_and_cache_ds_mla_kernel(
|
|
||||||
const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank]
|
|
||||||
const scalar_t* __restrict__ k_pe, // [num_tokens, pe_dim]
|
|
||||||
cache_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank
|
|
||||||
// + pe_dim)]
|
|
||||||
const int64_t* __restrict__ slot_mapping, // [num_tokens]
|
|
||||||
const int block_stride, //
|
|
||||||
const int entry_stride, //
|
|
||||||
const int kv_c_stride, //
|
|
||||||
const int k_pe_stride, //
|
|
||||||
const int kv_lora_rank, //
|
|
||||||
const int pe_dim, //
|
|
||||||
const int block_size, //
|
|
||||||
const float* scale //
|
|
||||||
) {
|
|
||||||
const int64_t token_idx = blockIdx.x;
|
|
||||||
const int64_t slot_idx = slot_mapping[token_idx];
|
|
||||||
// NOTE: slot_idx can be -1 if the token is padded
|
|
||||||
if (slot_idx < 0) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
const int64_t block_idx = slot_idx / block_size;
|
|
||||||
const int64_t block_offset = slot_idx % block_size;
|
|
||||||
const int64_t dst_idx_start =
|
|
||||||
block_idx * block_stride + block_offset * entry_stride;
|
|
||||||
|
|
||||||
// For the NoPE part, each tile of 128 elements is handled by half of one warp
|
|
||||||
// (16 threads). There are 4 total tiles, so 2 warps (64 threads).
|
|
||||||
// Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
|
|
||||||
// The RoPE part (last 64 elements) is handled by another 1 warp (32 threads).
|
|
||||||
// So in total, we use 3 warps (96 threads) per block.
|
|
||||||
|
|
||||||
// Cast kv_cache to 16_bit for RoPE values
|
|
||||||
scalar_t* kv_cache_16bit =
|
|
||||||
reinterpret_cast<scalar_t*>(&kv_cache[dst_idx_start]);
|
|
||||||
|
|
||||||
// The last warp handles the RoPE part
|
|
||||||
if (threadIdx.x >= 64) {
|
|
||||||
// Each thread handles two elements of RoPE
|
|
||||||
const int8_t pe_idx_start = (threadIdx.x - 64) * 2;
|
|
||||||
const int64_t src_idx = token_idx * k_pe_stride + pe_idx_start;
|
|
||||||
// Vectorized load of two 16-bit values, performed as one 32-bit load
|
|
||||||
const int32_t vals = *reinterpret_cast<const int32_t*>(&k_pe[src_idx]);
|
|
||||||
// RoPE values start after the packed 8-bit NoPE values and the
|
|
||||||
// 32-bit scales
|
|
||||||
const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx_start;
|
|
||||||
// Vectorized store of two 16-bit values, performed as one 32-bit store
|
|
||||||
*reinterpret_cast<int32_t*>(&kv_cache_16bit[dst_idx]) = vals;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// The first two warps handle the NoPE part
|
|
||||||
const int8_t warp_idx = threadIdx.x >> 5;
|
|
||||||
const int8_t lane_idx = threadIdx.x & 31;
|
|
||||||
const int8_t tile_idx = warp_idx * 2 + (lane_idx >> 4);
|
|
||||||
|
|
||||||
// Each thread handles 8 elements of NoPE
|
|
||||||
// Load the NoPE elements for this thread into registers
|
|
||||||
const int64_t src_idx_start = token_idx * kv_c_stride + (threadIdx.x * 8);
|
|
||||||
// Vectorized load of eight 16-bit values, performed as an int4 load
|
|
||||||
const int4 vals_i4 = *reinterpret_cast<const int4*>(&kv_c[src_idx_start]);
|
|
||||||
const scalar_t* vals = reinterpret_cast<const scalar_t*>(&vals_i4);
|
|
||||||
|
|
||||||
// Max absolute value of this thread's elements
|
|
||||||
float max_abs = fmaxf(fmaxf(fmaxf(fabsf(vals[0]), fabsf(vals[1])),
|
|
||||||
fmaxf(fabsf(vals[2]), fabsf(vals[3]))),
|
|
||||||
fmaxf(fmaxf(fabsf(vals[4]), fabsf(vals[5])),
|
|
||||||
fmaxf(fabsf(vals[6]), fabsf(vals[7]))));
|
|
||||||
|
|
||||||
// Warp-level reduction to find the max absolute value in each half-warp
|
|
||||||
#pragma unroll
|
|
||||||
for (int offset = 8; offset > 0; offset /= 2) {
|
|
||||||
max_abs = fmaxf(max_abs, VLLM_SHFL_XOR_SYNC_WIDTH(max_abs, offset, 16));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Compute the scale for the tile
|
|
||||||
float tile_scale = max_abs / 448.f;
|
|
||||||
tile_scale = fmaxf(tile_scale, FLT_MIN);
|
|
||||||
|
|
||||||
// The first lane of each half-warp writes the scale to kv_cache
|
|
||||||
if ((lane_idx == 0) || (lane_idx == 16)) {
|
|
||||||
float* kv_cache_32bit = reinterpret_cast<float*>(&kv_cache[dst_idx_start]);
|
|
||||||
const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx;
|
|
||||||
kv_cache_32bit[dst_idx] = tile_scale;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Now all threads in the block scale and write their elements
|
|
||||||
// NoPE data is packed in the first kv_lora_rank/2 bytes (first 256 bytes)
|
|
||||||
const int64_t dst_idx_base = dst_idx_start + (threadIdx.x * 8);
|
|
||||||
|
|
||||||
uint8_t result[8];
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < 8; i++) {
|
|
||||||
result[i] =
|
|
||||||
fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
|
|
||||||
vals[i], tile_scale);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Store as aligned 64-bit writes
|
|
||||||
*reinterpret_cast<uint64_t*>(&kv_cache[dst_idx_base]) =
|
|
||||||
*reinterpret_cast<const uint64_t*>(result);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
|
||||||
__global__ void indexer_k_quant_and_cache_kernel(
|
|
||||||
const scalar_t* __restrict__ k, // [num_tokens, head_dim]
|
|
||||||
cache_t* __restrict__ kv_cache, // [num_blocks, block_size, cache_stride]
|
|
||||||
const int64_t* __restrict__ slot_mapping, // [num_tokens]
|
|
||||||
const int head_dim, // dimension of each head
|
|
||||||
const int quant_block_size, // quantization block size
|
|
||||||
const int cache_block_size, // cache block size
|
|
||||||
const int cache_stride, // stride for each token in kv_cache
|
|
||||||
const bool use_ue8m0 // use ue8m0 scale format
|
|
||||||
) {
|
|
||||||
constexpr int VEC_SIZE = 4;
|
|
||||||
const int64_t token_idx = blockIdx.x;
|
|
||||||
const int64_t head_dim_idx = (blockIdx.y * blockDim.y * blockDim.x +
|
|
||||||
threadIdx.y * blockDim.x + threadIdx.x) *
|
|
||||||
VEC_SIZE;
|
|
||||||
const int64_t slot_idx = slot_mapping[token_idx];
|
|
||||||
const int64_t block_idx = slot_idx / cache_block_size;
|
|
||||||
const int64_t block_offset = slot_idx % cache_block_size;
|
|
||||||
|
|
||||||
// NOTE: slot_idx can be -1 if the token is padded
|
|
||||||
if (slot_idx < 0 || (head_dim_idx >= head_dim)) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
float2 k_val = (reinterpret_cast<const float2*>(
|
|
||||||
k))[(token_idx * head_dim + head_dim_idx) / VEC_SIZE];
|
|
||||||
scalar_t* k_val_ptr = reinterpret_cast<scalar_t*>(&k_val);
|
|
||||||
float amax = 0.0f;
|
|
||||||
for (int i = 0; i < VEC_SIZE; i++) {
|
|
||||||
amax = fmaxf(amax, fabsf(float(k_val_ptr[i])));
|
|
||||||
}
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
__syncwarp();
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Reduced amax
|
|
||||||
for (int mask = 16; mask > 0; mask /= 2) {
|
|
||||||
#ifdef USE_ROCM
|
|
||||||
amax = fmaxf(amax, __shfl_xor_sync(uint64_t(-1), amax, mask));
|
|
||||||
#else
|
|
||||||
amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
__syncwarp();
|
|
||||||
#endif
|
|
||||||
float scale = fmaxf(amax, 1e-4) / 448.0f;
|
|
||||||
if (use_ue8m0) {
|
|
||||||
scale = exp2f(ceilf(log2f(scale)));
|
|
||||||
}
|
|
||||||
|
|
||||||
const int64_t dst_offset = block_idx * cache_block_size * cache_stride +
|
|
||||||
block_offset * head_dim + head_dim_idx;
|
|
||||||
for (int i = 0; i < VEC_SIZE; i++) {
|
|
||||||
kv_cache[dst_offset + i] =
|
|
||||||
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(k_val_ptr[i], scale);
|
|
||||||
}
|
|
||||||
if (threadIdx.x == 0) {
|
|
||||||
const int64_t dst_scale_idx =
|
|
||||||
block_idx * cache_block_size * cache_stride +
|
|
||||||
cache_block_size * head_dim +
|
|
||||||
(block_offset * head_dim + head_dim_idx) * 4 / quant_block_size;
|
|
||||||
reinterpret_cast<float*>(kv_cache)[dst_scale_idx / 4] = scale;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
|
||||||
// KV_T is the data type of key and value tensors.
|
// KV_T is the data type of key and value tensors.
|
||||||
@@ -607,15 +431,14 @@ void reshape_and_cache(
   int key_stride = key.stride(0);
   int value_stride = value.stride(0);
-  int head_div_x = head_size / x;

   dim3 grid(num_tokens);
-  dim3 block(std::min(num_heads * head_div_x, 512));
+  dim3 block(std::min(num_heads * head_size, 512));
   const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

   DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype,
-                             CALL_RESHAPE_AND_CACHE);
+                             CALL_RESHAPE_AND_CACHE)
 }

 // KV_T is the data type of key and value tensors.
@@ -686,18 +509,6 @@ void reshape_and_cache_flash(
           kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
           reinterpret_cast<const float*>(scale.data_ptr()));

-// KV_T is the data type of key and value tensors.
-// CACHE_T is the stored data type of kv-cache.
-#define CALL_CONCAT_AND_CACHE_DS_MLA(KV_T, CACHE_T, KV_DTYPE)            \
-  vllm::concat_and_cache_ds_mla_kernel<KV_T, CACHE_T, KV_DTYPE>          \
-      <<<grid, block, 0, stream>>>(                                      \
-          reinterpret_cast<KV_T*>(kv_c.data_ptr()),                      \
-          reinterpret_cast<KV_T*>(k_pe.data_ptr()),                      \
-          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),               \
-          slot_mapping.data_ptr<int64_t>(), block_stride, entry_stride,  \
-          kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size,    \
-          reinterpret_cast<const float*>(scale.data_ptr()));
-
 void concat_and_cache_mla(
     torch::Tensor& kv_c,  // [num_tokens, kv_lora_rank]
     torch::Tensor& k_pe,  // [num_tokens, pe_dim]
@@ -720,43 +531,20 @@ void concat_and_cache_mla(
   int pe_dim = k_pe.size(1);
   int block_size = kv_cache.size(1);

-  if (kv_cache_dtype == "fp8_ds_mla") {
-    TORCH_CHECK(kv_lora_rank == 512, "kv_lora_rank must be 512 for fp8_ds_mla");
-    TORCH_CHECK(pe_dim == 64, "pe_dim must be 64 for fp8_ds_mla");
-    TORCH_CHECK(kv_cache.size(2) == 656 / kv_cache.itemsize(),
-                "kv_cache.size(2) must be 656 bytes for fp8_ds_mla");
-    TORCH_CHECK(kv_c.itemsize() == 2,
-                "kv_c.itemsize() must be 2 for fp8_ds_mla");
-    TORCH_CHECK(k_pe.itemsize() == 2,
-                "k_pe.itemsize() must be 2 for fp8_ds_mla");
-  } else {
-    TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
-  }
+  TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);

   int kv_c_stride = kv_c.stride(0);
   int k_pe_stride = k_pe.stride(0);
   int block_stride = kv_cache.stride(0);
   int entry_stride = kv_cache.stride(1);

+  dim3 grid(num_tokens);
+  dim3 block(std::min(kv_lora_rank, 512));
   const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

-  if (kv_cache_dtype == "fp8_ds_mla") {
-    dim3 grid(num_tokens);
-    // For the NoPE part, each tile of 128 elements is handled by half of one
-    // warp (16 threads). There are 4 total tiles, so 2 warps (64 threads).
-    // Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
-    // The RoPE part (last 64 elements) is handled by another 1 warp (32
-    // threads). So in total, we use 3 warps (96 threads) per block.
-    dim3 block(96);
-    DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
-                               CALL_CONCAT_AND_CACHE_DS_MLA);
-  } else {
-    dim3 grid(num_tokens);
-    dim3 block(std::min(kv_lora_rank, 512));
-    DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
-                               CALL_CONCAT_AND_CACHE_MLA);
-  }
+  DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
+                             CALL_CONCAT_AND_CACHE_MLA);
 }

 namespace vllm {
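The 656-byte size asserted in the removed fp8_ds_mla branch follows from the entry layout sketched in the removed comments. Below is a small worked example; the byte split is an assumption pieced together from those checks (512 one-byte fp8 NoPE values, one float scale per 128-value tile, 64 two-byte RoPE values), not something stated elsewhere in this diff.

#include <cstdio>

int main() {
  // Worked sizing for the removed "656 bytes" check, under the assumed layout.
  const int nope_bytes = 512 * 1;                                  // fp8 NoPE
  const int scale_bytes = (512 / 128) * static_cast<int>(sizeof(float));
  const int rope_bytes = 64 * 2;                                   // 2-byte RoPE
  std::printf("entry bytes = %d\n", nope_bytes + scale_bytes + rope_bytes);  // 656
  return 0;
}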
@@ -1134,42 +922,3 @@ void cp_gather_cache(
     TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits);
   }
 }
-
-// Macro to dispatch the kernel based on the data type.
-#define CALL_INDEXER_K_QUANT_AND_CACHE(KV_T, CACHE_T, KV_DTYPE)           \
-  vllm::indexer_k_quant_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE>         \
-      <<<grid, block, 0, stream>>>(                                       \
-          reinterpret_cast<KV_T*>(k.data_ptr()),                          \
-          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),                \
-          slot_mapping.data_ptr<int64_t>(), head_dim, quant_block_size,   \
-          cache_block_size, cache_stride, use_ue8m0);
-
-void indexer_k_quant_and_cache(
-    torch::Tensor& k,             // [num_tokens, head_dim]
-    torch::Tensor& kv_cache,      // [num_blocks, block_size, cache_stride]
-    torch::Tensor& slot_mapping,  // [num_tokens]
-    int64_t quant_block_size,     // quantization block size
-    const std::string& scale_fmt) {
-  int num_tokens = k.size(0);
-  int head_dim = k.size(1);
-  int cache_block_size = kv_cache.size(1);
-  int cache_stride = kv_cache.size(2);
-  bool use_ue8m0 = scale_fmt == "ue8m0";
-
-  TORCH_CHECK(k.device() == kv_cache.device(),
-              "k and kv_cache must be on the same device");
-  TORCH_CHECK(k.device() == slot_mapping.device(),
-              "k and slot_mapping must be on the same device");
-  TORCH_CHECK(head_dim % quant_block_size == 0,
-              "head_dim must be divisible by quant_block_size");
-
-  constexpr int vec_size = 4;
-  dim3 grid(num_tokens, (head_dim + quant_block_size * vec_size - 1) /
-                            (quant_block_size * vec_size));
-  dim3 block(32, vec_size);
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(k));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
-  DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3",
-                             CALL_INDEXER_K_QUANT_AND_CACHE);
-}
@@ -1,16 +0,0 @@
-#pragma once
-#include <cstdlib>
-#include <string>
-#include <cctype>
-
-namespace vllm {
-
-// vllm_kernel_override_batch_invariant(); returns true
-// if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
-inline bool vllm_kernel_override_batch_invariant() {
-  std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
-  const char* val = std::getenv(env_key.c_str());
-  return (val && std::atoi(val) != 0) ? 1 : 0;
-}
-
-}  // namespace vllm
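The helper deleted above is only an environment-variable lookup. A minimal sketch of how a launch site can gate on it follows; the call-site names are illustrative, and caching the lookup in a static (so getenv runs once) is a choice made here, not something taken from this diff.

#include <cstdlib>

// Minimal sketch: read VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT once, cache the
// result, and use it to steer launches toward a batch-invariant code path.
static bool batch_invariant_override() {
  static const bool enabled = [] {
    const char* val = std::getenv("VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT");
    return val != nullptr && std::atoi(val) != 0;
  }();
  return enabled;
}

void launch_rms_norm(bool vectorized_ok) {
  if (vectorized_ok && !batch_invariant_override()) {
    // wide vectorized launch (illustrative)
  } else {
    // conservative fallback launch (illustrative)
  }
}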
@@ -14,12 +14,7 @@
   // arm implementation
   #include "cpu_types_arm.hpp"
 #else
-  #warning "unsupported vLLM cpu implementation, vLLM will compile with scalar"
-  #include "cpu_types_scalar.hpp"
-#endif
-
-#ifdef _OPENMP
-  #include <omp.h>
+  #warning "unsupported vLLM cpu implementation"
 #endif

 #endif
@@ -1,513 +0,0 @@
(Entire scalar CPU vector-type fallback header removed; 513 lines. In namespace
vec_op it defined the VLLM_DISPATCH_FLOATING_TYPES dispatch macros, the
CPU_KERNEL_GUARD_IN/OUT and FORCE_INLINE helpers, __max/__min/__abs, the raw
storage structs f16x8_t/f16x16_t/f16x32_t and f32x4_t/f32x8_t/f32x16_t, an
unroll_loop utility, and the wrapper types FP16Vec8/16, BF16Vec8/16/32 and
FP32Vec4/8/16 with load/save, element-wise +, -, *, /, max, min, abs, exp, tanh
and er(), plus reduce_sum/reduce_max/reduce_min/reduce_sub_sum. It also provided
the VecType<T> trait mapping float, c10::Half and c10::BFloat16 to
FP32Vec8/FP16Vec8/BF16Vec8, storeFP32 specializations for Half and BFloat16,
fma(acc, a, b) and prefetch(); all half/bfloat16 conversions go through the
scalar helpers in float_convert.hpp.)
@@ -137,8 +137,9 @@ DNNLMatMulPrimitiveHandler::DNNLMatMulPrimitiveHandler(
 }

 void DNNLMatMulPrimitiveHandler::prepack_weight(
-    void* original_b_ptr, dnnl::memory::desc original_b_md,
-    dnnl::memory::desc b_target_mem_desc) {
+    void* original_b_ptr, dnnl::memory::desc b_target_mem_desc) {
+  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
+                                   {b_k_stride_, b_n_stride_});
   dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr);
   dnnl::memory packed_weight(b_target_mem_desc, default_engine());
   {
@@ -249,9 +250,7 @@ W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
   if (a_qs_ == QuantizationStrategy::PER_TOKEN) {
     assert(!use_azp_);
   };
-  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
-                                   {b_k_stride_, b_n_stride_});
-
-  prepack_weight(args.b_ptr, original_b_md,
+  prepack_weight(args.b_ptr,
                  create_primitive_desc(
                      MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
                                    .use_bias = false,
@@ -413,25 +412,12 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
   assert(ab_type_ == dnnl::memory::data_type::f32 ||
          ab_type_ == dnnl::memory::data_type::bf16 ||
          ab_type_ == dnnl::memory::data_type::f16);
-  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
-                                   {b_k_stride_, b_n_stride_});
-
-  prepack_weight(args.b_ptr, original_b_md,
+  prepack_weight(args.b_ptr,
                  create_primitive_desc(
-                     MSizeCacheKey{
-#ifdef VLLM_USE_ACL
-                         // Arm Compute Library (ACL) backend for oneDNN does
-                         // not support runtime
-                         // dimensions, so we set M to a default value
-                         .a_m_size = 128,
-                         .a_m_stride = b_k_size_,
-#else
-                         .a_m_size = DNNL_RUNTIME_DIM_VAL,
-                         .a_m_stride = DNNL_RUNTIME_DIM_VAL,
-#endif
-                         .use_bias = false,
-                         .bias_type = dnnl::memory::data_type::undef},
+                     MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
+                                   .a_m_stride = DNNL_RUNTIME_DIM_VAL,
+                                   .use_bias = false,
+                                   .bias_type = dnnl::memory::data_type::undef},
                      true)
                      .weights_desc());
   init_runtime_memory_cache(args);
@@ -457,30 +443,12 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) {
   c_storage->set_data_handle((void*)args.c_ptr);
   c_mem_desc->dims[0] = args.a_m_size;

-#ifndef VLLM_USE_ACL
-  // We do not support in ACL backend of oneDNN, we handle bias by:
-  // 1. copying it into the result tensor
-  // 2. attaching a fused-sum post-op to the matmul primitive
   if (args.use_bias) {
     auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2);
     bias_storage->set_data_handle((void*)args.bias_ptr);
   }
-#endif
-
   dnnl::matmul matmul = get_matmul_cache(args);
-
-  // With ACL backend of oneDNN, the required memory format might change when the
-  // source tensor dims change. This does not really happen in practice, so isn't
-  // a performance hit, but we need to support it because the API allows for it.
-#ifdef VLLM_USE_ACL
-  auto new_expected_wei_desc =
-      dnnl::matmul::primitive_desc(
-          const_cast<dnnl_primitive_desc_t>(matmul.get_primitive_desc()))
-          .weights_desc();
-  if (new_expected_wei_desc != b_target_mem_desc_) {
-    prepack_weight(memory_cache_[DNNL_ARG_WEIGHTS].get_data_handle(),
-                   b_target_mem_desc_, new_expected_wei_desc);
-  }
-#endif

   auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
   scratchpad_storage->set_data_handle(
@@ -516,13 +484,7 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
   } else {
     a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
                               {key.a_m_stride, 1});
-#ifdef VLLM_USE_ACL
-    // ACL's backend of oneDNN always expects the weight format to be "any"
-    b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_,
-                              dnnl::memory::format_tag::any);
-#else
     b_md = b_target_mem_desc_;
-#endif
   }
   dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
                           dnnl::memory::format_tag::ab);
@@ -532,18 +494,8 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(

   if (key.use_bias) {
     dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
-    // Since ACL's matmuls don't support passing a bias_md, we apply the bias
-    // through a fused-sum post-op
-#ifdef VLLM_USE_ACL
-    dnnl::post_ops post_ops;
-    post_ops.append_sum();
-    attr.set_post_ops(post_ops);
-    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
-                                        attr);
-#else
     return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
                                         c_md, attr);
-#endif
   } else {
     return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
                                         attr);
@@ -559,23 +511,13 @@ void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
                    default_engine(), nullptr);
   set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());

-  // ACL matmuls don't support bias_md, so we don't need these
-#ifndef VLLM_USE_ACL
   memory_cache_[DNNL_ARG_BIAS] =
       dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                    default_engine(), nullptr);
   set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get());
-#endif
   memory_cache_[DNNL_ARG_SCRATCHPAD] =
       dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                    default_engine(), nullptr);
   set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
 }
-
-bool is_onednn_acl_supported() {
-#ifdef VLLM_USE_ACL
-  return true;
-#else
-  return false;
-#endif
-}
@@ -101,7 +101,7 @@ class DNNLMatMulPrimitiveHandler {
 protected:
  DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type);

-  void prepack_weight(void* original_b_ptr, dnnl::memory::desc original_b_md,
+  void prepack_weight(void* original_b_ptr,
                       dnnl::memory::desc b_target_mem_desc);

  void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr);
@@ -527,42 +527,21 @@ void onednn_mm(torch::Tensor& c,  // [M, OC], row-major
   MatMulPrimitiveHandler* ptr =
       reinterpret_cast<MatMulPrimitiveHandler*>(handler);

-  // ACL matmuls expect contiguous source tensors
-#ifdef VLLM_USE_ACL
-  torch::Tensor a_contig = a.contiguous();
-#endif
-
   MatMulPrimitiveHandler::ExecArgs exec_args;

-#ifdef VLLM_USE_ACL
-  exec_args.a_m_size = a_contig.size(0);
-  exec_args.a_m_stride = a_contig.stride(0);
-#else
   exec_args.a_m_size = a.size(0);
   exec_args.a_m_stride = a.stride(0);
-#endif
   VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] {
     if (bias.has_value()) {
       exec_args.use_bias = true;
       exec_args.bias_type = get_dnnl_type<scalar_t>();
-#ifdef VLLM_USE_ACL
-      // ACL matmuls in oneDNN do not support a bias.
-      // We handle a matmul with bias by doing: c = bias; c += matmul(a, b)
-      c.copy_(bias.value());
-#else
       exec_args.bias_ptr = bias->data_ptr<scalar_t>();
-#endif
     } else {
       exec_args.use_bias = false;
       exec_args.bias_type = get_dnnl_type<void>();
       exec_args.bias_ptr = nullptr;
     }
-#ifdef VLLM_USE_ACL
-    exec_args.a_ptr = a_contig.data_ptr<scalar_t>();
-#else
     exec_args.a_ptr = a.data_ptr<scalar_t>();
-
-#endif
     exec_args.c_ptr = c.data_ptr<scalar_t>();

     ptr->execute(exec_args);
@@ -1,106 +0,0 @@
-
-static float bf16_to_float(uint16_t bf16) {
-  uint32_t bits = static_cast<uint32_t>(bf16) << 16;
-  float fp32;
-  std::memcpy(&fp32, &bits, sizeof(fp32));
-  return fp32;
-}
-
-static uint16_t float_to_bf16(float fp32) {
-  uint32_t bits;
-  std::memcpy(&bits, &fp32, sizeof(fp32));
-  return static_cast<uint16_t>(bits >> 16);
-}
(The rest of the removed header is the float_to_fp16 / fp16_to_float pair, a
round-to-nearest-even half-precision conversion adapted from the Princeton
Vision Group "marvin" tensorIO code (MIT licensed), with explicit handling of
the NaN, Inf, zero and denormal cases.)
@@ -27,8 +27,6 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b,
 void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
                const std::optional<torch::Tensor>& bias, int64_t handler);

-bool is_onednn_acl_supported();
-
 void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
                         torch::Tensor& kv_cache, double scale,
                         torch::Tensor& block_tables, torch::Tensor& seq_lens);
@@ -90,18 +88,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       " int tp_rank, int blocksparse_local_blocks,"
       " int blocksparse_vert_stride, int blocksparse_block_size,"
       " int blocksparse_head_sliding_step) -> ()");

   ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1);

-  ops.def(
-      "dynamic_4bit_int_moe("
-      "Tensor x, Tensor topk_ids, Tensor topk_weights,"
-      "Tensor w13_packed, Tensor w2_packed, int H, int I, int I2,"
-      "int group_size, bool apply_router_weight_on_input, int activation_kind"
-      ") -> Tensor");
-
-  ops.impl("dynamic_4bit_int_moe", torch::kCPU, &dynamic_4bit_int_moe_cpu);
-
   // PagedAttention V2.
   ops.def(
       "paged_attention_v2("
@@ -183,9 +171,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "int handler) -> ()");
   ops.impl("onednn_mm", torch::kCPU, &onednn_mm);

-  // Check if oneDNN was built with ACL backend
-  ops.def("is_onednn_acl_supported() -> bool", &is_onednn_acl_supported);
-
   // Create oneDNN W8A8 handler
   ops.def(
       "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType "
@@ -27,7 +27,7 @@ VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
     **{
         VLLMDataType.u4b8: "u4b8",
         VLLMDataType.u8b128: "u8b128",
-    },
+    }
 }

 VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
@@ -35,7 +35,7 @@ VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
     **{
         VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
         VLLMDataType.u8b128: "cutlass::vllm_uint8b128_t",
-    },
+    }
 }

 VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
@@ -43,7 +43,7 @@ VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
     **{
         VLLMDataType.u4b8: 4,
         VLLMDataType.u8b128: 8,
-    },
+    }
 }

 VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
@@ -67,13 +67,15 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
     DataType.f32: "at::ScalarType::Float",
 }

-VLLMKernelScheduleTag: dict[
-    Union[MixedInputKernelScheduleType, KernelScheduleType], str
-] = {
-    **KernelScheduleTag,  # type: ignore
-    **{
-        MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized",  # noqa: E501
-        MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong",  # noqa: E501
-        MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative",  # noqa: E501
-    },
-}
+VLLMKernelScheduleTag: dict[Union[
+    MixedInputKernelScheduleType, KernelScheduleType], str] = {
+        **KernelScheduleTag,  # type: ignore
+        **{
+            MixedInputKernelScheduleType.TmaWarpSpecialized:
+            "cutlass::gemm::KernelTmaWarpSpecialized",
+            MixedInputKernelScheduleType.TmaWarpSpecializedPingpong:
+            "cutlass::gemm::KernelTmaWarpSpecializedPingpong",
+            MixedInputKernelScheduleType.TmaWarpSpecializedCooperative:
+            "cutlass::gemm::KernelTmaWarpSpecializedCooperative",
+        },
+    }
@@ -1,64 +0,0 @@
(Entire launch-bounds helper header removed; 64 lines. It defined
VLLM_LAUNCH_BLOCKS_CAP (default 4), a compile-time VLLM_MAX_THREADS_PER_SM
estimate keyed off __CUDA_ARCH__ (1024 for sm_75; 1536 for
sm_86/87/89/101/110/120/121; 2048 for sm_70/72/80/90/100/103 and for unknown or
host-side builds), the VLLM_BLOCKS_DIV / VLLM_CLAMP_BLOCKS_PER_SM /
VLLM_BLOCKS_PER_SM macros used inside __launch_bounds__, and a runtime helper
vllm_runtime_blocks_per_sm(int block_threads) that queries
cudaDevAttrMaxThreadsPerMultiProcessor and clamps the result to the cap.)
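For orientation, a macro like the removed VLLM_BLOCKS_PER_SM is normally consumed inside __launch_bounds__. The sketch below shows that usage; the kernel name, block size, and the fallback macro definition are illustrative only and exist so the sketch stands alone.

#include <cuda_runtime.h>

// Fallback only so this sketch compiles on its own; the removed header would
// normally provide VLLM_BLOCKS_PER_SM with the per-architecture logic above.
#ifndef VLLM_BLOCKS_PER_SM
#define VLLM_BLOCKS_PER_SM(BLOCK_THREADS) \
  (2048 / (BLOCK_THREADS) < 4 ? 2048 / (BLOCK_THREADS) : 4)
#endif

constexpr int kBlockThreads = 256;

// __launch_bounds__ tells the compiler to plan register usage for
// kBlockThreads threads per block and the clamped number of resident blocks.
__global__ void __launch_bounds__(kBlockThreads,
                                  VLLM_BLOCKS_PER_SM(kBlockThreads))
    scale_kernel(const float* in, float* out, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    out[idx] = 2.0f * in[idx];
  }
}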
@@ -1,7 +1,6 @@
 #include "type_convert.cuh"
 #include "dispatch_utils.h"
 #include "cub_helpers.h"
-#include "core/batch_invariant.hpp"

 #include <torch/cuda.h>
 #include <c10/cuda/CUDAGuard.h>
@@ -414,9 +413,7 @@ void fused_add_rms_norm(torch::Tensor& input,  // [..., hidden_size]
                          wt_ptr % req_alignment_bytes == 0;
   bool offsets_are_multiple_of_vector_width =
       hidden_size % vector_width == 0 && input_stride % vector_width == 0;
-  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
-  if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
-      !batch_invariant_launch) {
+  if (ptrs_are_aligned && offsets_are_multiple_of_vector_width) {
     LAUNCH_FUSED_ADD_RMS_NORM(8);
   } else {
     LAUNCH_FUSED_ADD_RMS_NORM(0);
@@ -462,8 +459,7 @@ void poly_norm(torch::Tensor& out,  // [..., hidden_size]
   auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
   auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
   bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
-  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
-  if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
+  if (ptrs_are_aligned && hidden_size % 8 == 0) {
     LAUNCH_FUSED_POLY_NORM(8);
   } else {
     LAUNCH_FUSED_POLY_NORM(0);
@@ -9,7 +9,6 @@
 #include "quantization/fp8/common.cuh"
 #include "dispatch_utils.h"
 #include "cub_helpers.h"
-#include "core/batch_invariant.hpp"

 #include <torch/cuda.h>
 #include <c10/cuda/CUDAGuard.h>
@@ -241,9 +240,7 @@ void fused_add_rms_norm_static_fp8_quant(
   auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
   bool ptrs_are_aligned =
       inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
-  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
-  if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
-      !batch_invariant_launch) {
+  if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0) {
     LAUNCH_FUSED_ADD_RMS_NORM(8);
   } else {
     LAUNCH_FUSED_ADD_RMS_NORM(0);
@ -1,156 +0,0 @@
|
|||||||
#include <ATen/ATen.h>
|
|
||||||
#include <ATen/Parallel.h>
|
|
||||||
#include <torch/all.h>
|
|
||||||
|
|
||||||
// _dyn_quant_matmul_4bit is only available on AArch64.
|
|
||||||
#if defined(__aarch64__)
|
|
||||||
#include <ATen/ops/_dyn_quant_matmul_4bit.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
inline torch::Tensor mm(const torch::Tensor& a, const torch::Tensor& packed_w,
|
|
||||||
int64_t group_size_eff, int64_t in_features,
|
|
||||||
int64_t out_features) {
|
|
||||||
#if defined(__aarch64__)
|
|
||||||
return at::_ops::_dyn_quant_matmul_4bit::call(a, packed_w, group_size_eff,
|
|
||||||
in_features, out_features);
|
|
||||||
#else
|
|
||||||
TORCH_CHECK(false,
|
|
||||||
"dynamic 4-bit int MoE path requires AArch64 (ARM64); "
|
|
||||||
"_dyn_quant_matmul_4bit is unavailable on this architecture");
|
|
||||||
return {};
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
enum ActivationKind : int64_t {
|
|
||||||
SwiGLU_Gu = 0, // act = SiLU(g) * u
|
|
||||||
SwiGLUOAI = 1, // act = SiLU(u) * g
|
|
||||||
SiLU = 2 // SiLU
|
|
||||||
};
|
|
||||||
|
|
||||||
torch::Tensor dynamic_4bit_int_moe_cpu(
|
|
||||||
torch::Tensor x, torch::Tensor topk_ids, torch::Tensor topk_weights,
|
|
||||||
torch::Tensor w13_packed, torch::Tensor w2_packed, int64_t H, int64_t I,
|
|
||||||
int64_t I2, int64_t group_size, bool apply_router_weight_on_input,
|
|
||||||
int64_t activation_kind) {
|
|
||||||
TORCH_CHECK(x.dim() == 2, "x must be 2D");
|
|
||||||
TORCH_CHECK(topk_ids.dim() == 2 && topk_weights.dim() == 2,
|
|
||||||
"topk tensors must be [T, K]");
|
|
||||||
TORCH_CHECK(
|
|
||||||
w13_packed.size(0) == w2_packed.size(0),
|
|
||||||
"w13_packed and w2_packed must have same number of experts in dim 0");
|
|
||||||
TORCH_CHECK(I2 == 2 * I, "I2 must equal 2*I");
|
|
||||||
|
|
||||||
const int64_t T = x.size(0);
|
|
||||||
const int64_t K = topk_ids.size(1);
|
|
||||||
const int64_t E = w13_packed.size(0);
|
|
||||||
const int64_t N = T * K;
|
|
||||||
|
|
||||||
auto x_c = x.contiguous();
|
|
||||||
auto ids_c = topk_ids.contiguous();
|
|
||||||
auto gates_c = topk_weights.to(at::kFloat).contiguous();
|
|
||||||
|
|
||||||
// bucketing tokens -> experts
|
|
||||||
c10::SmallVector<int64_t, 64> counts(
|
|
||||||
E, 0); // Small vector uses stack allocation
|
|
||||||
{
|
|
||||||
const auto* ids_ptr = ids_c.data_ptr<int64_t>();
|
|
||||||
for (int64_t i = 0; i < N; ++i) {
|
|
||||||
const int64_t e_id = ids_ptr[i];
|
|
||||||
TORCH_CHECK(0 <= e_id && e_id < E, "expert id out of range");
|
|
||||||
counts[e_id]++;
|
|
||||||
}
|
|
  }

  c10::SmallVector<int64_t, 65> offsets(E + 1, 0);  // (E + 1)
  for (int64_t e = 0; e < E; ++e) offsets[e + 1] = offsets[e] + counts[e];

  auto expert_tokens = at::empty({offsets[E]}, ids_c.options());
  auto expert_gates = at::empty({offsets[E]}, gates_c.options());
  {
    c10::SmallVector<int64_t, 64> cursor(E, 0);
    const auto* ids_ptr = ids_c.data_ptr<int64_t>();
    const auto* gts_ptr = gates_c.data_ptr<float>();
    auto* tok_ptr = expert_tokens.data_ptr<int64_t>();
    auto* gate_ptr = expert_gates.data_ptr<float>();

    for (int64_t t = 0; t < T; ++t) {
      const int64_t base = t * K;
      for (int64_t k = 0; k < K; ++k) {
        const int64_t idx = base + k;
        const int64_t e = ids_ptr[idx];
        const int64_t p = offsets[e] + (cursor[e]++);
        tok_ptr[p] = t;
        gate_ptr[p] = gts_ptr[idx];
      }
    }
  }

  const int64_t g_eff_13 = (group_size != -1) ? group_size : H;
  const int64_t g_eff_2 = (group_size != -1) ? group_size : I;

  // Per-expert outputs filled in parallel
  std::vector<torch::Tensor> y_list(E);
  y_list.resize(E);

  at::parallel_for(0, E, 1, [&](int64_t e_begin, int64_t e_end) {
    for (int64_t e = e_begin; e < e_end; ++e) {
      const int64_t te = counts[e];
      if (te == 0) {
        y_list[e] = at::empty({0, H}, x_c.options());
        continue;
      }

      const int64_t start = offsets[e];

      auto sel_tokens =
          expert_tokens.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
      auto gates_e =
          expert_gates.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);

      auto x_e = x_c.index_select(/*dim=*/0, sel_tokens);

      if (apply_router_weight_on_input) {
        x_e = x_e.mul(gates_e.unsqueeze(1));
      }

      auto w13_e = w13_packed.select(/*dim=*/0, e);
      auto w2_e = w2_packed.select(/*dim=*/0, e);

      // W13
      auto y13 =
          mm(x_e, w13_e, g_eff_13, /*in_features=*/H, /*out_features=*/I2);

      auto g_part = y13.narrow(/*dim=*/1, /*start=*/0, /*length=*/I);
      auto u_part = y13.narrow(/*dim=*/1, /*start=*/I, /*length=*/I);

      torch::Tensor act;
      if (activation_kind == ActivationKind::SwiGLUOAI) {  // SwiGLUOAI
        constexpr double kAlpha = 1.702;  // GPT-OSS default
        constexpr double kLimit = 7.0;    // GPT-OSS default
        auto gate_c = at::clamp_max(g_part, kLimit);
        auto up_c = at::clamp(u_part, -kLimit, kLimit);
        auto glu = gate_c.mul(at::sigmoid(gate_c.mul(kAlpha)));
        act = up_c.add(1.0).mul(glu);
      } else {  // SiLU, SwiGLU_GU; vLLM maps silu to SiluAndMul()
        act = at::silu(g_part).mul(u_part);
      }

      // W2
      auto y = mm(act, w2_e, g_eff_2, /*in_features=*/I, /*out_features=*/H);

      if (!apply_router_weight_on_input) {
        y = y.mul(gates_e.unsqueeze(1));
      }

      // Store per-expert result
      y_list[e] = y;
    }
  });

  // Concatenate all expert outputs to match expert_tokens order
  auto Y_all = at::cat(y_list, /*dim=*/0);
  auto out = at::zeros({T, H}, x.options());
  out =
      at::index_add(out, /*dim=*/0, /*index=*/expert_tokens, /*source=*/Y_all);

  return out;
}
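For readers skimming the CPU path above, here is a minimal, self-contained sketch of the same counting-sort token routing (per-expert counts → prefix-sum offsets → stable scatter of (token, gate) pairs), using plain host vectors instead of torch tensors; the sizes and values are illustrative stand-ins, not taken from the branch:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Illustrative routing for T=3 tokens, K=2 experts per token, E=4 experts.
  const int64_t T = 3, K = 2, E = 4;
  std::vector<int64_t> ids = {0, 2, 2, 1, 0, 3};              // [T*K] expert ids
  std::vector<float> gates = {.6f, .4f, .7f, .3f, .5f, .5f};  // [T*K] gate weights

  // Per-expert counts, then exclusive prefix sums (offsets has E+1 entries).
  std::vector<int64_t> counts(E, 0);
  for (int64_t id : ids) counts[id]++;
  std::vector<int64_t> offsets(E + 1, 0);
  for (int64_t e = 0; e < E; ++e) offsets[e + 1] = offsets[e] + counts[e];

  // Stable scatter: expert_tokens/expert_gates end up grouped by expert,
  // preserving the original (token, k) order inside each group.
  std::vector<int64_t> expert_tokens(offsets[E]);
  std::vector<float> expert_gates(offsets[E]);
  std::vector<int64_t> cursor(E, 0);
  for (int64_t t = 0; t < T; ++t) {
    for (int64_t k = 0; k < K; ++k) {
      const int64_t idx = t * K + k;
      const int64_t e = ids[idx];
      const int64_t p = offsets[e] + cursor[e]++;
      expert_tokens[p] = t;
      expert_gates[p] = gates[idx];
    }
  }

  for (int64_t e = 0; e < E; ++e) {
    std::printf("expert %lld:", (long long)e);
    for (int64_t p = offsets[e]; p < offsets[e + 1]; ++p)
      std::printf(" (t=%lld, g=%.1f)", (long long)expert_tokens[p], expert_gates[p]);
    std::printf("\n");
  }
  return 0;
}
```

Each expert's contiguous slice can then be gathered and pushed through that expert's packed weights, which is exactly what the per-expert loop above does under at::parallel_for.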
@@ -21,7 +21,6 @@
 #include <torch/all.h>
 #include <cuda_fp16.h>
 #include <cuda_bf16.h>
-#include <cuda/std/limits>
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
 namespace cg = cooperative_groups;
@@ -29,6 +28,7 @@ namespace cg = cooperative_groups;
 namespace vllm {
 namespace moe {

+constexpr float kNegInfinity = INFINITY * -1;
 constexpr unsigned FULL_WARP_MASK = 0xffffffff;
 constexpr int32_t WARP_SIZE = 32;
 constexpr int32_t BLOCK_SIZE = 512;
@@ -411,30 +411,14 @@ __device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) {
   return __bfloat162float(val);
 }

-template <typename T>
-__device__ inline T neg_inf() {
-  // cuda::std::numeric_limits<T>::infinity() returns `0` for [T=bf16 or fp16]
-  // so we need to cast from fp32
-  return cuda_cast<T, float>(-cuda::std::numeric_limits<float>::infinity());
-}
-
-template <typename T>
-__device__ inline bool is_finite(const T val) {
-#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800)
-  return cuda::std::isfinite(val);
-#else
-  return isfinite(cuda_cast<float, T>(val));
-#endif
-}
-
 template <typename T>
 __device__ void topk_with_k2(T* output, T const* input,
                              cg::thread_block_tile<32> const& tile,
                              int32_t const lane_id,
                              int const num_experts_per_group) {
   // Get the top2 per thread
-  T largest = neg_inf<T>();
-  T second_largest = neg_inf<T>();
+  T largest = -INFINITY;
+  T second_largest = -INFINITY;

   if (num_experts_per_group > WARP_SIZE) {
     for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
@@ -529,8 +513,8 @@ __global__ void group_idx_and_topk_idx_kernel(
                 warp_id * topk;
   s_topk_idx += warp_id * topk;

-  T value = neg_inf<T>();
-  T topk_group_value = neg_inf<T>();
+  T value = kNegInfinity;
+  T topk_group_value = kNegInfinity;
   int32_t num_equalto_topkth_group;

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
@@ -541,8 +525,11 @@ __global__ void group_idx_and_topk_idx_kernel(
   if (case_id < num_tokens) {
     // calculate group_idx
     int32_t target_num_min = WARP_SIZE - n_group + topk_group;
-    // The check is necessary to avoid abnormal input
-    if (lane_id < n_group && is_finite(group_scores[lane_id])) {
+    if (lane_id < n_group &&
+        (isfinite(cuda_cast<float, T>(
+            group_scores[lane_id]))))  // The check is necessary to avoid
+                                       // abnormal input
+    {
       value = group_scores[lane_id];
     }

@@ -553,11 +540,11 @@ __global__ void group_idx_and_topk_idx_kernel(
       __syncwarp();  // Ensure all threads have valid data before reduction
       topk_group_value = cg::reduce(tile, value, cg::greater<T>());
       if (value == topk_group_value) {
-        value = neg_inf<T>();
+        value = kNegInfinity;
       }
       pre_count_equal_to_top_value = count_equal_to_top_value;
-      count_equal_to_top_value =
-          __popc(__ballot_sync(FULL_WARP_MASK, (value == neg_inf<T>())));
+      count_equal_to_top_value = __popc(__ballot_sync(
+          FULL_WARP_MASK, (value == cuda_cast<T, float>(kNegInfinity))));
     }
     num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
   }
@@ -565,10 +552,11 @@ __global__ void group_idx_and_topk_idx_kernel(

   warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
                         /* is_stable */ true>
-      queue((int32_t)topk, neg_inf<T>());
+      queue((int32_t)topk, -INFINITY);

   int count_equalto_topkth_group = 0;
-  bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
+  bool if_proceed_next_topk =
+      (topk_group_value != cuda_cast<T, float>(kNegInfinity));
   if (case_id < num_tokens && if_proceed_next_topk) {
     for (int i_group = 0; i_group < n_group; i_group++) {
       if ((group_scores[i_group] > topk_group_value) ||
@@ -577,10 +565,11 @@ __global__ void group_idx_and_topk_idx_kernel(
         int32_t offset = i_group * num_experts_per_group;
         for (int32_t i = lane_id; i < align_num_experts_per_group;
              i += WARP_SIZE) {
-          T candidates = (i < num_experts_per_group) &&
-                                 is_finite(scores_with_bias[offset + i])
-                             ? scores_with_bias[offset + i]
-                             : neg_inf<T>();
+          T candidates =
+              (i < num_experts_per_group) && isfinite(cuda_cast<float, T>(
+                                                 scores_with_bias[offset + i]))
+                  ? scores_with_bias[offset + i]
+                  : cuda_cast<T, float>(kNegInfinity);
           queue.add(candidates, offset + i);
         }
         if (group_scores[i_group] == topk_group_value) {
@@ -609,8 +598,7 @@ __global__ void group_idx_and_topk_idx_kernel(
       if (i < topk) {
         s_topk_value[i] = value;
       }
-      topk_sum +=
-          cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
+      topk_sum += reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
     }
   }

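Context for the `neg_inf<T>()` versus `-INFINITY` swap in the hunks above: the removed helper's comment states that `cuda::std::numeric_limits<T>::infinity()` comes out as 0 for fp16/bf16, so it built the constant in fp32 and narrowed it. A tiny hypothetical probe of that narrow-then-widen route (not part of either branch; names are illustrative):

```cpp
// Build -inf in fp32, narrow it to bf16, then widen it back to check it survives.
#include <cmath>
#include <cstdio>
#include <cuda_bf16.h>
#include <cuda_runtime.h>

__global__ void probe(float* out) {
  __nv_bfloat16 v = __float2bfloat16(-INFINITY);  // fp32 -> bf16 keeps the infinity
  out[0] = __bfloat162float(v);
}

int main() {
  float* d = nullptr;
  float h = 0.f;
  cudaMalloc(&d, sizeof(float));
  probe<<<1, 1>>>(d);
  cudaMemcpy(&h, d, sizeof(float), cudaMemcpyDeviceToHost);  // implicit sync
  std::printf("bf16 round-trip of -INFINITY: %f (isinf=%d)\n", h, (int)std::isinf(h));
  cudaFree(d);
  return 0;
}
```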
@@ -17,30 +17,25 @@ FILE_HEAD = """
 namespace MARLIN_NAMESPACE_NAME {
 """.strip()

-TEMPLATE = (
-    "template __global__ void Marlin<"
-    "{{scalar_t}}, "
-    "{{w_type_id}}, "
-    "{{s_type_id}}, "
-    "{{threads}}, "
-    "{{thread_m_blocks}}, "
-    "{{thread_n_blocks}}, "
-    "{{thread_k_blocks}}, "
-    "{{'true' if m_block_size_8 else 'false'}}, "
-    "{{stages}}, "
-    "{{group_blocks}}, "
-    "{{'true' if is_zp_float else 'false'}}>"
-    "( MARLIN_KERNEL_PARAMS );"
-)
+TEMPLATE = ("template __global__ void Marlin<"
+            "{{scalar_t}}, "
+            "{{w_type_id}}, "
+            "{{s_type_id}}, "
+            "{{threads}}, "
+            "{{thread_m_blocks}}, "
+            "{{thread_n_blocks}}, "
+            "{{thread_k_blocks}}, "
+            "{{'true' if m_block_size_8 else 'false'}}, "
+            "{{stages}}, "
+            "{{group_blocks}}, "
+            "{{'true' if is_zp_float else 'false'}}>"
+            "( MARLIN_KERNEL_PARAMS );")

 # int8 with zero point case (vllm::kU8) is also supported,
 # we don't add it to reduce wheel size.
 SCALAR_TYPES = [
-    "vllm::kU4",
-    "vllm::kU4B8",
-    "vllm::kU8B128",
-    "vllm::kFE4M3fn",
-    "vllm::kFE2M1f",
+    "vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn",
+    "vllm::kFE2M1f"
 ]
 THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]

@@ -63,12 +58,11 @@ def generate_new_kernels():
     all_template_str_list = []

     for group_blocks, m_blocks, thread_configs in itertools.product(
-        GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS
-    ):
+            GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
         # act order case only support gptq-int4 and gptq-int8
         if group_blocks == 0 and scalar_type not in [
-            "vllm::kU4B8",
-            "vllm::kU8B128",
+                "vllm::kU4B8", "vllm::kU8B128"
         ]:
             continue
         if thread_configs[2] == 256:
@@ -44,9 +44,6 @@ __global__ void moe_align_block_size_kernel(

   for (size_t i = tid; i < numel; i += stride) {
     int expert_id = topk_ids[i];
-    if (expert_id >= num_experts) {
-      continue;
-    }
     int warp_idx = expert_id / experts_per_warp;
     int expert_offset = expert_id % experts_per_warp;
     atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
@@ -98,15 +95,12 @@ template <typename scalar_t>
 __global__ void count_and_sort_expert_tokens_kernel(
     const scalar_t* __restrict__ topk_ids,
     int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
-    size_t numel, int32_t num_experts) {
+    size_t numel) {
   const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
   const size_t stride = blockDim.x * gridDim.x;

   for (size_t i = tid; i < numel; i += stride) {
     int32_t expert_id = topk_ids[i];
-    if (expert_id >= num_experts) {
-      continue;
-    }
     int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
     sorted_token_ids[rank_post_pad] = i;
   }
@@ -275,7 +269,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
         sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
             topk_ids.data_ptr<scalar_t>(),
             sorted_token_ids.data_ptr<int32_t>(),
-            cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel(), num_experts);
+            cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
       }
     });
 }
@@ -21,7 +21,6 @@
 #include <c10/cuda/CUDAGuard.h>
 #include "../cuda_compat.h"
 #include "../cub_helpers.h"
-#include "../core/batch_invariant.hpp"

 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
@@ -406,8 +405,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
   using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
   static constexpr int VPT = Constants::VPT;
   static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
-  const bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
-  const int num_warps = batch_invariant_launch ? 32 : (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
+  const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
   const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;

   dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
@@ -328,12 +328,6 @@ void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
                         const std::optional<torch::Tensor>& has_initial_state,
                         const torch::Tensor& ssm_states, int64_t pad_slot_id);

-torch::Tensor dynamic_4bit_int_moe_cpu(
-    torch::Tensor x, torch::Tensor topk_ids, torch::Tensor topk_weights,
-    torch::Tensor w13_packed, torch::Tensor w2_packed, int64_t H, int64_t I,
-    int64_t I2, int64_t group_size, bool apply_router_weight_on_input,
-    int64_t activation_kind);
-
 using fptr_t = int64_t;
 fptr_t init_custom_ar(const std::vector<int64_t>& fake_ipc_ptrs,
                       torch::Tensor& rank_data, int64_t rank,
@@ -23,14 +23,9 @@
 typedef __hip_bfloat162 __nv_bfloat162;
 typedef __hip_bfloat16 __nv_bfloat16;
 typedef __hip_bfloat16_raw __nv_bfloat16_raw;
-#if defined(HIP_FP8_TYPE_OCP)
 typedef __hip_fp8_e4m3 __nv_fp8_e4m3;
 typedef __hip_fp8x4_e4m3 __nv_fp8x4_e4m3;
-#else
-// ROCm 6.2 fallback: only *_fnuz types exist
-typedef __hip_fp8_e4m3_fnuz __nv_fp8_e4m3;
-typedef __hip_fp8x4_e4m3_fnuz __nv_fp8x4_e4m3;
-#endif
 #endif

 #include "core/registration.h"
@@ -370,6 +365,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
   int32_t compute_pipeline_offset_64 = 0;

   for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
+    __nv_bfloat16 y_max_bf16 = EPS;
     __nv_bfloat162 results_bf162[2];

     cp_async_wait<NUM_STAGES - 2>();
@@ -409,7 +405,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
     auto _y_max2 =
         __hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));

-    __nv_bfloat16 y_max_bf16 = __hmax(EPS, __hmax(_y_max2.x, _y_max2.y));
+    y_max_bf16 = __hmax(_y_max2.x, _y_max2.y);

     // An entire group is assigned to a single warp, so a simple warp reduce
     // is used.
@@ -231,7 +231,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
   } else {
     cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
         OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
-        Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
+        Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
         cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
         out, a, b, a_scales, b_scales);
   }
@@ -245,7 +245,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
   } else {
     cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
         OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
-        Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
+        Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
         cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
         out, a, b, a_scales, b_scales);
   }
@@ -259,7 +259,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
   } else {
     cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
         OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
-        Shape<_2, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized2Sm,
+        Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm,
         cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
         out, a, b, a_scales, b_scales);
   }
@@ -271,10 +271,10 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
     // TMA epilogue isn't compatible with Swap A/B
     cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
         OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>,
-        Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
+        Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
         cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
         out, a, b, a_scales, b_scales);
   }
 }

 }  // namespace vllm
@@ -25,10 +25,7 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
     if constexpr (!std::is_same_v<Int8Func, std::nullptr_t>) {
       int8_func(c, a, b, a_scales, b_scales, bias);
     } else {
-      int32_t version_num = get_sm_version_num();
-      TORCH_CHECK(
-          false, "Int8 not supported on SM", version_num,
-          ". Use FP8 quantization instead, or run on older arch (SM < 100).");
+      TORCH_CHECK(false, "Int8 not supported for this architecture");
     }
   }
 } else {
@@ -133,4 +133,4 @@ void cutlass_scaled_mm_sm100_fp8_epilogue(torch::Tensor& out,
 }
 }

 }  // namespace vllm
@@ -67,9 +67,8 @@ void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
                              std::optional<torch::Tensor> const& bias);
 #endif

 #if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 || \
-    defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100 || \
-    defined(ENABLE_SCALED_MM_SM120) && ENABLE_SCALED_MM_SM120
+    defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100
 void get_cutlass_moe_mm_data_caller(
     const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
@@ -254,7 +253,7 @@ void cutlass_moe_mm(
     bool per_act_token, bool per_out_ch) {
   int32_t version_num = get_sm_version_num();
 #if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
-  if (version_num >= 100 && version_num < 110) {
+  if (version_num >= 100) {
     cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                          expert_offsets, problem_sizes, a_strides, b_strides,
                          c_strides, per_act_token, per_out_ch);
@@ -262,7 +261,7 @@ void cutlass_moe_mm(
   }
 #endif
 #if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
-  if (version_num >= 90 && version_num < 100) {
+  if (version_num >= 90) {
     cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                         expert_offsets, problem_sizes, a_strides, b_strides,
                         c_strides, per_act_token, per_out_ch);
@@ -26,7 +26,6 @@
 #include "dispatch_utils.h"

 #include "cuda_utils.h"
-#include "launch_bounds_utils.h"
 #include "nvfp4_utils.cuh"

 namespace vllm {
@@ -64,7 +63,7 @@ __inline__ __device__ PackedVec<Type> compute_silu_mul(PackedVec<Type>& vec,

 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false>
-__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
+__global__ void __launch_bounds__(1024, 4)
     silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
                              float const* SFScale, uint32_t* out,
                              uint32_t* SFout) {
@@ -132,8 +131,7 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output,  // [..., d]
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
   dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
-  int const numBlocksPerSM =
-      vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
+  int const numBlocksPerSM = 2048 / block.x;
   dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));

   VLLM_DISPATCH_HALF_TYPES(
@@ -14,8 +14,6 @@
  * limitations under the License.
  */

-#include "core/registration.h"
-
 #include <torch/all.h>
 #include <cutlass/arch/arch.h>

@@ -420,7 +418,3 @@ void cutlass_fp4_group_mm(
       "12.8 or above.");
 #endif
 }
-
-TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
-  m.impl("cutlass_fp4_group_mm", &cutlass_fp4_group_mm);
-}
@@ -26,13 +26,12 @@
 #include "dispatch_utils.h"

 #include "nvfp4_utils.cuh"
-#include "launch_bounds_utils.h"

 namespace vllm {

 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
-__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
+__global__ void __launch_bounds__(512, 4)
     cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
                     float const* SFScale, uint32_t* out, uint32_t* SFout,
                     uint32_t* input_offset_by_experts,
@@ -130,7 +129,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))

 // Kernel for LARGE_M_TOPK = true (large m_topk optimized version)
 template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
-__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
+__global__ void __launch_bounds__(1024, 4)
     cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
                     float const* SFScale, uint32_t* out, uint32_t* SFout,
                     uint32_t* input_offset_by_experts,
@@ -234,9 +233,8 @@ void quant_impl(void* output, void* output_scale, void* input,
   int const workSizePerRow = k / ELTS_PER_THREAD;
   int const totalWorkSize = m_topk * workSizePerRow;
   dim3 block(std::min(workSizePerRow, 512));
-  // Get number of blocks per SM
-  int const numBlocksPerSM =
-      vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
+  // Get number of blocks per SM (assume we can fully utilize the SM).
+  int const numBlocksPerSM = 2048 / block.x;
   dim3 grid(std::min(static_cast<int>((totalWorkSize + block.x - 1) / block.x),
                      multiProcessorCount * numBlocksPerSM));
   while (grid.x <= multiProcessorCount && block.x > 64) {
@@ -26,14 +26,13 @@
 #include "dispatch_utils.h"

 #include "cuda_utils.h"
-#include "launch_bounds_utils.h"
 #include "nvfp4_utils.cuh"

 namespace vllm {

 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false>
-__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
+__global__ void __launch_bounds__(512, 4)
     cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
                     float const* SFScale, uint32_t* out, uint32_t* SFout) {
   using PackedVec = PackedVec<Type>;
@@ -76,9 +75,8 @@ void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale,
   // Grid, Block size.
   // Each thread converts 8 values.
   dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
-  // Get number of blocks per SM
-  int const numBlocksPerSM =
-      vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
+  // Get number of blocks per SM (assume we can fully utilize the SM).
+  int const numBlocksPerSM = 2048 / block.x;
   dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));

   // Launch the cvt kernel.
@@ -576,17 +576,6 @@ __inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) {
       TORCH_CHECK(false, \
                   "Unsupported input type of kv cache: ", SRC_DTYPE); \
     } \
-  } else if (KV_DTYPE == "fp8_ds_mla") { \
-    if (SRC_DTYPE == at::ScalarType::Float) { \
-      FN(float, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); \
-    } else if (SRC_DTYPE == at::ScalarType::Half) { \
-      FN(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); \
-    } else if (SRC_DTYPE == at::ScalarType::BFloat16) { \
-      FN(__nv_bfloat16, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); \
-    } else { \
-      TORCH_CHECK(false, \
-                  "Unsupported input type of kv cache: ", SRC_DTYPE); \
-    } \
   } else { \
     TORCH_CHECK(false, "Unsupported data type of kv cache: ", KV_DTYPE); \
   } \
@@ -12,8 +12,8 @@
 #include "../vectorization_utils.cuh"
 #include "../../dispatch_utils.h"

-__device__ __forceinline__ float GroupReduceMax(float val) {
-  unsigned mask = threadIdx.x % 32 >= 16 ? 0xffff0000 : 0x0000ffff;
+__device__ __forceinline__ float GroupReduceMax(float val, const int tid) {
+  unsigned mask = 0xffff;

   val = fmaxf(val, __shfl_xor_sync(mask, val, 8));
   val = fmaxf(val, __shfl_xor_sync(mask, val, 4));
@@ -86,7 +86,7 @@ __global__ void per_token_group_quant_8bit_kernel(
         threads_per_group,  // stride in group
         scalar_op_cache);   // scalar handler

-  local_absmax = GroupReduceMax(local_absmax);
+  local_absmax = GroupReduceMax(local_absmax, lane_id);

   float y_s = local_absmax / max_8bit;
   if constexpr (SCALE_UE8M0) {
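The `GroupReduceMax` hunk above swaps a half-warp-dependent mask for a fixed `0xffff` mask. Below is a self-contained demo of the same 16-lane butterfly max reduction using the half-warp mask variant; the kernel name, host driver, and input values are hypothetical and only illustrate the shuffle pattern:

```cpp
#include <cstdio>
#include <cuda_runtime.h>

__device__ float group_reduce_max_16(float val) {
  // Each half-warp only exchanges values with its own 16 lanes, so the sync
  // mask names exactly those lanes.
  unsigned mask = (threadIdx.x % 32) >= 16 ? 0xffff0000u : 0x0000ffffu;
  val = fmaxf(val, __shfl_xor_sync(mask, val, 8));
  val = fmaxf(val, __shfl_xor_sync(mask, val, 4));
  val = fmaxf(val, __shfl_xor_sync(mask, val, 2));
  val = fmaxf(val, __shfl_xor_sync(mask, val, 1));
  return val;  // every lane in the 16-lane group now holds the group max
}

__global__ void demo(const float* in, float* out) {
  float v = group_reduce_max_16(in[threadIdx.x]);
  if (threadIdx.x % 16 == 0) out[threadIdx.x / 16] = v;
}

int main() {
  float h_in[32], h_out[2];
  for (int i = 0; i < 32; ++i)
    h_in[i] = static_cast<float>(i % 16) + (i / 16) * 100.0f;
  float *d_in, *d_out;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(h_out));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  demo<<<1, 32>>>(d_in, d_out);
  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  std::printf("group maxima: %f %f\n", h_out[0], h_out[1]);  // expect 15 and 115
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}
```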
@@ -17,32 +17,28 @@ FILE_HEAD = """
 namespace MARLIN_NAMESPACE_NAME {
 """.strip()

-TEMPLATE = (
-    "template __global__ void Marlin<"
-    "{{scalar_t}}, "
-    "{{w_type_id}}, "
-    "{{s_type_id}}, "
-    "{{threads}}, "
-    "{{thread_m_blocks}}, "
-    "{{thread_n_blocks}}, "
-    "{{thread_k_blocks}}, "
-    "{{'true' if m_block_size_8 else 'false'}}, "
-    "{{stages}}, "
-    "{{group_blocks}}, "
-    "{{'true' if is_zp_float else 'false'}}>"
-    "( MARLIN_KERNEL_PARAMS );"
-)
+TEMPLATE = ("template __global__ void Marlin<"
+            "{{scalar_t}}, "
+            "{{w_type_id}}, "
+            "{{s_type_id}}, "
+            "{{threads}}, "
+            "{{thread_m_blocks}}, "
+            "{{thread_n_blocks}}, "
+            "{{thread_k_blocks}}, "
+            "{{'true' if m_block_size_8 else 'false'}}, "
+            "{{stages}}, "
+            "{{group_blocks}}, "
+            "{{'true' if is_zp_float else 'false'}}>"
+            "( MARLIN_KERNEL_PARAMS );")

 # int8 with zero point case (vllm::kU8) is also supported,
 # we don't add it to reduce wheel size.
 SCALAR_TYPES = [
-    "vllm::kU4",
-    "vllm::kU4B8",
-    "vllm::kU8B128",
-    "vllm::kFE4M3fn",
-    "vllm::kFE2M1f",
+    "vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn",
+    "vllm::kFE2M1f"
 ]
-THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128), (128, 64, 128)]
+THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128),
+                  (128, 64, 128)]

 THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
 # group_blocks:
@@ -63,12 +59,11 @@ def generate_new_kernels():
     all_template_str_list = []

     for group_blocks, m_blocks, thread_configs in itertools.product(
-        GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS
-    ):
+            GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
         # act order case only support gptq-int4 and gptq-int8
         if group_blocks == 0 and scalar_type not in [
-            "vllm::kU4B8",
-            "vllm::kU8B128",
+                "vllm::kU4B8", "vllm::kU8B128"
         ]:
             continue
         if thread_configs[2] == 256:
@@ -98,7 +93,8 @@ def generate_new_kernels():
         c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"

         is_zp_float_list = [False]
-        if dtype == "fp16" and scalar_type == "vllm::kU4" and group_blocks == 4:
+        if dtype == "fp16" and scalar_type == "vllm::kU4" and \
+                group_blocks == 4:
             # HQQ (is_zp_float = true) only supports
             # 4bit quantization and fp16
             is_zp_float_list.append(True)
@@ -12,21 +12,20 @@ from functools import reduce
 from typing import Optional, Union

 import jinja2
-from vllm_cutlass_library_extension import (
-    DataType,
-    EpilogueScheduleTag,
-    EpilogueScheduleType,
-    MixedInputKernelScheduleType,
-    TileSchedulerTag,
-    TileSchedulerType,
-    VLLMDataType,
-    VLLMDataTypeNames,
-    VLLMDataTypeSize,
-    VLLMDataTypeTag,
-    VLLMDataTypeTorchDataTypeTag,
-    VLLMDataTypeVLLMScalarTypeTag,
-    VLLMKernelScheduleTag,
-)
+# yapf conflicts with isort for this block
+# yapf: disable
+from vllm_cutlass_library_extension import (DataType, EpilogueScheduleTag,
+                                            EpilogueScheduleType,
+                                            MixedInputKernelScheduleType,
+                                            TileSchedulerTag,
+                                            TileSchedulerType, VLLMDataType,
+                                            VLLMDataTypeNames,
+                                            VLLMDataTypeSize, VLLMDataTypeTag,
+                                            VLLMDataTypeTorchDataTypeTag,
+                                            VLLMDataTypeVLLMScalarTypeTag,
+                                            VLLMKernelScheduleTag)
+# yapf: enable

 #
 # Generator templating
@@ -287,23 +286,18 @@ def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
     tile_shape = (
         f"{schedule_config.tile_shape_mn[0]}x{schedule_config.tile_shape_mn[1]}"
     )
-    cluster_shape = (
-        f"{schedule_config.cluster_shape_mnk[0]}"
-        + f"x{schedule_config.cluster_shape_mnk[1]}"
-        + f"x{schedule_config.cluster_shape_mnk[2]}"
-    )
-    kernel_schedule = VLLMKernelScheduleTag[schedule_config.kernel_schedule].split(
-        "::"
-    )[-1]
-    epilogue_schedule = EpilogueScheduleTag[schedule_config.epilogue_schedule].split(
-        "::"
-    )[-1]
-    tile_scheduler = TileSchedulerTag[schedule_config.tile_scheduler].split("::")[-1]
+    cluster_shape = (f"{schedule_config.cluster_shape_mnk[0]}" +
+                     f"x{schedule_config.cluster_shape_mnk[1]}" +
+                     f"x{schedule_config.cluster_shape_mnk[2]}")
+    kernel_schedule = VLLMKernelScheduleTag[schedule_config.kernel_schedule]\
+        .split("::")[-1]
+    epilogue_schedule = EpilogueScheduleTag[
+        schedule_config.epilogue_schedule].split("::")[-1]
+    tile_scheduler = TileSchedulerTag[schedule_config.tile_scheduler]\
+        .split("::")[-1]

-    return (
-        f"{tile_shape}_{cluster_shape}_{kernel_schedule}"
-        + f"_{epilogue_schedule}_{tile_scheduler}"
-    )
+    return (f"{tile_shape}_{cluster_shape}_{kernel_schedule}" +
+            f"_{epilogue_schedule}_{tile_scheduler}")


 # mostly unique shorter sch_sig
@@ -322,24 +316,18 @@ def generate_terse_sch_sig(schedule_config: ScheduleConfig) -> str:

 # unique type_name
 def generate_type_signature(kernel_types: TypeConfig):
-    return str(
-        "".join(
-            [
-                VLLMDataTypeNames[getattr(kernel_types, field.name)]
-                for field in fields(TypeConfig)
-            ]
-        )
-    )
+    return str("".join([
+        VLLMDataTypeNames[getattr(kernel_types, field.name)]
+        for field in fields(TypeConfig)
+    ]))


 def generate_type_option_name(kernel_types: TypeConfig):
-    return ", ".join(
-        [
-            f"{field.name.replace('b_', 'with_') + '_type'}="
-            + VLLMDataTypeNames[getattr(kernel_types, field.name)]
-            for field in fields(TypeConfig)
-        ]
-    )
+    return ", ".join([
+        f"{field.name.replace('b_', 'with_')+'_type'}=" +
+        VLLMDataTypeNames[getattr(kernel_types, field.name)]
+        for field in fields(TypeConfig)
+    ])


 def is_power_of_two(n):
@@ -347,6 +335,7 @@ def is_power_of_two(n):


 def to_cute_constant(value: list[int]):
+
     def _to_cute_constant(value: int):
         if is_power_of_two(value):
             return f"_{value}"
@@ -361,11 +350,11 @@ def to_cute_constant(value: list[int]):

 def unique_schedules(impl_configs: list[ImplConfig]):
     # Use dict over set for deterministic ordering
-    return list(
-        {
-            sch: None for impl_config in impl_configs for sch in impl_config.schedules
-        }.keys()
-    )
+    return list({
+        sch: None
+        for impl_config in impl_configs
+        for sch in impl_config.schedules
+    }.keys())


 def unsigned_type_with_bitwidth(num_bits):
@@ -391,7 +380,7 @@ template_globals = {
     "gen_type_sig": generate_type_signature,
     "unique_schedules": unique_schedules,
    "unsigned_type_with_bitwidth": unsigned_type_with_bitwidth,
-    "gen_type_option_name": generate_type_option_name,
+    "gen_type_option_name": generate_type_option_name
 }


@@ -409,28 +398,23 @@ prepack_dispatch_template = create_template(PREPACK_TEMPLATE)
 def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
     sources = []

-    sources.append(
-        (
-            "machete_mm_dispatch",
-            mm_dispatch_template.render(impl_configs=impl_configs),
-        )
-    )
+    sources.append((
+        "machete_mm_dispatch",
+        mm_dispatch_template.render(impl_configs=impl_configs),
+    ))

     prepack_types = []
     for impl_config in impl_configs:
-        convert_type = (
-            impl_config.types.a
-            if impl_config.types.b_group_scale == DataType.void
-            else impl_config.types.b_group_scale
-        )
+        convert_type = impl_config.types.a \
+            if impl_config.types.b_group_scale == DataType.void \
+            else impl_config.types.b_group_scale
         prepack_types.append(
             PrepackTypeConfig(
                 a=impl_config.types.a,
                 b_num_bits=VLLMDataTypeSize[impl_config.types.b],
                 convert=convert_type,
                 accumulator=impl_config.types.accumulator,
-            )
-        )
+            ))

     def prepacked_type_key(prepack_type: PrepackTypeConfig):
         # For now, we can just use the first accumulator type seen since
@@ -446,14 +430,10 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
             unique_prepack_types.append(prepack_type)
             prepack_types_seen.add(key)

-    sources.append(
-        (
-            "machete_prepack",
-            prepack_dispatch_template.render(
-                types=unique_prepack_types,
-            ),
-        )
-    )
+    sources.append((
+        "machete_prepack",
+        prepack_dispatch_template.render(types=unique_prepack_types, ),
+    ))

     # Split up impls across files
     num_impls = reduce(lambda x, y: x + len(y.schedules), impl_configs, 0)
@@ -486,12 +466,10 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
         curr_impl_in_file += len(files_impls[-1][-1].schedules)

     for part, file_impls in enumerate(files_impls):
-        sources.append(
-            (
-                f"machete_mm_impl_part{part + 1}",
-                mm_impl_template.render(impl_configs=file_impls),
-            )
-        )
+        sources.append((
+            f"machete_mm_impl_part{part+1}",
+            mm_impl_template.render(impl_configs=file_impls),
+        ))

     return sources

@@ -536,7 +514,8 @@ def generate():
     # For now we use the same heuristic for all types
     # Heuristic is currently tuned for H100s
     default_heuristic = [
-        (cond, ScheduleConfig(*tile_config, **sch_common_params))  # type: ignore
+        (cond, ScheduleConfig(*tile_config,
+                              **sch_common_params))  # type: ignore
         for cond, tile_config in default_tile_heuristic_config.items()
     ]

@@ -562,18 +541,14 @@ def generate():
             a_token_scale=DataType.void,
             out=a,
             accumulator=DataType.f32,
-        )
-        for b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
-        for a in (DataType.f16, DataType.bf16)
-    )
+        ) for b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
+        for a in (DataType.f16, DataType.bf16))

     impl_configs += [
         ImplConfig(x[0], x[1], x[2])
-        for x in zip(
-            GPTQ_kernel_type_configs,
-            itertools.repeat(get_unique_schedules(default_heuristic)),
-            itertools.repeat(default_heuristic),
-        )
+        for x in zip(GPTQ_kernel_type_configs,
+                     itertools.repeat(get_unique_schedules(default_heuristic)),
+                     itertools.repeat(default_heuristic))
     ]

     AWQ_kernel_type_configs = list(
@@ -586,18 +561,14 @@ def generate():
             a_token_scale=DataType.void,
             out=a,
             accumulator=DataType.f32,
-        )
-        for b in (DataType.u4, DataType.u8)
-        for a in (DataType.f16, DataType.bf16)
-    )
+        ) for b in (DataType.u4, DataType.u8)
+        for a in (DataType.f16, DataType.bf16))

     impl_configs += [
         ImplConfig(x[0], x[1], x[2])
-        for x in zip(
-            AWQ_kernel_type_configs,
-            itertools.repeat(get_unique_schedules(default_heuristic)),
-            itertools.repeat(default_heuristic),
-        )
+        for x in zip(AWQ_kernel_type_configs,
+                     itertools.repeat(get_unique_schedules(default_heuristic)),
+                     itertools.repeat(default_heuristic))
     ]

     # TODO: Support W4A8 when ready
@@ -25,12 +25,6 @@
 #include "../attention/dtype_fp8.cuh"
 #include "../quantization/fp8/amd/quant_utils.cuh"

-// ROCm 6.2 compatibility: map OCP fp8 types to FNUZ variants if OCP is absent
-#if !defined(HIP_FP8_TYPE_OCP)
-using __hip_fp8_e4m3 = __hip_fp8_e4m3_fnuz;
-using __hip_fp8_e5m2 = __hip_fp8_e5m2_fnuz;
-#endif
-
 #if defined(__HIPCC__) && \
     (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__))
   #define __HIP__GFX9__
@@ -40,8 +34,7 @@ using __hip_fp8_e5m2 = __hip_fp8_e5m2_fnuz;
   #define __HIP__FP8MFMA__
 #endif

-#if defined(__HIPCC__) && (defined(__gfx1100__) || defined(__gfx1101__) || \
-                           defined(__gfx1150__) || defined(__gfx1151__))
+#if defined(__HIPCC__) && (defined(__gfx1100__) || defined(__gfx1101__))
   #define __HIP__GFX11__
 #endif

@@ -5,14 +5,11 @@
 torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
                     const int64_t rows_per_block);

-torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
-                       const std::optional<at::Tensor>& in_bias,
+torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
                        const int64_t CuCount);

-void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
-               const std::optional<at::Tensor>& in_bias, at::Tensor& out_c,
-               const at::Tensor& scale_a, const at::Tensor& scale_b,
-               const int64_t CuCount);
+void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
+               at::Tensor& scale_a, at::Tensor& scale_b, const int64_t CuCount);

 void paged_attention(
     torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
@@ -292,9 +292,8 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
 __global__ void __launch_bounds__(WvPrGrp* THRDS)
-    wvSplitK_hf_sml_(const int K, const int M, const int Bx, const int By,
-                     const scalar_t* B, const scalar_t* __restrict__ A,
-                     const scalar_t* __restrict__ BIAS, scalar_t* C,
+    wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
+                     const scalar_t* __restrict__ A, scalar_t* C,
                      const int _WvPrGrp, const int CuCount) {
   constexpr int max_lds_len = LDS_SIZE / 2;
 #if defined(__HIP__MI3XX__)
@@ -485,14 +484,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   if (threadIdx.x == 63) {
     for (int n = 0; n < N; n++) {
       for (int i = 0; i < YTILE; i++) {
-        if constexpr (std::is_same_v<scalar_t, half>) {
-          if (BIAS)
-            sum[n][i] += __half2float(BIAS[(m + i) % Bx + (n % By) * M]);
-        } else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
-          if (BIAS)
-            sum[n][i] +=
-                __bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
-        }
+        // if (commitColumn[i]) C[m + i + n * M] = __float2half(sum[n][i]);
         C[m + i + n * M] = __float2s<scalar_t>(sum[n][i]);
       }
     }
@@ -537,9 +529,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   if (threadIdx.x == 63) {
     for (int n = 0; n < N; n++) {
       for (int i = 0; i < YTILE; i++) {
-        if (BIAS)
-          sum4[n][i][0] +=
-              __bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
+        // if (commitColumn[i]) C[n + i + m * N] = __float2half(sum[n][i]);
         C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]);
       }
     }
@@ -551,10 +541,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 #else   // !defined(__HIP__GFX9__) TODO: Add NAVI support
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
-__global__ void wvSplitK_hf_sml_(const int K, const int M, const int Bx,
-                                 const int By, const scalar_t* B,
-                                 const scalar_t* __restrict__ A,
-                                 const scalar_t* __restrict__ BIAS, scalar_t* C,
+__global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
+                                 const scalar_t* __restrict__ A, scalar_t* C,
                                  const int _WvPrGrp, const int CuCount) {
   UNREACHABLE_CODE
 }
@@ -565,9 +553,8 @@ __global__ void wvSplitK_hf_sml_(const int K, const int M, const int Bx,
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
 __global__ void __launch_bounds__(WvPrGrp* THRDS)
-    wvSplitK_hf_(const int K, const int M, const int Bx, const int By,
-                 const scalar_t* B, const scalar_t* __restrict__ A,
-                 const scalar_t* __restrict__ BIAS, scalar_t* C,
+    wvSplitK_hf_(const int K, const int M, const scalar_t* B,
+                 const scalar_t* __restrict__ A, scalar_t* C,
                  const int _WvPrGrp, const int CuCount) {
   constexpr int max_lds_len = LDS_SIZE / 2;
 #if defined(__HIP__MI3XX__)
@@ -785,17 +772,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   if (threadIdx.x == 63) {
     for (int n = 0; n < N; n++) {
       for (int i = 0; i < YTILE; i++) {
-        if (commitColumn[i]) {
-          if constexpr (std::is_same_v<scalar_t, half>) {
-            if (BIAS)
-              sum[n][i] += __half2float(BIAS[(m + i) % Bx + (n % By) * M]);
-          } else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
-            if (BIAS)
-              sum[n][i] +=
-                  __bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
-          }
+        if (commitColumn[i])
           C[m + i + n * M] = __float2s<scalar_t>(sum[n][i]);
-        }
       }
     }
   }
@@ -840,12 +818,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   if (threadIdx.x == 63) {
     for (int n = 0; n < N; n++) {
       for (int i = 0; i < YTILE; i++) {
-        if (commitColumn[i]) {
-          if (BIAS)
-            sum4[n][i][0] +=
-                __bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
-          C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]);
-        }
+        // if (commitColumn[i]) C[n + i + m * N] = __float2half(sum[n][i]);
+        C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]);
       }
     }
   }
@@ -868,10 +842,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 #else   // !defined(__HIP__GFX9__) TODO: Add NAVI support
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
-__global__ void wvSplitK_hf_(const int K, const int M, const int Bx,
-                             const int By, const scalar_t* B,
-                             const scalar_t* __restrict__ A,
-                             const scalar_t* __restrict__ BIAS, scalar_t* C,
+__global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B,
+                             const scalar_t* __restrict__ A, scalar_t* C,
                              const int _WvPrGrp, const int CuCount) {
   UNREACHABLE_CODE
 }
@@ -882,9 +854,8 @@ __global__ void wvSplitK_hf_(const int K, const int M, const int Bx,
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
 __global__ void __launch_bounds__(WvPrGrp* THRDS)
-    wvSplitK_hf_big_(const int K, const int M, const int Bx, const int By,
-                     const scalar_t* B, const scalar_t* __restrict__ A,
-                     const scalar_t* __restrict__ BIAS, scalar_t* C,
+    wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
+                     const scalar_t* __restrict__ A, scalar_t* C,
                      const int _WvPrGrp, const int CuCount) {
   constexpr int max_lds_len = LDS_SIZE / 2;
 #if defined(__HIP__MI3XX__)
@@ -1153,17 +1124,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   if (threadIdx.x == 63) {
     for (int n = 0; n < N; n++) {
      for (int i = 0; i < YTILE; i++) {
-        if (commitColumn[i]) {
-          if constexpr (std::is_same_v<scalar_t, half>) {
-            if (BIAS)
-              sum[n][i] += __half2float(BIAS[(m + i) % Bx + (n % By) * M]);
-          } else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
-            if (BIAS)
-              sum[n][i] +=
-                  __bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
-          }
+        if (commitColumn[i])
           C[m + i + n * M] = __float2s<scalar_t>(sum[n][i]);
-        }
       }
     }
   }
@@ -1204,12 +1166,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   if (threadIdx.x == 63) {
     for (int n = 0; n < N; n++) {
       for (int i = 0; i < YTILE; i++) {
-        if (commitColumn[i]) {
-          if (BIAS)
-            sum4[n][i][0] +=
-                __bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
-          C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]);
-        }
+        // if (commitColumn[i]) C[n + i + m * N] = __float2half(sum[n][i]);
+        C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]);
       }
     }
   }
@@ -1232,10 +1190,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 #else   // !defined(__HIP__GFX9__) TODO: Add NAVI support
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
-__global__ void wvSplitK_hf_big_(const int K, const int M, const int Bx,
-                                 const int By, const scalar_t* B,
-                                 const scalar_t* __restrict__ A,
+__global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
+                                 const scalar_t* __restrict__ A, scalar_t* C,
const scalar_t* __restrict__ BIAS, scalar_t* C,
|
|
||||||
const int _WvPrGrp, const int CuCount) {
|
const int _WvPrGrp, const int CuCount) {
|
||||||
UNREACHABLE_CODE
|
UNREACHABLE_CODE
|
||||||
}
|
}
|
||||||
@ -1270,20 +1226,11 @@ int mindiv(int N, int div1, int div2) {
|
|||||||
return rtn;
|
return rtn;
|
||||||
}
|
}
|
||||||
|
|
||||||
torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
|
||||||
const std::optional<at::Tensor>& in_bias,
|
|
||||||
const int64_t CuCount) {
|
const int64_t CuCount) {
|
||||||
auto M_in = in_a.size(0);
|
auto M_in = in_a.size(0);
|
||||||
auto K_in = in_a.size(1);
|
auto K_in = in_a.size(1);
|
||||||
auto N_in = in_b.size(0);
|
auto N_in = in_b.size(0);
|
||||||
auto Bx_in =
|
|
||||||
(in_bias.has_value() && in_bias->numel() > 0)
|
|
||||||
? (in_bias->sizes().size() == 2) ? in_bias->size(1) : in_bias->size(0)
|
|
||||||
: 1;
|
|
||||||
auto By_in = (in_bias.has_value() && in_bias->numel() > 0 &&
|
|
||||||
in_bias->sizes().size() == 2)
|
|
||||||
? in_bias->size(0)
|
|
||||||
: 1;
|
|
||||||
|
|
||||||
TORCH_CHECK(in_a.dtype() == in_b.dtype());
|
TORCH_CHECK(in_a.dtype() == in_b.dtype());
|
||||||
TORCH_CHECK(K_in % 8 == 0, "k % 8 == 0");
|
TORCH_CHECK(K_in % 8 == 0, "k % 8 == 0");
|
||||||
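The host wrapper above derives the bias extents Bx_in/By_in from the optional tensor's rank before launching the kernels. A minimal Python sketch of the same derivation, assuming a torch.Tensor-like bias (the function name is invented for illustration):

    from typing import Optional
    import torch

    def bias_dims(in_bias: Optional[torch.Tensor]) -> tuple[int, int]:
        # Mirrors the Bx_in / By_in derivation above: Bx is the trailing (column)
        # extent of the bias and By its leading (row) extent; both fall back to 1
        # when no bias is passed.
        if in_bias is None or in_bias.numel() == 0:
            return 1, 1
        if in_bias.dim() == 2:
            return in_bias.size(1), in_bias.size(0)
        return in_bias.size(0), 1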
@ -1307,18 +1254,18 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
|||||||
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
|
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
|
||||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
|
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
|
||||||
wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N> \
|
wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N> \
|
||||||
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
|
<<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \
|
||||||
biasf4, c, __wvPrGrp, CuCount); \
|
CuCount); \
|
||||||
} else if (K_in * N_in <= max_lds_len * 1.2) { \
|
} else if (K_in * N_in <= max_lds_len * 1.2) { \
|
||||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \
|
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \
|
||||||
wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N> \
|
wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N> \
|
||||||
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
|
<<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \
|
||||||
biasf4, c, __wvPrGrp, CuCount); \
|
CuCount); \
|
||||||
} else { \
|
} else { \
|
||||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEb, _WvPrGrp); \
|
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEb, _WvPrGrp); \
|
||||||
wvSplitK_hf_big_<fptype, 64, _YTILEb, _WvPrGrp, 8, _UNRLb, _N> \
|
wvSplitK_hf_big_<fptype, 64, _YTILEb, _WvPrGrp, 8, _UNRLb, _N> \
|
||||||
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
|
<<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \
|
||||||
biasf4, c, __wvPrGrp, CuCount); \
|
CuCount); \
|
||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
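The WVSPLITK macro above picks one of three kernel variants from the LDS footprint of the B operand (max_lds_len is LDS_SIZE / 2 here). A hedged restatement of that dispatch predicate in Python, with invented helper names:

    def pick_wvsplitk_variant(k_in: int, n_in: int, m_in: int,
                              max_lds_len: int, ytile_sml: int) -> str:
        # Illustrative restatement of the WVSPLITK dispatch: the "sml" kernel
        # requires the whole K x N block to fit in LDS and M to be a multiple
        # of its Y tile; the default kernel tolerates ~20% overshoot; anything
        # larger goes to the "big" kernel.
        if k_in * n_in <= max_lds_len and m_in % ytile_sml == 0:
            return "wvSplitK_hf_sml_"
        if k_in * n_in <= max_lds_len * 1.2:
            return "wvSplitK_hf_"
        return "wvSplitK_hf_big_"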
|
|
||||||
@ -1326,10 +1273,6 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
|||||||
using fptype = typename scalar<scalar_t>::type;
|
using fptype = typename scalar<scalar_t>::type;
|
||||||
fptype* af4 = reinterpret_cast<fptype*>(in_a.data_ptr());
|
fptype* af4 = reinterpret_cast<fptype*>(in_a.data_ptr());
|
||||||
const fptype* bf4 = reinterpret_cast<const fptype*>(in_b.data_ptr());
|
const fptype* bf4 = reinterpret_cast<const fptype*>(in_b.data_ptr());
|
||||||
const fptype* biasf4 =
|
|
||||||
(in_bias.has_value() && in_bias->numel() > 0)
|
|
||||||
? reinterpret_cast<const fptype*>(in_bias->data_ptr())
|
|
||||||
: nullptr;
|
|
||||||
fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
|
fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
|
||||||
switch (N_in) {
|
switch (N_in) {
|
||||||
case 1:
|
case 1:
|
||||||
@ -1357,9 +1300,8 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
|||||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||||
int A_CHUNK, int UNRL, int N>
|
int A_CHUNK, int UNRL, int N>
|
||||||
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||||
wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, const int Bx,
|
wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, const fp8_t* B,
|
||||||
const int By, const fp8_t* B, const fp8_t* __restrict__ A,
|
const fp8_t* __restrict__ A, scalar_t* C,
|
||||||
const scalar_t* __restrict__ BIAS, scalar_t* C,
|
|
||||||
const float* __restrict__ s_A,
|
const float* __restrict__ s_A,
|
||||||
const float* __restrict__ s_B, const int _WvPrGrp,
|
const float* __restrict__ s_B, const int _WvPrGrp,
|
||||||
const int CuCount) {
|
const int CuCount) {
|
||||||
@ -1511,17 +1453,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
for (int n = 0; n < N; n++) {
|
for (int n = 0; n < N; n++) {
|
||||||
for (int y = 0; y < YTILE; y++) {
|
for (int y = 0; y < YTILE; y++) {
|
||||||
if (y + m >= M) break; // To avoid mem access fault.
|
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0] * sA * sB);
|
||||||
sum[n][y][0] *= sA * sB;
|
|
||||||
if constexpr (std::is_same_v<scalar_t, half>) {
|
|
||||||
if (BIAS)
|
|
||||||
sum[n][y][0] += __half2float(BIAS[(m + y) % Bx + (n % By) * M]);
|
|
||||||
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
|
|
||||||
if (BIAS)
|
|
||||||
sum[n][y][0] +=
|
|
||||||
__bfloat162float(BIAS[(m + y) % Bx + (n % By) * M]);
|
|
||||||
}
|
|
||||||
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0]); // * sA * sB);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
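In this fp8 epilogue the bias-aware side dequantizes the accumulator with both scales first and only then adds the bias, while the other side folds the scaling into the final store. A small sketch of that ordering (plain Python, names illustrative):

    def fp8_epilogue(acc: float, s_a: float, s_b: float, bias: float | None) -> float:
        # Sketch of the bias-aware ordering: dequantize the fp8 accumulator
        # with both tensor scales first, then add the full-precision bias.
        out = acc * s_a * s_b
        if bias is not None:
            out += bias
        return out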
@ -1533,9 +1465,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||||
int A_CHUNK, int UNRL, int N>
|
int A_CHUNK, int UNRL, int N>
|
||||||
__global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
|
__global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
|
||||||
const int Bx, const int By, const fp8_t* B,
|
const fp8_t* B, const fp8_t* __restrict__ A,
|
||||||
const fp8_t* __restrict__ A,
|
|
||||||
const scalar_t* __restrict__ BIAS,
|
|
||||||
scalar_t* C, const float* __restrict__ s_A,
|
scalar_t* C, const float* __restrict__ s_A,
|
||||||
const float* __restrict__ s_B,
|
const float* __restrict__ s_B,
|
||||||
const int _WvPrGrp, const int CuCount) {
|
const int _WvPrGrp, const int CuCount) {
|
||||||
@ -1547,9 +1477,8 @@ __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
|
|||||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||||
int A_CHUNK, int UNRL, int N>
|
int A_CHUNK, int UNRL, int N>
|
||||||
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||||
wvSplitKQ_hf_(const int K, const int Kp, const int M, const int Bx,
|
wvSplitKQ_hf_(const int K, const int Kp, const int M, const fp8_t* B,
|
||||||
const int By, const fp8_t* B, const fp8_t* __restrict__ A,
|
const fp8_t* __restrict__ A, scalar_t* C,
|
||||||
const scalar_t* __restrict__ BIAS, scalar_t* C,
|
|
||||||
const float* __restrict__ s_A, const float* __restrict__ s_B,
|
const float* __restrict__ s_A, const float* __restrict__ s_B,
|
||||||
const int _WvPrGrp, const int CuCount) {
|
const int _WvPrGrp, const int CuCount) {
|
||||||
constexpr int max_lds_len = LDS_SIZE;
|
constexpr int max_lds_len = LDS_SIZE;
|
||||||
@ -1697,16 +1626,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
for (int n = 0; n < N; n++) {
|
for (int n = 0; n < N; n++) {
|
||||||
for (int y = 0; y < YTILE; y++) {
|
for (int y = 0; y < YTILE; y++) {
|
||||||
if (y + m >= M) break; // To avoid mem access fault.
|
if (y + m >= M) break; // To avoid mem access fault.
|
||||||
sum[n][y][0] *= sA * sB;
|
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0] * sA * sB);
|
||||||
if constexpr (std::is_same_v<scalar_t, half>) {
|
|
||||||
if (BIAS)
|
|
||||||
sum[n][y][0] += __half2float(BIAS[(m + y) % Bx + (n % By) * M]);
|
|
||||||
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
|
|
||||||
if (BIAS)
|
|
||||||
sum[n][y][0] +=
|
|
||||||
__bfloat162float(BIAS[(m + y) % Bx + (n % By) * M]);
|
|
||||||
}
|
|
||||||
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0]);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1718,19 +1638,16 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||||
int A_CHUNK, int UNRL, int N>
|
int A_CHUNK, int UNRL, int N>
|
||||||
__global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
|
__global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
|
||||||
const int Bx, const int By, const fp8_t* B,
|
const fp8_t* B, const fp8_t* __restrict__ A,
|
||||||
const fp8_t* __restrict__ A,
|
scalar_t* C, const float* __restrict__ s_A,
|
||||||
const scalar_t* __restrict__ BIAS, scalar_t* C,
|
|
||||||
const float* __restrict__ s_A,
|
|
||||||
const float* __restrict__ s_B, const int _WvPrGrp,
|
const float* __restrict__ s_B, const int _WvPrGrp,
|
||||||
const int CuCount) {
|
const int CuCount) {
|
||||||
UNREACHABLE_CODE
|
UNREACHABLE_CODE
|
||||||
}
|
}
|
||||||
#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support
|
#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||||
|
|
||||||
void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
|
void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
|
||||||
const std::optional<at::Tensor>& in_bias, at::Tensor& out_c,
|
at::Tensor& scale_a, at::Tensor& scale_b,
|
||||||
const at::Tensor& scale_a, const at::Tensor& scale_b,
|
|
||||||
const int64_t CuCount) {
|
const int64_t CuCount) {
|
||||||
static c10::ScalarType kFp8Type = is_fp8_ocp()
|
static c10::ScalarType kFp8Type = is_fp8_ocp()
|
||||||
? c10::ScalarType::Float8_e4m3fn
|
? c10::ScalarType::Float8_e4m3fn
|
||||||
@ -1739,15 +1656,6 @@ void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
|
|||||||
auto K_in = in_a.size(1);
|
auto K_in = in_a.size(1);
|
||||||
auto N_in = in_b.size(0);
|
auto N_in = in_b.size(0);
|
||||||
auto Kp_in = in_a.stride(0);
|
auto Kp_in = in_a.stride(0);
|
||||||
auto Bx_in =
|
|
||||||
(in_bias.has_value() && in_bias->numel() > 0)
|
|
||||||
? (in_bias->sizes().size() == 2) ? in_bias->size(1) : in_bias->size(0)
|
|
||||||
: 1;
|
|
||||||
auto By_in = (in_bias.has_value() && in_bias->numel() > 0 &&
|
|
||||||
in_bias->sizes().size() == 2)
|
|
||||||
? in_bias->size(0)
|
|
||||||
: 1;
|
|
||||||
|
|
||||||
TORCH_CHECK(K_in % 16 == 0, "k % 16 == 0");
|
TORCH_CHECK(K_in % 16 == 0, "k % 16 == 0");
|
||||||
TORCH_CHECK(in_a.dtype() == in_b.dtype() && in_a.dtype() == kFp8Type);
|
TORCH_CHECK(in_a.dtype() == in_b.dtype() && in_a.dtype() == kFp8Type);
|
||||||
TORCH_CHECK(out_c.dtype() == torch::kFloat16 ||
|
TORCH_CHECK(out_c.dtype() == torch::kFloat16 ||
|
||||||
@ -1765,15 +1673,13 @@ void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
|
|||||||
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
|
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
|
||||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
|
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
|
||||||
wvSplitKQ_hf_sml_<fptype, fp8_t, 64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N> \
|
wvSplitKQ_hf_sml_<fptype, fp8_t, 64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N> \
|
||||||
<<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, Bx_in, By_in, a_ptr, \
|
<<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \
|
||||||
b_ptr, bias_ptr, c_ptr, s_a, s_b, \
|
s_a, s_b, __wvPrGrp, CuCount); \
|
||||||
__wvPrGrp, CuCount); \
|
|
||||||
} else { \
|
} else { \
|
||||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \
|
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \
|
||||||
wvSplitKQ_hf_<fptype, fp8_t, 64, _YTILEm, _WvPrGrp, 16, _UNRLm, _N> \
|
wvSplitKQ_hf_<fptype, fp8_t, 64, _YTILEm, _WvPrGrp, 16, _UNRLm, _N> \
|
||||||
<<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, Bx_in, By_in, a_ptr, \
|
<<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \
|
||||||
b_ptr, bias_ptr, c_ptr, s_a, s_b, \
|
s_a, s_b, __wvPrGrp, CuCount); \
|
||||||
__wvPrGrp, CuCount); \
|
|
||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1785,9 +1691,6 @@ void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
|
|||||||
VLLM_DISPATCH_FP8_TYPES(in_a.scalar_type(), "wvSplitKQ", [&] {
|
VLLM_DISPATCH_FP8_TYPES(in_a.scalar_type(), "wvSplitKQ", [&] {
|
||||||
auto a_ptr = in_a.data_ptr<fp8_t>();
|
auto a_ptr = in_a.data_ptr<fp8_t>();
|
||||||
auto b_ptr = in_b.data_ptr<fp8_t>();
|
auto b_ptr = in_b.data_ptr<fp8_t>();
|
||||||
auto bias_ptr = (in_bias.has_value() && in_bias->numel() > 0)
|
|
||||||
? reinterpret_cast<fptype*>(in_bias->data_ptr())
|
|
||||||
: nullptr;
|
|
||||||
switch (N_in) {
|
switch (N_in) {
|
||||||
case 1:
|
case 1:
|
||||||
WVSPLITKQ(16, 2, 2, 2, 2, 2, 2, 1)
|
WVSPLITKQ(16, 2, 2, 2, 2, 2, 2, 1)
|
||||||
|
|||||||
@ -22,14 +22,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
|
|||||||
|
|
||||||
// Custom gemm op for skinny matrix-matrix multiplication
|
// Custom gemm op for skinny matrix-matrix multiplication
|
||||||
rocm_ops.def(
|
rocm_ops.def(
|
||||||
"wvSplitK(Tensor in_a, Tensor in_b, Tensor? in_bias, int CuCount) -> "
|
"wvSplitK(Tensor in_a, Tensor in_b, int CuCount) -> "
|
||||||
"Tensor");
|
"Tensor");
|
||||||
rocm_ops.impl("wvSplitK", torch::kCUDA, &wvSplitK);
|
rocm_ops.impl("wvSplitK", torch::kCUDA, &wvSplitK);
|
||||||
|
|
||||||
// wvSplitK for fp8
|
// wvSplitK for fp8
|
||||||
rocm_ops.def(
|
rocm_ops.def(
|
||||||
"wvSplitKQ(Tensor in_a, Tensor in_b, Tensor? in_bias, Tensor! out_c, "
|
"wvSplitKQ(Tensor in_a, Tensor in_b, Tensor! out_c, Tensor scale_a, "
|
||||||
"Tensor scale_a, "
|
|
||||||
" Tensor scale_b, int CuCount) -> ()");
|
" Tensor scale_b, int CuCount) -> ()");
|
||||||
rocm_ops.impl("wvSplitKQ", torch::kCUDA, &wvSplitKQ);
|
rocm_ops.impl("wvSplitKQ", torch::kCUDA, &wvSplitKQ);
|
||||||
|
|
||||||
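With the two schemas above, the op is callable from Python through torch.ops once the extension is loaded. A hedged sketch of both calling conventions; the `_rocm_C` namespace is an assumption that depends on how TORCH_EXTENSION_NAME is defined at build time, and only one of the two schemas exists in a given build:

    import torch

    def call_wvsplitk(in_a, in_b, in_bias, cu_count, bias_schema: bool):
        # Assumed namespace; pick the call matching the registered schema.
        if bias_schema:
            return torch.ops._rocm_C.wvSplitK(in_a, in_b, in_bias, cu_count)
        return torch.ops._rocm_C.wvSplitK(in_a, in_b, cu_count)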
|
|||||||
@ -397,7 +397,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
" Tensor a_blockscale, Tensor b_blockscales, Tensor alphas,"
|
" Tensor a_blockscale, Tensor b_blockscales, Tensor alphas,"
|
||||||
" Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()",
|
" Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()",
|
||||||
{stride_tag});
|
{stride_tag});
|
||||||
// conditionally compiled so impl registration is in source file
|
ops.impl("cutlass_fp4_group_mm", torch::kCUDA, &cutlass_fp4_group_mm);
|
||||||
|
|
||||||
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
|
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
|
||||||
// quantization, as well as bias
|
// quantization, as well as bias
|
||||||
@ -713,13 +713,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
|||||||
"cp_gather_cache(Tensor src_cache, Tensor! dst, Tensor block_table, "
|
"cp_gather_cache(Tensor src_cache, Tensor! dst, Tensor block_table, "
|
||||||
"Tensor cu_seq_lens, int batch_size, Tensor? seq_starts) -> ()");
|
"Tensor cu_seq_lens, int batch_size, Tensor? seq_starts) -> ()");
|
||||||
cache_ops.impl("cp_gather_cache", torch::kCUDA, &cp_gather_cache);
|
cache_ops.impl("cp_gather_cache", torch::kCUDA, &cp_gather_cache);
|
||||||
|
|
||||||
cache_ops.def(
|
|
||||||
"indexer_k_quant_and_cache(Tensor k, Tensor! kv_cache, Tensor "
|
|
||||||
"slot_mapping, "
|
|
||||||
"int quant_block_size, str kv_cache_dtype) -> ()");
|
|
||||||
cache_ops.impl("indexer_k_quant_and_cache", torch::kCUDA,
|
|
||||||
&indexer_k_quant_and_cache);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
|
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
|
||||||
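For reference, the indexer_k_quant_and_cache schema registered on one side of this hunk would be invoked roughly as below; the `_C_cache_ops` namespace, block size, and dtype string are assumptions for illustration, not taken from this diff:

    import torch

    def cache_indexer_k(k, kv_cache, slot_mapping,
                        quant_block_size: int = 128,
                        kv_cache_dtype: str = "fp8_e4m3"):
        # Assumed namespace and example argument values; only the schema
        # itself comes from the registration above.
        torch.ops._C_cache_ops.indexer_k_quant_and_cache(
            k, kv_cache, slot_mapping, quant_block_size, kv_cache_dtype)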
|
|||||||
@ -14,11 +14,6 @@ ARG PYTHON_VERSION=3.12
|
|||||||
#
|
#
|
||||||
# Example:
|
# Example:
|
||||||
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
|
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
|
||||||
|
|
||||||
# Important: We build with an old version of Ubuntu to maintain broad
|
|
||||||
# compatibility with other Linux OSes. The main reason for this is that the
|
|
||||||
# glibc version is baked into the distro, and binaries built with one glibc
|
|
||||||
# version are not backwards compatible with OSes that use an earlier version.
|
|
||||||
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
|
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
|
||||||
# TODO: Restore to base image after FlashInfer AOT wheel fixed
|
# TODO: Restore to base image after FlashInfer AOT wheel fixed
|
||||||
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
|
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
|
||||||
@ -80,19 +75,34 @@ ARG TARGETPLATFORM
|
|||||||
ARG INSTALL_KV_CONNECTORS=false
|
ARG INSTALL_KV_CONNECTORS=false
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
|
||||||
|
ARG DEADSNAKES_MIRROR_URL
|
||||||
|
ARG DEADSNAKES_GPGKEY_URL
|
||||||
ARG GET_PIP_URL
|
ARG GET_PIP_URL
|
||||||
|
|
||||||
# Install system dependencies and uv, then create Python virtual environment
|
# Install Python and other dependencies
|
||||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||||
&& apt-get update -y \
|
&& apt-get update -y \
|
||||||
&& apt-get install -y ccache software-properties-common git curl sudo python3-pip \
|
&& apt-get install -y ccache software-properties-common git curl sudo \
|
||||||
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
|
&& if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
|
||||||
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
|
if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
|
||||||
&& rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
|
mkdir -p -m 0755 /etc/apt/keyrings ; \
|
||||||
&& ln -s /opt/venv/bin/python3 /usr/bin/python3 \
|
curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
|
||||||
&& ln -s /opt/venv/bin/python3-config /usr/bin/python3-config \
|
sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
|
||||||
&& ln -s /opt/venv/bin/pip /usr/bin/pip \
|
echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
|
||||||
|
fi ; \
|
||||||
|
else \
|
||||||
|
for i in 1 2 3; do \
|
||||||
|
add-apt-repository -y ppa:deadsnakes/ppa && break || \
|
||||||
|
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
|
||||||
|
done ; \
|
||||||
|
fi \
|
||||||
|
&& apt-get update -y \
|
||||||
|
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
|
||||||
|
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
||||||
|
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
|
||||||
|
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
|
||||||
|
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
|
||||||
&& python3 --version && python3 -m pip --version
|
&& python3 --version && python3 -m pip --version
|
||||||
|
|
||||||
ARG PIP_INDEX_URL UV_INDEX_URL
|
ARG PIP_INDEX_URL UV_INDEX_URL
|
||||||
@ -101,9 +111,9 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
|
|||||||
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
|
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
|
||||||
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
|
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
|
||||||
|
|
||||||
# Activate virtual environment and add uv to PATH
|
# Install uv for faster pip installs
|
||||||
ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH"
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
ENV VIRTUAL_ENV="/opt/venv"
|
python3 -m pip install uv
|
||||||
|
|
||||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||||
@ -132,7 +142,7 @@ WORKDIR /workspace
|
|||||||
COPY requirements/common.txt requirements/common.txt
|
COPY requirements/common.txt requirements/common.txt
|
||||||
COPY requirements/cuda.txt requirements/cuda.txt
|
COPY requirements/cuda.txt requirements/cuda.txt
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
|
uv pip install --system -r requirements/cuda.txt \
|
||||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||||
|
|
||||||
# cuda arch list used by torch
|
# cuda arch list used by torch
|
||||||
@ -162,7 +172,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
|||||||
ENV UV_LINK_MODE=copy
|
ENV UV_LINK_MODE=copy
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
|
uv pip install --system -r requirements/build.txt \
|
||||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
@ -259,7 +269,7 @@ COPY requirements/lint.txt requirements/lint.txt
|
|||||||
COPY requirements/test.txt requirements/test.txt
|
COPY requirements/test.txt requirements/test.txt
|
||||||
COPY requirements/dev.txt requirements/dev.txt
|
COPY requirements/dev.txt requirements/dev.txt
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
|
uv pip install --system -r requirements/dev.txt \
|
||||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||||
#################### DEV IMAGE ####################
|
#################### DEV IMAGE ####################
|
||||||
|
|
||||||
@ -381,32 +391,19 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
|
|||||||
git clone --depth 1 --recursive --shallow-submodules \
|
git clone --depth 1 --recursive --shallow-submodules \
|
||||||
--branch ${FLASHINFER_GIT_REF} \
|
--branch ${FLASHINFER_GIT_REF} \
|
||||||
${FLASHINFER_GIT_REPO} flashinfer
|
${FLASHINFER_GIT_REPO} flashinfer
|
||||||
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
|
|
||||||
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
|
|
||||||
if [[ "${CUDA_VERSION}" == 11.* ]]; then
|
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
|
|
||||||
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
|
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
|
|
||||||
else
|
|
||||||
# CUDA 12.8+ supports 10.0a and 12.0
|
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
|
|
||||||
fi
|
|
||||||
pushd flashinfer
|
pushd flashinfer
|
||||||
if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ]; then
|
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
|
||||||
# NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh
|
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
|
||||||
echo "🏗️ Installing FlashInfer from pre-compiled wheel"
|
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
|
||||||
uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \
|
if [[ "${CUDA_VERSION}" == 11.* ]]; then
|
||||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
|
||||||
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
|
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
|
||||||
# Download pre-compiled cubins
|
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
|
||||||
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
else
|
||||||
python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
|
# CUDA 12.8+ supports 10.0a and 12.0
|
||||||
|
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
|
||||||
fi
|
fi
|
||||||
elif [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
|
|
||||||
echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
|
echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
|
||||||
export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
|
|
||||||
# HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
|
|
||||||
uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) nvidia-nvshmem-cu$(echo $CUDA_VERSION | cut -d. -f1)
|
|
||||||
# Build AOT kernels
|
# Build AOT kernels
|
||||||
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
||||||
python3 -m flashinfer.aot
|
python3 -m flashinfer.aot
|
||||||
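One side of this hunk moves the arch-list selection inside the AOT-compile branch and adds a pre-compiled-wheel fast path for CUDA 12.8 on linux/amd64, but the CUDA-version-to-arch-list mapping itself is unchanged. A hedged Python restatement of that mapping (version strings assumed to look like "12.8.1"):

    def flashinfer_arch_list(cuda_version: str) -> str:
        # Mirrors the shell gating above; assumes "major.minor[.patch]" strings.
        major, minor = (int(x) for x in cuda_version.split(".")[:2])
        if major == 11:
            return "7.5 8.0 8.9"
        if major == 12 and minor <= 7:
            return "7.5 8.0 8.9 9.0a"
        # CUDA 12.8+ additionally supports 10.0a and 12.0
        return "7.5 8.0 8.9 9.0a 10.0a 12.0"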
@ -446,7 +443,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
ARG DEEPGEMM_GIT_REF
|
ARG DEEPGEMM_GIT_REF
|
||||||
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
|
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}
|
VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}
|
||||||
|
|
||||||
COPY tools/install_gdrcopy.sh install_gdrcopy.sh
|
COPY tools/install_gdrcopy.sh install_gdrcopy.sh
|
||||||
RUN set -eux; \
|
RUN set -eux; \
|
||||||
@ -464,12 +461,6 @@ ENV CUDA_HOME=/usr/local/cuda
|
|||||||
RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a+PTX}" \
|
RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a+PTX}" \
|
||||||
&& bash install_python_libraries.sh
|
&& bash install_python_libraries.sh
|
||||||
|
|
||||||
# CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
|
|
||||||
# return to /usr/local/nvidia in 13.0 to allow container providers to mount drivers
|
|
||||||
# consistently from the host (see https://github.com/vllm-project/vllm/issues/18859).
|
|
||||||
# Until then, add /usr/local/nvidia/lib64 before the image cuda path to allow override.
|
|
||||||
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
|
|
||||||
|
|
||||||
#################### vLLM installation IMAGE ####################
|
#################### vLLM installation IMAGE ####################
|
||||||
|
|
||||||
#################### TEST IMAGE ####################
|
#################### TEST IMAGE ####################
|
||||||
@ -542,7 +533,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
else \
|
else \
|
||||||
BITSANDBYTES_VERSION="0.46.1"; \
|
BITSANDBYTES_VERSION="0.46.1"; \
|
||||||
fi; \
|
fi; \
|
||||||
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3]>=0.14.0'
|
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' boto3 runai-model-streamer runai-model-streamer[s3]
|
||||||
|
|
||||||
ENV VLLM_USAGE_SOURCE production-docker-image
|
ENV VLLM_USAGE_SOURCE production-docker-image
|
||||||
|
|
||||||
@ -555,5 +546,5 @@ ENTRYPOINT ["./sagemaker-entrypoint.sh"]
|
|||||||
|
|
||||||
FROM vllm-openai-base AS vllm-openai
|
FROM vllm-openai-base AS vllm-openai
|
||||||
|
|
||||||
ENTRYPOINT ["vllm", "serve"]
|
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||||
#################### OPENAI API SERVER ####################
|
#################### OPENAI API SERVER ####################
|
||||||
|
|||||||
@ -47,7 +47,7 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
|
|||||||
|
|
||||||
ENV UV_HTTP_TIMEOUT=500
|
ENV UV_HTTP_TIMEOUT=500
|
||||||
|
|
||||||
# Install Python dependencies
|
# Install Python dependencies
|
||||||
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||||
ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||||
@ -104,7 +104,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
--mount=type=cache,target=/root/.cache/ccache \
|
--mount=type=cache,target=/root/.cache/ccache \
|
||||||
--mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
|
--mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
|
||||||
--mount=type=bind,source=.git,target=.git \
|
--mount=type=bind,source=.git,target=.git \
|
||||||
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
|
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
|
||||||
|
|
||||||
######################### TEST DEPS #########################
|
######################### TEST DEPS #########################
|
||||||
FROM base AS vllm-test-deps
|
FROM base AS vllm-test-deps
|
||||||
@ -114,10 +114,13 @@ WORKDIR /workspace/vllm
|
|||||||
RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
|
RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
|
||||||
cp requirements/test.in requirements/cpu-test.in && \
|
cp requirements/test.in requirements/cpu-test.in && \
|
||||||
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
|
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
|
||||||
|
sed -i 's/^torch==.*/torch==2.6.0/g' requirements/cpu-test.in && \
|
||||||
|
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
|
||||||
|
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
|
||||||
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
|
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install -r requirements/cpu-test.txt
|
uv pip install -r requirements/cpu-test.txt
|
||||||
|
|
||||||
######################### DEV IMAGE #########################
|
######################### DEV IMAGE #########################
|
||||||
FROM vllm-build AS vllm-dev
|
FROM vllm-build AS vllm-dev
|
||||||
@ -130,12 +133,12 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
|||||||
|
|
||||||
# install development dependencies (for testing)
|
# install development dependencies (for testing)
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install -e tests/vllm_test_utils
|
uv pip install -e tests/vllm_test_utils
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
--mount=type=cache,target=/root/.cache/ccache \
|
--mount=type=cache,target=/root/.cache/ccache \
|
||||||
--mount=type=bind,source=.git,target=.git \
|
--mount=type=bind,source=.git,target=.git \
|
||||||
VLLM_TARGET_DEVICE=cpu python3 setup.py develop
|
VLLM_TARGET_DEVICE=cpu python3 setup.py develop
|
||||||
|
|
||||||
COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt
|
COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt
|
||||||
|
|
||||||
@ -160,12 +163,11 @@ ADD ./benchmarks/ ./benchmarks/
|
|||||||
ADD ./vllm/collect_env.py .
|
ADD ./vllm/collect_env.py .
|
||||||
ADD ./.buildkite/ ./.buildkite/
|
ADD ./.buildkite/ ./.buildkite/
|
||||||
|
|
||||||
# Create symlink for vllm-workspace to maintain CI compatibility
|
|
||||||
RUN ln -sf /workspace /vllm-workspace
|
|
||||||
|
|
||||||
# install development dependencies (for testing)
|
# install development dependencies (for testing)
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install -e tests/vllm_test_utils
|
uv pip install -e tests/vllm_test_utils
|
||||||
|
|
||||||
|
ENTRYPOINT ["bash"]
|
||||||
|
|
||||||
######################### RELEASE IMAGE #########################
|
######################### RELEASE IMAGE #########################
|
||||||
FROM base AS vllm-openai
|
FROM base AS vllm-openai
|
||||||
@ -177,4 +179,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
|
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
|
||||||
uv pip install dist/*.whl
|
uv pip install dist/*.whl
|
||||||
|
|
||||||
ENTRYPOINT ["vllm", "serve"]
|
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||||
|
|||||||
@ -6,7 +6,7 @@ ARG CUDA_VERSION=12.8.0
|
|||||||
#
|
#
|
||||||
#################### BASE BUILD IMAGE ####################
|
#################### BASE BUILD IMAGE ####################
|
||||||
# prepare basic build environment
|
# prepare basic build environment
|
||||||
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS base
|
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
|
||||||
ARG CUDA_VERSION=12.8.0
|
ARG CUDA_VERSION=12.8.0
|
||||||
ARG PYTHON_VERSION=3.12
|
ARG PYTHON_VERSION=3.12
|
||||||
ARG TARGETPLATFORM
|
ARG TARGETPLATFORM
|
||||||
|
|||||||
@ -314,4 +314,4 @@ WORKDIR /workspace/
|
|||||||
|
|
||||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||||
|
|
||||||
ENTRYPOINT ["vllm", "serve"]
|
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
|
||||||
|
|||||||
@ -29,10 +29,7 @@ ARG VLLM_BRANCH="main"
|
|||||||
ONBUILD RUN git clone ${VLLM_REPO} \
|
ONBUILD RUN git clone ${VLLM_REPO} \
|
||||||
&& cd vllm \
|
&& cd vllm \
|
||||||
&& git fetch -v --prune -- origin ${VLLM_BRANCH} \
|
&& git fetch -v --prune -- origin ${VLLM_BRANCH} \
|
||||||
&& git checkout FETCH_HEAD \
|
&& git checkout FETCH_HEAD
|
||||||
&& if [ ${VLLM_REPO} != "https://github.com/vllm-project/vllm.git" ] ; then \
|
|
||||||
git remote add upstream "https://github.com/vllm-project/vllm.git" \
|
|
||||||
&& git fetch upstream ; fi
|
|
||||||
FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
|
FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
|
||||||
|
|
||||||
# -----------------------
|
# -----------------------
|
||||||
|
|||||||
@ -1,23 +1,25 @@
|
|||||||
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
|
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.4.1-complete
|
||||||
ARG TRITON_BRANCH="f9e5bf54"
|
ARG HIPBLASLT_BRANCH="aa0bda7b"
|
||||||
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
|
ARG HIPBLAS_COMMON_BRANCH="9b80ba8e"
|
||||||
ARG PYTORCH_BRANCH="b2fb6885"
|
ARG LEGACY_HIPBLASLT_OPTION=
|
||||||
ARG PYTORCH_VISION_BRANCH="v0.23.0"
|
ARG TRITON_BRANCH="e5be006"
|
||||||
|
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
|
||||||
|
ARG PYTORCH_BRANCH="f717b2af"
|
||||||
|
ARG PYTORCH_VISION_BRANCH="v0.21.0"
|
||||||
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
|
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
|
||||||
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
|
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
|
||||||
ARG FA_BRANCH="0e60e394"
|
ARG FA_BRANCH="1a7f4dfa"
|
||||||
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
|
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
|
||||||
ARG AITER_BRANCH="2ab9f4cd"
|
ARG AITER_BRANCH="4822e675"
|
||||||
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
|
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
|
||||||
|
|
||||||
FROM ${BASE_IMAGE} AS base
|
FROM ${BASE_IMAGE} AS base
|
||||||
|
|
||||||
ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
|
ENV PATH=/opt/rocm/llvm/bin:$PATH
|
||||||
ENV ROCM_PATH=/opt/rocm
|
ENV ROCM_PATH=/opt/rocm
|
||||||
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
|
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
|
||||||
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151
|
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201
|
||||||
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
|
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
|
||||||
ENV AITER_ROCM_ARCH=gfx942;gfx950
|
|
||||||
|
|
||||||
ARG PYTHON_VERSION=3.12
|
ARG PYTHON_VERSION=3.12
|
||||||
|
|
||||||
@ -43,6 +45,29 @@ RUN apt-get update -y \
|
|||||||
|
|
||||||
RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
|
RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
|
||||||
|
|
||||||
|
FROM base AS build_hipblaslt
|
||||||
|
ARG HIPBLASLT_BRANCH
|
||||||
|
ARG HIPBLAS_COMMON_BRANCH
|
||||||
|
# Set to "--legacy_hipblas_direct" for ROCm<=6.2
|
||||||
|
ARG LEGACY_HIPBLASLT_OPTION
|
||||||
|
RUN git clone https://github.com/ROCm/hipBLAS-common.git
|
||||||
|
RUN apt-get remove -y hipblaslt && apt-get autoremove -y && apt-get autoclean -y
|
||||||
|
RUN cd hipBLAS-common \
|
||||||
|
&& git checkout ${HIPBLAS_COMMON_BRANCH} \
|
||||||
|
&& mkdir build \
|
||||||
|
&& cd build \
|
||||||
|
&& cmake .. \
|
||||||
|
&& make package \
|
||||||
|
&& dpkg -i ./*.deb
|
||||||
|
RUN git clone https://github.com/ROCm/hipBLASLt
|
||||||
|
RUN cd hipBLASLt \
|
||||||
|
&& git checkout ${HIPBLASLT_BRANCH} \
|
||||||
|
&& apt-get install -y llvm-dev \
|
||||||
|
&& ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
|
||||||
|
&& cd build/release \
|
||||||
|
&& make package
|
||||||
|
RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
|
||||||
|
|
||||||
FROM base AS build_triton
|
FROM base AS build_triton
|
||||||
ARG TRITON_BRANCH
|
ARG TRITON_BRANCH
|
||||||
ARG TRITON_REPO
|
ARG TRITON_REPO
|
||||||
@ -65,6 +90,8 @@ ARG PYTORCH_BRANCH
|
|||||||
ARG PYTORCH_VISION_BRANCH
|
ARG PYTORCH_VISION_BRANCH
|
||||||
ARG PYTORCH_REPO
|
ARG PYTORCH_REPO
|
||||||
ARG PYTORCH_VISION_REPO
|
ARG PYTORCH_VISION_REPO
|
||||||
|
ARG FA_BRANCH
|
||||||
|
ARG FA_REPO
|
||||||
RUN git clone ${PYTORCH_REPO} pytorch
|
RUN git clone ${PYTORCH_REPO} pytorch
|
||||||
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \
|
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \
|
||||||
pip install -r requirements.txt && git submodule update --init --recursive \
|
pip install -r requirements.txt && git submodule update --init --recursive \
|
||||||
@ -75,20 +102,14 @@ RUN git clone ${PYTORCH_VISION_REPO} vision
|
|||||||
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
|
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
|
||||||
&& python3 setup.py bdist_wheel --dist-dir=dist \
|
&& python3 setup.py bdist_wheel --dist-dir=dist \
|
||||||
&& pip install dist/*.whl
|
&& pip install dist/*.whl
|
||||||
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
|
|
||||||
&& cp /app/vision/dist/*.whl /app/install
|
|
||||||
|
|
||||||
FROM base AS build_fa
|
|
||||||
ARG FA_BRANCH
|
|
||||||
ARG FA_REPO
|
|
||||||
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
|
||||||
pip install /install/*.whl
|
|
||||||
RUN git clone ${FA_REPO}
|
RUN git clone ${FA_REPO}
|
||||||
RUN cd flash-attention \
|
RUN cd flash-attention \
|
||||||
&& git checkout ${FA_BRANCH} \
|
&& git checkout ${FA_BRANCH} \
|
||||||
&& git submodule update --init \
|
&& git submodule update --init \
|
||||||
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
|
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
|
||||||
RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install
|
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
|
||||||
|
&& cp /app/vision/dist/*.whl /app/install \
|
||||||
|
&& cp /app/flash-attention/dist/*.whl /app/install
|
||||||
|
|
||||||
FROM base AS build_aiter
|
FROM base AS build_aiter
|
||||||
ARG AITER_BRANCH
|
ARG AITER_BRANCH
|
||||||
@ -100,15 +121,15 @@ RUN cd aiter \
|
|||||||
&& git checkout ${AITER_BRANCH} \
|
&& git checkout ${AITER_BRANCH} \
|
||||||
&& git submodule update --init --recursive \
|
&& git submodule update --init --recursive \
|
||||||
&& pip install -r requirements.txt
|
&& pip install -r requirements.txt
|
||||||
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
|
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
|
||||||
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
|
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
|
||||||
|
|
||||||
FROM base AS debs
|
FROM base AS debs
|
||||||
RUN mkdir /app/debs
|
RUN mkdir /app/debs
|
||||||
|
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
|
||||||
|
cp /install/*.deb /app/debs
|
||||||
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
|
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
|
||||||
cp /install/*.whl /app/debs
|
cp /install/*.whl /app/debs
|
||||||
RUN --mount=type=bind,from=build_fa,src=/app/install/,target=/install \
|
|
||||||
cp /install/*.whl /app/debs
|
|
||||||
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
|
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
|
||||||
cp /install/*.whl /app/debs
|
cp /install/*.whl /app/debs
|
||||||
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
||||||
@ -117,10 +138,24 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
|
|||||||
cp /install/*.whl /app/debs
|
cp /install/*.whl /app/debs
|
||||||
|
|
||||||
FROM base AS final
|
FROM base AS final
|
||||||
RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
|
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
|
||||||
|
dpkg -i /install/*deb \
|
||||||
|
&& perl -p -i -e 's/, hipblas-common-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \
|
||||||
|
&& perl -p -i -e 's/, hipblaslt-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \
|
||||||
|
&& perl -p -i -e 's/, hipblaslt \([^)]*?\), /, /g' /var/lib/dpkg/status
|
||||||
|
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
|
||||||
|
pip install /install/*.whl
|
||||||
|
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
|
||||||
|
pip install /install/*.whl
|
||||||
|
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
||||||
|
pip install /install/*.whl
|
||||||
|
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
|
||||||
pip install /install/*.whl
|
pip install /install/*.whl
|
||||||
|
|
||||||
ARG BASE_IMAGE
|
ARG BASE_IMAGE
|
||||||
|
ARG HIPBLAS_COMMON_BRANCH
|
||||||
|
ARG HIPBLASLT_BRANCH
|
||||||
|
ARG LEGACY_HIPBLASLT_OPTION
|
||||||
ARG TRITON_BRANCH
|
ARG TRITON_BRANCH
|
||||||
ARG TRITON_REPO
|
ARG TRITON_REPO
|
||||||
ARG PYTORCH_BRANCH
|
ARG PYTORCH_BRANCH
|
||||||
@ -132,6 +167,9 @@ ARG FA_REPO
|
|||||||
ARG AITER_BRANCH
|
ARG AITER_BRANCH
|
||||||
ARG AITER_REPO
|
ARG AITER_REPO
|
||||||
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||||
|
&& echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
|
||||||
|
&& echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
|
||||||
|
&& echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \
|
||||||
&& echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
|
&& echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
|
||||||
&& echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
|
&& echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
|
||||||
&& echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \
|
&& echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \
|
||||||
@ -139,6 +177,5 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
|||||||
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
|
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
|
||||||
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
|
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
|
||||||
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
|
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
|
||||||
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
|
|
||||||
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
|
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
|
||||||
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
|
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
|
||||||
@ -309,4 +309,4 @@ USER 2000
|
|||||||
WORKDIR /home/vllm
|
WORKDIR /home/vllm
|
||||||
|
|
||||||
# Set the default entrypoint
|
# Set the default entrypoint
|
||||||
ENTRYPOINT ["vllm", "serve"]
|
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
|
||||||
|
|||||||
@ -69,4 +69,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
|||||||
|
|
||||||
# install development dependencies (for testing)
|
# install development dependencies (for testing)
|
||||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||||
ENTRYPOINT ["vllm", "serve"]
|
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||||
|
|||||||
@ -14,7 +14,7 @@ API documentation for vLLM's configuration classes.
|
|||||||
- [vllm.config.LoRAConfig][]
|
- [vllm.config.LoRAConfig][]
|
||||||
- [vllm.config.MultiModalConfig][]
|
- [vllm.config.MultiModalConfig][]
|
||||||
- [vllm.config.PoolerConfig][]
|
- [vllm.config.PoolerConfig][]
|
||||||
- [vllm.config.StructuredOutputsConfig][]
|
- [vllm.config.DecodingConfig][]
|
||||||
- [vllm.config.ObservabilityConfig][]
|
- [vllm.config.ObservabilityConfig][]
|
||||||
- [vllm.config.KVTransferConfig][]
|
- [vllm.config.KVTransferConfig][]
|
||||||
- [vllm.config.CompilationConfig][]
|
- [vllm.config.CompilationConfig][]
|
||||||
@ -46,6 +46,7 @@ Engine classes for offline and online inference.
|
|||||||
Inference parameters for vLLM APIs.
|
Inference parameters for vLLM APIs.
|
||||||
|
|
||||||
[](){ #sampling-params }
|
[](){ #sampling-params }
|
||||||
|
[](){ #pooling-params }
|
||||||
|
|
||||||
- [vllm.SamplingParams][]
|
- [vllm.SamplingParams][]
|
||||||
- [vllm.PoolingParams][]
|
- [vllm.PoolingParams][]
|
||||||
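The docs hunk above adds a dedicated anchor for pooling parameters alongside the sampling-parameters one. Both classes are importable from the top-level package; the parameter values below are illustrative only:

    from vllm import PoolingParams, SamplingParams

    sampling = SamplingParams(temperature=0.8, max_tokens=64)  # illustrative values
    pooling = PoolingParams()  # used by embedding / pooling models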
|
|||||||
@ -1,2 +1,2 @@
|
|||||||
search:
|
search:
|
||||||
exclude: true
|
boost: 0.5
|
||||||
|
|||||||
(Four binary image files were removed in this diff and are not shown: 627 KiB, 350 KiB, 814 KiB, and 267 KiB.)
Some files were not shown because too many files have changed in this diff.