Use uv to install python in Dockerfile

Signed-off-by: mgoin <mgoin64@gmail.com>
Change size of single CUDA graph for CI to 4 (#26089)
2025-10-02 11:05:47 -04:00 · 2025-10-02 14:14:28 +00:00 · 2025-10-02 05:48:04 -07:00 · 2025-10-01 20:59:13 -07:00 · 2025-10-01 19:31:39 -07:00 · 2025-10-01 19:23:14 -07:00
252 changed files with 1930 additions and 4088 deletions
--- a/.buildkite/nightly-benchmarks/scripts/launch-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-server.sh
@ -181,14 +181,18 @@ launch_vllm_server() {
  if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
    echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
    model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-    server_command="vllm serve $model \
+    server_command="python3 \
+        -m vllm.entrypoints.openai.api_server \
        -tp $tp \
+        --model $model \
        --port $port \
        $server_args"
  else
    echo "Key 'fp8' does not exist in common params."
-    server_command="vllm serve $model \
+    server_command="python3 \
+        -m vllm.entrypoints.openai.api_server \
        -tp $tp \
+        --model $model \
        --port $port \
        $server_args"
  fi
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -365,7 +365,8 @@ run_serving_tests() {
      continue
    fi

-    server_command="$server_envs vllm serve \
+    server_command="$server_envs python3 \
+      -m vllm.entrypoints.openai.api_server \
      $server_args"

    # run the server
--- a/.buildkite/scripts/hardware_ci/run-npu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@ -1,191 +0,0 @@
-#!/bin/bash
-
-# This script build the Ascend NPU docker image and run the offline inference inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -ex
-
-# Base ubuntu image with basic ascend development libraries and python installed
-VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
-CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
-TEST_RUN_CONFIG_FILE="vllm_test.cfg"
-VLLM_ASCEND_TMP_DIR=
-# Get the test run configuration file from the vllm-ascend repository
-fetch_vllm_test_cfg() {
-    VLLM_ASCEND_TMP_DIR=$(mktemp -d)
-    # Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
-    cleanup() {
-        rm -rf "${VLLM_ASCEND_TMP_DIR}"
-    }
-    trap cleanup EXIT
-
-    GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
-    if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
-        echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
-        exit 1
-    fi
-
-    # If the file already exists locally, just overwrite it
-    cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
-    echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
-
-    # Since the trap will be overwritten later, and when it is executed here, the task of cleaning up resources
-    # when the trap is abnormal has been completed, so the temporary resources are manually deleted here.
-    rm -rf "${VLLM_ASCEND_TMP_DIR}"
-    trap - EXIT
-}
-
-# Downloads test run configuration file from a remote URL.
-# Loads the configuration into the current script environment.
-get_config() {
-    if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
-        echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
-        exit 1
-    fi
-    source "${TEST_RUN_CONFIG_FILE}"
-    echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
-    return 0
-}
-
-# get test running configuration.
-fetch_vllm_test_cfg
-get_config
-# Check if the function call was successful. If not, exit the script.
-if [ $? -ne 0 ]; then
-  exit 1
-fi
-
-image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
-container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-
-# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
-agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
-echo "agent_idx: ${agent_idx}"
-builder_name="cachebuilder${agent_idx}"
-builder_cache_dir="/mnt/docker-cache${agent_idx}"
-mkdir -p ${builder_cache_dir}
-
-# Try building the docker image
-cat <<EOF | DOCKER_BUILDKIT=1 docker build \
-    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
-    --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
-                           --cache-to type=local,dest=${builder_cache_dir},mode=max \
-    --progress=plain --load -t ${image_name} -f - .
-FROM ${BASE_IMAGE_NAME}
-
-# Define environments
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
-    pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
-    apt-get update -y && \
-    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
-    rm -rf /var/cache/apt/* && \
-    rm -rf /var/lib/apt/lists/*
-
-# Install for pytest to make the docker build cache layer always valid
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install pytest>=6.0  modelscope
-
-WORKDIR /workspace/vllm
-
-# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
-COPY requirements/common.txt /workspace/vllm/requirements/common.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements/common.txt
-
-COPY . .
-
-# Install vLLM
-RUN --mount=type=cache,target=/root/.cache/pip \
-    VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
-    python3 -m pip uninstall -y triton
-
-# Install vllm-ascend
-WORKDIR /workspace
-ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
-ARG VLLM_ASCEND_TAG=main
-RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
-    git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
-
-# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r /workspace/vllm-ascend/requirements.txt
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
-    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
-    source /usr/local/Ascend/nnal/atb/set_env.sh && \
-    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
-    python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
-
-ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
-ENV VLLM_USE_MODELSCOPE=True
-
-WORKDIR /workspace/vllm-ascend
-
-CMD ["/bin/bash"]
-
-EOF
-
-# Setup cleanup
-remove_docker_container() {
-  docker rm -f "${container_name}" || true;
-  docker image rm -f "${image_name}" || true;
-  docker system prune -f || true;
-}
-trap remove_docker_container EXIT
-
-# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
-# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
-#   e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
-#   returns --device /dev/davinci0 --device /dev/davinci1
-parse_and_gen_devices() {
-    local input="$1"
-    local index cards_num
-    if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
-        index="${BASH_REMATCH[1]}"
-        cards_num="${BASH_REMATCH[2]}"
-    else
-        echo "parse error" >&2
-        return 1
-    fi
-
-    local devices=""
-    local i=0
-    while (( i < cards_num )); do
-        local dev_idx=$(((index - 1)*cards_num + i ))
-        devices="$devices --device /dev/davinci${dev_idx}"
-        ((i++))
-    done
-
-    # trim leading space
-    devices="${devices#"${devices%%[![:space:]]*}"}"
-    # Output devices: assigned to the caller variable
-    printf '%s' "$devices"
-}
-
-devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
-
-# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
-# This test checks whether the OOT platform interface is functioning properly in conjunction with
-# the hardware plugin vllm-ascend.
-model_cache_dir=/mnt/modelscope${agent_idx}
-mkdir -p ${model_cache_dir}
-docker run \
-    ${devices} \
-    --device /dev/davinci_manager \
-    --device /dev/devmm_svm \
-    --device /dev/hisi_hdc \
-    -v /usr/local/dcmi:/usr/local/dcmi \
-    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-    -v /etc/ascend_install.info:/etc/ascend_install.info \
-    -v ${model_cache_dir}:/root/.cache/modelscope \
-    --entrypoint="" \
-    --name "${container_name}" \
-    "${image_name}" \
-    bash -c '
-    set -e
-    pytest -v -s tests/e2e/vllm_interface/
-'
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
 bench_throughput_exit_code=$?

 # run server-based benchmarks and upload the result to buildkite
-vllm serve meta-llama/Llama-2-7b-chat-hf &
+python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
 server_pid=$!
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -2,7 +2,6 @@ pull_request_rules:
 - name: label-documentation
  description: Automatically apply documentation label
  conditions:
-    - label != stale
    - or:
      - files~=^[^/]+\.md$
      - files~=^docs/
@ -15,7 +14,6 @@ pull_request_rules:
 - name: label-ci-build
  description: Automatically apply ci/build label
  conditions:
-    - label != stale
    - or:
      - files~=^\.github/
      - files~=\.buildkite/
@ -32,7 +30,6 @@ pull_request_rules:
 - name: label-deepseek
  description: Automatically apply deepseek label
  conditions:
-    - label != stale
    - or:
      - files~=^examples/.*deepseek.*\.py
      - files~=^tests/.*deepseek.*\.py
@ -49,7 +46,6 @@ pull_request_rules:
 - name: label-frontend
  description: Automatically apply frontend label
  conditions:
-    - label != stale
    - files~=^vllm/entrypoints/
  actions:
    label:
@ -59,7 +55,6 @@ pull_request_rules:
 - name: label-llama
  description: Automatically apply llama label
  conditions:
-    - label != stale
    - or:
      - files~=^examples/.*llama.*\.py
      - files~=^tests/.*llama.*\.py
@ -75,7 +70,6 @@ pull_request_rules:
 - name: label-multi-modality
  description: Automatically apply multi-modality label
  conditions:
-    - label != stale
    - or:
      - files~=^vllm/multimodal/
      - files~=^tests/multimodal/
@ -89,7 +83,6 @@ pull_request_rules:
 - name: label-new-model
  description: Automatically apply new-model label
  conditions:
-    - label != stale
    - and:
      - files~=^vllm/model_executor/models/
      - files=vllm/model_executor/models/registry.py
@ -101,7 +94,6 @@ pull_request_rules:
 - name: label-performance
  description: Automatically apply performance label
  conditions:
-    - label != stale
    - or:
      - files~=^benchmarks/
      - files~=^vllm/benchmarks/
@ -115,7 +107,6 @@ pull_request_rules:
 - name: label-qwen
  description: Automatically apply qwen label
  conditions:
-    - label != stale
    - or:
      - files~=^examples/.*qwen.*\.py
      - files~=^tests/.*qwen.*\.py
@ -130,7 +121,6 @@ pull_request_rules:
 - name: label-gpt-oss
  description: Automatically apply gpt-oss label
  conditions:
-    - label != stale
    - or:
      - files~=^examples/.*gpt[-_]?oss.*\.py
      - files~=^tests/.*gpt[-_]?oss.*\.py
@ -152,7 +142,6 @@ pull_request_rules:
 - name: label-rocm
  description: Automatically apply rocm label
  conditions:
-    - label != stale
    - or:
      - files~=^csrc/rocm/
      - files~=^docker/Dockerfile.rocm
@ -173,7 +162,6 @@ pull_request_rules:
 - name: label-structured-output
  description: Automatically apply structured-output label
  conditions:
-    - label != stale
    - or:
      - files~=^benchmarks/structured_schemas/
      - files=benchmarks/benchmark_serving_structured_output.py
@ -193,7 +181,6 @@ pull_request_rules:
 - name: label-speculative-decoding
  description: Automatically apply speculative-decoding label
  conditions:
-    - label != stale
    - or:
      - files~=^vllm/v1/spec_decode/
      - files~=^tests/v1/spec_decode/
@ -209,7 +196,6 @@ pull_request_rules:
 - name: label-v1
  description: Automatically apply v1 label
  conditions:
-    - label != stale
    - or:
      - files~=^vllm/v1/
      - files~=^tests/v1/
@ -222,7 +208,6 @@ pull_request_rules:
  description: Automatically apply tpu label
  # Keep this list in sync with `label-tpu-remove` conditions
  conditions:
-    - label != stale
    - or:
      - files~=tpu.py
      - files~=_tpu
@ -238,7 +223,6 @@ pull_request_rules:
  description: Automatically remove tpu label
  # Keep this list in sync with `label-tpu` conditions
  conditions:
-    - label != stale
    - and:
      - -files~=tpu.py
      - -files~=_tpu
@ -253,7 +237,6 @@ pull_request_rules:
 - name: label-tool-calling
  description: Automatically add tool-calling label
  conditions:
-    - label != stale
    - or:
      - files~=^tests/tool_use/
      - files~=^tests/entrypoints/openai/tool_parsers/
@ -272,9 +255,8 @@ pull_request_rules:

 - name: ping author on conflicts and add 'needs-rebase' label
  conditions:
-    - label != stale
-    - conflict
-    - -closed
+      - conflict
+      - -closed
  actions:
    label:
      add:
@ -288,8 +270,6 @@ pull_request_rules:

 - name: assign reviewer for tensorizer changes
  conditions:
-    - label != stale
-    - or:
      - files~=^vllm/model_executor/model_loader/tensorizer.py
      - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
      - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
@ -301,7 +281,6 @@ pull_request_rules:

 - name: assign reviewer for modelopt changes
  conditions:
-    - label != stale
    - or:
        - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
        - files~=^vllm/model_executor/layers/quantization/__init__\.py$
@ -316,8 +295,8 @@ pull_request_rules:

 - name: remove 'needs-rebase' label when conflict is resolved
  conditions:
-    - -conflict
-    - -closed
+      - -conflict
+      - -closed
  actions:
    label:
      remove:
@ -326,7 +305,6 @@ pull_request_rules:
 - name: label-kv-connector
  description: Automatically apply kv-connector label
  conditions:
-    - label != stale
    - or:
      - files~=^examples/online_serving/disaggregated[^/]*/.*
      - files~=^examples/offline_inference/disaggregated[^/]*/.*
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -269,8 +269,8 @@ set(VLLM_EXT_SRC
  "csrc/sampler.cu"
  "csrc/cuda_view.cu"
  "csrc/quantization/gptq/q_gemm.cu"
-  "csrc/quantization/w8a8/int8/scaled_quant.cu"
-  "csrc/quantization/w8a8/fp8/common.cu"
+  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
+  "csrc/quantization/fp8/common.cu"
  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
  "csrc/quantization/gguf/gguf_kernel.cu"
  "csrc/quantization/activation_kernels.cu"
@ -314,13 +314,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_EXT_SRC
    "csrc/quantization/awq/gemm_kernels.cu"
    "csrc/permute_cols.cu"
-    "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
    "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
    "csrc/cutlass_extensions/common.cpp"
-    "csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
-    "csrc/quantization/w8a8/int8/per_token_group_quant.cu")
+    "csrc/quantization/fp8/per_token_group_quant.cu")

  set_gencode_flags_for_srcs(
    SRCS "${VLLM_EXT_SRC}"
@ -424,11 +423,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
    set(SRCS
-       "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
-       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
-       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
-       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
-       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
+       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
+       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
+       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
+       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
+       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -459,9 +458,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
-      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
-      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
-      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
+      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
    )
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@ -493,9 +492,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
-      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
-      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
-      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
+      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
    )
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@ -526,7 +525,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # subtract out the archs that are already built for 3x
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
  if (SCALED_MM_2X_ARCHS)
-    set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
@ -649,7 +648,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # if it's possible to compile MoE kernels that use its output.
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -668,12 +667,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()

  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
  else()
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -698,7 +697,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
-    set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@ -721,7 +720,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@ -17,7 +17,7 @@ from weight_shapes import WEIGHT_SHAPES

 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    w8a8_triton_block_scaled_mm,
+    w8a8_block_fp8_matmul,
 )
 from vllm.utils import FlexibleArgumentParser, cdiv

@ -158,7 +158,7 @@ def bench_fp8(
        "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
        ),
-        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
+        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
            a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
        ),
        "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@ -55,7 +55,9 @@ benchmark() {
  output_len=$2


-  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
+  CUDA_VISIBLE_DEVICES=0 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
    --port 8100 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
@ -63,7 +65,9 @@ benchmark() {
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &


-  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
+  CUDA_VISIBLE_DEVICES=1 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
    --port 8200 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@ -38,12 +38,16 @@ wait_for_server() {
 launch_chunked_prefill() {
  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
  # disagg prefill
-  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
+  CUDA_VISIBLE_DEVICES=0 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
    --port 8100 \
    --max-model-len 10000 \
    --enable-chunked-prefill \
    --gpu-memory-utilization 0.6 &
-  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
+  CUDA_VISIBLE_DEVICES=1 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
    --port 8200 \
    --max-model-len 10000 \
    --enable-chunked-prefill \
@ -58,14 +62,18 @@ launch_chunked_prefill() {
 launch_disagg_prefill() {
  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
  # disagg prefill
-  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
+  CUDA_VISIBLE_DEVICES=0 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
    --port 8100 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

-  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
+  CUDA_VISIBLE_DEVICES=1 python3 \
+    -m vllm.entrypoints.openai.api_server \
+    --model $model \
    --port 8200 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
--- a/benchmarks/kernels/benchmark_reshape_and_cache.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache.py
@ -1,174 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
-import random
-import time
-
-import torch
-from tabulate import tabulate
-
-from vllm import _custom_ops as ops
-from vllm.logger import init_logger
-from vllm.platforms import current_platform
-from vllm.utils import (
-    STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
-    create_kv_caches_with_random,
-)
-
-logger = init_logger(__name__)
-
-
-@torch.inference_mode()
-def run_benchmark(
-    num_tokens: int,
-    num_heads: int,
-    head_size: int,
-    block_size: int,
-    num_blocks: int,
-    dtype: torch.dtype,
-    kv_cache_dtype: str,
-    num_iters: int,
-    benchmark_mode: str,
-    device: str = "cuda",
-) -> float:
-    """Return latency (seconds) for given num_tokens."""
-
-    if kv_cache_dtype == "fp8" and head_size % 16:
-        raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
-
-    current_platform.seed_everything(42)
-    torch.set_default_device(device)
-
-    # create random key / value tensors [T, H, D].
-    key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
-    value = torch.randn_like(key)
-
-    # prepare the slot mapping.
-    # each token is assigned a unique slot in the KV-cache.
-    num_slots = block_size * num_blocks
-    if num_tokens > num_slots:
-        raise ValueError("num_tokens cannot exceed the total number of cache slots")
-    slot_mapping_lst = random.sample(range(num_slots), num_tokens)
-    slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
-
-    key_caches, value_caches = create_kv_caches_with_random(
-        num_blocks,
-        block_size,
-        1,  # num_layers
-        num_heads,
-        head_size,
-        kv_cache_dtype,
-        dtype,
-        device=device,
-    )
-    key_cache, value_cache = key_caches[0], value_caches[0]
-    # to free unused memory
-    del key_caches, value_caches
-
-    # compute per-kernel scaling factors for fp8 conversion (if used).
-    k_scale = (key.amax() / 64.0).to(torch.float32)
-    v_scale = (value.amax() / 64.0).to(torch.float32)
-
-    function_under_test = lambda: ops.reshape_and_cache(
-        key,  # noqa: F821
-        value,  # noqa: F821
-        key_cache,  # noqa: F821
-        value_cache,  # noqa: F821
-        slot_mapping,  # noqa: F821
-        kv_cache_dtype,
-        k_scale,
-        v_scale,
-    )
-
-    if benchmark_mode == "cudagraph":
-        g = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(g):
-            function_under_test()
-        torch.cuda.synchronize()
-        function_under_test = lambda: g.replay()
-
-    def run_cuda_benchmark(n_iters: int) -> float:
-        nonlocal key, value, key_cache, value_cache, slot_mapping
-        torch.cuda.synchronize()
-        start = time.perf_counter()
-        for _ in range(n_iters):
-            function_under_test()
-            torch.cuda.synchronize()
-        end = time.perf_counter()
-        return (end - start) / n_iters
-
-    # warm-up
-    run_cuda_benchmark(3)
-
-    lat = run_cuda_benchmark(num_iters)
-
-    # free tensors to mitigate OOM when sweeping
-    del key, value, key_cache, value_cache, slot_mapping
-    torch.cuda.empty_cache()
-
-    return lat
-
-
-def main(args):
-    rows = []
-    for exp in range(1, 17):
-        n_tok = 2**exp
-        lat = run_benchmark(
-            num_tokens=n_tok,
-            num_heads=args.num_heads,
-            head_size=args.head_size,
-            block_size=args.block_size,
-            num_blocks=args.num_blocks,
-            dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
-            kv_cache_dtype=args.kv_cache_dtype,
-            num_iters=args.iters,
-            benchmark_mode=args.mode,
-            device="cuda",
-        )
-        rows.append([n_tok, lat * 1e6])  # convert to microseconds
-
-    print(f"Benchmark results for implementation cuda (measuring with {args.mode}):")
-    print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f"))
-
-
-if __name__ == "__main__":
-    parser = FlexibleArgumentParser()
-
-    parser.add_argument("--num-heads", type=int, default=128)
-    parser.add_argument(
-        "--head-size",
-        type=int,
-        choices=[64, 80, 96, 112, 120, 128, 192, 256],
-        default=128,
-    )
-    parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
-    parser.add_argument("--num-blocks", type=int, default=128 * 128)
-
-    parser.add_argument(
-        "--dtype",
-        type=str,
-        choices=["half", "bfloat16", "float"],
-        default="bfloat16",
-    )
-
-    parser.add_argument(
-        "--kv-cache-dtype",
-        type=str,
-        choices=["auto", "fp8"],
-        default="auto",
-    )
-
-    parser.add_argument("--iters", type=int, default=200)
-
-    parser.add_argument(
-        "--mode",
-        type=str,
-        choices=["cudagraph", "no_graph"],
-        default="cudagraph",
-    )
-
-    args = parser.parse_args()
-
-    main(args)
--- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
@ -9,7 +9,7 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    per_token_group_quant_fp8,
-    w8a8_triton_block_scaled_mm,
+    w8a8_block_fp8_matmul,
 )
 from vllm.triton_utils import triton
 from vllm.utils.deep_gemm import (
@ -63,7 +63,7 @@ def benchmark_shape(m: int,

    # === vLLM Triton Implementation ===
    def vllm_triton_gemm():
-        return w8a8_triton_block_scaled_mm(A_vllm,
+        return w8a8_block_fp8_matmul(A_vllm,
                                     B_vllm,
                                     A_scale_vllm,
                                     B_scale_vllm,
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 4695e6bed5366c41e28c06cd86170166e4f43d00
+          GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
--- a/csrc/attention/attention_kernels.cuh
+++ b/csrc/attention/attention_kernels.cuh
@ -28,10 +28,10 @@

 #ifdef USE_ROCM
  #include <hip/hip_bf16.h>
-  #include "../quantization/w8a8/fp8/amd/quant_utils.cuh"
+  #include "../quantization/fp8/amd/quant_utils.cuh"
 typedef __hip_bfloat16 __nv_bfloat16;
 #else
-  #include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh"
+  #include "../quantization/fp8/nvidia/quant_utils.cuh"
 #endif

 #define MAX(a, b) ((a) > (b) ? (a) : (b))
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@ -9,14 +9,16 @@
 #include "quantization/vectorization_utils.cuh"

 #ifdef USE_ROCM
-  #include "quantization/w8a8/fp8/amd/quant_utils.cuh"
+  #include "quantization/fp8/amd/quant_utils.cuh"
 #else
-  #include "quantization/w8a8/fp8/nvidia/quant_utils.cuh"
+  #include "quantization/fp8/nvidia/quant_utils.cuh"
 #endif

 #include <algorithm>
 #include <cassert>
-#include <cfloat>
+#include <cfloat>  // FLT_MIN
+#include <map>
+#include <vector>

 #ifdef USE_ROCM
  #include <hip/hip_bf16.h>
@ -208,20 +210,6 @@ void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,

 namespace vllm {

-// Used to copy/convert one element
-template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
-struct CopyWithScaleOp {
-  float scale;
-
-  __device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
-    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
-      dst = static_cast<OutT>(src);
-    } else {
-      dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
-    }
-  }
-};
-
 template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
 __global__ void reshape_and_cache_kernel(
    const scalar_t* __restrict__ key,    // [num_tokens, num_heads, head_size]
@ -237,51 +225,59 @@ __global__ void reshape_and_cache_kernel(
  const int64_t token_idx = blockIdx.x;
  const int64_t slot_idx = slot_mapping[token_idx];
  if (slot_idx < 0) {
+    // Padding token that should be ignored.
    return;
  }

  const int64_t block_idx = slot_idx / block_size;
  const int64_t block_offset = slot_idx % block_size;
-  const int h_block_count = head_size / x;  // head_size//x

-  const int h_block_idx = threadIdx.x;
-  if (h_block_idx >= num_heads * h_block_count) {
-    return;
-  }
+  const int n = num_heads * head_size;
+  for (int i = threadIdx.x; i < n; i += blockDim.x) {
+    const int64_t src_key_idx = token_idx * key_stride + i;
+    const int64_t src_value_idx = token_idx * value_stride + i;

-  const int head_idx = h_block_idx / h_block_count;
-  const int h_block = h_block_idx % h_block_count;
+    const int head_idx = i / head_size;
+    const int head_offset = i % head_size;
+    const int x_idx = head_offset / x;
+    const int x_offset = head_offset % x;

-  const scalar_t* __restrict__ key_src =
-      key + token_idx * key_stride + head_idx * head_size + h_block * x;
-  const int64_t src_value_start =
-      token_idx * value_stride + head_idx * head_size + h_block * x;
-
-  cache_t* __restrict__ key_dst =
-      key_cache + block_idx * num_heads * h_block_count * block_size * x +
-      head_idx * h_block_count * block_size * x + h_block * block_size * x +
-      block_offset * x;
-  const int64_t tgt_value_start =
-      block_idx * num_heads * h_block_count * x * block_size +
-      head_idx * h_block_count * x * block_size + h_block * x * block_size +
-      block_offset;
-
-  constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
-  float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
-  CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
-  float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
-  CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
-
-  vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, x, 0, 1, k_op);
-
-  const scalar_t* __restrict__ value_src = value + src_value_start;
-  cache_t* __restrict__ value_dst = value_cache + tgt_value_start;
-#pragma unroll
-  for (int i = 0; i < x; i++) {
-    v_op(value_dst[i * block_size], value_src[i]);
+    const int64_t tgt_key_idx =
+        block_idx * num_heads * (head_size / x) * block_size * x +
+        head_idx * (head_size / x) * block_size * x + x_idx * block_size * x +
+        block_offset * x + x_offset;
+    const int64_t tgt_value_idx =
+        block_idx * num_heads * head_size * block_size +
+        head_idx * head_size * block_size + head_offset * block_size +
+        block_offset;
+    scalar_t tgt_key = key[src_key_idx];
+    scalar_t tgt_value = value[src_value_idx];
+    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+      key_cache[tgt_key_idx] = tgt_key;
+      value_cache[tgt_value_idx] = tgt_value;
+    } else {
+      key_cache[tgt_key_idx] =
+          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
+      value_cache[tgt_value_idx] =
+          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
+    }
  }
 }

+// Used by vectorization_utils to copy/convert one element
+template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
+struct CopyWithScaleOp {
+  float scale;
+
+  __device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
+    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+      dst = static_cast<OutT>(src);
+    } else {
+      dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
+    }
+  }
+};
+
 template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
 __global__ void reshape_and_cache_flash_kernel(
    const scalar_t* __restrict__ key,    // [num_tokens, num_heads, head_size]
@ -428,81 +424,84 @@ __global__ void concat_and_cache_ds_mla_kernel(
  const int64_t dst_idx_start =
      block_idx * block_stride + block_offset * entry_stride;

-  // For the NoPE part, each tile of 128 elements is handled by half of one warp
-  // (16 threads). There are 4 total tiles, so 2 warps (64 threads).
-  // Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
-  // The RoPE part (last 64 elements) is handled by another 1 warp (32 threads).
-  // So in total, we use 3 warps (96 threads) per block.
+  // Shared memory: 16 per-warp abs-max values, then the 4 tile scales
+  __shared__ float smem[20];
+  float* shard_abs_max = smem;
+  float* tile_scales = smem + 16;
+
+  // For the NoPE part, each tile of 128 elements is handled by 4 warps
+  // (128 threads). There are 4 total tiles, so 16 warps (512 threads).
+  // The first thread of the first warp in each tile writes the scale
+  // value for the tile. The RoPE part (last 64 elements) is handled
+  // by another 2 warps (64 threads).
+  // So in total, we use 18 warps (576 threads) per block.

  // Cast kv_cache to 16_bit for RoPE values
  scalar_t* kv_cache_16bit =
      reinterpret_cast<scalar_t*>(&kv_cache[dst_idx_start]);

-  // The last warp handles the RoPE part
-  if (threadIdx.x >= 64) {
-    // Each thread handles two elements of RoPE
-    const int8_t pe_idx_start = (threadIdx.x - 64) * 2;
-    const int64_t src_idx = token_idx * k_pe_stride + pe_idx_start;
-    // Vectorized load of two 16-bit values, performed as one 32-bit load
-    const int32_t vals = *reinterpret_cast<const int32_t*>(&k_pe[src_idx]);
+  // The last 64 threads handle the RoPE part
+  if (threadIdx.x >= kv_lora_rank) {
+    const int8_t pe_idx = threadIdx.x - kv_lora_rank;
+    const int64_t src_idx = token_idx * k_pe_stride + pe_idx;
    // RoPE values start after the packed 8-bit NoPE values and the
    // 32-bit scales
-    const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx_start;
-    // Vectorized store of two 16-bit values, performed as one 32-bit store
-    *reinterpret_cast<int32_t*>(&kv_cache_16bit[dst_idx]) = vals;
+    const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx;
+    kv_cache_16bit[dst_idx] = k_pe[src_idx];
    return;
  }

-  // The first two warps handle the NoPE part
-  const int8_t warp_idx = threadIdx.x >> 5;
-  const int8_t lane_idx = threadIdx.x & 31;
-  const int8_t tile_idx = warp_idx * 2 + (lane_idx >> 4);
+  // Determine the scale for each tile of NoPE
+  const int16_t tile_idx = threadIdx.x >> 7;
+  const int16_t warp_idx = (threadIdx.x & 127) >> 5;
+  const int16_t lane_idx = threadIdx.x & 31;

-  // Each thread handles 8 elements of NoPE
-  // Load the NoPE elements for this thread into registers
-  const int64_t src_idx_start = token_idx * kv_c_stride + (threadIdx.x * 8);
-  // Vectorized load of eight 16-bit values, performed as an int4 load
-  const int4 vals_i4 = *reinterpret_cast<const int4*>(&kv_c[src_idx_start]);
-  const scalar_t* vals = reinterpret_cast<const scalar_t*>(&vals_i4);
+  // Load the NoPE element for this thread into registers
+  const int64_t src_idx = token_idx * kv_c_stride + threadIdx.x;
+  const scalar_t src_val = kv_c[src_idx];

-  // Max absolute value of this thread's elements
-  float max_abs = fmaxf(fmaxf(fmaxf(fabsf(vals[0]), fabsf(vals[1])),
-                              fmaxf(fabsf(vals[2]), fabsf(vals[3]))),
-                        fmaxf(fmaxf(fabsf(vals[4]), fabsf(vals[5])),
-                              fmaxf(fabsf(vals[6]), fabsf(vals[7]))));
-
-  // Warp-level reduction to find the max absolute value in each half-warp
+  // Warp-level reduction to find the max absolute value in the warp
+  float max_abs = fabsf(src_val);
 #pragma unroll
-  for (int offset = 8; offset > 0; offset /= 2) {
-    max_abs = fmaxf(max_abs, VLLM_SHFL_XOR_SYNC_WIDTH(max_abs, offset, 16));
+  for (int offset = 16; offset > 0; offset /= 2) {
+#ifdef USE_ROCM
+    max_abs = fmaxf(max_abs, __shfl_down_sync(UINT64_MAX, max_abs, offset));
+#else
+    max_abs = fmaxf(max_abs, __shfl_down_sync(0xFFFFFFFF, max_abs, offset));
+#endif
  }

-  // Compute the scale for the tile
-  float tile_scale = max_abs / 448.f;
-  tile_scale = fmaxf(tile_scale, FLT_MIN);
+  // The first lane of each warp in each tile writes the max_abs of this part
+  // of the tile to shared memory
+  if (lane_idx == 0) {
+    shard_abs_max[tile_idx * 4 + warp_idx] = max_abs;
+  }
+  __syncthreads();

-  // The first lane of each half-warp writes the scale to kv_cache
-  if ((lane_idx == 0) || (lane_idx == 16)) {
+  // The first lane of the first warp in each tile computes the scale for the
+  // tile and writes it to shared memory and to kv_cache
+  if (warp_idx == 0 && lane_idx == 0) {
+    float4 shard_abs_max_vec =
+        reinterpret_cast<float4*>(shard_abs_max)[tile_idx];
+    float tile_scale = fmaxf(fmaxf(shard_abs_max_vec.x, shard_abs_max_vec.y),
+                             fmaxf(shard_abs_max_vec.z, shard_abs_max_vec.w)) /
+                       448.f;
+
+    // Avoid division by zero in `scaled_convert`
+    tile_scales[tile_idx] = fmaxf(tile_scale, FLT_MIN);
    float* kv_cache_32bit = reinterpret_cast<float*>(&kv_cache[dst_idx_start]);
    const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx;
-    kv_cache_32bit[dst_idx] = tile_scale;
+    kv_cache_32bit[dst_idx] = tile_scales[tile_idx];
  }

-  // Now all threads in the block scale and write their elements
-  // NoPE data is packed in the first kv_lora_rank/2 bytes (first 256 bytes)
-  const int64_t dst_idx_base = dst_idx_start + (threadIdx.x * 8);
+  __syncthreads();

-  uint8_t result[8];
-#pragma unroll
-  for (int i = 0; i < 8; i++) {
-    result[i] =
-        fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
-            vals[i], tile_scale);
-  }
-
-  // Store as aligned 64-bit writes
-  *reinterpret_cast<uint64_t*>(&kv_cache[dst_idx_base]) =
-      *reinterpret_cast<const uint64_t*>(result);
+  // Now every remaining (NoPE) thread scales and writes its own element
+  const float scale_val = tile_scales[tile_idx];
+  const int64_t dst_idx = dst_idx_start + threadIdx.x;
+  kv_cache[dst_idx] =
+      fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
+          src_val, scale_val);
 }

 template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
@ -607,10 +606,9 @@ void reshape_and_cache(

  int key_stride = key.stride(0);
  int value_stride = value.stride(0);
-  int head_div_x = head_size / x;

  dim3 grid(num_tokens);
-  dim3 block(std::min(num_heads * head_div_x, 512));
+  dim3 block(std::min(num_heads * head_size, 512));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

@ -743,12 +741,13 @@ void concat_and_cache_mla(

  if (kv_cache_dtype == "fp8_ds_mla") {
    dim3 grid(num_tokens);
-    // For the NoPE part, each tile of 128 elements is handled by half of one
-    // warp (16 threads). There are 4 total tiles, so 2 warps (64 threads).
-    // Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
-    // The RoPE part (last 64 elements) is handled by another 1 warp (32
-    // threads). So in total, we use 3 warps (96 threads) per block.
-    dim3 block(96);
+    // For the NoPE part, each tile of 128 elements is handled by 4 warps
+    // (128 threads). There are 4 total tiles, so 16 warps (512 threads).
+    // The first thread of the first warp in each tile writes the scale
+    // value for the tile. The RoPE part (last 64 elements) is handled
+    // by another 2 warps (64 threads).
+    // So in total, we use 18 warps (576 threads) per block.
+    dim3 block(576);
    DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
                               CALL_CONCAT_AND_CACHE_DS_MLA);
  } else {
--- a/csrc/cub_helpers.h
+++ b/csrc/cub_helpers.h
@ -12,7 +12,6 @@ using CubMaxOp = cub::Max;
  #endif  // CUB_VERSION
 #else
  #include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-using CubAddOp = hipcub::Sum;
-using CubMaxOp = hipcub::Max;
+using CubAddOp = cub::Sum;
+using CubMaxOp = cub::Max;
 #endif  // USE_ROCM
--- a/csrc/layernorm_quant_kernels.cu
+++ b/csrc/layernorm_quant_kernels.cu
@ -6,7 +6,7 @@
 */

 #include "type_convert.cuh"
-#include "quantization/w8a8/fp8/common.cuh"
+#include "quantization/fp8/common.cuh"
 #include "dispatch_utils.h"
 #include "cub_helpers.h"
 #include "core/batch_invariant.hpp"
--- a/csrc/quantization/activation_kernels.cu
+++ b/csrc/quantization/activation_kernels.cu
@ -7,7 +7,7 @@
 #include "../cuda_compat.h"
 #include "dispatch_utils.h"

-#include "quantization/w8a8/fp8/common.cuh"
+#include "quantization/fp8/common.cuh"

 #include <c10/util/Float8_e4m3fn.h>

--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@ -1,11 +1,15 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/all.h>

+#ifndef USE_ROCM
+  #include "../per_token_group_quant_8bit.h"
+#endif
+
 #include <cmath>

-#include "dispatch_utils.h"
-#include "quantization/vectorization_utils.cuh"
-#include "cub_helpers.h"
+#include "../../cub_helpers.h"
+#include "../../dispatch_utils.h"
+#include "../vectorization_utils.cuh"

 static inline __device__ int8_t float_to_int8_rn(float x) {
 #ifdef USE_ROCM
@ -21,6 +25,7 @@ static inline __device__ int8_t float_to_int8_rn(float x) {
  float dst = std::nearbyint(x);

  // saturate
+
  // See https://github.com/pytorch/pytorch/issues/127666
  // See https://github.com/llvm/llvm-project/issues/95183
  // hip-clang std::clamp __glibcxx_assert_fail host function when building on
@ -79,6 +84,7 @@ static inline __device__ int8_t int32_to_int8(int32_t x) {
      static_cast<int32_t>(std::numeric_limits<int8_t>::max());

  // saturate
+
  // See https://github.com/pytorch/pytorch/issues/127666
  // See https://github.com/llvm/llvm-project/issues/95183
  // hip-clang std::clamp __glibcxx_assert_fail host function when building on
@ -170,6 +176,7 @@ __global__ void dynamic_scaled_int8_quant_kernel(

  float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax;

+  // 2. quantize
  vectorize_with_alignment<16>(
      row_in, row_out, hidden_size, tid, stride,
      [=] __device__(int8_t& dst, const scalar_t& src) {
@ -187,6 +194,7 @@ struct MinMax {

  __host__ __device__ explicit MinMax(float v) : min(v), max(v) {}

+  // add a value to the MinMax
  __host__ __device__ MinMax& operator+=(float v) {
    min = fminf(min, v);
    max = fmaxf(max, v);
@ -220,6 +228,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
  const scalar_t* row_in = input + token_idx * hidden_size;
  int8_t* row_out = output + token_idx * hidden_size;

+  // 1. calculate min & max
  MinMax thread_mm;
  vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride,
                                    [&] __device__(const scalar_t& src) {
@ -252,6 +261,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
  const float inv_s = 1.f / scale_sh;
  const azp_t azp = azp_sh;

+  // 2. quantize
  vectorize_with_alignment<16>(
      row_in, row_out, hidden_size, tid, stride,
      [=] __device__(int8_t& dst, const scalar_t& src) {
@ -322,4 +332,14 @@ void dynamic_scaled_int8_quant(
                  hidden_size);
        }
      });
-}
+}
+
+#ifndef USE_ROCM
+void per_token_group_quant_int8(const torch::Tensor& input,
+                                torch::Tensor& output_q,
+                                torch::Tensor& output_s, int64_t group_size,
+                                double eps, double int8_min, double int8_max) {
+  per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
+                             int8_min, int8_max);
+}
+#endif
--- a/csrc/quantization/cutlass_w8a8/Epilogues.md
+++ b/csrc/quantization/cutlass_w8a8/Epilogues.md
--- a/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8_dispatch.cuh
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8_dispatch.cuh
--- a/csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu
--- a/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh
+++ b/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh
--- a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh
+++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh
--- a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu
--- a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu
--- a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm75_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm75_dispatch.cuh
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm80_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm80_dispatch.cuh
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@ -254,7 +254,7 @@ void cutlass_moe_mm(
    bool per_act_token, bool per_out_ch) {
  int32_t version_num = get_sm_version_num();
 #if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
-  if (version_num >= 100 && version_num < 110) {
+  if (version_num >= 100) {
    cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                         expert_offsets, problem_sizes, a_strides, b_strides,
                         c_strides, per_act_token, per_out_ch);
@ -262,7 +262,7 @@ void cutlass_moe_mm(
  }
 #endif
 #if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
-  if (version_num >= 90 && version_num < 100) {
+  if (version_num >= 90) {
    cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                        expert_offsets, problem_sizes, a_strides, b_strides,
                        c_strides, per_act_token, per_out_ch);
--- a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
+++ b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
@ -14,8 +14,6 @@
 * limitations under the License.
 */

-#include "core/registration.h"
-
 #include <torch/all.h>
 #include <cutlass/arch/arch.h>

@ -420,7 +418,3 @@ void cutlass_fp4_group_mm(
      "12.8 or above.");
 #endif
 }
-
-TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
-  m.impl("cutlass_fp4_group_mm", &cutlass_fp4_group_mm);
-}
--- a/csrc/quantization/w8a8/fp8/amd/quant_utils.cuh
+++ b/csrc/quantization/w8a8/fp8/amd/quant_utils.cuh
@ -5,7 +5,7 @@
 #include <hip/hip_bf16.h>
 #include <hip/hip_bfloat16.h>

-#include "../../../../attention/attention_dtypes.h"
+#include "../../../attention/attention_dtypes.h"

 namespace vllm {
 #ifdef USE_ROCM
--- a/csrc/quantization/w8a8/fp8/common.cu
+++ b/csrc/quantization/w8a8/fp8/common.cu
@ -1,7 +1,7 @@
 #include "common.cuh"
 #include "dispatch_utils.h"
-#include "cub_helpers.h"
-#include "quantization/vectorization_utils.cuh"
+#include "../../cub_helpers.h"
+#include "../vectorization_utils.cuh"
 #include <c10/cuda/CUDAGuard.h>
 #include <ATen/cuda/Exceptions.h>

--- a/csrc/quantization/w8a8/fp8/common.cuh
+++ b/csrc/quantization/w8a8/fp8/common.cuh
--- a/csrc/quantization/w8a8/fp8/nvidia/quant_utils.cuh
+++ b/csrc/quantization/w8a8/fp8/nvidia/quant_utils.cuh
@ -1,6 +1,6 @@
 #pragma once

-#include "../../../../attention/attention_dtypes.h"
+#include "../../../attention/attention_dtypes.h"
 #include <assert.h>
 #include <float.h>
 #include <stdint.h>
--- a/csrc/quantization/w8a8/fp8/per_token_group_quant.cu
+++ b/csrc/quantization/w8a8/fp8/per_token_group_quant.cu
@ -1,6 +1,6 @@
 #include <ATen/cuda/CUDAContext.h>

-#include "quantization/w8a8/per_token_group_quant_8bit.h"
+#include "../per_token_group_quant_8bit.h"

 #include <cmath>

@ -8,9 +8,9 @@

 #include <torch/all.h>

-#include "quantization/vectorization.cuh"
-#include "quantization/vectorization_utils.cuh"
-#include "dispatch_utils.h"
+#include "../vectorization.cuh"
+#include "../vectorization_utils.cuh"
+#include "../../dispatch_utils.h"

 __device__ __forceinline__ float GroupReduceMax(float val) {
  unsigned mask = threadIdx.x % 32 >= 16 ? 0xffff0000 : 0x0000ffff;
@ -212,4 +212,4 @@ void per_token_group_quant_fp8(const torch::Tensor& input,
                               double fp8_max, bool scale_ue8m0) {
  per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
                             fp8_min, fp8_max, scale_ue8m0);
-}
+}
--- a/csrc/quantization/fused_kernels/quant_conversions.cuh
+++ b/csrc/quantization/fused_kernels/quant_conversions.cuh
@ -6,7 +6,7 @@

 #include "quantization/vectorization.cuh"
 // TODO(luka/varun):refactor common.cuh to use this file instead
-#include "quantization/w8a8/fp8/common.cuh"
+#include "quantization/fp8/common.cuh"

 namespace vllm {

--- a/csrc/quantization/w8a8/per_token_group_quant_8bit.h
+++ b/csrc/quantization/w8a8/per_token_group_quant_8bit.h
@ -1,6 +1,7 @@
 #pragma once
 #include <torch/all.h>

+// TODO(wentao): refactor the folder to 8bit, then include fp8 and int8 folders
 // 8-bit per-token-group quantization helper used by both FP8 and INT8
 void per_token_group_quant_8bit(const torch::Tensor& input,
                                torch::Tensor& output_q,
--- a/csrc/quantization/w8a8/int8/per_token_group_quant.cu
+++ b/csrc/quantization/w8a8/int8/per_token_group_quant.cu
@ -1,12 +0,0 @@
-#include <ATen/cuda/CUDAContext.h>
-#include <torch/all.h>
-
-#include "quantization/w8a8/per_token_group_quant_8bit.h"
-
-void per_token_group_quant_int8(const torch::Tensor& input,
-                                torch::Tensor& output_q,
-                                torch::Tensor& output_s, int64_t group_size,
-                                double eps, double int8_min, double int8_max) {
-  per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
-                             int8_min, int8_max);
-}
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@ -23,7 +23,7 @@

 #include <algorithm>
 #include "../attention/dtype_fp8.cuh"
-#include "../quantization/w8a8/fp8/amd/quant_utils.cuh"
+#include "../quantization/fp8/amd/quant_utils.cuh"

 // ROCm 6.2 compatibility: map OCP fp8 types to FNUZ variants if OCP is absent
 #if !defined(HIP_FP8_TYPE_OCP)
--- a/csrc/rocm/skinny_gemms.cu
+++ b/csrc/rocm/skinny_gemms.cu
@ -11,7 +11,7 @@

 #include "../cuda_compat.h"
 #include "dispatch_utils.h"
-#include "quantization/w8a8/fp8/common.cuh"
+#include "quantization/fp8/common.cuh"

 #if defined(__HIPCC__) && \
    (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__))
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -397,7 +397,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      " Tensor a_blockscale, Tensor b_blockscales, Tensor alphas,"
      " Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()",
      {stride_tag});
-  // conditionally compiled so impl registration is in source file
+  ops.impl("cutlass_fp4_group_mm", torch::kCUDA, &cutlass_fp4_group_mm);

  // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
  // quantization, as well as bias
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -13,13 +13,8 @@ ARG PYTHON_VERSION=3.12
 # private registries that use a different repository naming conventions.
 #
 # Example:
-# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
-
-# Important: We build with an old version of Ubuntu to maintain broad 
-# compatibility with other Linux OSes. The main reason for this is that the
-# glibc version is baked into the distro, and binaries built with one glibc
-# version are not backwards compatible with OSes that use an earlier version.
-ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
+# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
+ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
 # TODO: Restore to base image after FlashInfer AOT wheel fixed
 ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04

@ -80,20 +75,15 @@ ARG TARGETPLATFORM
 ARG INSTALL_KV_CONNECTORS=false
 ENV DEBIAN_FRONTEND=noninteractive

+ARG DEADSNAKES_MIRROR_URL
+ARG DEADSNAKES_GPGKEY_URL
 ARG GET_PIP_URL

-# Install system dependencies and uv, then create Python virtual environment
+# Install minimal dependencies
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl sudo python3-pip \
-    && curl -LsSf https://astral.sh/uv/install.sh | sh \
-    && $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
-    && rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
-    && ln -s /opt/venv/bin/python3 /usr/bin/python3 \
-    && ln -s /opt/venv/bin/python3-config /usr/bin/python3-config \
-    && ln -s /opt/venv/bin/pip /usr/bin/pip \
-    && python3 --version && python3 -m pip --version
+    && apt-get install -y ccache software-properties-common git curl sudo

 ARG PIP_INDEX_URL UV_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
@ -101,9 +91,9 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
 ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
 ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER

-# Activate virtual environment and add uv to PATH
-ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH"
-ENV VIRTUAL_ENV="/opt/venv"
+# Install uv and Python
+COPY --from=ghcr.io/astral-sh/uv:0.8.22 /uv /uvx /bin/
+RUN uv python install ${PYTHON_VERSION} --default --verbose

 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
@ -132,7 +122,7 @@ WORKDIR /workspace
 COPY requirements/common.txt requirements/common.txt
 COPY requirements/cuda.txt requirements/cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
+    uv pip install --system -r requirements/cuda.txt \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 # cuda arch list used by torch
@ -162,7 +152,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
 ENV UV_LINK_MODE=copy

 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
+    uv pip install --system -r requirements/build.txt \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 COPY . .
@ -259,7 +249,7 @@ COPY requirements/lint.txt requirements/lint.txt
 COPY requirements/test.txt requirements/test.txt
 COPY requirements/dev.txt requirements/dev.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
+    uv pip install --system -r requirements/dev.txt \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 #################### DEV IMAGE ####################

@ -286,32 +276,12 @@ ARG GET_PIP_URL
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

-# Install Python and other dependencies
+# Install minimal dependencies
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
-    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
-    && if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
-        if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
-            mkdir -p -m 0755 /etc/apt/keyrings ; \
-            curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
-            sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
-            echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
-        fi ; \
-    else \
-        for i in 1 2 3; do \
-            add-apt-repository -y ppa:deadsnakes/ppa && break || \
-            { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
-        done ; \
-    fi \
-    && apt-get update -y \
-    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
-    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
-    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
-    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
-    && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
-    && python3 --version && python3 -m pip --version
+    && apt-get install -y ccache software-properties-common git curl wget sudo vim \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1

 ARG PIP_INDEX_URL UV_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
@ -319,9 +289,9 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
 ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
 ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER

-# Install uv for faster pip installs
-RUN --mount=type=cache,target=/root/.cache/uv \
-    python3 -m pip install uv
+# Install uv and Python
+COPY --from=ghcr.io/astral-sh/uv:0.8.22 /uv /uvx /bin/
+RUN uv python install ${PYTHON_VERSION} --default --verbose

 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
@ -555,5 +525,5 @@ ENTRYPOINT ["./sagemaker-entrypoint.sh"]

 FROM vllm-openai-base AS vllm-openai

-ENTRYPOINT ["vllm", "serve"]
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
 #################### OPENAI API SERVER ####################
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@ -177,4 +177,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
    uv pip install dist/*.whl

-ENTRYPOINT ["vllm", "serve"]
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/docker/Dockerfile.ppc64le
+++ b/docker/Dockerfile.ppc64le
@ -314,4 +314,4 @@ WORKDIR /workspace/

 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

-ENTRYPOINT ["vllm", "serve"]
+ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
--- a/docker/Dockerfile.s390x
+++ b/docker/Dockerfile.s390x
@ -309,4 +309,4 @@ USER 2000
 WORKDIR /home/vllm

 # Set the default entrypoint
-ENTRYPOINT ["vllm", "serve"]
+ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@ -69,4 +69,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \

 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
-ENTRYPOINT ["vllm", "serve"]
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/docs/assets/contributing/dockerfile-stages-dependency.png
+++ b/docs/assets/contributing/dockerfile-stages-dependency.png
--- a/docs/contributing/benchmarks.md
+++ b/docs/contributing/benchmarks.md
@ -661,7 +661,8 @@ Benchmark the performance of multi-modal requests in vLLM.
 Start vLLM:

 ```bash
-vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
+python -m vllm.entrypoints.openai.api_server \
+  --model Qwen/Qwen2.5-VL-7B-Instruct \
  --dtype bfloat16 \
  --limit-mm-per-prompt '{"image": 1}' \
  --allowed-local-media-path /path/to/sharegpt4v/images
@ -687,7 +688,8 @@ vllm bench serve \
 Start vLLM:

 ```bash
-vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
+python -m vllm.entrypoints.openai.api_server \
+  --model Qwen/Qwen2.5-VL-7B-Instruct \
  --dtype bfloat16 \
  --limit-mm-per-prompt '{"video": 1}' \
  --allowed-local-media-path /path/to/sharegpt4video/videos
--- a/docs/contributing/model/multimodal.md
+++ b/docs/contributing/model/multimodal.md
@ -258,21 +258,17 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
            self,
            seq_len: int,
            mm_counts: Mapping[str, int],
-            mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
        ) -> MultiModalDataDict:
            num_images = mm_counts.get("image", 0)

            target_width, target_height = \
                self.info.get_image_size_with_most_features()

-            image_overrides = mm_options.get("image") if mm_options else None
-
            return {
                "image":
                self._get_dummy_images(width=target_width,
                                    height=target_height,
-                                    num_images=num_images,
-                                    overrides=image_overrides)
+                                    num_images=num_images)
            }
        ```

@ -442,20 +438,16 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
            self,
            seq_len: int,
            mm_counts: Mapping[str, int],
-            mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
        ) -> MultiModalDataDict:
            target_width, target_height = \
                self.info.get_image_size_with_most_features()
            num_images = mm_counts.get("image", 0)

-            image_overrides = mm_options.get("image") if mm_options else None
-
            return {
                "image":
                self._get_dummy_images(width=target_width,
                                    height=target_height,
-                                    num_images=num_images,
-                                    overrides=image_overrides)
+                                    num_images=num_images)
            }
        ```

--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@ -39,7 +39,8 @@ Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example

 ```bash
 VLLM_TORCH_PROFILER_DIR=./vllm_profile \
-    vllm serve meta-llama/Meta-Llama-3-70B
+    python -m vllm.entrypoints.openai.api_server \
+    --model meta-llama/Meta-Llama-3-70B
 ```

 vllm bench command:
--- a/docs/deployment/frameworks/autogen.md
+++ b/docs/deployment/frameworks/autogen.md
@ -19,7 +19,8 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]"
 1. Start the vLLM server with the supported chat completion model, e.g.

    ```bash
-    vllm serve mistralai/Mistral-7B-Instruct-v0.2
+    python -m vllm.entrypoints.openai.api_server \
+        --model mistralai/Mistral-7B-Instruct-v0.2
    ```

 1. Call it with AutoGen:
--- a/docs/deployment/frameworks/open-webui.md
+++ b/docs/deployment/frameworks/open-webui.md
@ -20,7 +20,7 @@ To get started with Open WebUI using vLLM, follow these steps:
        For example:

        ```console
-        vllm serve <model> --host 0.0.0.0 --port 8000
+        python -m vllm.entrypoints.openai.api_server --model <model> --host 0.0.0.0 --port 8000
        ```

 3. Start the Open WebUI Docker container:
--- a/docs/deployment/frameworks/skypilot.md
+++ b/docs/deployment/frameworks/skypilot.md
@ -32,7 +32,6 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
      ports: 8081  # Expose to internet traffic.

    envs:
-      PYTHONUNBUFFERED: 1
      MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
      HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.

@ -48,8 +47,9 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
    run: |
      conda activate vllm
      echo 'Starting vllm api server...'
-      vllm serve $MODEL_NAME \
+      python -u -m vllm.entrypoints.openai.api_server \
        --port 8081 \
+        --model $MODEL_NAME \
        --trust-remote-code \
        --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
        2>&1 | tee api_server.log &
@ -131,7 +131,6 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
      ports: 8081  # Expose to internet traffic.

    envs:
-      PYTHONUNBUFFERED: 1
      MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
      HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.

@ -147,8 +146,9 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
    run: |
      conda activate vllm
      echo 'Starting vllm api server...'
-      vllm serve $MODEL_NAME \
+      python -u -m vllm.entrypoints.openai.api_server \
        --port 8081 \
+        --model $MODEL_NAME \
        --trust-remote-code \
        --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
        2>&1 | tee api_server.log
@ -243,7 +243,6 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
      ports: 8081  # Expose to internet traffic.

    envs:
-      PYTHONUNBUFFERED: 1
      MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
      HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.

@ -259,8 +258,9 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
    run: |
      conda activate vllm
      echo 'Starting vllm api server...'
-      vllm serve $MODEL_NAME \
+      python -u -m vllm.entrypoints.openai.api_server \
        --port 8081 \
+        --model $MODEL_NAME \
        --trust-remote-code \
        --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
        2>&1 | tee api_server.log
--- a/docs/design/arch_overview.md
+++ b/docs/design/arch_overview.md
@ -69,11 +69,6 @@ Sometimes you may see the API server entrypoint used directly instead of via the
 python -m vllm.entrypoints.openai.api_server --model <model>
 ```

-!!! warning
-
-    `python -m vllm.entrypoints.openai.api_server` is deprecated
-    and may become unsupported in a future release.
-
 That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.

 More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@ -8,9 +8,6 @@ This page teaches you how to pass multi-modal inputs to [multi-modal models][sup

 !!! tip
    When serving multi-modal models, consider setting `--allowed-media-domains` to restrict domain that vLLM can access to prevent it from accessing arbitrary endpoints that can potentially be vulnerable to Server-Side Request Forgery (SSRF) attacks. You can provide a list of domains for this arg. For example: `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`
-
-    Also, consider setting `VLLM_MEDIA_URL_ALLOW_REDIRECTS=0` to prevent HTTP redirects from being followed to bypass domain restrictions.
-
    This restriction is especially important if you run vLLM in a containerized environment where the vLLM pods may have unrestricted access to internal networks.

 ## Offline Inference
--- a/docs/features/sleep_mode.md
+++ b/docs/features/sleep_mode.md
@ -64,7 +64,8 @@ To enable sleep mode in a vLLM server you need to initialize it with the flag `V
 When using the flag `VLLM_SERVER_DEV_MODE=1` you enable development endpoints, and these endpoints should not be exposed to users.

 ```bash
-VLLM_SERVER_DEV_MODE=1 vllm serve Qwen/Qwen3-0.6B \
+VLLM_SERVER_DEV_MODE=1 python -m vllm.entrypoints.openai.api_server \
+  --model Qwen/Qwen3-0.6B \
  --enable-sleep-mode \
  --port 8000
 ```
--- a/docs/features/spec_decode.md
+++ b/docs/features/spec_decode.md
@ -48,9 +48,10 @@ The following code configures vLLM in an offline mode to use speculative decodin
 To perform the same with an online mode launch the server:

 ```bash
-vllm serve facebook/opt-6.7b \
+python -m vllm.entrypoints.openai.api_server \
    --host 0.0.0.0 \
    --port 8000 \
+    --model facebook/opt-6.7b \
    --seed 42 \
    -tp 1 \
    --gpu_memory_utilization 0.8 \
--- a/docs/getting_started/installation/gpu/xpu.inc.md
+++ b/docs/getting_started/installation/gpu/xpu.inc.md
@ -67,7 +67,8 @@ docker run -it \
 XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. For **pipeline parallel**, we support it on single node with mp as the backend. For example, a reference execution like following:

 ```bash
-vllm serve facebook/opt-13b \
+python -m vllm.entrypoints.openai.api_server \
+     --model=facebook/opt-13b \
     --dtype=bfloat16 \
     --max_model_len=1024 \
     --distributed-executor-backend=mp \
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@ -17,12 +17,12 @@ These models are what we list in [supported-text-models][supported-text-models]

 ### Transformers

-vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".
+vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <1% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".

 Currently, the Transformers backend works for the following:

 - Modalities: embedding models, language models and vision-language models*
- Architectures: encoder-only, decoder-only, mixture-of-experts
+- Architectures: encoder-only, decoder-only
 - Attention types: full attention and/or sliding attention

 _*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._
@ -31,7 +31,6 @@ If the Transformers model implementation follows all the steps in [writing a cus

 - All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature)
 - Any combination of the following vLLM parallelisation schemes:
-    - Data parallel
    - Pipeline parallel
    - Tensor parallel

@ -677,7 +676,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
 | `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
 | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
-| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ | ✅︎ |
--- a/docs/usage/security.md
+++ b/docs/usage/security.md
@ -66,9 +66,6 @@ Restrict domains that vLLM can access for media URLs by setting
 `--allowed-media-domains` to prevent Server-Side Request Forgery (SSRF) attacks.
 (e.g. `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`)

-Also, consider setting `VLLM_MEDIA_URL_ALLOW_REDIRECTS=0` to prevent HTTP
-redirects from being followed to bypass domain restrictions.
-
 ## Security and Firewalls: Protecting Exposed vLLM Systems

 While vLLM is designed to allow unsafe network services to be isolated to
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@ -576,7 +576,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:

 # Intern-S1
 def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
-    model_name = "internlm/Intern-S1-mini"
+    model_name = "internlm/Intern-S1"

    engine_args = EngineArgs(
        model=model_name,
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@ -309,7 +309,7 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:


 def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "internlm/Intern-S1-mini"
+    model_name = "internlm/Intern-S1"

    engine_args = EngineArgs(
        model=model_name,
--- a/examples/online_serving/sagemaker-entrypoint.sh
+++ b/examples/online_serving/sagemaker-entrypoint.sh
@ -21,4 +21,4 @@ while IFS='=' read -r key value; do
 done < <(env | grep "^${PREFIX}")

 # Pass the collected arguments to the main entrypoint
-exec vllm serve "${ARGS[@]}"
+exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}"
--- a/requirements/kv_connectors.txt
+++ b/requirements/kv_connectors.txt
@ -1,2 +1,2 @@
 lmcache
-nixl >= 0.6.0 # Required for disaggregated prefill
+nixl >= 0.5.1 # Required for disaggregated prefill
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@ -11,8 +11,8 @@ import pytest
 import torch

 from tests.quantization.utils import is_quant_method_supported
+from tests.v1.attention.utils import _Backend
 from vllm import LLM, SamplingParams
-from vllm.attention.backends.registry import _Backend
 from vllm.attention.selector import global_force_attn_backend_context_manager
 from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
                         PassConfig)
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@ -8,11 +8,11 @@ import torch._dynamo

 from tests.compile.backend import LazyInitPass, TestBackend
 from tests.models.utils import check_outputs_equal
-from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
+from tests.v1.attention.utils import (BatchSpec, _Backend,
+                                      create_common_attn_metadata)
 from vllm import LLM, SamplingParams
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
 from vllm.attention import Attention, AttentionMetadata
-from vllm.attention.backends.registry import _Backend
 from vllm.attention.selector import global_force_attn_backend_context_manager
 from vllm.compilation.fusion import QUANT_OPS
 from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -756,7 +756,7 @@ class VllmRunner:

    def get_inputs(
        self,
-        prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
+        prompts: Union[list[str], list[torch.Tensor], list[int]],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@ -86,16 +86,3 @@ def test_max_model_len():
        # It can be less if generation finishes due to other reasons (e.g., EOS)
        # before reaching the absolute model length limit.
        assert num_total_tokens <= max_model_len
-
-
-def test_log_stats():
-    llm = LLM(
-        model=MODEL_NAME,
-        disable_log_stats=False,
-        gpu_memory_utilization=0.10,
-        enforce_eager=True,  # reduce test time
-    )
-    outputs = llm.generate(PROMPTS, sampling_params=None)
-
-    # disable_log_stats is False, every output should have metrics
-    assert all(output.metrics is not None for output in outputs)
--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
@ -122,9 +122,6 @@ def mock_serving_setup():
                                                 models,
                                                 request_logger=None)

-    serving_completion._process_inputs = AsyncMock(return_value=(MagicMock(
-        name="engine_request"), {}))
-
    return mock_engine, serving_completion


--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@ -698,22 +698,6 @@ async def test_function_calling_required(client: OpenAI, model_name: str):
        )


-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_system_message_with_tools(client: OpenAI, model_name: str):
-    from vllm.entrypoints.harmony_utils import get_system_message
-
-    # Test with custom tools enabled - commentary channel should be available
-    sys_msg = get_system_message(with_custom_tools=True)
-    valid_channels = sys_msg.content[0].channel_config.valid_channels
-    assert "commentary" in valid_channels
-
-    # Test with custom tools disabled - commentary channel should be removed
-    sys_msg = get_system_message(with_custom_tools=False)
-    valid_channels = sys_msg.content[0].channel_config.valid_channels
-    assert "commentary" not in valid_channels
-
-
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_function_calling_full_history(client: OpenAI, model_name: str):
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@ -7,7 +7,7 @@ import asyncio
 from contextlib import suppress
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Optional
-from unittest.mock import AsyncMock, MagicMock
+from unittest.mock import MagicMock

 import pytest
 import pytest_asyncio
@ -230,7 +230,6 @@ class MockHFConfig:
@dataclass
 class MockModelConfig:
    task = "generate"
-    runner_type = "generate"
    tokenizer = MODEL_NAME
    trust_remote_code = False
    tokenizer_mode = "auto"
@ -245,33 +244,11 @@ class MockModelConfig:
    encoder_config = None
    generation_config: str = "auto"
    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
-    skip_tokenizer_init = False

    def get_diff_sampling_param(self):
        return self.diff_sampling_param or {}


-def _build_serving_chat(engine: AsyncLLM,
-                        model_config: MockModelConfig) -> OpenAIServingChat:
-    models = OpenAIServingModels(engine_client=engine,
-                                 base_model_paths=BASE_MODEL_PATHS,
-                                 model_config=model_config)
-    serving_chat = OpenAIServingChat(engine,
-                                     model_config,
-                                     models,
-                                     response_role="assistant",
-                                     chat_template=CHAT_TEMPLATE,
-                                     chat_template_content_format="auto",
-                                     request_logger=None)
-
-    async def _fake_process_inputs(request_id, engine_prompt, sampling_params,
-                                   *, lora_request, trace_headers, priority):
-        return dict(engine_prompt), {}
-
-    serving_chat._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
-    return serving_chat
-
-
@dataclass
 class MockEngine:

@ -305,7 +282,16 @@ async def test_serving_chat_returns_correct_model_name():
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False

-    serving_chat = _build_serving_chat(mock_engine, MockModelConfig())
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=MockModelConfig())
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     MockModelConfig(),
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)
    messages = [{"role": "user", "content": "what is 1+1?"}]

    async def return_model_name(*args):
@ -332,7 +318,16 @@ async def test_serving_chat_should_set_correct_max_tokens():
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False

-    serving_chat = _build_serving_chat(mock_engine, MockModelConfig())
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=MockModelConfig())
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     MockModelConfig(),
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)

    req = ChatCompletionRequest(
        model=MODEL_NAME,
@ -366,7 +361,16 @@ async def test_serving_chat_should_set_correct_max_tokens():
    mock_engine.errored = False

    # Initialize the serving chat
-    serving_chat = _build_serving_chat(mock_engine, mock_model_config)
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=mock_model_config)
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     mock_model_config,
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)

    # Test Case 1: No max_tokens specified in request
    req = ChatCompletionRequest(
@ -411,7 +415,16 @@ async def test_serving_chat_should_set_correct_max_tokens():
    mock_engine.errored = False

    # Initialize the serving chat
-    serving_chat = _build_serving_chat(mock_engine, mock_model_config)
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=mock_model_config)
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     mock_model_config,
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)

    # Test case 1: No max_tokens specified, defaults to context_window
    req = ChatCompletionRequest(
@ -458,7 +471,16 @@ async def test_serving_chat_could_load_correct_generation_config():
    mock_engine.errored = False

    # Initialize the serving chat
-    serving_chat = _build_serving_chat(mock_engine, mock_model_config)
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=mock_model_config)
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     mock_model_config,
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)

    req = ChatCompletionRequest(
        model=MODEL_NAME,
@ -503,7 +525,17 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False

-    serving_chat = _build_serving_chat(mock_engine, mock_model_config)
+    # Initialize the serving chat
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=mock_model_config)
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     mock_model_config,
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)

    # Test cache_salt
    req = ChatCompletionRequest(
@ -517,12 +549,10 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
    # By default, cache_salt in the engine prompt is not set
    with suppress(Exception):
        await serving_chat.create_chat_completion(req)
-    engine_prompt = serving_chat._process_inputs.await_args_list[0].args[1]
-    assert "cache_salt" not in engine_prompt
+    assert "cache_salt" not in mock_engine.generate.call_args.args[0]

    # Test with certain cache_salt
    req.cache_salt = "test_salt"
    with suppress(Exception):
        await serving_chat.create_chat_completion(req)
-    engine_prompt = serving_chat._process_inputs.await_args_list[1].args[1]
-    assert engine_prompt.get("cache_salt") == "test_salt"
+    assert mock_engine.generate.call_args.args[0]["cache_salt"] == "test_salt"
--- a/tests/entrypoints/openai/test_serving_responses.py
+++ b/tests/entrypoints/openai/test_serving_responses.py
@ -1,129 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from contextlib import AsyncExitStack
-from unittest.mock import AsyncMock, MagicMock
-
-import pytest
-import pytest_asyncio
-
-from vllm.entrypoints.context import ConversationContext
-from vllm.entrypoints.openai.protocol import ResponsesRequest
-from vllm.entrypoints.openai.serving_responses import OpenAIServingResponses
-from vllm.entrypoints.tool_server import ToolServer
-
-
-class MockConversationContext(ConversationContext):
-    """Mock conversation context for testing"""
-
-    def __init__(self):
-        self.init_tool_sessions_called = False
-        self.init_tool_sessions_args = None
-        self.init_tool_sessions_kwargs = None
-
-    def append_output(self, output) -> None:
-        pass
-
-    async def call_tool(self):
-        return []
-
-    def need_builtin_tool_call(self) -> bool:
-        return False
-
-    def render_for_completion(self):
-        return []
-
-    async def init_tool_sessions(self, tool_server, exit_stack, request_id,
-                                 mcp_tools):
-        self.init_tool_sessions_called = True
-        self.init_tool_sessions_args = (tool_server, exit_stack, request_id,
-                                        mcp_tools)
-
-    async def cleanup_session(self) -> None:
-        pass
-
-
-@pytest.fixture
-def mock_serving_responses():
-    """Create a mock OpenAIServingResponses instance"""
-    serving_responses = MagicMock(spec=OpenAIServingResponses)
-    serving_responses.tool_server = MagicMock(spec=ToolServer)
-    return serving_responses
-
-
-@pytest.fixture
-def mock_context():
-    """Create a mock conversation context"""
-    return MockConversationContext()
-
-
-@pytest.fixture
-def mock_exit_stack():
-    """Create a mock async exit stack"""
-    return MagicMock(spec=AsyncExitStack)
-
-
-class TestInitializeToolSessions:
-    """Test class for _initialize_tool_sessions method"""
-
-    @pytest_asyncio.fixture
-    async def serving_responses_instance(self):
-        """Create a real OpenAIServingResponses instance for testing"""
-        # Create minimal mocks for required dependencies
-        engine_client = MagicMock()
-        engine_client.get_model_config = AsyncMock()
-
-        model_config = MagicMock()
-        model_config.hf_config.model_type = "test"
-        model_config.get_diff_sampling_param.return_value = {}
-
-        models = MagicMock()
-
-        tool_server = MagicMock(spec=ToolServer)
-
-        # Create the actual instance
-        instance = OpenAIServingResponses(
-            engine_client=engine_client,
-            model_config=model_config,
-            models=models,
-            request_logger=None,
-            chat_template=None,
-            chat_template_content_format="auto",
-            tool_server=tool_server,
-        )
-
-        return instance
-
-    @pytest.mark.asyncio
-    async def test_initialize_tool_sessions(self, serving_responses_instance,
-                                            mock_context, mock_exit_stack):
-        """Test that method works correctly with only MCP tools"""
-
-        request = ResponsesRequest(input="test input", tools=[])
-
-        # Call the method
-        await serving_responses_instance._initialize_tool_sessions(
-            request, mock_context, mock_exit_stack)
-        assert mock_context.init_tool_sessions_called is False
-
-        # Create only MCP tools
-        tools = [
-            {
-                "type": "web_search_preview"
-            },
-            {
-                "type": "code_interpreter",
-                "container": {
-                    "type": "auto"
-                }
-            },
-        ]
-
-        request = ResponsesRequest(input="test input", tools=tools)
-
-        # Call the method
-        await serving_responses_instance._initialize_tool_sessions(
-            request, mock_context, mock_exit_stack)
-
-        # Verify that init_tool_sessions was called
-        assert mock_context.init_tool_sessions_called
--- a/tests/kernels/attention/test_mha_attn.py
+++ b/tests/kernels/attention/test_mha_attn.py
@ -10,9 +10,8 @@ from unittest.mock import patch
 import pytest
 import torch

-from vllm.attention.backends.registry import _Backend
 from vllm.attention.layer import MultiHeadAttention
-from vllm.attention.selector import _cached_get_attn_backend
+from vllm.attention.selector import _Backend, _cached_get_attn_backend
 from vllm.platforms import current_platform
 from vllm.platforms.cpu import CpuPlatform
 from vllm.platforms.cuda import CudaPlatform
--- a/tests/kernels/quantization/test_block_fp8.py
+++ b/tests/kernels/quantization/test_block_fp8.py
@ -11,7 +11,7 @@ from tests.kernels.quant_utils import (native_per_token_group_quant_fp8,
                                       native_w8a8_block_matmul)
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    cutlass_scaled_mm, per_token_group_quant_fp8, w8a8_triton_block_scaled_mm)
+    cutlass_scaled_mm, per_token_group_quant_fp8, w8a8_block_fp8_matmul)
 from vllm.platforms import current_platform
 from vllm.utils import has_deep_gemm
 from vllm.utils.deep_gemm import (fp8_gemm_nt,
@ -91,8 +91,7 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):

    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size,
                                       out_dtype)
-    out = w8a8_triton_block_scaled_mm(A_fp8, B_fp8, As, Bs, block_size,
-                                      out_dtype)
+    out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)

    rel_diff = (torch.mean(
        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
--- a/tests/kernels/quantization/test_fp8_quant_group.py
+++ b/tests/kernels/quantization/test_fp8_quant_group.py
@ -20,11 +20,9 @@ from vllm.platforms import current_platform
        (8, 513, 64),  # Non-divisible (native only)
    ])
@pytest.mark.parametrize("seed", [42])
-@pytest.mark.parametrize("use_ue8m0", [True, False])
@torch.inference_mode()
 def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
-                                      group_size: int, seed: int,
-                                      use_ue8m0: bool) -> None:
+                                      group_size: int, seed: int) -> None:
    """Test QuantFP8 group quantization with various configurations.

    Tests both CUDA and native implementations, column-major scales,
@ -40,8 +38,7 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
    group_shape = GroupShape(1, group_size)
    quant_op = QuantFP8(static=False,
                        group_shape=group_shape,
-                        column_major_scales=False,
-                        use_ue8m0=use_ue8m0)
+                        column_major_scales=False)

    # 1. Test native implementation (always available)
    x_quant_native, scales_native = quant_op.forward_native(x.clone())
@ -51,15 +48,9 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
    # 2. Test column-major scales configuration
    quant_op_col = QuantFP8(static=False,
                            group_shape=group_shape,
-                            column_major_scales=True,
-                            use_ue8m0=use_ue8m0)
+                            column_major_scales=True)
    _, scales_col = quant_op_col.forward_native(x.clone())
-    assert scales_col.shape == (batch_size, expected_num_groups)
-    assert scales_col.stride(0) == 1
-    assert scales_col.stride(1) == batch_size
-
-    # Test column-major scales consistency
-    assert torch.allclose(scales_col, scales_native, rtol=1e-9, atol=1e-8)
+    assert scales_col.shape == (expected_num_groups, batch_size)

    # 3. Test CUDA implementation (only for divisible dimensions)
    if is_divisible:
@ -77,23 +68,21 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,


@pytest.mark.parametrize("seed", [42])
-@pytest.mark.parametrize("use_ue8m0", [True, False])
@torch.inference_mode()
-def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
+def test_quantfp8_group_multidimensional(seed: int) -> None:
    current_platform.seed_everything(seed)

    group_size = 64

    # Test with 3D input
-    batch1, batch2, hidden_dim = 4, 8, 1024
+    batch1, batch2, hidden_dim = 4, 8, 512
    x_3d = torch.randn(
        (batch1, batch2, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8

    group_shape = GroupShape(1, group_size)
    quant_op = QuantFP8(static=False,
                        group_shape=group_shape,
-                        column_major_scales=False,
-                        use_ue8m0=use_ue8m0)
+                        column_major_scales=False)

    x_quant, scales = quant_op.forward_native(x_3d.clone())
    assert x_quant.shape == x_3d.shape
@ -102,10 +91,9 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
    # Test column_major_scales with multi-dim
    quant_op_col = QuantFP8(static=False,
                            group_shape=group_shape,
-                            column_major_scales=True,
-                            use_ue8m0=use_ue8m0)
+                            column_major_scales=True)
    _, scales_col = quant_op_col.forward_native(x_3d.clone())
-    assert scales_col.shape == (batch1, batch2, hidden_dim // group_size)
+    assert scales_col.shape == (batch1, hidden_dim // group_size, batch2)

    # Test with 4D input
    batch1, batch2, batch3, hidden_dim = 2, 3, 4, 256
--- a/Show More
+++ b/Show More