Compare commits

3 commits: revert-267 ... amd_dev

- c7021f1270
- 2072fdc044
- 6eefda507a
```diff
@@ -1,7 +1,7 @@
 # default base image
 ARG REMOTE_VLLM="0"
 ARG COMMON_WORKDIR=/app
-ARG BASE_IMAGE=rocm/vllm-dev:base
+ARG BASE_IMAGE=rocm/vllm-dev:base_custom_1020_rc1_20251008_tuned_20251008

 FROM ${BASE_IMAGE} AS base
```
```diff
@@ -290,7 +290,7 @@ class CompilationConfig:
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""

     # CudaGraph compilation
-    cudagraph_mode: CUDAGraphMode | None = None
+    cudagraph_mode: CUDAGraphMode | None = CUDAGraphMode.FULL
     """
     The mode of the cudagraph:
```
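For context, a minimal usage sketch of the new default. The import path is an assumption; only the class names `CompilationConfig` and `CUDAGraphMode` appear in this diff.

```python
# Minimal sketch, assuming this import path (only the class names are
# visible in the diff itself).
from vllm.config import CompilationConfig, CUDAGraphMode

# After this change the field defaults to FULL cudagraph capture...
cfg = CompilationConfig()
assert cfg.cudagraph_mode == CUDAGraphMode.FULL

# ...and a caller can still restore the previous unset behavior explicitly:
cfg = CompilationConfig(cudagraph_mode=None)
assert cfg.cudagraph_mode is None
```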
```diff
@@ -521,6 +521,16 @@ class CompilationConfig:
         count_all = self.custom_ops.count("all")
         assert count_none + count_all <= 1, "Can only specify 'none' or 'all'"

+        if "+rms_norm" not in self.custom_ops and "-rms_norm" not in self.custom_ops:
+            self.custom_ops.append("+rms_norm")
+        if (
+            "+silu_and_mul" not in self.custom_ops
+            and "-silu_and_mul" not in self.custom_ops
+        ):
+            self.custom_ops.append("+silu_and_mul")
+        if "+quant_fp8" not in self.custom_ops and "-quant_fp8" not in self.custom_ops:
+            self.custom_ops.append("+quant_fp8")
+
         # TODO(zou3519/luka): There are 2 issues with auto-functionalization V2:
         # 1. A bug in PyTorch, fixed in 2.7:
         #    https://github.com/pytorch/pytorch/issues/147924
```
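The added block follows the `custom_ops` convention visible in the hunk: `+op` enables an op, `-op` disables it. A self-contained sketch of the rule it implements (the helper name is hypothetical; the real logic runs inline in `CompilationConfig`):

```python
# Hypothetical helper name; the real logic runs inline in CompilationConfig.
# An op is force-enabled only when the user expressed no preference, i.e.
# neither "+op" (enable) nor "-op" (disable) is already in the list.
def apply_default_custom_ops(custom_ops: list[str]) -> list[str]:
    for op in ("rms_norm", "silu_and_mul", "quant_fp8"):
        if f"+{op}" not in custom_ops and f"-{op}" not in custom_ops:
            custom_ops.append(f"+{op}")
    return custom_ops

assert apply_default_custom_ops([]) == ["+rms_norm", "+silu_and_mul", "+quant_fp8"]
# An explicit opt-out is left untouched:
assert apply_default_custom_ops(["-rms_norm"]) == [
    "-rms_norm", "+silu_and_mul", "+quant_fp8",
]
```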
```diff
@@ -752,7 +762,12 @@ class CompilationConfig:
             # captured. see https://github.com/vllm-project/vllm/pull/20059
             # for details. Make a copy to avoid mutating the class-level
             # list via reference.
-            self.splitting_ops = list(self._attention_ops)
+            self.splitting_ops = (
+                []
+                if self.cudagraph_mode == CUDAGraphMode.FULL
+                else list(self._attention_ops)
+            )
+
         elif len(self.splitting_ops) == 0:
             logger.warning_once("Using piecewise compilation with empty splitting_ops")
             if self.cudagraph_mode == CUDAGraphMode.PIECEWISE:
```
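In effect, FULL cudagraph mode now leaves `splitting_ops` empty (the graph is captured whole), while other modes keep splitting at the attention ops. A self-contained sketch of the selection, using a placeholder op name since the contents of `_attention_ops` are not shown in the hunk:

```python
from enum import Enum

class CUDAGraphMode(Enum):  # stand-in for vllm's enum; values assumed
    FULL = "FULL"
    PIECEWISE = "PIECEWISE"

# Placeholder contents; the real _attention_ops list is not shown here.
_attention_ops = ["vllm.unified_attention"]

def select_splitting_ops(mode: CUDAGraphMode) -> list[str]:
    # FULL capture keeps the graph in one piece, so there is nothing to
    # split on; piecewise compilation splits at the attention ops.
    return [] if mode == CUDAGraphMode.FULL else list(_attention_ops)

assert select_splitting_ops(CUDAGraphMode.FULL) == []
assert select_splitting_ops(CUDAGraphMode.PIECEWISE) == ["vllm.unified_attention"]
```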
vllm/envs.py (12 lines changed)
```diff
@@ -19,7 +19,7 @@ if TYPE_CHECKING:
     VLLM_NCCL_SO_PATH: str | None = None
     LD_LIBRARY_PATH: str | None = None
     VLLM_USE_TRITON_FLASH_ATTN: bool = True
-    VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
+    VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = True
     VLLM_FLASH_ATTN_VERSION: int | None = None
     LOCAL_RANK: int = 0
     CUDA_VISIBLE_DEVICES: str | None = None
```
```diff
@@ -99,13 +99,13 @@ if TYPE_CHECKING:
     VLLM_DISABLED_KERNELS: list[str] = []
     VLLM_DISABLE_PYNCCL: bool = False
     VLLM_USE_V1: bool = True
-    VLLM_ROCM_USE_AITER: bool = False
+    VLLM_ROCM_USE_AITER: bool = True
     VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False
     VLLM_ROCM_USE_AITER_LINEAR: bool = True
     VLLM_ROCM_USE_AITER_MOE: bool = True
     VLLM_ROCM_USE_AITER_RMSNORM: bool = True
     VLLM_ROCM_USE_AITER_MLA: bool = True
-    VLLM_ROCM_USE_AITER_MHA: bool = True
+    VLLM_ROCM_USE_AITER_MHA: bool = False
     VLLM_ROCM_USE_AITER_FP4_ASM_GEMM: bool = False
     VLLM_ROCM_USE_TRITON_ROPE: bool = False
     VLLM_ROCM_USE_AITER_FP8BMM: bool = True
```
```diff
@@ -485,7 +485,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Use separate prefill and decode kernels for V1 attention instead of
     # the unified triton kernel.
     "VLLM_V1_USE_PREFILL_DECODE_ATTENTION": lambda: (
-        os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower()
+        os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "True").lower()
         in ("true", "1")
     ),
     # Force vllm to use a specific flash-attention version (2 or 3), only valid
```
```diff
@@ -832,7 +832,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Disable aiter ops unless specifically enabled.
     # Acts as a parent switch to enable the rest of the other operations.
     "VLLM_ROCM_USE_AITER": lambda: (
-        os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1")
+        os.getenv("VLLM_ROCM_USE_AITER", "True").lower() in ("true", "1")
     ),
     # Whether to use aiter paged attention.
     # By default is disabled.
```
```diff
@@ -862,7 +862,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Whether to use aiter mha ops.
     # By default is enabled.
     "VLLM_ROCM_USE_AITER_MHA": lambda: (
-        os.getenv("VLLM_ROCM_USE_AITER_MHA", "True").lower() in ("true", "1")
+        os.getenv("VLLM_ROCM_USE_AITER_MHA", "False").lower() in ("true", "1")
     ),
     # Whether to use aiter fp4 gemm asm.
     # By default is disabled.
```
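All three flipped defaults in `vllm/envs.py` flow through the same parse-on-read pattern: the lambda is evaluated when the variable is accessed, and only the fallback string passed to `os.getenv` changed. A runnable sketch of that pattern, assuming the variable is not already set in the environment:

```python
import os
from typing import Any, Callable

# Same parse-on-read pattern as vllm/envs.py: only the fallback string
# changed ("False" -> "True"), so an unset variable now reads as enabled.
environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_ROCM_USE_AITER": lambda: (
        os.getenv("VLLM_ROCM_USE_AITER", "True").lower() in ("true", "1")
    ),
}

assert environment_variables["VLLM_ROCM_USE_AITER"]() is True  # unset -> new default
os.environ["VLLM_ROCM_USE_AITER"] = "0"                        # explicit opt-out
assert environment_variables["VLLM_ROCM_USE_AITER"]() is False
```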