Compare commits

3 commits: revert-267 ... amd_dev

- c7021f1270
- 2072fdc044
- 6eefda507a
```diff
@@ -1,7 +1,7 @@
 # default base image
 ARG REMOTE_VLLM="0"
 ARG COMMON_WORKDIR=/app
-ARG BASE_IMAGE=rocm/vllm-dev:base
+ARG BASE_IMAGE=rocm/vllm-dev:base_custom_1020_rc1_20251008_tuned_20251008

 FROM ${BASE_IMAGE} AS base
```
```diff
@@ -290,7 +290,7 @@ class CompilationConfig:
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""

     # CudaGraph compilation
-    cudagraph_mode: CUDAGraphMode | None = None
+    cudagraph_mode: CUDAGraphMode | None = CUDAGraphMode.FULL
     """
     The mode of the cudagraph:
```
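For context, a minimal usage sketch of the new default. The import path is an assumption; only the class names `CompilationConfig` and `CUDAGraphMode` appear in this diff.

```python
# Minimal sketch, assuming this import path (only the class names are
# visible in the diff itself).
from vllm.config import CompilationConfig, CUDAGraphMode

# After this change the field defaults to FULL cudagraph capture...
cfg = CompilationConfig()
assert cfg.cudagraph_mode == CUDAGraphMode.FULL

# ...and a caller can still restore the previous unset behavior explicitly:
cfg = CompilationConfig(cudagraph_mode=None)
assert cfg.cudagraph_mode is None
```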
```diff
@@ -521,6 +521,16 @@ class CompilationConfig:
         count_all = self.custom_ops.count("all")
         assert count_none + count_all <= 1, "Can only specify 'none' or 'all'"

+        if "+rms_norm" not in self.custom_ops and "-rms_norm" not in self.custom_ops:
+            self.custom_ops.append("+rms_norm")
+        if (
+            "+silu_and_mul" not in self.custom_ops
+            and "-silu_and_mul" not in self.custom_ops
+        ):
+            self.custom_ops.append("+silu_and_mul")
+        if "+quant_fp8" not in self.custom_ops and "-quant_fp8" not in self.custom_ops:
+            self.custom_ops.append("+quant_fp8")
+
         # TODO(zou3519/luka): There are 2 issues with auto-functionalization V2:
         # 1. A bug in PyTorch, fixed in 2.7:
         #    https://github.com/pytorch/pytorch/issues/147924
```
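The added block follows the `custom_ops` convention visible in the hunk: `+op` enables an op, `-op` disables it. A self-contained sketch of the rule it implements (the helper name is hypothetical; the real logic runs inline in `CompilationConfig`):

```python
# Hypothetical helper name; the real logic runs inline in CompilationConfig.
# An op is force-enabled only when the user expressed no preference, i.e.
# neither "+op" (enable) nor "-op" (disable) is already in the list.
def apply_default_custom_ops(custom_ops: list[str]) -> list[str]:
    for op in ("rms_norm", "silu_and_mul", "quant_fp8"):
        if f"+{op}" not in custom_ops and f"-{op}" not in custom_ops:
            custom_ops.append(f"+{op}")
    return custom_ops

assert apply_default_custom_ops([]) == ["+rms_norm", "+silu_and_mul", "+quant_fp8"]
# An explicit opt-out is left untouched:
assert apply_default_custom_ops(["-rms_norm"]) == [
    "-rms_norm", "+silu_and_mul", "+quant_fp8",
]
```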
```diff
@@ -752,7 +762,12 @@ class CompilationConfig:
             # captured. see https://github.com/vllm-project/vllm/pull/20059
             # for details. Make a copy to avoid mutating the class-level
             # list via reference.
-            self.splitting_ops = list(self._attention_ops)
+            self.splitting_ops = (
+                []
+                if self.cudagraph_mode == CUDAGraphMode.FULL
+                else list(self._attention_ops)
+            )
+
         elif len(self.splitting_ops) == 0:
             logger.warning_once("Using piecewise compilation with empty splitting_ops")
             if self.cudagraph_mode == CUDAGraphMode.PIECEWISE:
```
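In effect, FULL cudagraph mode now leaves `splitting_ops` empty (the graph is captured whole), while other modes keep splitting at the attention ops. A self-contained sketch of the selection, using a placeholder op name since the contents of `_attention_ops` are not shown in the hunk:

```python
from enum import Enum

class CUDAGraphMode(Enum):  # stand-in for vllm's enum; values assumed
    FULL = "FULL"
    PIECEWISE = "PIECEWISE"

# Placeholder contents; the real _attention_ops list is not shown here.
_attention_ops = ["vllm.unified_attention"]

def select_splitting_ops(mode: CUDAGraphMode) -> list[str]:
    # FULL capture keeps the graph in one piece, so there is nothing to
    # split on; piecewise compilation splits at the attention ops.
    return [] if mode == CUDAGraphMode.FULL else list(_attention_ops)

assert select_splitting_ops(CUDAGraphMode.FULL) == []
assert select_splitting_ops(CUDAGraphMode.PIECEWISE) == ["vllm.unified_attention"]
```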
vllm/envs.py (12 lines changed)
```diff
@@ -19,7 +19,7 @@ if TYPE_CHECKING:
     VLLM_NCCL_SO_PATH: str | None = None
     LD_LIBRARY_PATH: str | None = None
     VLLM_USE_TRITON_FLASH_ATTN: bool = True
-    VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
+    VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = True
     VLLM_FLASH_ATTN_VERSION: int | None = None
     LOCAL_RANK: int = 0
     CUDA_VISIBLE_DEVICES: str | None = None
```
```diff
@@ -99,13 +99,13 @@ if TYPE_CHECKING:
     VLLM_DISABLED_KERNELS: list[str] = []
     VLLM_DISABLE_PYNCCL: bool = False
     VLLM_USE_V1: bool = True
-    VLLM_ROCM_USE_AITER: bool = False
+    VLLM_ROCM_USE_AITER: bool = True
     VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False
     VLLM_ROCM_USE_AITER_LINEAR: bool = True
     VLLM_ROCM_USE_AITER_MOE: bool = True
     VLLM_ROCM_USE_AITER_RMSNORM: bool = True
     VLLM_ROCM_USE_AITER_MLA: bool = True
-    VLLM_ROCM_USE_AITER_MHA: bool = True
+    VLLM_ROCM_USE_AITER_MHA: bool = False
     VLLM_ROCM_USE_AITER_FP4_ASM_GEMM: bool = False
     VLLM_ROCM_USE_TRITON_ROPE: bool = False
     VLLM_ROCM_USE_AITER_FP8BMM: bool = True
```
```diff
@@ -485,7 +485,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Use separate prefill and decode kernels for V1 attention instead of
     # the unified triton kernel.
     "VLLM_V1_USE_PREFILL_DECODE_ATTENTION": lambda: (
-        os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower()
+        os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "True").lower()
         in ("true", "1")
     ),
     # Force vllm to use a specific flash-attention version (2 or 3), only valid
```
```diff
@@ -832,7 +832,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Disable aiter ops unless specifically enabled.
     # Acts as a parent switch to enable the rest of the other operations.
     "VLLM_ROCM_USE_AITER": lambda: (
-        os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1")
+        os.getenv("VLLM_ROCM_USE_AITER", "True").lower() in ("true", "1")
     ),
     # Whether to use aiter paged attention.
     # By default is disabled.
```
```diff
@@ -862,7 +862,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Whether to use aiter mha ops.
     # By default is enabled.
     "VLLM_ROCM_USE_AITER_MHA": lambda: (
-        os.getenv("VLLM_ROCM_USE_AITER_MHA", "True").lower() in ("true", "1")
+        os.getenv("VLLM_ROCM_USE_AITER_MHA", "False").lower() in ("true", "1")
     ),
     # Whether to use aiter fp4 gemm asm.
     # By default is disabled.
```
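All three flipped defaults in `vllm/envs.py` flow through the same parse-on-read pattern: the lambda is evaluated when the variable is accessed, and only the fallback string passed to `os.getenv` changed. A runnable sketch of that pattern, assuming the variable is not already set in the environment:

```python
import os
from typing import Any, Callable

# Same parse-on-read pattern as vllm/envs.py: only the fallback string
# changed ("False" -> "True"), so an unset variable now reads as enabled.
environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_ROCM_USE_AITER": lambda: (
        os.getenv("VLLM_ROCM_USE_AITER", "True").lower() in ("true", "1")
    ),
}

assert environment_variables["VLLM_ROCM_USE_AITER"]() is True  # unset -> new default
os.environ["VLLM_ROCM_USE_AITER"] = "0"                        # explicit opt-out
assert environment_variables["VLLM_ROCM_USE_AITER"]() is False
```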