diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base
index 45efcbde69..dc8ec5f1a1 100644
--- a/docker/Dockerfile.rocm_base
+++ b/docker/Dockerfile.rocm_base
@@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
 ARG FA_BRANCH="1a7f4dfa"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="c1debd8"
+ARG AITER_BRANCH="6487649"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"
 
 FROM ${BASE_IMAGE} AS base
diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
index d44989cce7..00f1b1f6b9 100644
--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -22,8 +22,9 @@ class QuantMethod(IntEnum):
     NO = 0  # a16w16
     PER_TENSOR = 1  # w8a8 (pre_Tensor)
     PER_TOKEN = 2  # w8a8/w8a4 (per_Token)
-    BLOCK_1X128 = 3  # block quantized w8a8 (per_1x128)
-    BLOCK_128x128 = 4  # block quantized w8a8 (per_128x128)
+    BLOCK_1X32 = 3  # fp4x2
+    BLOCK_1X128 = 4  # block quantized w8a8 (per_1x128)
+    BLOCK_128x128 = 5  # block quantized w8a8 (per_128x128)
 
 
 class ActivationMethod(IntEnum):
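
For context, a minimal sketch of how QuantMethod reads after this change, with values copied from the hunk above; the surrounding vLLM module is omitted and the asserts are illustrative only, not part of the diff.

from enum import IntEnum


class QuantMethod(IntEnum):
    NO = 0              # a16w16 (unquantized)
    PER_TENSOR = 1      # w8a8, per-tensor scale
    PER_TOKEN = 2       # w8a8/w8a4, per-token scale
    BLOCK_1X32 = 3      # fp4x2 (new in this change)
    BLOCK_1X128 = 4     # block quantized w8a8 (per_1x128), shifted from 3
    BLOCK_128x128 = 5   # block quantized w8a8 (per_128x128), shifted from 4


# Any caller comparing against raw integers rather than enum members
# would need to pick up the shifted values for the w8a8 block layouts.
assert QuantMethod.BLOCK_1X128 == 4
assert QuantMethod.BLOCK_128x128 == 5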