update 3.8 v2 (#2112)

* update 3.8 v2

* update 3.8

---------

Co-authored-by: yuzhai <yuzhai@nvidia.com>
This commit is contained in:
Yujia Zhai
2025-02-19 19:03:14 -08:00
committed by GitHub
parent e9627ce55b
commit b84e9802d8
166 changed files with 3986 additions and 4037 deletions

View File

@ -47,11 +47,9 @@ cutlass_test_unit_add_executable(
math.cpp
mixedbits.cpp
nullspace.cpp
packed_tuple.cpp
pointer.cpp
reverse.cpp
swizzle_layout.cpp
transform.cpp
tuple.cpp
tuple_find.cpp
)

View File

@ -1,581 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include "cutlass_unit_test.h"
#include <cutlass/trace.h>
#include <cassert>
#include <cstdint>
#include <tuple>
#include <cute/container/tuple.hpp>
#include <cute/container/packed_tuple.hpp>
#include <cute/algorithm/tuple_algorithms.hpp>
#include <cute/tensor.hpp>
namespace pt_test {
// Wrapper holding a single value of type T.  Used by the tuple tests
// as a guaranteed-nonempty (stateful) element type.
template <class T>
struct Nonempty {
  T datum;  // the wrapped value

  Nonempty(T const& t) : datum{t} {}

  // Two wrappers are equal exactly when the wrapped values are.
  friend bool operator==(Nonempty<T> const& a, Nonempty<T> const& b) {
    return a.datum == b.datum;
  }
  friend bool operator!=(Nonempty<T> const& a, Nonempty<T> const& b) {
    return not (a == b);
  }
};
// A stateless class type.  The Tag parameter makes distinct
// instantiations distinct types; comparison is permitted across tags
// and holds exactly when the tag values match.
template <int Tag>
struct Empty {
  template <int OtherTag>
  friend bool operator==(Empty<Tag> const&, Empty<OtherTag> const&) {
    return Tag == OtherTag;
  }
  template <int OtherTag>
  friend bool operator!=(Empty<Tag> const& a, Empty<OtherTag> const& b) {
    return not (a == b);
  }
};
// Compile-time survey of which tuple types are standard layout.
// std::tuple
static_assert(cute::is_standard_layout_v<std::tuple<>>); // it happens to be
static_assert(cute::is_standard_layout_v<std::tuple<int>>); // it happens to be
static_assert(cute::is_standard_layout_v<std::tuple<double>>); // it happens to be
static_assert(not cute::is_standard_layout_v<std::tuple<int, double>>); // it's not
#if ! defined(CUTLASS_USE_PACKED_TUPLE)
// cute::tuple (only when it is not an alias for packed_tuple)
static_assert(cute::is_standard_layout_v<cute::tuple<>>); // it happens to be
static_assert(cute::is_standard_layout_v<cute::tuple<int>>); // it happens to be
static_assert(cute::is_standard_layout_v<cute::tuple<double>>); // it happens to be
static_assert(not cute::is_standard_layout_v<cute::tuple<int, double>>); // it's not
#endif // CUTLASS_USE_PACKED_TUPLE
// cute::packed_tuple: standard layout even with multiple nonempty or
// nested elements, which std::tuple (above) is not.
static_assert(cute::is_standard_layout_v<cute::packed_tuple<>>);
static_assert(cute::is_standard_layout_v<cute::packed_tuple<int>>);
static_assert(cute::is_standard_layout_v<cute::packed_tuple<double>>);
static_assert(cute::is_standard_layout_v<cute::packed_tuple<int, double>>); // it is
static_assert(cute::is_standard_layout_v<cute::packed_tuple<int, int, int, int>>); // it is
static_assert(cute::is_standard_layout_v<cute::packed_tuple<int, cute::packed_tuple<int, int>, int>>); // it is
static_assert(cute::is_standard_layout_v<cute::packed_tuple<int, cute::packed_tuple<Empty<0>, Empty<0>>, int>>); // it is
//////////////////////////////////////////////////////////////////////
// packed_tuple test starts here
//////////////////////////////////////////////////////////////////////
// Check that cute::packed_tuple<Args...> behaves like the std::tuple it
// is built from: standard layout is preserved when all Args have it,
// emptiness is preserved when all Args are empty, tuple_size reports the
// full arity, element types match, and element values compare equal.
// NOTE(review): ExpectedPackedSize is never used in the body -- possibly
// a static_assert on the number of stored elements was intended; confirm.
template <
class ExpectedPackedType,
size_t ExpectedPackedSize,
class ... Args>
constexpr void
test_packed_type_alias([[maybe_unused]] ExpectedPackedType packed, std::tuple<Args...> unpacked)
{
using cute::packed_tuple;
// Packing must preserve standard-layout-ness...
if constexpr ((cute::is_standard_layout_v<Args> && ...)) {
static_assert(cute::is_standard_layout_v<packed_tuple<Args...>>);
}
// ...and emptiness.
if constexpr ((cute::is_empty_v<Args> && ...)) {
static_assert(cute::is_empty_v<packed_tuple<Args...>>);
}
// tuple_size reports the full arity, regardless of storage.
static_assert(cute::tuple_size_v<packed_tuple<Args...>> == sizeof...(Args));
auto test_element = [unpacked] (auto index) {
// Element types must agree with std::tuple's.
static_assert(cute::is_same_v<
std::tuple_element_t<index, packed_tuple<Args...>>,
std::tuple_element_t<index, std::tuple<Args...>>
>);
// Rebuild a packed_tuple from the unpacked std::tuple and compare
// the element at `index`.
packed_tuple<Args...> sl = cute::apply(unpacked, [](auto... a){ return cute::make_packed_tuple(a...); });
EXPECT_EQ(std::get<index>(unpacked), cute::get<index>(sl));
};
cute::for_each(std::make_index_sequence<sizeof...(Args)>(), test_element);
}
// Exercise test_packed_type_alias over many permutations of empty and
// nonempty element types.  The first template argument is the expected
// "packed" representation (empty elements removed), the second is its
// expected arity.
void test_packed_type_aliases() {
using cute::packed_tuple;
test_packed_type_alias<packed_tuple<>, 0>({}, {});
test_packed_type_alias<packed_tuple<int>, 1, int>({7}, {7});
test_packed_type_alias<packed_tuple<double>, 1, double>({1.5}, {1.5});
// Make sure that class types are handled the same as scalar types
test_packed_type_alias<packed_tuple<Nonempty<int>>, 1, Nonempty<int>>(
{Nonempty{7}}, {Nonempty{7}});
test_packed_type_alias<packed_tuple<Nonempty<double>>, 1, Nonempty<double>>(
{Nonempty{1.5}}, {Nonempty{1.5}});
// All-empty inputs pack to the empty tuple.
test_packed_type_alias<packed_tuple<>, 0, Empty<0>>({}, {});
test_packed_type_alias<packed_tuple<>, 0, Empty<0>, Empty<1>>(
{}, {Empty<0>{}, Empty<1>{}});
test_packed_type_alias<packed_tuple<>, 0, Empty<0>, Empty<1>, Empty<2>>(
{}, {Empty<0>{}, Empty<1>{}, Empty<2>{}});
// One nonempty element among empties, in every position.
test_packed_type_alias<packed_tuple<int>, 1, Empty<0>, int>(
{7}, {Empty<0>{}, 7});
test_packed_type_alias<packed_tuple<int>, 1, int, Empty<0>>(
{7}, {7, Empty<0>{}});
test_packed_type_alias<packed_tuple<int>, 1, int, Empty<0>, Empty<1>>(
{7}, {7, Empty<0>{}, Empty<1>{}});
test_packed_type_alias<packed_tuple<int>, 1, Empty<0>, int, Empty<1>>(
{7}, {Empty<0>{}, 7, Empty<1>{}});
test_packed_type_alias<packed_tuple<int>, 1, Empty<0>, Empty<1>, int>(
{7}, {Empty<0>{}, Empty<1>{}, 7});
// Two nonempty elements among empties.
test_packed_type_alias<packed_tuple<int, double>, 2, int, double, Empty<0>>(
{7, 1.5}, {7, 1.5, Empty<0>{}});
test_packed_type_alias<packed_tuple<int, double>, 2, int, Empty<0>, double>(
{7, 1.5}, {7, Empty<0>{}, 1.5});
// NOTE(review): exact duplicate of the (int, double, Empty<0>) case two
// calls above -- possibly a different permutation was intended; confirm.
test_packed_type_alias<packed_tuple<int, double>, 2, int, double, Empty<0>>(
{7, 1.5}, {7, 1.5, Empty<0>{}});
test_packed_type_alias<packed_tuple<int, double>, 2, int, double, Empty<0>, Empty<1>>(
{7, 1.5}, {7, 1.5, Empty<0>{}, Empty<1>{}});
test_packed_type_alias<packed_tuple<int, double>, 2, int, Empty<0>, double, Empty<1>>(
{7, 1.5}, {7, Empty<0>{}, 1.5, Empty<1>{}});
test_packed_type_alias<packed_tuple<int, double>, 2, int, Empty<0>, Empty<1>, double>(
{7, 1.5}, {7, Empty<0>{}, Empty<1>{}, 1.5});
test_packed_type_alias<packed_tuple<int, double>, 2, Empty<0>, int, Empty<1>, double>(
{7, 1.5}, {Empty<0>{}, 7, Empty<1>{}, 1.5});
test_packed_type_alias<packed_tuple<int, double>, 2, Empty<0>, Empty<1>, int, double>(
{7, 1.5}, {Empty<0>{}, Empty<1>{}, 7, 1.5});
// Three nonempty elements with one empty element in every position.
test_packed_type_alias<packed_tuple<int, double, float>, 3, Empty<0>, int, double, float>(
{7, 1.5, 2.5f}, {Empty<0>{}, 7, 1.5, 2.5f});
test_packed_type_alias<packed_tuple<int, double, float>, 3, int, Empty<0>, double, float>(
{7, 1.5, 2.5f}, {7, Empty<0>{}, 1.5, 2.5f});
test_packed_type_alias<packed_tuple<int, double, float>, 3, int, double, Empty<0>, float>(
{7, 1.5, 2.5f}, {7, 1.5, Empty<0>{}, 2.5f});
test_packed_type_alias<packed_tuple<int, double, float>, 3, int, double, float, Empty<0>>(
{7, 1.5, 2.5f}, {7, 1.5, 2.5f, Empty<0>{}});
}
// Compile-time check: does std::tuple_element report
// ExpectedElementType as element number Which of Tuple?
template <class Tuple, size_t Which, class ExpectedElementType>
constexpr bool test_tuple_element() {
  using actual_element_type = std::tuple_element_t<Which, Tuple>;
  return cute::is_same_v<actual_element_type, ExpectedElementType>;
}
// Spot-check test_tuple_element for both std::tuple and
// cute::packed_tuple holding an empty element type.
void test_tuple_elements() {
using cute::packed_tuple;
static_assert(test_tuple_element<std::tuple<Empty<0>>, 0, Empty<0>>());
static_assert(test_tuple_element<packed_tuple<Empty<0>>, 0, Empty<0>>());
}
// A default-constructible (and empty) type.  The Value tag makes
// distinct instantiations distinct types.
template <size_t Value>
struct DefaultConstructible {};
void test_default_constructibility() {
using cute::packed_tuple;
{
[[maybe_unused]] packed_tuple<> t_p_0;
[[maybe_unused]] packed_tuple<DefaultConstructible<0>> t_p_1;
[[maybe_unused]] packed_tuple<DefaultConstructible<0>, DefaultConstructible<1>> t_p_2;
[[maybe_unused]] packed_tuple<DefaultConstructible<0>, int, DefaultConstructible<1>> t_p_3;
}
}
// Verify that packed_tuple spends no storage on empty element types:
// a tuple of (int, Empty, double) has the size of {int, double}, and a
// tuple of only empty (possibly nested) elements is itself empty.
void test_sizes_and_not_storing_empty_types() {
using cute::packed_tuple;
[[maybe_unused]] packed_tuple<
int,
pt_test::Empty<0>,
double
> pt{42, pt_test::Empty<0>{}, 1.5};
static_assert(cute::is_standard_layout_v<decltype(pt)>);
// packed_result_type must only store the packed tuple,
// and not the integer_sequence(s) used to access it.
// The latter can be represented entirely at compile time as types.
struct { int i; double j; } IntDouble;
static_assert(sizeof(pt) == sizeof(IntDouble));
EXPECT_EQ(cute::get<0>(pt), 42);
EXPECT_EQ(cute::get<1>(pt), pt_test::Empty<0>{});
EXPECT_EQ(cute::get<2>(pt), 1.5);
// A tuple of empty types, however deeply nested, is empty too
// (and thus has size 1, the minimum object size).
packed_tuple<
pt_test::Empty<0>,
pt_test::Empty<1>,
packed_tuple<
pt_test::Empty<0>,
pt_test::Empty<1>,
packed_tuple<pt_test::Empty<0>, packed_tuple<>>
>
> pt_empty{};
static_assert(cute::is_empty_v<decltype(pt_empty)>);
static_assert(cute::is_standard_layout_v<decltype(pt_empty)>);
static_assert(sizeof(pt_empty) == 1);
// Template arguments must be default constructible,
// and packed_tuple itself needs a default constructor.
[[maybe_unused]] packed_tuple<
packed_tuple<int, pt_test::Empty<2>>,
double,
pt_test::Empty<3>> pt2;
static_assert(cute::is_standard_layout_v<decltype(pt2)>);
// cute::packed_tuple, like the original cute::tuple, does not
// promise to have working CTAD (constructor template argument
// deduction).
[[maybe_unused]] packed_tuple<
packed_tuple<int, pt_test::Empty<0>>,
pt_test::Empty<1>
> pt3{
packed_tuple<int, pt_test::Empty<0>>{42, pt_test::Empty<0>{}},
pt_test::Empty<1>{}
};
static_assert(cute::is_standard_layout_v<decltype(pt3)>);
static_assert(cute::is_same_v<
cute::tuple_element_t<0, decltype(pt3)>,
packed_tuple<int, pt_test::Empty<0>>>);
static_assert(cute::is_same_v<
cute::tuple_element_t<1, decltype(pt3)>,
pt_test::Empty<1>>);
static_assert(cute::tuple_size_v<cute::tuple_element_t<0, decltype(pt3)>> == 2u);
// Element access works through the nesting, one level at a time.
packed_tuple<int, pt_test::Empty<0>> pt3_0 = cute::get<0>(pt3);
auto pt3_0_1 = cute::get<1>(pt3_0);
static_assert(cute::is_same_v<decltype(pt3_0_1), pt_test::Empty<0>>);
EXPECT_EQ(cute::get<0>(cute::get<0>(pt3)), 42);
EXPECT_EQ(cute::get<1>(cute::get<0>(pt3)), pt_test::Empty<0>{});
}
} // namespace pt_test
// Driver that runs the run-time portions of the pt_test functions
// defined above (the static_asserts already ran at compile time).
TEST(CuTe_core, PackedTuple2)
{
CUTLASS_TRACE_HOST("-------------------------------");
CUTLASS_TRACE_HOST("packed_tuple");
CUTLASS_TRACE_HOST("-------------------------------");
pt_test::test_packed_type_aliases();
pt_test::test_tuple_elements();
pt_test::test_default_constructibility();
pt_test::test_sizes_and_not_storing_empty_types();
}
// Element access (cute::get) on packed_tuple across value categories:
// each element-type combination is exercised on a mutable tuple
// (read and write), a const tuple (reference type checked), and an
// rvalue tuple.
TEST(CuTe_core, PackedTuple2Get) {
using cute::packed_tuple;
using pt_test::Empty;
using pt_test::Nonempty;
// packed_tuple<int>: one nonempty element.
{
using tuple_type = packed_tuple<int>;
tuple_type pt{42};
static_assert(cute::tuple_size_v<tuple_type> == 1u);
static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, int>);
EXPECT_EQ(cute::get<0>(pt), 42);
cute::get<0>(pt) = 43;
EXPECT_EQ(cute::get<0>(pt), 43);
}
{
using tuple_type = packed_tuple<int>;
tuple_type const pt{42};
EXPECT_EQ(cute::get<0>(pt), 42);
static_assert(cute::is_same_v<decltype(cute::get<0>(pt)), int const&>);
}
{
EXPECT_EQ(cute::get<0>(packed_tuple<int>{42}), 42);
}
// packed_tuple<Empty<0>>: one empty element.
{
using tuple_type = packed_tuple<pt_test::Empty<0>>;
tuple_type pt;
static_assert(cute::tuple_size_v<tuple_type> == 1u);
static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, pt_test::Empty<0>>);
EXPECT_EQ(cute::get<0>(pt), pt_test::Empty<0>{});
}
{
using tuple_type = packed_tuple<pt_test::Empty<0>>;
tuple_type const pt;
EXPECT_EQ(cute::get<0>(pt), pt_test::Empty<0>{});
}
{
using tuple_type = packed_tuple<pt_test::Empty<0>>;
EXPECT_EQ(cute::get<0>(tuple_type{}), pt_test::Empty<0>{});
}
// packed_tuple<int, double>: two nonempty elements.
{
using tuple_type = packed_tuple<int, double>;
tuple_type pt{1, 2.5};
static_assert(cute::tuple_size_v<tuple_type> == 2u);
static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, int>);
static_assert(cute::is_same_v<cute::tuple_element_t<1, tuple_type>, double>);
EXPECT_EQ(cute::get<0>(pt), 1);
cute::get<0>(pt) = 2;
EXPECT_EQ(cute::get<0>(pt), 2);
EXPECT_EQ(cute::get<1>(pt), 2.5);
cute::get<1>(pt) = 3.5;
EXPECT_EQ(cute::get<1>(pt), 3.5);
}
{
using tuple_type = packed_tuple<int, double>;
tuple_type const pt{1, 2.5};
EXPECT_EQ(cute::get<0>(pt), 1);
static_assert(cute::is_same_v<decltype(cute::get<0>(pt)), int const&>);
EXPECT_EQ(cute::get<1>(pt), 2.5);
static_assert(cute::is_same_v<decltype(cute::get<1>(pt)), double const&>);
}
{
using tuple_type = packed_tuple<int, double>;
EXPECT_EQ(cute::get<0>(tuple_type{1, 2.5}), 1);
EXPECT_EQ(cute::get<1>(tuple_type{1, 2.5}), 2.5);
}
// packed_tuple<Empty<0>, double>: empty first element.  Note the
// static_assert below: get<0> of a const tuple yields Empty<0> by
// value, not by const reference.
{
using tuple_type = packed_tuple<Empty<0>, double>;
tuple_type pt{Empty<0>{}, 2.5};
static_assert(cute::tuple_size_v<tuple_type> == 2u);
static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, Empty<0>>);
static_assert(cute::is_same_v<cute::tuple_element_t<1, tuple_type>, double>);
EXPECT_EQ(cute::get<0>(pt), Empty<0>{});
EXPECT_EQ(cute::get<1>(pt), 2.5);
cute::get<1>(pt) = 3.5;
EXPECT_EQ(cute::get<1>(pt), 3.5);
}
{
using tuple_type = packed_tuple<Empty<0>, double>;
tuple_type const pt{Empty<0>{}, 2.5};
EXPECT_EQ(cute::get<0>(pt), Empty<0>{});
static_assert(cute::is_same_v<decltype(cute::get<0>(pt)), Empty<0>>);
EXPECT_EQ(cute::get<1>(pt), 2.5);
static_assert(cute::is_same_v<decltype(cute::get<1>(pt)), double const&>);
}
{
using tuple_type = packed_tuple<Empty<0>, double>;
EXPECT_EQ(cute::get<0>(tuple_type{Empty<0>{}, 2.5}), Empty<0>{});
EXPECT_EQ(cute::get<1>(tuple_type{Empty<0>{}, 2.5}), 2.5);
}
// packed_tuple<int, double, Nonempty<float>>: three nonempty elements.
{
using tuple_type = packed_tuple<int, double, Nonempty<float>>;
tuple_type pt{1, 2.5, Nonempty{3.25f}};
static_assert(cute::tuple_size_v<tuple_type> == 3u);
static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, int>);
static_assert(cute::is_same_v<cute::tuple_element_t<1, tuple_type>, double>);
static_assert(cute::is_same_v<cute::tuple_element_t<2, tuple_type>, Nonempty<float>>);
EXPECT_EQ(cute::get<0>(pt), 1);
EXPECT_EQ(cute::get<1>(pt), 2.5);
EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f});
cute::get<0>(pt) = 42;
EXPECT_EQ(cute::get<0>(pt), 42);
cute::get<1>(pt) = 4.5;
EXPECT_EQ(cute::get<1>(pt), 4.5);
cute::get<2>(pt) = Nonempty<float>{3.75f};
EXPECT_EQ(cute::get<2>(pt), Nonempty<float>{3.75f});
}
{
using tuple_type = packed_tuple<int, double, Nonempty<float>>;
tuple_type const pt{1, 2.5, Nonempty{3.25f}};
EXPECT_EQ(cute::get<0>(pt), 1);
EXPECT_EQ(cute::get<1>(pt), 2.5);
EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f});
}
{
using tuple_type = packed_tuple<int, double, Nonempty<float>>;
EXPECT_EQ((cute::get<0>(tuple_type{1, 2.5, Nonempty{3.25f}})), 1);
EXPECT_EQ((cute::get<1>(tuple_type{1, 2.5, Nonempty{3.25f}})), 2.5);
EXPECT_EQ((cute::get<2>(tuple_type{1, 2.5, Nonempty{3.25f}})), Nonempty{3.25f});
}
// packed_tuple<int, Empty<0>, Nonempty<float>>: empty element in the
// middle.
{
using tuple_type = packed_tuple<int, Empty<0>, Nonempty<float>>;
packed_tuple<int, Empty<0>, Nonempty<float>> pt{1, Empty<0>{}, Nonempty{3.25f}};
static_assert(cute::tuple_size_v<tuple_type> == 3u);
static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, int>);
static_assert(cute::is_same_v<cute::tuple_element_t<1, tuple_type>, Empty<0>>);
static_assert(cute::is_same_v<cute::tuple_element_t<2, tuple_type>, Nonempty<float>>);
EXPECT_EQ(cute::get<0>(pt), 1);
EXPECT_EQ(cute::get<1>(pt), Empty<0>{});
EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f});
cute::get<0>(pt) = 42;
EXPECT_EQ(cute::get<0>(pt), 42);
cute::get<2>(pt) = Nonempty<float>{3.75f};
EXPECT_EQ(cute::get<2>(pt), Nonempty<float>{3.75f});
}
{
using tuple_type = packed_tuple<int, Empty<0>, Nonempty<float>>;
tuple_type const pt{1, Empty<0>{}, Nonempty{3.25f}};
EXPECT_EQ(cute::get<0>(pt), 1);
EXPECT_EQ(cute::get<1>(pt), Empty<0>{});
EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f});
}
{
using tuple_type = packed_tuple<int, Empty<0>, Nonempty<float>>;
EXPECT_EQ((cute::get<0>(tuple_type{1, Empty<0>{}, Nonempty{3.25f}})), 1);
EXPECT_EQ((cute::get<1>(tuple_type{1, Empty<0>{}, Nonempty{3.25f}})), Empty<0>{});
EXPECT_EQ((cute::get<2>(tuple_type{1, Empty<0>{}, Nonempty{3.25f}})), Nonempty{3.25f});
}
}
namespace pt_test {
// An empty class type to which Empty<Value> implicitly converts.
// Used to test converting construction of tuple elements.
template<int Value>
struct ConvertibleFromEmpty {
  constexpr ConvertibleFromEmpty() = default;
  constexpr ConvertibleFromEmpty(Empty<Value>) {}

  // Cross-tag comparison: equal exactly when the tag values match.
  template <int OtherValue>
  friend constexpr bool operator==(ConvertibleFromEmpty<Value> const&, ConvertibleFromEmpty<OtherValue> const&) {
    return Value == OtherValue;
  }
  template <int OtherValue>
  friend constexpr bool operator!=(ConvertibleFromEmpty<Value> const& a, ConvertibleFromEmpty<OtherValue> const& b) {
    return not (a == b);
  }
};
} // end namespace pt_test
// Instantiate cute::detail::ESO_t (packed_tuple's storage helper --
// NOTE(review): confirm semantics against cute/container/packed_tuple.hpp)
// in constexpr context for every empty/nonempty element combination.
TEST(CuTe_core, PackedTupleConstexprDefaultConstruction) {
// Make sure that packed_tuple's default constructor is constexpr.
// MSVC makes this a bit more challenging than usual.
using pt_test::Empty;
{
[[maybe_unused]] constexpr cute::detail::ESO_t<Empty<0>> eso1{};
[[maybe_unused]] constexpr cute::detail::ESO_t<int64_t> eso2{};
}
{
[[maybe_unused]] constexpr cute::detail::ESO_t<Empty<0>, Empty<1>> eso0{};
[[maybe_unused]] constexpr cute::detail::ESO_t<int64_t, Empty<1>> eso1{};
[[maybe_unused]] constexpr cute::detail::ESO_t<Empty<0>, int64_t> eso2{};
[[maybe_unused]] constexpr cute::detail::ESO_t<int64_t, int64_t> eso3{};
}
}
// Converting construction: each case is run for cute::tuple and then
// for cute::packed_tuple, constructing elements from convertible
// argument types (int -> Nonempty<int>, Empty<0> -> ConvertibleFromEmpty<0>).
TEST(CuTe_core, PackedTupleConvertingConstruction) {
using cute::packed_tuple;
using pt_test::ConvertibleFromEmpty;
using pt_test::Empty;
using pt_test::Nonempty;
// Single element, converting from int.
{
using tuple_type = cute::tuple<Nonempty<int>>;
[[maybe_unused]] tuple_type t(7);
EXPECT_EQ(cute::get<0>(t), Nonempty<int>(7));
}
{
using tuple_type = packed_tuple<Nonempty<int>>;
[[maybe_unused]] tuple_type t(7);
EXPECT_EQ(cute::get<0>(t), Nonempty<int>(7));
}
// Single element, converting from an empty type.
{
using tuple_type = cute::tuple<ConvertibleFromEmpty<0>>;
[[maybe_unused]] tuple_type t(Empty<0>{});
EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{});
}
{
using tuple_type = packed_tuple<ConvertibleFromEmpty<0>>;
[[maybe_unused]] tuple_type t(Empty<0>{});
EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{});
}
// Two elements, both nonempty.
{
using tuple_type = cute::tuple<float, Nonempty<int>>;
[[maybe_unused]] tuple_type t(1.5f, 7);
EXPECT_EQ(cute::get<0>(t), 1.5f);
EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
}
{
using tuple_type = packed_tuple<float, Nonempty<int>>;
[[maybe_unused]] tuple_type t(1.5f, 7);
EXPECT_EQ(cute::get<0>(t), 1.5f);
EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
}
// Two elements, empty first element passed through unchanged.
{
using tuple_type = cute::tuple<Empty<0>, Nonempty<int>>;
[[maybe_unused]] tuple_type t(Empty<0>{}, 7);
EXPECT_EQ(cute::get<0>(t), Empty<0>{});
EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
}
{
using tuple_type = packed_tuple<Empty<0>, Nonempty<int>>;
[[maybe_unused]] tuple_type t(Empty<0>{}, 7);
EXPECT_EQ(cute::get<0>(t), Empty<0>{});
EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
}
// Two elements, empty first element converted on construction.
{
using tuple_type = cute::tuple<ConvertibleFromEmpty<0>, Nonempty<int>>;
[[maybe_unused]] tuple_type t(Empty<0>{}, 7);
EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{});
EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
}
{
using tuple_type = packed_tuple<ConvertibleFromEmpty<0>, Nonempty<int>>;
[[maybe_unused]] tuple_type t(Empty<0>{}, 7);
EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{});
EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
}
// Nested tuples constructed from an inner tuple value.
{
using inner_tuple_type = cute::tuple<Empty<0>>;
using outer_tuple_type = cute::tuple<inner_tuple_type>;
[[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}});
}
{
using inner_tuple_type = packed_tuple<Empty<0>>;
using outer_tuple_type = packed_tuple<inner_tuple_type>;
[[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}});
}
{
using inner_tuple_type = cute::tuple<ConvertibleFromEmpty<0>>;
using outer_tuple_type = cute::tuple<inner_tuple_type>;
[[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}});
}
{
using inner_tuple_type = packed_tuple<ConvertibleFromEmpty<0>>;
using outer_tuple_type = packed_tuple<inner_tuple_type>;
[[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}});
}
}

View File

@ -32,6 +32,13 @@
#include "cutlass_unit_test.h"
#include <cutlass/trace.h>
#include <cassert>
#include <cstdint>
#include <tuple>
#include <cute/container/tuple.hpp>
#include <cute/algorithm/tuple_algorithms.hpp>
#include <cute/tensor.hpp>
TEST(CuTe_core, Tuple)
@ -120,6 +127,11 @@ TEST(CuTe_core, Tuple)
ASSERT_TRUE(sizeof(tuple_3h_m_type) == 12);
ASSERT_TRUE(!std::is_empty<tuple_3h_m_type>::value);
ASSERT_TRUE(sizeof(cute::tuple<_1, _1, cute::tuple<int32_t>>) == 4);
ASSERT_TRUE(sizeof(cute::tuple<_1, _0, cute::tuple<int32_t>>) == 4);
ASSERT_TRUE(sizeof(cute::tuple<_1, cute::tuple<_1, int32_t>>) == 4);
ASSERT_TRUE(sizeof(cute::tuple<_1, cute::tuple<_0, int32_t>>) == 4);
CUTLASS_TRACE_HOST("-------------------------------");
CUTLASS_TRACE_HOST("SIMPLE TUPLE OPS");
CUTLASS_TRACE_HOST("-------------------------------");
@ -264,3 +276,588 @@ TEST(CuTe_core, Tuple)
CUTLASS_TRACE_HOST("a(_,1,_,(1,2)) = " << dice(make_coord(_,1,_,make_coord(1,2)), a));
}
}
// Test helpers and checks for cute::tuple.  This namespace mirrors the
// packed_tuple tests visible earlier in this commit, with cute::tuple
// substituted for cute::packed_tuple.
namespace pt_test {
// Wrapper holding a single value of type T; a guaranteed-nonempty
// element type for the tuple tests below.
template <class T>
struct Nonempty {
T datum;
Nonempty(T const& t) : datum{t} {}
friend bool operator==(Nonempty<T> const& lhs, Nonempty<T> const& rhs) {
return lhs.datum == rhs.datum;
}
friend bool operator!=(Nonempty<T> const& lhs, Nonempty<T> const& rhs) {
return !(lhs == rhs);
}
};
// A stateless class type; the V tag makes distinct instantiations
// distinct types.  Cross-tag comparison holds when the tags match.
template <int V>
struct Empty {
template <int W>
friend bool operator==(Empty<V> const&, Empty<W> const&) {
return V == W;
}
template <int W>
friend bool operator!=(Empty<V> const& lhs, Empty<W> const& rhs) {
return !(lhs == rhs);
}
};
// Compile-time survey of which tuple types are standard layout.
// std::tuple
static_assert(cute::is_standard_layout_v<std::tuple<>>); // it happens to be
static_assert(cute::is_standard_layout_v<std::tuple<int>>); // it happens to be
static_assert(cute::is_standard_layout_v<std::tuple<double>>); // it happens to be
static_assert(not cute::is_standard_layout_v<std::tuple<int, double>>); // it's not
// cute::tuple: standard layout even with multiple nonempty or nested
// elements, which std::tuple (above) is not.
static_assert(cute::is_standard_layout_v<cute::tuple<>>);
static_assert(cute::is_standard_layout_v<cute::tuple<int>>);
static_assert(cute::is_standard_layout_v<cute::tuple<double>>);
static_assert(cute::is_standard_layout_v<cute::tuple<int, double>>); // it is
static_assert(cute::is_standard_layout_v<cute::tuple<int, int, int, int>>); // it is
static_assert(cute::is_standard_layout_v<cute::tuple<int, cute::tuple<int, int>, int>>); // it is
static_assert(cute::is_standard_layout_v<cute::tuple<int, cute::tuple<Empty<0>, Empty<0>>, int>>); // it is
//////////////////////////////////////////////////////////////////////
// tuple test starts here
//////////////////////////////////////////////////////////////////////
// Check that cute::tuple<Args...> behaves like the std::tuple it is
// built from: standard layout and emptiness are preserved, tuple_size
// reports the full arity, element types match, and element values
// compare equal.
// NOTE(review): ExpectedPackedSize is never used in the body -- confirm
// whether a static_assert on it was intended.
template <
class ExpectedPackedType,
size_t ExpectedPackedSize,
class ... Args>
constexpr void
test_packed_type_alias([[maybe_unused]] ExpectedPackedType packed, std::tuple<Args...> unpacked)
{
using cute::tuple;
if constexpr ((cute::is_standard_layout_v<Args> && ...)) {
static_assert(cute::is_standard_layout_v<tuple<Args...>>);
}
if constexpr ((cute::is_empty_v<Args> && ...)) {
static_assert(cute::is_empty_v<tuple<Args...>>);
}
static_assert(cute::tuple_size_v<tuple<Args...>> == sizeof...(Args));
auto test_element = [unpacked] (auto index) {
static_assert(cute::is_same_v<
std::tuple_element_t<index, tuple<Args...>>,
std::tuple_element_t<index, std::tuple<Args...>>
>);
// Rebuild a cute::tuple from the unpacked std::tuple and compare
// the element at `index`.
tuple<Args...> sl = cute::apply(unpacked, [](auto... a){ return cute::make_tuple(a...); });
EXPECT_EQ(std::get<index>(unpacked), cute::get<index>(sl));
};
cute::for_each(std::make_index_sequence<sizeof...(Args)>(), test_element);
}
// Exercise test_packed_type_alias over many permutations of empty and
// nonempty element types.
void test_packed_type_aliases() {
using cute::tuple;
test_packed_type_alias<tuple<>, 0>({}, {});
test_packed_type_alias<tuple<int>, 1, int>({7}, {7});
test_packed_type_alias<tuple<double>, 1, double>({1.5}, {1.5});
// Make sure that class types are handled the same as scalar types
test_packed_type_alias<tuple<Nonempty<int>>, 1, Nonempty<int>>(
{Nonempty{7}}, {Nonempty{7}});
test_packed_type_alias<tuple<Nonempty<double>>, 1, Nonempty<double>>(
{Nonempty{1.5}}, {Nonempty{1.5}});
test_packed_type_alias<tuple<>, 0, Empty<0>>({}, {});
test_packed_type_alias<tuple<>, 0, Empty<0>, Empty<1>>(
{}, {Empty<0>{}, Empty<1>{}});
test_packed_type_alias<tuple<>, 0, Empty<0>, Empty<1>, Empty<2>>(
{}, {Empty<0>{}, Empty<1>{}, Empty<2>{}});
test_packed_type_alias<tuple<int>, 1, Empty<0>, int>(
{7}, {Empty<0>{}, 7});
test_packed_type_alias<tuple<int>, 1, int, Empty<0>>(
{7}, {7, Empty<0>{}});
test_packed_type_alias<tuple<int>, 1, int, Empty<0>, Empty<1>>(
{7}, {7, Empty<0>{}, Empty<1>{}});
test_packed_type_alias<tuple<int>, 1, Empty<0>, int, Empty<1>>(
{7}, {Empty<0>{}, 7, Empty<1>{}});
test_packed_type_alias<tuple<int>, 1, Empty<0>, Empty<1>, int>(
{7}, {Empty<0>{}, Empty<1>{}, 7});
test_packed_type_alias<tuple<int, double>, 2, int, double, Empty<0>>(
{7, 1.5}, {7, 1.5, Empty<0>{}});
test_packed_type_alias<tuple<int, double>, 2, int, Empty<0>, double>(
{7, 1.5}, {7, Empty<0>{}, 1.5});
// NOTE(review): exact duplicate of the (int, double, Empty<0>) case two
// calls above -- possibly a different permutation was intended; confirm.
test_packed_type_alias<tuple<int, double>, 2, int, double, Empty<0>>(
{7, 1.5}, {7, 1.5, Empty<0>{}});
test_packed_type_alias<tuple<int, double>, 2, int, double, Empty<0>, Empty<1>>(
{7, 1.5}, {7, 1.5, Empty<0>{}, Empty<1>{}});
test_packed_type_alias<tuple<int, double>, 2, int, Empty<0>, double, Empty<1>>(
{7, 1.5}, {7, Empty<0>{}, 1.5, Empty<1>{}});
test_packed_type_alias<tuple<int, double>, 2, int, Empty<0>, Empty<1>, double>(
{7, 1.5}, {7, Empty<0>{}, Empty<1>{}, 1.5});
test_packed_type_alias<tuple<int, double>, 2, Empty<0>, int, Empty<1>, double>(
{7, 1.5}, {Empty<0>{}, 7, Empty<1>{}, 1.5});
test_packed_type_alias<tuple<int, double>, 2, Empty<0>, Empty<1>, int, double>(
{7, 1.5}, {Empty<0>{}, Empty<1>{}, 7, 1.5});
test_packed_type_alias<tuple<int, double, float>, 3, Empty<0>, int, double, float>(
{7, 1.5, 2.5f}, {Empty<0>{}, 7, 1.5, 2.5f});
test_packed_type_alias<tuple<int, double, float>, 3, int, Empty<0>, double, float>(
{7, 1.5, 2.5f}, {7, Empty<0>{}, 1.5, 2.5f});
test_packed_type_alias<tuple<int, double, float>, 3, int, double, Empty<0>, float>(
{7, 1.5, 2.5f}, {7, 1.5, Empty<0>{}, 2.5f});
test_packed_type_alias<tuple<int, double, float>, 3, int, double, float, Empty<0>>(
{7, 1.5, 2.5f}, {7, 1.5, 2.5f, Empty<0>{}});
}
// Compile-time check: does std::tuple_element report
// ExpectedElementType as element number Which of Tuple?
template <class Tuple, size_t Which, class ExpectedElementType>
constexpr bool test_tuple_element() {
return cute::is_same_v<std::tuple_element_t<Which, Tuple>, ExpectedElementType>;
}
// Spot-check test_tuple_element for std::tuple and cute::tuple.
void test_tuple_elements() {
using cute::tuple;
static_assert(test_tuple_element<std::tuple<Empty<0>>, 0, Empty<0>>());
static_assert(test_tuple_element<tuple<Empty<0>>, 0, Empty<0>>());
}
// A default-constructible type.
template <size_t Value>
struct DefaultConstructible {};
// Verify that cute::tuple can be default-constructed at several
// arities; successful compilation of the declarations is the test.
void test_default_constructibility() {
using cute::tuple;
{
[[maybe_unused]] tuple<> t_p_0;
[[maybe_unused]] tuple<DefaultConstructible<0>> t_p_1;
[[maybe_unused]] tuple<DefaultConstructible<0>, DefaultConstructible<1>> t_p_2;
[[maybe_unused]] tuple<DefaultConstructible<0>, int, DefaultConstructible<1>> t_p_3;
}
}
// Verify that cute::tuple spends no storage on empty element types:
// a tuple of (int, Empty, double) has the size of {int, double}, and a
// tuple of only empty (possibly nested) elements is itself empty.
void test_sizes_and_not_storing_empty_types() {
using cute::tuple;
[[maybe_unused]] tuple<
int,
pt_test::Empty<0>,
double
> pt{42, pt_test::Empty<0>{}, 1.5};
static_assert(cute::is_standard_layout_v<decltype(pt)>);
// packed_result_type must only store the packed tuple,
// and not the integer_sequence(s) used to access it.
// The latter can be represented entirely at compile time as types.
struct { int i; double j; } IntDouble;
static_assert(sizeof(pt) == sizeof(IntDouble));
EXPECT_EQ(cute::get<0>(pt), 42);
EXPECT_EQ(cute::get<1>(pt), pt_test::Empty<0>{});
EXPECT_EQ(cute::get<2>(pt), 1.5);
// A tuple of empty types, however deeply nested, is empty too.
tuple<
pt_test::Empty<0>,
pt_test::Empty<1>,
tuple<
pt_test::Empty<0>,
pt_test::Empty<1>,
tuple<pt_test::Empty<0>, tuple<>>
>
> pt_empty{};
static_assert(cute::is_empty_v<decltype(pt_empty)>);
static_assert(cute::is_standard_layout_v<decltype(pt_empty)>);
static_assert(sizeof(pt_empty) == 1);
// Template arguments must be default constructible,
// and tuple itself needs a default constructor.
[[maybe_unused]] tuple<
tuple<int, pt_test::Empty<2>>,
double,
pt_test::Empty<3>> pt2;
static_assert(cute::is_standard_layout_v<decltype(pt2)>);
// cute::tuple does not promise to have working CTAD
// (constructor template argument deduction).
[[maybe_unused]] tuple<
tuple<int, pt_test::Empty<0>>,
pt_test::Empty<1>
> pt3{
tuple<int, pt_test::Empty<0>>{42, pt_test::Empty<0>{}},
pt_test::Empty<1>{}
};
static_assert(cute::is_standard_layout_v<decltype(pt3)>);
static_assert(cute::is_same_v<
cute::tuple_element_t<0, decltype(pt3)>,
tuple<int, pt_test::Empty<0>>>);
static_assert(cute::is_same_v<
cute::tuple_element_t<1, decltype(pt3)>,
pt_test::Empty<1>>);
static_assert(cute::tuple_size_v<cute::tuple_element_t<0, decltype(pt3)>> == 2u);
// Element access works through the nesting, one level at a time.
tuple<int, pt_test::Empty<0>> pt3_0 = cute::get<0>(pt3);
auto pt3_0_1 = cute::get<1>(pt3_0);
static_assert(cute::is_same_v<decltype(pt3_0_1), pt_test::Empty<0>>);
EXPECT_EQ(cute::get<0>(cute::get<0>(pt3)), 42);
EXPECT_EQ(cute::get<1>(cute::get<0>(pt3)), pt_test::Empty<0>{});
}
} // namespace pt_test
// Driver that runs the run-time portions of the pt_test functions
// defined above (the static_asserts already ran at compile time).
TEST(CuTe_core, PackedTuple2)
{
CUTLASS_TRACE_HOST("-------------------------------");
CUTLASS_TRACE_HOST("tuple");
CUTLASS_TRACE_HOST("-------------------------------");
pt_test::test_packed_type_aliases();
pt_test::test_tuple_elements();
pt_test::test_default_constructibility();
pt_test::test_sizes_and_not_storing_empty_types();
}
// Exercises cute::get on cute::tuple for tuples of size 1, 2, and 3,
// mixing nonempty elements (int, double, Nonempty<float>) with empty
// elements (Empty<N>).  Each tuple shape is tested through three value
// categories: non-const lvalue (read and write), const lvalue (return
// type pinned via decltype), and rvalue/temporary.  tuple_size and
// tuple_element are also checked for each tuple type.
TEST(CuTe_core, PackedTuple2Get) {
  using cute::tuple;
  using pt_test::Empty;
  using pt_test::Nonempty;
  // --- tuple<int>: single nonempty element ---
  {
    using tuple_type = tuple<int>;
    tuple_type pt{42};
    static_assert(cute::tuple_size_v<tuple_type> == 1u);
    static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, int>);
    EXPECT_EQ(cute::get<0>(pt), 42);
    cute::get<0>(pt) = 43;
    EXPECT_EQ(cute::get<0>(pt), 43);
  }
  {
    // Const tuple: get returns a const reference to the stored element.
    using tuple_type = tuple<int>;
    tuple_type const pt{42};
    EXPECT_EQ(cute::get<0>(pt), 42);
    static_assert(cute::is_same_v<decltype(cute::get<0>(pt)), int const&>);
  }
  {
    // Rvalue tuple.
    EXPECT_EQ(cute::get<0>(tuple<int>{42}), 42);
  }
  // --- tuple<Empty<0>>: single empty element ---
  {
    using tuple_type = tuple<pt_test::Empty<0>>;
    tuple_type pt;
    static_assert(cute::tuple_size_v<tuple_type> == 1u);
    static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, pt_test::Empty<0>>);
    EXPECT_EQ(cute::get<0>(pt), pt_test::Empty<0>{});
  }
  {
    using tuple_type = tuple<pt_test::Empty<0>>;
    tuple_type const pt;
    EXPECT_EQ(cute::get<0>(pt), pt_test::Empty<0>{});
  }
  {
    using tuple_type = tuple<pt_test::Empty<0>>;
    EXPECT_EQ(cute::get<0>(tuple_type{}), pt_test::Empty<0>{});
  }
  // --- tuple<int, double>: two nonempty elements ---
  {
    using tuple_type = tuple<int, double>;
    tuple_type pt{1, 2.5};
    static_assert(cute::tuple_size_v<tuple_type> == 2u);
    static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, int>);
    static_assert(cute::is_same_v<cute::tuple_element_t<1, tuple_type>, double>);
    EXPECT_EQ(cute::get<0>(pt), 1);
    cute::get<0>(pt) = 2;
    EXPECT_EQ(cute::get<0>(pt), 2);
    EXPECT_EQ(cute::get<1>(pt), 2.5);
    cute::get<1>(pt) = 3.5;
    EXPECT_EQ(cute::get<1>(pt), 3.5);
  }
  {
    using tuple_type = tuple<int, double>;
    tuple_type const pt{1, 2.5};
    EXPECT_EQ(cute::get<0>(pt), 1);
    static_assert(cute::is_same_v<decltype(cute::get<0>(pt)), int const&>);
    EXPECT_EQ(cute::get<1>(pt), 2.5);
    static_assert(cute::is_same_v<decltype(cute::get<1>(pt)), double const&>);
  }
  {
    using tuple_type = tuple<int, double>;
    EXPECT_EQ(cute::get<0>(tuple_type{1, 2.5}), 1);
    EXPECT_EQ(cute::get<1>(tuple_type{1, 2.5}), 2.5);
  }
  // --- tuple<Empty<0>, double>: empty first element, nonempty second ---
  {
    using tuple_type = tuple<Empty<0>, double>;
    tuple_type pt{Empty<0>{}, 2.5};
    static_assert(cute::tuple_size_v<tuple_type> == 2u);
    static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, Empty<0>>);
    static_assert(cute::is_same_v<cute::tuple_element_t<1, tuple_type>, double>);
    EXPECT_EQ(cute::get<0>(pt), Empty<0>{});
    EXPECT_EQ(cute::get<1>(pt), 2.5);
    cute::get<1>(pt) = 3.5;
    EXPECT_EQ(cute::get<1>(pt), 3.5);
  }
  {
    using tuple_type = tuple<Empty<0>, double>;
    tuple_type const pt{Empty<0>{}, 2.5};
    EXPECT_EQ(cute::get<0>(pt), Empty<0>{});
    // Empty elements come back BY VALUE even from a const tuple
    // (no reference in the decltype), while nonempty elements come
    // back by const reference.
    static_assert(cute::is_same_v<decltype(cute::get<0>(pt)), Empty<0>>);
    EXPECT_EQ(cute::get<1>(pt), 2.5);
    static_assert(cute::is_same_v<decltype(cute::get<1>(pt)), double const&>);
  }
  {
    using tuple_type = tuple<Empty<0>, double>;
    EXPECT_EQ(cute::get<0>(tuple_type{Empty<0>{}, 2.5}), Empty<0>{});
    EXPECT_EQ(cute::get<1>(tuple_type{Empty<0>{}, 2.5}), 2.5);
  }
  // --- tuple<int, double, Nonempty<float>>: three nonempty elements ---
  {
    using tuple_type = tuple<int, double, Nonempty<float>>;
    tuple_type pt{1, 2.5, Nonempty{3.25f}};
    static_assert(cute::tuple_size_v<tuple_type> == 3u);
    static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, int>);
    static_assert(cute::is_same_v<cute::tuple_element_t<1, tuple_type>, double>);
    static_assert(cute::is_same_v<cute::tuple_element_t<2, tuple_type>, Nonempty<float>>);
    EXPECT_EQ(cute::get<0>(pt), 1);
    EXPECT_EQ(cute::get<1>(pt), 2.5);
    EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f});
    cute::get<0>(pt) = 42;
    EXPECT_EQ(cute::get<0>(pt), 42);
    cute::get<1>(pt) = 4.5;
    EXPECT_EQ(cute::get<1>(pt), 4.5);
    cute::get<2>(pt) = Nonempty<float>{3.75f};
    EXPECT_EQ(cute::get<2>(pt), Nonempty<float>{3.75f});
  }
  {
    using tuple_type = tuple<int, double, Nonempty<float>>;
    tuple_type const pt{1, 2.5, Nonempty{3.25f}};
    EXPECT_EQ(cute::get<0>(pt), 1);
    EXPECT_EQ(cute::get<1>(pt), 2.5);
    EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f});
  }
  {
    // Extra parentheses keep the commas inside EXPECT_EQ's single argument.
    using tuple_type = tuple<int, double, Nonempty<float>>;
    EXPECT_EQ((cute::get<0>(tuple_type{1, 2.5, Nonempty{3.25f}})), 1);
    EXPECT_EQ((cute::get<1>(tuple_type{1, 2.5, Nonempty{3.25f}})), 2.5);
    EXPECT_EQ((cute::get<2>(tuple_type{1, 2.5, Nonempty{3.25f}})), Nonempty{3.25f});
  }
  // --- tuple<int, Empty<0>, Nonempty<float>>: empty element in the middle ---
  {
    using tuple_type = tuple<int, Empty<0>, Nonempty<float>>;
    tuple<int, Empty<0>, Nonempty<float>> pt{1, Empty<0>{}, Nonempty{3.25f}};
    static_assert(cute::tuple_size_v<tuple_type> == 3u);
    static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, int>);
    static_assert(cute::is_same_v<cute::tuple_element_t<1, tuple_type>, Empty<0>>);
    static_assert(cute::is_same_v<cute::tuple_element_t<2, tuple_type>, Nonempty<float>>);
    EXPECT_EQ(cute::get<0>(pt), 1);
    EXPECT_EQ(cute::get<1>(pt), Empty<0>{});
    EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f});
    // Only the nonempty elements (0 and 2) are written; the empty
    // element has no state to mutate.
    cute::get<0>(pt) = 42;
    EXPECT_EQ(cute::get<0>(pt), 42);
    cute::get<2>(pt) = Nonempty<float>{3.75f};
    EXPECT_EQ(cute::get<2>(pt), Nonempty<float>{3.75f});
  }
  {
    using tuple_type = tuple<int, Empty<0>, Nonempty<float>>;
    tuple_type const pt{1, Empty<0>{}, Nonempty{3.25f}};
    EXPECT_EQ(cute::get<0>(pt), 1);
    EXPECT_EQ(cute::get<1>(pt), Empty<0>{});
    EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f});
  }
  {
    using tuple_type = tuple<int, Empty<0>, Nonempty<float>>;
    EXPECT_EQ((cute::get<0>(tuple_type{1, Empty<0>{}, Nonempty{3.25f}})), 1);
    EXPECT_EQ((cute::get<1>(tuple_type{1, Empty<0>{}, Nonempty{3.25f}})), Empty<0>{});
    EXPECT_EQ((cute::get<2>(tuple_type{1, Empty<0>{}, Nonempty{3.25f}})), Nonempty{3.25f});
  }
}
namespace pt_test {

// Stateless class template that is implicitly constructible from
// Empty<Value>.  Used by the converting-construction tests to verify
// that tuple can build an element from a *convertible* type rather
// than only from the element type itself.  Instances compare equal
// exactly when their Value template arguments match.
template <int Value>
struct ConvertibleFromEmpty {
  constexpr ConvertibleFromEmpty() = default;
  constexpr ConvertibleFromEmpty(Empty<Value>) {}

  template <int OtherValue>
  friend constexpr bool
  operator==(ConvertibleFromEmpty<Value> const&, ConvertibleFromEmpty<OtherValue> const&) {
    return Value == OtherValue;
  }

  template <int OtherValue>
  friend constexpr bool
  operator!=(ConvertibleFromEmpty<Value> const&, ConvertibleFromEmpty<OtherValue> const&) {
    // Directly the negation of operator== above.
    return Value != OtherValue;
  }
};

} // end namespace pt_test
TEST(CuTe_core, PackedTupleConstexprDefaultConstruction) {
  // Verify that the storage helper behind cute::tuple (ESO_t) is
  // default-constructible in a constexpr context.  MSVC makes this a
  // bit more challenging than usual, so merely compiling these
  // declarations is the point of the test; nothing is executed.
  using pt_test::Empty;

  // One-element storage: empty and nonempty element types.
  [[maybe_unused]] constexpr cute::detail::ESO_t<Empty<0>> single_empty{};
  [[maybe_unused]] constexpr cute::detail::ESO_t<int64_t> single_value{};

  // Two-element storage: all four empty/nonempty slot combinations.
  [[maybe_unused]] constexpr cute::detail::ESO_t<Empty<0>, Empty<1>> both_empty{};
  [[maybe_unused]] constexpr cute::detail::ESO_t<int64_t, Empty<1>> value_then_empty{};
  [[maybe_unused]] constexpr cute::detail::ESO_t<Empty<0>, int64_t> empty_then_value{};
  [[maybe_unused]] constexpr cute::detail::ESO_t<int64_t, int64_t> both_values{};
}
// Checks that cute::tuple's constructors accept arguments that are
// merely *convertible* to the element types: int -> Nonempty<int>,
// Empty<0> -> ConvertibleFromEmpty<0>, and nested-tuple conversions.
//
// NOTE(review): each case appears twice, once spelling cute::tuple
// qualified and once unqualified.  Given `using cute::tuple;` above,
// both name the same type today -- presumably a leftover from when
// cute::tuple and cute::packed_tuple were distinct types; confirm
// before deduplicating.
TEST(CuTe_core, PackedTupleConvertingConstruction) {
  using cute::tuple;
  using pt_test::ConvertibleFromEmpty;
  using pt_test::Empty;
  using pt_test::Nonempty;
  // int converts to Nonempty<int>.
  {
    using tuple_type = cute::tuple<Nonempty<int>>;
    [[maybe_unused]] tuple_type t(7);
    EXPECT_EQ(cute::get<0>(t), Nonempty<int>(7));
  }
  {
    using tuple_type = tuple<Nonempty<int>>;
    [[maybe_unused]] tuple_type t(7);
    EXPECT_EQ(cute::get<0>(t), Nonempty<int>(7));
  }
  // Empty<0> converts to ConvertibleFromEmpty<0>.
  {
    using tuple_type = cute::tuple<ConvertibleFromEmpty<0>>;
    [[maybe_unused]] tuple_type t(Empty<0>{});
    EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{});
  }
  {
    using tuple_type = tuple<ConvertibleFromEmpty<0>>;
    [[maybe_unused]] tuple_type t(Empty<0>{});
    EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{});
  }
  // Mixed: exact-match first element, converting second element.
  {
    using tuple_type = cute::tuple<float, Nonempty<int>>;
    [[maybe_unused]] tuple_type t(1.5f, 7);
    EXPECT_EQ(cute::get<0>(t), 1.5f);
    EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
  }
  {
    using tuple_type = tuple<float, Nonempty<int>>;
    [[maybe_unused]] tuple_type t(1.5f, 7);
    EXPECT_EQ(cute::get<0>(t), 1.5f);
    EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
  }
  // Empty first element alongside a converting second element.
  {
    using tuple_type = cute::tuple<Empty<0>, Nonempty<int>>;
    [[maybe_unused]] tuple_type t(Empty<0>{}, 7);
    EXPECT_EQ(cute::get<0>(t), Empty<0>{});
    EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
  }
  {
    using tuple_type = tuple<Empty<0>, Nonempty<int>>;
    [[maybe_unused]] tuple_type t(Empty<0>{}, 7);
    EXPECT_EQ(cute::get<0>(t), Empty<0>{});
    EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
  }
  // Both elements constructed via conversion.
  {
    using tuple_type = cute::tuple<ConvertibleFromEmpty<0>, Nonempty<int>>;
    [[maybe_unused]] tuple_type t(Empty<0>{}, 7);
    EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{});
    EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
  }
  {
    using tuple_type = tuple<ConvertibleFromEmpty<0>, Nonempty<int>>;
    [[maybe_unused]] tuple_type t(Empty<0>{}, 7);
    EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{});
    EXPECT_EQ(cute::get<1>(t), Nonempty<int>(7));
  }
  // Nested tuples: outer tuple constructed from an inner tuple value.
  {
    using inner_tuple_type = cute::tuple<Empty<0>>;
    using outer_tuple_type = cute::tuple<inner_tuple_type>;
    [[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}});
  }
  {
    using inner_tuple_type = tuple<Empty<0>>;
    using outer_tuple_type = tuple<inner_tuple_type>;
    [[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}});
  }
  // Nested tuples whose inner element itself requires a conversion.
  {
    using inner_tuple_type = cute::tuple<ConvertibleFromEmpty<0>>;
    using outer_tuple_type = cute::tuple<inner_tuple_type>;
    [[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}});
  }
  {
    using inner_tuple_type = tuple<ConvertibleFromEmpty<0>>;
    using outer_tuple_type = tuple<inner_tuple_type>;
    [[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}});
  }
}
namespace test {

// Calls cute::find<X> on t and checks at compile time that the
// resulting integral constant equals ExpectedIndex.
template <size_t ExpectedIndex, class X, class Tuple>
void test_tuple_find(Tuple const& t) {
  auto index = cute::find<X>(t);
  static_assert(decltype(index)::value == ExpectedIndex);
}

// Exercises cute::find over tuples of size 1-3 built from the given
// Tuple template, mixing static integers (cute::_1/_2/_4) with
// dynamic element types (int, float, double).  The cases are grouped
// by the index at which the sought type sits.
template <template <class...> class Tuple>
void test_tuple_find_all() {
  using test::test_tuple_find;
  using cute::_1;
  using cute::_2;
  using cute::_4;
  test_tuple_find<0, _1>(Tuple<_1>{});
  // NOTE(review): _2 does not occur in Tuple<_1>, and the expected
  // index (1) equals the tuple's size -- cute::find presumably
  // returns tuple_size for a type that is absent; confirm against
  // cute::find's definition.  This is the only not-found case here.
  test_tuple_find<1, _2>(Tuple<_1>{});
  test_tuple_find<0, int>(Tuple<int>{7});
  test_tuple_find<0, _1>(Tuple<_1, _2>{});
  test_tuple_find<0, _1>(Tuple<_1, int>{_1{}, 7});
  test_tuple_find<0, float>(Tuple<float, int>{15.5f, 7});
  test_tuple_find<1, _2>(Tuple<_1, _2>{});
  test_tuple_find<1, int>(Tuple<_1, int>{_1{}, 7});
  test_tuple_find<1, int>(Tuple<float, int>{15.5f, 7});
  // Size-3 tuples, sought type at index 0.
  test_tuple_find<0, _1>(Tuple<_1, _2, _4>{_1{}, _2{}, _4{}});
  test_tuple_find<0, _1>(Tuple<_1, _2, int>{_1{}, _2{}, 7});
  test_tuple_find<0, _1>(Tuple<_1, float, _4>{_1{}, 15.5f, _4{}});
  test_tuple_find<0, _1>(Tuple<_1, float, int>{_1{}, 15.5f, 7});
  test_tuple_find<0, double>(Tuple<double, _2, _4>{105.5, _2{}, _4{}});
  test_tuple_find<0, double>(Tuple<double, float, _4>{105.5, 15.5f, _4{}});
  test_tuple_find<0, double>(Tuple<double, float, int>{105.5, 15.5f, 7});
  // Size-3 tuples, sought type at index 1.
  test_tuple_find<1, _2>(Tuple<_1, _2, _4>{_1{}, _2{}, _4{}});
  test_tuple_find<1, _2>(Tuple<_1, _2, int>{_1{}, _2{}, 7});
  test_tuple_find<1, float>(Tuple<_1, float, _4>{_1{}, 15.5f, _4{}});
  test_tuple_find<1, float>(Tuple<_1, float, int>{_1{}, 15.5f, 7});
  test_tuple_find<1, _2>(Tuple<double, _2, _4>{105.5, _2{}, _4{}});
  test_tuple_find<1, float>(Tuple<double, float, _4>{105.5, 15.5f, _4{}});
  test_tuple_find<1, float>(Tuple<double, float, int>{105.5, 15.5f, 7});
  // Size-3 tuples, sought type at index 2.
  test_tuple_find<2, _4>(Tuple<_1, _2, _4>{_1{}, _2{}, _4{}});
  test_tuple_find<2, int>(Tuple<_1, _2, int>{_1{}, _2{}, 7});
  test_tuple_find<2, _4>(Tuple<_1, float, _4>{_1{}, 15.5f, _4{}});
  test_tuple_find<2, int>(Tuple<_1, float, int>{_1{}, 15.5f, 7});
  test_tuple_find<2, _4>(Tuple<double, _2, _4>{105.5, _2{}, _4{}});
  test_tuple_find<2, _4>(Tuple<double, float, _4>{105.5, 15.5f, _4{}});
  test_tuple_find<2, int>(Tuple<double, float, int>{105.5, 15.5f, 7});
}

} // end namespace test
// Runs the full cute::find test matrix against cute::tuple.
TEST(CuTe_core, TupleFind)
{
  test::test_tuple_find_all<cute::tuple>();
}

View File

@ -1,103 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include "cutlass_unit_test.h"
#include <cutlass/trace.h>
#include <cute/container/packed_tuple.hpp>
#include <cute/container/tuple.hpp>
namespace test {
template<size_t ExpectedIndex, class X, class Tuple>
void test_tuple_find(Tuple const& t) {
auto index = cute::find<X>(t);
static_assert(decltype(index)::value == ExpectedIndex);
}
template<template<class...> class Tuple>
void test_tuple_find_all() {
using test::test_tuple_find;
using cute::_1;
using cute::_2;
using cute::_4;
test_tuple_find<0, _1>(Tuple<_1>{});
test_tuple_find<0, int>(Tuple<int>{7});
test_tuple_find<0, _1>(Tuple<_1, _2>{});
test_tuple_find<0, _1>(Tuple<_1, int>{_1{}, 7});
test_tuple_find<0, float>(Tuple<float, int>{15.5f, 7});
test_tuple_find<1, _2>(Tuple<_1, _2>{});
test_tuple_find<1, int>(Tuple<_1, int>{_1{}, 7});
test_tuple_find<1, int>(Tuple<float, int>{15.5f, 7});
test_tuple_find<0, _1>(Tuple<_1, _2, _4>{_1{}, _2{}, _4{}});
test_tuple_find<0, _1>(Tuple<_1, _2, int>{_1{}, _2{}, 7});
test_tuple_find<0, _1>(Tuple<_1, float, _4>{_1{}, 15.5f, _4{}});
test_tuple_find<0, _1>(Tuple<_1, float, int>{_1{}, 15.5f, 7});
test_tuple_find<0, double>(Tuple<double, _2, _4>{105.5, _2{}, _4{}});
test_tuple_find<0, double>(Tuple<double, float, _4>{105.5, 15.5f, _4{}});
test_tuple_find<0, double>(Tuple<double, float, int>{105.5, 15.5f, 7});
test_tuple_find<1, _2>(Tuple<_1, _2, _4>{_1{}, _2{}, _4{}});
test_tuple_find<1, _2>(Tuple<_1, _2, int>{_1{}, _2{}, 7});
test_tuple_find<1, float>(Tuple<_1, float, _4>{_1{}, 15.5f, _4{}});
test_tuple_find<1, float>(Tuple<_1, float, int>{_1{}, 15.5f, 7});
test_tuple_find<1, _2>(Tuple<double, _2, _4>{105.5, _2{}, _4{}});
test_tuple_find<1, float>(Tuple<double, float, _4>{105.5, 15.5f, _4{}});
test_tuple_find<1, float>(Tuple<double, float, int>{105.5, 15.5f, 7});
test_tuple_find<2, _4>(Tuple<_1, _2, _4>{_1{}, _2{}, _4{}});
test_tuple_find<2, int>(Tuple<_1, _2, int>{_1{}, _2{}, 7});
test_tuple_find<2, _4>(Tuple<_1, float, _4>{_1{}, 15.5f, _4{}});
test_tuple_find<2, int>(Tuple<_1, float, int>{_1{}, 15.5f, 7});
test_tuple_find<2, _4>(Tuple<double, _2, _4>{105.5, _2{}, _4{}});
test_tuple_find<2, _4>(Tuple<double, float, _4>{105.5, 15.5f, _4{}});
test_tuple_find<2, int>(Tuple<double, float, int>{105.5, 15.5f, 7});
}
} // end namespace test
TEST(CuTe_core, TupleFind)
{
test::test_tuple_find_all<cute::tuple>();
}
// If cute::tuple is not simply an alias for cute::packed_tuple,
// then test cute::packed_tuple separately.
#if ! defined(CUTLASS_USE_PACKED_TUPLE)
TEST(CuTe_core, PackedTupleFind)
{
test::test_tuple_find_all<cute::packed_tuple>();
}
#endif // CUTLASS_USE_PACKED_TUPLE

View File

@ -53,8 +53,6 @@ private:
template<class Integral, Integral Value>
using IC = std::integral_constant<Integral, Value>;
#if ! defined(CUTLASS_USE_PACKED_TUPLE)
TEST(CuTe_core_msvc_compilation, TupleAssignment)
{
CUTLASS_TRACE_HOST("-------------------------------");
@ -64,30 +62,10 @@ TEST(CuTe_core_msvc_compilation, TupleAssignment)
using forty_two_type = IC<int, 42>;
using forty_three_type = IC<size_t, 43>;
using ebo_s_type = cute::detail::EBO<0, forty_two_type>;
[[maybe_unused]] ebo_s_type ebo_s;
static_assert(std::is_same_v<decltype(cute::detail::getv(ebo_s)), forty_two_type>);
using ebo_d_type = cute::detail::EBO<1, size_t>;
[[maybe_unused]] ebo_d_type ebo_d(43u);
assert(ebo_d.t_ == 43u);
static_assert(std::is_same_v<std::remove_const_t<std::remove_reference_t<decltype(cute::detail::getv(ebo_d))>>, size_t > );
assert(cute::detail::getv(ebo_d) == 43u);
[[maybe_unused]] cute::detail::TupleBase<std::index_sequence<0, 1, 2>, int, forty_two_type, size_t> tb0{
41, forty_two_type{}, size_t(43u) };
[[maybe_unused]] cute::detail::TupleBase<std::index_sequence<0, 1, 2>, int, forty_two_type, size_t> tb1;
int val41 = ConvertibleTo{41};
assert(val41 == 41);
size_t val43 = ConvertibleTo{size_t(43u)};
assert(val43 == size_t{43u});
[[maybe_unused]] cute::detail::TupleBase<std::index_sequence<0, 1, 2>, int, forty_two_type, size_t> tb2{
ConvertibleTo{41}, forty_two_type{}, ConvertibleTo{size_t(43u)}};
[[maybe_unused]] cute::detail::TupleBase<std::index_sequence<0>, int> tb3{ 41 };
[[maybe_unused]] cute::detail::TupleBase<std::index_sequence<0>, int> tb3a{ 42 };
tb3 = tb3a;
using tuple_0d_type = cute::tuple<>;
using tuple_1d_d_type = cute::tuple<int>;
@ -106,7 +84,6 @@ TEST(CuTe_core_msvc_compilation, TupleAssignment)
// 'TupleBase<int, unsigned __int64>' is not a base or member"
t3 = t3a;
}
#endif // CUTLASS_USE_PACKED_TUPLE
TEST(CuTe_core_msvc_compilation, TupleGetSingleInteger)
{

View File

@ -51,12 +51,8 @@ endfunction()
################################################################################
add_subdirectory(sm100_blockscaled_tensorop_gemm)
add_subdirectory(sm100_tensorop_gemm)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_simt
@ -833,7 +829,7 @@ endif()
if(NOT CUTLASS_NVCC_ARCHS STREQUAL "100")
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_16b_tensorop_sm100_ptr_array

View File

@ -2204,7 +2204,7 @@ bool TestSmall(double alpha = 1.0, double beta = 1.0,
static constexpr bool IsF8F6F4 = cutlass::gemm::collective::detail::is_sm100_mma_f8f6f4<TiledMma, ElementA, ElementB>();
alignment_bits = cutlass::detail::get_input_alignment_bits<ElementA, IsF8F6F4>();
// For fp4 and fp6 QMMA kernels, the min alignment_input is 128 elements, so we don't need to add alignment_input in test problem sizes.
// For fp4 and fp6 kernels, the min alignment_input is 128 elements, so we don't need to add alignment_input in test problem sizes.
int alignment_input = (alignment_bits / cute::sizeof_bits<ElementA>::value == 128) ? 0 : (alignment_bits / cute::sizeof_bits<ElementA>::value);

View File

@ -30,7 +30,7 @@
#
if(NOT CUTLASS_NVCC_ARCHS STREQUAL "100")
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
add_custom_target(
cutlass_test_unit_gemm_device_sm100_blockscaled
DEPENDS
@ -57,7 +57,7 @@ cutlass_test_unit_gemm_device_add_executable(
nvf4_nvf4_f16_nvfp4_epilogue.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf4
BATCH_SOURCES ON
@ -67,7 +67,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
mxf4_mxf4_void_f16_nt_layout.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf6
BATCH_SOURCES ON
@ -77,7 +77,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
mxf6_mxf6_void_bf16_nt_layout.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf8
BATCH_SOURCES ON
@ -87,7 +87,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
mxf8_mxf8_void_f8_nt_layout.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf8
BATCH_SOURCES ON
@ -97,7 +97,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
mxf6_mxf8_void_f32_nt_layout.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf6
BATCH_SOURCES ON
@ -107,7 +107,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
mxf8_mxf6_f16_f8_nt_layout.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf8
BATCH_SOURCES ON
@ -117,7 +117,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
mxf4_mxf8_bf16_bf16_nt_layout.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf4
BATCH_SOURCES ON
@ -127,7 +127,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
mxf8_mxf4_f16_bf16_nt_layout.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf4
BATCH_SOURCES ON
@ -137,7 +137,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
mxf6_mxf4_f16_f16_nt_layout.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf6
BATCH_SOURCES ON

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
@ -117,7 +115,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -181,8 +179,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
@ -190,12 +186,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -254,8 +250,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
@ -263,12 +257,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -109,8 +109,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
@ -118,7 +116,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_192,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_256>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -255,8 +251,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
@ -264,12 +258,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -328,8 +322,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
@ -337,12 +329,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -401,8 +393,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_192,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_256>;
//
// Construct CollectiveEpilogue
@ -410,7 +400,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -474,8 +464,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
@ -483,12 +471,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f32_f16t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f32_f16t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f16_f16t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f16_f16t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -255,8 +251,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f16_f16t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -264,7 +258,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f16_f16t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f32_f16t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f32_f16t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -256,8 +252,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f16_f16t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -265,7 +259,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f16_f16t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -329,8 +323,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -338,12 +330,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -402,8 +394,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -411,12 +401,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -475,8 +465,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f16_f16t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -484,7 +472,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f16_f16t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -191,7 +187,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -256,8 +252,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -265,12 +259,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -329,8 +323,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 25
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -338,7 +330,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 25
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -402,8 +394,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 25
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -411,12 +401,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 25
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -475,8 +465,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 25
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -484,12 +472,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 25
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,7 +115,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -256,8 +252,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -265,12 +259,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -329,8 +323,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 25
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -338,7 +330,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 25
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -402,8 +394,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 25
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -411,12 +401,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 25
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -475,8 +465,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 25
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -484,12 +472,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 25
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -255,8 +251,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -264,7 +258,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m1n_f16_f16t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m1n_f16_f16t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -256,8 +252,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -265,7 +259,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -329,8 +323,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe3m1n_f16_f16t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -338,12 +330,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe3m1n_f16_f16t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -402,8 +394,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m1n_f16_f16t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -411,12 +401,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m1n_f16_f16t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -475,8 +465,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -484,7 +472,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -191,7 +187,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -255,8 +251,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 25
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -264,12 +258,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 25
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -191,7 +187,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -256,8 +252,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 12
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -265,12 +259,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 12
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -329,8 +323,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 25
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -338,12 +330,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 25
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -402,8 +394,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 25
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -411,7 +401,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 25
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -475,8 +465,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 25
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -484,12 +472,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 25
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -191,7 +187,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -255,8 +251,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -264,12 +258,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -328,8 +322,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -337,7 +329,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -401,8 +393,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -410,12 +400,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -474,8 +464,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -483,12 +471,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -256,8 +252,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -265,7 +259,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -329,8 +323,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -338,12 +330,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -402,8 +394,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -411,12 +401,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -475,8 +465,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -484,7 +472,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,7 +115,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -255,8 +251,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -264,12 +258,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -191,7 +187,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -255,8 +251,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -264,12 +258,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -328,8 +322,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -337,7 +329,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -401,8 +393,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -410,12 +400,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -474,8 +464,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -483,12 +471,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 128x1
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,7 +115,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 128x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 128x2
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 128x2
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -255,8 +251,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 256x2
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -264,12 +258,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 256x2
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 128x1
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 128x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 128x1
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 128x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -256,8 +252,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 128x2
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -265,7 +259,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 128x2
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -329,8 +323,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 256x1
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -338,12 +330,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 256x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -402,8 +394,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 256x1
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -411,12 +401,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 256x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -475,8 +465,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 256x2
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -484,7 +472,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 256x2
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -107,8 +107,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -116,7 +114,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -181,8 +179,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -190,12 +186,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -255,8 +251,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -264,12 +258,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -328,8 +322,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -337,7 +329,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -401,8 +393,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -410,12 +400,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -474,8 +464,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -483,12 +471,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -108,8 +108,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -117,12 +115,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -182,8 +180,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -191,12 +187,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -256,8 +252,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 128x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -265,7 +259,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 128x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -329,8 +323,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -338,12 +330,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -402,8 +394,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -411,12 +401,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -475,8 +465,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 256x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -484,7 +472,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 256x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -107,8 +107,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
@ -116,7 +114,7 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -180,8 +178,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
@ -189,7 +185,7 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -259,8 +255,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
@ -268,7 +262,7 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -334,8 +328,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
@ -343,7 +335,7 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -413,8 +405,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_128,_192,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_256>;
//
// Construct CollectiveEpilogue
@ -422,12 +412,12 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -486,8 +476,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_128,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
@ -495,12 +483,12 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -561,8 +549,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_256,_192,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_256>;
//
// Construct CollectiveEpilogue
@ -570,12 +556,12 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -634,8 +620,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_256,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
@ -643,12 +627,12 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//

View File

@ -92,8 +92,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
@ -101,12 +99,12 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -165,8 +163,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
@ -174,12 +170,12 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -244,8 +240,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -256,7 +250,7 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -321,8 +315,6 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -333,7 +325,7 @@ TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f3
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -104,8 +104,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_ue8m0xe2m1t_outputVs16_bstens
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
// Mma's accumulator type
using ElementAccumulator = float;
@ -130,12 +128,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_ue8m0xe2m1t_outputVs16_bstens
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized1Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -195,8 +193,6 @@ TEST(SM100Only_Device_Gemm_ue4m3xe2m1t_ue4m3xe2m1n_ue4m3xe2m1t_outputVs16_bstens
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
// Mma's accumulator type
using ElementAccumulator = float;
@ -220,12 +216,12 @@ TEST(SM100Only_Device_Gemm_ue4m3xe2m1t_ue4m3xe2m1n_ue4m3xe2m1t_outputVs16_bstens
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy
>::CollectiveOp;
//
@ -289,8 +285,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_ue8m0xe2m1t_outputVs32_bstens
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct FusionOperation
@ -310,7 +304,7 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_ue8m0xe2m1t_outputVs32_bstens
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -382,8 +376,6 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_ue8m0xe2m1n_outputVs16_bstens
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
// Mma's accumulator type
using ElementAccumulator = float;
@ -399,12 +391,12 @@ TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_ue8m0xe2m1n_outputVs16_bstens
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
PerSmTileShape_MNK, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, AlignC,
ElementD, GmemLayoutC, AlignD,
cutlass::epilogue::collective::EpilogueScheduleAuto,
cutlass::epilogue::TmaWarpSpecialized1Sm,
FusionOperation
>::CollectiveOp;

View File

@ -80,17 +80,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_4,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_128,_64>;
using ClusterShape = Shape<_4,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -103,7 +100,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -141,17 +138,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -164,7 +158,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -202,17 +196,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -225,7 +216,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -263,17 +254,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -286,7 +274,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -324,16 +312,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_4,_4,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -345,7 +330,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule

View File

@ -80,17 +80,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_64,_128,_64>;
using ClusterShape = Shape<_1,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -103,7 +100,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -141,17 +138,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -164,7 +158,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -202,17 +196,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -225,7 +216,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -263,17 +254,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -286,7 +274,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -324,16 +312,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -345,7 +330,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule

View File

@ -80,17 +80,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -103,7 +100,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -141,17 +138,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -164,7 +158,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -202,17 +196,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -225,7 +216,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -263,16 +254,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -284,7 +272,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -321,17 +309,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_64,_128,_64>;
using ClusterShape = Shape<_1,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -344,7 +329,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -382,17 +367,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -405,7 +387,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -443,17 +425,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -466,7 +445,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -504,17 +483,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -527,7 +503,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -565,16 +541,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -586,7 +559,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule

View File

@ -79,17 +79,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_64,_128,_64>;
using ClusterShape = Shape<_1,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -102,7 +99,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -139,17 +136,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -162,7 +156,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -200,17 +194,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -223,7 +214,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -261,17 +252,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -284,7 +272,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -322,16 +310,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -343,7 +328,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -380,17 +365,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_64,_128,_64>;
using ClusterShape = Shape<_1,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -403,7 +385,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -441,17 +423,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -464,7 +443,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -502,17 +481,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -525,7 +501,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -563,17 +539,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -586,7 +559,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -624,16 +597,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -645,7 +615,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule

View File

@ -79,17 +79,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_64,_128,_64>;
using ClusterShape = Shape<_1,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -102,7 +99,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -139,17 +136,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -162,7 +156,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -200,17 +194,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -223,7 +214,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -261,17 +252,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -284,7 +272,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -322,16 +310,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -343,7 +328,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -380,17 +365,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_64,_128,_64>;
using ClusterShape = Shape<_1,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -403,7 +385,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -441,17 +423,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -464,7 +443,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -502,17 +481,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -525,7 +501,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -563,17 +539,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -586,7 +559,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -624,16 +597,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -645,7 +615,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule

View File

@ -80,17 +80,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -103,7 +100,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -141,17 +138,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -164,7 +158,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -202,17 +196,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -225,7 +216,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -263,16 +254,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -284,7 +272,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -322,17 +310,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_64,_128,_64>;
using ClusterShape = Shape<_1,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -345,7 +330,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -383,17 +368,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -406,7 +388,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -444,17 +426,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -467,7 +446,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -505,17 +484,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -528,7 +504,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -566,16 +542,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
@ -587,7 +560,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule

View File

@ -80,17 +80,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_64,_128,_64>;
using ClusterShape = Shape<_1,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -103,7 +100,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -141,17 +138,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -164,7 +158,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -202,17 +196,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -225,7 +216,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -263,17 +254,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -286,7 +274,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -324,16 +312,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -345,7 +330,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -383,17 +368,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_64,_128,_64>;
using ClusterShape = Shape<_1,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -406,7 +388,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -444,17 +426,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_64,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -467,7 +446,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -505,17 +484,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_128,_32,_64>;
using ClusterShape = Shape<_1,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -528,7 +504,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -566,17 +542,14 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_1,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -589,7 +562,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
@ -627,16 +600,13 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // M
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using MmaTileShape = Shape<_256,_128,_64>;
using ClusterShape = Shape<_2,_2,_1>;
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
@ -648,7 +618,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule

View File

@ -73,17 +73,14 @@ TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_group, 512x256x256_4x2x
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_256,_256>;
using MmaTileShape = cute::Shape<_128,_128,_256>;
using ClusterShape = Shape<_4,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
@ -126,17 +123,14 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_group, 256x384x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_384,_256>;
using MmaTileShape = cute::Shape<_128,_192,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 4,
@ -179,17 +173,14 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_group, 256x512x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_512,_256>;
using MmaTileShape = cute::Shape<_128,_256,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 4,
@ -232,17 +223,14 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_2sm_f32_group, 256x256x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_256,_256>;
using MmaTileShape = cute::Shape<_256,_128,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 4,
@ -285,17 +273,14 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_2sm_f32_group, 512x768x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_768,_256>;
using MmaTileShape = cute::Shape<_256,_192,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 4,

View File

@ -73,17 +73,14 @@ TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_ptr_array, 512x256x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_256,_256>;
using MmaTileShape = Shape<_128,_128,_256>;
using ClusterShape = Shape<_4,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -126,17 +123,14 @@ TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_ptr_array, 256x384x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_384,_256>;
using MmaTileShape = cute::Shape<_128,_192,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -179,17 +173,14 @@ TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_ptr_array, 256x512x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_512,_256>;
using MmaTileShape = cute::Shape<_128,_256,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -232,17 +223,14 @@ TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_2sm_f32_ptr_array, 256x256x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_256,_256>;
using MmaTileShape = cute::Shape<_256,_128,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -285,17 +273,14 @@ TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_2sm_f32_ptr_array, 512x768x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_768,_256>;
using MmaTileShape = cute::Shape<_256,_192,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,

View File

@ -73,17 +73,14 @@ TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32t_tensorop_1sm_f32_ptr_array, 128x128x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_128,_128,_256>;
using MmaTileShape = cute::Shape<_128,_128,_256>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -126,17 +123,14 @@ TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32n_tensorop_1sm_f32_ptr_array, 256x512x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_512,_256>;
using MmaTileShape = cute::Shape<_128,_128,_256>;
using ClusterShape = Shape<_2,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -179,17 +173,14 @@ TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32t_tensorop_1sm_f32_ptr_array, 512x768x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_768,_256>;
using MmaTileShape = cute::Shape<_128,_192,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -232,17 +223,14 @@ TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32n_tensorop_1sm_f32_ptr_array, 512x1024x256
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_1024,_256>;
using MmaTileShape = cute::Shape<_128,_256,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -285,17 +273,14 @@ TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32n_tensorop_2sm_f32_ptr_array, 256x256x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_256,_256>;
using MmaTileShape = cute::Shape<_256,_128,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -338,17 +323,14 @@ TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32t_tensorop_2sm_f32_ptr_array, 512x512x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_512,_256>;
using MmaTileShape = cute::Shape<_256,_128,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -391,17 +373,14 @@ TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32n_tensorop_2sm_f32_ptr_array, 512x768x256_
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_768,_256>;
using MmaTileShape = cute::Shape<_256,_192,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
@ -444,17 +423,14 @@ TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32t_tensorop_2sm_f32_ptr_array, 512x1024x256
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_1024,_256>;
using MmaTileShape = cute::Shape<_256,_256,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,

View File

@ -69,16 +69,13 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_group, 64x128x128_1x2x
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_64,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_64,_64,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
@ -118,16 +115,13 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_2sm_f32_group, 256x128x128
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
@ -167,16 +161,13 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_2sm_f32_group, 512x512x128
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
@ -216,16 +207,13 @@ TEST(SM100Only_Device_Gemm_e4m3n_e4m3t_e4m3n_tensorop_1sm_f32_group, 128x128x128
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
@ -265,16 +253,13 @@ TEST(SM100Only_Device_Gemm_e4m3n_e4m3n_e4m3n_tensorop_1sm_f32_group, 64x128x128_
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_64,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_64,_64,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
@ -314,16 +299,13 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_e4m3n_tensorop_2sm_f32_group, 256x128x128
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
@ -363,16 +345,13 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
@ -412,16 +391,13 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
@ -461,16 +437,13 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
void, LayoutD *, 16 / sizeof(ElementD),

View File

@ -74,16 +74,13 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_ptr_array, 128x128x128
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
@ -124,16 +121,13 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_ptr_array, 64x128x128_
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_64,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_64,_64,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
@ -174,16 +168,13 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_2sm_f32_ptr_array, 256x128x128
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
@ -224,16 +215,13 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_2sm_f32_ptr_array, 512x512x128
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
@ -274,16 +262,13 @@ TEST(SM100_Device_Gemm_e4m3n_e4m3t_e4m3n_tensorop_1sm_f32_ptr_array, 128x128x128
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
@ -324,16 +309,13 @@ TEST(SM100_Device_Gemm_e4m3n_e4m3n_e4m3n_tensorop_1sm_f32_ptr_array, 64x128x128_
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_64,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_64,_64,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
@ -374,16 +356,13 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3t_e4m3n_tensorop_2sm_f32_ptr_array, 256x128x128
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
@ -424,16 +403,13 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3t_e4m3t_tensorop_2sm_f32_ptr_array, 512x512x128
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),

View File

@ -71,17 +71,13 @@ TEST(SM100_Device_Gemm_s8t_s8n_s8n_tensorop_1cta_s32_ptr_array, 128x64x128_1x1x1
using ElementAccumulator = int32_t;
using ElementCompute = float;
using ElementBias = int8_t;
using ClusterTileShape = cute::Shape<_128,_64,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_128,_64,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
@ -126,18 +122,13 @@ TEST(SM100_Device_Gemm_s8t_s8n_s8n_tensorop_1cta_s32_ptr_array, 512x128x128_4x2x
using ElementAccumulator = int32_t;
using ElementCompute = float;
using ElementBias = int8_t;
using ClusterTileShape = Shape<_512,_128,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = Shape<_128,_64,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_2,_1>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
@ -183,17 +174,13 @@ TEST(SM100_Device_Gemm_s8t_s8n_s32n_tensorop_1cta_s32_ptr_array, 64x256x128_1x1x
using ElementAccumulator = int32_t;
using ElementCompute = int32_t;
using ElementBias = int32_t;
using ClusterTileShape = cute::Shape<_64,_256,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = cute::Shape<_64,_256,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
@ -239,17 +226,13 @@ TEST(SM100_Device_Gemm_s8t_s8n_s8n_tensorop_2cta_s32_ptr_array, 128x1024x128_2x4
using ElementAccumulator = int32_t;
using ElementCompute = float;
using ElementBias = int8_t;
using ClusterTileShape = Shape<_128,_1024,Int<128 / sizeof(ElementA)>>;
using MmaTileShape = Shape<_128,_256,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_2,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),

View File

@ -74,10 +74,8 @@ TEST(SM100Only_Device_Gemm_e2m1t_e4m3n_e4m3t_tensorop_2sm_f32_group, 512x512x128
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -85,7 +83,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e4m3n_e4m3t_tensorop_2sm_f32_group, 512x512x128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
@ -132,10 +130,8 @@ TEST(SM100Only_Device_Gemm_e2m1n_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -143,7 +139,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
@ -190,10 +186,8 @@ TEST(SM100Only_Device_Gemm_e2m1t_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -201,7 +195,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
@ -248,10 +242,8 @@ TEST(SM100Only_Device_Gemm_e2m1n_e4m3n_e4m3t_tensorop_2sm_f32_group, 512x512x128
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -259,7 +251,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e4m3n_e4m3t_tensorop_2sm_f32_group, 512x512x128
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,

View File

@ -71,10 +71,8 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3n_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -82,7 +80,7 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3n_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 4,
@ -126,10 +124,8 @@ TEST(SM100_Device_Gemm_e4m3n_e4m3t_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -137,7 +133,7 @@ TEST(SM100_Device_Gemm_e4m3n_e4m3t_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 4,
@ -181,10 +177,8 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3t_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -192,7 +186,7 @@ TEST(SM100_Device_Gemm_e4m3t_e4m3t_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 4,
@ -236,10 +230,8 @@ TEST(SM100_Device_Gemm_e4m3n_e4m3n_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -247,7 +239,7 @@ TEST(SM100_Device_Gemm_e4m3n_e4m3n_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 4,

View File

@ -74,10 +74,8 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f32t_tensorop_2sm_f32_group, 512x512x128_
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -85,7 +83,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f32t_tensorop_2sm_f32_group, 512x512x128_
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
@ -132,10 +130,8 @@ TEST(SM100Only_Device_Gemm_e4m3n_e4m3t_f32t_tensorop_2sm_f32_group, 512x512x128_
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -143,7 +139,7 @@ TEST(SM100Only_Device_Gemm_e4m3n_e4m3t_f32t_tensorop_2sm_f32_group, 512x512x128_
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
@ -190,10 +186,8 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_f32t_tensorop_2sm_f32_group, 512x512x128_
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -201,7 +195,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_f32t_tensorop_2sm_f32_group, 512x512x128_
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
@ -248,10 +242,8 @@ TEST(SM100Only_Device_Gemm_e4m3n_e4m3n_f32t_tensorop_2sm_f32_group, 512x512x128_
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
using ClusterShape_MNK = Shape<_4,_4,_1>;
//
// Construct CollectiveEpilogue
@ -259,7 +251,7 @@ TEST(SM100Only_Device_Gemm_e4m3n_e4m3n_f32t_tensorop_2sm_f32_group, 512x512x128_
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,

View File

@ -29,7 +29,7 @@
#
#
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
add_custom_target(
cutlass_test_unit_gemm_device_sm100_tensorop
DEPENDS
@ -38,7 +38,7 @@ add_custom_target(
cutlass_test_unit_gemm_device_tensorop_sm100_s8xs8
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_tensorop_sm100_f16xf16
BATCH_SOURCES ON
@ -48,7 +48,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
f16_f16_f16_f16_fusion.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_tensorop_sm100_f8xf8
BATCH_SOURCES ON
@ -58,7 +58,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
f8_f8_f16_f8_fusion.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_tensorop_sm100_s8xs8
BATCH_SOURCES ON
@ -67,5 +67,6 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
s8_s8_void_s32.cu
s8_s8_s32_s32_fusion.cu
)
endif()
add_subdirectory(narrow_precision)

View File

@ -88,8 +88,6 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
// Epilogue fusion operation
// Z = alpha * acc + beta * C + per-row bias
@ -108,7 +106,7 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -173,8 +171,6 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
// Epilogue fusion operation
// Z = alpha * acc + beta * C + per-row bias
@ -193,7 +189,7 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -264,8 +260,6 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
// Epilogue fusion operation
// Z = alpha * acc + beta * C + per-row bias
@ -290,7 +284,7 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -355,8 +349,6 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
// Epilogue fusion operation
// Z = alpha * acc + beta * C + per-row bias
@ -380,7 +372,7 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -451,8 +443,6 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
// Epilogue fusion operation
// dY = alpha * acc + beta * C
@ -476,7 +466,7 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -541,8 +531,6 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
// Epilogue fusion operation
// dY = alpha * acc + beta * C
@ -566,7 +554,7 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_f16t_tensor_op_f32, 128x128x64_1x2x1_1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -82,8 +82,6 @@ TEST(SM100Only_Device_Gemm_f16n_f16t_void_f32n_tensor_op_f32, 64x64x64_4x1x1_1sm
using MmaTileShape_MNK = Shape<_64,_64,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_64>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -94,7 +92,7 @@ TEST(SM100Only_Device_Gemm_f16n_f16t_void_f32n_tensor_op_f32, 64x64x64_4x1x1_1sm
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -159,8 +157,6 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_void_f32t_tensor_op_f32, 64x128x64_1x4x1_1s
using MmaTileShape_MNK = Shape<_64,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_64>;
//
// Construct CollectiveEpilogue
@ -168,7 +164,7 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_void_f32t_tensor_op_f32, 64x128x64_1x4x1_1s
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -232,8 +228,6 @@ TEST(SM100Only_Device_Gemm_f16n_f16n_void_f32t_tensor_op_f32, 128x64x64_1x8x1_st
using MmaTileShape_MNK = Shape<_128,_64,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_8,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_64>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -244,7 +238,7 @@ TEST(SM100Only_Device_Gemm_f16n_f16n_void_f32t_tensor_op_f32, 128x64x64_1x8x1_st
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -309,8 +303,6 @@ TEST(SM100Only_Device_Gemm_f16t_f16t_void_f32n_tensor_op_f32, 128x128x64_2x8x1_1
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_8,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
//
// Construct CollectiveEpilogue
@ -318,7 +310,7 @@ TEST(SM100Only_Device_Gemm_f16t_f16t_void_f32n_tensor_op_f32, 128x128x64_2x8x1_1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -383,8 +375,6 @@ TEST(SM100Only_Device_Gemm_f16n_f16t_void_f32n_tensor_op_f32, 128x64x64_2x4x1_2s
using MmaTileShape_MNK = Shape<_128,_64,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_64>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -395,7 +385,7 @@ TEST(SM100Only_Device_Gemm_f16n_f16t_void_f32n_tensor_op_f32, 128x64x64_2x4x1_2s
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -461,8 +451,6 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_void_f32n_tensor_op_f32, 128x128x64_16x1x1_
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_16,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_64>;
//
// Construct CollectiveEpilogue
@ -470,7 +458,7 @@ TEST(SM100Only_Device_Gemm_f16t_f16n_void_f32n_tensor_op_f32, 128x128x64_16x1x1_
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -534,8 +522,6 @@ TEST(SM100Only_Device_Gemm_f16n_f16n_void_f32n_tensor_op_f32, 256x64x64_4x1x1) {
using MmaTileShape_MNK = Shape<_256,_64,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_64>;
//
// Construct CollectiveEpilogue
@ -543,7 +529,7 @@ TEST(SM100Only_Device_Gemm_f16n_f16n_void_f32n_tensor_op_f32, 256x64x64_4x1x1) {
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -607,8 +593,6 @@ TEST(SM100Only_Device_Gemm_f16t_f16t_void_f32n_tensor_op_f32, 256x256x64_2x1x1)
using MmaTileShape_MNK = Shape<_256,_256,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_64>;
//
// Construct CollectiveEpilogue
@ -616,7 +600,7 @@ TEST(SM100Only_Device_Gemm_f16t_f16t_void_f32n_tensor_op_f32, 256x256x64_2x1x1)
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -88,8 +88,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f16t_e4m3t_tensor_op_f32, 128x128x128_1x2
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
// Epilogue fusion operation
// Z = alpha * scale_a * scale_b * acc + beta * scale_c * C + per-row bias
@ -108,7 +106,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f16t_e4m3t_tensor_op_f32, 128x128x128_1x2
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -173,8 +171,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f16t_f32t_tensor_op_f32, 128x128x128_1x2x
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
// Epilogue fusion operation
// Z = alpha * scale_a * scale_b * acc + beta * scale_c * C + per-row bias
@ -194,7 +190,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f16t_f32t_tensor_op_f32, 128x128x128_1x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -265,8 +261,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f16t_e4m3t_tensor_op_f32, 128x128x128_1x2
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
// Epilogue fusion operation
// Z = alpha * scale_a * scale_b * acc + beta * scale_c * C + per-row bias
@ -294,7 +288,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f16t_e4m3t_tensor_op_f32, 128x128x128_1x2
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -359,8 +353,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f16t_f32t_tensor_op_f32, 128x128x128_1x2x
using MmaTileShape_MNK = Shape<_128,_128,_64>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_64>;
// Epilogue fusion operation
// Z = alpha * scale_a * scale_b * acc + beta * scale_c * C + per-row bias
@ -388,7 +380,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f16t_f32t_tensor_op_f32, 128x128x128_1x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -82,8 +82,6 @@ TEST(SM100Only_Device_Gemm_e4m3n_e4m3t_void_f32n_tensor_op_f32, 64x64x128_4x1x1_
using MmaTileShape_MNK = Shape<_64,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_128>;
//
// Construct CollectiveEpilogue
@ -91,7 +89,7 @@ TEST(SM100Only_Device_Gemm_e4m3n_e4m3t_void_f32n_tensor_op_f32, 64x64x128_4x1x1_
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -155,8 +153,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e5m2n_void_f32t_tensor_op_f32, 64x128x128_1x4x1
using MmaTileShape_MNK = Shape<_64,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -167,7 +163,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e5m2n_void_f32t_tensor_op_f32, 64x128x128_1x4x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -232,8 +228,6 @@ TEST(SM100Only_Device_Gemm_e5m2n_e4m3n_void_f32t_tensor_op_f32, 128x64x128_1x8x1
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_8,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
//
// Construct CollectiveEpilogue
@ -241,7 +235,7 @@ TEST(SM100Only_Device_Gemm_e5m2n_e4m3n_void_f32t_tensor_op_f32, 128x64x128_1x8x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -305,8 +299,6 @@ TEST(SM100Only_Device_Gemm_e5m2t_e5m2t_void_f32n_tensor_op_f32, 128x128x128_2x8x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_8,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -317,7 +309,7 @@ TEST(SM100Only_Device_Gemm_e5m2t_e5m2t_void_f32n_tensor_op_f32, 128x128x128_2x8x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -383,8 +375,6 @@ TEST(SM100Only_Device_Gemm_e5m2n_e4m3t_void_f32n_tensor_op_f32, 128x64x128_2x4x1
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_128>;
//
// Construct CollectiveEpilogue
@ -392,7 +382,7 @@ TEST(SM100Only_Device_Gemm_e5m2n_e4m3t_void_f32n_tensor_op_f32, 128x64x128_2x4x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -457,8 +447,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_void_f32n_tensor_op_f32, 128x128x128_16x1
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_16,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -469,7 +457,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_void_f32n_tensor_op_f32, 128x128x128_16x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -534,8 +522,6 @@ TEST(SM100Only_Device_Gemm_e4m3n_e4m3n_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using MmaTileShape_MNK = Shape<_256,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
//
// Construct CollectiveEpilogue
@ -543,7 +529,7 @@ TEST(SM100Only_Device_Gemm_e4m3n_e4m3n_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -607,8 +593,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_void_f32n_tensor_op_f32, 256x256x128_2x1x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -619,7 +603,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_void_f32n_tensor_op_f32, 256x256x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -29,7 +29,7 @@
#
#
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
add_custom_target(
cutlass_test_unit_gemm_device_sm100_tensorop_narrow_precision
DEPENDS
@ -38,7 +38,7 @@ add_custom_target(
cutlass_test_unit_gemm_device_tensorop_sm100_f8xf6f4
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_tensorop_sm100_f6f4xf6f4
BATCH_SOURCES ON
@ -50,7 +50,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
f6f4_f6f4_void_f32_tt_layout.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_tensorop_sm100_f6f4xf8
BATCH_SOURCES ON
@ -60,7 +60,7 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
f6f4_f8_void_f32_nt_layout.cu
)
cutlass_test_unit_gemm_device_add_executable_split_file(
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_tensorop_sm100_f8xf6f4
BATCH_SOURCES ON
@ -69,3 +69,4 @@ cutlass_test_unit_gemm_device_add_executable_split_file(
f8_f6f4_void_f32_tn_layout.cu
f8_f6f4_void_f32_nt_layout.cu
)
endif()

View File

@ -112,8 +112,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m3n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -124,7 +122,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m3n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -189,8 +187,6 @@ TEST(SM100Only_Device_Gemm_e3m2n_e2m1n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -198,7 +194,7 @@ TEST(SM100Only_Device_Gemm_e3m2n_e2m1n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -262,8 +258,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m1n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -274,7 +268,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m1n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -339,8 +333,6 @@ TEST(SM100Only_Device_Gemm_e2m3n_e3m2n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -348,7 +340,7 @@ TEST(SM100Only_Device_Gemm_e2m3n_e3m2n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -412,8 +404,6 @@ TEST(SM100Only_Device_Gemm_e3m2n_e3m2n_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using MmaTileShape_MNK = Shape<_256,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -424,7 +414,7 @@ TEST(SM100Only_Device_Gemm_e3m2n_e3m2n_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -489,8 +479,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m1n_void_f32n_tensor_op_f32, 256x128x128_2x1x
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -498,7 +486,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m1n_void_f32n_tensor_op_f32, 256x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -562,8 +550,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m3n_void_f32n_tensor_op_f32, 256x192x128_2x4x
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -574,7 +560,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m3n_void_f32n_tensor_op_f32, 256x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -639,8 +625,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m1n_void_f32n_tensor_op_f32, 256x256x128_2x2x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -648,7 +632,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m1n_void_f32n_tensor_op_f32, 256x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -112,8 +112,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m3t_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -121,7 +119,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m3t_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -185,8 +183,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m1t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -197,7 +193,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e2m1t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -262,8 +258,6 @@ TEST(SM100Only_Device_Gemm_e3m2n_e2m1t_void_f32n_tensor_op_f32, 256x256x128_2x2x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -271,7 +265,7 @@ TEST(SM100Only_Device_Gemm_e3m2n_e2m1t_void_f32n_tensor_op_f32, 256x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -112,8 +112,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 64x64x128_4x1x1_
using MmaTileShape_MNK = Shape<_64,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_128>;
//
// Construct CollectiveEpilogue
@ -121,7 +119,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 64x64x128_4x1x1_
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -185,8 +183,6 @@ TEST(SM100Only_Device_Gemm_e2m3t_e2m3n_void_f32n_tensor_op_f32, 64x128x128_2x1x1
using MmaTileShape_MNK = Shape<_64,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -197,7 +193,7 @@ TEST(SM100Only_Device_Gemm_e2m3t_e2m3n_void_f32n_tensor_op_f32, 64x128x128_2x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -262,8 +258,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 64x192x128_2x4x1
using MmaTileShape_MNK = Shape<_64,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_192,_128>;
//
// Construct CollectiveEpilogue
@ -271,7 +265,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 64x192x128_2x4x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -335,8 +329,6 @@ TEST(SM100Only_Device_Gemm_e3m2t_e3m2n_void_f32n_tensor_op_f32, 64x256x128_2x2x1
using MmaTileShape_MNK = Shape<_64,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_256,_128>;
//
// Construct CollectiveEpilogue
@ -344,7 +336,7 @@ TEST(SM100Only_Device_Gemm_e3m2t_e3m2n_void_f32n_tensor_op_f32, 64x256x128_2x2x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -408,8 +400,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -420,7 +410,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -485,8 +475,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -494,7 +482,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -558,8 +546,6 @@ TEST(SM100Only_Device_Gemm_e2m3t_e3m2n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -570,7 +556,7 @@ TEST(SM100Only_Device_Gemm_e2m3t_e3m2n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -635,8 +621,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -644,7 +628,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -708,8 +692,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -720,7 +702,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -785,8 +767,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
//
// Construct CollectiveEpilogue
@ -794,7 +774,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -858,8 +838,6 @@ TEST(SM100Only_Device_Gemm_e2m3t_e2m1n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_192,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -870,7 +848,7 @@ TEST(SM100Only_Device_Gemm_e2m3t_e2m1n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -935,8 +913,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_256,_128>;
//
// Construct CollectiveEpilogue
@ -944,7 +920,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1008,8 +984,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using MmaTileShape_MNK = Shape<_256,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -1020,7 +994,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1085,8 +1059,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e3m2n_void_f32n_tensor_op_f32, 256x128x128_2x1x
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -1094,7 +1066,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e3m2n_void_f32n_tensor_op_f32, 256x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1158,8 +1130,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 256x192x128_2x4x
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -1170,7 +1140,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_void_f32n_tensor_op_f32, 256x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1235,8 +1205,6 @@ TEST(SM100Only_Device_Gemm_e2m3t_e3m2n_void_f32n_tensor_op_f32, 256x256x128_2x2x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -1244,7 +1212,7 @@ TEST(SM100Only_Device_Gemm_e2m3t_e3m2n_void_f32n_tensor_op_f32, 256x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -111,8 +111,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1t_void_f32n_tensor_op_f32, 64x128x128_2x1x1
using MmaTileShape_MNK = Shape<_64,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -123,7 +121,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1t_void_f32n_tensor_op_f32, 64x128x128_2x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -188,8 +186,6 @@ TEST(SM100Only_Device_Gemm_e2m3t_e2m3t_void_f32n_tensor_op_f32, 64x256x128_2x2x1
using MmaTileShape_MNK = Shape<_64,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_256,_128>;
//
// Construct CollectiveEpilogue
@ -197,7 +193,7 @@ TEST(SM100Only_Device_Gemm_e2m3t_e2m3t_void_f32n_tensor_op_f32, 64x256x128_2x2x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -261,8 +257,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1t_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -273,7 +267,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1t_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -338,8 +332,6 @@ TEST(SM100Only_Device_Gemm_e3m2t_e3m2t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -347,7 +339,7 @@ TEST(SM100Only_Device_Gemm_e3m2t_e3m2t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -411,8 +403,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_256,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -423,7 +413,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m1t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -488,8 +478,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m3t_void_f32n_tensor_op_f32, 256x256x128_2x2x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -497,7 +485,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e2m3t_void_f32n_tensor_op_f32, 256x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -111,8 +111,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e4m3t_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -123,7 +121,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e4m3t_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -188,8 +186,6 @@ TEST(SM100Only_Device_Gemm_e2m3n_e5m2t_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -197,7 +193,7 @@ TEST(SM100Only_Device_Gemm_e2m3n_e5m2t_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -261,8 +257,6 @@ TEST(SM100Only_Device_Gemm_e2m3n_e4m3t_void_f32n_tensor_op_f32, 128x192x128_2x4x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -273,7 +267,7 @@ TEST(SM100Only_Device_Gemm_e2m3n_e4m3t_void_f32n_tensor_op_f32, 128x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -338,8 +332,6 @@ TEST(SM100Only_Device_Gemm_e3m2n_e5m2t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -347,7 +339,7 @@ TEST(SM100Only_Device_Gemm_e3m2n_e5m2t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -411,8 +403,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e4m3t_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using MmaTileShape_MNK = Shape<_256,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -423,7 +413,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e4m3t_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -488,8 +478,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e5m2t_void_f32n_tensor_op_f32, 256x128x128_2x1x
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -497,7 +485,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e5m2t_void_f32n_tensor_op_f32, 256x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -561,8 +549,6 @@ TEST(SM100Only_Device_Gemm_e2m1n_e4m3t_void_f32n_tensor_op_f32, 256x192x128_2x4x
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -573,7 +559,7 @@ TEST(SM100Only_Device_Gemm_e2m1n_e4m3t_void_f32n_tensor_op_f32, 256x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -638,8 +624,6 @@ TEST(SM100Only_Device_Gemm_e2m3n_e4m3t_void_f32n_tensor_op_f32, 256x256x128_2x2x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -647,7 +631,7 @@ TEST(SM100Only_Device_Gemm_e2m3n_e4m3t_void_f32n_tensor_op_f32, 256x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -112,8 +112,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e4m3n_void_f32n_tensor_op_f32, 64x64x128_4x1x1_
using MmaTileShape_MNK = Shape<_64,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_128>;
//
// Construct CollectiveEpilogue
@ -121,7 +119,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e4m3n_void_f32n_tensor_op_f32, 64x64x128_4x1x1_
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -185,8 +183,6 @@ TEST(SM100Only_Device_Gemm_e2m3t_e5m2n_void_f32n_tensor_op_f32, 64x128x128_2x1x1
using MmaTileShape_MNK = Shape<_64,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -197,7 +193,7 @@ TEST(SM100Only_Device_Gemm_e2m3t_e5m2n_void_f32n_tensor_op_f32, 64x128x128_2x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -263,8 +259,6 @@ TEST(SM100Only_Device_Gemm_e3m2t_e4m3n_void_f32n_tensor_op_f32, 64x192x128_2x4x1
using MmaTileShape_MNK = Shape<_64,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_192,_128>;
//
// Construct CollectiveEpilogue
@ -272,7 +266,7 @@ TEST(SM100Only_Device_Gemm_e3m2t_e4m3n_void_f32n_tensor_op_f32, 64x192x128_2x4x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -336,8 +330,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e5m2n_void_f32n_tensor_op_f32, 64x256x128_2x2x1
using MmaTileShape_MNK = Shape<_64,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_256,_128>;
//
// Construct CollectiveEpilogue
@ -345,7 +337,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e5m2n_void_f32n_tensor_op_f32, 64x256x128_2x2x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -409,8 +401,6 @@ TEST(SM100Only_Device_Gemm_e3m2t_e4m3n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -421,7 +411,7 @@ TEST(SM100Only_Device_Gemm_e3m2t_e4m3n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -486,8 +476,6 @@ TEST(SM100Only_Device_Gemm_e2m3t_e5m2n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -495,7 +483,7 @@ TEST(SM100Only_Device_Gemm_e2m3t_e5m2n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -559,8 +547,6 @@ TEST(SM100Only_Device_Gemm_e2m3t_e4m3n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -571,7 +557,7 @@ TEST(SM100Only_Device_Gemm_e2m3t_e4m3n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -636,8 +622,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e5m2n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -645,7 +629,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e5m2n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -709,8 +693,6 @@ TEST(SM100Only_Device_Gemm_e2m3t_e4m3n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -721,7 +703,7 @@ TEST(SM100Only_Device_Gemm_e2m3t_e4m3n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -786,8 +768,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e5m2n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
//
// Construct CollectiveEpilogue
@ -795,7 +775,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e5m2n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -859,8 +839,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e5m2n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_192,_128>;
//
// Construct CollectiveEpilogue
@ -868,7 +846,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e5m2n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -932,8 +910,6 @@ TEST(SM100Only_Device_Gemm_e3m2t_e5m2n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_256,_128>;
//
// Construct CollectiveEpilogue
@ -941,7 +917,7 @@ TEST(SM100Only_Device_Gemm_e3m2t_e5m2n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1005,8 +981,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e4m3n_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using MmaTileShape_MNK = Shape<_256,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -1017,7 +991,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e4m3n_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1082,8 +1056,6 @@ TEST(SM100Only_Device_Gemm_e3m2t_e5m2n_void_f32n_tensor_op_f32, 256x128x128_2x1x
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -1091,7 +1063,7 @@ TEST(SM100Only_Device_Gemm_e3m2t_e5m2n_void_f32n_tensor_op_f32, 256x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1155,8 +1127,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e4m3n_void_f32n_tensor_op_f32, 256x192x128_2x4x
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -1167,7 +1137,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e4m3n_void_f32n_tensor_op_f32, 256x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1232,8 +1202,6 @@ TEST(SM100Only_Device_Gemm_e2m1t_e5m2n_void_f32n_tensor_op_f32, 256x256x128_2x2x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -1241,7 +1209,7 @@ TEST(SM100Only_Device_Gemm_e2m1t_e5m2n_void_f32n_tensor_op_f32, 256x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -111,8 +111,6 @@ TEST(SM100Only_Device_Gemm_e4m3n_e2m3t_void_f32n_tensor_op_f32, 64x128x128_2x1x1
using MmaTileShape_MNK = Shape<_64,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -123,7 +121,7 @@ TEST(SM100Only_Device_Gemm_e4m3n_e2m3t_void_f32n_tensor_op_f32, 64x128x128_2x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -188,8 +186,6 @@ TEST(SM100Only_Device_Gemm_e5m2n_e3m2t_void_f32n_tensor_op_f32, 64x256x128_2x2x1
using MmaTileShape_MNK = Shape<_64,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_256,_128>;
//
// Construct CollectiveEpilogue
@ -197,7 +193,7 @@ TEST(SM100Only_Device_Gemm_e5m2n_e3m2t_void_f32n_tensor_op_f32, 64x256x128_2x2x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -261,8 +257,6 @@ TEST(SM100Only_Device_Gemm_e4m3n_e2m1t_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -273,7 +267,7 @@ TEST(SM100Only_Device_Gemm_e4m3n_e2m1t_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -338,8 +332,6 @@ TEST(SM100Only_Device_Gemm_e5m2n_e2m3t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -347,7 +339,7 @@ TEST(SM100Only_Device_Gemm_e5m2n_e2m3t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -411,8 +403,6 @@ TEST(SM100Only_Device_Gemm_e4m3n_e3m2t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_256,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -423,7 +413,7 @@ TEST(SM100Only_Device_Gemm_e4m3n_e3m2t_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -488,8 +478,6 @@ TEST(SM100Only_Device_Gemm_e5m2n_e2m1t_void_f32n_tensor_op_f32, 256x256x128_2x2x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -497,7 +485,7 @@ TEST(SM100Only_Device_Gemm_e5m2n_e2m1t_void_f32n_tensor_op_f32, 256x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -112,8 +112,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m1n_void_f32n_tensor_op_f32, 64x64x128_4x1x1_
using MmaTileShape_MNK = Shape<_64,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -124,7 +122,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m1n_void_f32n_tensor_op_f32, 64x64x128_4x1x1_
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -189,8 +187,6 @@ TEST(SM100Only_Device_Gemm_e5m2t_e2m3n_void_f32n_tensor_op_f32, 64x128x128_2x1x1
using MmaTileShape_MNK = Shape<_64,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
//
// Construct CollectiveEpilogue
@ -198,7 +194,7 @@ TEST(SM100Only_Device_Gemm_e5m2t_e2m3n_void_f32n_tensor_op_f32, 64x128x128_2x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -262,8 +258,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m1n_void_f32n_tensor_op_f32, 64x192x128_2x4x1
using MmaTileShape_MNK = Shape<_64,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_192,_128>;
//
// Construct CollectiveEpilogue
@ -271,7 +265,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m1n_void_f32n_tensor_op_f32, 64x192x128_2x4x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -335,8 +329,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e3m2n_void_f32n_tensor_op_f32, 64x256x128_2x2x1
using MmaTileShape_MNK = Shape<_64,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_256,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -347,7 +339,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e3m2n_void_f32n_tensor_op_f32, 64x256x128_2x2x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -412,8 +404,6 @@ TEST(SM100Only_Device_Gemm_e5m2t_e2m1n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
//
// Construct CollectiveEpilogue
@ -421,7 +411,7 @@ TEST(SM100Only_Device_Gemm_e5m2t_e2m1n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -485,8 +475,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m3n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -497,7 +485,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m3n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -562,8 +550,6 @@ TEST(SM100Only_Device_Gemm_e5m2t_e3m2n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -571,7 +557,7 @@ TEST(SM100Only_Device_Gemm_e5m2t_e3m2n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -635,8 +621,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m3n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -647,7 +631,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m3n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -712,8 +696,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e3m2n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_128>;
//
// Construct CollectiveEpilogue
@ -721,7 +703,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e3m2n_void_f32n_tensor_op_f32, 128x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -785,8 +767,6 @@ TEST(SM100Only_Device_Gemm_e5m2t_e2m3n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -797,7 +777,7 @@ TEST(SM100Only_Device_Gemm_e5m2t_e2m3n_void_f32n_tensor_op_f32, 128x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -862,8 +842,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m1n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_192,_128>;
//
// Construct CollectiveEpilogue
@ -871,7 +849,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m1n_void_f32n_tensor_op_f32, 128x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -935,8 +913,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e3m2n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_256,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -947,7 +923,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e3m2n_void_f32n_tensor_op_f32, 128x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1012,8 +988,6 @@ TEST(SM100Only_Device_Gemm_e5m2t_e2m1n_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using MmaTileShape_MNK = Shape<_256,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
//
// Construct CollectiveEpilogue
@ -1021,7 +995,7 @@ TEST(SM100Only_Device_Gemm_e5m2t_e2m1n_void_f32n_tensor_op_f32, 256x64x128_4x1x1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1085,8 +1059,6 @@ TEST(SM100Only_Device_Gemm_e5m2t_e2m1n_void_f32n_tensor_op_f32, 256x128x128_2x1x
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -1094,7 +1066,7 @@ TEST(SM100Only_Device_Gemm_e5m2t_e2m1n_void_f32n_tensor_op_f32, 256x128x128_2x1x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1158,8 +1130,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m3n_void_f32n_tensor_op_f32, 256x192x128_2x4x
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
@ -1167,7 +1137,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m3n_void_f32n_tensor_op_f32, 256x192x128_2x4x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -1231,8 +1201,6 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m1n_void_f32n_tensor_op_f32, 256x256x128_2x2x
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -1243,7 +1211,7 @@ TEST(SM100Only_Device_Gemm_e4m3t_e2m1n_void_f32n_tensor_op_f32, 256x256x128_2x2x
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -82,8 +82,6 @@ TEST(SM100Only_Device_Gemm_s8t_s8n_s32t_s32t_tensor_op_f32, 128x128x128_1x2x1_1s
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
// Epilogue fusion operation
// Z = per-row alpha * acc + per-row beta * C + per-row bias
@ -101,7 +99,7 @@ TEST(SM100Only_Device_Gemm_s8t_s8n_s32t_s32t_tensor_op_f32, 128x128x128_1x2x1_1s
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -166,8 +164,6 @@ TEST(SM100Only_Device_Gemm_s8t_s8n_s32t_s32t_tensor_op_f32, 128x128x128_1x2x1_1s
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
// Epilogue fusion operation
// Z = per-col alpha * acc + per-col beta * C + per-col bias
@ -185,7 +181,7 @@ TEST(SM100Only_Device_Gemm_s8t_s8n_s32t_s32t_tensor_op_f32, 128x128x128_1x2x1_1s
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -82,8 +82,6 @@ TEST(SM100Only_Device_Gemm_s8n_s8t_void_s32n_tensor_op_f32, 64x64x128_4x1x1_1sm_
using MmaTileShape_MNK = Shape<_64,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -94,7 +92,7 @@ TEST(SM100Only_Device_Gemm_s8n_s8t_void_s32n_tensor_op_f32, 64x64x128_4x1x1_1sm_
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -159,8 +157,6 @@ TEST(SM100Only_Device_Gemm_s8t_s8n_void_s32t_tensor_op_f32, 64x128x128_1x4x1_1sm
using MmaTileShape_MNK = Shape<_64,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
//
// Construct CollectiveEpilogue
@ -168,7 +164,7 @@ TEST(SM100Only_Device_Gemm_s8t_s8n_void_s32t_tensor_op_f32, 64x128x128_1x4x1_1sm
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -232,8 +228,6 @@ TEST(SM100Only_Device_Gemm_s8n_s8n_void_s32t_tensor_op_f32, 128x64x128_1x8x1_str
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_8,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -244,7 +238,7 @@ TEST(SM100Only_Device_Gemm_s8n_s8n_void_s32t_tensor_op_f32, 128x64x128_1x8x1_str
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -309,8 +303,6 @@ TEST(SM100Only_Device_Gemm_s8t_s8t_void_s32n_tensor_op_f32, 128x128x128_2x8x1_1s
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_8,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
@ -318,7 +310,7 @@ TEST(SM100Only_Device_Gemm_s8t_s8t_void_s32n_tensor_op_f32, 128x128x128_2x8x1_1s
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -383,8 +375,6 @@ TEST(SM100Only_Device_Gemm_s8n_s8t_void_s32n_tensor_op_f32, 128x64x128_2x4x1_2sm
using MmaTileShape_MNK = Shape<_128,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -395,7 +385,7 @@ TEST(SM100Only_Device_Gemm_s8n_s8t_void_s32n_tensor_op_f32, 128x64x128_2x4x1_2sm
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -461,8 +451,6 @@ TEST(SM100Only_Device_Gemm_s8t_s8n_void_s32n_tensor_op_f32, 128x128x128_16x1x1_2
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_16,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_64,_128,_128>;
//
// Construct CollectiveEpilogue
@ -470,7 +458,7 @@ TEST(SM100Only_Device_Gemm_s8t_s8n_void_s32n_tensor_op_f32, 128x128x128_16x1x1_2
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -534,8 +522,6 @@ TEST(SM100Only_Device_Gemm_s8n_s8n_void_s32n_tensor_op_f32, 256x64x128_4x1x1_str
using MmaTileShape_MNK = Shape<_256,_64,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_64,_128>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
@ -546,7 +532,7 @@ TEST(SM100Only_Device_Gemm_s8n_s8n_void_s32n_tensor_op_f32, 256x64x128_4x1x1_str
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
@ -611,8 +597,6 @@ TEST(SM100Only_Device_Gemm_s8t_s8t_void_s32n_tensor_op_f32, 256x256x128_2x1x1) {
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
@ -620,7 +604,7 @@ TEST(SM100Only_Device_Gemm_s8t_s8t_void_s32n_tensor_op_f32, 256x256x128_2x1x1) {
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description

View File

@ -26,12 +26,21 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_test_unit_add_executable(
cutlass_test_unit_pipeline
set(PIPELINE_SOURCES
pipeline_tma_async.cu
pipeline_tma_async_warp_specialized.cu
pipeline_tma_async_warp_specialized_persistent.cu
pipeline_cluster_launch_control_async_warp_specialized_blackwell.cu
pipeline_async.cu
sequence_barrier.cu
)
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
list(APPEND PIPELINE_SOURCES
pipeline_cluster_launch_control_async_warp_specialized_blackwell.cu
)
endif()
cutlass_test_unit_add_executable(
cutlass_test_unit_pipeline
${PIPELINE_SOURCES}
)